/**
*
*/
package com.taobao.top.analysis.node.component;
import java.lang.reflect.Field;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.zookeeper.ZooKeeper;
import org.jboss.netty.channel.Channel;
import com.taobao.top.analysis.config.MasterConfig;
import com.taobao.top.analysis.exception.AnalysisException;
import com.taobao.top.analysis.node.IJobBuilder;
import com.taobao.top.analysis.node.IJobExporter;
import com.taobao.top.analysis.node.IJobManager;
import com.taobao.top.analysis.node.IJobResultMerger;
import com.taobao.top.analysis.node.event.GetTaskRequestEvent;
import com.taobao.top.analysis.node.event.SendResultsRequestEvent;
import com.taobao.top.analysis.node.job.Job;
import com.taobao.top.analysis.node.job.JobMergedResult;
import com.taobao.top.analysis.node.job.JobTask;
import com.taobao.top.analysis.node.job.JobTaskExecuteInfo;
import com.taobao.top.analysis.node.job.JobTaskResult;
import com.taobao.top.analysis.node.job.JobTaskStatus;
import com.taobao.top.analysis.node.operation.JobDataOperation;
import com.taobao.top.analysis.node.operation.MergeJobOperation;
import com.taobao.top.analysis.util.AnalysisConstants;
import com.taobao.top.analysis.util.AnalyzerZKWatcher;
import com.taobao.top.analysis.util.MasterDataRecoverWorker;
import com.taobao.top.analysis.util.NamedThreadFactory;
import com.taobao.top.analysis.util.ReportUtil;
import com.taobao.top.analysis.util.ZKUtil;
/**
* JobManager会被MasterNode以单线程方式调用
* 需要注意的是所有的内置Builder,Exporter,ResultMerger,ServerConnector都自己必须保证处理速度
*
* @author fangweng
* @Email fangweng@taobao.com 2011-11-28
*
*/
public class JobManager implements IJobManager {
    private static final Log logger = LogFactory.getLog(JobManager.class);
    /** Builds the job set from configuration; also signals when a rebuild is needed. */
    private IJobBuilder jobBuilder;
    /** Exports job results/reports and loads/saves intermediate entry data. */
    private IJobExporter jobExporter;
    /** Merges slave task results into each job's trunk result. */
    private IJobResultMerger jobResultMerger;
    private MasterConfig config;
    /** Owning master node, used to echo responses back to slaves. */
    private MasterNode masterNode;
    /**
     * Jobs managed by this manager, keyed by job name.
     */
    private Map<String, Job> jobs;
    /**
     * Result data returned by slaves, one queue per job name.
     */
    private Map<String, BlockingQueue<JobTaskResult>> jobTaskResultsQueuePool;
    /**
     * Task pool: all known tasks keyed by task id.
     * NOTE(review): the original author flagged the allocation strategy as
     * potentially uneven — not an ideal distribution policy.
     */
    private ConcurrentMap<String, JobTask> jobTaskPool;
    /**
     * Queue of tasks not yet handed out to a slave.
     */
    private BlockingQueue<JobTask> undoTaskQueue;
    /**
     * Status of every task, keyed by task id.
     */
    private ConcurrentMap<String, JobTaskStatus> statusPool;
    /**
     * Intermediate results not yet merged into the trunk, one queue per job name.
     */
    private Map<String, BlockingQueue<JobMergedResult>> branchResultQueuePool;
    /**
     * Worker pool for asynchronous event processing (merge, export, async echo).
     */
    private ThreadPoolExecutor eventProcessThreadPool;
    /**
     * Replays history data that slaves recorded locally while the master was
     * down, merging it back into the jobs on recovery.
     */
    private MasterDataRecoverWorker masterDataRecoverWorker;
    /**
     * Shutdown flag, set to true on stop/restart.
     * Once true, no new tasks are handed out; the manager waits for pending
     * merges to finish and exports the intermediate results.
     */
    private volatile boolean stopped = false;
    // ZooKeeper client for group coordination; stays null when ZK is not configured.
    ZooKeeper zk = null;
@Override
public void init() throws AnalysisException {
// 获得任务数量
jobBuilder.setConfig(config);
jobExporter.setConfig(config);
jobResultMerger.setConfig(config);
jobBuilder.init();
jobExporter.init();
jobResultMerger.init();
jobs = jobBuilder.build();
for(Job job : jobs.values()) {
job.reset(null);
}
if (jobs == null || (jobs != null && jobs.size() == 0))
throw new AnalysisException("jobs should not be empty!");
jobTaskPool = new ConcurrentHashMap<String, JobTask>();
undoTaskQueue = new LinkedBlockingDeque<JobTask>();
statusPool = new ConcurrentHashMap<String, JobTaskStatus>();
jobTaskResultsQueuePool = new HashMap<String, BlockingQueue<JobTaskResult>>();
branchResultQueuePool = new HashMap<String, BlockingQueue<JobMergedResult>>();
for (String jobName : jobs.keySet()) {
jobTaskResultsQueuePool.put(jobName, new LinkedBlockingQueue<JobTaskResult>());
branchResultQueuePool.put(jobName, new LinkedBlockingQueue<JobMergedResult>());
}
eventProcessThreadPool =
new ThreadPoolExecutor(this.config.getMaxJobEventWorker(), this.config.getMaxJobEventWorker(), 0,
TimeUnit.SECONDS, new LinkedBlockingQueue<Runnable>(), new NamedThreadFactory(
"jobManagerEventProcess_worker"));
masterDataRecoverWorker =
new MasterDataRecoverWorker(config.getMasterName(), config.getTempStoreDataDir(), jobs, this.config);
masterDataRecoverWorker.start();
addJobsToPool();
if (StringUtils.isNotEmpty(config.getZkServer()))
{
try
{
AnalyzerZKWatcher<MasterConfig> analyzerZKWatcher =
new AnalyzerZKWatcher<MasterConfig>(config);
zk = new ZooKeeper(config.getZkServer(),3000,analyzerZKWatcher);
analyzerZKWatcher.setZk(zk);
ZKUtil.createGroupNodesIfNotExist(zk,config.getGroupId());
}
catch(Exception ex)
{
logger.error("zk init error!",ex);
}
}
if (logger.isInfoEnabled())
logger.info("jobManager init end, MaxJobEventWorker size : " + config.getMaxJobEventWorker());
}
    /**
     * Gracefully shuts the manager down: waits (bounded) for running tasks,
     * performs a last trunk merge and exports every job's data, then stops the
     * workers and clears all pools.
     */
    @Override
    public void releaseResource() {
        stopped = true;
        try {
            // Export all results on shutdown; intermediate data is flushed too,
            // otherwise part of the accumulated results would be lost.
            long start = System.currentTimeMillis();
            // Grace period for in-flight tasks: poll every 10s, 60s total budget.
            for(JobTask jobTask : this.jobTaskPool.values()) {
                while(JobTaskStatus.DOING.equals(jobTask.getStatus())) {
                    Thread.sleep(10000);
                    if(System.currentTimeMillis() - start > 60000)
                        break;
                }
            }
            if (jobs != null)
                for (Job j : jobs.values()) {
                    // Try the trunk write lock without blocking: if a merge thread
                    // still holds it, skip the final merge rather than stall shutdown.
                    boolean gotIt = j.getTrunkLock().writeLock().tryLock();
                    if (gotIt) {
                        try {
                            if (!j.isMerged().get()) {
                                List<Map<String, Map<String, Object>>> mergeResults =
                                        new ArrayList<Map<String, Map<String, Object>>>();
                                // Synchronous (run(), not execute) final merge of queued
                                // branch results into the trunk.
                                new MergeJobOperation(j, 0, mergeResults, config, branchResultQueuePool.get(j
                                    .getJobName()), true).run();
                                j.isMerged().set(true);
                                logger.warn("job is timeout, last merge trunk success!");
                            }
                        }
                        finally {
                            j.getTrunkLock().writeLock().unlock();
                        }
                    }
                    // Persist the job's trunk so it can be recovered after restart.
                    JobDataOperation jobd =
                            new JobDataOperation(j, AnalysisConstants.JOBMANAGER_EVENT_EXPORTDATA, this.config);
                    jobd.run();
                    logger.info("releaseResouce now, export job : " + j.getJobName());
                }
            if (eventProcessThreadPool != null)
                eventProcessThreadPool.shutdown();
            if (masterDataRecoverWorker != null)
                masterDataRecoverWorker.stopWorker();
        } catch (Throwable e) {
            logger.error("error when stop the node", e);
        }
        finally {
            // Always clear every pool/queue and release the collaborators, even if
            // the graceful export above failed.
            if (jobs != null)
                jobs.clear();
            if (jobTaskPool != null)
                jobTaskPool.clear();
            if(undoTaskQueue != null)
                undoTaskQueue.clear();
            if (statusPool != null)
                statusPool.clear();
            if (jobTaskResultsQueuePool != null)
                jobTaskResultsQueuePool.clear();
            if (branchResultQueuePool != null)
                branchResultQueuePool.clear();
            if (jobBuilder != null)
                jobBuilder.releaseResource();
            if (jobExporter != null)
                jobExporter.releaseResource();
            if (jobResultMerger != null)
                jobResultMerger.releaseResource();
            logger.info("jobManager releaseResource end");
        }
    }
// 分配任务和结果提交处理由于是单线程处理,
// 因此本身不用做状态池并发控制,将消耗较多的发送操作交给ServerConnector多线程操作
@Override
public void getUnDoJobTasks(GetTaskRequestEvent requestEvent) {
String jobName = requestEvent.getJobName();
int jobCount = requestEvent.getRequestJobCount();
final List<JobTask> jobTasks = new ArrayList<JobTask>();
//如果关闭,则直接返回一个空的JobTask的list给slave
if(this.stopped) {
masterNode.echoGetJobTasks(requestEvent.getSequence(), jobTasks, requestEvent.getChannel());
return;
}
// 指定job
if (jobName != null && jobs.containsKey(jobName)) {
Job job = jobs.get(jobName);
List<JobTask> tasks = job.getJobTasks();
for (JobTask jobTask : tasks) {
if (jobTask.getStatus().equals(JobTaskStatus.UNDO)) {
if (statusPool.replace(jobTask.getTaskId(), JobTaskStatus.UNDO, JobTaskStatus.DOING)) {
this.allocateTask(jobTask);
jobTasks.add(jobTask);
if (jobTasks.size() == jobCount)
break;
}
}
}
}
else {
Iterator<JobTask> taskIter = undoTaskQueue.iterator();
while (taskIter.hasNext()) {
// String taskId = taskIds.next();
// JobTask jobTask = jobTaskPool.get(taskId);
JobTask jobTask = taskIter.next();
if (!jobTaskPool.keySet().contains(jobTask.getTaskId())
|| jobs.get(jobTask.getJobName()).getEpoch().get() > jobTask.getJobEpoch()
|| jobs.get(jobTask.getJobName()).getJobTimeOut().get()) {
taskIter.remove();
continue;
}
if (jobs.get(jobTask.getJobName()).getJobConfig().getSlaveIpCondition() != null) {
try {
Channel channel = (Channel) requestEvent.getChannel();
if (!channel.getRemoteAddress().toString()
.matches(jobs.get(jobTask.getJobName()).getJobConfig().getSlaveIpCondition())) {
continue;
}
}
catch (Throwable e) {
logger.error(e);
}
}
if (statusPool.get(jobTask.getTaskId()).equals(JobTaskStatus.UNDO)) {
if (statusPool.replace(jobTask.getTaskId(), JobTaskStatus.UNDO, JobTaskStatus.DOING)) {
this.allocateTask(jobTask);
jobTasks.add(jobTask);
taskIter.remove();
if (jobTasks.size() >= jobCount)
break;
}
} else
taskIter.remove();
}
}
// 是否需要用异步方式发送,减少对jobManager事件处理延时
if (config.isUseAsynModeToSendResponse()) {
final String sequence = requestEvent.getSequence();
final Object channel = requestEvent.getChannel();
// 由于该操作比较慢,开线程执行,保证速度
eventProcessThreadPool.execute(new Runnable() {
public void run() {
try {
masterNode.echoGetJobTasks(sequence, jobTasks, channel);
}
catch (Throwable e) {
logger.error(e);
}
}
});
}
else
masterNode.echoGetJobTasks(requestEvent.getSequence(), jobTasks, requestEvent.getChannel());
}
private void allocateTask(JobTask jobTask) {
jobTask.setStatus(JobTaskStatus.DOING);
jobTask.setStartTime(System.currentTimeMillis());
}
    // Task allocation and result submission run on a single thread, so the status
    // pool needs no extra concurrency control here; the expensive send operation
    // is delegated to the multi-threaded ServerConnector.
    /**
     * Accepts a batch of task results from a slave: validates epochs, enqueues
     * the result for merging, updates per-task status/bookkeeping, emits cluster
     * log lines (and optionally ZK runtime nodes), then acks the slave.
     */
    @Override
    public void addTaskResultToQueue(SendResultsRequestEvent jobResponseEvent) {
        JobTaskResult jobTaskResult = jobResponseEvent.getJobTaskResult();
        if (jobTaskResult.getTaskIds() != null && jobTaskResult.getTaskIds().size() > 0) {
            // Detect stale results from old tasks by comparing the task's epoch with
            // the epoch carried in the result. Planned extension: when the current
            // epoch < result epoch, this node may be a subordinate reduce master
            // that cannot keep up.
            if(jobTaskPool.get(jobTaskResult.getTaskIds().get(0)) == null) {
                logger.error("jobTask is null " + jobTaskResult.getTaskIds().get(0));
                masterNode.echoSendJobTaskResults(jobResponseEvent.getSequence(), "success", jobResponseEvent.getChannel());
                return;
            }
            if (jobTaskResult.getJobEpoch() != jobTaskPool.get(jobTaskResult.getTaskIds().get(0)).getJobEpoch() && this.config.getDispatchMaster()) {
                // Result is outdated: the task probably timed out and was re-dispatched.
                if (jobTaskResult.getJobEpoch() < jobTaskPool.get(jobTaskResult.getTaskIds().get(0)).getJobEpoch()) {
                    logger.error("old task result will be discard! job:" + jobTaskPool.get(jobTaskResult.getTaskIds().get(0)).getJobName() + ",epoch:" + jobTaskResult.getJobEpoch() + ",slave:" + jobResponseEvent.getChannel());
                    masterNode.echoSendJobTaskResults(jobResponseEvent.getSequence(), "success", jobResponseEvent.getChannel());
                    return;
                }
                else {
                    // Grace period for the job to reset (original note said 5s but
                    // the call uses 15000 ms — TODO confirm intended value).
                    jobs.get(jobTaskPool.get(jobTaskResult.getTaskIds().get(0)).getJobName()).blockToResetJob(15000);
                    // NOTE(review): the original author questioned when this case can
                    // occur; the task is re-fetched here on purpose since the blocking
                    // reset above may have advanced its epoch.
                    if (jobTaskResult.getJobEpoch() > jobTaskPool.get(jobTaskResult.getTaskIds().get(0)).getJobEpoch()) {
                        logger.error("otherMaster can't merge in time!job:" + jobTaskPool.get(jobTaskResult.getTaskIds().get(0)).getJobName() + ",taskResult epoch:" + jobTaskResult.getJobEpoch() + ", task epoch:" + jobTaskPool.get(jobTaskResult.getTaskIds().get(0)).getJobEpoch());
                        masterNode.echoSendJobTaskResults(jobResponseEvent.getSequence(), "success", jobResponseEvent.getChannel());
                        // NOTE(review): dead branch — the enclosing condition already
                        // requires getDispatchMaster() to be true, so the reset() arm is
                        // unreachable and this always returns. Kept as in the original.
                        if(!this.config.getDispatchMaster()) {
                            jobs.get(jobTaskResult.getJobName()).reset(this);
                        } else {
                            return;
                        }
                    }
                }
            }
            if (logger.isWarnEnabled()) {
                StringBuilder ts =
                        new StringBuilder("Receive slave analysis result, jobTaskIds : ")
                            .append(jobTaskResult.toString()).append(", ").append(jobTaskResult.getTaskIds().size());
                logger.warn(ts.toString());
            }
            // Job already merged for this round: just ack and drop the result.
            if(jobs.get(jobTaskPool.get(jobTaskResult.getTaskIds().get(0)).getJobName()).isMerged().get()) {
                masterNode.echoSendJobTaskResults(jobResponseEvent.getSequence(), "success", jobResponseEvent.getChannel());
                return;
            }
            // Queue first to guard against the small chance of concurrent processing.
            jobTaskResultsQueuePool.get(jobTaskPool.get(jobTaskResult.getTaskIds().get(0)).getJobName()).offer(
                jobTaskResult);
            if(logger.isInfoEnabled()) {
                StringBuilder sb = new StringBuilder("add result [");
                for(String s : jobTaskResult.getTaskIds()) {
                    sb.append(s).append(",");
                }
                sb.append("] to queue:").append(jobTaskPool.get(jobTaskResult.getTaskIds().get(0)).getJobName());
                logger.info(sb.toString());
            }
            // Per-task bookkeeping: flip status to DONE, stamp times, count completion.
            Iterator<String> iter = jobTaskResult.getTaskIds().iterator();
            while (iter.hasNext()) {
                String taskId = iter.next();
                JobTask jobTask = jobTaskPool.get(taskId);
                if (jobTask == null)
                {
                    logger.error(new StringBuilder("taskId :").append(taskId).append("not exist!").toString());
                    continue;
                }
                Job job = jobs.get(jobTask.getJobName());
                if(job == null) {
                    logger.error(new StringBuilder("job :").append(jobTask.getJobName()).append("not exist!").toString());
                    continue;
                }
                if (statusPool.replace(taskId, JobTaskStatus.DOING, JobTaskStatus.DONE)
                        || statusPool.replace(taskId, JobTaskStatus.UNDO, JobTaskStatus.DONE)) {
                    // NOTE(review): logs jobTask.getJobName() twice; getTaskId() was
                    // probably intended for the first placeholder — TODO confirm.
                    logger.info("task " + jobTask.getJobName() + " of job " + job.getJobName() + " done");
                    jobTask.setStatus(JobTaskStatus.DONE);
                    jobTask.getTailCursor().compareAndSet(true, false);
                    jobTask.setEndTime(System.currentTimeMillis());
                    jobTask.setLastMergedEpoch(job.getEpoch().get());
                    job.getCompletedTaskCount().incrementAndGet();
                } else {
                    // Non-dispatch mode: force the DONE transition ourselves.
                    if(!this.config.getDispatchMaster()) {
                        jobTask.setStatus(JobTaskStatus.DONE);
                        jobTask.getTailCursor().compareAndSet(true, false);
                        jobTask.setEndTime(System.currentTimeMillis());
                        jobTask.setLastMergedEpoch(job.getEpoch().get());
                        statusPool.put(taskId, JobTaskStatus.DONE);
                        iter.remove();
                    }
                }
                // Emit an execution metrics line for this jobTask.
                StringBuilder log = new StringBuilder(ReportUtil.SLAVE_LOG).append(",timeStamp=")
                    .append(System.currentTimeMillis()).append(",epoch=")
                    .append(job.getEpoch()).append(",jobName=");
                log.append(jobTask.getJobName()).append(",taskId=")
                    .append(jobTask.getTaskId()).append(",recycleCounter=")
                    .append(jobTask.getRecycleCounter().get()).append(",slaveIp=")
                    .append(jobTaskResult.getSlaveIp()).append(",efficiency=")
                    .append(jobTaskResult.getEfficiency()).append(",");
                JobTaskExecuteInfo executeInfo = jobTaskResult.getTaskExecuteInfos().get(jobTask.getTaskId());
                if (executeInfo != null) {
                    log.append("analysisConsume=").append(executeInfo.getAnalysisConsume()).append(",")
                        .append("jobDataSize=").append(executeInfo.getJobDataSize()).append(",").append("totalLine=")
                        .append(executeInfo.getTotalLine()).append(",").append("errorLine=")
                        .append(executeInfo.getErrorLine()).append(",").append("emptyLine=")
                        .append(executeInfo.getEmptyLine()).append(",fileBegin=").append(executeInfo.getFileBegin())
                        .append(",fileLength=").append(executeInfo.getFileLength());
                    // "hub:" inputs track a source cursor; advance it from the result.
                    if(jobTask.getInput().startsWith("hub:")) {
                        jobTask.setJobSourceTimeStamp(executeInfo.getTimestamp());
                        job.updateCursor(jobTask.getUrl(), executeInfo.getFileBegin(), executeInfo.getFileLength(), executeInfo.getTimestamp());
                    }
                }
                else
                    logger.error(new StringBuilder().append("taskId : ").
                        append(jobTask.getTaskId()).append(" executeInfo is null!").toString());
                ReportUtil.clusterLog(log.toString());
                // Mirror the metrics line into ZooKeeper when configured.
                if (StringUtils.isNotEmpty(config.getZkServer()) && zk != null)
                {
                    try
                    {
                        ZKUtil.updateOrCreateNode(zk,new StringBuilder()
                            .append(ZKUtil.getGroupMasterZKPath(config.getGroupId()))
                            .append("/").append(config.getMasterName())
                            .append("/runtime/").append(job.getEpoch())
                            .append("/").append(jobTask.getJobName())
                            .append("/").append(jobTask.getTaskId()).toString(),log.toString().getBytes("UTF-8"));
                    }
                    catch(Exception ex)
                    {
                        logger.error("log to zk error!",ex);
                    }
                }
            }
        }
        // Optionally ack asynchronously to keep the jobManager event loop snappy.
        if (config.isUseAsynModeToSendResponse()) {
            final String sequence = jobResponseEvent.getSequence();
            final Object channel = jobResponseEvent.getChannel();
            eventProcessThreadPool.execute(new Runnable() {
                public void run() {
                    try {
                        masterNode.echoSendJobTaskResults(sequence, "success", channel);
                    }
                    catch (Throwable e) {
                        logger.error(e);
                    }
                }
            });
        }
        else
            masterNode.echoSendJobTaskResults(jobResponseEvent.getSequence(), "success", jobResponseEvent.getChannel());
    }
@Override
public void exportJobData(String jobName) {
if (jobs.containsKey(jobName)) {
jobExporter.exportEntryData(jobs.get(jobName));
}
else {
logger.error("exportJobData do nothing, jobName " + jobName + " not exist!");
}
}
@Override
public void loadJobData(String jobName) {
if (jobs.containsKey(jobName)) {
jobExporter.loadEntryData(jobs.get(jobName));
}
else {
logger.error("exportJobData do nothing, jobName " + jobName + " not exist!");
}
}
/**
* 从某一个备份载入job的临时数据开始恢复
*
* @param jobName
* @param epoch
*/
@Override
public void loadJobBackupData(String jobName, String bckPrefix) {
if (jobs.containsKey(jobName)) {
jobExporter.loadJobBackupData(jobs.get(jobName), bckPrefix);
}
else {
logger.error("loadJobBackupData do nothing, jobName " + jobName + " not exist!");
}
}
@Override
public void loadJobDataToTmp(String jobName) {
if (jobs.containsKey(jobName)) {
jobExporter.loadEntryDataToTmp(jobs.get(jobName));
}
else {
logger.error("exportJobData do nothing, jobName " + jobName + " not exist!");
}
}
@Override
public void clearJobData(String jobName) {
Job job = jobs.get(jobName);
if (job != null) {
job.getJobResult().clear();
if (logger.isWarnEnabled())
logger.warn("clear job :" + job.getJobName() + " data.");
}
}
@Override
public synchronized void checkJobStatus() throws AnalysisException {
// 通过外部事件激发重新载入配置
if (jobBuilder.isNeedRebuild()) {
if(logger.isInfoEnabled()) {
logger.info("check job status need to rebuild");
}
jobs = jobBuilder.rebuild(jobs);
if (jobs == null || (jobs != null && jobs.size() == 0))
throw new AnalysisException("jobs should not be empty!");
}
try {
if(this.config.getDispatchMaster())
checkTaskStatus();
} catch (Throwable e) {
logger.error("checkTaskStatus Error", e);
}
// 合并任务,并导出报表
try {
mergeAndExportJobs();
} catch (Throwable e) {
logger.error("mergeAndExport Error", e);
}
//任务全部完成并且没有新加任务的情况下,休息1s
for(Job job : jobs.values()) {
if(!job.isExported().get() || job.getRebuildTag() == 2) {
return;
} else {
try {
Thread.sleep(1000);
}
catch (InterruptedException e) {
logger.error(e);
}
}
}
// 打点观察Direct Memory区域的大小
try {
Class<?> c = Class.forName("java.nio.Bits");
Field maxMemory = c.getDeclaredField("maxMemory");
maxMemory.setAccessible(true);
Field reservedMemory = c.getDeclaredField("reservedMemory");
reservedMemory.setAccessible(true);
synchronized (c) {
Long maxMemoryValue = (Long) maxMemory.get(null);
Long reservedMemoryValue = (Long) reservedMemory.get(null);
if (logger.isInfoEnabled()) {
logger.info("now the maxMemory is " + String.valueOf(maxMemoryValue)
+ " and the reservedMemory is " + String.valueOf(reservedMemoryValue));
}
}
}
catch (Throwable e) {
logger.error("trying to get java.nio.Bits class failed");
}
}
// 重新增加任务到任务池中
protected void addJobsToPool() {
for (Job job : jobs.values()) {
List<JobTask> tasks = job.getJobTasks();
for (JobTask task : tasks) {
jobTaskPool.put(task.getTaskId(), task);
statusPool.put(task.getTaskId(), task.getStatus());
undoTaskQueue.offer(task);
}
if(jobTaskResultsQueuePool.get(job.getJobName()) == null)
jobTaskResultsQueuePool.put(job.getJobName(), new LinkedBlockingQueue<JobTaskResult>());
if(branchResultQueuePool.get(job.getJobName()) == null)
branchResultQueuePool.put(job.getJobName(), new LinkedBlockingQueue<JobMergedResult>());
}
}
    // Merge/export driver plus the job-reset check.
    // All jobs are polled together, which wastes some master resources; the
    // original author suggested improving it via either:
    // 1. listeners on job properties (Listener pattern), or
    // 2. the Observer pattern.
    protected void mergeAndExportJobs() {
        Iterator<Map.Entry<String, Job>> iter = jobs.entrySet().iterator();
        while(iter.hasNext()) {
            Job job = iter.next().getValue();
            // rebuildTag == 2: job is waiting for a rebuild; do that and move on.
            if(job.getRebuildTag() == 2) {
                job.rebuild(0, null, this);
                continue;
            }
            if (!job.getJobTimeOut().get())
            {
                // Job still within its time budget: kick off an async branch merge
                // when there are completed tasks to fold in.
                if (!job.isMerging().get() && job.needMerge()) {
                    logger.warn("job " + job.getJobName()
                            + " complete tasks:" + job.getCompletedTaskCount().get() + ", merged tasks :" + job.getMergedTaskCount().get());
                    final Job j = job;
                    final BlockingQueue<JobMergedResult> branchResultQueue = branchResultQueuePool.get(j.getJobName());
                    final BlockingQueue<JobTaskResult> jobTaskResultsQueue = jobTaskResultsQueuePool.get(j.getJobName());
                    // compareAndSet guards against launching two merges for one job.
                    if (j.isMerging().compareAndSet(false, true))
                        eventProcessThreadPool.execute(new Runnable() {
                            public void run() {
                                try {
                                    jobResultMerger.merge(j, branchResultQueue, jobTaskResultsQueue, true);
                                } catch (Throwable e) {
                                    logger.error(e);
                                }
                                finally {
                                    j.isMerging().set(false);
                                }
                            }
                        });
                }
            }
            else
            {
                // Job timed out: attempt one final trunk merge. The tryLock checks
                // whether a trunk-merge thread is still running; if none, we can do
                // the last merge synchronously and mark the job merged.
                boolean gotIt = job.getTrunkLock().writeLock().tryLock();
                if (gotIt)
                {
                    try
                    { if(!job.isMerged().get())
                        {
                            List<Map<String, Map<String, Object>>> mergeResults = new ArrayList<Map<String, Map<String, Object>>>();
                            new MergeJobOperation(job,0,mergeResults,config,branchResultQueuePool.get(job.getJobName())).run();
                            job.isMerged().set(true);
                            logger.warn("job is timeout, last merge trunk success!");
                        }
                    }
                    finally
                    {
                        job.getTrunkLock().writeLock().unlock();
                    }
                }
            }
            // Export the job's report when it is ready and not already exporting.
            if (!job.isExporting().get() && job.needExport()) {
                final Job j = job;
                if (j.isExporting().compareAndSet(false, true))
                    eventProcessThreadPool.execute(new Runnable() {
                        public void run() {
                            try {
                                // Runs on a worker thread but in blocking style.
                                jobExporter.exportReport(j, false);
                                j.isExported().set(true);
                            } catch (Throwable e) {
                                logger.error(e);
                            }
                            finally {
                                j.isExporting().set(false);
                            }
                            // Decide whether to export/clear the trunk; kept outside the
                            // try so it never blocks the next processing round.
                            exportOrCleanTrunk(j);
                        }
                    });
            }
            // Check elapsed processing time; sets the job's timeout flag if exceeded.
            if(this.config.getDispatchMaster())
                job.checkJobTimeOut();
            // Does the job need to be reset for a new round?
            if (job.needReset() || (!this.config.getDispatchMaster() && job.isExported().get()) ) {
                if(logger.isWarnEnabled())
                    logger.warn("job " + job.getJobName() + " be reset now.");
                // Check whether the job must be rebuilt first
                // (-1: rebuild and remove from the map; 1: rebuild in place).
                if(job.getRebuildTag() == -1) {
                    job.rebuild(0, null, this);
                    iter.remove();
                }
                if(job.getRebuildTag() == 1) {
                    job.rebuild(0, null, this);
                }
                StringBuilder sb = new StringBuilder(ReportUtil.MASTER_LOG).append(",timeStamp=")
                    .append(System.currentTimeMillis()).append(",epoch=");
                sb.append(job.getEpoch()).append(",jobName=")
                    .append(job.getJobName()).append(",timeConsume=")
                    .append(System.currentTimeMillis() - job.getStartTime()).append(",jobMergeTime=")
                    .append(job.getJobMergeTime().get()).append(",jobExportTime=")
                    .append(job.getJobExportTime()).append(",taskCount=")
                    .append(job.getTaskCount()).append(",completedTaskCount=")
                    .append(job.getCompletedTaskCount().get()).append(",mergedTaskCount=")
                    .append(job.getMergedTaskCount().get()).append(",jobMergeBranchCount=")
                    .append(job.getJobMergeBranchCount().get());
                ReportUtil.clusterLog(sb.toString());
                // Mirror the round summary into ZooKeeper when configured.
                if (StringUtils.isNotEmpty(config.getZkServer()) && zk != null)
                {
                    try
                    {
                        ZKUtil.updateOrCreateNode(zk,new StringBuilder()
                            .append(ZKUtil.getGroupMasterZKPath(config.getGroupId()))
                            .append("/").append(config.getMasterName())
                            .append("/runtime/").append(job.getEpoch())
                            .append("/").append(job.getJobName()).toString(),sb.toString().getBytes("UTF-8"));
                    }
                    catch(Exception ex)
                    {
                        logger.error("log to zk error!",ex);
                    }
                }
                job.reset(this);
                if (logger.isInfoEnabled()) {
                    sb = new StringBuilder("jobManager:{jobs:").append(jobs.size()).append(
                        ",jobTaskPool:" + jobTaskPool.size());
                    sb.append(",statusPool:").append(statusPool.size()).append(",undoTasks:")
                        .append(undoTaskQueue.size()).append("}");
                    logger.info(sb.toString());
                }
                // Re-seed the status pool from the freshly reset tasks.
                List<JobTask> tasks = job.getJobTasks();
                for (JobTask task : tasks) {
                    statusPool.put(task.getTaskId(), task.getStatus());
                }
            }
        }
    }
/**
* 在导出数据以后,判断是否需要清空主干,是否需要导出主干
*
* @param job
*/
protected void exportOrCleanTrunk(Job job) {
boolean needToSetJobResultNull = false;
// 判断是否到了报表的有效时间段,支持小时,日,月三种方式
if (job.getJobConfig().getReportPeriodDefine().equals(AnalysisConstants.REPORT_PERIOD_DAY)) {
Calendar calendar = Calendar.getInstance();
int now = calendar.get(Calendar.DAY_OF_MONTH);
if (job.getReportPeriodFlag() != -1 && now != job.getReportPeriodFlag())
needToSetJobResultNull = true;
job.setReportPeriodFlag(now);
}
else {
if (job.getJobConfig().getReportPeriodDefine().equals(AnalysisConstants.REPORT_PERIOD_HOUR)) {
Calendar calendar = Calendar.getInstance();
int now = calendar.get(Calendar.HOUR_OF_DAY);
if (job.getReportPeriodFlag() != -1 && now != job.getReportPeriodFlag())
needToSetJobResultNull = true;
job.setReportPeriodFlag(now);
}
else {
if (job.getJobConfig().getReportPeriodDefine().equals(AnalysisConstants.REPORT_PERIOD_MONTH)) {
Calendar calendar = Calendar.getInstance();
int now = calendar.get(Calendar.MONTH);
if (job.getReportPeriodFlag() != -1 && now != job.getReportPeriodFlag())
needToSetJobResultNull = true;
job.setReportPeriodFlag(now);
}
}
}
if (needToSetJobResultNull) {
job.setJobResult(null);
job.getEpoch().set(0);
// 删除临时文件,防止重复载入使得清空不生效
if (config.getSaveTmpResultToFile()) {
JobDataOperation jobDataOperation =
new JobDataOperation(job, AnalysisConstants.JOBMANAGER_EVENT_DEL_DATAFILE, this.config);
jobDataOperation.run();
}
if(logger.isWarnEnabled())
logger.warn("job " + job.getJobName() + " report data be reset.it's a new start. ");
}
// 清除主干数据,到时候自然会载入
if (config.getSaveTmpResultToFile() && (job.getJobConfig().getSaveTmpResultToFile() == null || job.getJobConfig().getSaveTmpResultToFile())) {
logger.warn("@disk2Mem mode: start " + job.getJobName() + " store trunk to disk now .");
JobDataOperation jobDataOperation =
new JobDataOperation(job, AnalysisConstants.JOBMANAGER_EVENT_SETNULL_EXPORTDATA, this.config);
jobDataOperation.run();
}
else {
if (job.getLastExportTime() == 0
|| System.currentTimeMillis() - job.getLastExportTime() >= config.getExportInterval() || stopped) {
logger.warn("export job: " + job.getJobName() + " trunk to disk.");
JobDataOperation jobDataOperation =
new JobDataOperation(job, AnalysisConstants.JOBMANAGER_EVENT_EXPORTDATA, this.config);
jobDataOperation.run();
}
}
}
// 重置在指定时间内未完成的任务
protected void checkTaskStatus() {
Iterator<String> taskIds = statusPool.keySet().iterator();
while (taskIds.hasNext()) {
String taskId = taskIds.next();
JobTaskStatus taskStatus = statusPool.get(taskId);
JobTask jobTask = jobTaskPool.get(taskId);
if (taskStatus == JobTaskStatus.DOING && jobTask.getStartTime() != 0
&& System.currentTimeMillis() - jobTask.getStartTime() >= jobTask.getTaskRecycleTime() * 1000) {
if (statusPool.replace(taskId, JobTaskStatus.DOING, JobTaskStatus.UNDO)) {
jobTask.setStatus(JobTaskStatus.UNDO);
undoTaskQueue.offer(jobTask);
jobTask.getRecycleCounter().incrementAndGet();
if (logger.isWarnEnabled())
logger.warn("Task : " + jobTask.getTaskId() + " can't complete in time, it be recycle.");
}
}
}
}
    /** @return the master configuration */
    @Override
    public MasterConfig getConfig() {
        return config;
    }
    /** @param config the master configuration to use */
    @Override
    public void setConfig(MasterConfig config) {
        this.config = config;
    }
    /** @return the managed jobs, keyed by job name */
    @Override
    public Map<String, Job> getJobs() {
        return jobs;
    }
    /** @param jobs the managed jobs, keyed by job name */
    @Override
    public void setJobs(Map<String, Job> jobs) {
        this.jobs = jobs;
    }
    /** @return the job builder */
    @Override
    public IJobBuilder getJobBuilder() {
        return jobBuilder;
    }
    /** @param jobBuilder the job builder to use */
    @Override
    public void setJobBuilder(IJobBuilder jobBuilder) {
        this.jobBuilder = jobBuilder;
    }
    /** @return the job exporter */
    @Override
    public IJobExporter getJobExporter() {
        return jobExporter;
    }
    /** @param jobExporter the job exporter to use */
    @Override
    public void setJobExporter(IJobExporter jobExporter) {
        this.jobExporter = jobExporter;
    }
    /** @return the job result merger */
    @Override
    public IJobResultMerger getJobResultMerger() {
        return jobResultMerger;
    }
    /** @param jobResultMerger the job result merger to use */
    @Override
    public void setJobResultMerger(IJobResultMerger jobResultMerger) {
        this.jobResultMerger = jobResultMerger;
    }
    /** @param masterNode the owning master node used to echo responses */
    @Override
    public void setMasterNode(MasterNode masterNode) {
        this.masterNode = masterNode;
    }
    /** @return per-job queues of slave task results awaiting merge */
    public Map<String, BlockingQueue<JobTaskResult>> getJobTaskResultsQueuePool() {
        return jobTaskResultsQueuePool;
    }
    /** @param jobTaskResultsQueuePool per-job queues of slave task results */
    public void setJobTaskResultsQueuePool(Map<String, BlockingQueue<JobTaskResult>> jobTaskResultsQueuePool) {
        this.jobTaskResultsQueuePool = jobTaskResultsQueuePool;
    }
    /** @return per-job queues of intermediate merged results */
    public Map<String, BlockingQueue<JobMergedResult>> getBranchResultQueuePool() {
        return branchResultQueuePool;
    }
    /** @param branchResultQueuePool per-job queues of intermediate merged results */
    public void setBranchResultQueuePool(Map<String, BlockingQueue<JobMergedResult>> branchResultQueuePool) {
        this.branchResultQueuePool = branchResultQueuePool;
    }
    /**
     * @return the jobTaskPool
     */
    public ConcurrentMap<String, JobTask> getJobTaskPool() {
        return jobTaskPool;
    }
    /**
     * @return the statusPool
     */
    public ConcurrentMap<String, JobTaskStatus> getStatusPool() {
        return statusPool;
    }
    /**
     * @return the undoTaskQueue
     */
    public BlockingQueue<JobTask> getUndoTaskQueue() {
        return undoTaskQueue;
    }
}