/**
 * Performs the checkpoint of this task instance.
 *
 * <p>According to the original notes, this method is reached in two ways:
 * <ol>
 *   <li>the CheckpointCoordinator periodically triggers a checkpoint on the source tasks;</li>
 *   <li>a downstream task triggers its own checkpoint once the CheckpointBarriers it
 *       receives have been aligned.</li>
 * </ol>
 * Either way, this is the method that starts persisting the state data.
 *
 * <p>Note the ordering: the barrier is broadcast to the downstream tasks first, and only
 * afterwards does this task snapshot its own state.
 *
 * @param checkpointMetaData   id and timestamp of the checkpoint
 * @param checkpointOptions    checkpoint type and target location
 * @param checkpointMetrics    metrics object handed on to the state snapshot
 * @param advanceToEndOfTime   for synchronous checkpoint types only: whether to call
 *                             {@code advanceToEndOfEventTime()} before snapshotting
 * @return {@code true} if the task was running and the checkpoint was performed,
 *         {@code false} if the task was not running and a cancel marker was broadcast instead
 * @throws Exception whatever the action passed to {@code actionExecutor} throws
 */
private boolean performCheckpoint(
        CheckpointMetaData checkpointMetaData,
        CheckpointOptions checkpointOptions,
        CheckpointMetrics checkpointMetrics,
        boolean advanceToEndOfTime) throws Exception {

    LOG.debug("Starting checkpoint ({}) {} on task {}",
        checkpointMetaData.getCheckpointId(), checkpointOptions.getCheckpointType(), getName());

    final long checkpointId = checkpointMetaData.getCheckpointId();

    // Only a running task can take a checkpoint.
    if (isRunning) {
        // NOTE(review): the original note described this call as running on a thread pool,
        // asynchronously and without blocking record processing. How actionExecutor
        // schedules the action is not visible in this excerpt - confirm before relying on it.
        actionExecutor.runThrowing(() -> {
            if (checkpointOptions.getCheckpointType().isSynchronous()) {
                setSynchronousSavepointId(checkpointId);
                if (advanceToEndOfTime) {
                    advanceToEndOfEventTime();
                }
            }

            // Step 1: let every operator in the OperatorChain run its "pre-barrier" logic.
            operatorChain.prepareSnapshotPreBarrier(checkpointId);

            // Step 2: broadcast the CheckpointBarrier to the downstream tasks before
            // snapshotting our own state.
            operatorChain.broadcastCheckpointBarrier(
                checkpointId,
                checkpointMetaData.getTimestamp(),
                checkpointOptions);

            // Step 3 (core): snapshot the state of all StreamOperators in the OperatorChain.
            checkpointState(checkpointMetaData, checkpointOptions, checkpointMetrics);
        });
        return true;
    } else {
        // The task is not running, so it cannot take this checkpoint. Broadcast a
        // CancelCheckpointMarker through the record writer so that the receivers know
        // this checkpoint is cancelled.
        actionExecutor.runThrowing(() -> {
            final CancelCheckpointMarker message = new CancelCheckpointMarker(checkpointMetaData.getCheckpointId());
            recordWriter.broadcastEvent(message);
        });
        return false;
    }
}
StreamTask在执行Checkpoint的过程中,一方面将CheckpointBarrier事件发送给下游的节点,一方面让这个StreamTask内的OperatorChain中的所有的StreamOperator均执行“状态数据快照”操作。当然,如果Task实例并未正常运行,那就通过recordWriter将“CancelCheckpointMarker消息”广播给下游算子,让它们取消本次Checkpoint操作。
StreamTask会对OperatorChain内的所有StreamOperator,异步执行“状态数据快照”操作。由于CheckpointingOperation实例封装了Checkpoint执行的具体操作流程,以及执行Checkpoint操作需要用到的环境配置信息,因此“千斤重担”就落在了它身上,由CheckpointingOperation负责“快照操作”的指挥。
/**
 * Snapshots the state of all StreamOperators in this StreamTask's OperatorChain by
 * building a {@code CheckpointingOperation} and executing it.
 *
 * @param checkpointMetaData id and timestamp of the checkpoint
 * @param checkpointOptions  checkpoint type and target location
 * @param checkpointMetrics  metrics object passed on to the CheckpointingOperation
 * @throws Exception if resolving the storage location or executing the checkpoint fails
 */
private void checkpointState(
        CheckpointMetaData checkpointMetaData,
        CheckpointOptions checkpointOptions,
        CheckpointMetrics checkpointMetrics) throws Exception {

    // The CheckpointStreamFactory acts as a pointer to where the checkpoint data is stored.
    // Per the original note: a MemCheckpointStreamFactory, for example, accepts serialized
    // state up to a configured size limit.
    CheckpointStreamFactory storage = checkpointStorage.resolveCheckpointStorageLocation(
        checkpointMetaData.getCheckpointId(),
        checkpointOptions.getTargetLocation());

    // CheckpointingOperation bundles the checkpoint procedure together with the
    // context it needs (owner task, metadata, options, storage, metrics).
    CheckpointingOperation checkpointingOperation = new CheckpointingOperation(
        this,
        checkpointMetaData,
        checkpointOptions,
        storage,
        checkpointMetrics);

    // Core: actually run the checkpoint.
    checkpointingOperation.executeCheckpointing();
}
CheckpointingOperation指挥执行Checkpoint,有3个核心步骤:
/*** 真正地执行Checkpoint操作* 核心逻辑:OperatorSnapshotFutures表示为“状态快照持久化操作”,将其保存到Map集合中*/
public void executeCheckpointing() throws Exception {startSyncPartNano = System.nanoTime();try {// 遍历所有的StreamOperator算子for (StreamOperator> op : allOperators) {/*** 定义每个StreamOperator的“异步执行状态快照”操作,并将其注册给OperatorSnapshotFutures对象等待执行,* 按照“OperatorId:OperatorSnapshotFutures对象”的映射关系,保存到Map中。* 下面会由线程池中的Runnable任务来异步执行这些“操作”*/checkpointStreamOperator(op);}if (LOG.isDebugEnabled()) {LOG.debug("Finished synchronous checkpoints for checkpoint {} on task {}",checkpointMetaData.getCheckpointId(), owner.getName());}startAsyncPartNano = System.nanoTime();checkpointMetrics.setSyncDurationMillis((startAsyncPartNano - startSyncPartNano) / 1_000_000);// 创建Runnable任务:内含“装有每个StreamOperator的执行状态快照操作”所对应的Map集合AsyncCheckpointRunnable asyncCheckpointRunnable = new AsyncCheckpointRunnable(owner,operatorSnapshotsInProgress, // 存放了所有Operator的“状态快照”的结果--OperatorSnapshotFutures对象checkpointMetaData,checkpointMetrics,startAsyncPartNano);owner.cancelables.registerCloseable(asyncCheckpointRunnable);// 使用StreamTask内的ExecutorService(异步快照工作线程池),执行这个Runnable任务(容纳了每个StreamOperator的“快照持久化”的操作)// 目的:不影响数据流的正常处理owner.asyncOperationsThreadPool.execute(asyncCheckpointRunnable);if (LOG.isDebugEnabled()) {LOG.debug("{} - finished synchronous part of checkpoint {}. " +"Alignment duration: {} ms, snapshot duration {} ms",owner.getName(), checkpointMetaData.getCheckpointId(),checkpointMetrics.getAlignmentDurationNanos() / 1_000_000,checkpointMetrics.getSyncDurationMillis());}} catch (Exception ex) {for (OperatorSnapshotFutures operatorSnapshotResult : operatorSnapshotsInProgress.values()) {if (null != operatorSnapshotResult) {try {operatorSnapshotResult.cancel();} catch (Exception e) {LOG.warn("Could not properly cancel an operator snapshot result.", e);}}}if (LOG.isDebugEnabled()) {LOG.debug("{} - did NOT finish synchronous part of checkpoint {}. 
" +"Alignment duration: {} ms, snapshot duration {} ms",owner.getName(), checkpointMetaData.getCheckpointId(),checkpointMetrics.getAlignmentDurationNanos() / 1_000_000,checkpointMetrics.getSyncDurationMillis());}if (checkpointOptions.getCheckpointType().isSynchronous()) {throw ex;} else {owner.getEnvironment().declineCheckpoint(checkpointMetaData.getCheckpointId(), ex);}}
}
下面就看一下这核心的三板斧,到底是怎么抡的。
遍历所有的StreamOperator算子,将每个StreamOperator要执行“异步执行状态快照”的操作,注册到OperatorSnapshotFutures中等待执行。
/*** 将每个StreamOperator的“异步执行状态快照”的操作,注册给OperatorSnapshotFutures对象等待执行,* 按照“OperatorId:OperatorSnapshotFutures对象”的映射关系,保存到Map中。*/
@SuppressWarnings("deprecation")
private void checkpointStreamOperator(StreamOperator> op) throws Exception {if (null != op) {// 这里会调用StreamOperator#snapshotState()方法,将当前StreamOperator的状态快照操作,“封印”到OperatorSnapshotFutures对象中// tips:如果对Checkpoint过程由特殊逻辑要求,可以在StreamOperator的实现子类中,通过覆写CheckpointedFunction提供的“钩子”方法来满足OperatorSnapshotFutures snapshotInProgress = op.snapshotState(checkpointMetaData.getCheckpointId(),checkpointMetaData.getTimestamp(),checkpointOptions,storageLocation);// 将象征着“异步快照操作”的OperatorSnapshotFutures对象,保存到Map集合中,下一步会使用异步快照工作线程池ExecutorService,异步地执行算子的异步快照操作operatorSnapshotsInProgress.put(op.getOperatorID(), snapshotInProgress);}
}
首先是准备好OperatorSnapshotFutures对象,然后准备好执行状态快照需要用到的上下文。接着将“快照持久化操作”注册到OperatorSnapshotFutures中等待执行。本质就是将KeyedStateBackend、OperatorStateBackend的“快照持久化操作”set到OperatorSnapshotFutures中,后期线程池执行Runnable任务时会将其取出执行。
/*** StreamTask在执行Checkpoint对状态数据进行snapshot时,如果对Checkpoint过程有特殊逻辑要求,可以在StreamOperator的子类中通过覆写CheckpointedFunction接口定义的钩子方法实现。* 然后KeyedStateBackend、OperatorStateBackend的“快照持久化操作”会被set到OperatorSnapshotFutures中等待执行。* 然后会按照“OperatorId:OperatorSnapshotFutures”的映射关系,存到Map集合中。Map集合会被添加到Runnable任务的执行逻辑中。* 后面会有一个专门的异步快照工作线程池--ExecutorService,执行这个Runnable任务*/
@Override
public final OperatorSnapshotFutures snapshotState(long checkpointId, long timestamp, CheckpointOptions checkpointOptions,CheckpointStreamFactory factory) throws Exception {// KeyGroupRange是状态后端在对键控状态处理时,划定key的索引范围的。// 如果keyedStateBackend不为null,就直接获取KeyGroupRange。否则,就new一个KeyGroupRange出来KeyGroupRange keyGroupRange = null != keyedStateBackend ?keyedStateBackend.getKeyGroupRange() : KeyGroupRange.EMPTY_KEY_GROUP_RANGE;// 创建OperatorSnapshotFutures对象OperatorSnapshotFutures snapshotInProgress = new OperatorSnapshotFutures();// StateSnapshotContextSynchronousImpl用来存储“执行快照”的过程中需要用到的上下文信息StateSnapshotContextSynchronousImpl snapshotContext = new StateSnapshotContextSynchronousImpl(checkpointId,timestamp,factory,keyGroupRange,getContainingTask().getCancelables());try {// CheckpointedFunction定义了可以覆写的“钩子”方法,在StreamOperator的子类中(通过自定义Function)可以为其提供具体的实现逻辑,// 以满足Checkpoint时的特殊逻辑要求,例如:删除状态中的某些数据、添加一些特殊数据等。snapshotState(snapshotContext);// 包装OperatorSnapshotFutures对象:指定专门用于处理原生状态数据的快照操作snapshotInProgress.setKeyedStateRawFuture(snapshotContext.getKeyedStateStreamFuture());snapshotInProgress.setOperatorStateRawFuture(snapshotContext.getOperatorStateStreamFuture());// 把OperatorStateBackend的“快照持久化”操作包装到OperatorSnapshotFutures中,等待执行if (null != operatorStateBackend) {snapshotInProgress.setOperatorStateManagedFuture(operatorStateBackend.snapshot(checkpointId, timestamp, factory, checkpointOptions));}// 把KeyedStateBackend的“快照持久化”操作包装到OperatorSnapshotFutures中,等待执行if (null != keyedStateBackend) {snapshotInProgress.setKeyedStateManagedFuture(keyedStateBackend.snapshot(checkpointId, timestamp, factory, checkpointOptions));}} catch (Exception snapshotException) {try {snapshotInProgress.cancel();} catch (Exception e) {snapshotException.addSuppressed(e);}String snapshotFailMessage = "Could not complete snapshot " + checkpointId + " for operator " +getOperatorName() + ".";if (!getContainingTask().isCanceled()) {LOG.info(snapshotFailMessage, snapshotException);}try 
{snapshotContext.closeExceptionally();} catch (IOException e) {snapshotException.addSuppressed(e);}// 一旦上面做snapshot的过程中出现异常,就会往上层抛。TaskManager接到异常后,会取消所有Task任务并启动Job重启策略throw new CheckpointException(snapshotFailMessage, CheckpointFailureReason.CHECKPOINT_DECLINED, snapshotException);}// 此时,经过包装的OperatorSnapshotFutures对象,拥有一堆的RunnableFuture(等待执行的异步任务)return snapshotInProgress;
}
需要注意的是:如果对Checkpoint过程有特殊逻辑要求,可以在StreamOperator的子类中通过覆写CheckpointedFunction接口定义的钩子方法实现。
最后,每个StreamOperator对应的OperatorSnapshotFutures对象包装完成后,就会被保存到映射关系为:“OperatorId:OperatorSnapshotFutures对象”的Map集合中
AsyncCheckpointRunnable负责执行“快照持久化操作”,容纳所有的OperatorSnapshotFutures对象的Map集合都已经被保存到这个Runnable中了。
// Create the Runnable; it carries the map that holds each operator's OperatorSnapshotFutures.
AsyncCheckpointRunnable asyncCheckpointRunnable = new AsyncCheckpointRunnable(
    owner,
    operatorSnapshotsInProgress, // the OperatorSnapshotFutures of all operators
    checkpointMetaData,
    checkpointMetrics,
    startAsyncPartNano);

/**
 * Creates the Runnable that executes the snapshot persistence work.
 *
 * @param owner                       the task this checkpoint belongs to; must not be null
 * @param operatorSnapshotsInProgress snapshot futures per operator id; must not be null
 * @param checkpointMetaData          id and timestamp of the checkpoint; must not be null
 * @param checkpointMetrics           metrics object updated by this Runnable; must not be null
 * @param asyncStartNanos             {@code System.nanoTime()} value taken when the
 *                                    asynchronous part started
 */
AsyncCheckpointRunnable(
        StreamTask<?, ?> owner,
        Map<OperatorID, OperatorSnapshotFutures> operatorSnapshotsInProgress,
        CheckpointMetaData checkpointMetaData,
        CheckpointMetrics checkpointMetrics,
        long asyncStartNanos) {

    this.owner = Preconditions.checkNotNull(owner);
    this.operatorSnapshotsInProgress = Preconditions.checkNotNull(operatorSnapshotsInProgress);
    this.checkpointMetaData = Preconditions.checkNotNull(checkpointMetaData);
    this.checkpointMetrics = Preconditions.checkNotNull(checkpointMetrics);
    this.asyncStartNanos = asyncStartNanos;
}
当Runnable任务被执行时,就会调用它的run()方法。本质就是将Runnable任务中的OperatorSnapshotFutures保存的“快照持久化操作”取出来执行
/**
 * Walks over the {@code OperatorSnapshotFutures} of every operator, executes the snapshot
 * persistence work they hold, and reports the resulting state if this Runnable is still
 * in the RUNNING state.
 */
@Override
public void run() {
    // Set up the FileSystemSafetyNet for this thread; it is closed again in the finally block.
    FileSystemSafetyNet.initializeSafetyNetForThread();

    try {
        // Collects the state that is reported to the JobManager.
        TaskStateSnapshot jobManagerTaskOperatorSubtaskStates =
            new TaskStateSnapshot(operatorSnapshotsInProgress.size());

        // Collects the task-local state.
        TaskStateSnapshot localTaskOperatorSubtaskStates =
            new TaskStateSnapshot(operatorSnapshotsInProgress.size());

        for (Map.Entry<OperatorID, OperatorSnapshotFutures> entry : operatorSnapshotsInProgress.entrySet()) {
            OperatorID operatorID = entry.getKey();
            OperatorSnapshotFutures snapshotInProgress = entry.getValue();

            // finalize the async part of all by executing all snapshot runnables
            // (the OperatorSnapshotFinalizer constructor runs the keyed/operator state futures)
            OperatorSnapshotFinalizer finalizedSnapshots =
                new OperatorSnapshotFinalizer(snapshotInProgress);

            // Store the JobManager-owned and the task-local results separately.
            jobManagerTaskOperatorSubtaskStates.putSubtaskStateByOperatorID(
                operatorID,
                finalizedSnapshots.getJobManagerOwnedState());

            localTaskOperatorSubtaskStates.putSubtaskStateByOperatorID(
                operatorID,
                finalizedSnapshots.getTaskLocalState());
        }

        final long asyncEndNanos = System.nanoTime();
        final long asyncDurationMillis = (asyncEndNanos - asyncStartNanos) / 1_000_000L;

        // Record how long the asynchronous part took.
        checkpointMetrics.setAsyncDurationMillis(asyncDurationMillis);

        // Report only if the state can be moved from RUNNING to COMPLETED; if the
        // compare-and-set fails, this Runnable was closed in the meantime.
        if (asyncCheckpointState.compareAndSet(
                CheckpointingOperation.AsyncCheckpointState.RUNNING,
                CheckpointingOperation.AsyncCheckpointState.COMPLETED)) {

            reportCompletedSnapshotStates(
                jobManagerTaskOperatorSubtaskStates,
                localTaskOperatorSubtaskStates,
                asyncDurationMillis);
        } else {
            LOG.debug("{} - asynchronous part of checkpoint {} could not be completed because it was closed before.",
                owner.getName(),
                checkpointMetaData.getCheckpointId());
        }
    } catch (Exception e) {
        if (LOG.isDebugEnabled()) {
            LOG.debug("{} - asynchronous part of checkpoint {} could not be completed.",
                owner.getName(),
                checkpointMetaData.getCheckpointId(),
                e);
        }
        // Delegate failure handling.
        handleExecutionException(e);
    } finally {
        owner.cancelables.unregisterCloseable(this);
        FileSystemSafetyNet.closeSafetyNetAndGuardedResourcesForThread();
    }
}
OperatorSnapshotFinalizer会被用来(在构造方法中)执行所有算子所对应的(算子状态、键控状态的)“快照持久化操作”。执行完毕后,会向JobManager汇报Checkpoint的执行结果。
public OperatorSnapshotFinalizer(@Nonnull OperatorSnapshotFutures snapshotFutures) throws ExecutionException, InterruptedException {// 执行KeyedState的“快照持久化操作”SnapshotResult keyedManaged =FutureUtils.runIfNotDoneAndGet(snapshotFutures.getKeyedStateManagedFuture());SnapshotResult keyedRaw =FutureUtils.runIfNotDoneAndGet(snapshotFutures.getKeyedStateRawFuture());// 执行OperatorState的“快照持久化操作”SnapshotResult operatorManaged =FutureUtils.runIfNotDoneAndGet(snapshotFutures.getOperatorStateManagedFuture());SnapshotResult operatorRaw =FutureUtils.runIfNotDoneAndGet(snapshotFutures.getOperatorStateRawFuture());// 将Checkpoint数据保存到OperatorSubtaskState中jobManagerOwnedState = new OperatorSubtaskState(operatorManaged.getJobManagerOwnedSnapshot(),operatorRaw.getJobManagerOwnedSnapshot(),keyedManaged.getJobManagerOwnedSnapshot(),keyedRaw.getJobManagerOwnedSnapshot());taskLocalState = new OperatorSubtaskState(operatorManaged.getTaskLocalSnapshot(),operatorRaw.getTaskLocalSnapshot(),keyedManaged.getTaskLocalSnapshot(),keyedRaw.getTaskLocalSnapshot());
}
快照持久化操作的执行,依赖于StateBackend。以HeapKeyedStateBackend为例,若想执行快照持久化,SnapshotStrategy接口定义了对状态数据进行“状态快照持久化”的接口方法
/*** StreamOperator刚刚已经执行完了状态快照,接下来xxxStateBackend要做的就是基于SnapshotStrategy对状态数据进行“状态快照持久化”。* 具体的持久化方式,由SnapshotStrategy接口的实现子类提供具体的实现逻辑。* 该方法就是将持久化操作,注册到OperatorSnapshotFutures中等待执行。*/
@Nonnull
@Override
@SuppressWarnings("unchecked")
public RunnableFuture> snapshot(final long checkpointId,final long timestamp,@Nonnull final CheckpointStreamFactory streamFactory,@Nonnull CheckpointOptions checkpointOptions) throws IOException {long startTime = System.currentTimeMillis();// SnapshotStrategy接口定义了对状态数据进行“状态快照持久化”的接口方法final RunnableFuture> snapshotRunner =snapshotStrategy.snapshot(checkpointId, timestamp, streamFactory, checkpointOptions);snapshotStrategy.logSyncCompleted(streamFactory, startTime);return snapshotRunner;
}
SnapshotStrategy接口的实现子类为其提供了具体的实现逻辑:
/*** HeapSnapshotStrategy对SnapshotStrategy接口提供的“状态快照持久化”接口方法的具体实现逻辑*/
@Nonnull
@Override
public RunnableFuture> snapshot(long checkpointId,long timestamp,@Nonnull CheckpointStreamFactory primaryStreamFactory,@Nonnull CheckpointOptions checkpointOptions) throws IOException {// 省略部分代码...// 对KeyedState、OperatorState的状态快照持久化的处理逻辑processSnapshotMetaInfoForAllStates(metaInfoSnapshots,cowStateStableSnapshots,stateNamesToId,registeredKVStates,StateMetaInfoSnapshot.BackendStateType.KEY_VALUE);processSnapshotMetaInfoForAllStates(metaInfoSnapshots,cowStateStableSnapshots,stateNamesToId,registeredPQStates,StateMetaInfoSnapshot.BackendStateType.PRIORITY_QUEUE);// 省略部分代码...
}
在HeapSnapshotStrategy中会取出StateSnapshotRestore,来创建对应类型的状态快照
/*** 对KeyedState、OperatorState的状态快照持久化的处理逻辑*/
private void processSnapshotMetaInfoForAllStates(List metaInfoSnapshots,Map cowStateStableSnapshots,Map stateNamesToId,Map registeredStates,StateMetaInfoSnapshot.BackendStateType stateType) {for (Map.Entry kvState : registeredStates.entrySet()) {final StateUID stateUid = StateUID.of(kvState.getKey(), stateType);stateNamesToId.put(stateUid, stateNamesToId.size());// Value值:处理状态快照、状态恢复的接口StateSnapshotRestore state = kvState.getValue();if (null != state) {// 核心:(基于不同的StateSnapshotRestore)创建状态快照final StateSnapshot stateSnapshot = state.stateSnapshot();// 将状态快照保存到对应集合中,完成堆内存存储类型KvState的快照操作metaInfoSnapshots.add(stateSnapshot.getMetaInfoSnapshot());cowStateStableSnapshots.put(stateUid, stateSnapshot);}}
}