public class JobMaster extends FencedRpcEndpoint<JobMasterId> implements JobMasterGateway
JobMaster implementation. The job master is responsible for the execution of a single JobGraph.
It offers the following methods as part of its rpc interface to interact with the JobMaster remotely:
updateTaskExecutionState(org.apache.flink.runtime.taskmanager.TaskExecutionState)
updates the task execution state for a
given task.
RpcEndpoint.MainThreadExecutor
Modifier and Type | Field and Description |
---|---|
static String |
ARCHIVE_NAME |
static String |
JOB_MANAGER_NAME
Default names for Flink's distributed components
|
log, rpcServer
Constructor and Description |
---|
JobMaster(RpcService rpcService,
ResourceID resourceId,
JobGraph jobGraph,
Configuration configuration,
HighAvailabilityServices highAvailabilityService,
HeartbeatServices heartbeatServices,
ScheduledExecutorService executor,
BlobServer blobServer,
BlobLibraryCacheManager libraryCacheManager,
RestartStrategyFactory restartStrategyFactory,
Time rpcAskTimeout,
JobManagerMetricGroup jobManagerMetricGroup,
OnCompletionActions jobCompletionActions,
FatalErrorHandler errorHandler,
ClassLoader userCodeLoader) |
Modifier and Type | Method and Description |
---|---|
void |
acknowledgeCheckpoint(JobID jobID,
ExecutionAttemptID executionAttemptID,
long checkpointId,
CheckpointMetrics checkpointMetrics,
TaskStateSnapshot checkpointState) |
CompletableFuture<Acknowledge> |
cancel(Time timeout)
Cancels the currently executed job.
|
void |
declineCheckpoint(JobID jobID,
ExecutionAttemptID executionAttemptID,
long checkpointID,
Throwable reason) |
void |
disconnectResourceManager(ResourceManagerId resourceManagerId,
Exception cause)
Disconnects the resource manager from the job manager because of the given cause.
|
void |
disconnectTaskManager(ResourceID resourceID,
Exception cause)
Disconnects the given
TaskExecutor from the
JobMaster . |
void |
failSlot(ResourceID taskManagerId,
AllocationID allocationId,
Exception cause)
Fails the slot with the given allocation id and cause.
|
void |
heartbeatFromResourceManager(ResourceID resourceID)
Sends heartbeat request from the resource manager
|
void |
heartbeatFromTaskManager(ResourceID resourceID)
Sends the heartbeat to job manager from task manager
|
CompletableFuture<KvStateLocation> |
lookupKvStateLocation(String registrationName)
Requests a
KvStateLocation for the specified InternalKvState registration name. |
void |
notifyKvStateRegistered(JobVertexID jobVertexId,
KeyGroupRange keyGroupRange,
String registrationName,
KvStateID kvStateId,
InetSocketAddress kvStateServerAddress)
Notifies that queryable state has been registered.
|
void |
notifyKvStateUnregistered(JobVertexID jobVertexId,
KeyGroupRange keyGroupRange,
String registrationName)
Notifies that queryable state has been unregistered.
|
CompletableFuture<Collection<SlotOffer>> |
offerSlots(ResourceID taskManagerId,
Iterable<SlotOffer> slots,
Time timeout)
Offers the given slots to the job manager.
|
void |
postStop()
Suspend the job and shutdown all other services including rpc.
|
CompletableFuture<RegistrationResponse> |
registerTaskManager(String taskManagerRpcAddress,
TaskManagerLocation taskManagerLocation,
Time timeout)
Registers the task manager at the job manager.
|
CompletableFuture<AccessExecutionGraph> |
requestArchivedExecutionGraph(Time timeout)
Request the
ArchivedExecutionGraph of the currently executed job. |
CompletableFuture<ClassloadingProps> |
requestClassloadingProps()
Request the classloading props of this job.
|
CompletableFuture<JobDetails> |
requestJobDetails(Time timeout)
Request the details of the executed job.
|
CompletableFuture<JobStatus> |
requestJobStatus(Time timeout)
Requests the current job status.
|
CompletableFuture<SerializedInputSplit> |
requestNextInputSplit(JobVertexID vertexID,
ExecutionAttemptID executionAttempt)
Requests the next input split for the
ExecutionJobVertex . |
CompletableFuture<ExecutionState> |
requestPartitionState(IntermediateDataSetID intermediateResultId,
ResultPartitionID resultPartitionId)
Requests the current state of the partition.
|
CompletableFuture<Acknowledge> |
scheduleOrUpdateConsumers(ResultPartitionID partitionID,
Time timeout)
Notifies the JobManager about available data for a produced partition.
|
void |
start()
Starts the rpc endpoint.
|
CompletableFuture<Acknowledge> |
start(JobMasterId newJobMasterId,
Time timeout)
Start the rpc service and begin to run the job.
|
CompletableFuture<Acknowledge> |
stop(Time timeout)
Cancel the currently executed job.
|
CompletableFuture<Acknowledge> |
suspend(Throwable cause,
Time timeout)
Suspends the job: all running tasks will be cancelled, and communication with other components
will be disposed.
|
CompletableFuture<Acknowledge> |
updateTaskExecutionState(TaskExecutionState taskExecutionState)
Updates the task execution state for a given task.
|
callAsyncWithoutFencing, getFencingToken, getMainThreadExecutor, runAsyncWithoutFencing, setFencingToken
callAsync, getAddress, getEndpointId, getHostname, getRpcService, getSelfGateway, getTerminationFuture, runAsync, scheduleRunAsync, scheduleRunAsync, shutDown, stop, validateRunsInMainThread
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
getFencingToken
getAddress, getHostname
public static final String JOB_MANAGER_NAME
public static final String ARCHIVE_NAME
public JobMaster(RpcService rpcService, ResourceID resourceId, JobGraph jobGraph, Configuration configuration, HighAvailabilityServices highAvailabilityService, HeartbeatServices heartbeatServices, ScheduledExecutorService executor, BlobServer blobServer, BlobLibraryCacheManager libraryCacheManager, RestartStrategyFactory restartStrategyFactory, Time rpcAskTimeout, @Nullable JobManagerMetricGroup jobManagerMetricGroup, OnCompletionActions jobCompletionActions, FatalErrorHandler errorHandler, ClassLoader userCodeLoader) throws Exception
Exception
public void start()
RpcEndpoint
start
in class RpcEndpoint
public CompletableFuture<Acknowledge> start(JobMasterId newJobMasterId, Time timeout) throws Exception
newJobMasterId
- The necessary fencing token to run the job
timeout
- for the operation
Exception
public CompletableFuture<Acknowledge> suspend(Throwable cause, Time timeout)
Mostly, the job is suspended because leadership has been revoked; one can restart this job by
calling the start(JobMasterId, Time)
method once leadership has been regained.
This method is executed asynchronously
cause
- The reason why this job has been suspended.
timeout
- for this operation
public void postStop() throws Exception
postStop
in class RpcEndpoint
Exception
- if an error occurs. The exception is returned as the result of the termination future.
public CompletableFuture<Acknowledge> cancel(Time timeout)
JobMasterGateway
cancel
in interface JobMasterGateway
timeout
- of this operation
public CompletableFuture<Acknowledge> stop(Time timeout)
JobMasterGateway
stop
in interface JobMasterGateway
timeout
- of this operation
public CompletableFuture<Acknowledge> updateTaskExecutionState(TaskExecutionState taskExecutionState)
updateTaskExecutionState
in interface JobMasterGateway
taskExecutionState
- New task execution state for a given task
public CompletableFuture<SerializedInputSplit> requestNextInputSplit(JobVertexID vertexID, ExecutionAttemptID executionAttempt)
JobMasterGateway
Requests the next input split for the ExecutionJobVertex.
The next input split is sent back to the sender as a
SerializedInputSplit
message.
requestNextInputSplit
in interface JobMasterGateway
vertexID
- The job vertex id
executionAttempt
- The execution attempt id
public CompletableFuture<ExecutionState> requestPartitionState(IntermediateDataSetID intermediateResultId, ResultPartitionID resultPartitionId)
JobMasterGateway
requestPartitionState
in interface JobMasterGateway
intermediateResultId
- The execution attempt ID of the task requesting the partition state.
resultPartitionId
- The partition ID of the partition to request the state of.
public CompletableFuture<Acknowledge> scheduleOrUpdateConsumers(ResultPartitionID partitionID, Time timeout)
JobMasterGateway
There is a call to this method for each ExecutionVertex
instance once per produced
ResultPartition
instance, either when first producing data (for pipelined executions)
or when all data has been produced (for staged executions).
The JobManager then can decide when to schedule the partition consumers of the given session.
scheduleOrUpdateConsumers
in interface JobMasterGateway
partitionID
- The partition which has already produced data
timeout
- before the rpc call fails
public void disconnectTaskManager(ResourceID resourceID, Exception cause)
JobMasterGateway
Disconnects the given TaskExecutor from the JobMaster.
disconnectTaskManager
in interface JobMasterGateway
resourceID
- identifying the TaskManager to disconnect
cause
- for the disconnection of the TaskManager
public void acknowledgeCheckpoint(JobID jobID, ExecutionAttemptID executionAttemptID, long checkpointId, CheckpointMetrics checkpointMetrics, TaskStateSnapshot checkpointState)
acknowledgeCheckpoint
in interface CheckpointCoordinatorGateway
public void declineCheckpoint(JobID jobID, ExecutionAttemptID executionAttemptID, long checkpointID, Throwable reason)
declineCheckpoint
in interface CheckpointCoordinatorGateway
public CompletableFuture<KvStateLocation> lookupKvStateLocation(String registrationName)
JobMasterGateway
Requests a KvStateLocation
for the specified InternalKvState
registration name.
lookupKvStateLocation
in interface JobMasterGateway
registrationName
- Name under which the KvState has been registered.
InternalKvState
location
public void notifyKvStateRegistered(JobVertexID jobVertexId, KeyGroupRange keyGroupRange, String registrationName, KvStateID kvStateId, InetSocketAddress kvStateServerAddress)
JobMasterGateway
notifyKvStateRegistered
in interface JobMasterGateway
jobVertexId
- JobVertexID the KvState instance belongs to.
keyGroupRange
- Key group range the KvState instance belongs to.
registrationName
- Name under which the KvState has been registered.
kvStateId
- ID of the registered KvState instance.
kvStateServerAddress
- Server address where to find the KvState instance.
public void notifyKvStateUnregistered(JobVertexID jobVertexId, KeyGroupRange keyGroupRange, String registrationName)
JobMasterGateway
notifyKvStateUnregistered
in interface JobMasterGateway
jobVertexId
- JobVertexID the KvState instance belongs to.
keyGroupRange
- Key group range the KvState instance belongs to.
registrationName
- Name under which the KvState has been registered.
public CompletableFuture<ClassloadingProps> requestClassloadingProps()
JobMasterGateway
requestClassloadingProps
in interface JobMasterGateway
public CompletableFuture<Collection<SlotOffer>> offerSlots(ResourceID taskManagerId, Iterable<SlotOffer> slots, Time timeout)
JobMasterGateway
offerSlots
in interface JobMasterGateway
taskManagerId
- identifying the task manager
slots
- to offer to the job manager
timeout
- for the rpc call
public void failSlot(ResourceID taskManagerId, AllocationID allocationId, Exception cause)
JobMasterGateway
failSlot
in interface JobMasterGateway
taskManagerId
- identifying the task manager
allocationId
- identifying the slot to fail
cause
- of the failing
public CompletableFuture<RegistrationResponse> registerTaskManager(String taskManagerRpcAddress, TaskManagerLocation taskManagerLocation, Time timeout)
JobMasterGateway
registerTaskManager
in interface JobMasterGateway
taskManagerRpcAddress
- the rpc address of the task manager
taskManagerLocation
- location of the task manager
timeout
- for the rpc call
public void disconnectResourceManager(ResourceManagerId resourceManagerId, Exception cause)
JobMasterGateway
disconnectResourceManager
in interface JobMasterGateway
resourceManagerId
- identifying the resource manager leader id
cause
- of the disconnect
public void heartbeatFromTaskManager(ResourceID resourceID)
JobMasterGateway
heartbeatFromTaskManager
in interface JobMasterGateway
resourceID
- unique id of the task manager
public void heartbeatFromResourceManager(ResourceID resourceID)
JobMasterGateway
heartbeatFromResourceManager
in interface JobMasterGateway
resourceID
- unique id of the resource manager
public CompletableFuture<JobDetails> requestJobDetails(Time timeout)
JobMasterGateway
requestJobDetails
in interface JobMasterGateway
timeout
- for the rpc call
public CompletableFuture<AccessExecutionGraph> requestArchivedExecutionGraph(Time timeout)
JobMasterGateway
Request the ArchivedExecutionGraph
of the currently executed job.
requestArchivedExecutionGraph
in interface JobMasterGateway
timeout
- for the rpc call
public CompletableFuture<JobStatus> requestJobStatus(Time timeout)
JobMasterGateway
requestJobStatus
in interface JobMasterGateway
timeout
- for the rpc call
Copyright © 2014–2018 The Apache Software Foundation. All rights reserved.