waveterm/pkg/jobcontroller/jobcontroller.go
2026-02-11 14:57:27 -08:00

1615 lines
45 KiB
Go

// Copyright 2025, Command Line Inc.
// SPDX-License-Identifier: Apache-2.0
package jobcontroller
import (
"context"
"encoding/base64"
"fmt"
"io"
"io/fs"
"log"
"strings"
"sync"
"time"
"github.com/google/uuid"
"github.com/wavetermdev/waveterm/pkg/blocklogger"
"github.com/wavetermdev/waveterm/pkg/filestore"
"github.com/wavetermdev/waveterm/pkg/panichandler"
"github.com/wavetermdev/waveterm/pkg/remote/conncontroller"
"github.com/wavetermdev/waveterm/pkg/streamclient"
"github.com/wavetermdev/waveterm/pkg/telemetry"
"github.com/wavetermdev/waveterm/pkg/telemetry/telemetrydata"
"github.com/wavetermdev/waveterm/pkg/util/ds"
"github.com/wavetermdev/waveterm/pkg/util/envutil"
"github.com/wavetermdev/waveterm/pkg/util/shellutil"
"github.com/wavetermdev/waveterm/pkg/util/utilfn"
"github.com/wavetermdev/waveterm/pkg/utilds"
"github.com/wavetermdev/waveterm/pkg/wavebase"
"github.com/wavetermdev/waveterm/pkg/wavejwt"
"github.com/wavetermdev/waveterm/pkg/waveobj"
"github.com/wavetermdev/waveterm/pkg/wconfig"
"github.com/wavetermdev/waveterm/pkg/wcore"
"github.com/wavetermdev/waveterm/pkg/wps"
"github.com/wavetermdev/waveterm/pkg/wshrpc"
"github.com/wavetermdev/waveterm/pkg/wshrpc/wshclient"
"github.com/wavetermdev/waveterm/pkg/wshutil"
"github.com/wavetermdev/waveterm/pkg/wstore"
"golang.org/x/sync/singleflight"
)
// DefaultTimeout is a two-second timeout value.
// NOTE(review): no use of this constant is visible in this part of the file;
// confirm which operations it is meant to bound.
const DefaultTimeout = 2 * time.Second
// Values stored in waveobj.Job.JobManagerStatus.
const (
	// JobManagerStatus_Init is assigned when StartJob inserts the job record,
	// before the remote start command has returned.
	JobManagerStatus_Init = "init"
	// JobManagerStatus_Running is assigned once the remote start command succeeds.
	JobManagerStatus_Running = "running"
	// JobManagerStatus_Done is assigned on startup error, termination, or when a
	// reconnect reports that the job manager is gone.
	JobManagerStatus_Done = "done"
)
// Values stored in waveobj.Job.JobManagerDoneReason (and reported in the
// "job:done" telemetry event) explaining why a job reached the done status.
const (
	// JobDoneReason_StartupError: the remote start command failed in StartJob.
	JobDoneReason_StartupError = "startuperror"
	// JobDoneReason_Gone: a reconnect attempt reported that the job manager no longer exists.
	JobDoneReason_Gone = "gone"
	// JobDoneReason_Terminated: the job manager was terminated via remoteTerminateJobManager.
	JobDoneReason_Terminated = "terminated"
)
// Per-job connection states tracked in the in-memory jobConnStates map.
// A job with no map entry is reported as JobConnStatus_Disconnected.
const (
	JobConnStatus_Disconnected = "disconnected"
	// NOTE(review): JobConnStatus_Connecting is not assigned anywhere in the
	// visible part of this file.
	JobConnStatus_Connecting = "connecting"
	JobConnStatus_Connected  = "connected"
)
// Job kinds accepted by StartJob (StartJobParams.JobKind); any other value is rejected.
const (
	JobKind_Shell = "shell"
	JobKind_Task  = "task"
)
// DefaultStreamRwnd (64 KiB) is the value passed to the stream broker when
// creating a job output stream reader.
const DefaultStreamRwnd = 64 * 1024

// MetaKey_TotalGap is the file-meta key read (via getMetaInt64) when computing
// the sequence number at which streaming restarts.
const MetaKey_TotalGap = "totalgap"

// JobOutputFileName is the name of the WaveFS file that receives job output.
const JobOutputFileName = "term"

// AutoReconnectDelay is how long attemptAutoReconnect sleeps before checking
// whether the underlying connection is still up.
const AutoReconnectDelay = 1 * time.Second

// AutoReconnectCooldown is the minimum time between auto-reconnect attempts
// for the same job (enforced by shouldAttemptAutoReconnect).
const AutoReconnectCooldown = 30 * time.Second
// connState tracks reconciliation progress for a single connection.
type connState struct {
	// actual is the most recent connected flag received in a conn-change event.
	actual bool
	// processed is the connected flag that reconcileConn last finished handling.
	processed bool
	// reconciling is true while a reconcileConn goroutine is running for this connection.
	reconciling bool
}
// connStateManager holds the per-connection reconcile state. The embedded
// mutex guards m.
type connStateManager struct {
	sync.Mutex
	// m maps connection name to its reconcile state.
	m map[string]*connState
	// reconcileCh wakes connReconcileWorker; senders use a non-blocking send.
	reconcileCh chan struct{}
}
// jobState bundles a lock with connection-related flags for a job.
// NOTE(review): this type is not referenced in the visible part of the file;
// confirm whether it is still in use.
type jobState struct {
	stateLock       sync.Mutex
	isConnecting    bool
	connectedStatus string
}
// Package-level state for the job controller.
var (
	// jobConnStates maps jobId to a JobConnStatus_* value. Guarded by
	// jobControllerLock; disconnected jobs are removed rather than stored.
	jobConnStates     = make(map[string]string)
	jobControllerLock sync.Mutex
	// blockJobStatusVersion supplies the VersionTs stamped on block job status data.
	blockJobStatusVersion utilds.VersionTs
	// connStates tracks, per connection, whether the latest connected flag has
	// been reconciled. The reconcile channel has capacity 1.
	connStates = &connStateManager{
		m:           make(map[string]*connState),
		reconcileCh: make(chan struct{}, 1),
	}
	// jobStreamIds maps jobId to the id of its current output stream; an output
	// loop whose stream id no longer matches exits.
	jobStreamIds = ds.MakeSyncMap[string]()
	// jobTerminationMessageWritten records, per jobId, that a termination message
	// has already been written to the terminal.
	jobTerminationMessageWritten = ds.MakeSyncMap[bool]()
	// lastAutoReconnectAttempt maps jobId to the Unix time (seconds) of the last
	// auto-reconnect attempt.
	lastAutoReconnectAttempt = ds.MakeSyncMap[int64]()
	// reconnectGroup and terminateJobManagerGroup de-duplicate concurrent
	// reconnect / terminate calls, keyed by jobId.
	reconnectGroup           singleflight.Group
	terminateJobManagerGroup singleflight.Group
)
// InitJobController launches the background workers (connection reconcile and
// job pruning), registers this package's event handlers on the bare RPC
// client, and subscribes to the route-up, route-down, conn-change and
// block-close events across all scopes.
func InitJobController() {
	go connReconcileWorker()
	go jobPruningWorker()

	client := wshclient.GetBareRpcClient()
	client.EventListener.On(wps.Event_RouteUp, handleRouteUpEvent)
	client.EventListener.On(wps.Event_RouteDown, handleRouteDownEvent)
	client.EventListener.On(wps.Event_ConnChange, handleConnChangeEvent)
	client.EventListener.On(wps.Event_BlockClose, handleBlockCloseEvent)

	// Subscriptions are issued in the same order the handlers were registered.
	subscriptions := []wps.SubscriptionRequest{
		{Event: wps.Event_RouteUp, AllScopes: true},
		{Event: wps.Event_RouteDown, AllScopes: true},
		{Event: wps.Event_ConnChange, AllScopes: true},
		{Event: wps.Event_BlockClose, AllScopes: true},
	}
	for _, req := range subscriptions {
		wshclient.EventSubCommand(client, req, nil)
	}
}
// isJobManagerRunning reports whether the job's stored manager status is
// JobManagerStatus_Running. job must be non-nil.
func isJobManagerRunning(job *waveobj.Job) bool {
	return job.JobManagerStatus == JobManagerStatus_Running
}
// GetJobManagerStatus returns the stored job manager status for jobId.
// A job that does not exist in the database is reported as
// JobManagerStatus_Done; a database error is returned wrapped.
func GetJobManagerStatus(ctx context.Context, jobId string) (string, error) {
	job, err := wstore.DBGet[*waveobj.Job](ctx, jobId)
	switch {
	case err != nil:
		return "", fmt.Errorf("failed to get job: %w", err)
	case job == nil:
		return JobManagerStatus_Done, nil
	default:
		return job.JobManagerStatus, nil
	}
}
// GetAllJobManagerStatus loads every job object and returns one status update
// (job id plus stored manager status) per job. The returned slice is nil when
// there are no jobs.
func GetAllJobManagerStatus(ctx context.Context) ([]*wshrpc.JobManagerStatusUpdate, error) {
	jobs, err := wstore.DBGetAllObjsByType[*waveobj.Job](ctx, waveobj.OType_Job)
	if err != nil {
		return nil, fmt.Errorf("failed to get jobs: %w", err)
	}
	var updates []*wshrpc.JobManagerStatusUpdate
	for i := range jobs {
		update := &wshrpc.JobManagerStatusUpdate{
			JobId:            jobs[i].OID,
			JobManagerStatus: jobs[i].JobManagerStatus,
		}
		updates = append(updates, update)
	}
	return updates, nil
}
// GetBlockJobStatus builds the job status snapshot for a block.
//
// It returns an error if the block cannot be loaded or does not exist. When
// the block has no job (or the job record is missing) only BlockId and
// VersionTs are populated. Otherwise the job's done reason, startup error and
// exit details are copied over and Status is derived from the manager status:
// "init", "done", or - for a running manager - "connected"/"disconnected"
// according to the in-memory job connection status.
func GetBlockJobStatus(ctx context.Context, blockId string) (*wshrpc.BlockJobStatusData, error) {
	block, err := wstore.DBGet[*waveobj.Block](ctx, blockId)
	if err != nil {
		return nil, fmt.Errorf("failed to get block: %w", err)
	}
	if block == nil {
		return nil, fmt.Errorf("block not found: %s", blockId)
	}

	result := &wshrpc.BlockJobStatusData{
		BlockId:   blockId,
		VersionTs: blockJobStatusVersion.GetVersionTs(),
	}
	if block.JobId == "" {
		return result, nil
	}

	job, err := wstore.DBGet[*waveobj.Job](ctx, block.JobId)
	if err != nil {
		return nil, fmt.Errorf("failed to get job: %w", err)
	}
	if job == nil {
		return result, nil
	}

	result.JobId = job.OID
	result.DoneReason = job.JobManagerDoneReason
	result.StartupError = job.JobManagerStartupError
	result.CmdExitTs = job.CmdExitTs
	result.CmdExitCode = job.CmdExitCode
	result.CmdExitSignal = job.CmdExitSignal

	// Any other stored status leaves result.Status empty.
	switch job.JobManagerStatus {
	case JobManagerStatus_Init:
		result.Status = "init"
	case JobManagerStatus_Done:
		result.Status = "done"
	case JobManagerStatus_Running:
		result.Status = "disconnected"
		if GetJobConnStatus(job.OID) == JobConnStatus_Connected {
			result.Status = "connected"
		}
	}
	return result, nil
}
// SendBlockJobStatusEvent computes the current job status for blockId and
// publishes it as a block-job-status event scoped to "block:<blockId>".
// If the status cannot be computed the error is logged and nothing is published.
func SendBlockJobStatusEvent(ctx context.Context, blockId string) {
	status, err := GetBlockJobStatus(ctx, blockId)
	if err != nil {
		log.Printf("[block:%s] error getting block job status: %v", blockId, err)
		return
	}
	event := wps.WaveEvent{
		Event:  wps.Event_BlockJobStatus,
		Scopes: []string{fmt.Sprintf("block:%s", blockId)},
		Data:   status,
	}
	wps.Broker.Publish(event)
}
// sendBlockJobStatusEventByJob publishes a status event for the block the job
// is attached to. It is a no-op for a nil job or a job with no attached block.
func sendBlockJobStatusEventByJob(ctx context.Context, job *waveobj.Job) {
	if job != nil && job.AttachedBlockId != "" {
		SendBlockJobStatusEvent(ctx, job.AttachedBlockId)
	}
}
// connReconcileWorker runs a reconcile pass each time a wake-up signal arrives
// on connStates.reconcileCh, and returns if that channel is closed.
func connReconcileWorker() {
	defer func() {
		panichandler.PanicHandler("jobcontroller:connReconcileWorker", recover())
	}()
	for {
		if _, open := <-connStates.reconcileCh; !open {
			return
		}
		reconcileAllConns()
	}
}
// reconcileAllConns starts a reconcileConn goroutine for every connection
// whose latest observed state differs from the last processed state and that
// is not already being reconciled. The target state is captured under the lock.
func reconcileAllConns() {
	connStates.Lock()
	defer connStates.Unlock()
	for name, state := range connStates.m {
		pending := !state.reconciling && state.actual != state.processed
		if !pending {
			continue
		}
		state.reconciling = true
		go reconcileConn(name, state.actual)
	}
}
// reconcileConn runs the up/down handler for connName and then records
// targetState as processed, clears the reconciling flag, and wakes the
// reconcile worker so a state change that arrived meanwhile is picked up.
//
// The bookkeeping is done in a deferred function so that it also happens when
// the handler panics. Otherwise the recovered panic would leave reconciling
// set to true and the connection would never be reconciled again.
func reconcileConn(connName string, targetState bool) {
	defer func() {
		panichandler.PanicHandler("jobcontroller:reconcileConn", recover())
	}()
	// Registered after the panic handler, so it runs before it (LIFO order).
	defer func() {
		connStates.Lock()
		if cs, exists := connStates.m[connName]; exists {
			cs.processed = targetState
			cs.reconciling = false
		}
		connStates.Unlock()
		// Non-blocking wake-up: a pending signal already guarantees another pass.
		select {
		case connStates.reconcileCh <- struct{}{}:
		default:
		}
	}()
	if targetState {
		onConnectionUp(connName)
	} else {
		onConnectionDown(connName)
	}
}
// getMetaInt64 returns meta[key] as an int64.
//
// Integer and floating-point values are converted (floats are truncated
// toward zero). A missing key or a value of any other type yields 0.
// Accepting int and int32 in addition to int64/float64 means a value stored
// in-process as a plain int is no longer silently read back as 0.
func getMetaInt64(meta wshrpc.FileMeta, key string) int64 {
	val, ok := meta[key]
	if !ok {
		return 0
	}
	switch v := val.(type) {
	case int64:
		return v
	case int:
		return int64(v)
	case int32:
		return int64(v)
	case float64:
		return int64(v)
	case float32:
		return int64(v)
	default:
		return 0
	}
}
// jobPruningWorker runs pruneUnusedJobs once a minute, feeding each pass the
// candidate list produced by the previous pass.
func jobPruningWorker() {
	defer func() {
		panichandler.PanicHandler("jobcontroller:jobPruningWorker", recover())
	}()
	ticker := time.NewTicker(1 * time.Minute)
	defer ticker.Stop()
	var candidates []string
	for {
		<-ticker.C
		candidates = pruneUnusedJobs(candidates)
	}
}
// pruneUnusedJobs deletes jobs that were prune candidates on both the previous
// and the current pass. A candidate is a job whose manager is done and that is
// not attached to any block. It returns the current candidate list for the
// next pass; if the job list cannot be loaded, previousCandidates is returned
// unchanged. The whole pass shares a 30-second timeout.
func pruneUnusedJobs(previousCandidates []string) []string {
	ctx, cancelFn := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancelFn()

	jobs, err := wstore.DBGetAllObjsByType[*waveobj.Job](ctx, waveobj.OType_Job)
	if err != nil {
		log.Printf("[jobpruner] error getting all jobs: %v", err)
		return previousCandidates
	}

	var currentCandidates []string
	for _, job := range jobs {
		if job.JobManagerStatus != JobManagerStatus_Done || job.AttachedBlockId != "" {
			continue
		}
		currentCandidates = append(currentCandidates, job.OID)
	}

	toDelete := utilfn.StrSetIntersection(previousCandidates, currentCandidates)
	if len(previousCandidates)+len(currentCandidates) > 0 {
		log.Printf("[jobpruner] prev=%d current=%d deleting=%d", len(previousCandidates), len(currentCandidates), len(toDelete))
	}
	for _, id := range toDelete {
		if delErr := DeleteJob(ctx, id); delErr != nil {
			log.Printf("[jobpruner] error deleting job %s: %v", id, delErr)
		}
	}
	return currentCandidates
}
// handleRouteUpEvent handles a route-up event by marking the jobs named in
// the event's scopes as connected.
func handleRouteUpEvent(event *wps.WaveEvent) {
	handleRouteEvent(event, JobConnStatus_Connected)
}
// handleRouteDownEvent handles a route-down event by marking the jobs named
// in the event's scopes as disconnected.
func handleRouteDownEvent(event *wps.WaveEvent) {
	handleRouteEvent(event, JobConnStatus_Disconnected)
}
// handleRouteEvent applies newStatus to every job referenced by a "job:<id>"
// scope on the event, then publishes a block status event for that job.
// When a route goes down while the job manager is still marked running, an
// auto-reconnect attempt is started in the background, subject to the
// per-job cooldown.
func handleRouteEvent(event *wps.WaveEvent, newStatus string) {
	ctx := context.Background()
	for _, scope := range event.Scopes {
		if !strings.HasPrefix(scope, "job:") {
			continue
		}
		jobId := strings.TrimPrefix(scope, "job:")
		SetJobConnStatus(jobId, newStatus)
		log.Printf("[job:%s] connection status changed to %s", jobId, newStatus)

		job, err := wstore.DBGet[*waveobj.Job](ctx, jobId)
		if err != nil {
			log.Printf("[job:%s] error getting job for status event: %v", jobId, err)
			continue
		}
		sendBlockJobStatusEventByJob(ctx, job)

		// The cooldown check comes last because it records the attempt time.
		wentDown := newStatus == JobConnStatus_Disconnected
		if wentDown && job != nil && isJobManagerRunning(job) && shouldAttemptAutoReconnect(jobId) {
			go attemptAutoReconnect(jobId, job.Connection)
		}
	}
}
// shouldAttemptAutoReconnect reports whether an auto-reconnect may be started
// for jobId now. It returns true - and records the current time as the last
// attempt - when there is no earlier attempt or the last one is at least
// AutoReconnectCooldown old (measured in whole seconds).
func shouldAttemptAutoReconnect(jobId string) bool {
	nowSec := time.Now().Unix()
	lastSec, seen := lastAutoReconnectAttempt.GetEx(jobId)
	if seen && time.Duration(nowSec-lastSec)*time.Second < AutoReconnectCooldown {
		return false
	}
	lastAutoReconnectAttempt.Set(jobId, nowSec)
	return true
}
// attemptAutoReconnect waits AutoReconnectDelay and, if connName is still
// connected, tries to reconnect to the job with a 10-second timeout. The
// outcome is only logged. Intended to run in its own goroutine.
func attemptAutoReconnect(jobId string, connName string) {
	defer func() {
		panichandler.PanicHandler("jobcontroller:attemptAutoReconnect", recover())
	}()
	time.Sleep(AutoReconnectDelay)

	connUp, err := conncontroller.IsConnected(connName)
	if err != nil || !connUp {
		log.Printf("[job:%s] connection %s is down, skipping auto-reconnect", jobId, connName)
		return
	}
	log.Printf("[job:%s] connection %s still up after route down, attempting auto-reconnect to determine job manager status", jobId, connName)

	ctx, cancelFn := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancelFn()
	if reconnectErr := ReconnectJob(ctx, jobId, nil); reconnectErr != nil {
		log.Printf("[job:%s] auto-reconnect failed: %v", jobId, reconnectErr)
		return
	}
	log.Printf("[job:%s] auto-reconnect succeeded", jobId)
}
// handleConnChangeEvent records the connected flag carried by a conn-change
// event for the connection named by the event's first "connection:<name>"
// scope, then signals the reconcile worker. Events whose payload cannot be
// decoded, or that carry no connection scope, are ignored.
func handleConnChangeEvent(event *wps.WaveEvent) {
	var status wshrpc.ConnStatus
	if err := utilfn.ReUnmarshal(&status, event.Data); err != nil {
		log.Printf("[connchange] error unmarshaling ConnStatus: %v", err)
		return
	}

	connName := ""
	for _, scope := range event.Scopes {
		if !strings.HasPrefix(scope, "connection:") {
			continue
		}
		connName = strings.TrimPrefix(scope, "connection:")
		break
	}
	if connName == "" {
		return
	}

	connStates.Lock()
	state, known := connStates.m[connName]
	if !known {
		state = &connState{}
		connStates.m[connName] = state
	}
	state.actual = status.Connected
	connStates.Unlock()

	// Non-blocking: if a wake-up is already queued the worker will see this change.
	select {
	case connStates.reconcileCh <- struct{}{}:
	default:
	}
}
// handleBlockCloseEvent terminates and detaches every job attached to the
// block that was closed. The event data must be the block id as a string.
// The lookup and all terminations share one 5-second timeout.
func handleBlockCloseEvent(event *wps.WaveEvent) {
	ctx, cancelFn := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancelFn()

	blockId, isString := event.Data.(string)
	if !isString {
		log.Printf("[blockclose] invalid event data type")
		return
	}

	attached, err := wstore.WithTxRtn(ctx, func(tx *wstore.TxWrap) ([]string, error) {
		return tx.SelectStrings(`SELECT oid FROM db_job WHERE json_extract(data, '$.attachedblockid') = ?`, blockId), nil
	})
	if err != nil {
		log.Printf("[block:%s] error looking up jobids: %v", blockId, err)
		return
	}
	for _, id := range attached {
		TerminateAndDetachJob(ctx, id)
	}
}
// onConnectionUp reconnects every job on connName whose job manager is marked
// running. Individual failures are logged and do not stop the remaining jobs.
// Loading the jobs and all reconnect attempts share one 5-second timeout.
func onConnectionUp(connName string) {
	log.Printf("[conn:%s] connection became connected, reconnecting jobs", connName)
	ctx, cancelFn := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancelFn()

	jobs, err := wstore.DBGetAllObjsByType[*waveobj.Job](ctx, waveobj.OType_Job)
	if err != nil {
		log.Printf("[conn:%s] failed to get jobs for reconnection: %v", connName, err)
		return
	}

	var pending []*waveobj.Job
	for _, job := range jobs {
		if job.Connection != connName || !isJobManagerRunning(job) {
			continue
		}
		pending = append(pending, job)
	}
	log.Printf("[conn:%s] found %d jobs to reconnect", connName, len(pending))

	reconnected := 0
	for _, job := range pending {
		if reconnectErr := ReconnectJob(ctx, job.OID, nil); reconnectErr != nil {
			log.Printf("[job:%s] error reconnecting: %v", job.OID, reconnectErr)
			continue
		}
		reconnected++
	}
	log.Printf("[conn:%s] finished reconnecting jobs: %d/%d successful", connName, reconnected, len(pending))
}
// onConnectionDown is called when a connection transitions to disconnected.
// It currently only logs the transition.
func onConnectionDown(connName string) {
	log.Printf("[conn:%s] connection became disconnected", connName)
}
// GetJobConnStatus returns the in-memory connection status for jobId, or
// JobConnStatus_Disconnected when no status is recorded.
func GetJobConnStatus(jobId string) string {
	jobControllerLock.Lock()
	defer jobControllerLock.Unlock()
	if recorded, ok := jobConnStates[jobId]; ok {
		return recorded
	}
	return JobConnStatus_Disconnected
}
// SetJobConnStatus records the connection status for jobId. Setting
// JobConnStatus_Disconnected removes the entry instead of storing it.
func SetJobConnStatus(jobId string, status string) {
	jobControllerLock.Lock()
	defer jobControllerLock.Unlock()
	switch status {
	case JobConnStatus_Disconnected:
		delete(jobConnStates, jobId)
	default:
		jobConnStates[jobId] = status
	}
}
// GetConnectedJobIds returns the ids of all jobs whose recorded status is
// JobConnStatus_Connected, in no particular order (nil when there are none).
func GetConnectedJobIds() []string {
	jobControllerLock.Lock()
	defer jobControllerLock.Unlock()
	var ids []string
	for id, status := range jobConnStates {
		if status != JobConnStatus_Connected {
			continue
		}
		ids = append(ids, id)
	}
	return ids
}
// GetNumJobsRunning counts the jobs whose stored manager status is running.
// The database read is bounded by a 2-second timeout; on error it returns 0.
func GetNumJobsRunning() int {
	ctx, cancelFn := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancelFn()
	jobs, err := wstore.DBGetAllObjsByType[*waveobj.Job](ctx, waveobj.OType_Job)
	if err != nil {
		return 0
	}
	running := 0
	for _, job := range jobs {
		if isJobManagerRunning(job) {
			running++
		}
	}
	return running
}
// GetNumJobsConnected counts the jobs whose recorded in-memory status is
// JobConnStatus_Connected.
func GetNumJobsConnected() int {
	jobControllerLock.Lock()
	defer jobControllerLock.Unlock()
	connected := 0
	for _, status := range jobConnStates {
		if status != JobConnStatus_Connected {
			continue
		}
		connected++
	}
	return connected
}
// CheckJobConnected loads the job and verifies that both its underlying
// connection and the job itself are currently connected. It returns the job
// on success and a descriptive error otherwise.
func CheckJobConnected(ctx context.Context, jobId string) (*waveobj.Job, error) {
	job, err := wstore.DBMustGet[*waveobj.Job](ctx, jobId)
	if err != nil {
		return nil, fmt.Errorf("failed to get job: %w", err)
	}

	connUp, err := conncontroller.IsConnected(job.Connection)
	switch {
	case err != nil:
		return nil, fmt.Errorf("error checking connection status: %w", err)
	case !connUp:
		return nil, fmt.Errorf("connection %q is not connected", job.Connection)
	}

	if status := GetJobConnStatus(jobId); status != JobConnStatus_Connected {
		return nil, fmt.Errorf("job is not connected (status: %s)", status)
	}
	return job, nil
}
// StartJobParams describes a job to be launched by StartJob.
type StartJobParams struct {
	// ConnName is the connection to run the job on (required).
	ConnName string
	// JobKind must be JobKind_Shell or JobKind_Task.
	JobKind string
	// Cmd is the command to run (required); Args and Env are passed along with it.
	Cmd  string
	Args []string
	Env  map[string]string
	// TermSize defaults to 24 rows by 80 columns when nil.
	TermSize *waveobj.TermSize
	// BlockId, when non-empty, is the block the new job is attached to.
	BlockId string
}
// StartJob creates a job record, starts the job on the remote connection and
// begins streaming its output into the job's WaveFS file.
//
// Steps, in order: validate params (TermSize defaults to 24x80); require the
// connection to be up; generate the job id, auth token and access JWT; insert
// the job with status "init"; attach it to params.BlockId if set; create the
// output stream reader and the circular 10 MiB output file; send the remote
// start command (30s RPC timeout); on success mark the job "running" and
// launch the output loop in a goroutine.
//
// If the remote start fails the job is marked done with reason
// JobDoneReason_StartupError and an error is returned. It returns the new job
// id on success.
//
// The stream reader is owned by this function until it is handed to the
// output loop. On any earlier return it is closed here so that it is not
// leaked.
func StartJob(ctx context.Context, params StartJobParams) (string, error) {
	if params.ConnName == "" {
		return "", fmt.Errorf("connection name is required")
	}
	if params.JobKind != JobKind_Shell && params.JobKind != JobKind_Task {
		return "", fmt.Errorf("jobkind must be %q or %q", JobKind_Shell, JobKind_Task)
	}
	if params.Cmd == "" {
		return "", fmt.Errorf("command is required")
	}
	if params.TermSize == nil {
		params.TermSize = &waveobj.TermSize{Rows: 24, Cols: 80}
	}
	isConnected, err := conncontroller.IsConnected(params.ConnName)
	if err != nil {
		return "", fmt.Errorf("error checking connection status: %w", err)
	}
	if !isConnected {
		return "", fmt.Errorf("connection %q is not connected", params.ConnName)
	}
	jobId := uuid.New().String()
	jobAuthToken, err := utilfn.RandomHexString(32)
	if err != nil {
		return "", fmt.Errorf("failed to generate job auth token: %w", err)
	}
	jobAccessClaims := &wavejwt.WaveJwtClaims{
		MainServer: true,
		JobId:      jobId,
	}
	jobAccessToken, err := wavejwt.Sign(jobAccessClaims)
	if err != nil {
		return "", fmt.Errorf("failed to generate job access token: %w", err)
	}
	job := &waveobj.Job{
		OID:              jobId,
		Connection:       params.ConnName,
		JobKind:          params.JobKind,
		Cmd:              params.Cmd,
		CmdArgs:          params.Args,
		CmdEnv:           params.Env,
		CmdTermSize:      *params.TermSize,
		JobAuthToken:     jobAuthToken,
		JobManagerStatus: JobManagerStatus_Init,
		AttachedBlockId:  params.BlockId,
		WaveVersion:      wavebase.WaveVersion,
		Meta:             make(waveobj.MetaMapType),
	}
	err = wstore.DBInsert(ctx, job)
	if err != nil {
		return "", fmt.Errorf("failed to create job in database: %w", err)
	}
	if params.BlockId != "" {
		// AttachJobToBlock will send status
		err = AttachJobToBlock(ctx, jobId, params.BlockId)
		if err != nil {
			return "", fmt.Errorf("failed to attach job to block: %w", err)
		}
	}
	bareRpc := wshclient.GetBareRpcClient()
	broker := bareRpc.StreamBroker
	readerRouteId := wshclient.GetBareRpcClientRouteId()
	writerRouteId := wshutil.MakeJobRouteId(jobId)
	reader, streamMeta := broker.CreateStreamReader(readerRouteId, writerRouteId, DefaultStreamRwnd)
	// Close the reader on every return path that does not hand it to the
	// output loop; runOutputLoop closes it itself once it takes over.
	readerHandedOff := false
	defer func() {
		if !readerHandedOff {
			reader.Close()
		}
	}()
	jobStreamIds.Set(jobId, streamMeta.Id)
	fileOpts := wshrpc.FileOpts{
		MaxSize:  10 * 1024 * 1024,
		Circular: true,
	}
	err = filestore.WFS.MakeFile(ctx, jobId, JobOutputFileName, wshrpc.FileMeta{}, fileOpts)
	if err != nil {
		return "", fmt.Errorf("failed to create WaveFS file: %w", err)
	}
	clientId := wstore.GetClientId()
	publicKey := wavejwt.GetPublicKey()
	publicKeyBase64 := base64.StdEncoding.EncodeToString(publicKey)
	jobEnv := envutil.CopyAndAddToEnvMap(params.Env, "WAVETERM_JOBID", jobId)
	startJobData := wshrpc.CommandRemoteStartJobData{
		Cmd:                params.Cmd,
		Args:               params.Args,
		Env:                jobEnv,
		TermSize:           *params.TermSize,
		StreamMeta:         streamMeta,
		JobAuthToken:       jobAuthToken,
		JobId:              jobId,
		MainServerJwtToken: jobAccessToken,
		ClientId:           clientId,
		PublicKeyBase64:    publicKeyBase64,
	}
	rpcOpts := &wshrpc.RpcOpts{
		Route:   wshutil.MakeConnectionRouteId(params.ConnName),
		Timeout: 30000,
	}
	writeSessionSeparatorToTerminal(params.BlockId, params.TermSize.Cols)
	log.Printf("[job:%s] sending RemoteStartJobCommand to connection %s, cmd=%q, args=%v", jobId, params.ConnName, params.Cmd, params.Args)
	// NOTE(review): this logs the caller-supplied environment verbatim; confirm
	// it cannot contain secrets that should stay out of the log.
	log.Printf("[job:%s] env=%v", jobId, params.Env)
	rtnData, err := wshclient.RemoteStartJobCommand(bareRpc, startJobData, rpcOpts)
	if err != nil {
		log.Printf("[job:%s] RemoteStartJobCommand failed: %v", jobId, err)
		errMsg := fmt.Sprintf("failed to start job: %v", err)
		var updatedJob *waveobj.Job
		updateErr := wstore.DBUpdateFn(ctx, jobId, func(job *waveobj.Job) {
			job.JobManagerStatus = JobManagerStatus_Done
			job.JobManagerDoneReason = JobDoneReason_StartupError
			job.JobManagerStartupError = errMsg
			updatedJob = job
		})
		if updateErr != nil {
			// Best effort: the start failure below is still the error returned.
			log.Printf("[job:%s] error recording startup failure: %v", jobId, updateErr)
		}
		sendBlockJobStatusEventByJob(ctx, updatedJob)
		telemetry.GoRecordTEventWrap(&telemetrydata.TEvent{
			Event: "job:done",
			Props: telemetrydata.TEventProps{
				JobDoneReason: JobDoneReason_StartupError,
				JobKind:       params.JobKind,
			},
		})
		return "", fmt.Errorf("failed to start remote job: %w", err)
	}
	log.Printf("[job:%s] RemoteStartJobCommand succeeded, cmdpid=%d cmdstartts=%d jobmanagerpid=%d jobmanagerstartts=%d", jobId, rtnData.CmdPid, rtnData.CmdStartTs, rtnData.JobManagerPid, rtnData.JobManagerStartTs)
	var updatedJob *waveobj.Job
	err = wstore.DBUpdateFn(ctx, jobId, func(job *waveobj.Job) {
		job.CmdPid = rtnData.CmdPid
		job.CmdStartTs = rtnData.CmdStartTs
		job.JobManagerPid = rtnData.JobManagerPid
		job.JobManagerStartTs = rtnData.JobManagerStartTs
		job.JobManagerStatus = JobManagerStatus_Running
		updatedJob = job
	})
	if err != nil {
		log.Printf("[job:%s] warning: failed to update job status to running: %v", jobId, err)
	} else {
		log.Printf("[job:%s] job status updated to running", jobId)
		sendBlockJobStatusEventByJob(ctx, updatedJob)
	}
	telemetry.GoRecordTEventWrap(&telemetrydata.TEvent{
		Event: "job:start",
		Props: telemetrydata.TEventProps{
			JobKind: params.JobKind,
		},
	})
	// From here on the output loop owns the reader and closes it.
	readerHandedOff = true
	go func() {
		defer func() {
			panichandler.PanicHandler("jobcontroller:runOutputLoop", recover())
		}()
		// The loop outlives this call, so it gets its own background context.
		runOutputLoop(context.Background(), jobId, streamMeta.Id, reader)
	}()
	return jobId, nil
}
// doWFSAppend appends data to fileName in the WaveFS zone identified by
// oref.OID and, on success, publishes a block-file append event scoped to the
// object reference, carrying the data base64-encoded.
func doWFSAppend(ctx context.Context, oref waveobj.ORef, fileName string, data []byte) error {
	if err := filestore.WFS.AppendData(ctx, oref.OID, fileName, data); err != nil {
		return err
	}
	payload := &wps.WSFileEventData{
		ZoneId:   oref.OID,
		FileName: fileName,
		FileOp:   wps.FileOp_Append,
		Data64:   base64.StdEncoding.EncodeToString(data),
	}
	event := wps.WaveEvent{
		Event:  wps.Event_BlockFile,
		Scopes: []string{oref.String()},
		Data:   payload,
	}
	wps.Broker.Publish(event)
	return nil
}
func handleAppendJobFile(ctx context.Context, jobId string, fileName string, data []byte) error {
err := doWFSAppend(ctx, waveobj.MakeORef(waveobj.OType_Job, jobId), fileName, data)
if err != nil {
return fmt.Errorf("error appending to job file: %w", err)
}
job, err := wstore.DBGet[*waveobj.Job](ctx, jobId)
if err != nil {
return fmt.Errorf("error getting job: %w", err)
}
if job != nil && job.AttachedBlockId != "" {
err = doWFSAppend(ctx, waveobj.MakeORef(waveobj.OType_Block, job.AttachedBlockId), fileName, data)
if err != nil {
return fmt.Errorf("error appending to block file: %w", err)
}
}
return nil
}
// runOutputLoop copies job output from reader into the job's output file until
// the stream ends, fails, or is superseded.
//
// After every read it first checks that streamId is still the job's current
// stream; if a newer stream has replaced it, the loop exits without writing
// the bytes it just read. On EOF or a read error the job is marked
// stream-done (recording the error text for non-EOF errors) and job manager
// termination is attempted. The reader is closed on exit.
func runOutputLoop(ctx context.Context, jobId string, streamId string, reader *streamclient.Reader) {
	defer reader.Close()
	defer func() {
		log.Printf("[job:%s] [stream:%s] output loop finished", jobId, streamId)
	}()
	log.Printf("[job:%s] [stream:%s] output loop started", jobId, streamId)

	chunk := make([]byte, 4096)
	for {
		n, readErr := reader.Read(chunk)

		if activeId, _ := jobStreamIds.GetEx(jobId); activeId != streamId {
			log.Printf("[job:%s] [stream:%s] stream superseded by [stream:%s], exiting output loop", jobId, streamId, activeId)
			return
		}

		// Data returned alongside an error is still persisted before handling the error.
		if n > 0 {
			if appendErr := handleAppendJobFile(ctx, jobId, JobOutputFileName, chunk[:n]); appendErr != nil {
				log.Printf("[job:%s] error appending data to WaveFS: %v", jobId, appendErr)
			}
		}
		if readErr == nil {
			continue
		}

		if readErr == io.EOF {
			log.Printf("[job:%s] stream ended (EOF)", jobId)
			updateErr := wstore.DBUpdateFn(ctx, jobId, func(job *waveobj.Job) {
				job.StreamDone = true
			})
			if updateErr != nil {
				log.Printf("[job:%s] error updating job stream status: %v", jobId, updateErr)
			}
		} else {
			log.Printf("[job:%s] stream error: %v", jobId, readErr)
			errText := readErr.Error()
			updateErr := wstore.DBUpdateFn(ctx, jobId, func(job *waveobj.Job) {
				job.StreamDone = true
				job.StreamError = errText
			})
			if updateErr != nil {
				log.Printf("[job:%s] error updating job stream error: %v", jobId, updateErr)
			}
		}
		tryTerminateJobManager(ctx, jobId)
		return
	}
}
// HandleCmdJobExited records the command's exit details on the job, publishes
// a block status event, and attempts job manager termination. The first time
// a termination message is written for the job it also resets the attached
// block's terminal state and writes a muted "[shell terminated ...]" line that
// includes the non-zero exit code or the signal when available.
func HandleCmdJobExited(ctx context.Context, jobId string, data wshrpc.CommandJobCmdExitedData) error {
	var updatedJob *waveobj.Job
	updateErr := wstore.DBUpdateFn(ctx, jobId, func(job *waveobj.Job) {
		job.CmdExitError = data.ExitErr
		job.CmdExitCode = data.ExitCode
		job.CmdExitSignal = data.ExitSignal
		job.CmdExitTs = data.ExitTs
		updatedJob = job
	})
	if updateErr != nil {
		return fmt.Errorf("failed to update job exit status: %w", updateErr)
	}
	sendBlockJobStatusEventByJob(ctx, updatedJob)
	tryTerminateJobManager(ctx, jobId)

	// Claim the "message written" flag; only the first claimant writes.
	firstWriter := jobTerminationMessageWritten.TestAndSet(jobId, true, func(val bool, exists bool) bool {
		return !(exists && val)
	})
	if !firstWriter {
		return nil
	}

	resetTerminalState(ctx, updatedJob.AttachedBlockId)
	msg := "shell terminated"
	switch {
	case updatedJob.CmdExitCode != nil && *updatedJob.CmdExitCode != 0:
		msg = fmt.Sprintf("shell terminated (exit code %d)", *updatedJob.CmdExitCode)
	case updatedJob.CmdExitSignal != "":
		msg = fmt.Sprintf("shell terminated (signal %s)", updatedJob.CmdExitSignal)
	}
	writeMutedMessageToTerminal(updatedJob.AttachedBlockId, "["+msg+"]")
	return nil
}
// tryTerminateJobManager terminates the job manager once both conditions
// hold: the command has exited (CmdExitTs is set) and the output stream is
// done. It does nothing unless the manager is currently marked running.
// Errors are logged, not returned.
func tryTerminateJobManager(ctx context.Context, jobId string) {
	job, err := wstore.DBMustGet[*waveobj.Job](ctx, jobId)
	if err != nil {
		log.Printf("[job:%s] error getting job for termination check: %v", jobId, err)
		return
	}
	if !isJobManagerRunning(job) {
		return
	}

	cmdExited := job.CmdExitTs != 0
	if ready := cmdExited && job.StreamDone; !ready {
		log.Printf("[job:%s] not ready for termination: exited=%v streamDone=%v", jobId, cmdExited, job.StreamDone)
		return
	}

	log.Printf("[job:%s] both job cmd exited and stream finished, terminating job manager", jobId)
	if termErr := TerminateJobManager(ctx, jobId); termErr != nil {
		log.Printf("[job:%s] error terminating job manager: %v", jobId, termErr)
	}
}
// TerminateAndDetachJob terminates the job's manager and then detaches the
// job from its block. Both steps are always attempted; failures are logged.
func TerminateAndDetachJob(ctx context.Context, jobId string) {
	if termErr := TerminateJobManager(ctx, jobId); termErr != nil {
		log.Printf("[job:%s] error terminating job manager: %v", jobId, termErr)
	}
	if detachErr := DetachJobFromBlock(ctx, jobId, true); detachErr != nil {
		log.Printf("[job:%s] error detaching job from block: %v", jobId, detachErr)
	}
}
func TerminateJobManager(ctx context.Context, jobId string) error {
_, err, _ := terminateJobManagerGroup.Do(jobId, func() (any, error) {
err := doTerminateJobManager(ctx, jobId)
return nil, err
})
return err
}
// doTerminateJobManager flags the job to be terminated on reconnect and then
// asks the remote side to terminate its manager. A job already marked done
// is left untouched and nil is returned.
func doTerminateJobManager(ctx context.Context, jobId string) error {
	var (
		target     *waveobj.Job
		needsTerm  bool
		updateFunc = func(j *waveobj.Job) {
			target = j
			needsTerm = j.JobManagerStatus != JobManagerStatus_Done
			if needsTerm {
				// Persisted first so a later reconnect still terminates the
				// manager if the remote call below fails.
				j.TerminateOnReconnect = true
			}
		}
	)
	if err := wstore.DBUpdateFn(ctx, jobId, updateFunc); err != nil {
		return fmt.Errorf("failed to set TerminateOnReconnect: %w", err)
	}
	if !needsTerm {
		log.Printf("[job:%s] already terminated, skipping", jobId)
		return nil
	}
	return remoteTerminateJobManager(ctx, target)
}
// DisconnectJob sends a disconnect-from-job-manager command for jobId over
// the job's connection route (5s RPC timeout).
func DisconnectJob(ctx context.Context, jobId string) error {
	job, err := wstore.DBMustGet[*waveobj.Job](ctx, jobId)
	if err != nil {
		return fmt.Errorf("failed to get job: %w", err)
	}

	client := wshclient.GetBareRpcClient()
	opts := &wshrpc.RpcOpts{
		Route:   wshutil.MakeConnectionRouteId(job.Connection),
		Timeout: 5000,
	}
	request := wshrpc.CommandRemoteDisconnectFromJobManagerData{JobId: jobId}
	if sendErr := wshclient.RemoteDisconnectFromJobManagerCommand(client, request, opts); sendErr != nil {
		return fmt.Errorf("failed to send disconnect command: %w", sendErr)
	}
	log.Printf("[job:%s] job disconnect command sent successfully", jobId)
	return nil
}
// remoteTerminateJobManager asks the remote connection to terminate the job's
// manager and, on success, marks the job done with reason
// JobDoneReason_Terminated.
//
// The first time a termination message is written for the job it resets the
// attached block's terminal state and writes "[shell terminated]". This
// happens before the done check. If the supplied job is already marked done
// no remote call is made. A failed remote call is returned as an error and
// the stored job is left unchanged.
func remoteTerminateJobManager(ctx context.Context, job *waveobj.Job) error {
	log.Printf("[job:%s] terminating job manager", job.OID)

	firstWriter := jobTerminationMessageWritten.TestAndSet(job.OID, true, func(val bool, exists bool) bool {
		return !(exists && val)
	})
	if firstWriter {
		resetTerminalState(ctx, job.AttachedBlockId)
		writeMutedMessageToTerminal(job.AttachedBlockId, "[shell terminated]")
	}

	if job.JobManagerStatus == JobManagerStatus_Done {
		log.Printf("[job:%s] job manager already marked as done, skipping termination", job.OID)
		return nil
	}

	client := wshclient.GetBareRpcClient()
	request := wshrpc.CommandRemoteTerminateJobManagerData{
		JobId:             job.OID,
		JobManagerPid:     job.JobManagerPid,
		JobManagerStartTs: job.JobManagerStartTs,
	}
	opts := &wshrpc.RpcOpts{
		Route:   wshutil.MakeConnectionRouteId(job.Connection),
		Timeout: 5000,
	}
	if rpcErr := wshclient.RemoteTerminateJobManagerCommand(client, request, opts); rpcErr != nil {
		log.Printf("[job:%s] error terminating job manager: %v", job.OID, rpcErr)
		return fmt.Errorf("failed to terminate job manager: %w", rpcErr)
	}

	var stored *waveobj.Job
	updateErr := wstore.DBUpdateFn(ctx, job.OID, func(j *waveobj.Job) {
		j.JobManagerStatus = JobManagerStatus_Done
		j.JobManagerDoneReason = JobDoneReason_Terminated
		j.TerminateOnReconnect = false
		// A stream that had not finished is closed out with an explanatory error.
		if !j.StreamDone {
			j.StreamDone = true
			j.StreamError = "job manager terminated"
		}
		stored = j
	})
	if updateErr == nil {
		sendBlockJobStatusEventByJob(ctx, stored)
	} else {
		log.Printf("[job:%s] error updating job status after termination: %v", job.OID, updateErr)
	}

	telemetry.GoRecordTEventWrap(&telemetrydata.TEvent{
		Event: "job:done",
		Props: telemetrydata.TEventProps{
			JobDoneReason: JobDoneReason_Terminated,
			JobKind:       job.JobKind,
		},
	})
	log.Printf("[job:%s] job manager terminated successfully", job.OID)
	return nil
}
// ReconnectJob reconnects to the job manager for jobId. Concurrent calls for
// the same job are collapsed into a single execution (using the context and
// runtime options of the call that actually runs) whose result is shared.
func ReconnectJob(ctx context.Context, jobId string, rtOpts *waveobj.RuntimeOpts) error {
	reconnect := func() (any, error) {
		reconnectErr := doReconnectJob(ctx, jobId, rtOpts)
		return nil, reconnectErr
	}
	_, err, _ := reconnectGroup.Do(jobId, reconnect)
	return err
}
// doReconnectJob performs one reconnect attempt to the job manager for jobId.
//
// It returns nil immediately when the job is already connected. Otherwise it
// requires the underlying connection to be up, and then either terminates the
// manager (when the job is flagged TerminateOnReconnect) or sends the remote
// reconnect command, waits up to two seconds for the job route to register,
// marks the job connected, and restarts output streaming.
//
// When the remote side reports that the job manager is gone, the job is
// marked done with reason JobDoneReason_Gone, a "[session gone]" termination
// message is written, and an error is returned.
//
// rtOpts may be nil, in which case the job's stored terminal size is used.
func doReconnectJob(ctx context.Context, jobId string, rtOpts *waveobj.RuntimeOpts) error {
	job, err := wstore.DBMustGet[*waveobj.Job](ctx, jobId)
	if err != nil {
		return fmt.Errorf("failed to get job: %w", err)
	}
	// A nil error here means both the connection and the job are connected.
	_, err = CheckJobConnected(ctx, jobId)
	if err == nil {
		log.Printf("[job:%s] already connected, skipping reconnect", jobId)
		return nil
	}
	log.Printf("[job:%s] not connected, proceeding with reconnect: %v", jobId, err)
	isConnected, err := conncontroller.IsConnected(job.Connection)
	if err != nil {
		return fmt.Errorf("error checking connection status: %w", err)
	}
	if !isConnected {
		return fmt.Errorf("connection %q is not connected", job.Connection)
	}
	// A termination requested earlier (see doTerminateJobManager) is carried out
	// now instead of reconnecting.
	if job.TerminateOnReconnect {
		return remoteTerminateJobManager(ctx, job)
	}
	if rtOpts == nil {
		rtOpts = &waveobj.RuntimeOpts{
			TermSize: job.CmdTermSize,
		}
	}
	bareRpc := wshclient.GetBareRpcClient()
	// A fresh access token is signed for every reconnect attempt.
	jobAccessClaims := &wavejwt.WaveJwtClaims{
		MainServer: true,
		JobId:      jobId,
	}
	jobAccessToken, err := wavejwt.Sign(jobAccessClaims)
	if err != nil {
		return fmt.Errorf("failed to generate job access token: %w", err)
	}
	reconnectData := wshrpc.CommandRemoteReconnectToJobManagerData{
		JobId:              jobId,
		JobAuthToken:       job.JobAuthToken,
		MainServerJwtToken: jobAccessToken,
		JobManagerPid:      job.JobManagerPid,
		JobManagerStartTs:  job.JobManagerStartTs,
	}
	rpcOpts := &wshrpc.RpcOpts{
		Route:   wshutil.MakeConnectionRouteId(job.Connection),
		Timeout: 5000,
	}
	log.Printf("[job:%s] sending RemoteReconnectToJobManagerCommand to connection %s", jobId, job.Connection)
	rtnData, err := wshclient.RemoteReconnectToJobManagerCommand(bareRpc, reconnectData, rpcOpts)
	if err != nil {
		log.Printf("[job:%s] RemoteReconnectToJobManagerCommand failed: %v", jobId, err)
		return fmt.Errorf("failed to reconnect to job manager: %w", err)
	}
	if !rtnData.Success {
		log.Printf("[job:%s] RemoteReconnectToJobManagerCommand returned error: %s", jobId, rtnData.Error)
		// The manager no longer exists remotely: record the job as done ("gone").
		if rtnData.JobManagerGone {
			var updatedJob *waveobj.Job
			updateErr := wstore.DBUpdateFn(ctx, jobId, func(job *waveobj.Job) {
				job.JobManagerStatus = JobManagerStatus_Done
				job.JobManagerDoneReason = JobDoneReason_Gone
				updatedJob = job
			})
			if updateErr != nil {
				log.Printf("[job:%s] error updating job manager running status: %v", jobId, updateErr)
			} else {
				sendBlockJobStatusEventByJob(ctx, updatedJob)
			}
			telemetry.GoRecordTEventWrap(&telemetrydata.TEvent{
				Event: "job:done",
				Props: telemetrydata.TEventProps{
					JobDoneReason: JobDoneReason_Gone,
					JobKind:       job.JobKind,
				},
			})
			// NOTE(review): updatedJob is nil here if the DB update failed; confirm
			// writeJobTerminationMessage tolerates a nil job.
			writeJobTerminationMessage(ctx, jobId, updatedJob, "[session gone]")
			return fmt.Errorf("job manager has exited: %s", rtnData.Error)
		}
		return fmt.Errorf("failed to reconnect to job manager: %s", rtnData.Error)
	}
	log.Printf("[job:%s] RemoteReconnectToJobManagerCommand succeeded, waiting for route", jobId)
	routeId := wshutil.MakeJobRouteId(jobId)
	// Wait at most two seconds (or until ctx ends) for the job route to register.
	waitCtx, cancelFn := context.WithTimeout(ctx, 2*time.Second)
	defer cancelFn()
	err = wshutil.DefaultRouter.WaitForRegister(waitCtx, routeId)
	if err != nil {
		return fmt.Errorf("route did not establish after successful reconnection: %w", err)
	}
	SetJobConnStatus(jobId, JobConnStatus_Connected)
	sendBlockJobStatusEventByJob(ctx, job)
	telemetry.GoRecordTEventWrap(&telemetrydata.TEvent{
		Event: "job:reconnect",
		Props: telemetrydata.TEventProps{
			JobKind: job.JobKind,
		},
	})
	log.Printf("[job:%s] route established, restarting streaming", jobId)
	return restartStreaming(ctx, jobId, true, rtOpts)
}
// ReconnectJobsForConn reconnects every job on connName whose job manager is
// marked running. It returns an error if the connection is not up or the jobs
// cannot be loaded. Per-job reconnect failures are only logged.
func ReconnectJobsForConn(ctx context.Context, connName string) error {
	connUp, err := conncontroller.IsConnected(connName)
	switch {
	case err != nil:
		return fmt.Errorf("error checking connection status: %w", err)
	case !connUp:
		return fmt.Errorf("connection %q is not connected", connName)
	}

	jobs, err := wstore.DBGetAllObjsByType[*waveobj.Job](ctx, waveobj.OType_Job)
	if err != nil {
		return fmt.Errorf("failed to get jobs: %w", err)
	}

	var pending []*waveobj.Job
	for _, job := range jobs {
		if job.Connection != connName || !isJobManagerRunning(job) {
			continue
		}
		pending = append(pending, job)
	}
	log.Printf("[conn:%s] found %d jobs to reconnect", connName, len(pending))

	for _, job := range pending {
		if reconnectErr := ReconnectJob(ctx, job.OID, nil); reconnectErr != nil {
			log.Printf("[job:%s] error reconnecting: %v", job.OID, reconnectErr)
		}
	}
	return nil
}
// restartStreaming (re)creates the output stream reader for jobId, negotiates
// the resume position with the job manager, and starts the background output
// loop.
//
// Parameters:
//   - ctx: context used for the DB, filestore and follow-up calls made here.
//     The output loop goroutine itself is started with context.Background().
//   - jobId: id of the job whose stream is restarted.
//   - knownConnected: when false, the function first verifies that the job's
//     connection is connected and that the job conn status is
//     JobConnStatus_Connected; when true these checks are skipped.
//   - rtOpts: optional runtime options. If it carries a terminal size with
//     positive rows and cols, that size is sent to the job manager and
//     persisted as the job's CmdTermSize (a persistence failure is only logged).
//
// The resume sequence number is the size of the job output file plus the
// "totalgap" value stored in its metadata; it is 0 when the file cannot be
// statted. If the job manager reports a higher sequence, the difference is
// added to the stored total gap and the reader is advanced.
//
// Returns nil when streaming was started, or when the job manager reports the
// stream as already done (in which case no output loop is started). Returns an
// error if the job cannot be loaded, the connectivity checks fail, or either
// the prepare-connect or start-stream command fails.
func restartStreaming(ctx context.Context, jobId string, knownConnected bool, rtOpts *waveobj.RuntimeOpts) error {
	job, err := wstore.DBMustGet[*waveobj.Job](ctx, jobId)
	if err != nil {
		return fmt.Errorf("failed to get job: %w", err)
	}

	// Prefer a valid caller-supplied terminal size over the stored one and
	// persist it; persisting is best-effort.
	termSize := job.CmdTermSize
	if rtOpts != nil && rtOpts.TermSize.Rows > 0 && rtOpts.TermSize.Cols > 0 {
		termSize = rtOpts.TermSize
		err = wstore.DBUpdateFn(ctx, jobId, func(job *waveobj.Job) {
			job.CmdTermSize = termSize
		})
		if err != nil {
			log.Printf("[job:%s] warning: failed to update termsize in DB: %v", jobId, err)
		}
	}

	if !knownConnected {
		isConnected, err := conncontroller.IsConnected(job.Connection)
		if err != nil {
			return fmt.Errorf("error checking connection status: %w", err)
		}
		if !isConnected {
			return fmt.Errorf("connection %q is not connected", job.Connection)
		}
		jobConnStatus := GetJobConnStatus(jobId)
		if jobConnStatus != JobConnStatus_Connected {
			return fmt.Errorf("job manager is not connected (status: %s)", jobConnStatus)
		}
	}

	// Work out where to resume from. fileSize is kept in its own variable
	// because waveFile must not be touched when Stat returned an error (for
	// example when the output file does not exist yet); in that case all three
	// values stay 0.
	var currentSeq int64
	var totalGap int64
	var fileSize int64
	waveFile, err := filestore.WFS.Stat(ctx, jobId, JobOutputFileName)
	if err == nil {
		fileSize = waveFile.Size
		totalGap = getMetaInt64(waveFile.Meta, MetaKey_TotalGap)
		currentSeq = fileSize + totalGap
	}

	bareRpc := wshclient.GetBareRpcClient()
	broker := bareRpc.StreamBroker
	readerRouteId := wshclient.GetBareRpcClientRouteId()
	writerRouteId := wshutil.MakeJobRouteId(jobId)
	reader, streamMeta := broker.CreateStreamReaderWithSeq(readerRouteId, writerRouteId, DefaultStreamRwnd, currentSeq)
	jobStreamIds.Set(jobId, streamMeta.Id)

	prepareData := wshrpc.CommandJobPrepareConnectData{
		StreamMeta: *streamMeta,
		Seq:        currentSeq,
		TermSize:   termSize,
	}
	rpcOpts := &wshrpc.RpcOpts{
		Route:   wshutil.MakeJobRouteId(jobId),
		Timeout: 5000, // NOTE(review): presumably milliseconds — confirm against wshrpc.RpcOpts
	}
	log.Printf("[job:%s] sending JobPrepareConnectCommand with seq=%d (fileSize=%d, totalGap=%d)", jobId, currentSeq, fileSize, totalGap)
	rtnData, err := wshclient.JobPrepareConnectCommand(bareRpc, prepareData, rpcOpts)
	if err != nil {
		reader.Close()
		return fmt.Errorf("failed to prepare connect: %w", err)
	}

	if rtnData.HasExited {
		exitCodeStr := "nil"
		if rtnData.ExitCode != nil {
			exitCodeStr = fmt.Sprintf("%d", *rtnData.ExitCode)
		}
		log.Printf("[job:%s] job has already exited: code=%s signal=%q err=%q", jobId, exitCodeStr, rtnData.ExitSignal, rtnData.ExitErr)
		// The reply carries no exit timestamp here, so the current time is used.
		exitData := wshrpc.CommandJobCmdExitedData{
			ExitCode:   rtnData.ExitCode,
			ExitSignal: rtnData.ExitSignal,
			ExitErr:    rtnData.ExitErr,
			ExitTs:     time.Now().UnixMilli(),
		}
		HandleCmdJobExited(ctx, jobId, exitData)
	}

	if rtnData.StreamDone {
		log.Printf("[job:%s] stream is already done: error=%q", jobId, rtnData.StreamError)
		// Only record the stream result if it has not been recorded before.
		updateErr := wstore.DBUpdateFn(ctx, jobId, func(job *waveobj.Job) {
			if !job.StreamDone {
				job.StreamDone = true
				if rtnData.StreamError != "" {
					job.StreamError = rtnData.StreamError
				}
			}
		})
		if updateErr != nil {
			log.Printf("[job:%s] error updating job stream status: %v", jobId, updateErr)
		}
	}

	if rtnData.StreamDone && rtnData.HasExited {
		reader.Close()
		log.Printf("[job:%s] both stream done and job exited, calling tryExitJobManager", jobId)
		tryTerminateJobManager(ctx, jobId)
		return nil
	}
	if rtnData.StreamDone {
		reader.Close()
		log.Printf("[job:%s] stream already done, no need to restart streaming", jobId)
		return nil
	}

	// The job manager is ahead of what we have stored: record the skipped
	// amount in the file metadata and move the reader forward to its sequence.
	if rtnData.Seq > currentSeq {
		gap := rtnData.Seq - currentSeq
		totalGap += gap
		log.Printf("[job:%s] detected gap: our seq=%d, server seq=%d, gap=%d, new totalGap=%d", jobId, currentSeq, rtnData.Seq, gap, totalGap)
		metaErr := filestore.WFS.WriteMeta(ctx, jobId, JobOutputFileName, wshrpc.FileMeta{
			MetaKey_TotalGap: totalGap,
		}, true)
		if metaErr != nil {
			log.Printf("[job:%s] error updating totalgap metadata: %v", jobId, metaErr)
		}
		reader.UpdateNextSeq(rtnData.Seq)
	}

	log.Printf("[job:%s] sending JobStartStreamCommand", jobId)
	startStreamData := wshrpc.CommandJobStartStreamData{}
	err = wshclient.JobStartStreamCommand(bareRpc, startStreamData, rpcOpts)
	if err != nil {
		reader.Close()
		return fmt.Errorf("failed to start stream: %w", err)
	}

	// The output loop runs on a background context rather than ctx.
	go func() {
		defer func() {
			panichandler.PanicHandler("jobcontroller:RestartStreaming:runOutputLoop", recover())
		}()
		runOutputLoop(context.Background(), jobId, streamMeta.Id, reader)
	}()
	log.Printf("[job:%s] streaming restarted successfully", jobId)
	return nil
}
// IsBlockTermDurable reports whether block is a durable terminal block.
//
// this function must be kept up to date with getBlockTermDurableAtom in frontend/app/store/global.ts
//
// Decision order:
//  1. nil blocks, and blocks whose view is not "term" or whose controller is
//     not "shell", are never durable.
//  2. A block that already has a JobId is durable.
//  3. Local and WSL connections are not durable.
//  4. Otherwise the first explicit setting wins: block meta, then the
//     connection config, then the global settings.
//  5. With no explicit setting the result is true.
func IsBlockTermDurable(block *waveobj.Block) bool {
	if block == nil {
		return false
	}
	if block.Meta.GetString(waveobj.MetaKey_View, "") != "term" {
		return false
	}
	if block.Meta.GetString(waveobj.MetaKey_Controller, "") != "shell" {
		return false
	}
	if block.JobId != "" {
		return true
	}

	connName := block.Meta.GetString(waveobj.MetaKey_Connection, "")
	if conncontroller.IsLocalConnName(connName) || conncontroller.IsWslConnName(connName) {
		return false
	}

	// Block-level override; ignored unless it is present and holds a bool.
	if durable, isBool := block.Meta[waveobj.MetaKey_TermDurable].(bool); isBool {
		return durable
	}

	fullConfig := wconfig.GetWatcher().GetFullConfig()

	// Connection-level override.
	if connName != "" {
		if connConfig, found := fullConfig.Connections[connName]; found && connConfig.TermDurable != nil {
			return *connConfig.TermDurable
		}
	}

	// Global setting, falling back to true for non-local connections.
	if globalDurable := fullConfig.Settings.TermDurable; globalDurable != nil {
		return *globalDurable
	}
	return true
}
// IsBlockIdTermDurable loads the block with the given id (using a background
// context) and reports IsBlockTermDurable for it. It returns false when the
// lookup fails or yields no block.
func IsBlockIdTermDurable(blockId string) bool {
	blk, lookupErr := wstore.DBGet[*waveobj.Block](context.Background(), blockId)
	if lookupErr != nil {
		return false
	}
	if blk == nil {
		return false
	}
	return IsBlockTermDurable(blk)
}
// DeleteJob removes a job and its associated state.
//
// It sets the job conn status to JobConnStatus_Disconnected, drops the
// termination-message flag for the job, deletes the job's WaveFS zone (a
// failure here is logged as a warning and does not abort), and finally deletes
// the job object from the DB, returning the result of that delete.
func DeleteJob(ctx context.Context, jobId string) error {
	SetJobConnStatus(jobId, JobConnStatus_Disconnected)
	jobTerminationMessageWritten.Delete(jobId)

	if zoneErr := filestore.WFS.DeleteZone(ctx, jobId); zoneErr != nil {
		log.Printf("[job:%s] warning: error deleting WaveFS zone: %v", jobId, zoneErr)
	}

	return wstore.DBDelete(ctx, waveobj.OType_Job, jobId)
}
// AttachJobToBlock links jobId and blockId to each other.
//
// Inside a single wstore.WithTx call it:
//  1. sets the block's JobId to jobId, remembering the previous value;
//  2. if the block previously pointed at a different job, clears that job's
//     AttachedBlockId when it still refers to this block (a failure is only
//     logged);
//  3. sets the job's AttachedBlockId to blockId, failing if the job is already
//     attached to a different block.
//
// On success it calls SendBlockJobStatusEvent and sends a wave object update
// for the block. Any error from the transaction is returned as-is.
func AttachJobToBlock(ctx context.Context, jobId string, blockId string) error {
	attach := func(tx *wstore.TxWrap) error {
		var prevJobId string
		blockErr := wstore.DBUpdateFn(tx.Context(), blockId, func(block *waveobj.Block) {
			prevJobId = block.JobId
			block.JobId = jobId
		})
		if blockErr != nil {
			return fmt.Errorf("failed to update block: %w", blockErr)
		}

		// Detach whatever job the block was pointing at before, best-effort.
		if prevJobId != "" && prevJobId != jobId {
			detachErr := wstore.DBUpdateFn(tx.Context(), prevJobId, func(oldJob *waveobj.Job) {
				if oldJob.AttachedBlockId == blockId {
					oldJob.AttachedBlockId = ""
				}
			})
			if detachErr != nil {
				log.Printf("[job:%s] warning: could not detach old job: %v", prevJobId, detachErr)
			}
		}

		jobErr := wstore.DBUpdateFnErr(tx.Context(), jobId, func(job *waveobj.Job) error {
			if job.AttachedBlockId != "" && job.AttachedBlockId != blockId {
				return fmt.Errorf("job %s already attached to block %s", jobId, job.AttachedBlockId)
			}
			job.AttachedBlockId = blockId
			return nil
		})
		if jobErr != nil {
			return fmt.Errorf("failed to update job: %w", jobErr)
		}

		log.Printf("[job:%s] attached to block:%s", jobId, blockId)
		return nil
	}

	if txErr := wstore.WithTx(ctx, attach); txErr != nil {
		return txErr
	}

	SendBlockJobStatusEvent(ctx, blockId)
	wcore.SendWaveObjUpdate(waveobj.MakeORef(waveobj.OType_Block, blockId))
	return nil
}
// DetachJobFromBlock clears the job's AttachedBlockId and, when updateBlock is
// true, also clears the JobId on the block it was attached to.
//
// The work runs inside a single wstore.WithTx call. If the job has no attached
// block the function is a no-op and returns nil. Clearing the block's JobId is
// best-effort: it is skipped when the block cannot be loaded, and an update
// failure is only logged. A failure to load or update the job is returned.
//
// After a successful detach it calls SendBlockJobStatusEvent for the block,
// and sends a wave object update for the block only if its JobId was actually
// cleared.
func DetachJobFromBlock(ctx context.Context, jobId string, updateBlock bool) error {
	var (
		blockId      string
		blockUpdated bool
	)

	detach := func(tx *wstore.TxWrap) error {
		job, getErr := wstore.DBMustGet[*waveobj.Job](tx.Context(), jobId)
		if getErr != nil {
			return fmt.Errorf("failed to get job: %w", getErr)
		}
		blockId = job.AttachedBlockId
		if blockId == "" {
			return nil
		}

		if updateBlock {
			// Only touch the block if it still exists and can be read.
			if blk, blkErr := wstore.DBGet[*waveobj.Block](tx.Context(), blockId); blkErr == nil && blk != nil {
				clearErr := wstore.DBUpdateFn(tx.Context(), blockId, func(block *waveobj.Block) {
					block.JobId = ""
				})
				if clearErr == nil {
					blockUpdated = true
				} else {
					log.Printf("[job:%s] warning: failed to clear JobId from block:%s: %v", jobId, blockId, clearErr)
				}
			}
		}

		jobErr := wstore.DBUpdateFn(tx.Context(), jobId, func(job *waveobj.Job) {
			job.AttachedBlockId = ""
		})
		if jobErr != nil {
			return fmt.Errorf("failed to update job: %w", jobErr)
		}

		log.Printf("[job:%s] detached from block:%s", jobId, blockId)
		return nil
	}

	if txErr := wstore.WithTx(ctx, detach); txErr != nil {
		return txErr
	}
	if blockId == "" {
		return nil
	}

	SendBlockJobStatusEvent(ctx, blockId)
	if blockUpdated {
		wcore.SendWaveObjUpdate(waveobj.MakeORef(waveobj.OType_Block, blockId))
	}
	return nil
}
// SendInput forwards data to the job manager of data.JobId via
// JobInputCommand.
//
// If data carries a terminal size it is first persisted as the job's
// CmdTermSize; a failure to persist is logged as a warning and does not stop
// the send. The job must pass CheckJobConnected, otherwise that error is
// returned unchanged. A failure of the RPC itself is returned wrapped.
func SendInput(ctx context.Context, data wshrpc.CommandJobInputData) error {
	jobId := data.JobId

	if newSize := data.TermSize; newSize != nil {
		sizeErr := wstore.DBUpdateFn(ctx, jobId, func(job *waveobj.Job) {
			job.CmdTermSize = *newSize
		})
		if sizeErr != nil {
			log.Printf("[job:%s] warning: failed to update termsize in DB: %v", jobId, sizeErr)
		}
	}

	if _, connErr := CheckJobConnected(ctx, jobId); connErr != nil {
		return connErr
	}

	client := wshclient.GetBareRpcClient()
	opts := &wshrpc.RpcOpts{
		Route:      wshutil.MakeJobRouteId(jobId),
		Timeout:    5000,
		NoResponse: false,
	}
	if sendErr := wshclient.JobInputCommand(client, data, opts); sendErr != nil {
		return fmt.Errorf("failed to send input to job: %w", sendErr)
	}
	return nil
}
// resetTerminalState appends the terminal reset sequence (followed by "\r\n")
// to the block's output file.
//
// Nothing is written when blockId is empty or when isFileEmpty reports the
// block's output file as empty. logCtx is used only for the blocklogger debug
// message; the file operations run on a fresh context bounded by
// DefaultTimeout. An append failure is logged and otherwise ignored.
func resetTerminalState(logCtx context.Context, blockId string) {
	if blockId == "" {
		return
	}

	opCtx, cancel := context.WithTimeout(context.Background(), DefaultTimeout)
	defer cancel()

	if isFileEmpty(opCtx, blockId) {
		return
	}

	blocklogger.Debugf(logCtx, "[conndebug] resetTerminalState: resetting terminal state for block\n")
	payload := []byte(shellutil.GetTerminalResetSeq() + "\r\n")
	blockRef := waveobj.MakeORef(waveobj.OType_Block, blockId)
	if appendErr := doWFSAppend(opCtx, blockRef, JobOutputFileName, payload); appendErr != nil {
		log.Printf("error appending terminal reset to block file: %v\n", appendErr)
	}
}
// isFileEmpty reports whether the output file of blockId should be treated as
// empty.
//
// It returns true for an empty blockId, when the file does not exist, and when
// Stat fails for any other reason (that failure is logged). Otherwise it
// returns whether the file size is zero.
func isFileEmpty(ctx context.Context, blockId string) bool {
	if blockId == "" {
		return true
	}

	info, statErr := filestore.WFS.Stat(ctx, blockId, JobOutputFileName)
	switch {
	case statErr == fs.ErrNotExist:
		return true
	case statErr != nil:
		log.Printf("error statting block output file: %v\n", statErr)
		return true
	}
	return info.Size == 0
}
// writeSessionSeparatorToTerminal appends a "\r\n" separator to the block's
// output file.
//
// Nothing is written when blockId is empty or when isFileEmpty reports the
// output file as empty. The operation runs on a fresh context bounded by
// DefaultTimeout; an append failure is logged and otherwise ignored.
// termWidth is currently not used by the implementation.
func writeSessionSeparatorToTerminal(blockId string, termWidth int) {
	if blockId == "" {
		return
	}

	opCtx, cancel := context.WithTimeout(context.Background(), DefaultTimeout)
	defer cancel()

	if isFileEmpty(opCtx, blockId) {
		return
	}

	blockRef := waveobj.MakeORef(waveobj.OType_Block, blockId)
	if appendErr := doWFSAppend(opCtx, blockRef, JobOutputFileName, []byte("\r\n")); appendErr != nil {
		log.Printf("error writing session separator to terminal (blockid=%s): %v", blockId, appendErr)
	}
}
// writeMutedMessageToTerminal appends msg to the block's output file, wrapped
// in the escape sequences "\x1b[90m" ... "\x1b[0m" and terminated with "\r\n".
//
// msg should not have a terminating newline; one is added here. Nothing is
// written when blockId is empty. The append runs on a fresh context bounded by
// DefaultTimeout, and a failure is logged and otherwise ignored.
func writeMutedMessageToTerminal(blockId string, msg string) {
	if blockId == "" {
		return
	}

	const (
		mutedPrefix = "\x1b[90m"
		mutedSuffix = "\x1b[0m\r\n"
	)

	opCtx, cancel := context.WithTimeout(context.Background(), DefaultTimeout)
	defer cancel()

	blockRef := waveobj.MakeORef(waveobj.OType_Block, blockId)
	payload := []byte(mutedPrefix + msg + mutedSuffix)
	if appendErr := doWFSAppend(opCtx, blockRef, JobOutputFileName, payload); appendErr != nil {
		log.Printf("error writing muted message to terminal (blockid=%s): %v", blockId, appendErr)
	}
}
// writeJobTerminationMessage writes msg to the terminal of the block attached
// to job, preceded by a terminal state reset.
//
// It does nothing when job is nil. The write is guarded by the
// jobTerminationMessageWritten entry for jobId: the TestAndSet predicate only
// passes when no entry exists yet or the stored value is false, and the reset
// and message are written only when TestAndSet returns true.
func writeJobTerminationMessage(ctx context.Context, jobId string, job *waveobj.Job, msg string) {
	if job == nil {
		return
	}

	notYetWritten := func(written bool, present bool) bool {
		return !(present && written)
	}
	if !jobTerminationMessageWritten.TestAndSet(jobId, true, notYetWritten) {
		return
	}

	attachedBlockId := job.AttachedBlockId
	resetTerminalState(ctx, attachedBlockId)
	writeMutedMessageToTerminal(attachedBlockId, msg)
}