Fix flaky test async last seen by using channel sync instead of time sleep (#31128)

Attempt at fix: #24652

I used randokiller with up to 100 and 150 retries, where sometimes it
failed after 89 attempts, to verify the flakiness.

I tried a couple of approaches, with one being just upping the
time.sleep to 200ms still without luck.
I then opted for a _sync_ channel that blocks and waits for the timed
flush to execute which allows it to go further and with a timeout that
fails the test if a deadlock is caused.

I looked into [synctest](https://pkg.go.dev/testing/synctest), but that
is still experimental and expected to become non-experimental in 1.26,
so not really usable for now, but sounds like it will help in time/timer
based tests that are a bit flaky.

Successful randokiller run:
https://github.com/fleetdm/fleet/actions/runs/16446175657/job/46478695973
where retries is set to 100 using the channel approach here.

_I do feel it's a bit weird that the last test `cap and timed flush`
does not have the same timer issues, might be due to the setup of only
having one item before each sleep, where if it had multiple it might be
suspect to the same time issues seen in this test `timed flush`_
This commit is contained in:
Magnus Jensen 2025-07-22 18:19:44 +02:00 committed by GitHub
parent a87ec09e16
commit a06603b816
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -11,7 +11,6 @@ import (
)
func TestAsyncLastSeen(t *testing.T) {
t.Skip("Skipping flaky test. Re-enable after fixing #24652")
t.Parallel()
runLoopAndWait := func(t *testing.T, als *asyncLastSeen) (ctx context.Context, stop func()) {
@ -52,6 +51,7 @@ func TestAsyncLastSeen(t *testing.T) {
var mu sync.Mutex
var gotIDs []string
flushCh := make(chan bool, 3) // We expect 3 flushes in this test
als := newAsyncLastSeen(10*time.Millisecond, 10, func(ctx context.Context, ids []string) {
mu.Lock()
defer mu.Unlock()
@ -61,20 +61,22 @@ func TestAsyncLastSeen(t *testing.T) {
gotIDs = append(gotIDs, "|")
}
gotIDs = append(gotIDs, ids...)
flushCh <- true
})
ctx, stop := runLoopAndWait(t, als)
als.markHostSeen(ctx, "1")
als.markHostSeen(ctx, "2")
time.Sleep(100 * time.Millisecond) // oversleep to avoid slow timers issues on CI
waitForFlushChannel(t, flushCh, 500*time.Millisecond)
als.markHostSeen(ctx, "3")
time.Sleep(100 * time.Millisecond) // oversleep to avoid slow timers issues on CI
waitForFlushChannel(t, flushCh, 500*time.Millisecond)
als.markHostSeen(ctx, "4")
als.markHostSeen(ctx, "5")
als.markHostSeen(ctx, "6")
time.Sleep(100 * time.Millisecond) // oversleep to avoid slow timers issues on CI
waitForFlushChannel(t, flushCh, 500*time.Millisecond)
stop()
close(flushCh)
mu.Lock()
defer mu.Unlock()
@ -146,3 +148,12 @@ func TestAsyncLastSeen(t *testing.T) {
require.Equal(t, "123|4|5|6", strings.Join(gotIDs, ""))
})
}
func waitForFlushChannel(t *testing.T, ch <-chan bool, timeout time.Duration) {
select {
case <-ch:
// ok
case <-time.After(timeout):
t.Fatal("timeout waiting for flush channel")
}
}