From a06603b816cd96557e1dc39c6db67dd8a69bc782 Mon Sep 17 00:00:00 2001 From: Magnus Jensen Date: Tue, 22 Jul 2025 18:19:44 +0200 Subject: [PATCH] Fix flaky test async last seen by using channel sync instead of time sleep (#31128) Attempt at fix: #24652 I used randokiller with up to 100 and 150 retries, where sometimes it failed after 89 attempts, to verify the flakiness. I tried a couple of approaches, with one being just upping the time.sleep to 200ms still without luck. I then opted for a _sync_ channel that blocks and waits for the timed flush to execute which allows it to go further and with a timeout that fails the test if a deadlock is caused. I looked into [synctest](https://pkg.go.dev/testing/synctest), but that is still experimental and expected to become non-experimental in 1.26, so not really usable for now, but sounds like it will help in time/timer based tests that are a bit flaky. Successful randokiller run: https://github.com/fleetdm/fleet/actions/runs/16446175657/job/46478695973 where retries is set to 100 using the channel approach here. _I do feel it's a bit weird that the last test `cap and timed flush` does not have the same timer issues, might be due to the setup of only having one item before each sleep, where if it had multiple it might be suspect to the same time issues seen in this test `timed flush`_ --- .../mdm/nanomdm/storage/mysql/async_test.go | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/server/mdm/nanomdm/storage/mysql/async_test.go b/server/mdm/nanomdm/storage/mysql/async_test.go index 7c58fa21c7..1d13cc71d3 100644 --- a/server/mdm/nanomdm/storage/mysql/async_test.go +++ b/server/mdm/nanomdm/storage/mysql/async_test.go @@ -11,7 +11,6 @@ import ( ) func TestAsyncLastSeen(t *testing.T) { - t.Skip("Skipping flaky test. Re-enable after fixing #24652") t.Parallel() runLoopAndWait := func(t *testing.T, als *asyncLastSeen) (ctx context.Context, stop func()) { @@ -52,6 +51,7 @@ func TestAsyncLastSeen(t *testing.T) { var mu sync.Mutex var gotIDs []string + flushCh := make(chan bool, 3) // We expect 3 flushes in this test als := newAsyncLastSeen(10*time.Millisecond, 10, func(ctx context.Context, ids []string) { mu.Lock() defer mu.Unlock() @@ -61,20 +61,22 @@ func TestAsyncLastSeen(t *testing.T) { gotIDs = append(gotIDs, "|") } gotIDs = append(gotIDs, ids...) + flushCh <- true }) ctx, stop := runLoopAndWait(t, als) als.markHostSeen(ctx, "1") als.markHostSeen(ctx, "2") - time.Sleep(100 * time.Millisecond) // oversleep to avoid slow timers issues on CI + waitForFlushChannel(t, flushCh, 500*time.Millisecond) als.markHostSeen(ctx, "3") - time.Sleep(100 * time.Millisecond) // oversleep to avoid slow timers issues on CI + waitForFlushChannel(t, flushCh, 500*time.Millisecond) als.markHostSeen(ctx, "4") als.markHostSeen(ctx, "5") als.markHostSeen(ctx, "6") - time.Sleep(100 * time.Millisecond) // oversleep to avoid slow timers issues on CI + waitForFlushChannel(t, flushCh, 500*time.Millisecond) stop() + close(flushCh) mu.Lock() defer mu.Unlock() @@ -146,3 +148,12 @@ func TestAsyncLastSeen(t *testing.T) { require.Equal(t, "123|4|5|6", strings.Join(gotIDs, "")) }) } + +func waitForFlushChannel(t *testing.T, ch <-chan bool, timeout time.Duration) { + select { + case <-ch: + // ok + case <-time.After(timeout): + t.Fatal("timeout waiting for flush channel") + } +}