From a06603b816cd96557e1dc39c6db67dd8a69bc782 Mon Sep 17 00:00:00 2001
From: Magnus Jensen <magnus@fleetdm.com>
Date: Tue, 22 Jul 2025 18:19:44 +0200
Subject: [PATCH] Fix flaky test async last seen by using channel sync instead
 of time sleep (#31128)

Attempt at fix: #24652

I used randokiller with up to 100 and 150 retries, where sometimes it
failed after 89 attempts, to verify the flakiness.

I tried a couple of approaches, with one being just upping the
time.sleep to 200ms still without luck.
I then opted for a _sync_ channel that blocks and waits for the timed
flush to execute which allows it to go further and with a timeout that
fails the test if a deadlock is caused.

I looked into [synctest](https://pkg.go.dev/testing/synctest), but that
is still experimental and expected to become non-experimental in 1.26,
so not really usable for now, but sounds like it will help in time/timer
based tests that are a bit flaky.

Successful randokiller run:
https://github.com/fleetdm/fleet/actions/runs/16446175657/job/46478695973
where retries is set to 100 using the channel approach here.

_I do feel it's a bit weird that the last test `cap and timed flush`
does not have the same timer issues, might be due to the setup of only
having one item before each sleep, where if it had multiple it might be
suspect to the same time issues seen in this test `timed flush`_
---
 .../mdm/nanomdm/storage/mysql/async_test.go   | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/server/mdm/nanomdm/storage/mysql/async_test.go b/server/mdm/nanomdm/storage/mysql/async_test.go
index 7c58fa21c7..1d13cc71d3 100644
--- a/server/mdm/nanomdm/storage/mysql/async_test.go
+++ b/server/mdm/nanomdm/storage/mysql/async_test.go
@@ -11,7 +11,6 @@ import (
 )
 
 func TestAsyncLastSeen(t *testing.T) {
-	t.Skip("Skipping flaky test. Re-enable after fixing #24652")
 	t.Parallel()
 
 	runLoopAndWait := func(t *testing.T, als *asyncLastSeen) (ctx context.Context, stop func()) {
@@ -52,6 +51,7 @@ func TestAsyncLastSeen(t *testing.T) {
 
 		var mu sync.Mutex
 		var gotIDs []string
+		flushCh := make(chan bool, 3) // We expect 3 flushes in this test
 		als := newAsyncLastSeen(10*time.Millisecond, 10, func(ctx context.Context, ids []string) {
 			mu.Lock()
 			defer mu.Unlock()
@@ -61,20 +61,22 @@ func TestAsyncLastSeen(t *testing.T) {
 				gotIDs = append(gotIDs, "|")
 			}
 			gotIDs = append(gotIDs, ids...)
+			flushCh <- true
 		})
 		ctx, stop := runLoopAndWait(t, als)
 
 		als.markHostSeen(ctx, "1")
 		als.markHostSeen(ctx, "2")
-		time.Sleep(100 * time.Millisecond) // oversleep to avoid slow timers issues on CI
+		waitForFlushChannel(t, flushCh, 500*time.Millisecond)
 		als.markHostSeen(ctx, "3")
-		time.Sleep(100 * time.Millisecond) // oversleep to avoid slow timers issues on CI
+		waitForFlushChannel(t, flushCh, 500*time.Millisecond)
 		als.markHostSeen(ctx, "4")
 		als.markHostSeen(ctx, "5")
 		als.markHostSeen(ctx, "6")
-		time.Sleep(100 * time.Millisecond) // oversleep to avoid slow timers issues on CI
+		waitForFlushChannel(t, flushCh, 500*time.Millisecond)
 
 		stop()
+		close(flushCh)
 
 		mu.Lock()
 		defer mu.Unlock()
@@ -146,3 +148,12 @@ func TestAsyncLastSeen(t *testing.T) {
 		require.Equal(t, "123|4|5|6", strings.Join(gotIDs, ""))
 	})
 }
+
+func waitForFlushChannel(t *testing.T, ch <-chan bool, timeout time.Duration) {
+	select {
+	case <-ch:
+		// ok
+	case <-time.After(timeout):
+		t.Fatal("timeout waiting for flush channel")
+	}
+}