Added automatic Android cert retry (#42734)

<!-- Add the related story/sub-task/bug number, like Resolves #123, or
remove if NA -->
**Related issue:** Resolves #37546

Docs: https://github.com/fleetdm/fleet/pull/42780
Demo: https://www.youtube.com/watch?v=K44wRg9_79M

# Checklist for submitter

If some of the following don't apply, delete the relevant line.

- [x] Changes file added for user-visible changes in `changes/`,
`orbit/changes/` or `ee/fleetd-chrome/changes`.
See [Changes
files](https://github.com/fleetdm/fleet/blob/main/docs/Contributing/guides/committing-changes.md#changes-files)
for more information.

## Testing

- [x] Added/updated automated tests
- [x] QA'd all new/changed functionality manually

## Database migrations

- [x] Checked schema for all modified tables for columns that will
auto-update timestamps during migration.

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

* **New Features**
* Automatic retry for Android certificate installations: failed installs
are retried up to 3 times before being marked terminal.
* Installation activities recorded: install/failed-install events (with
details) are logged for better visibility and troubleshooting.
* Resend/reset actions now reset retry state so retries behave
predictably after manual resend.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
This commit is contained in:
Victor Lyuboslavsky 2026-04-01 13:49:24 -05:00 committed by GitHub
parent e7d001414d
commit f8e5a5dc2d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
11 changed files with 279 additions and 13 deletions

View file

@ -1,2 +1,3 @@
* Added activity logging when a certificate is installed or fails to install on an Android host.
* Enabled the host activity card on the Android host details page.
- Added automatic retry (up to 3 times) when the Android agent reports a certificate install failure.
- Added activity logging when a certificate is installed or fails to install on an Android host.
- Enabled the host activity card on the Android host details page.

View file

@ -402,8 +402,11 @@ func (ds *Datastore) CreatePendingCertificateTemplatesForNewHost(
return result.RowsAffected()
}
// ResendHostCertificateTemplate resets a certificate template for re-delivery. It sets retry_count
// to MaxCertificateInstallRetries so that the next failure is terminal with no automatic retry,
// giving the resend exactly one attempt. This matches Apple resend behavior.
func (ds *Datastore) ResendHostCertificateTemplate(ctx context.Context, hostID uint, templateID uint) error {
const stmt = `
stmt := fmt.Sprintf(`
UPDATE
host_certificate_templates hct
INNER JOIN
@ -415,11 +418,12 @@ func (ds *Datastore) ResendHostCertificateTemplate(ctx context.Context, hostID u
hct.not_valid_after = NULL,
hct.serial = NULL,
hct.detail = NULL,
hct.retry_count = %d,
hct.status = ?
WHERE
h.id = ? AND
hct.certificate_template_id = ?
`
`, fleet.MaxCertificateInstallRetries)
const deleteChallenge = `
DELETE c FROM

View file

@ -155,7 +155,8 @@ func (ds *Datastore) GetHostCertificateTemplateRecord(ctx context.Context, hostU
updated_at,
not_valid_before,
not_valid_after,
serial
serial,
retry_count
FROM host_certificate_templates
WHERE host_uuid = ? AND certificate_template_id = ?
`
@ -171,6 +172,40 @@ func (ds *Datastore) GetHostCertificateTemplateRecord(ctx context.Context, hostU
return &result, nil
}
// RetryHostCertificateTemplate puts the host_certificate_templates record identified by hostUUID
// and certificateTemplateID back into the pending state so the install is attempted again.
// Inside ds.withTx it:
//   - deletes the challenges row that matches the record's current fleet_challenge,
//   - sets status to fleet.CertificateTemplatePending and increments retry_count by one,
//   - stores detail (the error reported for the failed attempt) in the detail column,
//   - sets fleet_challenge, not_valid_before, not_valid_after and serial to NULL and assigns a
//     newly generated uuid.
//
// The number of affected rows is not checked, so a call that matches no record returns nil.
func (ds *Datastore) RetryHostCertificateTemplate(ctx context.Context, hostUUID string, certificateTemplateID uint, detail string) error {
return ds.withTx(ctx, func(tx sqlx.ExtContext) error {
// Delete associated challenges. This has to run before the UPDATE below: the join finds the
// challenge through hct.fleet_challenge, which the UPDATE sets to NULL.
_, err := tx.ExecContext(ctx, `
DELETE c FROM challenges c
INNER JOIN host_certificate_templates hct ON hct.fleet_challenge = c.challenge
WHERE hct.host_uuid = ? AND hct.certificate_template_id = ?
`, hostUUID, certificateTemplateID)
if err != nil {
return ctxerr.Wrap(ctx, err, "delete challenges for certificate retry")
}
// Reset to pending, increment retry_count, preserve error detail, clear cert fields.
// The status value is formatted into the statement from the fleet.CertificateTemplatePending
// constant; detail, hostUUID and certificateTemplateID are bound as query parameters.
_, err = tx.ExecContext(ctx, fmt.Sprintf(`
UPDATE host_certificate_templates
SET status = '%s',
retry_count = retry_count + 1,
detail = ?,
fleet_challenge = NULL,
uuid = UUID_TO_BIN(UUID(), true),
not_valid_before = NULL,
not_valid_after = NULL,
serial = NULL
WHERE host_uuid = ? AND certificate_template_id = ?
`, fleet.CertificateTemplatePending), detail, hostUUID, certificateTemplateID)
if err != nil {
return ctxerr.Wrap(ctx, err, "retry certificate install")
}
return nil
})
}
// BulkInsertHostCertificateTemplates inserts multiple host_certificate_templates records
func (ds *Datastore) BulkInsertHostCertificateTemplates(ctx context.Context, hostCertTemplates []fleet.HostCertificateTemplate) error {
if len(hostCertTemplates) == 0 {
@ -660,6 +695,7 @@ func (ds *Datastore) SetAndroidCertificateTemplatesForRenewal(
UPDATE host_certificate_templates
SET
status = '%s',
retry_count = 0,
uuid = UUID_TO_BIN(UUID(), true),
not_valid_before = NULL,
not_valid_after = NULL,

View file

@ -41,6 +41,7 @@ func TestHostCertificateTemplates(t *testing.T) {
{"GetAndroidCertificateTemplatesForRenewal", testGetAndroidCertificateTemplatesForRenewal},
{"SetAndroidCertificateTemplatesForRenewal", testSetAndroidCertificateTemplatesForRenewal},
{"GetOrCreateFleetChallengeForCertificateTemplate", testGetOrCreateFleetChallengeForCertificateTemplate},
{"RetryHostCertificateTemplate", testRetryHostCertificateTemplate},
}
for _, c := range cases {
@ -2007,3 +2008,70 @@ func testGetOrCreateFleetChallengeForCertificateTemplate(t *testing.T, ds *Datas
require.Equal(t, 1, count)
})
}
// testRetryHostCertificateTemplate verifies that RetryHostCertificateTemplate moves a delivered
// record back to pending, increments retry_count on every call, stores the most recent error
// detail, leaves not_valid_before/not_valid_after/serial unset, and deletes the matching
// challenges row.
func testRetryHostCertificateTemplate(t *testing.T, ds *Datastore) {
	ctx := t.Context()
	setup := createCertTemplateTestSetup(t, ctx, ds, "")
	challengeVal := "challenge-val"

	host := test.NewHost(t, ds, "android-host", "10.0.0.1", "key1", "uuid1", time.Now(),
		test.WithPlatform("android"), test.WithTeamID(setup.team.ID))

	// Insert a host certificate template in "delivered" state (simulating initial delivery)
	err := ds.BulkInsertHostCertificateTemplates(ctx, []fleet.HostCertificateTemplate{{
		HostUUID:              host.UUID,
		CertificateTemplateID: setup.template.ID,
		Status:                fleet.CertificateTemplateDelivered,
		FleetChallenge:        &challengeVal,
		OperationType:         fleet.MDMOperationTypeInstall,
		Name:                  setup.template.Name,
	}})
	require.NoError(t, err)

	// Insert a challenge record to verify it gets deleted on retry
	ExecAdhocSQL(t, ds, func(q sqlx.ExtContext) error {
		_, err := q.ExecContext(ctx, `INSERT INTO challenges (challenge) VALUES (?)`, challengeVal)
		return err
	})

	// Verify initial state
	record, err := ds.GetHostCertificateTemplateRecord(ctx, host.UUID, setup.template.ID)
	require.NoError(t, err)
	require.Equal(t, fleet.CertificateTemplateDelivered, record.Status)
	require.Equal(t, uint(0), record.RetryCount)

	// First retry: verify status, retry_count, detail, and cleared fields
	err = ds.RetryHostCertificateTemplate(ctx, host.UUID, setup.template.ID, "SCEP enrollment failed")
	require.NoError(t, err)

	record, err = ds.GetHostCertificateTemplateRecord(ctx, host.UUID, setup.template.ID)
	require.NoError(t, err)
	require.Equal(t, fleet.CertificateTemplatePending, record.Status)
	require.Equal(t, uint(1), record.RetryCount)
	require.NotNil(t, record.Detail)
	require.Equal(t, "SCEP enrollment failed", *record.Detail)
	require.Nil(t, record.NotValidBefore)
	require.Nil(t, record.NotValidAfter)
	require.Nil(t, record.Serial)

	// Verify challenge was deleted
	var challengeCount int
	ExecAdhocSQL(t, ds, func(q sqlx.ExtContext) error {
		return sqlx.GetContext(ctx, q, &challengeCount,
			`SELECT COUNT(*) FROM challenges WHERE challenge = ?`, challengeVal)
	})
	require.Equal(t, 0, challengeCount)

	// Subsequent retries: verify retry_count increments and detail is updated each time
	retryDetails := []string{"SCEP server unavailable", "CA unreachable"}
	for i, detail := range retryDetails {
		err = ds.RetryHostCertificateTemplate(ctx, host.UUID, setup.template.ID, detail)
		require.NoError(t, err)

		record, err = ds.GetHostCertificateTemplateRecord(ctx, host.UUID, setup.template.ID)
		require.NoError(t, err)
		require.Equal(t, fleet.CertificateTemplatePending, record.Status)
		// The first retry above already brought retry_count to 1, so iteration i expects i+2.
		require.Equal(t, uint(i+2), record.RetryCount)
		// Guard the dereference so a missing detail fails the test instead of panicking.
		require.NotNil(t, record.Detail)
		require.Equal(t, detail, *record.Detail)
	}
}

View file

@ -0,0 +1,22 @@
package tables
import (
"database/sql"
"fmt"
)
// init registers this migration's Up and Down functions with MigrationClient.
func init() {
MigrationClient.AddMigration(Up_20260331000000, Down_20260331000000)
}
// Up_20260331000000 adds the retry_count column (INT UNSIGNED, NOT NULL, default 0) to the
// host_certificate_templates table. Any error from the ALTER TABLE statement is returned wrapped
// with a description of the step that failed.
func Up_20260331000000(tx *sql.Tx) error {
	const addRetryCountColumn = `ALTER TABLE host_certificate_templates ADD COLUMN retry_count INT UNSIGNED NOT NULL DEFAULT 0`

	if _, execErr := tx.Exec(addRetryCountColumn); execErr != nil {
		return fmt.Errorf("adding retry_count to host_certificate_templates: %w", execErr)
	}
	return nil
}
// Down_20260331000000 is a no-op: it returns nil and does not remove the retry_count column.
func Down_20260331000000(tx *sql.Tx) error {
return nil
}

File diff suppressed because one or more lines are too long

View file

@ -2770,6 +2770,9 @@ type Datastore interface {
// GetHostCertificateTemplateRecord returns the host_certificate_templates record directly without
// requiring the parent certificate_template to exist. Used for status updates on orphaned records.
GetHostCertificateTemplateRecord(ctx context.Context, hostUUID string, certificateTemplateID uint) (*HostCertificateTemplate, error)
// RetryHostCertificateTemplate resets a failed certificate to pending for automatic retry, increments
// retry_count, preserves the error detail, and clears challenge/cert fields. The record is identified
// by hostUUID and certificateTemplateID; detail is the failure message to store on the record.
RetryHostCertificateTemplate(ctx context.Context, hostUUID string, certificateTemplateID uint, detail string) error
// BulkInsertHostCertificateTemplates inserts multiple host_certificate_templates records.
BulkInsertHostCertificateTemplates(ctx context.Context, hostCertTemplates []HostCertificateTemplate) error
// DeleteHostCertificateTemplates deletes specific host_certificate_templates records

View file

@ -5,6 +5,11 @@ import "time"
// AndroidCertificateTemplateProfileID is used by the front-end to determine the display logic.
const AndroidCertificateTemplateProfileID = "fleet-host-certificate-template"
// MaxCertificateInstallRetries is the maximum number of automatic retries after the initial attempt
// when the Android agent reports a certificate install failure. A failed install is retried
// automatically only while the record's retry_count is below this value. Manual resend via the UI
// sets retry_count to this value so the resend gets exactly one attempt with no automatic retry.
const MaxCertificateInstallRetries uint = 3
type HostCertificateTemplate struct {
ID uint `db:"id"`
Name string `db:"name"`
@ -20,6 +25,7 @@ type HostCertificateTemplate struct {
NotValidBefore *time.Time `db:"not_valid_before"`
NotValidAfter *time.Time `db:"not_valid_after"`
Serial *string `db:"serial"` // for future use
RetryCount uint `db:"retry_count"`
}
// ToHostMDMProfile maps a HostCertificateTemplate to a HostMDMProfile, suitable for use in the MDM API

View file

@ -1805,6 +1805,8 @@ type GetCertificateTemplateForHostFunc func(ctx context.Context, hostUUID string
type GetHostCertificateTemplateRecordFunc func(ctx context.Context, hostUUID string, certificateTemplateID uint) (*fleet.HostCertificateTemplate, error)
// RetryHostCertificateTemplateFunc is the signature of the function the mock DataStore delegates
// to when RetryHostCertificateTemplate is called.
type RetryHostCertificateTemplateFunc func(ctx context.Context, hostUUID string, certificateTemplateID uint, detail string) error
type BulkInsertHostCertificateTemplatesFunc func(ctx context.Context, hostCertTemplates []fleet.HostCertificateTemplate) error
type DeleteHostCertificateTemplatesFunc func(ctx context.Context, hostCertTemplates []fleet.HostCertificateTemplate) error
@ -4519,6 +4521,9 @@ type DataStore struct {
GetHostCertificateTemplateRecordFunc GetHostCertificateTemplateRecordFunc
GetHostCertificateTemplateRecordFuncInvoked bool
RetryHostCertificateTemplateFunc RetryHostCertificateTemplateFunc
RetryHostCertificateTemplateFuncInvoked bool
BulkInsertHostCertificateTemplatesFunc BulkInsertHostCertificateTemplatesFunc
BulkInsertHostCertificateTemplatesFuncInvoked bool
@ -10819,6 +10824,13 @@ func (s *DataStore) GetHostCertificateTemplateRecord(ctx context.Context, hostUU
return s.GetHostCertificateTemplateRecordFunc(ctx, hostUUID, certificateTemplateID)
}
// RetryHostCertificateTemplate marks the mock method as invoked (while holding s.mu) and then
// delegates to RetryHostCertificateTemplateFunc, returning its result. The lock is released
// before the delegate is called.
func (s *DataStore) RetryHostCertificateTemplate(ctx context.Context, hostUUID string, certificateTemplateID uint, detail string) error {
	markInvoked := func() {
		s.mu.Lock()
		defer s.mu.Unlock()
		s.RetryHostCertificateTemplateFuncInvoked = true
	}
	markInvoked()

	return s.RetryHostCertificateTemplateFunc(ctx, hostUUID, certificateTemplateID, detail)
}
func (s *DataStore) BulkInsertHostCertificateTemplates(ctx context.Context, hostCertTemplates []fleet.HostCertificateTemplate) error {
s.mu.Lock()
s.BulkInsertHostCertificateTemplatesFuncInvoked = true

View file

@ -695,11 +695,9 @@ func (svc *Service) UpdateCertificateStatus(ctx context.Context, update *fleet.C
// Fill in HostUUID from context
update.HostUUID = host.UUID
if err := svc.ds.UpsertCertificateStatus(ctx, update); err != nil {
return err
}
// Log activity for terminal install statuses only (not removals).
// Log activity for install statuses (not removals). Failures are logged on every attempt
// (including retries) so IT admins have visibility into retry attempts.
if update.OperationType == fleet.MDMOperationTypeInstall {
var actStatus fleet.CertificateActivityStatus
switch update.Status {
@ -721,7 +719,7 @@ func (svc *Service) UpdateCertificateStatus(ctx context.Context, update *fleet.C
Status: string(actStatus),
Detail: detail,
}); err != nil {
// Log and continue since we don't want the client to retry.
// Log and continue since we don't want the client to fail on this.
svc.logger.ErrorContext(ctx, "failed to create certificate install activity", "host.id", host.ID, "activity.status", actStatus,
"err", err)
ctxerr.Handle(ctx, err)
@ -729,7 +727,21 @@ func (svc *Service) UpdateCertificateStatus(ctx context.Context, update *fleet.C
}
}
return nil
// For failed installs, automatically retry if under the retry limit.
if update.OperationType == fleet.MDMOperationTypeInstall && update.Status == fleet.MDMDeliveryFailed {
if record.RetryCount < fleet.MaxCertificateInstallRetries {
detail := ""
if update.Detail != nil {
detail = *update.Detail
}
if err := svc.ds.RetryHostCertificateTemplate(ctx, host.UUID, update.CertificateTemplateID, detail); err != nil {
return ctxerr.Wrap(ctx, err, "retrying certificate install")
}
return nil
}
}
return svc.ds.UpsertCertificateStatus(ctx, update)
}
////////////////////////////////////////////////////////////////////////////////

View file

@ -1568,4 +1568,105 @@ func (s *integrationMDMTestSuite) TestCertificateTemplateResend() {
// Resend for a non-existent template returns 404
s.DoJSON("POST", fmt.Sprintf("/api/latest/fleet/hosts/%d/certificates/%d/resend", host.ID, 99999),
nil, http.StatusNotFound, &struct{}{})
// ---- Automatic retry tests (reusing same host/cert/CA setup) ----
t.Run("automatic retry", func(t *testing.T) {
// Reset the certificate to pending with retry_count=0 for a fresh retry test
mysql.ExecAdhocSQL(t, s.ds, func(q sqlx.ExtContext) error {
_, err := q.ExecContext(ctx,
`UPDATE host_certificate_templates SET status = ?, retry_count = 0 WHERE host_uuid = ? AND certificate_template_id = ?`,
fleet.CertificateTemplatePending, host.UUID, certTemplateID)
return err
})
// Helper: report a certificate status from the device
reportCertStatus := func(status string, detail *string) {
req, err := json.Marshal(updateCertificateStatusRequest{Status: status, Detail: detail})
require.NoError(t, err)
resp := s.DoRawWithHeaders("PUT", fmt.Sprintf("/api/fleetd/certificates/%d/status", certTemplateID), req, http.StatusOK, map[string]string{
"Authorization": fmt.Sprintf("Node key %s", orbitNodeKey),
})
_ = resp.Body.Close()
}
// Helper: deliver (pending -> delivered) via cron and verify
deliverCert := func() {
s.awaitTriggerAndroidProfileSchedule(t)
s.verifyCertificateStatus(t, host, orbitNodeKey, certTemplateID, certTemplateName, caID,
fleet.CertificateTemplateDelivered, "")
}
// Fail MaxCertificateInstallRetries times -- each should auto-retry (status resets to pending)
for i := range fleet.MaxCertificateInstallRetries {
deliverCert()
detail := fmt.Sprintf("SCEP failure %d", i+1)
reportCertStatus(string(fleet.MDMDeliveryFailed), &detail)
record, err := s.ds.GetHostCertificateTemplateRecord(ctx, host.UUID, certTemplateID)
require.NoError(t, err)
require.Equal(t, fleet.CertificateTemplatePending, record.Status, "retry %d should auto-retry", i+1)
require.Equal(t, i+1, record.RetryCount)
}
// One more failure with retry_count at max -- should be terminal
deliverCert()
terminalDetail := "final failure"
reportCertStatus(string(fleet.MDMDeliveryFailed), &terminalDetail)
record, err := s.ds.GetHostCertificateTemplateRecord(ctx, host.UUID, certTemplateID)
require.NoError(t, err)
require.Equal(t, fleet.CertificateTemplateFailed, record.Status, "should be terminal after max retries")
// Verify terminal failure activity was logged on the host with correct details
var hostActivitiesResp listActivitiesResponse
s.DoJSON("GET", fmt.Sprintf("/api/latest/fleet/hosts/%d/activities", host.ID), nil, http.StatusOK,
&hostActivitiesResp, "per_page", "10")
foundTerminalFailActivity := false
for _, act := range hostActivitiesResp.Activities {
if act.Type == (fleet.ActivityTypeInstalledCertificate{}).ActivityName() && act.Details != nil {
var details map[string]any
err = json.Unmarshal(*act.Details, &details)
require.NoError(t, err)
if details["status"] == "failed_install" && details["detail"] == terminalDetail {
foundTerminalFailActivity = true
break
}
}
}
require.True(t, foundTerminalFailActivity, "expected installed_certificate activity with status=failed_install and terminal detail")
// Resend after terminal failure -- gets exactly one attempt (retry_count set to max)
s.DoJSON("POST", fmt.Sprintf("/api/latest/fleet/hosts/%d/certificates/%d/resend", host.ID, certTemplateID),
nil, http.StatusOK, &struct{}{})
record, err = s.ds.GetHostCertificateTemplateRecord(ctx, host.UUID, certTemplateID)
require.NoError(t, err)
require.Equal(t, fleet.MaxCertificateInstallRetries, record.RetryCount, "resend should set retry_count to max")
// Deliver and fail once more -- terminal immediately (no auto-retry after resend)
deliverCert()
reportCertStatus(string(fleet.MDMDeliveryFailed), ptr.String("post-resend failure"))
record, err = s.ds.GetHostCertificateTemplateRecord(ctx, host.UUID, certTemplateID)
require.NoError(t, err)
require.Equal(t, fleet.CertificateTemplateFailed, record.Status, "should be terminal after resend failure")
// Success on retry: reset to fresh, fail once, then succeed
mysql.ExecAdhocSQL(t, s.ds, func(q sqlx.ExtContext) error {
_, err := q.ExecContext(ctx,
`UPDATE host_certificate_templates SET status = ?, retry_count = 0 WHERE host_uuid = ? AND certificate_template_id = ?`,
fleet.CertificateTemplatePending, host.UUID, certTemplateID)
return err
})
deliverCert()
reportCertStatus(string(fleet.MDMDeliveryFailed), ptr.String("transient error"))
record, err = s.ds.GetHostCertificateTemplateRecord(ctx, host.UUID, certTemplateID)
require.NoError(t, err)
require.Equal(t, fleet.CertificateTemplatePending, record.Status)
deliverCert()
reportCertStatus(string(fleet.MDMDeliveryVerified), nil)
record, err = s.ds.GetHostCertificateTemplateRecord(ctx, host.UUID, certTemplateID)
require.NoError(t, err)
require.Equal(t, fleet.CertificateTemplateVerified, record.Status, "should succeed on retry")
}) // end "automatic retry" subtest
}