Merge pull request #3587 from bunkerity/dev

fix(letsencrypt): add purge for stale ACME accounts on JWS rejection
This commit is contained in:
Théophile Diot 2026-05-23 00:08:51 +02:00 committed by GitHub
commit 58f7b794bc
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 35 additions and 2 deletions

View file

@ -6,7 +6,7 @@
- [SECURITY] `antibot`: Cap.js `script-src` now uses a strict per-request nonce (no more `'unsafe-inline'`); every challenge response also sends `Cache-Control: no-store`. Requires Cap.js widget `0.1.48`+.
- [SECURITY] `letsencrypt` (UI): harden delete + new heal flow — per-request scratch dir, `fcntl.flock`, `.`/`..` rejected in `cert_name`, DOMPurify + `markupsafe.escape` at every HTML sink, 500 on persistence failure; new `/letsencrypt/{orphans,accounts,cache-status,heal}` endpoints, per-row Heal button, sidebar orphan toast.
- [SECURITY] `linux`: `after-remove` hooks now preserve `/var/log/bunkerweb`, `/etc/bunkerweb`, `/var/lib/bunkerweb` and `/var/tmp` upgrade backups on plain uninstall (only purge wipes configs + DB; logs and backups always kept, disposal commands printed); upgrade backups are written via `install -m 0600 -o root -g root` (atomic) and any pre-existing world-readable backups are retro-tightened, closing a local-read window on admin credentials and the SQLite DB.
- [BUGFIX] `letsencrypt` (core): fix self-propagating cache poisoning that caused fleet-wide `certbot AccountNotFound`; add CA-agnostic consistency gate (LE + ZeroSSL paths), server-scoped `select_account_id`, redacted-value `Configurator` WARN logs.
- [BUGFIX] `letsencrypt` (core): fix self-propagating cache poisoning that caused fleet-wide `certbot AccountNotFound`; add CA-agnostic consistency gate (LE + ZeroSSL paths), server-scoped `select_account_id`, auto-purge + re-register when the ACME server reports a pinned `--account` as deleted (stale-account JWS recovery), redacted-value `Configurator` WARN logs.
- [FEATURE] `scheduler`: new `SCHEDULER_MAX_WORKERS` env var caps the job-executor thread pool to bound DB-pool pressure on shared MariaDB/MySQL/PostgreSQL; auto default tightened from `min(8, cpu*4)` to `min(8, max(2, cpu*2))` and a warning is emitted when the resolved value exceeds `DATABASE_POOL_SIZE` + `DATABASE_POOL_MAX_OVERFLOW`.
- [FEATURE] `ui`: `ADMIN_PASSWORD` now also accepts a pre-hashed bcrypt value (`$2a$`/`$2b$`/`$2y$`), stored as-is so the plaintext never lands in env files or secrets (env create + `OVERRIDE_ADMIN_CREDS` paths only; wizard and profile still take plaintext). The strength policy is skipped for a hash; a cost factor below 12 logs a warning.

View file

@ -10,6 +10,7 @@ from os.path import join
from pathlib import Path
from re import MULTILINE, match, search
from select import select
from shutil import rmtree
from subprocess import DEVNULL, PIPE, STDOUT, Popen, run
from sys import exit as sys_exit, path as sys_path
from time import monotonic, sleep
@ -681,6 +682,24 @@ def certbot_delete(service: str, cmd_env: Dict[str, str] = None) -> int:
return process.returncode
def _purge_stale_account(accounts_root: Path, account_id: str) -> None:
    """Remove the on-disk ACME account dir whose server-side record was pruned.

    Searches accounts_root recursively for every `<account_id>/regr.json`
    (CA-agnostic: LE 2-level, ZeroSSL 3-level) and rmtree's the directory
    holding it. Best-effort — failures are logged, not raised, so the retry
    still proceeds.

    Args:
        accounts_root: Directory searched recursively for account dirs; when it
            is not an existing directory the call is a no-op.
        account_id: Name of the account directory to remove; an empty value
            makes the call a no-op.
    """
    if not account_id or not accounts_root.is_dir():
        return

    # Snapshot the matches before deleting anything: removing directories while
    # rglob() is still walking the same tree can abort or skip part of the walk.
    stale_dirs = []
    try:
        for regr in accounts_root.rglob("regr.json"):
            if regr.parent.name == account_id:
                stale_dirs.append(regr.parent)
    except OSError as e:
        # Keep going with whatever was found before the walk failed.
        LOGGER.error(f"Failed to purge stale account {account_id}: {e}")

    for stale_dir in stale_dirs:
        LOGGER.warning(f"Purging stale ACME account {account_id} (server reports it no longer exists) so the next attempt re-registers.")
        rmtree(stale_dir, ignore_errors=True)
        # ignore_errors=True hides every deletion failure, so confirm the
        # directory is actually gone and log when it is not.
        try:
            still_present = stale_dir.exists()
        except OSError:
            still_present = True
        if still_present:
            LOGGER.error(f"Failed to purge stale account {account_id}: {stale_dir} could not be fully removed")
def certbot_new(
service: str,
config: Dict[str, Union[str, bool, int, Dict[str, str]]],
@ -730,6 +749,7 @@ def certbot_new(
# certbot certonly invocation (and vice versa) — certbot then constructs the
# account directory from its own --server path and fails with AccountNotFound.
acme_server_url = str(config.get("acme_server_url") or "")
account_id = ""
if config.get("acme_server") == "letsencrypt":
account_id = select_account_id(
@ -818,6 +838,13 @@ def certbot_new(
process = Popen(command, stdin=DEVNULL, stderr=PIPE, universal_newlines=True, env=cmd_env)
# Watch certbot output for a stale-account JWS rejection. When the ACME server
# has pruned the account we hold on disk (common on LE staging), it answers
# `Unable to validate JWS :: Account "<url>" not found`. certbot does NOT
# re-register when `--account` is pinned, so every retry would reuse the dead
# account and fail identically. Detect it, then drop the stale account dir so
# the next attempt (select_account_id → None) registers a fresh account.
stale_account_detected = False
deadline = monotonic() + CERTBOT_TIMEOUT
while process.poll() is None:
if monotonic() > deadline:
@ -829,9 +856,15 @@ def certbot_new(
rlist, _, _ = select([process.stderr], [], [], 2)
if rlist:
for line in process.stderr:
LOGGER_CERTBOT.info(line.strip())
stripped = line.strip()
LOGGER_CERTBOT.info(stripped)
if "Account" in stripped and "not found" in stripped and ("validate JWS" in stripped or "acme/acct" in stripped):
stale_account_detected = True
break
if stale_account_detected and account_id:
_purge_stale_account(paths.config_dir.joinpath("accounts"), account_id)
return process.returncode