Enhance health check and reload functionality with temporary file indicators and improved timeout handling

This commit is contained in:
Théophile Diot 2025-01-08 11:58:05 +01:00
parent dfeefc5442
commit b44492c685
No known key found for this signature in database
GPG key ID: FA995104A0BA376A
4 changed files with 45 additions and 20 deletions

View file

@ -94,6 +94,13 @@ api.global.GET["^/ping$"] = function(self)
end
api.global.GET["^/health$"] = function(self)
-- Check if reload indicator file exists
local f = open("/var/tmp/bunkerweb_reloading", "r")
if f then
f:close()
return self:response(HTTP_OK, "success", "ok")
end
local data, err = get_variable("IS_LOADING", false)
if not data then
logger:log(ERR, "can't get IS_LOADING variable : " .. err)
@ -128,6 +135,15 @@ api.global.POST["^/reload"] = function(self)
return self:response(HTTP_INTERNAL_SERVER_ERROR, "error", "err = " .. err)
end
-- Create temporary file to indicate reconfiguration
local file, err = open("/var/tmp/bunkerweb_reloading", "w")
if file then
file:write(tostring(os.time()))
file:close()
else
logger:log(ERR, "Failed to create reload indicator file: " .. err)
end
return self:response(HTTP_OK, "success", "reload successful")
end
@ -232,7 +248,7 @@ api.global.POST["^/ban$"] = function(self)
if not data then
local data_file = get_body_file()
if data_file then
local file, err = io.open(data_file)
local file, err = open(data_file)
if not file then
return self:response(HTTP_INTERNAL_SERVER_ERROR, "error", err)
end

View file

@ -16,6 +16,7 @@ init_worker_by_lua_block {
local call_plugin = helpers.call_plugin
local get_variable = utils.get_variable
local tostring = tostring
local remove = os.remove
local time = os.time
local randomseed = math.randomseed
@ -267,6 +268,12 @@ init_worker_by_lua_block {
logger:log(ERR, "lock:unlock() failed : " .. err)
end
logger:log(INFO, "init phase ended")
local res, err = remove("/var/tmp/bunkerweb_reloading")
if not res and err ~= "No such file or directory" then
logger:log(WARN, "unable to remove /var/tmp/bunkerweb_reloading file: " .. err)
end
logger:log(NOTICE, "BunkerWeb is ready to fool hackers ! 🚀")
end

View file

@ -145,7 +145,7 @@ class JobScheduler(ApiCaller):
reload_success = self.send_to_apis(
"POST",
f"/reload?test={'no' if self.__env.get('DISABLE_CONFIGURATION_TESTING', 'no').lower() == 'yes' else 'yes'}",
timeout=max(int(reload_min_timeout), 2 * len(self.__env["SERVER_NAME"].split(" "))),
timeout=max(int(reload_min_timeout), 3 * len(self.__env["SERVER_NAME"].split(" "))),
)[0]
if reload_success:
self.__logger.info("Successfully reloaded nginx")
@ -262,7 +262,7 @@ class JobScheduler(ApiCaller):
old_env = environ.copy()
environ.clear()
environ.update(old_env | self.__env)
environ.update(old_env | self.__env | {"LOG_LEVEL": getenv("CUSTOM_LOG_LEVEL", self.__env.get("LOG_LEVEL", "notice"))}),
# Use ThreadPoolExecutor to run jobs
futures = [self.__executor.submit(job.run) for job in pending_jobs]
@ -318,7 +318,7 @@ class JobScheduler(ApiCaller):
old_env = environ.copy()
environ.clear()
environ.update(old_env | self.__env)
environ.update(old_env | self.__env | {"LOG_LEVEL": getenv("CUSTOM_LOG_LEVEL", self.__env.get("LOG_LEVEL", "notice"))})
futures = []
for plugin, jobs in self.__jobs.items():
@ -383,7 +383,7 @@ class JobScheduler(ApiCaller):
old_env = environ.copy()
environ.clear()
environ.update(old_env | self.__env)
environ.update(old_env | self.__env | {"LOG_LEVEL": getenv("CUSTOM_LOG_LEVEL", self.__env.get("LOG_LEVEL", "notice"))})
self.__job_wrapper(
job_to_run["path"],

View file

@ -23,7 +23,6 @@ for deps_path in [join(sep, "usr", "share", "bunkerweb", *paths) for paths in ((
if deps_path not in sys_path:
sys_path.append(deps_path)
from dotenv import dotenv_values
from schedule import every as schedule_every, run_pending
from common_utils import bytes_hash, dict_to_frozenset # type: ignore
@ -388,8 +387,8 @@ def healthcheck_job():
HEALTHCHECK_EVENT.set()
while APPLYING_CHANGES.is_set():
sleep(1)
if APPLYING_CHANGES.is_set():
return
env = SCHEDULER.db.get_config()
@ -404,6 +403,8 @@ def healthcheck_job():
status = 500
resp = {"status": "down", "msg": err}
HEALTHCHECK_LOGGER.debug(resp)
success = True
if not sent:
HEALTHCHECK_LOGGER.warning(
@ -443,7 +444,7 @@ def healthcheck_job():
if not api_caller.send_to_apis(
"POST",
f"/reload?test={'no' if DISABLE_CONFIGURATION_TESTING else 'yes'}",
timeout=max(RELOAD_MIN_TIMEOUT, 2 * len(env["SERVER_NAME"].split(" "))),
timeout=max(RELOAD_MIN_TIMEOUT, 3 * len(env["SERVER_NAME"].split(" "))),
)[0]:
HEALTHCHECK_LOGGER.error(f"Error while reloading instance {bw_instance.endpoint}")
ret = SCHEDULER.db.update_instance(db_instance["hostname"], "loading")
@ -495,7 +496,11 @@ if __name__ == "__main__":
tmp_variables_path = Path(args.variables or join(sep, "var", "tmp", "bunkerweb", "variables.env"))
nginx_variables_path = CONFIG_PATH.joinpath("variables.env")
dotenv_env = dotenv_values(tmp_variables_path.as_posix())
dotenv_env = {}
if tmp_variables_path.is_file():
with tmp_variables_path.open() as f:
dotenv_env = dict(line.strip().split("=", 1) for line in f if line.strip() and not line.startswith("#"))
SCHEDULER = JobScheduler(environ, LOGGER, db=Database(LOGGER, sqlalchemy_string=dotenv_env.get("DATABASE_URI", getenv("DATABASE_URI", None)))) # type: ignore
@ -540,7 +545,7 @@ if __name__ == "__main__":
env["TZ"] = tz
# Instantiate scheduler environment
SCHEDULER.env = env | {"LOG_LEVEL": getenv("CUSTOM_LOG_LEVEL", env.get("LOG_LEVEL", "notice")), "RELOAD_MIN_TIMEOUT": str(RELOAD_MIN_TIMEOUT)}
SCHEDULER.env = env | {"RELOAD_MIN_TIMEOUT": str(RELOAD_MIN_TIMEOUT)}
threads = []
@ -740,12 +745,7 @@ if __name__ == "__main__":
if RUN_JOBS_ONCE:
# Only run jobs once
if not SCHEDULER.reload(
env
| {
"TZ": getenv("TZ", "UTC"),
"LOG_LEVEL": getenv("CUSTOM_LOG_LEVEL", env.get("LOG_LEVEL", "notice")),
"RELOAD_MIN_TIMEOUT": str(RELOAD_MIN_TIMEOUT),
},
env | {"TZ": getenv("TZ", "UTC"), "RELOAD_MIN_TIMEOUT": str(RELOAD_MIN_TIMEOUT)},
changed_plugins=changed_plugins,
):
LOGGER.error("At least one job in run_once() failed")
@ -802,12 +802,14 @@ if __name__ == "__main__":
success, responses = SCHEDULER.send_to_apis(
"POST",
f"/reload?test={'no' if DISABLE_CONFIGURATION_TESTING else 'yes'}",
timeout=max(RELOAD_MIN_TIMEOUT, 2 * len(env["SERVER_NAME"].split(" "))),
timeout=max(RELOAD_MIN_TIMEOUT, 3 * len(env["SERVER_NAME"].split(" "))),
response=True,
)
if not success:
reachable = False
LOGGER.debug(f"Error while reloading all bunkerweb instances: {responses}")
LOGGER.debug("Error while reloading all bunkerweb instances")
LOGGER.debug(responses)
for db_instance in SCHEDULER.db.get_instances():
status = responses.get(db_instance["hostname"], {"status": "down"}).get("status", "down")
@ -881,7 +883,7 @@ if __name__ == "__main__":
if not SCHEDULER.send_to_apis(
"POST",
f"/reload?test={'no' if DISABLE_CONFIGURATION_TESTING else 'yes'}",
timeout=max(RELOAD_MIN_TIMEOUT, 2 * len(env["SERVER_NAME"].split(" "))),
timeout=max(RELOAD_MIN_TIMEOUT, 3 * len(env["SERVER_NAME"].split(" "))),
)[0]:
LOGGER.error("Error while reloading bunkerweb with failover configuration, skipping ...")
elif not reachable: