chore: Start adding BunkerWeb instances healthcheck logic in the scheduler

This commit adds the "HEALTHCHECK_INTERVAL" configuration option to the scheduler in the "main.py" file. The "HEALTHCHECK_INTERVAL" option allows specifying the interval (in seconds) at which the health check job should run. If the value is not a valid integer, it defaults to 10 seconds.
This commit is contained in:
Théophile Diot 2024-05-23 19:01:12 +01:00
parent d84629b7cb
commit 171229fe3a
No known key found for this signature in database
GPG key ID: 248FEA4BAE400D06
2 changed files with 80 additions and 5 deletions

View file

@ -194,6 +194,7 @@ class Configurator:
"SLAVE_MODE",
"MASTER_MODE",
"CUSTOM_LOG_LEVEL",
"HEALTHCHECK_INTERVAL",
"GPG_KEY",
"HOME",
"HOSTNAME",

View file

@ -15,7 +15,7 @@ from stat import S_IEXEC
from subprocess import run as subprocess_run, DEVNULL, STDOUT, PIPE
from sys import path as sys_path
from tarfile import open as tar_open
from threading import Thread
from threading import Event, Thread
from time import sleep
from traceback import format_exc
from typing import Any, Dict, List, Literal, Optional, Union
@ -25,12 +25,14 @@ for deps_path in [join(sep, "usr", "share", "bunkerweb", *paths) for paths in ((
sys_path.append(deps_path)
from dotenv import dotenv_values
from schedule import every as schedule_every, run_pending
from common_utils import bytes_hash, dict_to_frozenset, get_integration # type: ignore
from logger import setup_logger # type: ignore
from Database import Database # type: ignore
from JobScheduler import JobScheduler
from API import API # type: ignore
from ApiCaller import ApiCaller # type: ignore
RUN = True
SCHEDULER: Optional[JobScheduler] = None
@ -71,8 +73,17 @@ SCHEDULER_TMP_ENV_PATH.touch()
DB_LOCK_FILE = Path(sep, "var", "lib", "bunkerweb", "db.lock")
LOGGER = setup_logger("Scheduler", getenv("CUSTOM_LOG_LEVEL", getenv("LOG_LEVEL", "INFO")))
SLAVE_MODE = environ.get("SLAVE_MODE", "no") == "yes"
MASTER_MODE = environ.get("MASTER_MODE", "no") == "yes"
HEALTHCHECK_INTERVAL = getenv("HEALTHCHECK_INTERVAL", "10")
if not HEALTHCHECK_INTERVAL.isdigit():
LOGGER.error("HEALTHCHECK_INTERVAL must be an integer, defaulting to 10")
HEALTHCHECK_INTERVAL = 10
HEALTHCHECK_INTERVAL = int(HEALTHCHECK_INTERVAL)
HEALTHCHECK_EVENT = Event()
SLAVE_MODE = getenv("SLAVE_MODE", "no") == "yes"
MASTER_MODE = getenv("MASTER_MODE", "no") == "yes"
def handle_stop(signum, frame):
@ -341,7 +352,67 @@ def run_in_slave_mode(): # TODO: Refactor this feature
sleep(5)
# TODO: add a health check for the scheduler's BunkerWeb instances
def healthcheck_job():
return # TODO: remove this when the healthcheck endpoint is ready @fl0ppy-d1sk
if HEALTHCHECK_EVENT.is_set():
LOGGER.warning("Healthcheck job is already running, skipping execution ...")
return
try:
assert SCHEDULER is not None
except AssertionError:
return
HEALTHCHECK_EVENT.set()
for db_instance in SCHEDULER.db.get_instances():
try:
bw_instance = API(f"http://{db_instance['hostname']}:{db_instance['port']}", db_instance["server_name"])
sent, err, status, resp = bw_instance.request("GET", "health")
if not sent:
LOGGER.warning(
f"instances_healthcheck - Can't send API request to {bw_instance.endpoint}health : {err}, healthcheck will be retried in {HEALTHCHECK_INTERVAL} seconds ..."
)
if bw_instance in SCHEDULER.apis:
SCHEDULER.apis.remove(bw_instance)
continue
if status != 200:
LOGGER.warning(
f"instances_healthcheck - Error while sending API request to {bw_instance.endpoint}health : status = {resp['status']}, msg = {resp['msg']}, healthcheck will be retried in {HEALTHCHECK_INTERVAL} seconds ..."
)
if bw_instance in SCHEDULER.apis:
SCHEDULER.apis.remove(bw_instance)
continue
if resp["state"] == "loading":
LOGGER.info(f"instances_healthcheck - Instance {bw_instance.endpoint} is loading, sending config ...")
api_caller = ApiCaller([bw_instance])
api_caller.send_files(CUSTOM_CONFIGS_PATH, "/custom_configs")
api_caller.send_files(EXTERNAL_PLUGINS_PATH, "/plugins")
api_caller.send_files(PRO_PLUGINS_PATH, "/pro_plugins")
api_caller.send_files(join(sep, "etc", "nginx"), "/confs")
api_caller.send_files(CACHE_PATH, "/cache")
if api_caller.send_to_apis("POST", "/reload"):
LOGGER.info(f"Successfully reloaded instance {bw_instance.endpoint}")
else:
LOGGER.error(f"Error while reloading instance {bw_instance.endpoint}")
elif resp["state"] == "down":
LOGGER.warning(f"instances_healthcheck - Instance {bw_instance.endpoint} is down")
if bw_instance in SCHEDULER.apis:
SCHEDULER.apis.remove(bw_instance)
continue
if bw_instance not in SCHEDULER.apis:
SCHEDULER.apis.append(bw_instance)
except BaseException:
LOGGER.exception(f"instances_healthcheck - Exception while checking instance {bw_instance.endpoint}")
if bw_instance in SCHEDULER.apis:
SCHEDULER.apis.remove(bw_instance)
HEALTHCHECK_EVENT.clear()
if __name__ == "__main__":
try:
@ -372,6 +443,8 @@ if __name__ == "__main__":
run_in_slave_mode()
stop(1)
schedule_every(HEALTHCHECK_INTERVAL).seconds.do(healthcheck_job)
db_metadata = SCHEDULER.db.get_metadata()
if (
@ -685,8 +758,9 @@ if __name__ == "__main__":
errors = 0
while RUN and not NEED_RELOAD:
try:
SCHEDULER.run_pending()
sleep(1)
run_pending()
SCHEDULER.run_pending()
current_time = datetime.now()
while DB_LOCK_FILE.is_file() and DB_LOCK_FILE.stat().st_ctime + 30 > current_time.timestamp():