From f3d6f860e08c9ee7b4e761cfcfb6726636e7f42e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9ophile=20Diot?= Date: Fri, 22 Sep 2023 18:38:05 +0100 Subject: [PATCH] Remove old cached files if urls are empty --- .../core/blacklist/jobs/blacklist-download.py | 45 ++++++++++--------- .../core/greylist/jobs/greylist-download.py | 21 +++++---- .../core/realip/jobs/realip-download.py | 17 ++++--- .../core/whitelist/jobs/whitelist-download.py | 21 +++++---- 4 files changed, 59 insertions(+), 45 deletions(-) diff --git a/src/common/core/blacklist/jobs/blacklist-download.py b/src/common/core/blacklist/jobs/blacklist-download.py index f9740211d..2818982da 100755 --- a/src/common/core/blacklist/jobs/blacklist-download.py +++ b/src/common/core/blacklist/jobs/blacklist-download.py @@ -21,7 +21,7 @@ from requests import get from Database import Database # type: ignore from logger import setup_logger # type: ignore -from jobs import cache_file, cache_hash, is_cached_file, file_hash +from jobs import cache_file, cache_hash, del_file_in_db, is_cached_file, file_hash rdns_rx = re_compile(rb"^[^ ]+$", IGNORECASE) asn_rx = re_compile(rb"^\d+$") @@ -85,8 +85,23 @@ try: tmp_blacklist_path = Path(sep, "var", "tmp", "bunkerweb", "blacklist") tmp_blacklist_path.mkdir(parents=True, exist_ok=True) - # Our urls data - urls = {"IP": [], "RDNS": [], "ASN": [], "USER_AGENT": [], "URI": []} + # Get URLs + urls = { + "IP": [], + "RDNS": [], + "ASN": [], + "USER_AGENT": [], + "URI": [], + "IGNORE_IP": [], + "IGNORE_RDNS": [], + "IGNORE_ASN": [], + "IGNORE_USER_AGENT": [], + "IGNORE_URI": [], + } + for kind in urls: + for url in getenv(f"BLACKLIST_{kind}_URLS", "").split(" "): + if url and url not in urls[kind]: + urls[kind].append(url) # Don't go further if the cache is fresh kinds_fresh = { @@ -113,27 +128,15 @@ try: logger.info( f"Blacklist for {kind} is already in cache, skipping downloads...", ) + + if not urls[kind]: + blacklist_path.joinpath(f"{kind}.list").unlink(missing_ok=True) + deleted, err = del_file_in_db(f"{kind}.list", db) + if not deleted: + logger.warning(f"Coudn't delete {kind}.list from cache : {err}") if all_fresh: _exit(0) - # Get URLs - urls = { - "IP": [], - "RDNS": [], - "ASN": [], - "USER_AGENT": [], - "URI": [], - "IGNORE_IP": [], - "IGNORE_RDNS": [], - "IGNORE_ASN": [], - "IGNORE_USER_AGENT": [], - "IGNORE_URI": [], - } - for kind in urls: - for url in getenv(f"BLACKLIST_{kind}_URLS", "").split(" "): - if url and url not in urls[kind]: - urls[kind].append(url) - # Loop on kinds for kind, urls_list in urls.items(): if kinds_fresh[kind]: diff --git a/src/common/core/greylist/jobs/greylist-download.py b/src/common/core/greylist/jobs/greylist-download.py index 62d51b055..a0181e753 100755 --- a/src/common/core/greylist/jobs/greylist-download.py +++ b/src/common/core/greylist/jobs/greylist-download.py @@ -21,7 +21,7 @@ from requests import get from Database import Database # type: ignore from logger import setup_logger # type: ignore -from jobs import cache_file, cache_hash, is_cached_file, file_hash +from jobs import cache_file, cache_hash, del_file_in_db, is_cached_file, file_hash rdns_rx = re_compile(rb"^[^ ]+$", IGNORECASE) asn_rx = re_compile(rb"^\d+$") @@ -85,8 +85,12 @@ try: tmp_greylist_path = Path(sep, "var", "tmp", "bunkerweb", "greylist") tmp_greylist_path.mkdir(parents=True, exist_ok=True) - # Our urls data + # Get URLs urls = {"IP": [], "RDNS": [], "ASN": [], "USER_AGENT": [], "URI": []} + for kind in urls: + for url in getenv(f"GREYLIST_{kind}_URLS", "").split(" "): + if url and url not in urls[kind]: + urls[kind].append(url) # Don't go further if the cache is fresh kinds_fresh = { @@ -108,16 +112,15 @@ try: logger.info( f"Greylist for {kind} is already in cache, skipping downloads...", ) + + if not urls[kind]: + greylist_path.joinpath(f"{kind}.list").unlink(missing_ok=True) + deleted, err = del_file_in_db(f"{kind}.list", db) + if not deleted: + logger.warning(f"Coudn't delete {kind}.list from cache : {err}") if all_fresh: _exit(0) - # Get URLs - urls = {"IP": [], "RDNS": [], "ASN": [], "USER_AGENT": [], "URI": []} - for kind in urls: - for url in getenv(f"GREYLIST_{kind}_URLS", "").split(" "): - if url and url not in urls[kind]: - urls[kind].append(url) - # Loop on kinds for kind, urls_list in urls.items(): if kinds_fresh[kind]: diff --git a/src/common/core/realip/jobs/realip-download.py b/src/common/core/realip/jobs/realip-download.py index 856082c39..8217cfbeb 100755 --- a/src/common/core/realip/jobs/realip-download.py +++ b/src/common/core/realip/jobs/realip-download.py @@ -23,7 +23,7 @@ from requests import get from Database import Database # type: ignore from logger import setup_logger # type: ignore -from jobs import cache_file, cache_hash, file_hash, is_cached_file +from jobs import cache_file, cache_hash, del_file_in_db, file_hash, is_cached_file def check_line(line): @@ -75,14 +75,19 @@ try: db = Database(logger, sqlalchemy_string=getenv("DATABASE_URI", None), pool=False) - # Don't go further if the cache is fresh - if is_cached_file(realip_path.joinpath("combined.list"), "hour", db): - logger.info("RealIP list is already in cache, skipping download...") - _exit(0) - # Get URLs urls = [url for url in getenv("REAL_IP_FROM_URLS", "").split(" ") if url] + # Don't go further if the cache is fresh + if is_cached_file(realip_path.joinpath("combined.list"), "hour", db): + if not urls: + tmp_realip_path.joinpath("combined.list").unlink(missing_ok=True) + deleted, err = del_file_in_db("combined.list", db) + if not deleted: + logger.warning(f"Coudn't delete combined.list from cache : {err}") + logger.info("RealIP list is already in cache, skipping download...") + _exit(0) + # Download and write data to temp file i = 0 content = b"" diff --git a/src/common/core/whitelist/jobs/whitelist-download.py b/src/common/core/whitelist/jobs/whitelist-download.py index 3c8fd5b26..e30026dad 100755 --- a/src/common/core/whitelist/jobs/whitelist-download.py +++ b/src/common/core/whitelist/jobs/whitelist-download.py @@ -21,7 +21,7 @@ from requests import get from Database import Database # type: ignore from logger import setup_logger # type: ignore -from jobs import cache_file, cache_hash, is_cached_file, file_hash +from jobs import cache_file, cache_hash, del_file_in_db, is_cached_file, file_hash rdns_rx = re_compile(rb"^[^ ]+$", IGNORECASE) asn_rx = re_compile(rb"^\d+$") @@ -85,8 +85,12 @@ try: tmp_whitelist_path = Path(sep, "var", "tmp", "bunkerweb", "whitelist") tmp_whitelist_path.mkdir(parents=True, exist_ok=True) - # Our urls data + # Get URLs urls = {"IP": [], "RDNS": [], "ASN": [], "USER_AGENT": [], "URI": []} + for kind in urls: + for url in getenv(f"WHITELIST_{kind}_URLS", "").split(" "): + if url and url not in urls[kind]: + urls[kind].append(url) # Don't go further if the cache is fresh kinds_fresh = { @@ -108,16 +112,15 @@ try: logger.info( f"Whitelist for {kind} is already in cache, skipping downloads...", ) + + if not urls[kind]: + whitelist_path.joinpath(f"{kind}.list").unlink(missing_ok=True) + deleted, err = del_file_in_db(f"{kind}.list", db) + if not deleted: + logger.warning(f"Coudn't delete {kind}.list from cache : {err}") if all_fresh: _exit(0) - # Get URLs - urls = {"IP": [], "RDNS": [], "ASN": [], "USER_AGENT": [], "URI": []} - for kind in urls: - for url in getenv(f"WHITELIST_{kind}_URLS", "").split(" "): - if url and url not in urls[kind]: - urls[kind].append(url) - # Loop on kinds for kind, urls_list in urls.items(): if kinds_fresh[kind]: