From f3d6f860e08c9ee7b4e761cfcfb6726636e7f42e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9ophile=20Diot?= <tdiot@bunkerity.com>
Date: Fri, 22 Sep 2023 18:38:05 +0100
Subject: [PATCH] Remove old cached files if urls are empty

---
 .../core/blacklist/jobs/blacklist-download.py | 45 ++++++++++---------
 .../core/greylist/jobs/greylist-download.py   | 21 +++++----
 .../core/realip/jobs/realip-download.py       | 17 ++++---
 .../core/whitelist/jobs/whitelist-download.py | 21 +++++----
 4 files changed, 59 insertions(+), 45 deletions(-)

diff --git a/src/common/core/blacklist/jobs/blacklist-download.py b/src/common/core/blacklist/jobs/blacklist-download.py
index f9740211d..2818982da 100755
--- a/src/common/core/blacklist/jobs/blacklist-download.py
+++ b/src/common/core/blacklist/jobs/blacklist-download.py
@@ -21,7 +21,7 @@ from requests import get
 
 from Database import Database  # type: ignore
 from logger import setup_logger  # type: ignore
-from jobs import cache_file, cache_hash, is_cached_file, file_hash
+from jobs import cache_file, cache_hash, del_file_in_db, is_cached_file, file_hash
 
 rdns_rx = re_compile(rb"^[^ ]+$", IGNORECASE)
 asn_rx = re_compile(rb"^\d+$")
@@ -85,8 +85,23 @@ try:
     tmp_blacklist_path = Path(sep, "var", "tmp", "bunkerweb", "blacklist")
     tmp_blacklist_path.mkdir(parents=True, exist_ok=True)
 
-    # Our urls data
-    urls = {"IP": [], "RDNS": [], "ASN": [], "USER_AGENT": [], "URI": []}
+    # Get URLs
+    urls = {
+        "IP": [],
+        "RDNS": [],
+        "ASN": [],
+        "USER_AGENT": [],
+        "URI": [],
+        "IGNORE_IP": [],
+        "IGNORE_RDNS": [],
+        "IGNORE_ASN": [],
+        "IGNORE_USER_AGENT": [],
+        "IGNORE_URI": [],
+    }
+    for kind in urls:
+        for url in getenv(f"BLACKLIST_{kind}_URLS", "").split(" "):
+            if url and url not in urls[kind]:
+                urls[kind].append(url)
 
     # Don't go further if the cache is fresh
     kinds_fresh = {
@@ -113,27 +128,15 @@ try:
             logger.info(
                 f"Blacklist for {kind} is already in cache, skipping downloads...",
             )
+
+            if not urls[kind]:
+                blacklist_path.joinpath(f"{kind}.list").unlink(missing_ok=True)
+                deleted, err = del_file_in_db(f"{kind}.list", db)
+                if not deleted:
+                    logger.warning(f"Coudn't delete {kind}.list from cache : {err}")
     if all_fresh:
         _exit(0)
 
-    # Get URLs
-    urls = {
-        "IP": [],
-        "RDNS": [],
-        "ASN": [],
-        "USER_AGENT": [],
-        "URI": [],
-        "IGNORE_IP": [],
-        "IGNORE_RDNS": [],
-        "IGNORE_ASN": [],
-        "IGNORE_USER_AGENT": [],
-        "IGNORE_URI": [],
-    }
-    for kind in urls:
-        for url in getenv(f"BLACKLIST_{kind}_URLS", "").split(" "):
-            if url and url not in urls[kind]:
-                urls[kind].append(url)
-
     # Loop on kinds
     for kind, urls_list in urls.items():
         if kinds_fresh[kind]:
diff --git a/src/common/core/greylist/jobs/greylist-download.py b/src/common/core/greylist/jobs/greylist-download.py
index 62d51b055..a0181e753 100755
--- a/src/common/core/greylist/jobs/greylist-download.py
+++ b/src/common/core/greylist/jobs/greylist-download.py
@@ -21,7 +21,7 @@ from requests import get
 
 from Database import Database  # type: ignore
 from logger import setup_logger  # type: ignore
-from jobs import cache_file, cache_hash, is_cached_file, file_hash
+from jobs import cache_file, cache_hash, del_file_in_db, is_cached_file, file_hash
 
 rdns_rx = re_compile(rb"^[^ ]+$", IGNORECASE)
 asn_rx = re_compile(rb"^\d+$")
@@ -85,8 +85,12 @@ try:
     tmp_greylist_path = Path(sep, "var", "tmp", "bunkerweb", "greylist")
     tmp_greylist_path.mkdir(parents=True, exist_ok=True)
 
-    # Our urls data
+    # Get URLs
     urls = {"IP": [], "RDNS": [], "ASN": [], "USER_AGENT": [], "URI": []}
+    for kind in urls:
+        for url in getenv(f"GREYLIST_{kind}_URLS", "").split(" "):
+            if url and url not in urls[kind]:
+                urls[kind].append(url)
 
     # Don't go further if the cache is fresh
     kinds_fresh = {
@@ -108,16 +112,15 @@ try:
             logger.info(
                 f"Greylist for {kind} is already in cache, skipping downloads...",
             )
+
+            if not urls[kind]:
+                greylist_path.joinpath(f"{kind}.list").unlink(missing_ok=True)
+                deleted, err = del_file_in_db(f"{kind}.list", db)
+                if not deleted:
+                    logger.warning(f"Coudn't delete {kind}.list from cache : {err}")
     if all_fresh:
         _exit(0)
 
-    # Get URLs
-    urls = {"IP": [], "RDNS": [], "ASN": [], "USER_AGENT": [], "URI": []}
-    for kind in urls:
-        for url in getenv(f"GREYLIST_{kind}_URLS", "").split(" "):
-            if url and url not in urls[kind]:
-                urls[kind].append(url)
-
     # Loop on kinds
     for kind, urls_list in urls.items():
         if kinds_fresh[kind]:
diff --git a/src/common/core/realip/jobs/realip-download.py b/src/common/core/realip/jobs/realip-download.py
index 856082c39..8217cfbeb 100755
--- a/src/common/core/realip/jobs/realip-download.py
+++ b/src/common/core/realip/jobs/realip-download.py
@@ -23,7 +23,7 @@ from requests import get
 
 from Database import Database  # type: ignore
 from logger import setup_logger  # type: ignore
-from jobs import cache_file, cache_hash, file_hash, is_cached_file
+from jobs import cache_file, cache_hash, del_file_in_db, file_hash, is_cached_file
 
 
 def check_line(line):
@@ -75,14 +75,19 @@ try:
 
     db = Database(logger, sqlalchemy_string=getenv("DATABASE_URI", None), pool=False)
 
-    # Don't go further if the cache is fresh
-    if is_cached_file(realip_path.joinpath("combined.list"), "hour", db):
-        logger.info("RealIP list is already in cache, skipping download...")
-        _exit(0)
-
     # Get URLs
     urls = [url for url in getenv("REAL_IP_FROM_URLS", "").split(" ") if url]
 
+    # Cache is still fresh: skip the download, but purge the cache if no URLs are configured
+    if is_cached_file(realip_path.joinpath("combined.list"), "hour", db):
+        if not urls:  # REAL_IP_FROM_URLS yielded no URL, so drop the cached list
+            realip_path.joinpath("combined.list").unlink(missing_ok=True)  # same file the freshness check above reads
+            deleted, err = del_file_in_db("combined.list", db)
+            if not deleted:
+                logger.warning(f"Coudn't delete combined.list from cache : {err}")
+        logger.info("RealIP list is already in cache, skipping download...")
+        _exit(0)
+
     # Download and write data to temp file
     i = 0
     content = b""
diff --git a/src/common/core/whitelist/jobs/whitelist-download.py b/src/common/core/whitelist/jobs/whitelist-download.py
index 3c8fd5b26..e30026dad 100755
--- a/src/common/core/whitelist/jobs/whitelist-download.py
+++ b/src/common/core/whitelist/jobs/whitelist-download.py
@@ -21,7 +21,7 @@ from requests import get
 
 from Database import Database  # type: ignore
 from logger import setup_logger  # type: ignore
-from jobs import cache_file, cache_hash, is_cached_file, file_hash
+from jobs import cache_file, cache_hash, del_file_in_db, is_cached_file, file_hash
 
 rdns_rx = re_compile(rb"^[^ ]+$", IGNORECASE)
 asn_rx = re_compile(rb"^\d+$")
@@ -85,8 +85,12 @@ try:
     tmp_whitelist_path = Path(sep, "var", "tmp", "bunkerweb", "whitelist")
     tmp_whitelist_path.mkdir(parents=True, exist_ok=True)
 
-    # Our urls data
+    # Get URLs
     urls = {"IP": [], "RDNS": [], "ASN": [], "USER_AGENT": [], "URI": []}
+    for kind in urls:
+        for url in getenv(f"WHITELIST_{kind}_URLS", "").split(" "):
+            if url and url not in urls[kind]:
+                urls[kind].append(url)
 
     # Don't go further if the cache is fresh
     kinds_fresh = {
@@ -108,16 +112,15 @@ try:
             logger.info(
                 f"Whitelist for {kind} is already in cache, skipping downloads...",
             )
+
+            if not urls[kind]:
+                whitelist_path.joinpath(f"{kind}.list").unlink(missing_ok=True)
+                deleted, err = del_file_in_db(f"{kind}.list", db)
+                if not deleted:
+                    logger.warning(f"Coudn't delete {kind}.list from cache : {err}")
     if all_fresh:
         _exit(0)
 
-    # Get URLs
-    urls = {"IP": [], "RDNS": [], "ASN": [], "USER_AGENT": [], "URI": []}
-    for kind in urls:
-        for url in getenv(f"WHITELIST_{kind}_URLS", "").split(" "):
-            if url and url not in urls[kind]:
-                urls[kind].append(url)
-
     # Loop on kinds
     for kind, urls_list in urls.items():
         if kinds_fresh[kind]: