Refactor blacklist, greylist, realip and whitelist download scripts to improve caching logic and reduce unnecessary downloads

2026-05-24 09:28:37 +00:00 · 2024-10-30 17:20:48 +01:00 · 2024-10-30 17:20:48 +01:00 · 1c3fea7f09
commit 1c3fea7f09
parent aaa7aa2fb8
4 changed files with 52 additions and 137 deletions
--- a/src/common/core/blacklist/jobs/blacklist-download.py
+++ b/src/common/core/blacklist/jobs/blacklist-download.py
@@ -3,12 +3,10 @@
 from contextlib import suppress
 from datetime import datetime, timedelta
 from ipaddress import ip_address, ip_network
-from json import dumps, loads
 from os import getenv, sep
 from os.path import join, normpath
 from pathlib import Path
 from re import compile as re_compile
-from shutil import rmtree
 from sys import exit as sys_exit, path as sys_path
 from traceback import format_exc
 from typing import Tuple
@@ -112,19 +110,13 @@ try:
                LOGGER.warning(f"Couldn't delete blacklist URLs from cache : {err}")
        sys_exit(0)

-    cached_urls = loads(JOB.get_cache("urls.json") or "{}")
-
-    tmp_downloads = Path(sep, "var", "tmp", "bunkerweb", "blacklist")
-    tmp_downloads.mkdir(parents=True, exist_ok=True)
-    downloaded_urls = {}
    failed_urls = set()
-    current_timestamp = datetime.now().astimezone().timestamp()

    # Loop on kinds
    for service, kinds in services_blacklist_urls.items():
        for kind, urls_list in kinds.items():
            if not urls_list:
-                if Path(JOB.job_path.joinpath(service, f"{kind}.list")).exists():
+                if JOB.job_path.joinpath(service, f"{kind}.list").exists():
                    LOGGER.warning(f"{service} blacklist for {kind} is cached but no URL is configured, removing from cache...")
                    deleted, err = JOB.del_cache(f"{kind}.list", service_id=service)
                    if not deleted:
@@ -134,25 +126,16 @@ try:
            # Write combined data of the kind in memory and check if it has changed
            content = b""
            for url in urls_list:
+                url_file = f"{bytes_hash(url, algorithm='sha1')}.list"
+                cached_url = JOB.get_cache(url_file, with_info=True, with_data=True)
                try:
-                    cached_url = cached_urls.get(url, {"time": 0, "tmp_path": ""})
-                    # Check if the URL's last download timestamp is younger than 1 hour
-                    if current_timestamp - cached_url["time"] < timedelta(hours=1).total_seconds():
-                        downloaded_urls[url] = {
-                            "time": cached_url["time"],
-                            "tmp_path": tmp_downloads.joinpath(f"{bytes_hash(url, algorithm='sha1')}.list").as_posix(),
-                        }
-                        LOGGER.info(f"URL {url} has been downloaded less than 1 hour ago, skipping it...")
-                        failed_urls.add(url)
-                        status = 1 if status == 1 else 0
-                        continue
-
                    # Check if the URL has already been downloaded
                    if url in failed_urls:
                        continue
-                    elif url in downloaded_urls:
-                        LOGGER.info(f"URL {url} has already been downloaded, skipping it...")
-                        content += Path(downloaded_urls[url]["tmp_path"]).read_bytes()
+                    elif isinstance(cached_url, dict) and cached_url["last_update"] < (datetime.now().astimezone() - timedelta(hours=1)).timestamp():
+                        LOGGER.info(f"URL {url} has already been downloaded less than 1 hour ago, skipping download...")
+                        # Remove first line (URL) and add to content
+                        content += b"\n".join(cached_url["data"].split(b"\n")[1:]) + b"\n"
                    else:
                        LOGGER.info(f"Downloading blacklist data from {url} ...")
                        if url.startswith("file://"):
@@ -182,15 +165,17 @@ try:
                                i += 1

                        LOGGER.info(f"Downloaded {i} bad {kind}")
-                        downloaded_urls[url] = {
-                            "time": current_timestamp,
-                            "tmp_path": tmp_downloads.joinpath(f"{bytes_hash(url, algorithm='sha1')}.list").as_posix(),
-                        }
+
+                        cached, err = JOB.cache_file(url_file, b"# Downloaded from " + url.encode("utf-8") + b"\n" + content)
+                        if not cached:
+                            LOGGER.error(f"Error while caching url content : {err}")
                except BaseException as e:
                    status = 2
                    LOGGER.error(f"Exception while getting {service} blacklist from {url} :\n{e}")
                    failed_urls.add(url)

+            LOGGER.debug(f"Content for {service} {kind} : {content}")
+
            # Check if file has changed
            new_hash = bytes_hash(content)
            old_hash = JOB.cache_hash(f"{kind}.list", service_id=service)
@@ -207,12 +192,6 @@ try:
                continue

            status = 1
-
-    cached, err = JOB.cache_file("urls.json", dumps(downloaded_urls, indent=2).encode("utf-8"))
-    if not cached:
-        LOGGER.error(f"Error while caching blacklist URLs : {err}")
-
-    rmtree(tmp_downloads, ignore_errors=True)
 except SystemExit as e:
    status = e.code
 except:
--- a/src/common/core/greylist/jobs/greylist-download.py
+++ b/src/common/core/greylist/jobs/greylist-download.py
@@ -3,12 +3,10 @@
 from contextlib import suppress
 from datetime import datetime, timedelta
 from ipaddress import ip_address, ip_network
-from json import dumps, loads
 from os import getenv, sep
 from os.path import join, normpath
 from pathlib import Path
 from re import compile as re_compile
-from shutil import rmtree
 from sys import exit as sys_exit, path as sys_path
 from traceback import format_exc
 from typing import Tuple
@@ -112,19 +110,13 @@ try:
                LOGGER.warning(f"Couldn't delete greylist URLs from cache : {err}")
        sys_exit(0)

-    cached_urls = loads(JOB.get_cache("urls.json") or "{}")
-
-    tmp_downloads = Path(sep, "var", "tmp", "bunkerweb", "greylist")
-    tmp_downloads.mkdir(parents=True, exist_ok=True)
-    downloaded_urls = {}
    failed_urls = set()
-    current_timestamp = datetime.now().astimezone().timestamp()

    # Loop on kinds
    for service, kinds in services_greylist_urls.items():
        for kind, urls_list in kinds.items():
            if not urls_list:
-                if Path(JOB.job_path.joinpath(service, f"{kind}.list")).exists():
+                if JOB.job_path.joinpath(service, f"{kind}.list").exists():
                    LOGGER.warning(f"{service} greylist for {kind} is cached but no URL is configured, removing from cache...")
                    deleted, err = JOB.del_cache(f"{kind}.list", service_id=service)
                    if not deleted:
@@ -134,25 +126,16 @@ try:
            # Write combined data of the kind in memory and check if it has changed
            content = b""
            for url in urls_list:
+                url_file = f"{bytes_hash(url, algorithm='sha1')}.list"
+                cached_url = JOB.get_cache(url_file, with_info=True, with_data=True)
                try:
-                    cached_url = cached_urls.get(url, {"time": 0, "tmp_path": ""})
-                    # Check if the URL's last download timestamp is younger than 1 hour
-                    if current_timestamp - cached_url["time"] < timedelta(hours=1).total_seconds():
-                        downloaded_urls[url] = {
-                            "time": cached_url["time"],
-                            "tmp_path": tmp_downloads.joinpath(f"{bytes_hash(url, algorithm='sha1')}.list").as_posix(),
-                        }
-                        LOGGER.info(f"URL {url} has been downloaded less than 1 hour ago, skipping it...")
-                        failed_urls.add(url)
-                        status = 1 if status == 1 else 0
-                        continue
-
                    # Check if the URL has already been downloaded
                    if url in failed_urls:
                        continue
-                    elif url in downloaded_urls:
-                        LOGGER.info(f"URL {url} has already been downloaded, skipping it...")
-                        content += Path(downloaded_urls[url]["tmp_path"]).read_bytes()
+                    elif isinstance(cached_url, dict) and cached_url["last_update"] < (datetime.now().astimezone() - timedelta(hours=1)).timestamp():
+                        LOGGER.info(f"URL {url} has already been downloaded less than 1 hour ago, skipping download...")
+                        # Remove first line (URL) and add to content
+                        content += b"\n".join(cached_url["data"].split(b"\n")[1:]) + b"\n"
                    else:
                        LOGGER.info(f"Downloading greylist data from {url} ...")
                        if url.startswith("file://"):
@@ -182,15 +165,17 @@ try:
                                i += 1

                        LOGGER.info(f"Downloaded {i} bad {kind}")
-                        downloaded_urls[url] = {
-                            "time": current_timestamp,
-                            "tmp_path": tmp_downloads.joinpath(f"{bytes_hash(url, algorithm='sha1')}.list").as_posix(),
-                        }
+
+                        cached, err = JOB.cache_file(url_file, b"# Downloaded from " + url.encode("utf-8") + b"\n" + content)
+                        if not cached:
+                            LOGGER.error(f"Error while caching url content : {err}")
                except BaseException as e:
                    status = 2
                    LOGGER.error(f"Exception while getting {service} greylist from {url} :\n{e}")
                    failed_urls.add(url)

+            LOGGER.debug(f"Content for {service} {kind} : {content}")
+
            # Check if file has changed
            new_hash = bytes_hash(content)
            old_hash = JOB.cache_hash(f"{kind}.list", service_id=service)
@@ -207,12 +192,6 @@ try:
                continue

            status = 1
-
-    cached, err = JOB.cache_file("urls.json", dumps(downloaded_urls, indent=2).encode("utf-8"))
-    if not cached:
-        LOGGER.error(f"Error while caching greylist URLs : {err}")
-
-    rmtree(tmp_downloads, ignore_errors=True)
 except SystemExit as e:
    status = e.code
 except:
--- a/src/common/core/realip/jobs/realip-download.py
+++ b/src/common/core/realip/jobs/realip-download.py
@@ -3,11 +3,9 @@
 from contextlib import suppress
 from datetime import datetime, timedelta
 from ipaddress import ip_address, ip_network
-from json import dumps, loads
 from os import getenv, sep
 from os.path import join, normpath
 from pathlib import Path
-from shutil import rmtree
 from sys import exit as sys_exit, path as sys_path
 from traceback import format_exc

@@ -89,17 +87,11 @@ try:
                LOGGER.warning(f"Couldn't delete realip URLs from cache : {err}")
        sys_exit(0)

-    cached_urls = loads(JOB.get_cache("urls.json") or "{}")
-
-    tmp_downloads = Path(sep, "var", "tmp", "bunkerweb", "realip")
-    tmp_downloads.mkdir(parents=True, exist_ok=True)
-    downloaded_urls = {}
    failed_urls = set()
-    current_timestamp = datetime.now().astimezone().timestamp()

    for service, urls in services_realip_urls.items():
        if not urls:
-            if Path(JOB.job_path.joinpath(service, "combined.list")).exists():
+            if JOB.job_path.joinpath(service, "combined.list").exists():
                LOGGER.warning(f"{service} realip combined.list is cached but no URL is configured, removing from cache...")
                deleted, err = JOB.del_cache("combined.list", service_id=service)
                if not deleted:
@@ -109,25 +101,16 @@ try:
        # Write combined data of the kind in memory and check if it has changed
        content = b""
        for url in urls:
+            url_file = f"{bytes_hash(url, algorithm='sha1')}.list"
+            cached_url = JOB.get_cache(url_file, with_info=True, with_data=True)
            try:
-                cached_url = cached_urls.get(url, {"time": 0, "tmp_path": ""})
-                # Check if the URL's last download timestamp is younger than 1 hour
-                if current_timestamp - cached_url["time"] < timedelta(hours=1).total_seconds():
-                    downloaded_urls[url] = {
-                        "time": cached_url["time"],
-                        "tmp_path": tmp_downloads.joinpath(f"{bytes_hash(url, algorithm='sha1')}.list").as_posix(),
-                    }
-                    LOGGER.info(f"URL {url} has been downloaded less than 1 hour ago, skipping it...")
-                    failed_urls.add(url)
-                    status = 1 if status == 1 else 0
-                    continue
-
                # Check if the URL has already been downloaded
                if url in failed_urls:
                    continue
-                elif url in downloaded_urls:
-                    LOGGER.info(f"URL {url} has already been downloaded, skipping it...")
-                    content += Path(downloaded_urls[url]["tmp_path"]).read_bytes()
+                elif isinstance(cached_url, dict) and cached_url["last_update"] < (datetime.now().astimezone() - timedelta(hours=1)).timestamp():
+                    LOGGER.info(f"URL {url} has already been downloaded less than 1 hour ago, skipping download...")
+                    # Remove first line (URL) and add to content
+                    content += b"\n".join(cached_url["data"].split(b"\n")[1:]) + b"\n"
                else:
                    LOGGER.info(f"Downloading realip data from {url} ...")
                    if url.startswith("file://"):
@@ -155,16 +138,17 @@ try:
                            i += 1

                    LOGGER.info(f"Downloaded {i} realip from {url}")
-                    tmp_downloads.joinpath(f"{bytes_hash(url, algorithm='sha1')}.list").write_bytes(content)
-                    downloaded_urls[url] = {
-                        "time": current_timestamp,
-                        "tmp_path": tmp_downloads.joinpath(f"{bytes_hash(url, algorithm='sha1')}.list").as_posix(),
-                    }
+
+                    cached, err = JOB.cache_file(url_file, b"# Downloaded from " + url.encode("utf-8") + b"\n" + content)
+                    if not cached:
+                        LOGGER.error(f"Error while caching url content : {err}")
            except BaseException as e:
                status = 2
                LOGGER.error(f"Exception while getting {service} realip from {url} :\n{e}")
                failed_urls.add(url)

+        LOGGER.debug(f"Content for {service} : {content}")
+
        # Check if file has changed
        new_hash = bytes_hash(content)
        old_hash = JOB.cache_hash("combined.list", service_id=service)
@@ -181,12 +165,6 @@ try:
            continue

        status = 1
-
-    cached, err = JOB.cache_file("urls.json", dumps(downloaded_urls, indent=2).encode("utf-8"))
-    if not cached:
-        LOGGER.error(f"Error while caching whitelist URLs : {err}")
-
-    rmtree(tmp_downloads, ignore_errors=True)
 except SystemExit as e:
    status = e.code
 except:
--- a/src/common/core/whitelist/jobs/whitelist-download.py
+++ b/src/common/core/whitelist/jobs/whitelist-download.py
@@ -3,12 +3,10 @@
 from contextlib import suppress
 from datetime import datetime, timedelta
 from ipaddress import ip_address, ip_network
-from json import dumps, loads
 from os import getenv, sep
 from os.path import join, normpath
 from pathlib import Path
 from re import compile as re_compile
-from shutil import rmtree
 from sys import exit as sys_exit, path as sys_path
 from traceback import format_exc
 from typing import Tuple
@@ -112,19 +110,13 @@ try:
                LOGGER.warning(f"Couldn't delete whitelist URLs from cache : {err}")
        sys_exit(0)

-    cached_urls = loads(JOB.get_cache("urls.json") or "{}")
-
-    tmp_downloads = Path(sep, "var", "tmp", "bunkerweb", "blacklist")
-    tmp_downloads.mkdir(parents=True, exist_ok=True)
-    downloaded_urls = {}
    failed_urls = set()
-    current_timestamp = datetime.now().astimezone().timestamp()

    # Loop on kinds
    for service, kinds in services_whitelist_urls.items():
        for kind, urls_list in kinds.items():
            if not urls_list:
-                if Path(JOB.job_path.joinpath(service, f"{kind}.list")).exists():
+                if JOB.job_path.joinpath(service, f"{kind}.list").exists():
                    LOGGER.warning(f"{service} whitelist for {kind} is cached but no URL is configured, removing from cache...")
                    deleted, err = JOB.del_cache(f"{kind}.list", service_id=service)
                    if not deleted:
@@ -134,25 +126,16 @@ try:
            # Write combined data of the kind in memory and check if it has changed
            content = b""
            for url in urls_list:
+                url_file = f"{bytes_hash(url, algorithm='sha1')}.list"
+                cached_url = JOB.get_cache(url_file, with_info=True, with_data=True)
                try:
-                    cached_url = cached_urls.get(url, {"time": 0, "tmp_path": ""})
-                    # Check if the URL's last download timestamp is younger than 1 hour
-                    if current_timestamp - cached_url["time"] < timedelta(hours=1).total_seconds():
-                        downloaded_urls[url] = {
-                            "time": cached_url["time"],
-                            "tmp_path": tmp_downloads.joinpath(f"{bytes_hash(url, algorithm='sha1')}.list").as_posix(),
-                        }
-                        LOGGER.info(f"URL {url} has been downloaded less than 1 hour ago, skipping it...")
-                        failed_urls.add(url)
-                        status = 1 if status == 1 else 0
-                        continue
-
                    # Check if the URL has already been downloaded
                    if url in failed_urls:
                        continue
-                    elif url in downloaded_urls:
-                        LOGGER.info(f"URL {url} has already been downloaded, skipping it...")
-                        content += Path(downloaded_urls[url]["tmp_path"]).read_bytes()
+                    elif isinstance(cached_url, dict) and cached_url["last_update"] < (datetime.now().astimezone() - timedelta(hours=1)).timestamp():
+                        LOGGER.info(f"URL {url} has already been downloaded less than 1 hour ago, skipping download...")
+                        # Remove first line (URL) and add to content
+                        content += b"\n".join(cached_url["data"].split(b"\n")[1:]) + b"\n"
                    else:
                        LOGGER.info(f"Downloading whitelist data from {url} ...")
                        if url.startswith("file://"):
@@ -182,15 +165,17 @@ try:
                                i += 1

                        LOGGER.info(f"Downloaded {i} bad {kind}")
-                        downloaded_urls[url] = {
-                            "time": current_timestamp,
-                            "tmp_path": tmp_downloads.joinpath(f"{bytes_hash(url, algorithm='sha1')}.list").as_posix(),
-                        }
+
+                        cached, err = JOB.cache_file(url_file, b"# Downloaded from " + url.encode("utf-8") + b"\n" + content)
+                        if not cached:
+                            LOGGER.error(f"Error while caching url content : {err}")
                except BaseException as e:
                    status = 2
                    LOGGER.error(f"Exception while getting {service} whitelist from {url} :\n{e}")
                    failed_urls.add(url)

+            LOGGER.debug(f"Content for {service} {kind} : {content}")
+
            # Check if file has changed
            new_hash = bytes_hash(content)
            old_hash = JOB.cache_hash(f"{kind}.list", service_id=service)
@@ -207,12 +192,6 @@ try:
                continue

            status = 1
-
-    cached, err = JOB.cache_file("urls.json", dumps(downloaded_urls, indent=2).encode("utf-8"))
-    if not cached:
-        LOGGER.error(f"Error while caching whitelist URLs : {err}")
-
-    rmtree(tmp_downloads, ignore_errors=True)
 except SystemExit as e:
    status = e.code
 except: