Refactor blacklist, greylist, realip and whitelist download scripts to improve caching logic and reduce unnecessary downloads

This commit is contained in:
Théophile Diot 2024-10-30 17:20:48 +01:00
parent aaa7aa2fb8
commit 1c3fea7f09
No known key found for this signature in database
GPG key ID: FA995104A0BA376A
4 changed files with 52 additions and 137 deletions

View file

@ -3,12 +3,10 @@
from contextlib import suppress
from datetime import datetime, timedelta
from ipaddress import ip_address, ip_network
from json import dumps, loads
from os import getenv, sep
from os.path import join, normpath
from pathlib import Path
from re import compile as re_compile
from shutil import rmtree
from sys import exit as sys_exit, path as sys_path
from traceback import format_exc
from typing import Tuple
@ -112,19 +110,13 @@ try:
LOGGER.warning(f"Couldn't delete blacklist URLs from cache : {err}")
sys_exit(0)
cached_urls = loads(JOB.get_cache("urls.json") or "{}")
tmp_downloads = Path(sep, "var", "tmp", "bunkerweb", "blacklist")
tmp_downloads.mkdir(parents=True, exist_ok=True)
downloaded_urls = {}
failed_urls = set()
current_timestamp = datetime.now().astimezone().timestamp()
# Loop on kinds
for service, kinds in services_blacklist_urls.items():
for kind, urls_list in kinds.items():
if not urls_list:
if Path(JOB.job_path.joinpath(service, f"{kind}.list")).exists():
if JOB.job_path.joinpath(service, f"{kind}.list").exists():
LOGGER.warning(f"{service} blacklist for {kind} is cached but no URL is configured, removing from cache...")
deleted, err = JOB.del_cache(f"{kind}.list", service_id=service)
if not deleted:
@ -134,25 +126,16 @@ try:
# Write combined data of the kind in memory and check if it has changed
content = b""
for url in urls_list:
url_file = f"{bytes_hash(url, algorithm='sha1')}.list"
cached_url = JOB.get_cache(url_file, with_info=True, with_data=True)
try:
cached_url = cached_urls.get(url, {"time": 0, "tmp_path": ""})
# Check if the URL's last download timestamp is younger than 1 hour
if current_timestamp - cached_url["time"] < timedelta(hours=1).total_seconds():
downloaded_urls[url] = {
"time": cached_url["time"],
"tmp_path": tmp_downloads.joinpath(f"{bytes_hash(url, algorithm='sha1')}.list").as_posix(),
}
LOGGER.info(f"URL {url} has been downloaded less than 1 hour ago, skipping it...")
failed_urls.add(url)
status = 1 if status == 1 else 0
continue
# Check if the URL has already been downloaded
if url in failed_urls:
continue
elif url in downloaded_urls:
LOGGER.info(f"URL {url} has already been downloaded, skipping it...")
content += Path(downloaded_urls[url]["tmp_path"]).read_bytes()
elif isinstance(cached_url, dict) and cached_url["last_update"] < (datetime.now().astimezone() - timedelta(hours=1)).timestamp():
LOGGER.info(f"URL {url} has already been downloaded less than 1 hour ago, skipping download...")
# Remove first line (URL) and add to content
content += b"\n".join(cached_url["data"].split(b"\n")[1:]) + b"\n"
else:
LOGGER.info(f"Downloading blacklist data from {url} ...")
if url.startswith("file://"):
@ -182,15 +165,17 @@ try:
i += 1
LOGGER.info(f"Downloaded {i} bad {kind}")
downloaded_urls[url] = {
"time": current_timestamp,
"tmp_path": tmp_downloads.joinpath(f"{bytes_hash(url, algorithm='sha1')}.list").as_posix(),
}
cached, err = JOB.cache_file(url_file, b"# Downloaded from " + url.encode("utf-8") + b"\n" + content)
if not cached:
LOGGER.error(f"Error while caching url content : {err}")
except BaseException as e:
status = 2
LOGGER.error(f"Exception while getting {service} blacklist from {url} :\n{e}")
failed_urls.add(url)
LOGGER.debug(f"Content for {service} {kind} : {content}")
# Check if file has changed
new_hash = bytes_hash(content)
old_hash = JOB.cache_hash(f"{kind}.list", service_id=service)
@ -207,12 +192,6 @@ try:
continue
status = 1
cached, err = JOB.cache_file("urls.json", dumps(downloaded_urls, indent=2).encode("utf-8"))
if not cached:
LOGGER.error(f"Error while caching blacklist URLs : {err}")
rmtree(tmp_downloads, ignore_errors=True)
except SystemExit as e:
status = e.code
except:

View file

@ -3,12 +3,10 @@
from contextlib import suppress
from datetime import datetime, timedelta
from ipaddress import ip_address, ip_network
from json import dumps, loads
from os import getenv, sep
from os.path import join, normpath
from pathlib import Path
from re import compile as re_compile
from shutil import rmtree
from sys import exit as sys_exit, path as sys_path
from traceback import format_exc
from typing import Tuple
@ -112,19 +110,13 @@ try:
LOGGER.warning(f"Couldn't delete greylist URLs from cache : {err}")
sys_exit(0)
cached_urls = loads(JOB.get_cache("urls.json") or "{}")
tmp_downloads = Path(sep, "var", "tmp", "bunkerweb", "greylist")
tmp_downloads.mkdir(parents=True, exist_ok=True)
downloaded_urls = {}
failed_urls = set()
current_timestamp = datetime.now().astimezone().timestamp()
# Loop on kinds
for service, kinds in services_greylist_urls.items():
for kind, urls_list in kinds.items():
if not urls_list:
if Path(JOB.job_path.joinpath(service, f"{kind}.list")).exists():
if JOB.job_path.joinpath(service, f"{kind}.list").exists():
LOGGER.warning(f"{service} greylist for {kind} is cached but no URL is configured, removing from cache...")
deleted, err = JOB.del_cache(f"{kind}.list", service_id=service)
if not deleted:
@ -134,25 +126,16 @@ try:
# Write combined data of the kind in memory and check if it has changed
content = b""
for url in urls_list:
url_file = f"{bytes_hash(url, algorithm='sha1')}.list"
cached_url = JOB.get_cache(url_file, with_info=True, with_data=True)
try:
cached_url = cached_urls.get(url, {"time": 0, "tmp_path": ""})
# Check if the URL's last download timestamp is younger than 1 hour
if current_timestamp - cached_url["time"] < timedelta(hours=1).total_seconds():
downloaded_urls[url] = {
"time": cached_url["time"],
"tmp_path": tmp_downloads.joinpath(f"{bytes_hash(url, algorithm='sha1')}.list").as_posix(),
}
LOGGER.info(f"URL {url} has been downloaded less than 1 hour ago, skipping it...")
failed_urls.add(url)
status = 1 if status == 1 else 0
continue
# Check if the URL has already been downloaded
if url in failed_urls:
continue
elif url in downloaded_urls:
LOGGER.info(f"URL {url} has already been downloaded, skipping it...")
content += Path(downloaded_urls[url]["tmp_path"]).read_bytes()
elif isinstance(cached_url, dict) and cached_url["last_update"] < (datetime.now().astimezone() - timedelta(hours=1)).timestamp():
LOGGER.info(f"URL {url} has already been downloaded less than 1 hour ago, skipping download...")
# Remove first line (URL) and add to content
content += b"\n".join(cached_url["data"].split(b"\n")[1:]) + b"\n"
else:
LOGGER.info(f"Downloading greylist data from {url} ...")
if url.startswith("file://"):
@ -182,15 +165,17 @@ try:
i += 1
LOGGER.info(f"Downloaded {i} bad {kind}")
downloaded_urls[url] = {
"time": current_timestamp,
"tmp_path": tmp_downloads.joinpath(f"{bytes_hash(url, algorithm='sha1')}.list").as_posix(),
}
cached, err = JOB.cache_file(url_file, b"# Downloaded from " + url.encode("utf-8") + b"\n" + content)
if not cached:
LOGGER.error(f"Error while caching url content : {err}")
except BaseException as e:
status = 2
LOGGER.error(f"Exception while getting {service} greylist from {url} :\n{e}")
failed_urls.add(url)
LOGGER.debug(f"Content for {service} {kind} : {content}")
# Check if file has changed
new_hash = bytes_hash(content)
old_hash = JOB.cache_hash(f"{kind}.list", service_id=service)
@ -207,12 +192,6 @@ try:
continue
status = 1
cached, err = JOB.cache_file("urls.json", dumps(downloaded_urls, indent=2).encode("utf-8"))
if not cached:
LOGGER.error(f"Error while caching greylist URLs : {err}")
rmtree(tmp_downloads, ignore_errors=True)
except SystemExit as e:
status = e.code
except:

View file

@ -3,11 +3,9 @@
from contextlib import suppress
from datetime import datetime, timedelta
from ipaddress import ip_address, ip_network
from json import dumps, loads
from os import getenv, sep
from os.path import join, normpath
from pathlib import Path
from shutil import rmtree
from sys import exit as sys_exit, path as sys_path
from traceback import format_exc
@ -89,17 +87,11 @@ try:
LOGGER.warning(f"Couldn't delete realip URLs from cache : {err}")
sys_exit(0)
cached_urls = loads(JOB.get_cache("urls.json") or "{}")
tmp_downloads = Path(sep, "var", "tmp", "bunkerweb", "realip")
tmp_downloads.mkdir(parents=True, exist_ok=True)
downloaded_urls = {}
failed_urls = set()
current_timestamp = datetime.now().astimezone().timestamp()
for service, urls in services_realip_urls.items():
if not urls:
if Path(JOB.job_path.joinpath(service, "combined.list")).exists():
if JOB.job_path.joinpath(service, "combined.list").exists():
LOGGER.warning(f"{service} realip combined.list is cached but no URL is configured, removing from cache...")
deleted, err = JOB.del_cache("combined.list", service_id=service)
if not deleted:
@ -109,25 +101,16 @@ try:
# Write combined data of the kind in memory and check if it has changed
content = b""
for url in urls:
url_file = f"{bytes_hash(url, algorithm='sha1')}.list"
cached_url = JOB.get_cache(url_file, with_info=True, with_data=True)
try:
cached_url = cached_urls.get(url, {"time": 0, "tmp_path": ""})
# Check if the URL's last download timestamp is younger than 1 hour
if current_timestamp - cached_url["time"] < timedelta(hours=1).total_seconds():
downloaded_urls[url] = {
"time": cached_url["time"],
"tmp_path": tmp_downloads.joinpath(f"{bytes_hash(url, algorithm='sha1')}.list").as_posix(),
}
LOGGER.info(f"URL {url} has been downloaded less than 1 hour ago, skipping it...")
failed_urls.add(url)
status = 1 if status == 1 else 0
continue
# Check if the URL has already been downloaded
if url in failed_urls:
continue
elif url in downloaded_urls:
LOGGER.info(f"URL {url} has already been downloaded, skipping it...")
content += Path(downloaded_urls[url]["tmp_path"]).read_bytes()
elif isinstance(cached_url, dict) and cached_url["last_update"] < (datetime.now().astimezone() - timedelta(hours=1)).timestamp():
LOGGER.info(f"URL {url} has already been downloaded less than 1 hour ago, skipping download...")
# Remove first line (URL) and add to content
content += b"\n".join(cached_url["data"].split(b"\n")[1:]) + b"\n"
else:
LOGGER.info(f"Downloading realip data from {url} ...")
if url.startswith("file://"):
@ -155,16 +138,17 @@ try:
i += 1
LOGGER.info(f"Downloaded {i} realip from {url}")
tmp_downloads.joinpath(f"{bytes_hash(url, algorithm='sha1')}.list").write_bytes(content)
downloaded_urls[url] = {
"time": current_timestamp,
"tmp_path": tmp_downloads.joinpath(f"{bytes_hash(url, algorithm='sha1')}.list").as_posix(),
}
cached, err = JOB.cache_file(url_file, b"# Downloaded from " + url.encode("utf-8") + b"\n" + content)
if not cached:
LOGGER.error(f"Error while caching url content : {err}")
except BaseException as e:
status = 2
LOGGER.error(f"Exception while getting {service} realip from {url} :\n{e}")
failed_urls.add(url)
LOGGER.debug(f"Content for {service} : {content}")
# Check if file has changed
new_hash = bytes_hash(content)
old_hash = JOB.cache_hash("combined.list", service_id=service)
@ -181,12 +165,6 @@ try:
continue
status = 1
cached, err = JOB.cache_file("urls.json", dumps(downloaded_urls, indent=2).encode("utf-8"))
if not cached:
LOGGER.error(f"Error while caching whitelist URLs : {err}")
rmtree(tmp_downloads, ignore_errors=True)
except SystemExit as e:
status = e.code
except:

View file

@ -3,12 +3,10 @@
from contextlib import suppress
from datetime import datetime, timedelta
from ipaddress import ip_address, ip_network
from json import dumps, loads
from os import getenv, sep
from os.path import join, normpath
from pathlib import Path
from re import compile as re_compile
from shutil import rmtree
from sys import exit as sys_exit, path as sys_path
from traceback import format_exc
from typing import Tuple
@ -112,19 +110,13 @@ try:
LOGGER.warning(f"Couldn't delete whitelist URLs from cache : {err}")
sys_exit(0)
cached_urls = loads(JOB.get_cache("urls.json") or "{}")
tmp_downloads = Path(sep, "var", "tmp", "bunkerweb", "blacklist")
tmp_downloads.mkdir(parents=True, exist_ok=True)
downloaded_urls = {}
failed_urls = set()
current_timestamp = datetime.now().astimezone().timestamp()
# Loop on kinds
for service, kinds in services_whitelist_urls.items():
for kind, urls_list in kinds.items():
if not urls_list:
if Path(JOB.job_path.joinpath(service, f"{kind}.list")).exists():
if JOB.job_path.joinpath(service, f"{kind}.list").exists():
LOGGER.warning(f"{service} whitelist for {kind} is cached but no URL is configured, removing from cache...")
deleted, err = JOB.del_cache(f"{kind}.list", service_id=service)
if not deleted:
@ -134,25 +126,16 @@ try:
# Write combined data of the kind in memory and check if it has changed
content = b""
for url in urls_list:
url_file = f"{bytes_hash(url, algorithm='sha1')}.list"
cached_url = JOB.get_cache(url_file, with_info=True, with_data=True)
try:
cached_url = cached_urls.get(url, {"time": 0, "tmp_path": ""})
# Check if the URL's last download timestamp is younger than 1 hour
if current_timestamp - cached_url["time"] < timedelta(hours=1).total_seconds():
downloaded_urls[url] = {
"time": cached_url["time"],
"tmp_path": tmp_downloads.joinpath(f"{bytes_hash(url, algorithm='sha1')}.list").as_posix(),
}
LOGGER.info(f"URL {url} has been downloaded less than 1 hour ago, skipping it...")
failed_urls.add(url)
status = 1 if status == 1 else 0
continue
# Check if the URL has already been downloaded
if url in failed_urls:
continue
elif url in downloaded_urls:
LOGGER.info(f"URL {url} has already been downloaded, skipping it...")
content += Path(downloaded_urls[url]["tmp_path"]).read_bytes()
elif isinstance(cached_url, dict) and cached_url["last_update"] < (datetime.now().astimezone() - timedelta(hours=1)).timestamp():
LOGGER.info(f"URL {url} has already been downloaded less than 1 hour ago, skipping download...")
# Remove first line (URL) and add to content
content += b"\n".join(cached_url["data"].split(b"\n")[1:]) + b"\n"
else:
LOGGER.info(f"Downloading whitelist data from {url} ...")
if url.startswith("file://"):
@ -182,15 +165,17 @@ try:
i += 1
LOGGER.info(f"Downloaded {i} bad {kind}")
downloaded_urls[url] = {
"time": current_timestamp,
"tmp_path": tmp_downloads.joinpath(f"{bytes_hash(url, algorithm='sha1')}.list").as_posix(),
}
cached, err = JOB.cache_file(url_file, b"# Downloaded from " + url.encode("utf-8") + b"\n" + content)
if not cached:
LOGGER.error(f"Error while caching url content : {err}")
except BaseException as e:
status = 2
LOGGER.error(f"Exception while getting {service} whitelist from {url} :\n{e}")
failed_urls.add(url)
LOGGER.debug(f"Content for {service} {kind} : {content}")
# Check if file has changed
new_hash = bytes_hash(content)
old_hash = JOB.cache_hash(f"{kind}.list", service_id=service)
@ -207,12 +192,6 @@ try:
continue
status = 1
cached, err = JOB.cache_file("urls.json", dumps(downloaded_urls, indent=2).encode("utf-8"))
if not cached:
LOGGER.error(f"Error while caching whitelist URLs : {err}")
rmtree(tmp_downloads, ignore_errors=True)
except SystemExit as e:
status = e.code
except: