From d040b06869eb7562ff8a9eed48d6dc0c7e3f7e28 Mon Sep 17 00:00:00 2001
From: Richie Cahill
Date: Tue, 28 Oct 2025 19:30:27 -0400
Subject: [PATCH] added system_tests

---
 common/global/default.nix              |   3 +-
 python/default.nix                     |  23 ++++++
 python/system_tests/__init__.py        |   1 +
 python/system_tests/components.py      | 107 +++++++++++++++++++++++++
 python/system_tests/validate_system.py |  68 ++++++++++++++++
 5 files changed, 201 insertions(+), 1 deletion(-)
 create mode 100644 python/default.nix
 create mode 100644 python/system_tests/__init__.py
 create mode 100644 python/system_tests/components.py
 create mode 100644 python/system_tests/validate_system.py

diff --git a/common/global/default.nix b/common/global/default.nix
index bec37a8..428c9e1 100644
--- a/common/global/default.nix
+++ b/common/global/default.nix
@@ -9,6 +9,7 @@
   imports = [
     inputs.home-manager.nixosModules.home-manager
     inputs.sops-nix.nixosModules.sops
+    ../../python
     ./fail2ban.nix
     ./fonts.nix
     ./libs.nix
@@ -16,8 +17,8 @@
     ./nh.nix
     ./nix.nix
     ./programs.nix
-    ./ssh.nix
     ./snapshot_manager.nix
+    ./ssh.nix
   ];
 
   boot = {
diff --git a/python/default.nix b/python/default.nix
new file mode 100644
index 0000000..d77a792
--- /dev/null
+++ b/python/default.nix
@@ -0,0 +1,23 @@
+{ pkgs, ... }:
+{
+  environment.systemPackages = with pkgs; [
+    # Parenthesised so the list holds one Python environment, not two functions.
+    (python313.withPackages (
+      ps: with ps; [
+        apprise
+        apscheduler
+        mypy
+        polars
+        pyfakefs
+        pytest
+        pytest-cov
+        pytest-mock
+        pytest-xdist
+        requests
+        ruff
+        typer
+        types-requests
+      ]
+    ))
+  ];
+}
diff --git a/python/system_tests/__init__.py b/python/system_tests/__init__.py
new file mode 100644
index 0000000..e022647
--- /dev/null
+++ b/python/system_tests/__init__.py
@@ -0,0 +1 @@
+"""system_tests."""
diff --git a/python/system_tests/components.py b/python/system_tests/components.py
new file mode 100644
index 0000000..416995e
--- /dev/null
+++ b/python/system_tests/components.py
@@ -0,0 +1,107 @@
+"""Health checks for ZFS pools and systemd services."""
+
+from __future__ import annotations
+
+import logging
+from copy import copy
+from re import search
+from time import sleep
+from typing import TYPE_CHECKING
+
+from python.common import bash_wrapper
+from python.zfs import Zpool
+
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+
+
+def zpool_tests(pool_names: Sequence[str], zpool_capacity_threshold: int = 90) -> list[str] | None:
+    """Check the zpool health and capacity.
+
+    Args:
+        pool_names (Sequence[str]): A list of pool names to test.
+        zpool_capacity_threshold (int, optional): The threshold for the zpool capacity. Defaults to 90.
+
+    Returns:
+        list[str] | None: One message per problem found; empty when every check passes.
+    """
+    logging.info("Testing zpool")
+
+    errors: list[str] = []
+    for pool_name in pool_names:
+        pool = Zpool(pool_name)
+        if pool.health != "ONLINE":
+            errors.append(f"{pool.name} is {pool.health}")
+        if pool.capacity >= zpool_capacity_threshold:
+            errors.append(f"{pool.name} is low on space")
+
+    # Run once for the whole system: a bare `zpool upgrade` only reports feature status.
+    upgrade_status, _ = bash_wrapper("zpool upgrade")
+    if not search(r"Every feature flags pool has all supported and requested features enabled\.", upgrade_status):
+        errors.append("ZPool out of date run `sudo zpool upgrade -a`")
+
+    return errors
+
+
+def systemd_tests(
+    service_names: Sequence[str],
+    max_retries: int = 30,
+    retry_delay_secs: int = 1,
+    retryable_statuses: Sequence[str] | None = None,
+    valid_statuses: Sequence[str] | None = None,
+) -> list[str] | None:
+    """Test that systemd services reach a valid state.
+
+    Each service is polled with `systemctl is-active` until it reports a valid
+    status, reports a non-retryable status, or the attempts run out.
+
+    Args:
+        service_names (Sequence[str]): A list of service names to test.
+        max_retries (int, optional): The maximum number of attempts. Defaults to 30.
+            minimum value is 1.
+        retry_delay_secs (int, optional): The delay between attempts in seconds. Defaults to 1.
+            minimum value is 1.
+        retryable_statuses (Sequence[str] | None, optional): Statuses worth polling again.
+            Defaults to None, meaning "inactive" and "activating".
+        valid_statuses (Sequence[str] | None, optional): Statuses that count as healthy.
+            Defaults to None, meaning "active".
+
+    Returns:
+        list[str] | None: One sorted message per failed service; empty when all pass.
+    """
+    logging.info("Testing systemd service")
+
+    max_retries = max(max_retries, 1)
+    retry_delay_secs = max(retry_delay_secs, 1)
+    last_try = max_retries - 1
+
+    # Statuses keep their trailing newline; they are compared with bash_wrapper's output as-is.
+    if retryable_statuses is None:
+        retryable_statuses = ("inactive\n", "activating\n")
+
+    if valid_statuses is None:
+        valid_statuses = ("active\n",)
+
+    pending = set(service_names)
+
+    errors: set[str] = set()
+    for retry in range(max_retries):
+        if not pending:
+            break
+        if retry:
+            # Wait only between attempts, never before the first or after the last.
+            sleep(retry_delay_secs)
+        logging.info(f"Testing systemd service in {retry + 1} of {max_retries}")
+        # Iterate over a copy because resolved services are dropped from `pending`.
+        for service_name in copy(pending):
+            service_status, _ = bash_wrapper(f"systemctl is-active {service_name}")
+            if service_status in valid_statuses:
+                pending.remove(service_name)
+                continue
+            if service_status in retryable_statuses and retry < last_try:
+                continue
+            # Any other status (or the final attempt) is a failure; stop polling it.
+            errors.add(f"{service_name} is {service_status.strip()}")
+            pending.remove(service_name)
+
+    return sorted(errors)
diff --git a/python/system_tests/validate_system.py b/python/system_tests/validate_system.py
new file mode 100644
index 0000000..8de5564
--- /dev/null
+++ b/python/system_tests/validate_system.py
@@ -0,0 +1,68 @@
+"""Validate this host's zpools and systemd services against a TOML config."""
+
+import logging
+import sys
+import tomllib
+from os import environ
+from pathlib import Path
+from socket import gethostname
+
+import typer
+
+from python.common import configure_logger, signal_alert
+from python.system_tests.components import systemd_tests, zpool_tests
+
+
+def load_config_data(config_file: Path) -> dict[str, list[str]]:
+    """Load a TOML configuration file.
+
+    Args:
+        config_file (Path): The path to the configuration file.
+
+    Returns:
+        dict: The configuration data.
+    """
+    # tomllib requires a binary handle and decodes the content as UTF-8 itself.
+    with config_file.open("rb") as config:
+        return tomllib.load(config)
+
+
+def main(config_file: Path) -> None:
+    """Validate this host's zpools and services, alerting when any check fails."""
+    configure_logger(level=environ.get("LOG_LEVEL", "INFO"))
+
+    server_name = gethostname()
+    logging.info(f"Starting {server_name} validation")
+
+    errors: list[str] = []
+    try:
+        # Loaded inside the try so an unreadable config also raises an alert.
+        config_data = load_config_data(config_file)
+
+        if config_data.get("zpools") and (zpool_errors := zpool_tests(config_data["zpools"])):
+            errors.extend(zpool_errors)
+
+        if config_data.get("services") and (systemd_errors := systemd_tests(config_data["services"])):
+            errors.extend(systemd_errors)
+
+    except Exception as error:
+        logging.exception(f"{server_name} validation failed")
+        errors.append(f"{server_name} validation failed: {error}")
+
+    if errors:
+        error_lines = "\n".join(errors)
+        logging.error(f"{server_name} validation failed: \n{error_lines}")
+        signal_alert(f"{server_name} validation failed {errors}")
+
+        sys.exit(1)
+
+    logging.info(f"{server_name} validation passed")
+
+
+def cli() -> None:
+    """Run `main` as a Typer command line application."""
+    typer.run(main)
+
+
+if __name__ == "__main__":
+    cli()