From e148eeb8ccd6d52ef5b53c5d34ef268137165533 Mon Sep 17 00:00:00 2001 From: Richie Cahill Date: Tue, 28 Apr 2026 14:23:10 -0400 Subject: [PATCH] setting up resource monitoring for bob and jeeves --- common/optional/monitoring-agent.nix | 256 +++++++++++ systems/bob/default.nix | 1 + systems/jeeves/default.nix | 2 + .../monitoring/dashboards/overview.json | 426 ++++++++++++++++++ .../dashboards/process-history-grouped.json | 216 +++++++++ .../dashboards/process-live-pid.json | 224 +++++++++ .../monitoring/dashboards/storage-zfs.json | 351 +++++++++++++++ systems/jeeves/monitoring/default.nix | 182 ++++++++ systems/jeeves/scripts/zfs.sh | 1 + systems/jeeves/services/grafana.nix | 80 ++++ systems/jeeves/services/hedgedoc.nix | 24 - 11 files changed, 1739 insertions(+), 24 deletions(-) create mode 100644 common/optional/monitoring-agent.nix create mode 100644 systems/jeeves/monitoring/dashboards/overview.json create mode 100644 systems/jeeves/monitoring/dashboards/process-history-grouped.json create mode 100644 systems/jeeves/monitoring/dashboards/process-live-pid.json create mode 100644 systems/jeeves/monitoring/dashboards/storage-zfs.json create mode 100644 systems/jeeves/monitoring/default.nix create mode 100644 systems/jeeves/services/grafana.nix delete mode 100644 systems/jeeves/services/hedgedoc.nix diff --git a/common/optional/monitoring-agent.nix b/common/optional/monitoring-agent.nix new file mode 100644 index 0000000..49f1f47 --- /dev/null +++ b/common/optional/monitoring-agent.nix @@ -0,0 +1,256 @@ +{ + config, + lib, + pkgs, + ... 
+}: +let + monitoringInterface = "ztwfunumly"; + nodeTextfileDir = "/var/lib/prometheus-node-exporter-textfile"; + + mkProcessNameTemplate = + perPid: template: if perPid then "${template}:{{.PID}}:{{.StartTime}}" else template; + + mkProcessMatchers = perPid: [ + { + name = mkProcessNameTemplate perPid "{{.Username}}:{{.Matches.Module}}"; + cmdline = [ "^/nix/store[^ ]*/bin/python[^ ]* -m (?P<Module>[^ ]+)" ]; + } + { + name = mkProcessNameTemplate perPid "{{.Username}}:{{.Matches.Wrapped}}"; + cmdline = [ + "^/nix/store[^ ]*/bin/python[^ ]* /nix/store[^ ]*/bin/\\.?(?P<Wrapped>[^ /]+?)(?:-wrapped)?(?:\\s|$)" + ]; + } + { + name = mkProcessNameTemplate perPid "{{.Username}}:{{.Matches.Wrapped}}"; + cmdline = [ + "^/nix/store[^ ]*/bin/node /nix/store[^ ]*-(?P<Wrapped>[A-Za-z0-9._+-]+)-[0-9][^ /]*/" + ]; + } + { + name = mkProcessNameTemplate perPid "{{.Username}}:{{.Matches.Wrapped}}"; + cmdline = [ "^/nix/store[^ ]*/(?:bin/|lib/[^ ]*/)?\\.?(?P<Wrapped>[^ /]+?)(?:-wrapped)?(?:\\s|$)" ]; + } + { + name = mkProcessNameTemplate perPid "{{.Username}}:{{.ExeBase}}"; + cmdline = [ ".+" ]; + } + ]; + + perPidConfig = pkgs.writeText "process-exporter-per-pid.yaml" ( + builtins.toJSON { + process_names = mkProcessMatchers true; + } + ); + + zpoolLatencyScript = pkgs.writeShellScript "zpool-latency-exporter" '' + set -euo pipefail + + out_dir=${lib.escapeShellArg nodeTextfileDir} + host=${lib.escapeShellArg config.networking.hostName} + tmp_file="$(mktemp "$out_dir/zpool.prom.XXXXXX")" + trap 'rm -f "$tmp_file"' EXIT + + pools="$(zpool list -H -o name | paste -sd, -)" + + cat >"$tmp_file" <<'EOF' + # HELP zpool_iostat_total_wait_read_ns Average total read wait time reported by zpool iostat. + # TYPE zpool_iostat_total_wait_read_ns gauge + # HELP zpool_iostat_total_wait_write_ns Average total write wait time reported by zpool iostat. + # TYPE zpool_iostat_total_wait_write_ns gauge + # HELP zpool_iostat_disk_wait_read_ns Average disk read wait time reported by zpool iostat. 
+ # TYPE zpool_iostat_disk_wait_read_ns gauge + # HELP zpool_iostat_disk_wait_write_ns Average disk write wait time reported by zpool iostat. + # TYPE zpool_iostat_disk_wait_write_ns gauge + # HELP zpool_iostat_syncq_wait_read_ns Average synchronous queue read wait time reported by zpool iostat. + # TYPE zpool_iostat_syncq_wait_read_ns gauge + # HELP zpool_iostat_syncq_wait_write_ns Average synchronous queue write wait time reported by zpool iostat. + # TYPE zpool_iostat_syncq_wait_write_ns gauge + # HELP zpool_iostat_asyncq_wait_read_ns Average asynchronous queue read wait time reported by zpool iostat. + # TYPE zpool_iostat_asyncq_wait_read_ns gauge + # HELP zpool_iostat_asyncq_wait_write_ns Average asynchronous queue write wait time reported by zpool iostat. + # TYPE zpool_iostat_asyncq_wait_write_ns gauge + EOF + + zpool iostat -Hplvy -y 1 1 | awk -F '\t' -v host="$host" -v pools="$pools" ' + function esc(str, out) { + out = str + gsub(/\\/, "\\\\", out) + gsub(/"/, "\\\"", out) + return out + } + + function emit(metric, pool, vdev, value) { + if (value == "" || value == "-") { + return + } + + printf "%s{host=\"%s\",pool=\"%s\",vdev=\"%s\"} %s\n", + metric, + esc(host), + esc(pool), + esc(vdev), + value + } + + BEGIN { + split(pools, pool_names, ",") + for (idx in pool_names) { + if (pool_names[idx] != "") { + known_pools[pool_names[idx]] = 1 + } + } + } + + NF == 0 { + next + } + + { + row_name = $1 + + if (row_name in known_pools) { + current_pool = row_name + current_vdev = "_pool" + } else if (current_pool == "") { + next + } else { + current_vdev = row_name + } + + emit("zpool_iostat_total_wait_read_ns", current_pool, current_vdev, $8) + emit("zpool_iostat_total_wait_write_ns", current_pool, current_vdev, $9) + emit("zpool_iostat_disk_wait_read_ns", current_pool, current_vdev, $10) + emit("zpool_iostat_disk_wait_write_ns", current_pool, current_vdev, $11) + emit("zpool_iostat_syncq_wait_read_ns", current_pool, current_vdev, $12) + 
emit("zpool_iostat_syncq_wait_write_ns", current_pool, current_vdev, $13) + emit("zpool_iostat_asyncq_wait_read_ns", current_pool, current_vdev, $14) + emit("zpool_iostat_asyncq_wait_write_ns", current_pool, current_vdev, $15) + } + ' >>"$tmp_file" + + mv "$tmp_file" "$out_dir/zpool.prom" + trap - EXIT + ''; +in +{ + networking.firewall.interfaces.${monitoringInterface}.allowedTCPPorts = [ + 9100 + 9134 + 9256 + 9257 + 9633 + ]; + + services.prometheus.exporters = { + node = { + enable = true; + enabledCollectors = [ + "pressure" + "processes" + "systemd" + ]; + extraFlags = [ "--collector.textfile.directory=${nodeTextfileDir}" ]; + }; + + process = { + enable = true; + user = "root"; + group = "root"; + settings.process_names = mkProcessMatchers false; + extraFlags = [ + "-gather-smaps=false" + "-remove-empty-groups=true" + "-threads=false" + ]; + }; + + smartctl.enable = true; + zfs.enable = true; + }; + + programs.atop = { + enable = true; + atopService.enable = true; + atopRotateTimer.enable = true; + atopacctService.enable = true; + settings.interval = 30; + }; + + systemd = { + services = { + prometheus-process-pid-exporter = { + description = "Prometheus process exporter with per-PID naming"; + wantedBy = [ "multi-user.target" ]; + after = [ "network.target" ]; + serviceConfig = { + ExecStart = '' + ${pkgs.prometheus-process-exporter}/bin/process-exporter \ + --web.listen-address 0.0.0.0:9257 \ + --config.path ${perPidConfig} \ + -children=false \ + -gather-smaps=false \ + -remove-empty-groups=true \ + -threads=false + ''; + User = "root"; + Group = "root"; + Restart = "always"; + WorkingDirectory = "/tmp"; + CapabilityBoundingSet = [ "" ]; + DeviceAllow = [ "" ]; + LockPersonality = true; + MemoryDenyWriteExecute = true; + NoNewPrivileges = true; + PrivateDevices = true; + PrivateTmp = true; + ProtectClock = true; + ProtectControlGroups = true; + ProtectHome = true; + ProtectHostname = true; + ProtectKernelLogs = true; + ProtectKernelModules = true; + 
ProtectKernelTunables = true; + ProtectSystem = "strict"; + RemoveIPC = true; + RestrictAddressFamilies = [ + "AF_INET" + "AF_INET6" + ]; + RestrictNamespaces = true; + RestrictRealtime = true; + RestrictSUIDSGID = true; + SystemCallArchitectures = "native"; + UMask = "0077"; + }; + }; + + zpool-latency-exporter = { + description = "Exports ZFS latency metrics for node_exporter textfile collection"; + after = [ "zfs-import.target" ]; + requires = [ "zfs-import.target" ]; + path = [ + config.boot.zfs.package + pkgs.coreutils + pkgs.gawk + ]; + serviceConfig = { + Type = "oneshot"; + ExecStart = zpoolLatencyScript; + }; + }; + }; + + timers.zpool-latency-exporter = { + wantedBy = [ "timers.target" ]; + timerConfig = { + OnBootSec = "2m"; + OnUnitActiveSec = "60s"; + Unit = "zpool-latency-exporter.service"; + }; + }; + + tmpfiles.rules = [ "d ${nodeTextfileDir} 0755 root root - -" ]; + }; +} diff --git a/systems/bob/default.nix b/systems/bob/default.nix index ac97388..58de81f 100644 --- a/systems/bob/default.nix +++ b/systems/bob/default.nix @@ -7,6 +7,7 @@ "${inputs.self}/common/global" "${inputs.self}/common/optional/docker.nix" "${inputs.self}/common/optional/scanner.nix" + "${inputs.self}/common/optional/monitoring-agent.nix" "${inputs.self}/common/optional/steam.nix" "${inputs.self}/common/optional/syncthing_base.nix" "${inputs.self}/common/optional/systemd-boot.nix" diff --git a/systems/jeeves/default.nix b/systems/jeeves/default.nix index 61d7499..1d47f8d 100644 --- a/systems/jeeves/default.nix +++ b/systems/jeeves/default.nix @@ -10,10 +10,12 @@ in "${inputs.self}/users/steve" "${inputs.self}/common/global" "${inputs.self}/common/optional/docker.nix" + "${inputs.self}/common/optional/monitoring-agent.nix" "${inputs.self}/common/optional/ssh_decrypt.nix" "${inputs.self}/common/optional/syncthing_base.nix" "${inputs.self}/common/optional/update.nix" "${inputs.self}/common/optional/zerotier.nix" + ./monitoring ./docker ./services ./web_services diff --git 
a/systems/jeeves/monitoring/dashboards/overview.json b/systems/jeeves/monitoring/dashboards/overview.json new file mode 100644 index 0000000..c45542d --- /dev/null +++ b/systems/jeeves/monitoring/dashboards/overview.json @@ -0,0 +1,426 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom-main" + }, + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom-main" + }, + "editorMode": "code", + "expr": "100 * (1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))", + "legendFormat": "{{instance}}", + "range": true, + "refId": "A" + } + ], + "title": "CPU Used", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom-main" + }, + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom-main" + }, + "editorMode": "code", + "expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))", + "legendFormat": "{{instance}}", + "range": true, + "refId": "A" + } + ], + "title": "RAM Used", + "type": "timeseries" + }, + { + "datasource": { + "type": 
"prometheus", + "uid": "prom-main" + }, + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 0 + }, + "id": 3, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom-main" + }, + "editorMode": "code", + "expr": "100 * (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes))", + "legendFormat": "{{instance}}", + "range": true, + "refId": "A" + } + ], + "title": "Swap Used", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom-main" + }, + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 0 + }, + "id": 4, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom-main" + }, + "editorMode": "code", + "expr": "node_load1", + "legendFormat": "{{instance}} load1", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom-main" + }, + "editorMode": "code", + "expr": "node_load5", + "legendFormat": "{{instance}} load5", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom-main" + }, + "editorMode": "code", + "expr": "node_load15", + "legendFormat": "{{instance}} load15", + "range": true, + "refId": "C" + } + ], + "title": "Load", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom-main" + }, + "fieldConfig": { + "defaults": { + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 5, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + 
"targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom-main" + }, + "editorMode": "code", + "expr": "sum by (instance) (rate(node_disk_read_bytes_total[5m]))", + "legendFormat": "{{instance}} read", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom-main" + }, + "editorMode": "code", + "expr": "sum by (instance) (rate(node_disk_written_bytes_total[5m]))", + "legendFormat": "{{instance}} write", + "range": true, + "refId": "B" + } + ], + "title": "Disk Throughput", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom-main" + }, + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 6, + "options": { + "cellHeight": "sm", + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Value" + } + ] + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom-main" + }, + "editorMode": "code", + "expr": "100 * (1 - (node_filesystem_avail_bytes{mountpoint=~\"(/|/home|/var|/zfs.*)\",fstype!=\"\"} / node_filesystem_size_bytes{mountpoint=~\"(/|/home|/var|/zfs.*)\",fstype!=\"\"}))", + "format": "table", + "instant": true, + "legendFormat": "{{instance}} {{mountpoint}}", + "refId": "A" + } + ], + "title": "Filesystem Usage", + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom-main" + }, + "fieldConfig": { + "defaults": { + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 17 + }, + "id": 7, + "options": { + "cellHeight": "sm", + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Value" + } + ] + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom-main" + }, + "editorMode": "code", + "expr": "topk(10, rate(namedprocess_namegroup_cpu_seconds_total[5m]))", + "format": "table", + "instant": true, + "legendFormat": 
"{{instance}} {{groupname}}", + "refId": "A" + } + ], + "title": "Top Grouped CPU", + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom-main" + }, + "fieldConfig": { + "defaults": { + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 17 + }, + "id": 8, + "options": { + "cellHeight": "sm", + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Value" + } + ] + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom-main" + }, + "editorMode": "code", + "expr": "topk(10, namedprocess_namegroup_memory_bytes{memtype=\"resident\"})", + "format": "table", + "instant": true, + "legendFormat": "{{instance}} {{groupname}}", + "refId": "A" + } + ], + "title": "Top Grouped Memory", + "type": "table" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "style": "dark", + "tags": [ + "monitoring" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-24h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Overview", + "uid": "monitor-overview", + "version": 1, + "weekStart": "" +} diff --git a/systems/jeeves/monitoring/dashboards/process-history-grouped.json b/systems/jeeves/monitoring/dashboards/process-history-grouped.json new file mode 100644 index 0000000..dff19bb --- /dev/null +++ b/systems/jeeves/monitoring/dashboards/process-history-grouped.json @@ -0,0 +1,216 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom-main" + }, + "fieldConfig": { + "defaults": { + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 
0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom-main" + }, + "editorMode": "code", + "expr": "topk(10, rate(namedprocess_namegroup_cpu_seconds_total[5m]))", + "legendFormat": "{{instance}} {{groupname}}", + "range": true, + "refId": "A" + } + ], + "title": "Grouped CPU", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom-main" + }, + "fieldConfig": { + "defaults": { + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom-main" + }, + "editorMode": "code", + "expr": "topk(10, namedprocess_namegroup_memory_bytes{memtype=\"resident\"})", + "legendFormat": "{{instance}} {{groupname}}", + "range": true, + "refId": "A" + } + ], + "title": "Grouped Resident Memory", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom-main" + }, + "fieldConfig": { + "defaults": { + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 10 + }, + "id": 3, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom-main" + }, + "editorMode": "code", + "expr": "topk(10, rate(namedprocess_namegroup_read_bytes_total[5m]))", + "legendFormat": "{{instance}} {{groupname}}", + "range": true, + "refId": "A" + } + ], + "title": "Grouped Read I/O", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom-main" + }, + "fieldConfig": { + "defaults": { + "unit": "Bps" + }, + 
"overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 10 + }, + "id": 4, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom-main" + }, + "editorMode": "code", + "expr": "topk(10, rate(namedprocess_namegroup_write_bytes_total[5m]))", + "legendFormat": "{{instance}} {{groupname}}", + "range": true, + "refId": "A" + } + ], + "title": "Grouped Write I/O", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "style": "dark", + "tags": [ + "monitoring", + "process" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-7d", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Process History Grouped", + "uid": "monitor-process-history", + "version": 1, + "weekStart": "" +} diff --git a/systems/jeeves/monitoring/dashboards/process-live-pid.json b/systems/jeeves/monitoring/dashboards/process-live-pid.json new file mode 100644 index 0000000..16bf464 --- /dev/null +++ b/systems/jeeves/monitoring/dashboards/process-live-pid.json @@ -0,0 +1,224 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom-pid-short" + }, + "fieldConfig": { + "defaults": { + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "cellHeight": "sm", + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Value" + } + ] + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom-pid-short" + }, + 
"editorMode": "code", + "expr": "topk(20, rate(namedprocess_namegroup_cpu_seconds_total[2m]))", + "format": "table", + "instant": true, + "legendFormat": "{{instance}} {{groupname}}", + "refId": "A" + } + ], + "title": "Top PID CPU", + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom-pid-short" + }, + "fieldConfig": { + "defaults": { + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "cellHeight": "sm", + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Value" + } + ] + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom-pid-short" + }, + "editorMode": "code", + "expr": "topk(20, namedprocess_namegroup_memory_bytes{memtype=\"resident\"})", + "format": "table", + "instant": true, + "legendFormat": "{{instance}} {{groupname}}", + "refId": "A" + } + ], + "title": "Top PID RSS", + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom-pid-short" + }, + "fieldConfig": { + "defaults": { + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 10 + }, + "id": 3, + "options": { + "cellHeight": "sm", + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Value" + } + ] + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom-pid-short" + }, + "editorMode": "code", + "expr": "topk(20, rate(namedprocess_namegroup_read_bytes_total[2m]))", + "format": "table", + "instant": true, + "legendFormat": "{{instance}} {{groupname}}", + "refId": "A" + } + ], + "title": "Top PID Read I/O", + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom-pid-short" + }, + "fieldConfig": { + "defaults": { + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 10 + }, + "id": 4, + "options": { + "cellHeight": "sm", + "showHeader": true, + "sortBy": [ + { 
+ "desc": true, + "displayName": "Value" + } + ] + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom-pid-short" + }, + "editorMode": "code", + "expr": "topk(20, rate(namedprocess_namegroup_write_bytes_total[2m]))", + "format": "table", + "instant": true, + "legendFormat": "{{instance}} {{groupname}}", + "refId": "A" + } + ], + "title": "Top PID Write I/O", + "type": "table" + } + ], + "refresh": "15s", + "schemaVersion": 39, + "style": "dark", + "tags": [ + "monitoring", + "process" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-10m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Process Live PID", + "uid": "monitor-process-pid", + "version": 1, + "weekStart": "" +} diff --git a/systems/jeeves/monitoring/dashboards/storage-zfs.json b/systems/jeeves/monitoring/dashboards/storage-zfs.json new file mode 100644 index 0000000..23fad66 --- /dev/null +++ b/systems/jeeves/monitoring/dashboards/storage-zfs.json @@ -0,0 +1,351 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom-main" + }, + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom-main" + }, + "editorMode": "code", + "expr": "100 * (zfs_pool_allocated_bytes / zfs_pool_size_bytes)", + "legendFormat": "{{instance}} {{pool}}", + "range": true, + "refId": "A" + } + ], + "title": 
"Pool Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom-main" + }, + "fieldConfig": { + "defaults": { + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom-main" + }, + "editorMode": "code", + "expr": "zfs_pool_free_bytes", + "legendFormat": "{{instance}} {{pool}}", + "range": true, + "refId": "A" + } + ], + "title": "Pool Free Bytes", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom-main" + }, + "fieldConfig": { + "defaults": { + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 0 + }, + "id": 3, + "options": { + "cellHeight": "sm", + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Value" + } + ] + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom-main" + }, + "editorMode": "code", + "expr": "topk(20, zfs_dataset_used_bytes{type=\"filesystem\"})", + "format": "table", + "instant": true, + "legendFormat": "{{instance}} {{name}}", + "refId": "A" + } + ], + "title": "Top Filesystems by Used Bytes", + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom-main" + }, + "fieldConfig": { + "defaults": { + "unit": "ns" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 4, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom-main" + }, + "editorMode": "code", + "expr": "topk(20, zpool_iostat_total_wait_read_ns{vdev!=\"_pool\"})", + "legendFormat": "{{host}} {{pool}} {{vdev}}", + "range": true, + "refId": "A" + } + 
], + "title": "ZFS Read Wait", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom-main" + }, + "fieldConfig": { + "defaults": { + "unit": "ns" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 5, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom-main" + }, + "editorMode": "code", + "expr": "topk(20, zpool_iostat_total_wait_write_ns{vdev!=\"_pool\"})", + "legendFormat": "{{host}} {{pool}} {{vdev}}", + "range": true, + "refId": "A" + } + ], + "title": "ZFS Write Wait", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom-main" + }, + "fieldConfig": { + "defaults": { + "unit": "celsius" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 17 + }, + "id": 6, + "options": { + "cellHeight": "sm", + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Value" + } + ] + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom-main" + }, + "editorMode": "code", + "expr": "smartctl_device_temperature{temperature_type=\"current\"}", + "format": "table", + "instant": true, + "legendFormat": "{{instance}} {{device}}", + "refId": "A" + } + ], + "title": "Disk Temperature", + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom-main" + }, + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 17 + }, + "id": 7, + "options": { + "cellHeight": "sm", + "showHeader": true, + "sortBy": [ + { + "desc": false, + "displayName": "Value" + } + ] + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom-main" + }, + "editorMode": "code", + "expr": "smartctl_device_smart_status", + "format": "table", + "instant": true, + 
"legendFormat": "{{instance}} {{device}}", + "refId": "A" + } + ], + "title": "SMART Health", + "type": "table" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "style": "dark", + "tags": [ + "monitoring", + "zfs" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-24h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Storage and ZFS", + "uid": "monitor-storage", + "version": 1, + "weekStart": "" +} diff --git a/systems/jeeves/monitoring/default.nix b/systems/jeeves/monitoring/default.nix new file mode 100644 index 0000000..e651539 --- /dev/null +++ b/systems/jeeves/monitoring/default.nix @@ -0,0 +1,182 @@ +{ + lib, + pkgs, + ... +}: +let + vars = import ../vars.nix; + + + prometheusDataRoot = "${vars.database}/prometheus"; + mainPrometheusDataDir = "${prometheusDataRoot}/main"; + pidPrometheusDataDir = "${prometheusDataRoot}/pid-short"; + + prometheusYaml = pkgs.formats.yaml { }; + + mkPrometheusConfig = + name: cfg: + let + configFile = prometheusYaml.generate "${name}.yaml" cfg; + in + pkgs.runCommand "${name}-checked.yaml" + { + nativeBuildInputs = [ pkgs.prometheus.cli ]; + } + '' + promtool check config ${configFile} + cp ${configFile} $out + ''; + + mkTarget = host: address: { + targets = [ address ]; + labels.instance = host; + }; + + mainPrometheusConfig = mkPrometheusConfig "prometheus-main" { + global = { + scrape_interval = "30s"; + scrape_timeout = "10s"; + evaluation_interval = "30s"; + }; + scrape_configs = [ + { + job_name = "node"; + static_configs = [ + (mkTarget "jeeves" "192.168.90.40:9100") + (mkTarget "bob" "192.168.90.25:9100") + ]; + } + { + job_name = "process_grouped"; + static_configs = [ + (mkTarget "jeeves" "192.168.90.40:9256") + (mkTarget "bob" "192.168.90.25:9256") + ]; + } + { + job_name = "smartctl"; + static_configs = [ + (mkTarget "jeeves" "192.168.90.40:9633") + (mkTarget "bob" "192.168.90.25:9633") + ]; + } + { + job_name = "zfs"; + static_configs = [ + (mkTarget "jeeves" 
"192.168.90.40:9134") + (mkTarget "bob" "192.168.90.25:9134") + ]; + } + ]; + }; + + pidPrometheusConfig = mkPrometheusConfig "prometheus-pid-short" { + global = { + scrape_interval = "15s"; + scrape_timeout = "10s"; + evaluation_interval = "15s"; + }; + scrape_configs = [ + { + job_name = "process_pid"; + static_configs = [ + (mkTarget "jeeves" "192.168.90.40:9257") + (mkTarget "bob" "192.168.90.25:9257") + ]; + } + ]; + }; + + mkPrometheusService = + { + dataDir, + configFile, + port, + retention, + }: + { + after = [ "network.target" ]; + wantedBy = [ "multi-user.target" ]; + serviceConfig = { + ExecStart = "${lib.getExe pkgs.prometheus} ${ + lib.escapeShellArgs [ + "--config.file=${configFile}" + "--storage.tsdb.path=${dataDir}" + "--storage.tsdb.retention.time=${retention}" + "--web.listen-address=127.0.0.1:${toString port}" + ] + }"; + User = "prometheus"; + Group = "prometheus"; + Restart = "always"; + RestartSec = "5s"; + WorkingDirectory = dataDir; + ReadWritePaths = [ dataDir ]; + CapabilityBoundingSet = [ "" ]; + DeviceAllow = [ "/dev/null rw" ]; + DevicePolicy = "strict"; + LockPersonality = true; + MemoryDenyWriteExecute = true; + NoNewPrivileges = true; + PrivateDevices = true; + PrivateTmp = true; + ProtectClock = true; + ProtectControlGroups = true; + ProtectHome = true; + ProtectHostname = true; + ProtectKernelLogs = true; + ProtectKernelModules = true; + ProtectKernelTunables = true; + ProtectProc = "invisible"; + ProtectSystem = "strict"; + RemoveIPC = true; + RestrictAddressFamilies = [ + "AF_INET" + "AF_INET6" + "AF_UNIX" + ]; + RestrictNamespaces = true; + RestrictRealtime = true; + RestrictSUIDSGID = true; + SystemCallArchitectures = "native"; + SystemCallFilter = [ + "@system-service" + "~@privileged" + ]; + }; + }; +in +{ + users = { + groups.prometheus = { }; + users.prometheus = { + isSystemUser = true; + group = "prometheus"; + description = "Prometheus daemon user"; + }; + }; + + systemd = { + services = { + prometheus-main = 
mkPrometheusService { + configFile = mainPrometheusConfig; + dataDir = mainPrometheusDataDir; + port = 9090; + retention = "90d"; + }; + + prometheus-pid-short = mkPrometheusService { + configFile = pidPrometheusConfig; + dataDir = pidPrometheusDataDir; + port = 9092; + retention = "10m"; + }; + }; + + tmpfiles.rules = [ + "d ${prometheusDataRoot} 0755 root root - -" + "d ${mainPrometheusDataDir} 0750 prometheus prometheus - -" + "d ${pidPrometheusDataDir} 0750 prometheus prometheus - -" + ]; + }; +} diff --git a/systems/jeeves/scripts/zfs.sh b/systems/jeeves/scripts/zfs.sh index dbb8ce6..6938222 100644 --- a/systems/jeeves/scripts/zfs.sh +++ b/systems/jeeves/scripts/zfs.sh @@ -23,6 +23,7 @@ sudo zfs create media/secure/home_assistant -o compression=zstd-19 sudo zfs create media/secure/notes -o copies=2 sudo zfs create media/secure/postgres -o mountpoint=/zfs/media/database/postgres -o recordsize=16k -o primarycache=metadata sudo zfs create media/secure/postgres-wal -o mountpoint=/zfs/media/database/postgres-wal -o recordsize=32k -o primarycache=metadata -o special_small_blocks=32K -o compression=lz4 -o secondarycache=none -o logbias=latency +sudo zfs create media/secure/prometheus -o mountpoint=/zfs/media/database/prometheus -o compression=lz4 sudo zfs create media/secure/services -o compression=zstd-9 sudo zfs create media/secure/share -o mountpoint=/zfs/media/share -o exec=off diff --git a/systems/jeeves/services/grafana.nix b/systems/jeeves/services/grafana.nix new file mode 100644 index 0000000..d474e65 --- /dev/null +++ b/systems/jeeves/services/grafana.nix @@ -0,0 +1,80 @@ +{ + ... 
+}: +let + vars = import ../vars.nix; + grafanaDataDir = "${vars.services}/grafana"; +in +{ + networking.firewall.allowedTCPPorts = [ 3000 ]; + + services.grafana = { + enable = true; + dataDir = grafanaDataDir; + settings = { + database.type = "sqlite3"; + security = { + admin_password = "$__file{${vars.secrets}/services/grafana/admin_password}"; + admin_user = "admin"; + secret_key = "$__file{${vars.secrets}/services/grafana/secret_key}"; + }; + server = { + http_addr = "192.168.90.40"; + http_port = 3000; + root_url = "http://192.168.90.40:3000/"; + }; + }; + provision = { + enable = true; + dashboards.settings = { + apiVersion = 1; + providers = [ + { + name = "monitoring"; + folder = "Monitoring"; + type = "file"; + disableDeletion = false; + editable = false; + allowUiUpdates = false; + updateIntervalSeconds = 30; + options.path = ../monitoring/dashboards; + } + ]; + }; + datasources.settings = { + apiVersion = 1; + prune = true; + datasources = [ + { + access = "proxy"; + editable = false; + isDefault = true; + name = "prom-main"; + type = "prometheus"; + uid = "prom-main"; + url = "http://127.0.0.1:9090"; + } + { + access = "proxy"; + editable = false; + name = "prom-pid-short"; + type = "prometheus"; + uid = "prom-pid-short"; + url = "http://127.0.0.1:9092"; + } + ]; + }; + }; + }; + + systemd = { + services.grafana.after = [ + "prometheus-main.service" + "prometheus-pid-short.service" + ]; + + tmpfiles.rules = [ + "d ${grafanaDataDir} 0750 grafana grafana - -" + ]; + }; +} diff --git a/systems/jeeves/services/hedgedoc.nix b/systems/jeeves/services/hedgedoc.nix deleted file mode 100644 index 314fc48..0000000 --- a/systems/jeeves/services/hedgedoc.nix +++ /dev/null @@ -1,24 +0,0 @@ -{ - services.hedgedoc = { - enable = true; - settings = { - host = "0.0.0.0"; - port = 3000; - domain = "192.168.90.40"; - urlAddPort = true; - protocolUseSSL = false; - db = { - dialect = "postgres"; - database = "hedgedoc"; - username = "hedgedoc"; - host = "/run/postgresql"; 
- }; - }; - }; - networking.firewall.allowedTCPPorts = [ 3000 ]; - - systemd.services.hedgedoc = { - after = [ "postgresql.service" ]; - requires = [ "postgresql.service" ]; - }; -}