setting up resource monitoring for bob and jeeves

This commit is contained in:
2026-04-28 14:23:10 -04:00
parent 957110b7e9
commit e148eeb8cc
11 changed files with 1739 additions and 24 deletions
+256
View File
@@ -0,0 +1,256 @@
{
config,
lib,
pkgs,
...
}:
let
monitoringInterface = "ztwfunumly";
nodeTextfileDir = "/var/lib/prometheus-node-exporter-textfile";
mkProcessNameTemplate =
perPid: template: if perPid then "${template}:{{.PID}}:{{.StartTime}}" else template;
mkProcessMatchers = perPid: [
{
name = mkProcessNameTemplate perPid "{{.Username}}:{{.Matches.Module}}";
cmdline = [ "^/nix/store[^ ]*/bin/python[^ ]* -m (?P<Module>[^ ]+)" ];
}
{
name = mkProcessNameTemplate perPid "{{.Username}}:{{.Matches.Wrapped}}";
cmdline = [
"^/nix/store[^ ]*/bin/python[^ ]* /nix/store[^ ]*/bin/\\.?(?P<Wrapped>[^ /]+?)(?:-wrapped)?(?:\\s|$)"
];
}
{
name = mkProcessNameTemplate perPid "{{.Username}}:{{.Matches.Wrapped}}";
cmdline = [
"^/nix/store[^ ]*/bin/node /nix/store[^ ]*-(?P<Wrapped>[A-Za-z0-9._+-]+)-[0-9][^ /]*/"
];
}
{
name = mkProcessNameTemplate perPid "{{.Username}}:{{.Matches.Wrapped}}";
cmdline = [ "^/nix/store[^ ]*/(?:bin/|lib/[^ ]*/)?\\.?(?P<Wrapped>[^ /]+?)(?:-wrapped)?(?:\\s|$)" ];
}
{
name = mkProcessNameTemplate perPid "{{.Username}}:{{.ExeBase}}";
cmdline = [ ".+" ];
}
];
perPidConfig = pkgs.writeText "process-exporter-per-pid.yaml" (
builtins.toJSON {
process_names = mkProcessMatchers true;
}
);
zpoolLatencyScript = pkgs.writeShellScript "zpool-latency-exporter" ''
set -euo pipefail
out_dir=${lib.escapeShellArg nodeTextfileDir}
host=${lib.escapeShellArg config.networking.hostName}
tmp_file="$(mktemp "$out_dir/zpool.prom.XXXXXX")"
trap 'rm -f "$tmp_file"' EXIT
pools="$(zpool list -H -o name | paste -sd, -)"
cat >"$tmp_file" <<'EOF'
# HELP zpool_iostat_total_wait_read_ns Average total read wait time reported by zpool iostat.
# TYPE zpool_iostat_total_wait_read_ns gauge
# HELP zpool_iostat_total_wait_write_ns Average total write wait time reported by zpool iostat.
# TYPE zpool_iostat_total_wait_write_ns gauge
# HELP zpool_iostat_disk_wait_read_ns Average disk read wait time reported by zpool iostat.
# TYPE zpool_iostat_disk_wait_read_ns gauge
# HELP zpool_iostat_disk_wait_write_ns Average disk write wait time reported by zpool iostat.
# TYPE zpool_iostat_disk_wait_write_ns gauge
# HELP zpool_iostat_syncq_wait_read_ns Average synchronous queue read wait time reported by zpool iostat.
# TYPE zpool_iostat_syncq_wait_read_ns gauge
# HELP zpool_iostat_syncq_wait_write_ns Average synchronous queue write wait time reported by zpool iostat.
# TYPE zpool_iostat_syncq_wait_write_ns gauge
# HELP zpool_iostat_asyncq_wait_read_ns Average asynchronous queue read wait time reported by zpool iostat.
# TYPE zpool_iostat_asyncq_wait_read_ns gauge
# HELP zpool_iostat_asyncq_wait_write_ns Average asynchronous queue write wait time reported by zpool iostat.
# TYPE zpool_iostat_asyncq_wait_write_ns gauge
EOF
zpool iostat -Hplvy -y 1 1 | awk -F '\t' -v host="$host" -v pools="$pools" '
function esc(str, out) {
out = str
gsub(/\\/, "\\\\", out)
gsub(/"/, "\\\"", out)
return out
}
function emit(metric, pool, vdev, value) {
if (value == "" || value == "-") {
return
}
printf "%s{host=\"%s\",pool=\"%s\",vdev=\"%s\"} %s\n",
metric,
esc(host),
esc(pool),
esc(vdev),
value
}
BEGIN {
split(pools, pool_names, ",")
for (idx in pool_names) {
if (pool_names[idx] != "") {
known_pools[pool_names[idx]] = 1
}
}
}
NF == 0 {
next
}
{
row_name = $1
if (row_name in known_pools) {
current_pool = row_name
current_vdev = "_pool"
} else if (current_pool == "") {
next
} else {
current_vdev = row_name
}
emit("zpool_iostat_total_wait_read_ns", current_pool, current_vdev, $8)
emit("zpool_iostat_total_wait_write_ns", current_pool, current_vdev, $9)
emit("zpool_iostat_disk_wait_read_ns", current_pool, current_vdev, $10)
emit("zpool_iostat_disk_wait_write_ns", current_pool, current_vdev, $11)
emit("zpool_iostat_syncq_wait_read_ns", current_pool, current_vdev, $12)
emit("zpool_iostat_syncq_wait_write_ns", current_pool, current_vdev, $13)
emit("zpool_iostat_asyncq_wait_read_ns", current_pool, current_vdev, $14)
emit("zpool_iostat_asyncq_wait_write_ns", current_pool, current_vdev, $15)
}
' >>"$tmp_file"
mv "$tmp_file" "$out_dir/zpool.prom"
trap - EXIT
'';
in
{
networking.firewall.interfaces.${monitoringInterface}.allowedTCPPorts = [
9100
9134
9256
9257
9633
];
services.prometheus.exporters = {
node = {
enable = true;
enabledCollectors = [
"pressure"
"processes"
"systemd"
];
extraFlags = [ "--collector.textfile.directory=${nodeTextfileDir}" ];
};
process = {
enable = true;
user = "root";
group = "root";
settings.process_names = mkProcessMatchers false;
extraFlags = [
"-gather-smaps=false"
"-remove-empty-groups=true"
"-threads=false"
];
};
smartctl.enable = true;
zfs.enable = true;
};
programs.atop = {
enable = true;
atopService.enable = true;
atopRotateTimer.enable = true;
atopacctService.enable = true;
settings.interval = 30;
};
systemd = {
services = {
prometheus-process-pid-exporter = {
description = "Prometheus process exporter with per-PID naming";
wantedBy = [ "multi-user.target" ];
after = [ "network.target" ];
serviceConfig = {
ExecStart = ''
${pkgs.prometheus-process-exporter}/bin/process-exporter \
--web.listen-address 0.0.0.0:9257 \
--config.path ${perPidConfig} \
-children=false \
-gather-smaps=false \
-remove-empty-groups=true \
-threads=false
'';
User = "root";
Group = "root";
Restart = "always";
WorkingDirectory = "/tmp";
CapabilityBoundingSet = [ "" ];
DeviceAllow = [ "" ];
LockPersonality = true;
MemoryDenyWriteExecute = true;
NoNewPrivileges = true;
PrivateDevices = true;
PrivateTmp = true;
ProtectClock = true;
ProtectControlGroups = true;
ProtectHome = true;
ProtectHostname = true;
ProtectKernelLogs = true;
ProtectKernelModules = true;
ProtectKernelTunables = true;
ProtectSystem = "strict";
RemoveIPC = true;
RestrictAddressFamilies = [
"AF_INET"
"AF_INET6"
];
RestrictNamespaces = true;
RestrictRealtime = true;
RestrictSUIDSGID = true;
SystemCallArchitectures = "native";
UMask = "0077";
};
};
zpool-latency-exporter = {
description = "Exports ZFS latency metrics for node_exporter textfile collection";
after = [ "zfs-import.target" ];
requires = [ "zfs-import.target" ];
path = [
config.boot.zfs.package
pkgs.coreutils
pkgs.gawk
];
serviceConfig = {
Type = "oneshot";
ExecStart = zpoolLatencyScript;
};
};
};
timers.zpool-latency-exporter = {
wantedBy = [ "timers.target" ];
timerConfig = {
OnBootSec = "2m";
OnUnitActiveSec = "60s";
Unit = "zpool-latency-exporter.service";
};
};
tmpfiles.rules = [ "d ${nodeTextfileDir} 0755 root root - -" ];
};
}
+1
View File
@@ -7,6 +7,7 @@
"${inputs.self}/common/global" "${inputs.self}/common/global"
"${inputs.self}/common/optional/docker.nix" "${inputs.self}/common/optional/docker.nix"
"${inputs.self}/common/optional/scanner.nix" "${inputs.self}/common/optional/scanner.nix"
"${inputs.self}/common/optional/monitoring-agent.nix"
"${inputs.self}/common/optional/steam.nix" "${inputs.self}/common/optional/steam.nix"
"${inputs.self}/common/optional/syncthing_base.nix" "${inputs.self}/common/optional/syncthing_base.nix"
"${inputs.self}/common/optional/systemd-boot.nix" "${inputs.self}/common/optional/systemd-boot.nix"
+2
View File
@@ -10,10 +10,12 @@ in
"${inputs.self}/users/steve" "${inputs.self}/users/steve"
"${inputs.self}/common/global" "${inputs.self}/common/global"
"${inputs.self}/common/optional/docker.nix" "${inputs.self}/common/optional/docker.nix"
"${inputs.self}/common/optional/monitoring-agent.nix"
"${inputs.self}/common/optional/ssh_decrypt.nix" "${inputs.self}/common/optional/ssh_decrypt.nix"
"${inputs.self}/common/optional/syncthing_base.nix" "${inputs.self}/common/optional/syncthing_base.nix"
"${inputs.self}/common/optional/update.nix" "${inputs.self}/common/optional/update.nix"
"${inputs.self}/common/optional/zerotier.nix" "${inputs.self}/common/optional/zerotier.nix"
./monitoring
./docker ./docker
./services ./services
./web_services ./web_services
@@ -0,0 +1,426 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"links": [],
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 6,
"x": 0,
"y": 0
},
"id": 1,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "100 * (1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
"legendFormat": "{{instance}}",
"range": true,
"refId": "A"
}
],
"title": "CPU Used",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 6,
"x": 6,
"y": 0
},
"id": 2,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
"legendFormat": "{{instance}}",
"range": true,
"refId": "A"
}
],
"title": "RAM Used",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 6,
"x": 12,
"y": 0
},
"id": 3,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "100 * (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes))",
"legendFormat": "{{instance}}",
"range": true,
"refId": "A"
}
],
"title": "Swap Used",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 6,
"x": 18,
"y": 0
},
"id": 4,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "node_load1",
"legendFormat": "{{instance}} load1",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "node_load5",
"legendFormat": "{{instance}} load5",
"range": true,
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "node_load15",
"legendFormat": "{{instance}} load15",
"range": true,
"refId": "C"
}
],
"title": "Load",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "Bps"
},
"overrides": []
},
"gridPos": {
"h": 9,
"w": 12,
"x": 0,
"y": 8
},
"id": 5,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "sum by (instance) (rate(node_disk_read_bytes_total[5m]))",
"legendFormat": "{{instance}} read",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "sum by (instance) (rate(node_disk_written_bytes_total[5m]))",
"legendFormat": "{{instance}} write",
"range": true,
"refId": "B"
}
],
"title": "Disk Throughput",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 9,
"w": 12,
"x": 12,
"y": 8
},
"id": 6,
"options": {
"cellHeight": "sm",
"showHeader": true,
"sortBy": [
{
"desc": true,
"displayName": "Value"
}
]
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "100 * (1 - (node_filesystem_avail_bytes{mountpoint=~\"(/|/home|/var|/zfs.*)\",fstype!=\"\"} / node_filesystem_size_bytes{mountpoint=~\"(/|/home|/var|/zfs.*)\",fstype!=\"\"}))",
"format": "table",
"instant": true,
"legendFormat": "{{instance}} {{mountpoint}}",
"refId": "A"
}
],
"title": "Filesystem Usage",
"type": "table"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "percentunit"
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 12,
"x": 0,
"y": 17
},
"id": 7,
"options": {
"cellHeight": "sm",
"showHeader": true,
"sortBy": [
{
"desc": true,
"displayName": "Value"
}
]
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "topk(10, rate(namedprocess_namegroup_cpu_seconds_total[5m]))",
"format": "table",
"instant": true,
"legendFormat": "{{instance}} {{groupname}}",
"refId": "A"
}
],
"title": "Top Grouped CPU",
"type": "table"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "bytes"
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 12,
"x": 12,
"y": 17
},
"id": 8,
"options": {
"cellHeight": "sm",
"showHeader": true,
"sortBy": [
{
"desc": true,
"displayName": "Value"
}
]
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "topk(10, namedprocess_namegroup_memory_bytes{memtype=\"resident\"})",
"format": "table",
"instant": true,
"legendFormat": "{{instance}} {{groupname}}",
"refId": "A"
}
],
"title": "Top Grouped Memory",
"type": "table"
}
],
"refresh": "30s",
"schemaVersion": 39,
"style": "dark",
"tags": [
"monitoring"
],
"templating": {
"list": []
},
"time": {
"from": "now-24h",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "Overview",
"uid": "monitor-overview",
"version": 1,
"weekStart": ""
}
@@ -0,0 +1,216 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"links": [],
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "percentunit"
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 12,
"x": 0,
"y": 0
},
"id": 1,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "topk(10, rate(namedprocess_namegroup_cpu_seconds_total[5m]))",
"legendFormat": "{{instance}} {{groupname}}",
"range": true,
"refId": "A"
}
],
"title": "Grouped CPU",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "bytes"
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 12,
"x": 12,
"y": 0
},
"id": 2,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "topk(10, namedprocess_namegroup_memory_bytes{memtype=\"resident\"})",
"legendFormat": "{{instance}} {{groupname}}",
"range": true,
"refId": "A"
}
],
"title": "Grouped Resident Memory",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "Bps"
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 12,
"x": 0,
"y": 10
},
"id": 3,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "topk(10, rate(namedprocess_namegroup_read_bytes_total[5m]))",
"legendFormat": "{{instance}} {{groupname}}",
"range": true,
"refId": "A"
}
],
"title": "Grouped Read I/O",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "Bps"
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 12,
"x": 12,
"y": 10
},
"id": 4,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "topk(10, rate(namedprocess_namegroup_write_bytes_total[5m]))",
"legendFormat": "{{instance}} {{groupname}}",
"range": true,
"refId": "A"
}
],
"title": "Grouped Write I/O",
"type": "timeseries"
}
],
"refresh": "30s",
"schemaVersion": 39,
"style": "dark",
"tags": [
"monitoring",
"process"
],
"templating": {
"list": []
},
"time": {
"from": "now-7d",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "Process History Grouped",
"uid": "monitor-process-history",
"version": 1,
"weekStart": ""
}
@@ -0,0 +1,224 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"links": [],
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-pid-short"
},
"fieldConfig": {
"defaults": {
"unit": "percentunit"
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 12,
"x": 0,
"y": 0
},
"id": 1,
"options": {
"cellHeight": "sm",
"showHeader": true,
"sortBy": [
{
"desc": true,
"displayName": "Value"
}
]
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-pid-short"
},
"editorMode": "code",
"expr": "topk(20, rate(namedprocess_namegroup_cpu_seconds_total[2m]))",
"format": "table",
"instant": true,
"legendFormat": "{{instance}} {{groupname}}",
"refId": "A"
}
],
"title": "Top PID CPU",
"type": "table"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-pid-short"
},
"fieldConfig": {
"defaults": {
"unit": "bytes"
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 12,
"x": 12,
"y": 0
},
"id": 2,
"options": {
"cellHeight": "sm",
"showHeader": true,
"sortBy": [
{
"desc": true,
"displayName": "Value"
}
]
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-pid-short"
},
"editorMode": "code",
"expr": "topk(20, namedprocess_namegroup_memory_bytes{memtype=\"resident\"})",
"format": "table",
"instant": true,
"legendFormat": "{{instance}} {{groupname}}",
"refId": "A"
}
],
"title": "Top PID RSS",
"type": "table"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-pid-short"
},
"fieldConfig": {
"defaults": {
"unit": "Bps"
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 12,
"x": 0,
"y": 10
},
"id": 3,
"options": {
"cellHeight": "sm",
"showHeader": true,
"sortBy": [
{
"desc": true,
"displayName": "Value"
}
]
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-pid-short"
},
"editorMode": "code",
"expr": "topk(20, rate(namedprocess_namegroup_read_bytes_total[2m]))",
"format": "table",
"instant": true,
"legendFormat": "{{instance}} {{groupname}}",
"refId": "A"
}
],
"title": "Top PID Read I/O",
"type": "table"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-pid-short"
},
"fieldConfig": {
"defaults": {
"unit": "Bps"
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 12,
"x": 12,
"y": 10
},
"id": 4,
"options": {
"cellHeight": "sm",
"showHeader": true,
"sortBy": [
{
"desc": true,
"displayName": "Value"
}
]
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-pid-short"
},
"editorMode": "code",
"expr": "topk(20, rate(namedprocess_namegroup_write_bytes_total[2m]))",
"format": "table",
"instant": true,
"legendFormat": "{{instance}} {{groupname}}",
"refId": "A"
}
],
"title": "Top PID Write I/O",
"type": "table"
}
],
"refresh": "15s",
"schemaVersion": 39,
"style": "dark",
"tags": [
"monitoring",
"process"
],
"templating": {
"list": []
},
"time": {
"from": "now-10m",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "Process Live PID",
"uid": "monitor-process-pid",
"version": 1,
"weekStart": ""
}
@@ -0,0 +1,351 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"links": [],
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 8,
"x": 0,
"y": 0
},
"id": 1,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "100 * (zfs_pool_allocated_bytes / zfs_pool_size_bytes)",
"legendFormat": "{{instance}} {{pool}}",
"range": true,
"refId": "A"
}
],
"title": "Pool Usage",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "bytes"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 8,
"x": 8,
"y": 0
},
"id": 2,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "zfs_pool_free_bytes",
"legendFormat": "{{instance}} {{pool}}",
"range": true,
"refId": "A"
}
],
"title": "Pool Free Bytes",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "bytes"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 8,
"x": 16,
"y": 0
},
"id": 3,
"options": {
"cellHeight": "sm",
"showHeader": true,
"sortBy": [
{
"desc": true,
"displayName": "Value"
}
]
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "topk(20, zfs_dataset_used_bytes{type=\"filesystem\"})",
"format": "table",
"instant": true,
"legendFormat": "{{instance}} {{name}}",
"refId": "A"
}
],
"title": "Top Filesystems by Used Bytes",
"type": "table"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "ns"
},
"overrides": []
},
"gridPos": {
"h": 9,
"w": 12,
"x": 0,
"y": 8
},
"id": 4,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "topk(20, zpool_iostat_total_wait_read_ns{vdev!=\"_pool\"})",
"legendFormat": "{{host}} {{pool}} {{vdev}}",
"range": true,
"refId": "A"
}
],
"title": "ZFS Read Wait",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "ns"
},
"overrides": []
},
"gridPos": {
"h": 9,
"w": 12,
"x": 12,
"y": 8
},
"id": 5,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "topk(20, zpool_iostat_total_wait_write_ns{vdev!=\"_pool\"})",
"legendFormat": "{{host}} {{pool}} {{vdev}}",
"range": true,
"refId": "A"
}
],
"title": "ZFS Write Wait",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "celsius"
},
"overrides": []
},
"gridPos": {
"h": 9,
"w": 12,
"x": 0,
"y": 17
},
"id": 6,
"options": {
"cellHeight": "sm",
"showHeader": true,
"sortBy": [
{
"desc": true,
"displayName": "Value"
}
]
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "smartctl_device_temperature{temperature_type=\"current\"}",
"format": "table",
"instant": true,
"legendFormat": "{{instance}} {{device}}",
"refId": "A"
}
],
"title": "Disk Temperature",
"type": "table"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 9,
"w": 12,
"x": 12,
"y": 17
},
"id": 7,
"options": {
"cellHeight": "sm",
"showHeader": true,
"sortBy": [
{
"desc": false,
"displayName": "Value"
}
]
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "smartctl_device_smart_status",
"format": "table",
"instant": true,
"legendFormat": "{{instance}} {{device}}",
"refId": "A"
}
],
"title": "SMART Health",
"type": "table"
}
],
"refresh": "30s",
"schemaVersion": 39,
"style": "dark",
"tags": [
"monitoring",
"zfs"
],
"templating": {
"list": []
},
"time": {
"from": "now-24h",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "Storage and ZFS",
"uid": "monitor-storage",
"version": 1,
"weekStart": ""
}
+182
View File
@@ -0,0 +1,182 @@
{
lib,
pkgs,
...
}:
let
vars = import ../vars.nix;
prometheusDataRoot = "${vars.database}/prometheus";
mainPrometheusDataDir = "${prometheusDataRoot}/main";
pidPrometheusDataDir = "${prometheusDataRoot}/pid-short";
prometheusYaml = pkgs.formats.yaml { };
mkPrometheusConfig =
name: cfg:
let
configFile = prometheusYaml.generate "${name}.yaml" cfg;
in
pkgs.runCommand "${name}-checked.yaml"
{
nativeBuildInputs = [ pkgs.prometheus.cli ];
}
''
promtool check config ${configFile}
cp ${configFile} $out
'';
mkTarget = host: address: {
targets = [ address ];
labels.instance = host;
};
mainPrometheusConfig = mkPrometheusConfig "prometheus-main" {
global = {
scrape_interval = "30s";
scrape_timeout = "10s";
evaluation_interval = "30s";
};
scrape_configs = [
{
job_name = "node";
static_configs = [
(mkTarget "jeeves" "192.168.90.40:9100")
(mkTarget "bob" "192.168.90.25:9100")
];
}
{
job_name = "process_grouped";
static_configs = [
(mkTarget "jeeves" "192.168.90.40:9256")
(mkTarget "bob" "192.168.90.25:9256")
];
}
{
job_name = "smartctl";
static_configs = [
(mkTarget "jeeves" "192.168.90.40:9633")
(mkTarget "bob" "192.168.90.25:9633")
];
}
{
job_name = "zfs";
static_configs = [
(mkTarget "jeeves" "192.168.90.40:9134")
(mkTarget "bob" "192.168.90.25:9134")
];
}
];
};
pidPrometheusConfig = mkPrometheusConfig "prometheus-pid-short" {
global = {
scrape_interval = "15s";
scrape_timeout = "10s";
evaluation_interval = "15s";
};
scrape_configs = [
{
job_name = "process_pid";
static_configs = [
(mkTarget "jeeves" "192.168.90.40:9257")
(mkTarget "bob" "192.168.90.25:9257")
];
}
];
};
mkPrometheusService =
{
dataDir,
configFile,
port,
retention,
}:
{
after = [ "network.target" ];
wantedBy = [ "multi-user.target" ];
serviceConfig = {
ExecStart = "${lib.getExe pkgs.prometheus} ${
lib.escapeShellArgs [
"--config.file=${configFile}"
"--storage.tsdb.path=${dataDir}"
"--storage.tsdb.retention.time=${retention}"
"--web.listen-address=127.0.0.1:${toString port}"
]
}";
User = "prometheus";
Group = "prometheus";
Restart = "always";
RestartSec = "5s";
WorkingDirectory = dataDir;
ReadWritePaths = [ dataDir ];
CapabilityBoundingSet = [ "" ];
DeviceAllow = [ "/dev/null rw" ];
DevicePolicy = "strict";
LockPersonality = true;
MemoryDenyWriteExecute = true;
NoNewPrivileges = true;
PrivateDevices = true;
PrivateTmp = true;
ProtectClock = true;
ProtectControlGroups = true;
ProtectHome = true;
ProtectHostname = true;
ProtectKernelLogs = true;
ProtectKernelModules = true;
ProtectKernelTunables = true;
ProtectProc = "invisible";
ProtectSystem = "strict";
RemoveIPC = true;
RestrictAddressFamilies = [
"AF_INET"
"AF_INET6"
"AF_UNIX"
];
RestrictNamespaces = true;
RestrictRealtime = true;
RestrictSUIDSGID = true;
SystemCallArchitectures = "native";
SystemCallFilter = [
"@system-service"
"~@privileged"
];
};
};
in
{
users = {
groups.prometheus = { };
users.prometheus = {
isSystemUser = true;
group = "prometheus";
description = "Prometheus daemon user";
};
};
systemd = {
services = {
prometheus-main = mkPrometheusService {
configFile = mainPrometheusConfig;
dataDir = mainPrometheusDataDir;
port = 9090;
retention = "90d";
};
prometheus-pid-short = mkPrometheusService {
configFile = pidPrometheusConfig;
dataDir = pidPrometheusDataDir;
port = 9092;
retention = "10m";
};
};
tmpfiles.rules = [
"d ${prometheusDataRoot} 0755 root root - -"
"d ${mainPrometheusDataDir} 0750 prometheus prometheus - -"
"d ${pidPrometheusDataDir} 0750 prometheus prometheus - -"
];
};
}
+1
View File
@@ -23,6 +23,7 @@ sudo zfs create media/secure/home_assistant -o compression=zstd-19
sudo zfs create media/secure/notes -o copies=2 sudo zfs create media/secure/notes -o copies=2
sudo zfs create media/secure/postgres -o mountpoint=/zfs/media/database/postgres -o recordsize=16k -o primarycache=metadata sudo zfs create media/secure/postgres -o mountpoint=/zfs/media/database/postgres -o recordsize=16k -o primarycache=metadata
sudo zfs create media/secure/postgres-wal -o mountpoint=/zfs/media/database/postgres-wal -o recordsize=32k -o primarycache=metadata -o special_small_blocks=32K -o compression=lz4 -o secondarycache=none -o logbias=latency sudo zfs create media/secure/postgres-wal -o mountpoint=/zfs/media/database/postgres-wal -o recordsize=32k -o primarycache=metadata -o special_small_blocks=32K -o compression=lz4 -o secondarycache=none -o logbias=latency
sudo zfs create media/secure/prometheus -o mountpoint=/zfs/media/database/prometheus -o compression=lz4
sudo zfs create media/secure/services -o compression=zstd-9 sudo zfs create media/secure/services -o compression=zstd-9
sudo zfs create media/secure/share -o mountpoint=/zfs/media/share -o exec=off sudo zfs create media/secure/share -o mountpoint=/zfs/media/share -o exec=off
+80
View File
@@ -0,0 +1,80 @@
{
...
}:
let
vars = import ../vars.nix;
grafanaDataDir = "${vars.services}/grafana";
in
{
networking.firewall.allowedTCPPorts = [ 3000 ];
services.grafana = {
enable = true;
dataDir = grafanaDataDir;
settings = {
database.type = "sqlite3";
security = {
admin_password = "$__file{${vars.secrets}/services/grafana/admin_password}";
admin_user = "admin";
secret_key = "$__file{${vars.secrets}/services/grafana/secret_key}";
};
server = {
http_addr = "192.168.90.40";
http_port = 3000;
root_url = "http://$192.168.90.40:3000/";
};
};
provision = {
enable = true;
dashboards.settings = {
apiVersion = 1;
providers = [
{
name = "monitoring";
folder = "Monitoring";
type = "file";
disableDeletion = false;
editable = false;
allowUiUpdates = false;
updateIntervalSeconds = 30;
options.path = ../monitoring/dashboards;
}
];
};
datasources.settings = {
apiVersion = 1;
prune = true;
datasources = [
{
access = "proxy";
editable = false;
isDefault = true;
name = "prom-main";
type = "prometheus";
uid = "prom-main";
url = "http://127.0.0.1:9090";
}
{
access = "proxy";
editable = false;
name = "prom-pid-short";
type = "prometheus";
uid = "prom-pid-short";
url = "http://127.0.0.1:9092";
}
];
};
};
};
systemd = {
services.grafana.after = [
"prometheus-main.service"
"prometheus-pid-short.service"
];
tmpfiles.rules = [
"d ${grafanaDataDir} 0750 grafana grafana - -"
];
};
}
-24
View File
@@ -1,24 +0,0 @@
{
services.hedgedoc = {
enable = true;
settings = {
host = "0.0.0.0";
port = 3000;
domain = "192.168.90.40";
urlAddPort = true;
protocolUseSSL = false;
db = {
dialect = "postgres";
database = "hedgedoc";
username = "hedgedoc";
host = "/run/postgresql";
};
};
};
networking.firewall.allowedTCPPorts = [ 3000 ];
systemd.services.hedgedoc = {
after = [ "postgresql.service" ];
requires = [ "postgresql.service" ];
};
}