setting up resource monitoring for bob and jeeves

This commit is contained in:
2026-04-28 14:23:10 -04:00
parent 957110b7e9
commit 374d7e8d38
11 changed files with 1739 additions and 24 deletions
@@ -0,0 +1,426 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"links": [],
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 6,
"x": 0,
"y": 0
},
"id": 1,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "100 * (1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
"legendFormat": "{{instance}}",
"range": true,
"refId": "A"
}
],
"title": "CPU Used",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 6,
"x": 6,
"y": 0
},
"id": 2,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
"legendFormat": "{{instance}}",
"range": true,
"refId": "A"
}
],
"title": "RAM Used",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 6,
"x": 12,
"y": 0
},
"id": 3,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "100 * (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes))",
"legendFormat": "{{instance}}",
"range": true,
"refId": "A"
}
],
"title": "Swap Used",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 6,
"x": 18,
"y": 0
},
"id": 4,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "node_load1",
"legendFormat": "{{instance}} load1",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "node_load5",
"legendFormat": "{{instance}} load5",
"range": true,
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "node_load15",
"legendFormat": "{{instance}} load15",
"range": true,
"refId": "C"
}
],
"title": "Load",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "Bps"
},
"overrides": []
},
"gridPos": {
"h": 9,
"w": 12,
"x": 0,
"y": 8
},
"id": 5,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "sum by (instance) (rate(node_disk_read_bytes_total[5m]))",
"legendFormat": "{{instance}} read",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "sum by (instance) (rate(node_disk_written_bytes_total[5m]))",
"legendFormat": "{{instance}} write",
"range": true,
"refId": "B"
}
],
"title": "Disk Throughput",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 9,
"w": 12,
"x": 12,
"y": 8
},
"id": 6,
"options": {
"cellHeight": "sm",
"showHeader": true,
"sortBy": [
{
"desc": true,
"displayName": "Value"
}
]
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "100 * (1 - (node_filesystem_avail_bytes{mountpoint=~\"(/|/home|/var|/zfs.*)\",fstype!=\"\"} / node_filesystem_size_bytes{mountpoint=~\"(/|/home|/var|/zfs.*)\",fstype!=\"\"}))",
"format": "table",
"instant": true,
"legendFormat": "{{instance}} {{mountpoint}}",
"refId": "A"
}
],
"title": "Filesystem Usage",
"type": "table"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "percentunit"
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 12,
"x": 0,
"y": 17
},
"id": 7,
"options": {
"cellHeight": "sm",
"showHeader": true,
"sortBy": [
{
"desc": true,
"displayName": "Value"
}
]
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "topk(10, rate(namedprocess_namegroup_cpu_seconds_total[5m]))",
"format": "table",
"instant": true,
"legendFormat": "{{instance}} {{groupname}}",
"refId": "A"
}
],
"title": "Top Grouped CPU",
"type": "table"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "bytes"
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 12,
"x": 12,
"y": 17
},
"id": 8,
"options": {
"cellHeight": "sm",
"showHeader": true,
"sortBy": [
{
"desc": true,
"displayName": "Value"
}
]
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "topk(10, namedprocess_namegroup_memory_bytes{memtype=\"resident\"})",
"format": "table",
"instant": true,
"legendFormat": "{{instance}} {{groupname}}",
"refId": "A"
}
],
"title": "Top Grouped Memory",
"type": "table"
}
],
"refresh": "30s",
"schemaVersion": 39,
"style": "dark",
"tags": [
"monitoring"
],
"templating": {
"list": []
},
"time": {
"from": "now-24h",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "Overview",
"uid": "monitor-overview",
"version": 1,
"weekStart": ""
}
@@ -0,0 +1,216 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"links": [],
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "percentunit"
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 12,
"x": 0,
"y": 0
},
"id": 1,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "topk(10, rate(namedprocess_namegroup_cpu_seconds_total[5m]))",
"legendFormat": "{{instance}} {{groupname}}",
"range": true,
"refId": "A"
}
],
"title": "Grouped CPU",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "bytes"
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 12,
"x": 12,
"y": 0
},
"id": 2,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "topk(10, namedprocess_namegroup_memory_bytes{memtype=\"resident\"})",
"legendFormat": "{{instance}} {{groupname}}",
"range": true,
"refId": "A"
}
],
"title": "Grouped Resident Memory",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "Bps"
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 12,
"x": 0,
"y": 10
},
"id": 3,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "topk(10, rate(namedprocess_namegroup_read_bytes_total[5m]))",
"legendFormat": "{{instance}} {{groupname}}",
"range": true,
"refId": "A"
}
],
"title": "Grouped Read I/O",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "Bps"
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 12,
"x": 12,
"y": 10
},
"id": 4,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "topk(10, rate(namedprocess_namegroup_write_bytes_total[5m]))",
"legendFormat": "{{instance}} {{groupname}}",
"range": true,
"refId": "A"
}
],
"title": "Grouped Write I/O",
"type": "timeseries"
}
],
"refresh": "30s",
"schemaVersion": 39,
"style": "dark",
"tags": [
"monitoring",
"process"
],
"templating": {
"list": []
},
"time": {
"from": "now-7d",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "Process History Grouped",
"uid": "monitor-process-history",
"version": 1,
"weekStart": ""
}
@@ -0,0 +1,224 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"links": [],
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-pid-short"
},
"fieldConfig": {
"defaults": {
"unit": "percentunit"
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 12,
"x": 0,
"y": 0
},
"id": 1,
"options": {
"cellHeight": "sm",
"showHeader": true,
"sortBy": [
{
"desc": true,
"displayName": "Value"
}
]
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-pid-short"
},
"editorMode": "code",
"expr": "topk(20, rate(namedprocess_namegroup_cpu_seconds_total[2m]))",
"format": "table",
"instant": true,
"legendFormat": "{{instance}} {{groupname}}",
"refId": "A"
}
],
"title": "Top PID CPU",
"type": "table"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-pid-short"
},
"fieldConfig": {
"defaults": {
"unit": "bytes"
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 12,
"x": 12,
"y": 0
},
"id": 2,
"options": {
"cellHeight": "sm",
"showHeader": true,
"sortBy": [
{
"desc": true,
"displayName": "Value"
}
]
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-pid-short"
},
"editorMode": "code",
"expr": "topk(20, namedprocess_namegroup_memory_bytes{memtype=\"resident\"})",
"format": "table",
"instant": true,
"legendFormat": "{{instance}} {{groupname}}",
"refId": "A"
}
],
"title": "Top PID RSS",
"type": "table"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-pid-short"
},
"fieldConfig": {
"defaults": {
"unit": "Bps"
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 12,
"x": 0,
"y": 10
},
"id": 3,
"options": {
"cellHeight": "sm",
"showHeader": true,
"sortBy": [
{
"desc": true,
"displayName": "Value"
}
]
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-pid-short"
},
"editorMode": "code",
"expr": "topk(20, rate(namedprocess_namegroup_read_bytes_total[2m]))",
"format": "table",
"instant": true,
"legendFormat": "{{instance}} {{groupname}}",
"refId": "A"
}
],
"title": "Top PID Read I/O",
"type": "table"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-pid-short"
},
"fieldConfig": {
"defaults": {
"unit": "Bps"
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 12,
"x": 12,
"y": 10
},
"id": 4,
"options": {
"cellHeight": "sm",
"showHeader": true,
"sortBy": [
{
"desc": true,
"displayName": "Value"
}
]
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-pid-short"
},
"editorMode": "code",
"expr": "topk(20, rate(namedprocess_namegroup_write_bytes_total[2m]))",
"format": "table",
"instant": true,
"legendFormat": "{{instance}} {{groupname}}",
"refId": "A"
}
],
"title": "Top PID Write I/O",
"type": "table"
}
],
"refresh": "15s",
"schemaVersion": 39,
"style": "dark",
"tags": [
"monitoring",
"process"
],
"templating": {
"list": []
},
"time": {
"from": "now-10m",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "Process Live PID",
"uid": "monitor-process-pid",
"version": 1,
"weekStart": ""
}
@@ -0,0 +1,351 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"links": [],
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 8,
"x": 0,
"y": 0
},
"id": 1,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "100 * (zfs_pool_allocated_bytes / zfs_pool_size_bytes)",
"legendFormat": "{{instance}} {{pool}}",
"range": true,
"refId": "A"
}
],
"title": "Pool Usage",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "bytes"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 8,
"x": 8,
"y": 0
},
"id": 2,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "zfs_pool_free_bytes",
"legendFormat": "{{instance}} {{pool}}",
"range": true,
"refId": "A"
}
],
"title": "Pool Free Bytes",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "bytes"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 8,
"x": 16,
"y": 0
},
"id": 3,
"options": {
"cellHeight": "sm",
"showHeader": true,
"sortBy": [
{
"desc": true,
"displayName": "Value"
}
]
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "topk(20, zfs_dataset_used_bytes{type=\"filesystem\"})",
"format": "table",
"instant": true,
"legendFormat": "{{instance}} {{name}}",
"refId": "A"
}
],
"title": "Top Filesystems by Used Bytes",
"type": "table"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "ns"
},
"overrides": []
},
"gridPos": {
"h": 9,
"w": 12,
"x": 0,
"y": 8
},
"id": 4,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "topk(20, zpool_iostat_total_wait_read_ns{vdev!=\"_pool\"})",
"legendFormat": "{{host}} {{pool}} {{vdev}}",
"range": true,
"refId": "A"
}
],
"title": "ZFS Read Wait",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "ns"
},
"overrides": []
},
"gridPos": {
"h": 9,
"w": 12,
"x": 12,
"y": 8
},
"id": 5,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "topk(20, zpool_iostat_total_wait_write_ns{vdev!=\"_pool\"})",
"legendFormat": "{{host}} {{pool}} {{vdev}}",
"range": true,
"refId": "A"
}
],
"title": "ZFS Write Wait",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "celsius"
},
"overrides": []
},
"gridPos": {
"h": 9,
"w": 12,
"x": 0,
"y": 17
},
"id": 6,
"options": {
"cellHeight": "sm",
"showHeader": true,
"sortBy": [
{
"desc": true,
"displayName": "Value"
}
]
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "smartctl_device_temperature{temperature_type=\"current\"}",
"format": "table",
"instant": true,
"legendFormat": "{{instance}} {{device}}",
"refId": "A"
}
],
"title": "Disk Temperature",
"type": "table"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 9,
"w": 12,
"x": 12,
"y": 17
},
"id": 7,
"options": {
"cellHeight": "sm",
"showHeader": true,
"sortBy": [
{
"desc": false,
"displayName": "Value"
}
]
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "smartctl_device_smart_status",
"format": "table",
"instant": true,
"legendFormat": "{{instance}} {{device}}",
"refId": "A"
}
],
"title": "SMART Health",
"type": "table"
}
],
"refresh": "30s",
"schemaVersion": 39,
"style": "dark",
"tags": [
"monitoring",
"zfs"
],
"templating": {
"list": []
},
"time": {
"from": "now-24h",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "Storage and ZFS",
"uid": "monitor-storage",
"version": 1,
"weekStart": ""
}