diff --git a/kustomize/infra/monitoring-extras/dashboard-agave-status.yaml b/kustomize/infra/monitoring-extras/dashboard-agave-status.yaml index 5b0fadb..8952f2f 100644 --- a/kustomize/infra/monitoring-extras/dashboard-agave-status.yaml +++ b/kustomize/infra/monitoring-extras/dashboard-agave-status.yaml @@ -12,7 +12,7 @@ data: "title": "Agave @ mpabi", "timezone": "browser", "schemaVersion": 39, - "version": 1, + "version": 2, "refresh": "10s", "tags": ["agave", "solana", "mpabi"], "templating": { @@ -22,10 +22,7 @@ data: "type": "query", "label": "Node Exporter Instance", "datasource": { "type": "prometheus", "uid": "prometheus" }, - "query": { - "query": "label_values(up{job=\"mpabi-node-exporter\"}, instance)", - "refId": "PromVarInstance" - }, + "query": { "query": "label_values(up{job=\"mpabi-node-exporter\"}, instance)", "refId": "PromVarInstance" }, "current": { "selected": false, "text": "10.66.66.1:9100", "value": "10.66.66.1:9100" } } ] @@ -36,27 +33,10 @@ data: "type": "stat", "title": "Geyser Metrics Target (Prometheus up)", "datasource": { "type": "prometheus", "uid": "prometheus" }, - "targets": [ - { - "refId": "A", - "expr": "up{job=\"mpabi-yellowstone-geyser\"}" - } - ], - "options": { - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, - "orientation": "horizontal", - "textMode": "value" - }, + "targets": [{ "refId": "A", "expr": "up{job=\"mpabi-yellowstone-geyser\"}" }], + "options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "orientation": "horizontal", "textMode": "value" }, "fieldConfig": { - "defaults": { - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "red", "value": null }, - { "color": "green", "value": 1 } - ] - } - }, + "defaults": { "thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } }, "overrides": [] }, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 0 } @@ -66,27 +46,10 @@ data: "type": 
"stat", "title": "agave-validator.service (systemd active)", "datasource": { "type": "prometheus", "uid": "prometheus" }, - "targets": [ - { - "refId": "B", - "expr": "node_systemd_unit_state{job=\"mpabi-node-exporter\",instance=\"$instance\",name=\"agave-validator.service\",state=\"active\"}" - } - ], - "options": { - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, - "orientation": "horizontal", - "textMode": "value" - }, + "targets": [{ "refId": "B", "expr": "node_systemd_unit_state{job=\"mpabi-node-exporter\",instance=\"$instance\",name=\"agave-validator.service\",state=\"active\"}" }], + "options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "orientation": "horizontal", "textMode": "value" }, "fieldConfig": { - "defaults": { - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "red", "value": null }, - { "color": "green", "value": 1 } - ] - } - }, + "defaults": { "thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } }, "overrides": [] }, "gridPos": { "h": 6, "w": 12, "x": 12, "y": 0 } @@ -94,28 +57,120 @@ data: { "id": 3, "type": "timeseries", - "title": "Load (1m)", + "title": "CPU Used (%)", "datasource": { "type": "prometheus", "uid": "prometheus" }, - "targets": [ - { - "refId": "C", - "expr": "node_load1{job=\"mpabi-node-exporter\",instance=\"$instance\"}" - } - ], - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 } + "targets": [{ "refId": "C", "expr": "100 - (avg by (instance) (rate(node_cpu_seconds_total{job=\"mpabi-node-exporter\",instance=\"$instance\",mode=\"idle\"}[5m])) * 100)" }], + "fieldConfig": { "defaults": { "unit": "percent", "min": 0, "max": 100 }, "overrides": [] }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 6 } }, { "id": 4, "type": "timeseries", + "title": "Load (1m)", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [{ "refId": "D", "expr": 
"node_load1{job=\"mpabi-node-exporter\",instance=\"$instance\"}" }], + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 6 } + }, + { + "id": 5, + "type": "timeseries", "title": "Memory Used (%)", "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [{ "refId": "E", "expr": "100 - (node_memory_MemAvailable_bytes{job=\"mpabi-node-exporter\",instance=\"$instance\"} / node_memory_MemTotal_bytes{job=\"mpabi-node-exporter\",instance=\"$instance\"} * 100)" }], + "fieldConfig": { "defaults": { "unit": "percent", "min": 0, "max": 100 }, "overrides": [] }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 6 } + }, + { + "id": 6, + "type": "timeseries", + "title": "Swap Used (GiB)", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [{ "refId": "F", "expr": "(node_memory_SwapTotal_bytes{job=\"mpabi-node-exporter\",instance=\"$instance\"} - node_memory_SwapFree_bytes{job=\"mpabi-node-exporter\",instance=\"$instance\"}) / 1024 / 1024 / 1024" }], + "fieldConfig": { "defaults": { "unit": "gbytes" }, "overrides": [] }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 14 } + }, + { + "id": 7, + "type": "timeseries", + "title": "Disk Free Accounts (%)", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [{ "refId": "G", "expr": "100 * node_filesystem_avail_bytes{job=\"mpabi-node-exporter\",instance=\"$instance\",mountpoint=\"/var/lib/solana/accounts\"} / node_filesystem_size_bytes{job=\"mpabi-node-exporter\",instance=\"$instance\",mountpoint=\"/var/lib/solana/accounts\"}" }], + "fieldConfig": { "defaults": { "unit": "percent", "min": 0, "max": 100 }, "overrides": [] }, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 14 } + }, + { + "id": 8, + "type": "timeseries", + "title": "Disk Free Ledger (%)", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [{ "refId": "H", "expr": "100 * node_filesystem_avail_bytes{job=\"mpabi-node-exporter\",instance=\"$instance\",mountpoint=\"/var/lib/solana/ledger\"} / 
node_filesystem_size_bytes{job=\"mpabi-node-exporter\",instance=\"$instance\",mountpoint=\"/var/lib/solana/ledger\"}" }], + "fieldConfig": { "defaults": { "unit": "percent", "min": 0, "max": 100 }, "overrides": [] }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 14 } + }, + { + "id": 9, + "type": "timeseries", + "title": "Disk IO (NVMe) Read/Write (MiB/s)", + "datasource": { "type": "prometheus", "uid": "prometheus" }, "targets": [ - { - "refId": "D", - "expr": "100 - (node_memory_MemAvailable_bytes{job=\"mpabi-node-exporter\",instance=\"$instance\"} / node_memory_MemTotal_bytes{job=\"mpabi-node-exporter\",instance=\"$instance\"} * 100)" - } + { "refId": "I", "expr": "sum by (device) (rate(node_disk_read_bytes_total{job=\"mpabi-node-exporter\",instance=\"$instance\",device=~\"nvme.*\"}[5m])) / 1024 / 1024", "legendFormat": "read {{device}}" }, + { "refId": "J", "expr": "sum by (device) (rate(node_disk_written_bytes_total{job=\"mpabi-node-exporter\",instance=\"$instance\",device=~\"nvme.*\"}[5m])) / 1024 / 1024", "legendFormat": "write {{device}}" } ], - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 } + "fieldConfig": { "defaults": { "unit": "MiBs" }, "overrides": [] }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 22 } + }, + { + "id": 10, + "type": "timeseries", + "title": "Network wg0 RX/TX (MiB/s)", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { "refId": "K", "expr": "rate(node_network_receive_bytes_total{job=\"mpabi-node-exporter\",instance=\"$instance\",device=\"wg0\"}[5m]) / 1024 / 1024", "legendFormat": "rx" }, + { "refId": "L", "expr": "rate(node_network_transmit_bytes_total{job=\"mpabi-node-exporter\",instance=\"$instance\",device=\"wg0\"}[5m]) / 1024 / 1024", "legendFormat": "tx" } + ], + "fieldConfig": { "defaults": { "unit": "MiBs" }, "overrides": [] }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 22 } + }, + { + "id": 11, + "type": "timeseries", + "title": "Geyser: Subscriber Queue Size", + "datasource": { "type": 
"prometheus", "uid": "prometheus" }, + "targets": [{ "refId": "M", "expr": "grpc_subscriber_queue_size{job=\"mpabi-yellowstone-geyser\"}", "legendFormat": "{{subscriber_id}}" }], + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 30 } + }, + { + "id": 12, + "type": "timeseries", + "title": "Geyser: Connections Total", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [{ "refId": "N", "expr": "connections_total{job=\"mpabi-yellowstone-geyser\"}" }], + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 30 } + }, + { + "id": 13, + "type": "timeseries", + "title": "Geyser: Bytes Sent (MiB/s)", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [{ "refId": "O", "expr": "rate(grpc_bytes_sent{job=\"mpabi-yellowstone-geyser\"}[5m]) / 1024 / 1024", "legendFormat": "{{subscriber_id}}" }], + "fieldConfig": { "defaults": { "unit": "MiBs" }, "overrides": [] }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 38 } + }, + { + "id": 14, + "type": "timeseries", + "title": "Geyser: Messages Sent (/s)", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [{ "refId": "P", "expr": "rate(grpc_message_sent_count{job=\"mpabi-yellowstone-geyser\"}[5m])", "legendFormat": "{{subscriber_id}}" }], + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 38 } + }, + { + "id": 15, + "type": "timeseries", + "title": "Geyser: Disconnects (increase 15m)", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [{ "refId": "Q", "expr": "sum by (reason) (increase(grpc_client_disconnects_total{job=\"mpabi-yellowstone-geyser\"}[15m]))", "legendFormat": "{{reason}}" }], + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 46 } } ] }