---
# PrometheusRule with health alerts for an Agave (Solana) RPC node.
# All rules select series from the "mpabi-node-exporter" scrape job and
# aggregate per `instance`, so one alert is raised per scraped host.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: agave-rpc-alerts
  namespace: monitoring
  labels:
    # NOTE(review): presumably these labels are what the Prometheus
    # ruleSelector matches on; confirm against the Prometheus resource.
    app: kube-prometheus-stack
    release: monitoring-stack
spec:
  groups:
    - name: agave-rpc-health
      rules:
        # Fires when the highest solana_rpc_up sample for an instance is 0
        # for 30s.
        # NOTE(review): if the metric disappears entirely (exporter or probe
        # down) this expression returns no series and the alert stays silent;
        # consider a companion absent() rule.
        - alert: AgaveRPCDown
          expr: 'max by (instance) (solana_rpc_up{job="mpabi-node-exporter"}) == 0'
          for: 30s
          labels:
            severity: critical
            team: mpabi
          annotations:
            summary: "Agave RPC is unreachable"
            description: "RPC probe from node exporter reports solana_rpc_up == 0 for instance {{ $labels.instance }}."

        # Fires when the per-instance sum of solana_rpc_slot_lag stays above
        # 50 for 2m.
        # NOTE(review): this rule keeps firing above 500 as well, alongside
        # AgaveRPCSlotLagCritical; confirm an Alertmanager inhibit rule
        # suppresses the warning when the critical alert is active.
        # NOTE(review): `sum` adds up every series of an instance; if the
        # probe ever exports more than one lag series per instance, `max`
        # may be the intended aggregation. Verify.
        - alert: AgaveRPCSlotLagHigh
          expr: 'sum by (instance) (solana_rpc_slot_lag{job="mpabi-node-exporter"}) > 50'
          for: 2m
          labels:
            severity: warning
            team: mpabi
          annotations:
            summary: "Agave RPC is lagging behind cluster"
            description: "Current slot lag is {{ $value }} for instance {{ $labels.instance }}. Reference endpoint in probe config may be misconfigured or validator is behind."

        # Same signal as AgaveRPCSlotLagHigh with a threshold of 500 and
        # critical severity.
        - alert: AgaveRPCSlotLagCritical
          expr: 'sum by (instance) (solana_rpc_slot_lag{job="mpabi-node-exporter"}) > 500'
          for: 2m
          labels:
            severity: critical
            team: mpabi
          annotations:
            summary: "Agave RPC severe lag"
            description: "Slot lag is critically high ({{ $value }} slots) on instance {{ $labels.instance }}."

        # Fires when combined read + write throughput over all devices
        # matching nvme.* exceeds 300 MiB/s for 5m. The two divisions by
        # 1024 convert bytes/s to MiB/s.
        - alert: AgaveIOHigh
          expr: |
            sum by (instance) (
              (
                rate(node_disk_read_bytes_total{job="mpabi-node-exporter",device=~"nvme.*"}[5m])
                + rate(node_disk_written_bytes_total{job="mpabi-node-exporter",device=~"nvme.*"}[5m])
              ) / 1024 / 1024
            ) > 300
          for: 5m
          labels:
            severity: warning
            team: mpabi
          annotations:
            summary: "High storage I/O on Agave node"
            description: "Combined NVMe read+write throughput >300 MiB/s for 5m on {{ $labels.instance }}. Check disk pressure and Geyser/ledger workload."

        # Fires when the iowait rate, averaged across the CPU series of an
        # instance, stays above 0.2 (20% of CPU time) for 5m.
        - alert: AgaveIOWaitHigh
          expr: 'avg by (instance) (rate(node_cpu_seconds_total{job="mpabi-node-exporter",mode="iowait"}[5m])) > 0.2'
          for: 5m
          labels:
            severity: warning
            team: mpabi
          annotations:
            summary: "High iowait on Agave node"
            description: "Iowait over 20% on average for 5m on {{ $labels.instance }}. Storage latency is likely impacting slot progress."