feat(dlob): pin markets and wire mevnode endpoints

- Limit DLOB workers/ingestors to SOL-PERP, DOGE-PERP, JUP-PERP across base and staging config.
- Set publisher market ids to [0,7,24] for drift protocol.
- Add overlay patches for dlob-publisher and dlob-server to use wg0 RPC endpoints 10.66.66.1:8899/8900 in staging and prod.
- Extend Agave dashboard and add PrometheusRules for RPC up/lag/I/O alerts.
- Ensure overlays reference new patches for automated ArgoCD rollouts.
This commit is contained in:
mpabi
2026-02-15 00:40:50 +01:00
parent 9c4c3096d7
commit 59c3f3ee06
24 changed files with 194 additions and 15 deletions

View File

@@ -0,0 +1,61 @@
---
# Health alerts for the Agave RPC node: RPC availability, slot lag behind the
# cluster, NVMe throughput and CPU iowait.
# Every expression reads series scraped under job "mpabi-node-exporter".
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: agave-rpc-alerts
  namespace: monitoring
  labels:
    # NOTE(review): presumably these labels are what the Prometheus
    # ruleSelector of the monitoring stack matches on - confirm before
    # renaming or removing them.
    app: kube-prometheus-stack
    release: monitoring-stack
spec:
  groups:
    - name: agave-rpc-health
      rules:
        # Fires when the RPC probe reports the endpoint as down (value 0).
        # This only evaluates while the series exists; a vanished series is
        # covered by AgaveRPCProbeMissing below.
        - alert: AgaveRPCDown
          expr: 'max by (instance) (solana_rpc_up{job="mpabi-node-exporter"}) == 0'
          for: 30s
          labels:
            severity: critical
            team: mpabi
          annotations:
            summary: "Agave RPC is unreachable"
            description: "RPC probe from node exporter reports solana_rpc_up == 0 for instance {{ $labels.instance }}."

        # Fires when no solana_rpc_up series exists at all for the job.
        # Without this, a dead probe/exporter would silence AgaveRPCDown and
        # both slot lag alerts, because comparisons on an empty vector never
        # return a result. absent() carries no instance label, so the
        # annotations deliberately do not reference {{ $labels.instance }}.
        - alert: AgaveRPCProbeMissing
          expr: 'absent(solana_rpc_up{job="mpabi-node-exporter"})'
          for: 5m
          labels:
            severity: warning
            team: mpabi
          annotations:
            summary: "Agave RPC probe metric is missing"
            description: "No solana_rpc_up series exist for job mpabi-node-exporter, so the Agave RPC availability and slot lag alerts cannot evaluate. Check the node exporter target and the RPC probe."

        # Slot lag is a gauge, so it is aggregated with max rather than sum:
        # adding lag values from several series of one instance would inflate
        # the number and trip the threshold falsely.
        - alert: AgaveRPCSlotLagHigh
          expr: 'max by (instance) (solana_rpc_slot_lag{job="mpabi-node-exporter"}) > 50'
          for: 2m
          labels:
            severity: warning
            team: mpabi
          annotations:
            summary: "Agave RPC is lagging behind cluster"
            description: "Current slot lag is {{ $value }} for instance {{ $labels.instance }}. Reference endpoint in probe config may be misconfigured or validator is behind."

        # Same signal as AgaveRPCSlotLagHigh with a 10x threshold, escalated
        # to critical.
        - alert: AgaveRPCSlotLagCritical
          expr: 'max by (instance) (solana_rpc_slot_lag{job="mpabi-node-exporter"}) > 500'
          for: 2m
          labels:
            severity: critical
            team: mpabi
          annotations:
            summary: "Agave RPC severe lag"
            description: "Slot lag is critically high ({{ $value }} slots) on instance {{ $labels.instance }}."

        # Combined read+write byte rate over all devices matching nvme.*,
        # converted from bytes/s to MiB/s (/ 1024 / 1024) and summed per
        # instance before comparing against 300.
        - alert: AgaveIOHigh
          expr: |-
            sum by (instance) (
              (
                rate(node_disk_read_bytes_total{job="mpabi-node-exporter",device=~"nvme.*"}[5m])
                + rate(node_disk_written_bytes_total{job="mpabi-node-exporter",device=~"nvme.*"}[5m])
              ) / 1024 / 1024
            ) > 300
          for: 5m
          labels:
            severity: warning
            team: mpabi
          annotations:
            summary: "High storage I/O on Agave node"
            description: "Combined NVMe read+write throughput >300 MiB/s for 5m on {{ $labels.instance }}. Check disk pressure and Geyser/ledger workload."

        # rate() of the iowait CPU-seconds counter is the fraction of time a
        # CPU spent in iowait; avg by (instance) averages that over all CPUs,
        # so 0.2 corresponds to 20%.
        - alert: AgaveIOWaitHigh
          expr: 'avg by (instance) (rate(node_cpu_seconds_total{job="mpabi-node-exporter",mode="iowait"}[5m])) > 0.2'
          for: 5m
          labels:
            severity: warning
            team: mpabi
          annotations:
            summary: "High iowait on Agave node"
            description: "Iowait over 20% on average for 5m on {{ $labels.instance }}. Storage latency is likely impacting slot progress."