feat(dlob): pin markets and wire mevnode endpoints
- Limit DLOB workers/ingestors to SOL-PERP, DOGE-PERP, JUP-PERP across base and staging config.
- Set publisher market ids to [0,7,24] for the Drift protocol.
- Add overlay patches for dlob-publisher and dlob-server to use wg0 RPC endpoints 10.66.66.1:8899/8900 in staging and prod.
- Extend Agave dashboard and add PrometheusRules for RPC up/lag/I/O alerts.
- Ensure overlays reference new patches for automated ArgoCD rollouts.
This commit is contained in:
@@ -176,6 +176,63 @@ data:
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [{ "refId": "Q", "expr": "sum by (reason) (increase(grpc_client_disconnects_total{job=\"mpabi-yellowstone-geyser\"}[15m]))", "legendFormat": "{{reason}}" }],
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 46 }
|
||||
},
|
||||
{
|
||||
"id": 16,
|
||||
"type": "stat",
|
||||
"title": "RPC Slot Lag (slots)",
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [{ "refId": "R", "expr": "solana_rpc_slot_lag{job=\"mpabi-node-exporter\",instance=\"$instance\"}" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 20 },
|
||||
{ "color": "red", "value": 50 }
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 12, "x": 0, "y": 54 }
|
||||
},
|
||||
{
|
||||
"id": 17,
|
||||
"type": "stat",
|
||||
"title": "RPC Slot Lag (szac. minuty)",
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [{ "refId": "S", "expr": "solana_rpc_slot_lag{job=\"mpabi-node-exporter\",instance=\"$instance\"} * 0.4 / 60" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "min",
|
||||
"decimals": 2,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 2 }
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 12, "x": 12, "y": 54 }
|
||||
},
|
||||
{
|
||||
"id": 18,
|
||||
"type": "timeseries",
|
||||
"title": "RPC Slot & Reference Slot",
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [
|
||||
{ "refId": "T", "expr": "solana_rpc_slot{job=\"mpabi-node-exporter\",instance=\"$instance\"}" },
|
||||
{ "refId": "U", "expr": "solana_rpc_slot_reference{job=\"mpabi-node-exporter\",instance=\"$instance\"}" },
|
||||
{ "refId": "V", "expr": "solana_rpc_block_height{job=\"mpabi-node-exporter\",instance=\"$instance\"}" }
|
||||
],
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 60 }
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
@@ -8,3 +8,4 @@ resources:
|
||||
- ingressroute-prometheus.yaml
|
||||
- ingressroute-prometheus-http.yaml
|
||||
- dashboard-agave-status.yaml
|
||||
- prometheus-rules-agave.yaml
|
||||
|
||||
@@ -0,0 +1,61 @@
|
||||
# Alerting rules for the Agave RPC node. Every expression reads series from
# the "mpabi-node-exporter" job and aggregates them per instance.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: agave-rpc-alerts
  namespace: monitoring
  labels:
    # NOTE(review): presumably these labels are what the Prometheus
    # ruleSelector of the monitoring stack matches - confirm against the
    # Helm values before renaming either of them.
    app: kube-prometheus-stack
    release: monitoring-stack
spec:
  groups:
    - name: agave-rpc-health
      rules:
        # Fires when every solana_rpc_up series of an instance reports 0.
        # A comparison on an absent series returns nothing, so this rule
        # stays silent when the metric is not scraped at all.
        - alert: AgaveRPCDown
          expr: 'max by (instance) (solana_rpc_up{job="mpabi-node-exporter"}) == 0'
          for: 30s
          labels:
            severity: critical
            team: mpabi
          annotations:
            summary: "Agave RPC is unreachable"
            description: "RPC probe from node exporter reports solana_rpc_up == 0 for instance {{ $labels.instance }}."

        # Slot lag is a gauge, so the per-instance aggregation is max (the
        # worst reported lag) rather than sum, which would overstate the lag
        # whenever more than one series exists for the same instance.
        # The threshold has no upper bound, so this warning keeps firing
        # together with AgaveRPCSlotLagCritical above 500 slots.
        - alert: AgaveRPCSlotLagHigh
          expr: 'max by (instance) (solana_rpc_slot_lag{job="mpabi-node-exporter"}) > 50'
          for: 2m
          labels:
            severity: warning
            team: mpabi
          annotations:
            summary: "Agave RPC is lagging behind cluster"
            description: "Current slot lag is {{ $value }} for instance {{ $labels.instance }}. Reference endpoint in probe config may be misconfigured or validator is behind."

        # Same signal and aggregation as AgaveRPCSlotLagHigh, with a ten
        # times higher threshold and critical severity.
        - alert: AgaveRPCSlotLagCritical
          expr: 'max by (instance) (solana_rpc_slot_lag{job="mpabi-node-exporter"}) > 500'
          for: 2m
          labels:
            severity: critical
            team: mpabi
          annotations:
            summary: "Agave RPC severe lag"
            description: "Slot lag is critically high ({{ $value }} slots) on instance {{ $labels.instance }}."

        # Read plus write byte rate summed over all nvme* devices of an
        # instance; the two divisions by 1024 convert bytes/s to MiB/s, so
        # the threshold below is 300 MiB/s.
        - alert: AgaveIOHigh
          expr: |
            sum by (instance) (
              (rate(node_disk_read_bytes_total{job="mpabi-node-exporter",device=~"nvme.*"}[5m]) +
               rate(node_disk_written_bytes_total{job="mpabi-node-exporter",device=~"nvme.*"}[5m])) / 1024 / 1024
            ) > 300
          for: 5m
          labels:
            severity: warning
            team: mpabi
          annotations:
            summary: "High storage I/O on Agave node"
            description: "Combined NVMe read+write throughput >300 MiB/s for 5m on {{ $labels.instance }}. Check disk pressure and Geyser/ledger workload."

        # Average of the iowait-mode CPU time rate across the series of an
        # instance; 0.2 corresponds to the 20% quoted in the description.
        - alert: AgaveIOWaitHigh
          expr: 'avg by (instance) (rate(node_cpu_seconds_total{job="mpabi-node-exporter",mode="iowait"}[5m])) > 0.2'
          for: 5m
          labels:
            severity: warning
            team: mpabi
          annotations:
            summary: "High iowait on Agave node"
            description: "Iowait over 20% on average for 5m on {{ $labels.instance }}. Storage latency is likely impacting slot progress."
|
||||
Reference in New Issue
Block a user