Compare commits
33 Commits
5f46d26037
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
59c3f3ee06 | ||
|
|
9c4c3096d7 | ||
|
|
b02bd6b66c | ||
|
|
890ac4c86e | ||
| 9c6e974d3a | |||
| d1dc32d3bb | |||
| a6a0accd6a | |||
| 3b8b1f5492 | |||
| 0c80a08732 | |||
| 8b72e62621 | |||
| ff7a4b69cd | |||
| e7f9594381 | |||
| 0c0f219d02 | |||
| 9cf0ed84d9 | |||
| 2702edce22 | |||
| c33533fcd6 | |||
| 32eb047551 | |||
| d27d64e407 | |||
| fa6893aa98 | |||
| 77a8265b40 | |||
| c692f8d653 | |||
| 47096c9877 | |||
| 0104532e73 | |||
| 19e7e48190 | |||
| 7ef3ffe62c | |||
| 1853ef6452 | |||
| 34ef9490a4 | |||
| f3bc3da9bb | |||
| f797234abd | |||
| c95a4286fb | |||
| b72f281651 | |||
| 98912c5b03 | |||
| 28876fa1d2 |
20
bootstrap/argocd/application-monitoring-extras.yaml
Normal file
20
bootstrap/argocd/application-monitoring-extras.yaml
Normal file
@@ -0,0 +1,20 @@
|
||||
apiVersion: argoproj.io/v1alpha1
|
||||
kind: Application
|
||||
metadata:
|
||||
name: monitoring-extras
|
||||
namespace: argocd
|
||||
spec:
|
||||
project: default
|
||||
source:
|
||||
repoURL: https://gitea.mpabi.pl/trade/trade-deploy.git
|
||||
targetRevision: main
|
||||
path: kustomize/infra/monitoring-extras
|
||||
destination:
|
||||
server: https://kubernetes.default.svc
|
||||
namespace: monitoring
|
||||
syncPolicy:
|
||||
automated:
|
||||
prune: true
|
||||
selfHeal: true
|
||||
syncOptions:
|
||||
- CreateNamespace=true
|
||||
50
bootstrap/argocd/application-monitoring-stack.yaml
Normal file
50
bootstrap/argocd/application-monitoring-stack.yaml
Normal file
@@ -0,0 +1,50 @@
|
||||
apiVersion: argoproj.io/v1alpha1
|
||||
kind: Application
|
||||
metadata:
|
||||
name: monitoring-stack
|
||||
namespace: argocd
|
||||
spec:
|
||||
project: default
|
||||
source:
|
||||
repoURL: https://prometheus-community.github.io/helm-charts
|
||||
chart: kube-prometheus-stack
|
||||
targetRevision: 81.6.9
|
||||
helm:
|
||||
skipCrds: true
|
||||
values: |
|
||||
grafana:
|
||||
enabled: true
|
||||
prometheus:
|
||||
prometheusSpec:
|
||||
scrapeInterval: 15s
|
||||
evaluationInterval: 15s
|
||||
additionalScrapeConfigs:
|
||||
- job_name: mpabi-yellowstone-geyser
|
||||
metrics_path: /metrics
|
||||
scrape_interval: 10s
|
||||
fallback_scrape_protocol: PrometheusText0.0.4
|
||||
static_configs:
|
||||
- targets:
|
||||
- 10.66.66.1:8999
|
||||
- job_name: mpabi-node-exporter
|
||||
metrics_path: /metrics
|
||||
scrape_interval: 15s
|
||||
static_configs:
|
||||
- targets:
|
||||
- 10.66.66.1:9100
|
||||
prometheusOperator:
|
||||
admissionWebhooks:
|
||||
enabled: false
|
||||
patch:
|
||||
enabled: false
|
||||
destination:
|
||||
server: https://kubernetes.default.svc
|
||||
namespace: monitoring
|
||||
syncPolicy:
|
||||
automated:
|
||||
prune: true
|
||||
selfHeal: true
|
||||
syncOptions:
|
||||
- CreateNamespace=true
|
||||
- Replace=true
|
||||
- ServerSideApply=true
|
||||
@@ -29,7 +29,7 @@ spec:
|
||||
- name: DLOB_SOURCE
|
||||
value: drift
|
||||
- name: DLOB_MARKETS
|
||||
value: PUMP-PERP,SOL-PERP,1MBONK-PERP,BTC-PERP,ETH-PERP
|
||||
value: SOL-PERP,DOGE-PERP,JUP-PERP
|
||||
- name: DLOB_POLL_MS
|
||||
value: "1000"
|
||||
- name: DLOB_DEPTH_BPS_BANDS
|
||||
|
||||
@@ -29,7 +29,7 @@ spec:
|
||||
- name: DLOB_SOURCE
|
||||
value: mevnode
|
||||
- name: DLOB_MARKETS
|
||||
value: PUMP-PERP,SOL-PERP,1MBONK-PERP,BTC-PERP,ETH-PERP
|
||||
value: SOL-PERP,DOGE-PERP,JUP-PERP
|
||||
- name: DLOB_POLL_MS
|
||||
value: "1000"
|
||||
- name: DLOB_DEPTH_BPS_BANDS
|
||||
|
||||
@@ -65,7 +65,7 @@ function resolveConfig() {
|
||||
const hasuraAuthToken = process.env.HASURA_AUTH_TOKEN || process.env.HASURA_JWT || undefined;
|
||||
|
||||
const dlobSource = String(process.env.DLOB_SOURCE || 'mevnode').trim() || 'mevnode';
|
||||
const markets = envList('DLOB_MARKETS', 'PUMP-PERP,SOL-PERP,1MBONK-PERP,BTC-PERP,ETH-PERP');
|
||||
const markets = envList('DLOB_MARKETS', 'SOL-PERP,DOGE-PERP,JUP-PERP');
|
||||
const pollMs = clampInt(process.env.DLOB_POLL_MS, 250, 60_000, 1000);
|
||||
const bandsBps = envIntList('DLOB_DEPTH_BPS_BANDS', '5,10,20,50,100,200');
|
||||
|
||||
|
||||
@@ -29,7 +29,7 @@ spec:
|
||||
- name: DLOB_SOURCE
|
||||
value: drift
|
||||
- name: DLOB_MARKETS
|
||||
value: PUMP-PERP,SOL-PERP,1MBONK-PERP,BTC-PERP,ETH-PERP
|
||||
value: SOL-PERP,DOGE-PERP,JUP-PERP
|
||||
- name: DLOB_POLL_MS
|
||||
value: "1000"
|
||||
- name: DLOB_SLIPPAGE_SIZES_USD
|
||||
|
||||
@@ -29,7 +29,7 @@ spec:
|
||||
- name: DLOB_SOURCE
|
||||
value: mevnode
|
||||
- name: DLOB_MARKETS
|
||||
value: PUMP-PERP,SOL-PERP,1MBONK-PERP,BTC-PERP,ETH-PERP
|
||||
value: SOL-PERP,DOGE-PERP,JUP-PERP
|
||||
- name: DLOB_POLL_MS
|
||||
value: "1000"
|
||||
- name: DLOB_SLIPPAGE_SIZES_USD
|
||||
|
||||
@@ -56,7 +56,7 @@ function resolveConfig() {
|
||||
const hasuraAuthToken = process.env.HASURA_AUTH_TOKEN || process.env.HASURA_JWT || undefined;
|
||||
|
||||
const dlobSource = String(process.env.DLOB_SOURCE || 'mevnode').trim() || 'mevnode';
|
||||
const markets = envList('DLOB_MARKETS', 'PUMP-PERP,SOL-PERP,1MBONK-PERP,BTC-PERP,ETH-PERP');
|
||||
const markets = envList('DLOB_MARKETS', 'SOL-PERP,DOGE-PERP,JUP-PERP');
|
||||
const pollMs = clampInt(process.env.DLOB_POLL_MS, 250, 60_000, 1000);
|
||||
|
||||
const sizesUsd = envList('DLOB_SLIPPAGE_SIZES_USD', '10,25,50,100,250,500,1000')
|
||||
|
||||
@@ -29,7 +29,7 @@ spec:
|
||||
- name: DLOB_SOURCE
|
||||
value: drift
|
||||
- name: DLOB_MARKETS
|
||||
value: PUMP-PERP,SOL-PERP,1MBONK-PERP,BTC-PERP,ETH-PERP
|
||||
value: SOL-PERP,DOGE-PERP,JUP-PERP
|
||||
- name: DLOB_TS_POLL_MS
|
||||
value: "1000"
|
||||
command: ["node", "/app/worker.mjs"]
|
||||
|
||||
@@ -29,7 +29,7 @@ spec:
|
||||
- name: DLOB_SOURCE
|
||||
value: mevnode
|
||||
- name: DLOB_MARKETS
|
||||
value: PUMP-PERP,SOL-PERP,1MBONK-PERP,BTC-PERP,ETH-PERP
|
||||
value: SOL-PERP,DOGE-PERP,JUP-PERP
|
||||
- name: DLOB_TS_POLL_MS
|
||||
value: "1000"
|
||||
command: ["node", "/app/worker.mjs"]
|
||||
|
||||
@@ -50,7 +50,7 @@ function resolveConfig() {
|
||||
const hasuraAuthToken = process.env.HASURA_AUTH_TOKEN || process.env.HASURA_JWT || undefined;
|
||||
|
||||
const dlobSource = String(process.env.DLOB_SOURCE || 'mevnode').trim() || 'mevnode';
|
||||
const markets = envList('DLOB_MARKETS', 'PUMP-PERP,SOL-PERP,1MBONK-PERP,BTC-PERP,ETH-PERP');
|
||||
const markets = envList('DLOB_MARKETS', 'SOL-PERP,DOGE-PERP,JUP-PERP');
|
||||
const pollMs = clampInt(process.env.DLOB_TS_POLL_MS, 500, 60_000, 1000);
|
||||
|
||||
return { hasuraUrl, hasuraAdminSecret, hasuraAuthToken, dlobSource, markets, pollMs };
|
||||
|
||||
@@ -35,7 +35,7 @@ spec:
|
||||
- name: DLOB_FORCE_IPV6
|
||||
value: "true"
|
||||
- name: DLOB_MARKETS
|
||||
value: PUMP-PERP,SOL-PERP,1MBONK-PERP,BTC-PERP,ETH-PERP
|
||||
value: SOL-PERP,DOGE-PERP,JUP-PERP
|
||||
- name: DLOB_POLL_MS
|
||||
value: "500"
|
||||
- name: DLOB_DEPTH
|
||||
|
||||
@@ -31,7 +31,7 @@ spec:
|
||||
- name: DLOB_HTTP_URL
|
||||
value: http://dlob-server:6969
|
||||
- name: DLOB_MARKETS
|
||||
value: PUMP-PERP,SOL-PERP,1MBONK-PERP,BTC-PERP,ETH-PERP
|
||||
value: SOL-PERP,DOGE-PERP,JUP-PERP
|
||||
- name: DLOB_POLL_MS
|
||||
value: "500"
|
||||
- name: DLOB_DEPTH
|
||||
|
||||
@@ -66,7 +66,7 @@ function resolveConfig() {
|
||||
const dlobForceIpv6 = envBool('DLOB_FORCE_IPV6', false);
|
||||
const dlobSource = String(process.env.DLOB_SOURCE || 'mevnode').trim() || 'mevnode';
|
||||
|
||||
const markets = envList('DLOB_MARKETS', 'PUMP-PERP,SOL-PERP,1MBONK-PERP,BTC-PERP,ETH-PERP');
|
||||
const markets = envList('DLOB_MARKETS', 'SOL-PERP,DOGE-PERP,JUP-PERP');
|
||||
const depth = clampInt(process.env.DLOB_DEPTH, 1, 50, 10);
|
||||
const pollMs = clampInt(process.env.DLOB_POLL_MS, 100, 10_000, 500);
|
||||
|
||||
|
||||
@@ -43,7 +43,7 @@ spec:
|
||||
- name: REDIS_CLIENT
|
||||
value: DLOB
|
||||
- name: PERP_MARKETS_TO_LOAD
|
||||
value: "0,1,2,4,75"
|
||||
value: "0,7,24"
|
||||
- name: ENDPOINT
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
@@ -59,11 +59,15 @@ spec:
|
||||
httpGet:
|
||||
path: /startup
|
||||
port: http
|
||||
initialDelaySeconds: 10
|
||||
initialDelaySeconds: 120
|
||||
periodSeconds: 10
|
||||
timeoutSeconds: 3
|
||||
failureThreshold: 30
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /health
|
||||
port: http
|
||||
initialDelaySeconds: 30
|
||||
initialDelaySeconds: 240
|
||||
periodSeconds: 20
|
||||
timeoutSeconds: 3
|
||||
failureThreshold: 10
|
||||
|
||||
@@ -0,0 +1,13 @@
|
||||
apiVersion: cert-manager.io/v1
|
||||
kind: Certificate
|
||||
metadata:
|
||||
name: monitoring-mpabi-pl
|
||||
namespace: monitoring
|
||||
spec:
|
||||
secretName: monitoring-mpabi-pl-tls
|
||||
issuerRef:
|
||||
kind: ClusterIssuer
|
||||
name: letsencrypt-prod
|
||||
dnsNames:
|
||||
- grafana.mpabi.pl
|
||||
- prometheus.mpabi.pl
|
||||
238
kustomize/infra/monitoring-extras/dashboard-agave-status.yaml
Normal file
238
kustomize/infra/monitoring-extras/dashboard-agave-status.yaml
Normal file
@@ -0,0 +1,238 @@
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: grafana-dashboard-agave-status
|
||||
namespace: monitoring
|
||||
labels:
|
||||
grafana_dashboard: "1"
|
||||
data:
|
||||
agave-status.json: |-
|
||||
{
|
||||
"uid": "agave-status-mpabi",
|
||||
"title": "Agave @ mpabi",
|
||||
"timezone": "browser",
|
||||
"schemaVersion": 39,
|
||||
"version": 2,
|
||||
"time": { "from": "now-6h", "to": "now" },
|
||||
"timepicker": {
|
||||
"refresh_intervals": ["5s", "10s", "30s", "1m", "5m"],
|
||||
"time_options": ["5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d"]
|
||||
},
|
||||
"refresh": "10s",
|
||||
"tags": ["agave", "solana", "mpabi"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "instance",
|
||||
"type": "query",
|
||||
"label": "Node Exporter Instance",
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"query": { "query": "label_values(up{job=\"mpabi-node-exporter\"}, instance)", "refId": "PromVarInstance" },
|
||||
"current": { "selected": false, "text": "10.66.66.1:9100", "value": "10.66.66.1:9100" }
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"type": "stat",
|
||||
"title": "Geyser Metrics Target (Prometheus up)",
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [{ "refId": "A", "expr": "up{job=\"mpabi-yellowstone-geyser\"}" }],
|
||||
"options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "orientation": "horizontal", "textMode": "value" },
|
||||
"fieldConfig": {
|
||||
"defaults": { "thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } },
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 12, "x": 0, "y": 0 }
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"type": "stat",
|
||||
"title": "agave-validator.service (systemd active)",
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [{ "refId": "B", "expr": "node_systemd_unit_state{job=\"mpabi-node-exporter\",instance=\"$instance\",name=\"agave-validator.service\",state=\"active\"}" }],
|
||||
"options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "orientation": "horizontal", "textMode": "value" },
|
||||
"fieldConfig": {
|
||||
"defaults": { "thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } },
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 12, "x": 12, "y": 0 }
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"type": "timeseries",
|
||||
"title": "CPU Used (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [{ "refId": "C", "expr": "100 - (avg by (instance) (rate(node_cpu_seconds_total{job=\"mpabi-node-exporter\",instance=\"$instance\",mode=\"idle\"}[5m])) * 100)" }],
|
||||
"fieldConfig": { "defaults": { "unit": "percent", "min": 0, "max": 100 }, "overrides": [] },
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 6 }
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"type": "timeseries",
|
||||
"title": "Load (1m)",
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [{ "refId": "D", "expr": "node_load1{job=\"mpabi-node-exporter\",instance=\"$instance\"}" }],
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 6 }
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"type": "timeseries",
|
||||
"title": "Memory Used (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [{ "refId": "E", "expr": "100 - (node_memory_MemAvailable_bytes{job=\"mpabi-node-exporter\",instance=\"$instance\"} / node_memory_MemTotal_bytes{job=\"mpabi-node-exporter\",instance=\"$instance\"} * 100)" }],
|
||||
"fieldConfig": { "defaults": { "unit": "percent", "min": 0, "max": 100 }, "overrides": [] },
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 6 }
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"type": "timeseries",
|
||||
"title": "Swap Used (GiB)",
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [{ "refId": "F", "expr": "(node_memory_SwapTotal_bytes{job=\"mpabi-node-exporter\",instance=\"$instance\"} - node_memory_SwapFree_bytes{job=\"mpabi-node-exporter\",instance=\"$instance\"}) / 1024 / 1024 / 1024" }],
|
||||
"fieldConfig": { "defaults": { "unit": "gbytes" }, "overrides": [] },
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 14 }
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"type": "timeseries",
|
||||
"title": "Disk Free Accounts (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [{ "refId": "G", "expr": "100 * node_filesystem_avail_bytes{job=\"mpabi-node-exporter\",instance=\"$instance\",mountpoint=\"/var/lib/solana/accounts\"} / node_filesystem_size_bytes{job=\"mpabi-node-exporter\",instance=\"$instance\",mountpoint=\"/var/lib/solana/accounts\"}" }],
|
||||
"fieldConfig": { "defaults": { "unit": "percent", "min": 0, "max": 100 }, "overrides": [] },
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 14 }
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"type": "timeseries",
|
||||
"title": "Disk Free Ledger (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [{ "refId": "H", "expr": "100 * node_filesystem_avail_bytes{job=\"mpabi-node-exporter\",instance=\"$instance\",mountpoint=\"/var/lib/solana/ledger\"} / node_filesystem_size_bytes{job=\"mpabi-node-exporter\",instance=\"$instance\",mountpoint=\"/var/lib/solana/ledger\"}" }],
|
||||
"fieldConfig": { "defaults": { "unit": "percent", "min": 0, "max": 100 }, "overrides": [] },
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 14 }
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"type": "timeseries",
|
||||
"title": "Disk IO (NVMe) Read/Write (MiB/s)",
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [
|
||||
{ "refId": "I", "expr": "sum by (device) (rate(node_disk_read_bytes_total{job=\"mpabi-node-exporter\",instance=\"$instance\",device=~\"nvme.*\"}[5m])) / 1024 / 1024", "legendFormat": "read {{device}}" },
|
||||
{ "refId": "J", "expr": "sum by (device) (rate(node_disk_written_bytes_total{job=\"mpabi-node-exporter\",instance=\"$instance\",device=~\"nvme.*\"}[5m])) / 1024 / 1024", "legendFormat": "write {{device}}" }
|
||||
],
|
||||
"fieldConfig": { "defaults": { "unit": "mbytes" }, "overrides": [] },
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 22 }
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"type": "timeseries",
|
||||
"title": "Network wg0 RX/TX (MiB/s)",
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [
|
||||
{ "refId": "K", "expr": "rate(node_network_receive_bytes_total{job=\"mpabi-node-exporter\",instance=\"$instance\",device=\"wg0\"}[5m]) / 1024 / 1024", "legendFormat": "rx" },
|
||||
{ "refId": "L", "expr": "rate(node_network_transmit_bytes_total{job=\"mpabi-node-exporter\",instance=\"$instance\",device=\"wg0\"}[5m]) / 1024 / 1024", "legendFormat": "tx" }
|
||||
],
|
||||
"fieldConfig": { "defaults": { "unit": "mbytes" }, "overrides": [] },
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 22 }
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"type": "timeseries",
|
||||
"title": "Geyser: Subscriber Queue Size",
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [{ "refId": "M", "expr": "grpc_subscriber_queue_size{job=\"mpabi-yellowstone-geyser\"}", "legendFormat": "{{subscriber_id}}" }],
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 30 }
|
||||
},
|
||||
{
|
||||
"id": 12,
|
||||
"type": "timeseries",
|
||||
"title": "Geyser: Connections Total",
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [{ "refId": "N", "expr": "connections_total{job=\"mpabi-yellowstone-geyser\"}" }],
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 30 }
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"type": "timeseries",
|
||||
"title": "Geyser: Bytes Sent (MiB/s)",
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [{ "refId": "O", "expr": "rate(grpc_bytes_sent{job=\"mpabi-yellowstone-geyser\"}[5m]) / 1024 / 1024", "legendFormat": "{{subscriber_id}}" }],
|
||||
"fieldConfig": { "defaults": { "unit": "mbytes" }, "overrides": [] },
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 38 }
|
||||
},
|
||||
{
|
||||
"id": 14,
|
||||
"type": "timeseries",
|
||||
"title": "Geyser: Messages Sent (/s)",
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [{ "refId": "P", "expr": "rate(grpc_message_sent_count{job=\"mpabi-yellowstone-geyser\"}[5m])", "legendFormat": "{{subscriber_id}}" }],
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 38 }
|
||||
},
|
||||
{
|
||||
"id": 15,
|
||||
"type": "timeseries",
|
||||
"title": "Geyser: Disconnects (increase 15m)",
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [{ "refId": "Q", "expr": "sum by (reason) (increase(grpc_client_disconnects_total{job=\"mpabi-yellowstone-geyser\"}[15m]))", "legendFormat": "{{reason}}" }],
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 46 }
|
||||
},
|
||||
{
|
||||
"id": 16,
|
||||
"type": "stat",
|
||||
"title": "RPC Slot Lag (slots)",
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [{ "refId": "R", "expr": "solana_rpc_slot_lag{job=\"mpabi-node-exporter\",instance=\"$instance\"}" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 20 },
|
||||
{ "color": "red", "value": 50 }
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 12, "x": 0, "y": 54 }
|
||||
},
|
||||
{
|
||||
"id": 17,
|
||||
"type": "stat",
|
||||
"title": "RPC Slot Lag (szac. minuty)",
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [{ "refId": "S", "expr": "solana_rpc_slot_lag{job=\"mpabi-node-exporter\",instance=\"$instance\"} * 0.4 / 60" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "min",
|
||||
"decimals": 2,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 2 }
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 12, "x": 12, "y": 54 }
|
||||
},
|
||||
{
|
||||
"id": 18,
|
||||
"type": "timeseries",
|
||||
"title": "RPC Slot & Reference Slot",
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"targets": [
|
||||
{ "refId": "T", "expr": "solana_rpc_slot{job=\"mpabi-node-exporter\",instance=\"$instance\"}" },
|
||||
{ "refId": "U", "expr": "solana_rpc_slot_reference{job=\"mpabi-node-exporter\",instance=\"$instance\"}" },
|
||||
{ "refId": "V", "expr": "solana_rpc_block_height{job=\"mpabi-node-exporter\",instance=\"$instance\"}" }
|
||||
],
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 60 }
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
apiVersion: traefik.io/v1alpha1
|
||||
kind: IngressRoute
|
||||
metadata:
|
||||
name: grafana-http
|
||||
namespace: monitoring
|
||||
spec:
|
||||
entryPoints:
|
||||
- web
|
||||
routes:
|
||||
- match: Host(`grafana.mpabi.pl`)
|
||||
kind: Rule
|
||||
middlewares:
|
||||
- name: redirect-to-https
|
||||
services:
|
||||
- name: monitoring-stack-grafana
|
||||
port: 80
|
||||
16
kustomize/infra/monitoring-extras/ingressroute-grafana.yaml
Normal file
16
kustomize/infra/monitoring-extras/ingressroute-grafana.yaml
Normal file
@@ -0,0 +1,16 @@
|
||||
apiVersion: traefik.io/v1alpha1
|
||||
kind: IngressRoute
|
||||
metadata:
|
||||
name: grafana
|
||||
namespace: monitoring
|
||||
spec:
|
||||
entryPoints:
|
||||
- websecure
|
||||
routes:
|
||||
- match: Host(`grafana.mpabi.pl`)
|
||||
kind: Rule
|
||||
services:
|
||||
- name: monitoring-stack-grafana
|
||||
port: 80
|
||||
tls:
|
||||
secretName: monitoring-mpabi-pl-tls
|
||||
@@ -0,0 +1,16 @@
|
||||
apiVersion: traefik.io/v1alpha1
|
||||
kind: IngressRoute
|
||||
metadata:
|
||||
name: prometheus-http
|
||||
namespace: monitoring
|
||||
spec:
|
||||
entryPoints:
|
||||
- web
|
||||
routes:
|
||||
- match: Host(`prometheus.mpabi.pl`)
|
||||
kind: Rule
|
||||
middlewares:
|
||||
- name: redirect-to-https
|
||||
services:
|
||||
- name: monitoring-stack-kube-prom-prometheus
|
||||
port: 9090
|
||||
@@ -0,0 +1,16 @@
|
||||
apiVersion: traefik.io/v1alpha1
|
||||
kind: IngressRoute
|
||||
metadata:
|
||||
name: prometheus
|
||||
namespace: monitoring
|
||||
spec:
|
||||
entryPoints:
|
||||
- websecure
|
||||
routes:
|
||||
- match: Host(`prometheus.mpabi.pl`)
|
||||
kind: Rule
|
||||
services:
|
||||
- name: monitoring-stack-kube-prom-prometheus
|
||||
port: 9090
|
||||
tls:
|
||||
secretName: monitoring-mpabi-pl-tls
|
||||
11
kustomize/infra/monitoring-extras/kustomization.yaml
Normal file
11
kustomize/infra/monitoring-extras/kustomization.yaml
Normal file
@@ -0,0 +1,11 @@
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
resources:
|
||||
- middleware-redirect-to-https.yaml
|
||||
- certificate-monitoring-mpabi-pl.yaml
|
||||
- ingressroute-grafana.yaml
|
||||
- ingressroute-grafana-http.yaml
|
||||
- ingressroute-prometheus.yaml
|
||||
- ingressroute-prometheus-http.yaml
|
||||
- dashboard-agave-status.yaml
|
||||
- prometheus-rules-agave.yaml
|
||||
@@ -0,0 +1,9 @@
|
||||
apiVersion: traefik.io/v1alpha1
|
||||
kind: Middleware
|
||||
metadata:
|
||||
name: redirect-to-https
|
||||
namespace: monitoring
|
||||
spec:
|
||||
redirectScheme:
|
||||
scheme: https
|
||||
permanent: true
|
||||
@@ -0,0 +1,61 @@
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: agave-rpc-alerts
|
||||
namespace: monitoring
|
||||
labels:
|
||||
app: kube-prometheus-stack
|
||||
release: monitoring-stack
|
||||
spec:
|
||||
groups:
|
||||
- name: agave-rpc-health
|
||||
rules:
|
||||
- alert: AgaveRPCDown
|
||||
expr: "max by (instance) (solana_rpc_up{job=\"mpabi-node-exporter\"}) == 0"
|
||||
for: 30s
|
||||
labels:
|
||||
severity: critical
|
||||
team: mpabi
|
||||
annotations:
|
||||
summary: "Agave RPC is unreachable"
|
||||
description: "RPC probe from node exporter reports solana_rpc_up == 0 for instance {{ $labels.instance }}."
|
||||
- alert: AgaveRPCSlotLagHigh
|
||||
expr: "sum by (instance) (solana_rpc_slot_lag{job=\"mpabi-node-exporter\"}) > 50"
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
team: mpabi
|
||||
annotations:
|
||||
summary: "Agave RPC is lagging behind cluster"
|
||||
description: "Current slot lag is {{ $value }} for instance {{ $labels.instance }}. Reference endpoint in probe config may be misconfigured or validator is behind."
|
||||
- alert: AgaveRPCSlotLagCritical
|
||||
expr: "sum by (instance) (solana_rpc_slot_lag{job=\"mpabi-node-exporter\"}) > 500"
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
team: mpabi
|
||||
annotations:
|
||||
summary: "Agave RPC severe lag"
|
||||
description: "Slot lag is critically high ({{ $value }} slots) on instance {{ $labels.instance }}."
|
||||
- alert: AgaveIOHigh
|
||||
expr: |
|
||||
sum by (instance) (
|
||||
(rate(node_disk_read_bytes_total{job="mpabi-node-exporter",device=~"nvme.*"}[5m]) +
|
||||
rate(node_disk_written_bytes_total{job="mpabi-node-exporter",device=~"nvme.*"}[5m])) / 1024 / 1024
|
||||
) > 300
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
team: mpabi
|
||||
annotations:
|
||||
summary: "High storage I/O on Agave node"
|
||||
description: "Combined NVMe read+write throughput >300 MiB/s for 5m on {{ $labels.instance }}. Check disk pressure and Geyser/ledger workload."
|
||||
- alert: AgaveIOWaitHigh
|
||||
expr: "avg by (instance) (rate(node_cpu_seconds_total{job=\"mpabi-node-exporter\",mode=\"iowait\"}[5m])) > 0.2"
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
team: mpabi
|
||||
annotations:
|
||||
summary: "High iowait on Agave node"
|
||||
description: "Iowait over 20% on average for 5m on {{ $labels.instance }}. Storage latency is likely impacting slot progress."
|
||||
14
kustomize/overlays/prod/dlob-rpc-endpoint-patch.yaml
Normal file
14
kustomize/overlays/prod/dlob-rpc-endpoint-patch.yaml
Normal file
@@ -0,0 +1,14 @@
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: dlob-publisher
|
||||
spec:
|
||||
template:
|
||||
spec:
|
||||
containers:
|
||||
- name: publisher
|
||||
env:
|
||||
- name: ENDPOINT
|
||||
value: "http://10.66.66.1:8899"
|
||||
- name: WS_ENDPOINT
|
||||
value: "ws://10.66.66.1:8900"
|
||||
14
kustomize/overlays/prod/dlob-rpc-server-endpoint-patch.yaml
Normal file
14
kustomize/overlays/prod/dlob-rpc-server-endpoint-patch.yaml
Normal file
@@ -0,0 +1,14 @@
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: dlob-server
|
||||
spec:
|
||||
template:
|
||||
spec:
|
||||
containers:
|
||||
- name: server
|
||||
env:
|
||||
- name: ENDPOINT
|
||||
value: "http://10.66.66.1:8899"
|
||||
- name: WS_ENDPOINT
|
||||
value: "ws://10.66.66.1:8900"
|
||||
@@ -7,6 +7,8 @@ resources:
|
||||
- ../../base
|
||||
|
||||
patchesStrategicMerge:
|
||||
- dlob-rpc-endpoint-patch.yaml
|
||||
- dlob-rpc-server-endpoint-patch.yaml
|
||||
- frontend-graphql-proxy-patch.yaml
|
||||
|
||||
configMapGenerator:
|
||||
|
||||
@@ -66,7 +66,7 @@ function resolveConfig() {
|
||||
const hasuraAdminSecret = envString('HASURA_ADMIN_SECRET', '');
|
||||
if (!hasuraAdminSecret) throw new Error('Missing HASURA_ADMIN_SECRET');
|
||||
|
||||
const markets = envList('DLOB_MARKETS', 'PUMP-PERP,SOL-PERP,1MBONK-PERP,BTC-PERP,ETH-PERP');
|
||||
const markets = envList('DLOB_MARKETS', 'SOL-PERP,DOGE-PERP,JUP-PERP');
|
||||
const pollMs = envInt('TICKS_POLL_MS', 1000, { min: 250, max: 60_000 });
|
||||
const source = envString('TICKS_SOURCE', 'dlob_stats');
|
||||
|
||||
|
||||
14
kustomize/overlays/staging/dlob-rpc-endpoint-patch.yaml
Normal file
14
kustomize/overlays/staging/dlob-rpc-endpoint-patch.yaml
Normal file
@@ -0,0 +1,14 @@
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: dlob-publisher
|
||||
spec:
|
||||
template:
|
||||
spec:
|
||||
containers:
|
||||
- name: publisher
|
||||
env:
|
||||
- name: ENDPOINT
|
||||
value: "http://10.66.66.1:8899"
|
||||
- name: WS_ENDPOINT
|
||||
value: "ws://10.66.66.1:8900"
|
||||
@@ -0,0 +1,14 @@
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: dlob-server
|
||||
spec:
|
||||
template:
|
||||
spec:
|
||||
containers:
|
||||
- name: server
|
||||
env:
|
||||
- name: ENDPOINT
|
||||
value: "http://10.66.66.1:8899"
|
||||
- name: WS_ENDPOINT
|
||||
value: "ws://10.66.66.1:8900"
|
||||
@@ -18,7 +18,7 @@ spec:
|
||||
name: trade-hasura
|
||||
key: HASURA_GRAPHQL_ADMIN_SECRET
|
||||
- name: DLOB_MARKETS
|
||||
value: PUMP-PERP,SOL-PERP,1MBONK-PERP,BTC-PERP,ETH-PERP
|
||||
value: SOL-PERP,DOGE-PERP,JUP-PERP
|
||||
- name: TICKS_POLL_MS
|
||||
value: "1000"
|
||||
- name: TICKS_SOURCE
|
||||
|
||||
@@ -10,6 +10,8 @@ resources:
|
||||
- frontend-ingress-root.yaml
|
||||
|
||||
patchesStrategicMerge:
|
||||
- dlob-rpc-endpoint-patch.yaml
|
||||
- dlob-rpc-server-endpoint-patch.yaml
|
||||
- hasura-patch.yaml
|
||||
- frontend-auth-patch.yaml
|
||||
- frontend-graphql-proxy-patch.yaml
|
||||
|
||||
Reference in New Issue
Block a user