ops(sol): add offline canary recovery path
All checks were successful
deploy-trade-r001-canary / apply (push) Successful in 6m45s
All checks were successful
deploy-trade-r001-canary / apply (push) Successful in 6m45s
This commit is contained in:
@@ -45,6 +45,7 @@ Minimal canary namespace for migration baseline `R001` on `sol`.
|
||||
- `trade-api` and `trade-frontend` use the current live images from Gitea registry and the same bootstrap wrapper/config pattern as the source environment.
|
||||
- `dlob-publisher-hot` now targets the host validator on `sol` through `trade-infra` services and writes `dlob-hot:*` into the shared Redis host service.
|
||||
- `dlob-publisher-all` now targets the same host validator path on `sol` and writes `dlob-all:*` into the shared Redis host service.
|
||||
- `dlob-publisher-hot` and `dlob-publisher-all` use `/startup` for Kubernetes liveness on `sol`; the internal `/health` endpoint stayed noisy on self-hosted Agave while downstream `drift_ticks` data remained fresh, so the operator smoke check is the stronger health gate for canary rollouts.
|
||||
- `dlob-hot-redis-to-postgres-raw-writer` and `dlob-hot-postgres-to-postgres-derived-writer` rebuild the first live DLOB derived path on `sol`.
|
||||
- `dlob-all-redis-to-postgres-derived-writer` rebuilds the live full-market derived DLOB path on `sol`.
|
||||
- The canary workflow re-runs:
|
||||
@@ -67,6 +68,26 @@ kubectl apply -k environments/sol/trade-infra
|
||||
./environments/sol/trade-r001-canary/scripts/create-gitea-registry-secret.sh
|
||||
./environments/sol/trade-r001-canary/scripts/create-trade-dlob-rpc-secret.sh
|
||||
./environments/sol/trade-r001-canary/scripts/sync-live-secrets.sh
|
||||
./environments/sol/trade-r001-canary/scripts/snapshot-sol-secrets.sh
|
||||
./environments/sol/trade-r001-canary/scripts/check-sol-canary-smoke.sh
|
||||
```
|
||||
|
||||
After the prerequisites are seeded, push to `main` and let `deploy-trade-r001-canary` apply the environment.
|
||||
|
||||
The smoke check script validates:
|
||||
|
||||
- `agave-validator` and `k3s` service state on `sol`
|
||||
- Agave RPC lag and health
|
||||
- deployment readiness in `trade-r001-canary`
|
||||
- derived DLOB and `drift_ticks` freshness in Postgres
|
||||
- `trade-api` read-path and `trade-frontend` HTTP response
|
||||
|
||||
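Bootstrap scripts no longer depend on `mevnode_bot` being reachable: when `SOURCE_DIR` is unset and `$HOME/.local/share/trade-bootstrap/sol/trade-r001-canary-secrets` exists, they read secrets from that local snapshot instead of the source host. Take the snapshot while the canary is healthy, or pass `SOURCE_DIR` explicitly to use a specific directory: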
|
||||
|
||||
```bash
|
||||
./environments/sol/trade-r001-canary/scripts/snapshot-sol-secrets.sh
|
||||
SOURCE_DIR="$HOME/.local/share/trade-bootstrap/sol/trade-r001-canary-secrets" \
|
||||
./environments/sol/trade-r001-canary/scripts/prepare-sol-postgres.sh
|
||||
SOURCE_DIR="$HOME/.local/share/trade-bootstrap/sol/trade-r001-canary-secrets" \
|
||||
./environments/sol/trade-r001-canary/scripts/sync-live-secrets.sh
|
||||
```
|
||||
|
||||
@@ -108,7 +108,7 @@ spec:
|
||||
failureThreshold: 180
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /health
|
||||
path: /startup
|
||||
port: http
|
||||
initialDelaySeconds: 30
|
||||
periodSeconds: 20
|
||||
|
||||
@@ -127,7 +127,7 @@ spec:
|
||||
failureThreshold: 30
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /health
|
||||
path: /startup
|
||||
port: http
|
||||
initialDelaySeconds: 240
|
||||
periodSeconds: 20
|
||||
|
||||
130
environments/sol/trade-r001-canary/scripts/check-sol-canary-smoke.sh
Executable file
130
environments/sol/trade-r001-canary/scripts/check-sol-canary-smoke.sh
Executable file
@@ -0,0 +1,130 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
TARGET_HOST="${TARGET_HOST:-mevnode}"
|
||||
TARGET_NAMESPACE="${TARGET_NAMESPACE:-trade-r001-canary}"
|
||||
MAX_AGAVE_LAG="${MAX_AGAVE_LAG:-50}"
|
||||
|
||||
# Run a command on the canary host over SSH.
# Globals:   TARGET_HOST (read) - SSH destination
# Arguments: forwarded unchanged to ssh as the remote command
# Note:      ssh is invoked with StrictHostKeyChecking=no.
ssh_target() {
  local -a ssh_opts=(-o StrictHostKeyChecking=no)
  ssh "${ssh_opts[@]}" "$TARGET_HOST" "$@"
}
|
||||
|
||||
# Run an inline script on the target host in a remote login shell (bash -lc).
# Globals:   TARGET_NAMESPACE, MAX_AGAVE_LAG (read) - exported to the remote
#            script as environment assignments in front of the bash command
# Arguments: $1 - script text to execute remotely
# Every interpolated value is %q-quoted so it survives the parsing done by the
# remote shell that ssh hands the command line to.
remote_bash() {
  local script="$1"
  local remote_cmd
  printf -v remote_cmd 'TARGET_NAMESPACE=%q MAX_AGAVE_LAG=%q bash -lc %q' \
    "$TARGET_NAMESPACE" "$MAX_AGAVE_LAG" "$script"
  ssh_target "$remote_cmd"
}
|
||||
|
||||
# Run the script supplied on stdin on the target host (remote `bash -s`).
# Globals:   TARGET_NAMESPACE, MAX_AGAVE_LAG (read) - exported to the remote
#            script as %q-quoted environment assignments
# Inputs:    stdin - script text, typically a quoted here-document
remote_bash_stdin() {
  local remote_cmd
  printf -v remote_cmd 'TARGET_NAMESPACE=%q MAX_AGAVE_LAG=%q bash -s' \
    "$TARGET_NAMESPACE" "$MAX_AGAVE_LAG"
  ssh_target "$remote_cmd"
}
|
||||
|
||||
# [1/5] Print the remote clock, then require that the agave-validator and k3s
# systemd units both report "active"; the remote script stops with a non-zero
# status at the first unit that does not.
echo "[1/5] Host services"
host_services_script='
set -euo pipefail
echo "time=$(date --iso-8601=seconds)"
for svc in agave-validator k3s; do
  state="$(systemctl is-active "$svc")"
  echo "$svc=$state"
  test "$state" = active
done
'
remote_bash "$host_services_script"
|
||||
|
||||
echo
echo "[2/5] Agave RPC lag"
# Compare the local validator slot (127.0.0.1:8899) with the public mainnet RPC
# and require getHealth == ok and lag <= MAX_AGAVE_LAG.
# Every curl call is time-bounded so an unresponsive endpoint fails the check
# instead of hanging it, and both slots are validated before the arithmetic so
# an RPC reply without .result produces a readable error.
remote_bash '
set -euo pipefail
req='\''{"jsonrpc":"2.0","id":1,"method":"getSlot"}'\''
health_req='\''{"jsonrpc":"2.0","id":1,"method":"getHealth"}'\''
rpc_post() {
  curl -fsS --connect-timeout 5 --max-time 20 -H "Content-Type: application/json" --data-binary "$2" "$1"
}
local_slot="$(rpc_post http://127.0.0.1:8899 "$req" | jq -r .result)"
ref_slot="$(rpc_post https://api.mainnet-beta.solana.com "$req" | jq -r .result)"
health="$(rpc_post http://127.0.0.1:8899 "$health_req" | jq -r ".result // .error.message")"
echo "local_slot=$local_slot"
echo "reference_slot=$ref_slot"
for slot in "$local_slot" "$ref_slot"; do
  case "$slot" in
    ""|*[!0-9]*)
      echo "getSlot returned a non-numeric slot" >&2
      exit 1
      ;;
  esac
done
lag="$((ref_slot-local_slot))"
echo "lag=$lag"
echo "health=$health"
test "$health" = ok
test "$lag" -le "$MAX_AGAVE_LAG"
'
|
||||
|
||||
echo
echo "[3/5] Canary deployments"
# Wait up to 180s for every deployment in the namespace to become Available,
# print the deployment table, then fail if any deployment has fewer ready or
# available replicas than its spec asks for.
deployments_script='
set -euo pipefail
sudo k3s kubectl -n "$TARGET_NAMESPACE" wait --for=condition=Available --timeout=180s deploy --all >/dev/null
sudo k3s kubectl -n "$TARGET_NAMESPACE" get deploy -o wide
bad="$(sudo k3s kubectl -n "$TARGET_NAMESPACE" get deploy -o json | jq -r ".items[] | select((.status.readyReplicas // 0) != (.spec.replicas // 1) or (.status.availableReplicas // 0) != (.spec.replicas // 1)) | .metadata.name")"
if [ -n "$bad" ]; then
  echo "deployments_not_ready=$bad" >&2
  exit 1
fi
'
remote_bash "$deployments_script"
|
||||
|
||||
echo
echo "[4/5] Database freshness"
# Read the Postgres credentials from the trade-postgres secret, print row counts
# for the derived DLOB tables and the last 15 minutes of drift_ticks plus the
# newest tick, and fail when any of the three counts is missing or zero.
# Printing alone would let an empty or stale database pass the smoke check.
remote_bash_stdin <<'REMOTE'
set -euo pipefail
pg_user="$(sudo k3s kubectl -n "$TARGET_NAMESPACE" get secret trade-postgres -o jsonpath="{.data.POSTGRES_USER}" | base64 -d)"
pg_pass="$(sudo k3s kubectl -n "$TARGET_NAMESPACE" get secret trade-postgres -o jsonpath="{.data.POSTGRES_PASSWORD}" | base64 -d)"
pg_db="$(sudo k3s kubectl -n "$TARGET_NAMESPACE" get secret trade-postgres -o jsonpath="{.data.POSTGRES_DB}" | base64 -d)"
export PGPASSWORD="$pg_pass"
sql="$(cat <<'SQL'
SELECT 'dlob_hot_derived_latest|' || count(*) FROM dlob_hot_derived_latest;
SELECT 'dlob_all_derived_latest|' || count(*) FROM dlob_all_derived_latest;
SELECT 'drift_ticks_15m|' || count(*) FROM drift_ticks WHERE ts >= now() - interval '15 minutes';
SELECT 'latest_tick|' || symbol || '|' || source || '|' || COALESCE(raw->>'from','<null>') || '|' || to_char(ts, 'YYYY-MM-DD HH24:MI:SS')
FROM drift_ticks
ORDER BY ts DESC
LIMIT 1;
SQL
)"
report="$(psql -h 127.0.0.1 -U "$pg_user" -d "$pg_db" -Atqc "$sql")"
printf '%s\n' "$report"
# Each count row has the form "label|count"; a label that is absent from the
# report is treated the same as a zero count.
for label in dlob_hot_derived_latest dlob_all_derived_latest drift_ticks_15m; do
  count="$(printf '%s\n' "$report" | awk -F'|' -v key="$label" '$1 == key { print $2 }')"
  case "$count" in
    ''|*[!0-9]*|0)
      echo "freshness_check_failed=$label count=${count:-missing}" >&2
      exit 1
      ;;
  esac
done
REMOTE
|
||||
|
||||
echo
echo "[5/5] API and frontend"
# Read the frontend read token from the trade-frontend-tokens secret, then run a
# Node.js probe inside the first trade-ingestor pod: it fetches two trade-api
# endpoints with the bearer token and the trade-frontend root page, each with a
# 10s timeout, and exits non-zero on any failure.
remote_bash_stdin <<'REMOTE'
set -euo pipefail
token="$(sudo k3s kubectl -n "$TARGET_NAMESPACE" get secret trade-frontend-tokens -o jsonpath="{.data.read\.json}" | base64 -d | jq -r .token)"
pod_name="$(sudo k3s kubectl -n "$TARGET_NAMESPACE" get pod -l app.kubernetes.io/name=trade-ingestor -o jsonpath="{.items[0].metadata.name}")"
sudo k3s kubectl -n "$TARGET_NAMESPACE" exec -i "$pod_name" -- env API_TOKEN="$token" node - <<'JS'
const headers = { Authorization: `Bearer ${process.env.API_TOKEN}` };

// GET a JSON endpoint; succeeds only for a 2xx response whose body has ok=true.
async function getJson(url) {
  const response = await fetch(url, {
    headers,
    signal: AbortSignal.timeout(10000),
  });
  // Read the body as text first so a non-JSON error page still reports the
  // URL and HTTP status instead of a bare JSON parse error.
  const body = await response.text();
  let payload;
  try {
    payload = JSON.parse(body);
  } catch {
    throw new Error(`${url} failed: ${response.status} non-JSON body: ${body.slice(0, 200)}`);
  }
  if (!response.ok || !payload?.ok) {
    throw new Error(`${url} failed: ${response.status} ${JSON.stringify(payload)}`);
  }
  return payload;
}

(async () => {
  const ticks = await getJson('http://trade-api:8787/v1/ticks?symbol=SOL-PERP&limit=3');
  const chart = await getJson('http://trade-api:8787/v1/chart?symbol=SOL-PERP&tf=1m&limit=5');
  const frontend = await fetch('http://trade-frontend:8081/', {
    signal: AbortSignal.timeout(10000),
  });
  const html = await frontend.text();
  if (!frontend.ok || !/<html/i.test(html)) {
    throw new Error(`frontend failed: ${frontend.status}`);
  }
  console.log(JSON.stringify({
    ticks_last: ticks.ticks?.at(-1) || null,
    chart_last: chart.candles?.at(-1) || null,
    frontend: { status: frontend.status, bytes: html.length },
  }, null, 2));
})().catch((err) => {
  console.error(String(err && err.message ? err.message : err));
  process.exit(1);
});
JS
REMOTE

echo
echo "Smoke check passed for ${TARGET_NAMESPACE} on ${TARGET_HOST}"
|
||||
@@ -1,8 +1,10 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
DEFAULT_SOURCE_DIR="${HOME}/.local/share/trade-bootstrap/sol/trade-r001-canary-secrets"
|
||||
SOURCE_HOST="${SOURCE_HOST:-mevnode_bot}"
|
||||
SOURCE_NAMESPACE="${SOURCE_NAMESPACE:-trade-staging}"
|
||||
SOURCE_DIR="${SOURCE_DIR:-}"
|
||||
TARGET_HOST="${TARGET_HOST:-mevnode}"
|
||||
PG_VERSION="${PG_VERSION:-16}"
|
||||
|
||||
@@ -14,7 +16,15 @@ ssh_target() {
|
||||
ssh -o StrictHostKeyChecking=no "$TARGET_HOST" "$@"
|
||||
}
|
||||
|
||||
SRC_SECRET_JSON="$(ssh_source "sudo k3s kubectl -n ${SOURCE_NAMESPACE} get secret trade-postgres -o json")"
|
||||
if [ -z "$SOURCE_DIR" ] && [ -d "$DEFAULT_SOURCE_DIR" ]; then
|
||||
SOURCE_DIR="$DEFAULT_SOURCE_DIR"
|
||||
fi
|
||||
|
||||
if [ -n "$SOURCE_DIR" ]; then
|
||||
SRC_SECRET_JSON="$(cat "${SOURCE_DIR}/trade-postgres.json")"
|
||||
else
|
||||
SRC_SECRET_JSON="$(ssh_source "sudo k3s kubectl -n ${SOURCE_NAMESPACE} get secret trade-postgres -o json")"
|
||||
fi
|
||||
POSTGRES_USER="$(printf '%s' "$SRC_SECRET_JSON" | jq -r '.data.POSTGRES_USER' | base64 -d)"
|
||||
POSTGRES_PASSWORD="$(printf '%s' "$SRC_SECRET_JSON" | jq -r '.data.POSTGRES_PASSWORD' | base64 -d)"
|
||||
POSTGRES_DB="$(printf '%s' "$SRC_SECRET_JSON" | jq -r '.data.POSTGRES_DB' | base64 -d)"
|
||||
|
||||
32
environments/sol/trade-r001-canary/scripts/snapshot-sol-secrets.sh
Executable file
32
environments/sol/trade-r001-canary/scripts/snapshot-sol-secrets.sh
Executable file
@@ -0,0 +1,32 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
TARGET_HOST="${TARGET_HOST:-mevnode}"
|
||||
TARGET_NAMESPACE="${TARGET_NAMESPACE:-trade-r001-canary}"
|
||||
SNAPSHOT_DIR="${SNAPSHOT_DIR:-$HOME/.local/share/trade-bootstrap/sol/trade-r001-canary-secrets}"
|
||||
|
||||
SECRETS=(
|
||||
gitea-registry
|
||||
trade-api
|
||||
trade-basic-auth
|
||||
trade-dlob-rpc
|
||||
trade-frontend-tokens
|
||||
trade-hasura
|
||||
trade-ingestor-tokens
|
||||
trade-postgres
|
||||
)
|
||||
|
||||
# Execute the given command line on TARGET_HOST through ssh.
# Globals:   TARGET_HOST (read)
# Arguments: passed through to ssh as the remote command
# Note:      ssh is invoked with StrictHostKeyChecking=no.
ssh_target() {
  local host_key_opt="StrictHostKeyChecking=no"
  ssh -o "$host_key_opt" "$TARGET_HOST" "$@"
}
|
||||
|
||||
# Fetch every secret in SECRETS from TARGET_NAMESPACE and store it as
# ${SNAPSHOT_DIR}/<name>.json with mode 600.
# Each manifest is first written to a private temp file and only renamed into
# place after a successful, non-empty fetch, so a failed ssh/kubectl call never
# leaves a truncated snapshot behind for later reads of <name>.json.
install -d -m 700 "$SNAPSHOT_DIR"
# Anything created from here on is owner-only from the moment it exists.
umask 077

tmp_file=""
# Remove a half-written temp file on any exit path.
trap '[ -z "$tmp_file" ] || rm -f -- "$tmp_file"' EXIT

for secret_name in "${SECRETS[@]}"; do
  snapshot_file="${SNAPSHOT_DIR}/${secret_name}.json"
  tmp_file="$(mktemp "${snapshot_file}.XXXXXX")"
  if ! ssh_target "sudo k3s kubectl -n ${TARGET_NAMESPACE} get secret ${secret_name} -o json" > "$tmp_file" \
    || [ ! -s "$tmp_file" ]; then
    echo "Failed to snapshot ${secret_name}; ${snapshot_file} was left unchanged" >&2
    exit 1
  fi
  chmod 600 "$tmp_file"
  mv -f -- "$tmp_file" "$snapshot_file"
  tmp_file=""
  echo "Snapshotted ${secret_name} to ${SNAPSHOT_DIR}/${secret_name}.json"
done

echo "Secret snapshot ready at ${SNAPSHOT_DIR}"
|
||||
@@ -1,8 +1,10 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
DEFAULT_SOURCE_DIR="${HOME}/.local/share/trade-bootstrap/sol/trade-r001-canary-secrets"
|
||||
SOURCE_HOST="${SOURCE_HOST:-mevnode_bot}"
|
||||
SOURCE_NAMESPACE="${SOURCE_NAMESPACE:-trade-staging}"
|
||||
SOURCE_DIR="${SOURCE_DIR:-}"
|
||||
TARGET_HOST="${TARGET_HOST:-mevnode}"
|
||||
TARGET_NAMESPACE="${TARGET_NAMESPACE:-trade-r001-canary}"
|
||||
|
||||
@@ -23,10 +25,23 @@ ssh_target() {
|
||||
ssh -o StrictHostKeyChecking=no "$TARGET_HOST" "$@"
|
||||
}
|
||||
|
||||
if [ -z "$SOURCE_DIR" ] && [ -d "$DEFAULT_SOURCE_DIR" ]; then
|
||||
SOURCE_DIR="$DEFAULT_SOURCE_DIR"
|
||||
fi
|
||||
|
||||
# Write the JSON manifest of one secret to stdout.
# Globals:   SOURCE_DIR (read), SOURCE_NAMESPACE (read)
# Arguments: $1 - secret name
# Outputs:   the contents of ${SOURCE_DIR}/<name>.json when SOURCE_DIR is
#            non-empty; otherwise the output of `kubectl get secret -o json`
#            executed through ssh_source.
# Returns:   the exit status of whichever command produced the manifest.
get_secret_json() {
  local secret_name="$1"
  if [ -z "$SOURCE_DIR" ]; then
    ssh_source "sudo k3s kubectl -n ${SOURCE_NAMESPACE} get secret ${secret_name} -o json"
    return
  fi
  cat "${SOURCE_DIR}/${secret_name}.json"
}
|
||||
|
||||
ssh_target "sudo k3s kubectl get ns ${TARGET_NAMESPACE} >/dev/null 2>&1 || sudo k3s kubectl create ns ${TARGET_NAMESPACE} >/dev/null"
|
||||
|
||||
for secret_name in "${SECRETS[@]}"; do
|
||||
SECRET_JSON="$(ssh_source "sudo k3s kubectl -n ${SOURCE_NAMESPACE} get secret ${secret_name} -o json")"
|
||||
SECRET_JSON="$(get_secret_json "${secret_name}")"
|
||||
printf '%s' "$SECRET_JSON" \
|
||||
| jq --arg ns "$TARGET_NAMESPACE" 'del(.metadata.uid,.metadata.resourceVersion,.metadata.creationTimestamp,.metadata.managedFields,.metadata.ownerReferences,.metadata.selfLink,.metadata.annotations["kubectl.kubernetes.io/last-applied-configuration"]) | .metadata.namespace = $ns' \
|
||||
| ssh_target "sudo k3s kubectl apply -f - >/dev/null"
|
||||
|
||||
Reference in New Issue
Block a user