diff --git a/environments/sol/trade-r001-canary/README.md b/environments/sol/trade-r001-canary/README.md index d6036fb..06ab3fb 100644 --- a/environments/sol/trade-r001-canary/README.md +++ b/environments/sol/trade-r001-canary/README.md @@ -45,6 +45,7 @@ Minimal canary namespace for migration baseline `R001` on `sol`. - `trade-api` and `trade-frontend` use the current live images from Gitea registry and the same bootstrap wrapper/config pattern as the source environment. - `dlob-publisher-hot` now targets the host validator on `sol` through `trade-infra` services and writes `dlob-hot:*` into the shared Redis host service. - `dlob-publisher-all` now targets the same host validator path on `sol` and writes `dlob-all:*` into the shared Redis host service. +- `dlob-publisher-hot` and `dlob-publisher-all` use `/startup` for Kubernetes liveness on `sol`; the internal `/health` endpoint stayed noisy on self-hosted Agave while downstream `drift_ticks` data remained fresh, so the operator smoke check is the stronger health gate for canary rollouts. - `dlob-hot-redis-to-postgres-raw-writer` and `dlob-hot-postgres-to-postgres-derived-writer` rebuild the first live DLOB derived path on `sol`. - `dlob-all-redis-to-postgres-derived-writer` rebuilds the live full-market derived DLOB path on `sol`. - The canary workflow re-runs: @@ -67,6 +68,26 @@ kubectl apply -k environments/sol/trade-infra ./environments/sol/trade-r001-canary/scripts/create-gitea-registry-secret.sh ./environments/sol/trade-r001-canary/scripts/create-trade-dlob-rpc-secret.sh ./environments/sol/trade-r001-canary/scripts/sync-live-secrets.sh +./environments/sol/trade-r001-canary/scripts/snapshot-sol-secrets.sh +./environments/sol/trade-r001-canary/scripts/check-sol-canary-smoke.sh ``` After the prerequisites are seeded, push to `main` and let `deploy-trade-r001-canary` apply the environment. 
+ +The smoke check script connects to `TARGET_HOST` over SSH and waits up to 180 seconds for every deployment in the namespace to become Available, so run it once the canary has been deployed. It validates: + +- `agave-validator` and `k3s` service state on `sol` +- Agave RPC lag and health +- deployment readiness in `trade-r001-canary` +- derived DLOB and `drift_ticks` freshness in Postgres +- `trade-api` read-path and `trade-frontend` HTTP response + +Bootstrap scripts automatically prefer a local secret snapshot from `$HOME/.local/share/trade-bootstrap/sol/trade-r001-canary-secrets` whenever that directory exists (or `SOURCE_DIR` is set), even if `mevnode_bot` is still reachable, so bootstrapping keeps working once `mevnode_bot` is no longer available: + +```bash +./environments/sol/trade-r001-canary/scripts/snapshot-sol-secrets.sh +SOURCE_DIR="$HOME/.local/share/trade-bootstrap/sol/trade-r001-canary-secrets" \ + ./environments/sol/trade-r001-canary/scripts/prepare-sol-postgres.sh +SOURCE_DIR="$HOME/.local/share/trade-bootstrap/sol/trade-r001-canary-secrets" \ + ./environments/sol/trade-r001-canary/scripts/sync-live-secrets.sh +``` diff --git a/environments/sol/trade-r001-canary/dlob-publisher-all-deployment.yaml b/environments/sol/trade-r001-canary/dlob-publisher-all-deployment.yaml index 6c639c6..23e42ca 100644 --- a/environments/sol/trade-r001-canary/dlob-publisher-all-deployment.yaml +++ b/environments/sol/trade-r001-canary/dlob-publisher-all-deployment.yaml @@ -108,7 +108,7 @@ spec: failureThreshold: 180 livenessProbe: httpGet: - path: /health + path: /startup port: http initialDelaySeconds: 30 periodSeconds: 20 diff --git a/environments/sol/trade-r001-canary/dlob-publisher-hot-deployment.yaml b/environments/sol/trade-r001-canary/dlob-publisher-hot-deployment.yaml index eda7e9f..d9c299b 100644 --- a/environments/sol/trade-r001-canary/dlob-publisher-hot-deployment.yaml +++ b/environments/sol/trade-r001-canary/dlob-publisher-hot-deployment.yaml @@ -127,7 +127,7 @@ spec: failureThreshold: 30 livenessProbe: httpGet: - path: /health + path: /startup port: http initialDelaySeconds: 240 periodSeconds: 20 diff --git a/environments/sol/trade-r001-canary/scripts/check-sol-canary-smoke.sh 
b/environments/sol/trade-r001-canary/scripts/check-sol-canary-smoke.sh new file mode 100755 index 0000000..1445935 --- /dev/null +++ b/environments/sol/trade-r001-canary/scripts/check-sol-canary-smoke.sh @@ -0,0 +1,130 @@ +#!/usr/bin/env bash +set -euo pipefail + +TARGET_HOST="${TARGET_HOST:-mevnode}" +TARGET_NAMESPACE="${TARGET_NAMESPACE:-trade-r001-canary}" +MAX_AGAVE_LAG="${MAX_AGAVE_LAG:-50}" + +ssh_target() { + ssh -o StrictHostKeyChecking=no "$TARGET_HOST" "$@" +} + +remote_bash() { + local script="$1" + ssh_target \ + "TARGET_NAMESPACE=$(printf '%q' "$TARGET_NAMESPACE") MAX_AGAVE_LAG=$(printf '%q' "$MAX_AGAVE_LAG") bash -lc $(printf '%q' "$script")" +} + +remote_bash_stdin() { + ssh_target \ + "TARGET_NAMESPACE=$(printf '%q' "$TARGET_NAMESPACE") MAX_AGAVE_LAG=$(printf '%q' "$MAX_AGAVE_LAG") bash -s" +} + +echo "[1/5] Host services" +remote_bash ' + set -euo pipefail + echo "time=$(date --iso-8601=seconds)" + for svc in agave-validator k3s; do + state="$(systemctl is-active "$svc")" + echo "$svc=$state" + test "$state" = active + done +' + +echo +echo "[2/5] Agave RPC lag" +remote_bash ' + set -euo pipefail + req='\''{"jsonrpc":"2.0","id":1,"method":"getSlot"}'\'' + health_req='\''{"jsonrpc":"2.0","id":1,"method":"getHealth"}'\'' + local_slot="$(curl -fsS -H "Content-Type: application/json" --data-binary "$req" http://127.0.0.1:8899 | jq -r .result)" + ref_slot="$(curl -fsS -H "Content-Type: application/json" --data-binary "$req" https://api.mainnet-beta.solana.com | jq -r .result)" + health="$(curl -fsS -H "Content-Type: application/json" --data-binary "$health_req" http://127.0.0.1:8899 | jq -r ".result // .error.message")" + lag="$((ref_slot-local_slot))" + echo "local_slot=$local_slot" + echo "reference_slot=$ref_slot" + echo "lag=$lag" + echo "health=$health" + test "$health" = ok + test "$lag" -le "$MAX_AGAVE_LAG" +' + +echo +echo "[3/5] Canary deployments" +remote_bash ' + set -euo pipefail + sudo k3s kubectl -n "$TARGET_NAMESPACE" wait 
--for=condition=Available --timeout=180s deploy --all >/dev/null + sudo k3s kubectl -n "$TARGET_NAMESPACE" get deploy -o wide + bad="$(sudo k3s kubectl -n "$TARGET_NAMESPACE" get deploy -o json | jq -r ".items[] | select((.status.readyReplicas // 0) != (.spec.replicas // 1) or (.status.availableReplicas // 0) != (.spec.replicas // 1)) | .metadata.name")" + if [ -n "$bad" ]; then + echo "deployments_not_ready=$bad" >&2 + exit 1 + fi +' + +echo +echo "[4/5] Database freshness" +remote_bash_stdin <<'REMOTE' + set -euo pipefail + pg_user="$(sudo k3s kubectl -n "$TARGET_NAMESPACE" get secret trade-postgres -o jsonpath="{.data.POSTGRES_USER}" | base64 -d)" + pg_pass="$(sudo k3s kubectl -n "$TARGET_NAMESPACE" get secret trade-postgres -o jsonpath="{.data.POSTGRES_PASSWORD}" | base64 -d)" + pg_db="$(sudo k3s kubectl -n "$TARGET_NAMESPACE" get secret trade-postgres -o jsonpath="{.data.POSTGRES_DB}" | base64 -d)" + export PGPASSWORD="$pg_pass" + sql="$(cat <<'SQL' +SELECT 'dlob_hot_derived_latest|' || count(*) FROM dlob_hot_derived_latest; +SELECT 'dlob_all_derived_latest|' || count(*) FROM dlob_all_derived_latest; +SELECT 'drift_ticks_15m|' || count(*) FROM drift_ticks WHERE ts >= now() - interval '15 minutes'; +SELECT 'latest_tick|' || symbol || '|' || source || '|' || COALESCE(raw->>'from','') || '|' || to_char(ts, 'YYYY-MM-DD HH24:MI:SS') +FROM drift_ticks +ORDER BY ts DESC +LIMIT 1; +SQL +)" + psql -h 127.0.0.1 -U "$pg_user" -d "$pg_db" -v ON_ERROR_STOP=1 -Atq <<<"$sql" | awk -F"|" '{ print } $1 == "dlob_hot_derived_latest" || $1 == "dlob_all_derived_latest" || $1 == "drift_ticks_15m" { seen++; if ($2 + 0 < 1) { print "stale_or_empty=" $1 > "/dev/stderr"; bad = 1 } } END { exit (bad || seen != 3) }' # SQL goes in on stdin so every statement prints its row on any psql version; fail when a count is 0 or a count row is missing +REMOTE + +echo +echo "[5/5] API and frontend" +remote_bash_stdin <<'REMOTE' + set -euo pipefail + token="$(sudo k3s kubectl -n "$TARGET_NAMESPACE" get secret trade-frontend-tokens -o jsonpath="{.data.read\.json}" | base64 -d | jq -r .token)" + pod_name="$(sudo k3s kubectl -n "$TARGET_NAMESPACE" get pod -l app.kubernetes.io/name=trade-ingestor -o jsonpath="{.items[0].metadata.name}")" + sudo k3s kubectl -n "$TARGET_NAMESPACE" exec -i "$pod_name" -- env API_TOKEN="$token" node - <<'JS' +const headers = { Authorization: 
`Bearer ${process.env.API_TOKEN}` }; + +async function getJson(url) { + const response = await fetch(url, { + headers, + signal: AbortSignal.timeout(10000), + }); + const payload = await response.json(); + if (!response.ok || !payload?.ok) { + throw new Error(`${url} failed: ${response.status} ${JSON.stringify(payload)}`); + } + return payload; +} + +(async () => { + const ticks = await getJson('http://trade-api:8787/v1/ticks?symbol=SOL-PERP&limit=3'); + const chart = await getJson('http://trade-api:8787/v1/chart?symbol=SOL-PERP&tf=1m&limit=5'); + const frontend = await fetch('http://trade-frontend:8081/', { + signal: AbortSignal.timeout(10000), + }); + const html = await frontend.text(); + if (!frontend.ok || !/ { + console.error(String(err && err.message ? err.message : err)); + process.exit(1); +}); +JS +REMOTE + +echo +echo "Smoke check passed for ${TARGET_NAMESPACE} on ${TARGET_HOST}" diff --git a/environments/sol/trade-r001-canary/scripts/prepare-sol-postgres.sh b/environments/sol/trade-r001-canary/scripts/prepare-sol-postgres.sh index 7e521c6..cb90431 100755 --- a/environments/sol/trade-r001-canary/scripts/prepare-sol-postgres.sh +++ b/environments/sol/trade-r001-canary/scripts/prepare-sol-postgres.sh @@ -1,8 +1,10 @@ #!/usr/bin/env bash set -euo pipefail +DEFAULT_SOURCE_DIR="${HOME}/.local/share/trade-bootstrap/sol/trade-r001-canary-secrets" SOURCE_HOST="${SOURCE_HOST:-mevnode_bot}" SOURCE_NAMESPACE="${SOURCE_NAMESPACE:-trade-staging}" +SOURCE_DIR="${SOURCE_DIR:-}" TARGET_HOST="${TARGET_HOST:-mevnode}" PG_VERSION="${PG_VERSION:-16}" @@ -14,7 +16,15 @@ ssh_target() { ssh -o StrictHostKeyChecking=no "$TARGET_HOST" "$@" } -SRC_SECRET_JSON="$(ssh_source "sudo k3s kubectl -n ${SOURCE_NAMESPACE} get secret trade-postgres -o json")" +if [ -z "$SOURCE_DIR" ] && [ -d "$DEFAULT_SOURCE_DIR" ]; then + SOURCE_DIR="$DEFAULT_SOURCE_DIR" +fi + +if [ -n "$SOURCE_DIR" ]; then + SRC_SECRET_JSON="$(cat "${SOURCE_DIR}/trade-postgres.json")" +else + SRC_SECRET_JSON="$(ssh_source 
"sudo k3s kubectl -n ${SOURCE_NAMESPACE} get secret trade-postgres -o json")" +fi POSTGRES_USER="$(printf '%s' "$SRC_SECRET_JSON" | jq -r '.data.POSTGRES_USER' | base64 -d)" POSTGRES_PASSWORD="$(printf '%s' "$SRC_SECRET_JSON" | jq -r '.data.POSTGRES_PASSWORD' | base64 -d)" POSTGRES_DB="$(printf '%s' "$SRC_SECRET_JSON" | jq -r '.data.POSTGRES_DB' | base64 -d)" diff --git a/environments/sol/trade-r001-canary/scripts/snapshot-sol-secrets.sh b/environments/sol/trade-r001-canary/scripts/snapshot-sol-secrets.sh new file mode 100755 index 0000000..a89790b --- /dev/null +++ b/environments/sol/trade-r001-canary/scripts/snapshot-sol-secrets.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Snapshot the secrets listed in SECRETS from TARGET_NAMESPACE on TARGET_HOST into SNAPSHOT_DIR as <name>.json. +# Each file is created owner-only and moved into place only after the SSH call succeeded, so a failed run never leaves a truncated snapshot. + +TARGET_HOST="${TARGET_HOST:-mevnode}" +TARGET_NAMESPACE="${TARGET_NAMESPACE:-trade-r001-canary}" +SNAPSHOT_DIR="${SNAPSHOT_DIR:-$HOME/.local/share/trade-bootstrap/sol/trade-r001-canary-secrets}" + +SECRETS=( + gitea-registry + trade-api + trade-basic-auth + trade-dlob-rpc + trade-frontend-tokens + trade-hasura + trade-ingestor-tokens + trade-postgres +) + +ssh_target() { + ssh -o StrictHostKeyChecking=no "$TARGET_HOST" "$@" +} + +# Create every file below without group/other permissions from the start, not only after chmod. +umask 077 +install -d -m 700 "$SNAPSHOT_DIR" + +for secret_name in "${SECRETS[@]}"; do + target_file="${SNAPSHOT_DIR}/${secret_name}.json" + # Write to a temporary name first: if ssh fails, set -e aborts before the existing snapshot file is replaced. + ssh_target "sudo k3s kubectl -n ${TARGET_NAMESPACE} get secret ${secret_name} -o json" \ + > "${target_file}.tmp" + mv -f -- "${target_file}.tmp" "$target_file" + chmod 600 "$target_file" + echo "Snapshotted ${secret_name} to ${SNAPSHOT_DIR}/${secret_name}.json" +done + +echo "Secret snapshot ready at ${SNAPSHOT_DIR}" diff --git a/environments/sol/trade-r001-canary/scripts/sync-live-secrets.sh b/environments/sol/trade-r001-canary/scripts/sync-live-secrets.sh index e272578..d51cfdc 100755 --- a/environments/sol/trade-r001-canary/scripts/sync-live-secrets.sh +++ b/environments/sol/trade-r001-canary/scripts/sync-live-secrets.sh @@ -1,8 +1,10 @@ #!/usr/bin/env bash set -euo pipefail +DEFAULT_SOURCE_DIR="${HOME}/.local/share/trade-bootstrap/sol/trade-r001-canary-secrets" 
SOURCE_HOST="${SOURCE_HOST:-mevnode_bot}" SOURCE_NAMESPACE="${SOURCE_NAMESPACE:-trade-staging}" +SOURCE_DIR="${SOURCE_DIR:-}" TARGET_HOST="${TARGET_HOST:-mevnode}" TARGET_NAMESPACE="${TARGET_NAMESPACE:-trade-r001-canary}" @@ -23,10 +25,23 @@ ssh_target() { ssh -o StrictHostKeyChecking=no "$TARGET_HOST" "$@" } +if [ -z "$SOURCE_DIR" ] && [ -d "$DEFAULT_SOURCE_DIR" ]; then + SOURCE_DIR="$DEFAULT_SOURCE_DIR" +fi + +get_secret_json() { + local secret_name="$1" + if [ -n "$SOURCE_DIR" ]; then + cat "${SOURCE_DIR}/${secret_name}.json" + else + ssh_source "sudo k3s kubectl -n ${SOURCE_NAMESPACE} get secret ${secret_name} -o json" + fi +} + ssh_target "sudo k3s kubectl get ns ${TARGET_NAMESPACE} >/dev/null 2>&1 || sudo k3s kubectl create ns ${TARGET_NAMESPACE} >/dev/null" for secret_name in "${SECRETS[@]}"; do - SECRET_JSON="$(ssh_source "sudo k3s kubectl -n ${SOURCE_NAMESPACE} get secret ${secret_name} -o json")" + SECRET_JSON="$(get_secret_json "${secret_name}")" printf '%s' "$SECRET_JSON" \ | jq --arg ns "$TARGET_NAMESPACE" 'del(.metadata.uid,.metadata.resourceVersion,.metadata.creationTimestamp,.metadata.managedFields,.metadata.ownerReferences,.metadata.selfLink,.metadata.annotations["kubectl.kubernetes.io/last-applied-configuration"]) | .metadata.namespace = $ns' \ | ssh_target "sudo k3s kubectl apply -f - >/dev/null"