Sunday, 13 April 2025

Filed under:

✅ Checks lag for all replicas
✅ Tracks which replicas failed to clear lag
✅ Stores both lag values and failed IPs in state for downstream debugging or alerting

🌟 Amelia JavaScript: Check Replica Lag for All Replicas (With Failure Tracking)

// Checks replication lag on each replica in turn by running the host script
// check_replica_lag.sh and parsing a "LAG=<n>" token from its stdout.
// Writes the observed lag values to state key "replica_lags" and the IPs that
// did not reach zero lag to "replica_lag_failures", then transitions to
// "NEXT" when every checked replica reported zero lag, otherwise to "FAIL".

const replicaIPs = state.get("replica_ip_list"); // array of IPs
const clusterName = state.get("cluster_name");

const maxAttempts = 30;
const waitBetween = 5000; // 5 sec (presumably milliseconds — confirm the unit sleep() expects)
const canWaitForever = false;

// Hoisted so the pattern is built once rather than on every poll.
const LAG_PATTERN = /LAG\s*=\s*(\d+)/i;

let lagClear = true;
const failedReplicas = [];
const replicaLags = {};

/**
 * Runs the host script once for a replica and extracts the reported lag.
 *
 * @param {string} ip - Replica IP passed as the first script argument.
 * @returns {number|null} The parsed lag, or null when stdout is missing or
 *   contains no "LAG=<digits>" token.
 */
function readLag(ip) {
  const output = host.run("check_replica_lag.sh", [ip, clusterName]);
  // A missing stdout is treated like unparseable output so the caller retries
  // instead of crashing on .trim().
  const outStr = String(output?.stdout ?? "").trim();
  const match = outStr.match(LAG_PATTERN);
  return match ? Number.parseInt(match[1], 10) : null;
}

for (const ip of replicaIPs) {
  let attempt = 0;
  let lag = null;

  status(`🔄 Checking replication lag on replica ${ip}...`);

  while (attempt < maxAttempts || canWaitForever) {
    lag = readLag(ip);

    if (lag === 0) {
      status(`✅ Replica ${ip} has zero lag.`);
      replicaLags[ip] = 0;
      break;
    }

    status(`⚠️ Replica ${ip} still lagging (LAG=${lag ?? "unknown"}). Attempt ${attempt + 1}/${maxAttempts}`);
    attempt += 1;

    // Only wait when another poll will actually follow.
    if (attempt < maxAttempts || canWaitForever) {
      sleep(waitBetween);
    }
  }

  if (lag > 0 || lag === null) {
    failedReplicas.push(ip);
    replicaLags[ip] = lag ?? "N/A";
    lagClear = false;

    // The polling loop can only end without zero lag when canWaitForever is
    // false, so this stops at the first failing replica; replicas later in
    // the list are not checked.
    if (!canWaitForever) {
      status(`❌ Stopping early. Replica ${ip} did not clear lag.`);
      break;
    }
  }
}

// Save details for later steps
state.set("replica_lags", replicaLags);
state.set("replica_lag_failures", failedReplicas);

if (lagClear) {
  status("✅ All replicas cleared LAG. Proceeding.");
  transition("NEXT");
} else {
  status(`❌ Lag still present on replicas: ${failedReplicas.join(", ")}`);
  transition("FAIL");
}

📄 Your companion host script (check_replica_lag.sh) should look like this:

#!/bin/bash
# Prints the replication lag of one replica, as listed by `patronictl list`,
# in the form "LAG=<n>" on stdout.
#
# Usage: check_replica_lag.sh <replica_ip> <cluster_name>
#
# Prints "LAG=unknown" when arguments are missing, patronictl fails, the
# replica is not listed, or its lag cell is not a plain number, so a missing
# value is never reported as zero lag. Diagnostics go to stderr. The exit
# status stays 0 in every case, as in the original script, whose last command
# was always the echo.
export LC_ALL=en_US.UTF-8

REPLICA_IP="$1"
CLUSTER_NAME="$2"

if [ -z "$REPLICA_IP" ] || [ -z "$CLUSTER_NAME" ]; then
  echo "usage: $0 <replica_ip> <cluster_name>" >&2
  echo "LAG=unknown"
  exit 0
fi

# Capture the member table in a variable instead of a predictable /tmp path.
if ! MEMBERS=$(/usr/local/bin/patronictl -d etcd://127.0.0.1:2379 list "$CLUSTER_NAME"); then
  echo "patronictl list failed for cluster ${CLUSTER_NAME}" >&2
  echo "LAG=unknown"
  exit 0
fi

# grep -F treats the IP as a literal string (dots are not wildcards) and -w
# requires a whole-word match, so 10.0.0.1 does not also select 10.0.0.10.
# Table rows end with "|", so when splitting on "|" the last cell is field
# NF-1. Only the first matching row is used.
# NOTE(review): assumes the lag column is the last column of the table —
# confirm against the installed Patroni version.
LAG=$(printf '%s\n' "$MEMBERS" \
  | grep -Fw -- "$REPLICA_IP" \
  | awk -F'|' 'NR == 1 && NF >= 2 { cell = $(NF - 1); gsub(/[ \t]/, "", cell); print cell }')

case "$LAG" in
  '' | *[!0-9]*) echo "LAG=unknown" ;;
  *) echo "LAG=${LAG}" ;;
esac

🧠 What You Can Do With This:

  • Review replica_lags in Amelia debug UI

  • Notify or alert based on state.get("replica_lag_failures").length > 0

  • Skip next steps or trigger self-healing if LAG persists

0 comments:

Post a Comment