feat: SRTT-based nameserver selection (#19)
* feat: SRTT-based nameserver selection for recursive resolver BIND-style Smoothed RTT (EWMA) tracking per NS IP address. The resolver learns which nameservers respond fastest and prefers them, eliminating cascading timeouts from slow/unreachable IPv6 servers. - New src/srtt.rs: SrttCache with record_rtt, record_failure, sort_by_rtt - EWMA formula: new = (old * 7 + sample) / 8, 5s failure penalty, 5min decay - TCP penalty (+100ms) lets SRTT naturally deprioritize IPv6-over-TCP - Enabled flag embedded in SrttCache (no-op when disabled) - Batch eviction (64 entries) for O(1) amortized writes at capacity - Configurable via [upstream] srtt = true/false (default: true) - Benchmark script: scripts/benchmark.sh (full, cold, warm, compare-all) - Benchmarks show 12x avg improvement, 0% queries >1s (was 58%) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * feat: show DNSSEC and SRTT status in dashboard + API Add dnssec and srtt boolean fields to /stats API response. Display on/off indicators in the dashboard footer. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * fix: apply SRTT decay before EWMA so recovered servers rehabilitate Without decay-before-EWMA, a server penalized at 5000ms stayed near that value even after recovery — the stale raw penalty was used as the EWMA base instead of the decayed estimate. Extract decayed_srtt() helper and call it in record_rtt() before the smoothing step. Also restores removed "why" comments in send_query / resolve_recursive. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * docs: add install/upgrade instructions, smarter benchmark priming README: document `numa install`, `numa service`, Homebrew upgrade, and `make deploy` workflows. Benchmark: replace fixed `sleep 4` with `wait_for_priming` that polls cache entry count for stability. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit was merged in pull request #19.
This commit is contained in:
306
scripts/benchmark.sh
Executable file
306
scripts/benchmark.sh
Executable file
@@ -0,0 +1,306 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
API="${NUMA_API:-http://127.0.0.1:5380}"
|
||||
DNS="${NUMA_DNS:-127.0.0.1}"
|
||||
NUMA_BIN="${NUMA_BIN:-/usr/local/bin/numa}"
|
||||
LAUNCHD_PLIST="/Library/LaunchDaemons/com.numa.dns.plist"
|
||||
|
||||
DOMAINS=(
|
||||
paypal.com ebay.com zoom.us slack.com discord.com
|
||||
microsoft.com apple.com meta.com oracle.com ibm.com
|
||||
docker.com kubernetes.io prometheus.io grafana.com terraform.io
|
||||
python.org nodejs.org golang.org wikipedia.org reddit.com
|
||||
stackoverflow.com stripe.com linear.app nytimes.com bbc.co.uk
|
||||
rust-lang.org fastly.com hetzner.com uber.com airbnb.com
|
||||
notion.so figma.com netflix.com spotify.com dropbox.com
|
||||
gitlab.com twitch.tv shopify.com vercel.app mozilla.org
|
||||
)
|
||||
|
||||
stats() {
|
||||
curl -s "$API/query-log" | python3 -c "
|
||||
import sys, json
|
||||
|
||||
data = json.load(sys.stdin)
|
||||
rec = [q for q in data if q['path'] == 'RECURSIVE']
|
||||
if not rec:
|
||||
print('No recursive queries in log.')
|
||||
sys.exit()
|
||||
|
||||
vals = sorted([q['latency_ms'] for q in rec])
|
||||
n = len(vals)
|
||||
|
||||
print(f'Recursive queries: {n}')
|
||||
print(f' Avg: {sum(vals)/n:.1f}ms')
|
||||
print(f' Median: {vals[n//2]:.1f}ms')
|
||||
print(f' P95: {vals[int(n*0.95)]:.1f}ms')
|
||||
print(f' P99: {vals[int(n*0.99)]:.1f}ms')
|
||||
print(f' Min: {min(vals):.1f}ms')
|
||||
print(f' Max: {max(vals):.1f}ms')
|
||||
print(f' <100ms: {sum(1 for v in vals if v < 100)}')
|
||||
print(f' <200ms: {sum(1 for v in vals if v < 200)}')
|
||||
print(f' <500ms: {sum(1 for v in vals if v < 500)}')
|
||||
print(f' >1s: {sum(1 for v in vals if v >= 1000)}')
|
||||
print()
|
||||
print('Slowest 5:')
|
||||
for q in sorted(rec, key=lambda q: q['latency_ms'], reverse=True)[:5]:
|
||||
print(f' {q[\"latency_ms\"]:>8.1f}ms {q[\"query_type\"]:5s} {q[\"domain\"]:35s} {q[\"rescode\"]}')
|
||||
print()
|
||||
print('Fastest 5:')
|
||||
for q in sorted(rec, key=lambda q: q['latency_ms'])[:5]:
|
||||
print(f' {q[\"latency_ms\"]:>8.1f}ms {q[\"query_type\"]:5s} {q[\"domain\"]:35s} {q[\"rescode\"]}')
|
||||
"
|
||||
}
|
||||
|
||||
query_all() {
|
||||
local label="$1"
|
||||
echo "=== $label ==="
|
||||
for d in "${DOMAINS[@]}"; do
|
||||
printf " %-25s " "$d"
|
||||
dig "@$DNS" "$d" A +noall +stats 2>/dev/null | grep "Query time"
|
||||
done
|
||||
echo
|
||||
}
|
||||
|
||||
flush_cache() {
|
||||
curl -s -X DELETE "$API/cache" > /dev/null
|
||||
echo "Cache flushed ($(curl -s "$API/stats" | python3 -c "import sys,json; print(json.load(sys.stdin)['cache']['entries'])" 2>/dev/null || echo '?') entries)."
|
||||
}
|
||||
|
||||
wait_for_api() {
|
||||
local attempts=0
|
||||
while ! curl -sf "$API/health" > /dev/null 2>&1; do
|
||||
attempts=$((attempts + 1))
|
||||
if [ $attempts -ge 20 ]; then
|
||||
echo "ERROR: API not reachable at $API after 10s" >&2
|
||||
exit 1
|
||||
fi
|
||||
sleep 0.5
|
||||
done
|
||||
}
|
||||
|
||||
wait_for_priming() {
|
||||
echo -n "Waiting for TLD priming..."
|
||||
local prev=0
|
||||
local stable=0
|
||||
for _ in $(seq 1 60); do
|
||||
local entries
|
||||
entries=$(curl -s "$API/stats" | python3 -c "import sys,json; print(json.load(sys.stdin)['cache']['entries'])" 2>/dev/null || echo 0)
|
||||
if [ "$entries" -gt 0 ] && [ "$entries" = "$prev" ]; then
|
||||
stable=$((stable + 1))
|
||||
if [ $stable -ge 3 ]; then
|
||||
echo " done ($entries cache entries)."
|
||||
return
|
||||
fi
|
||||
else
|
||||
stable=0
|
||||
fi
|
||||
prev="$entries"
|
||||
sleep 1
|
||||
done
|
||||
echo " timeout (cache: $prev entries)."
|
||||
}
|
||||
|
||||
# restart_numa <config_toml_body>
|
||||
# Writes config to a temp file, stops numa (launchd or manual), starts with that config.
|
||||
restart_numa() {
|
||||
local config_body="$1"
|
||||
local tmpconf
|
||||
tmpconf=$(mktemp /tmp/numa-bench-XXXXXX)
|
||||
mv "$tmpconf" "${tmpconf}.toml"
|
||||
tmpconf="${tmpconf}.toml"
|
||||
echo "$config_body" > "$tmpconf"
|
||||
|
||||
# Stop launchd-managed numa if active
|
||||
if sudo launchctl list com.numa.dns &>/dev/null; then
|
||||
sudo launchctl unload "$LAUNCHD_PLIST" 2>/dev/null || true
|
||||
sleep 1
|
||||
fi
|
||||
|
||||
# Kill any remaining
|
||||
sudo killall numa 2>/dev/null || true
|
||||
sleep 2
|
||||
|
||||
sudo "$NUMA_BIN" "$tmpconf" &
|
||||
wait_for_api
|
||||
wait_for_priming
|
||||
echo "numa ready (pid $(pgrep numa | head -1), config: $tmpconf)."
|
||||
}
|
||||
|
||||
# Restore the launchd service
|
||||
restore_launchd() {
|
||||
sudo killall numa 2>/dev/null || true
|
||||
sleep 1
|
||||
if [ -f "$LAUNCHD_PLIST" ]; then
|
||||
sudo launchctl load "$LAUNCHD_PLIST" 2>/dev/null || true
|
||||
echo "Restored launchd service."
|
||||
fi
|
||||
}
|
||||
|
||||
run_pass() {
|
||||
local label="$1"
|
||||
flush_cache
|
||||
sleep 0.5
|
||||
query_all "$label"
|
||||
echo "=== $label — stats ==="
|
||||
stats
|
||||
}
|
||||
|
||||
case "${1:-full}" in
|
||||
cold)
|
||||
echo "--- Cold cache benchmark ---"
|
||||
run_pass "Cold SRTT + Cold cache"
|
||||
;;
|
||||
warm)
|
||||
echo "--- Warm SRTT benchmark ---"
|
||||
echo "Priming SRTT..."
|
||||
for d in "${DOMAINS[@]}"; do dig "@$DNS" "$d" A +short > /dev/null 2>&1; done
|
||||
run_pass "Warm SRTT + Cold cache"
|
||||
;;
|
||||
stats)
|
||||
stats
|
||||
;;
|
||||
compare-srtt)
|
||||
echo "============================================"
|
||||
echo " A/B: SRTT OFF vs ON (dnssec off)"
|
||||
echo "============================================"
|
||||
echo
|
||||
|
||||
restart_numa "$(cat <<'TOML'
|
||||
[upstream]
|
||||
mode = "recursive"
|
||||
srtt = false
|
||||
TOML
|
||||
)"
|
||||
echo
|
||||
run_pass "SRTT OFF"
|
||||
|
||||
echo
|
||||
echo "--------------------------------------------"
|
||||
echo
|
||||
|
||||
restart_numa "$(cat <<'TOML'
|
||||
[upstream]
|
||||
mode = "recursive"
|
||||
srtt = true
|
||||
TOML
|
||||
)"
|
||||
echo
|
||||
run_pass "SRTT ON"
|
||||
|
||||
echo
|
||||
restore_launchd
|
||||
;;
|
||||
compare-dnssec)
|
||||
echo "============================================"
|
||||
echo " A/B: DNSSEC OFF vs ON (srtt on)"
|
||||
echo "============================================"
|
||||
echo
|
||||
|
||||
restart_numa "$(cat <<'TOML'
|
||||
[upstream]
|
||||
mode = "recursive"
|
||||
srtt = true
|
||||
|
||||
[dnssec]
|
||||
enabled = false
|
||||
TOML
|
||||
)"
|
||||
echo
|
||||
run_pass "DNSSEC OFF"
|
||||
|
||||
echo
|
||||
echo "--------------------------------------------"
|
||||
echo
|
||||
|
||||
restart_numa "$(cat <<'TOML'
|
||||
[upstream]
|
||||
mode = "recursive"
|
||||
srtt = true
|
||||
|
||||
[dnssec]
|
||||
enabled = true
|
||||
TOML
|
||||
)"
|
||||
echo
|
||||
run_pass "DNSSEC ON"
|
||||
|
||||
echo
|
||||
restore_launchd
|
||||
;;
|
||||
compare-all)
|
||||
echo "============================================"
|
||||
echo " Full A/B matrix"
|
||||
echo " 1. SRTT OFF + DNSSEC OFF (baseline)"
|
||||
echo " 2. SRTT ON + DNSSEC OFF"
|
||||
echo " 3. SRTT ON + DNSSEC ON"
|
||||
echo "============================================"
|
||||
echo
|
||||
|
||||
# --- 1. Baseline ---
|
||||
restart_numa "$(cat <<'TOML'
|
||||
[upstream]
|
||||
mode = "recursive"
|
||||
srtt = false
|
||||
|
||||
[dnssec]
|
||||
enabled = false
|
||||
TOML
|
||||
)"
|
||||
echo
|
||||
run_pass "SRTT OFF + DNSSEC OFF"
|
||||
|
||||
echo
|
||||
echo "--------------------------------------------"
|
||||
echo
|
||||
|
||||
# --- 2. SRTT only ---
|
||||
restart_numa "$(cat <<'TOML'
|
||||
[upstream]
|
||||
mode = "recursive"
|
||||
srtt = true
|
||||
|
||||
[dnssec]
|
||||
enabled = false
|
||||
TOML
|
||||
)"
|
||||
echo
|
||||
run_pass "SRTT ON + DNSSEC OFF"
|
||||
|
||||
echo
|
||||
echo "--------------------------------------------"
|
||||
echo
|
||||
|
||||
# --- 3. Both ---
|
||||
restart_numa "$(cat <<'TOML'
|
||||
[upstream]
|
||||
mode = "recursive"
|
||||
srtt = true
|
||||
|
||||
[dnssec]
|
||||
enabled = true
|
||||
TOML
|
||||
)"
|
||||
echo
|
||||
run_pass "SRTT ON + DNSSEC ON"
|
||||
|
||||
echo
|
||||
restore_launchd
|
||||
;;
|
||||
full|*)
|
||||
echo "--- Full benchmark (cold → warm → SRTT-only) ---"
|
||||
echo
|
||||
|
||||
wait_for_priming
|
||||
flush_cache
|
||||
sleep 0.5
|
||||
query_all "Pass 1: Cold SRTT + Cold cache"
|
||||
|
||||
flush_cache
|
||||
sleep 0.5
|
||||
query_all "Pass 2: Warm SRTT + Cold cache"
|
||||
|
||||
echo "=== Pass 2 stats (SRTT-warm) ==="
|
||||
stats
|
||||
;;
|
||||
esac
|
||||
Reference in New Issue
Block a user