fix(bootstrap): route numa HTTPS via IP-literal bootstrap resolver (#122)
When numa is its own system DNS resolver (HAOS add-on, Pi-hole-style container, /etc/resolv.conf → 127.0.0.1), every numa-originated HTTPS connection — DoH upstream, ODoH relay/target, blocklist CDN — routed its hostname through getaddrinfo() back to numa itself. Cold boot deadlocked; steady state taxed every new TCP connection. 0.14.1's retry-with-backoff masked the startup race but not the underlying self-loop. NumaResolver implements reqwest::dns::Resolve with two lanes: - Per-host overrides (ODoH relay_ip/target_ip) short-circuit DNS entirely, preserving ODoH's zero-plain-DNS-leak property. - Otherwise: A+AAAA in parallel via UDP to IP-literal bootstrap servers, with TCP fallback for UDP-hostile networks. Bootstrap IPs come from upstream.fallback (IP-literal filtered, hostnames skipped with a warning). Empty fallback yields the hardcoded default [9.9.9.9, 1.1.1.1]; the chosen source is logged at startup so the silent default is visible. doh_keepalive_loop now fires its first tick immediately, and keepalive_doh logs failures at WARN — bootstrap issues surface within ~100ms of boot instead of on the first client query. Distinct from UpstreamPool.fallback (client-query failover) which stays untouched: client queries with no configured fallback still SERVFAIL on primary failure rather than silently shadow-routing. Reproducer: tests/docker/self-resolver-loop.sh. Before: 0 blocklist domains, 3072ms SERVFAIL. After: 397k domains, 118ms NOERROR.
This commit is contained in:
155
tests/docker/self-resolver-loop.sh
Executable file
155
tests/docker/self-resolver-loop.sh
Executable file
@@ -0,0 +1,155 @@
|
||||
#!/usr/bin/env bash
|
||||
#
|
||||
# Reproducer for issue #122 — chicken-and-egg when numa is its own system
|
||||
# resolver (HAOS add-on, Pi-hole-style container, laptop with
|
||||
# resolv.conf → 127.0.0.1).
|
||||
#
|
||||
# Topology:
|
||||
# container /etc/resolv.conf → nameserver 127.0.0.1
|
||||
# numa bound on :53 → upstream DoH by hostname (quad9)
|
||||
# numa boots → spawns blocklist download
|
||||
# reqwest::get → getaddrinfo("cdn.jsdelivr.net")
|
||||
# → loopback UDP :53 → numa → cache miss → DoH upstream
|
||||
# → getaddrinfo("dns.quad9.net") → same loop → glibc EAI_AGAIN
|
||||
#
|
||||
# Expected on master: both assertions FAIL (bug reproduced).
|
||||
# Expected after bootstrap-IP fix: both assertions PASS.
|
||||
#
|
||||
# Requirements: docker (with internet access for external lists/DoH)
|
||||
# Usage: ./tests/docker/self-resolver-loop.sh
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
cd "$(dirname "$0")/../.."
|
||||
|
||||
GREEN="\033[32m"; RED="\033[31m"; RESET="\033[0m"
|
||||
|
||||
pass() { printf " ${GREEN}✓${RESET} %s\n" "$1"; }
|
||||
fail() { printf " ${RED}✗${RESET} %s\n" "$1"; printf " %s\n" "$2"; FAILED=$((FAILED+1)); }
|
||||
FAILED=0
|
||||
|
||||
OUT=/tmp/numa-self-resolver.out
|
||||
|
||||
echo "── self-resolver-loop: building + reproducing on debian:bookworm ──"
|
||||
echo " (first run is slow: image pull + cold cargo build, ~5-8 min)"
|
||||
echo
|
||||
|
||||
docker run --rm \
|
||||
-v "$PWD:/src:ro" \
|
||||
-v numa-self-resolver-cargo:/root/.cargo \
|
||||
-v numa-self-resolver-target:/work/target \
|
||||
debian:bookworm bash -c '
|
||||
set -e
|
||||
|
||||
# Phase 1: install deps + build with the container DNS as given by Docker
|
||||
# (resolves deb.debian.org, static.rust-lang.org, crates.io).
|
||||
apt-get update -qq && apt-get install -y -qq curl build-essential dnsutils 2>&1 | tail -3
|
||||
|
||||
if ! command -v cargo &>/dev/null; then
|
||||
curl -sSf https://sh.rustup.rs | sh -s -- -y --profile minimal --quiet
|
||||
fi
|
||||
. "$HOME/.cargo/env"
|
||||
|
||||
mkdir -p /work
|
||||
tar -C /src --exclude=./target --exclude=./.git -cf - . | tar -C /work -xf -
|
||||
cd /work
|
||||
|
||||
echo "── cargo build --release --locked ──"
|
||||
cargo build --release --locked 2>&1 | tail -5
|
||||
echo
|
||||
|
||||
# Phase 2: flip system DNS to numa itself — this is the pathological
|
||||
# topology from issue #122 (HAOS add-on, resolv.conf → 127.0.0.1).
|
||||
# Everything after this point, any getaddrinfo call inside numa loops
|
||||
# back through :53.
|
||||
echo "nameserver 127.0.0.1" > /etc/resolv.conf
|
||||
echo "── /etc/resolv.conf inside container (post-flip) ──"
|
||||
cat /etc/resolv.conf
|
||||
echo
|
||||
|
||||
cat > /tmp/numa.toml <<CONF
|
||||
[server]
|
||||
bind_addr = "0.0.0.0:53"
|
||||
api_port = 5380
|
||||
api_bind_addr = "127.0.0.1"
|
||||
data_dir = "/tmp/numa-data"
|
||||
|
||||
[upstream]
|
||||
mode = "forward"
|
||||
address = ["https://dns.quad9.net/dns-query"]
|
||||
timeout_ms = 3000
|
||||
|
||||
[blocking]
|
||||
enabled = true
|
||||
lists = ["https://cdn.jsdelivr.net/gh/hagezi/dns-blocklists@latest/hosts/pro.txt"]
|
||||
CONF
|
||||
|
||||
mkdir -p /tmp/numa-data
|
||||
|
||||
echo "── starting numa ──"
|
||||
RUST_LOG=info ./target/release/numa /tmp/numa.toml > /tmp/numa.log 2>&1 &
|
||||
NUMA_PID=$!
|
||||
|
||||
# Wait up to 120s for blocklist to populate.
|
||||
# Retry delays 2+10+30s = 42s, plus ~4 × ~10s getaddrinfo timeouts under
|
||||
# self-loop = ~82s worst case. 120s leaves headroom.
|
||||
LOADED=0
|
||||
for i in $(seq 1 120); do
|
||||
LOADED=$(curl -sf http://127.0.0.1:5380/blocking/stats 2>/dev/null \
|
||||
| grep -o "\"domains_loaded\":[0-9]*" | cut -d: -f2 || echo 0)
|
||||
[ "${LOADED:-0}" -gt 100 ] && break
|
||||
sleep 1
|
||||
done
|
||||
|
||||
# First cold DoH query — time it.
|
||||
START=$(date +%s%N)
|
||||
dig @127.0.0.1 example.com A +time=15 +tries=1 > /tmp/dig.out 2>&1 || true
|
||||
END=$(date +%s%N)
|
||||
LATENCY_MS=$(( (END - START) / 1000000 ))
|
||||
STATUS=$(grep -oE "status: [A-Z]+" /tmp/dig.out | head -1 || echo "status: TIMEOUT")
|
||||
|
||||
kill $NUMA_PID 2>/dev/null || true
|
||||
wait $NUMA_PID 2>/dev/null || true
|
||||
|
||||
echo
|
||||
echo "=== RESULT ==="
|
||||
echo "domains_loaded=$LOADED"
|
||||
echo "first_query_latency_ms=$LATENCY_MS"
|
||||
echo "first_query_${STATUS// /_}"
|
||||
echo
|
||||
echo "=== numa.log (tail 40) ==="
|
||||
tail -40 /tmp/numa.log
|
||||
echo
|
||||
echo "=== dig.out ==="
|
||||
cat /tmp/dig.out
|
||||
' 2>&1 | tee "$OUT"
|
||||
|
||||
echo
|
||||
echo "── assertions ──"
|
||||
|
||||
LOADED=$(grep '^domains_loaded=' "$OUT" | tail -1 | cut -d= -f2 || echo 0)
|
||||
LATENCY=$(grep '^first_query_latency_ms=' "$OUT" | tail -1 | cut -d= -f2 || echo 999999)
|
||||
STATUS_LINE=$(grep '^first_query_status_' "$OUT" | tail -1 || echo "first_query_status_TIMEOUT")
|
||||
|
||||
if [ "${LOADED:-0}" -gt 100 ]; then
|
||||
pass "blocklist downloaded (domains_loaded=$LOADED)"
|
||||
else
|
||||
fail "blocklist downloaded (got domains_loaded=${LOADED:-0}, expected >100)" \
|
||||
"chicken-and-egg: blocklist HTTPS client has no DNS bootstrap; getaddrinfo loops through numa"
|
||||
fi
|
||||
|
||||
if [ "${LATENCY:-999999}" -lt 2000 ]; then
|
||||
pass "first DoH query under 2s (latency=${LATENCY}ms, $STATUS_LINE)"
|
||||
else
|
||||
fail "first DoH query under 2s (got ${LATENCY}ms, $STATUS_LINE)" \
|
||||
"self-loop on getaddrinfo(upstream_host); plain DoH needs bootstrap-IP symmetry with ODoH"
|
||||
fi
|
||||
|
||||
echo
|
||||
if [ "$FAILED" -eq 0 ]; then
|
||||
printf "${GREEN}── self-resolver-loop passed (fix is in place) ──${RESET}\n"
|
||||
exit 0
|
||||
else
|
||||
printf "${RED}── self-resolver-loop failed ($FAILED assertion(s)) — bug #122 reproduced ──${RESET}\n"
|
||||
exit 1
|
||||
fi
|
||||
Reference in New Issue
Block a user