fix: apply SRTT decay before EWMA so recovered servers rehabilitate

Without decay-before-EWMA, a server penalized at 5000ms stayed near
that value even after recovery — the stale raw penalty was used as the
EWMA base instead of the decayed estimate. Extract decayed_srtt()
helper and call it in record_rtt() before the smoothing step.

Also restores the previously removed "why" comments in send_query() and resolve_recursive().

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Razvan Dimescu
2026-03-28 23:16:18 +02:00
parent 076dd3677f
commit 239938dc07
2 changed files with 23 additions and 15 deletions

View File

@@ -147,6 +147,8 @@ pub async fn resolve_recursive(
root_hints: &[SocketAddr], root_hints: &[SocketAddr],
srtt: &RwLock<SrttCache>, srtt: &RwLock<SrttCache>,
) -> crate::Result<DnsPacket> { ) -> crate::Result<DnsPacket> {
// No overall timeout — each hop is bounded by NS_QUERY_TIMEOUT (UDP + TCP fallback),
// and MAX_REFERRAL_DEPTH caps the chain length.
let mut resp = resolve_iterative(qname, qtype, cache, root_hints, srtt, 0, 0).await?; let mut resp = resolve_iterative(qname, qtype, cache, root_hints, srtt, 0, 0).await?;
resp.header.id = original_query.header.id; resp.header.id = original_query.header.id;
@@ -606,10 +608,12 @@ async fn send_query(
let start = Instant::now(); let start = Instant::now();
// IPv6 forced to TCP — our UDP socket is bound to 0.0.0.0
if server.is_ipv6() { if server.is_ipv6() {
return tcp_with_srtt(&query, server, srtt, start).await; return tcp_with_srtt(&query, server, srtt, start).await;
} }
// UDP detected as blocked — go TCP-first
if UDP_DISABLED.load(Ordering::Acquire) { if UDP_DISABLED.load(Ordering::Acquire) {
return tcp_with_srtt(&query, server, srtt, start).await; return tcp_with_srtt(&query, server, srtt, start).await;
} }

View File

@@ -40,10 +40,15 @@ impl SrttCache {
/// Get current SRTT for an IP, applying decay if stale. Returns INITIAL for unknown. /// Get current SRTT for an IP, applying decay if stale. Returns INITIAL for unknown.
pub fn get(&self, ip: IpAddr) -> u64 { pub fn get(&self, ip: IpAddr) -> u64 {
match self.entries.get(&ip) { match self.entries.get(&ip) {
Some(entry) => { Some(entry) => Self::decayed_srtt(entry),
None => INITIAL_SRTT_MS,
}
}
/// Apply time-based decay once an entry is older than DECAY_AFTER_SECS: each elapsed period (capped at 8) halves the distance to INITIAL_SRTT_MS.
fn decayed_srtt(entry: &SrttEntry) -> u64 {
let age_secs = entry.updated_at.elapsed().as_secs(); let age_secs = entry.updated_at.elapsed().as_secs();
if age_secs > DECAY_AFTER_SECS { if age_secs > DECAY_AFTER_SECS {
// Each decay period halves the distance to INITIAL_SRTT_MS
let periods = (age_secs / DECAY_AFTER_SECS).min(8); let periods = (age_secs / DECAY_AFTER_SECS).min(8);
let mut srtt = entry.srtt_ms; let mut srtt = entry.srtt_ms;
for _ in 0..periods { for _ in 0..periods {
@@ -54,9 +59,6 @@ impl SrttCache {
entry.srtt_ms entry.srtt_ms
} }
} }
None => INITIAL_SRTT_MS,
}
}
/// Record a successful query RTT. No-op when disabled. /// Record a successful query RTT. No-op when disabled.
pub fn record_rtt(&mut self, ip: IpAddr, rtt_ms: u64, tcp: bool) { pub fn record_rtt(&mut self, ip: IpAddr, rtt_ms: u64, tcp: bool) {
@@ -69,8 +71,10 @@ impl SrttCache {
srtt_ms: effective, srtt_ms: effective,
updated_at: Instant::now(), updated_at: Instant::now(),
}); });
// Apply decay before EWMA so recovered servers aren't stuck at stale penalties
let base = Self::decayed_srtt(entry);
// BIND EWMA: new = (old * 7 + sample) / 8 // BIND EWMA: new = (old * 7 + sample) / 8
entry.srtt_ms = (entry.srtt_ms * 7 + effective) / 8; entry.srtt_ms = (base * 7 + effective) / 8;
entry.updated_at = Instant::now(); entry.updated_at = Instant::now();
} }