perf: optimize DNS query hot path (#15)

* perf: optimize hot path — RwLock, inline filtering, pre-allocated strings

- Mutex → RwLock for cache, blocklist, and overrides (concurrent read access)
- Make cache.lookup() and overrides.lookup() take &self (read-only)
- Eliminate 3 Vec allocations per DnsPacket::write() via inline filtering
- Pre-allocate domain strings with capacity 64 in parse path
- Add criterion micro-benchmarks (hot_path + throughput)
- Add bench README documenting both benchmark suites

Measured improvement: ~14% faster parsing, ~9% higher pipeline throughput,
round-trip cached 733ns → 698ns (~2.3M queries/sec).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* chore: simplify benchmark code after review

- Remove redundant DnsHeader::new() (already set by DnsPacket::new())
- Remove unused DnsHeader import
- Change simulate_cached_pipeline to take &DnsCache (lookup is &self now)
- Remove unnecessary mut on cache in cache_lookup_miss bench

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit was merged in pull request #15.
This commit is contained in:
Razvan Dimescu
2026-03-27 02:01:08 +02:00
committed by GitHub
parent 5d454cbed5
commit 236ef7b4f5
13 changed files with 728 additions and 77 deletions

View File

@@ -1,6 +1,6 @@
use std::net::SocketAddr;
use std::path::PathBuf;
use std::sync::Mutex;
use std::sync::{Mutex, RwLock};
use std::time::{Duration, Instant, SystemTime};
use arc_swap::ArcSwap;
@@ -27,10 +27,10 @@ use crate::system_dns::ForwardingRule;
pub struct ServerCtx {
pub socket: UdpSocket,
pub zone_map: ZoneMap,
pub cache: Mutex<DnsCache>,
pub cache: RwLock<DnsCache>,
pub stats: Mutex<ServerStats>,
pub overrides: Mutex<OverrideStore>,
pub blocklist: Mutex<BlocklistStore>,
pub overrides: RwLock<OverrideStore>,
pub blocklist: RwLock<BlocklistStore>,
pub query_log: Mutex<QueryLog>,
pub services: Mutex<ServiceStore>,
pub lan_peers: Mutex<PeerStore>,
@@ -73,7 +73,7 @@ pub async fn handle_query(
// Pipeline: overrides -> .tld interception -> blocklist -> local zones -> cache -> upstream
// Each lock is scoped to avoid holding a lock guard (Mutex or RwLock) across await points.
let (response, path) = {
let override_record = ctx.overrides.lock().unwrap().lookup(&qname);
let override_record = ctx.overrides.read().unwrap().lookup(&qname);
if let Some(record) = override_record {
let mut resp = DnsPacket::response_from(&query, ResultCode::NOERROR);
resp.answers.push(record);
@@ -116,7 +116,7 @@ pub async fn handle_query(
}),
}
(resp, QueryPath::Local)
} else if ctx.blocklist.lock().unwrap().is_blocked(&qname) {
} else if ctx.blocklist.read().unwrap().is_blocked(&qname) {
let mut resp = DnsPacket::response_from(&query, ResultCode::NOERROR);
match qtype {
QueryType::AAAA => resp.answers.push(DnsRecord::AAAA {
@@ -136,7 +136,7 @@ pub async fn handle_query(
resp.answers = records.clone();
(resp, QueryPath::Local)
} else {
let cached = ctx.cache.lock().unwrap().lookup(&qname, qtype);
let cached = ctx.cache.read().unwrap().lookup(&qname, qtype);
if let Some(cached) = cached {
let mut resp = cached;
resp.header.id = query.header.id;
@@ -149,7 +149,7 @@ pub async fn handle_query(
};
match forward_query(&query, &upstream, ctx.timeout).await {
Ok(resp) => {
ctx.cache.lock().unwrap().insert(&qname, qtype, &resp);
ctx.cache.write().unwrap().insert(&qname, qtype, &resp);
(resp, QueryPath::Forwarded)
}
Err(e) => {