perf: optimize hot path — RwLock, inline filtering, pre-allocated strings

- Mutex → RwLock for cache, blocklist, and overrides (concurrent read access)
- Make cache.lookup() and overrides.lookup() take &self (read-only)
- Eliminate 3 Vec allocations per DnsPacket::write() via inline filtering
- Pre-allocate domain strings with capacity 64 in parse path
- Add criterion micro-benchmarks (hot_path + throughput)
- Add bench README documenting both benchmark suites

Measured improvement: ~14% faster parsing, ~9% pipeline throughput,
round-trip cached 733ns → 698ns (~2.3M queries/sec).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Razvan Dimescu
2026-03-24 22:51:34 +02:00
parent 5d454cbed5
commit aed0e095e1
13 changed files with 729 additions and 77 deletions

186
benches/hot_path.rs Normal file
View File

@@ -0,0 +1,186 @@
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use std::net::Ipv4Addr;
use numa::buffer::BytePacketBuffer;
use numa::cache::DnsCache;
use numa::header::{DnsHeader, ResultCode};
use numa::packet::DnsPacket;
use numa::question::{DnsQuestion, QueryType};
use numa::record::DnsRecord;
fn make_response(domain: &str) -> DnsPacket {
let mut pkt = DnsPacket::new();
pkt.header = DnsHeader::new();
pkt.header.id = 0x1234;
pkt.header.response = true;
pkt.header.recursion_desired = true;
pkt.header.recursion_available = true;
pkt.header.rescode = ResultCode::NOERROR;
pkt.questions
.push(DnsQuestion::new(domain.to_string(), QueryType::A));
pkt.answers.push(DnsRecord::A {
domain: domain.to_string(),
addr: Ipv4Addr::new(93, 184, 216, 34),
ttl: 300,
});
// Typical response includes authority + additional records
pkt.authorities.push(DnsRecord::NS {
domain: domain.to_string(),
host: format!("ns1.{domain}"),
ttl: 172800,
});
pkt.authorities.push(DnsRecord::NS {
domain: domain.to_string(),
host: format!("ns2.{domain}"),
ttl: 172800,
});
pkt.resources.push(DnsRecord::A {
domain: format!("ns1.{domain}"),
addr: Ipv4Addr::new(198, 51, 100, 1),
ttl: 172800,
});
pkt
}
fn to_wire(pkt: &DnsPacket) -> Vec<u8> {
let mut buf = BytePacketBuffer::new();
pkt.write(&mut buf).unwrap();
buf.filled().to_vec()
}
fn bench_buffer_parse(c: &mut Criterion) {
let pkt = make_response("example.com");
let wire = to_wire(&pkt);
c.bench_function("buffer_parse", |b| {
b.iter(|| {
let mut buf = BytePacketBuffer::from_bytes(black_box(&wire));
DnsPacket::from_buffer(&mut buf).unwrap()
})
});
}
fn bench_buffer_serialize(c: &mut Criterion) {
let pkt = make_response("example.com");
c.bench_function("buffer_serialize", |b| {
b.iter(|| {
let mut buf = BytePacketBuffer::new();
black_box(&pkt).write(&mut buf).unwrap();
black_box(buf.pos());
})
});
}
fn bench_packet_clone(c: &mut Criterion) {
let pkt = make_response("example.com");
c.bench_function("packet_clone", |b| b.iter(|| black_box(&pkt).clone()));
}
fn bench_cache_lookup_hit(c: &mut Criterion) {
let mut cache = DnsCache::new(10_000, 60, 86400);
let pkt = make_response("example.com");
cache.insert("example.com", QueryType::A, &pkt);
c.bench_function("cache_lookup_hit", |b| {
b.iter(|| {
cache
.lookup(black_box("example.com"), QueryType::A)
.unwrap()
})
});
}
fn bench_cache_lookup_miss(c: &mut Criterion) {
let mut cache = DnsCache::new(10_000, 60, 86400);
c.bench_function("cache_lookup_miss", |b| {
b.iter(|| cache.lookup(black_box("nonexistent.com"), QueryType::A))
});
}
fn bench_cache_insert(c: &mut Criterion) {
let pkt = make_response("example.com");
c.bench_function("cache_insert", |b| {
let mut cache = DnsCache::new(10_000, 60, 86400);
let mut i = 0u64;
b.iter(|| {
let domain = format!("bench-{i}.example.com");
cache.insert(&domain, QueryType::A, black_box(&pkt));
i += 1;
// Reset cache periodically to avoid filling up
if i % 5000 == 0 {
cache.clear();
}
})
});
}
fn bench_round_trip(c: &mut Criterion) {
// Simulates the cached hot path: parse query → cache hit → serialize response
let query_pkt = {
let mut q = DnsPacket::new();
q.header.id = 0xABCD;
q.header.recursion_desired = true;
q.questions
.push(DnsQuestion::new("example.com".to_string(), QueryType::A));
q
};
let query_wire = to_wire(&query_pkt);
let response = make_response("example.com");
let mut cache = DnsCache::new(10_000, 60, 86400);
cache.insert("example.com", QueryType::A, &response);
c.bench_function("round_trip_cached", |b| {
b.iter(|| {
// 1. Parse incoming query
let mut buf = BytePacketBuffer::from_bytes(black_box(&query_wire));
let query = DnsPacket::from_buffer(&mut buf).unwrap();
let qname = &query.questions[0].name;
let qtype = query.questions[0].qtype;
// 2. Cache lookup
let mut resp = cache.lookup(qname, qtype).unwrap();
resp.header.id = query.header.id;
// 3. Serialize response
let mut resp_buf = BytePacketBuffer::new();
resp.write(&mut resp_buf).unwrap();
black_box(resp_buf.pos());
})
});
}
fn bench_cache_populated_lookup(c: &mut Criterion) {
// Benchmark with a realistically populated cache (1000 entries)
let mut cache = DnsCache::new(10_000, 60, 86400);
for i in 0..1000 {
let domain = format!("domain-{i}.example.com");
let pkt = make_response(&domain);
cache.insert(&domain, QueryType::A, &pkt);
}
c.bench_function("cache_lookup_hit_populated", |b| {
b.iter(|| {
cache
.lookup(black_box("domain-500.example.com"), QueryType::A)
.unwrap()
})
});
}
criterion_group!(
benches,
bench_buffer_parse,
bench_buffer_serialize,
bench_packet_clone,
bench_cache_lookup_hit,
bench_cache_lookup_miss,
bench_cache_insert,
bench_round_trip,
bench_cache_populated_lookup,
);
criterion_main!(benches);

94
benches/throughput.rs Normal file
View File

@@ -0,0 +1,94 @@
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
use std::net::Ipv4Addr;
use numa::buffer::BytePacketBuffer;
use numa::header::ResultCode;
use numa::packet::DnsPacket;
use numa::question::{DnsQuestion, QueryType};
use numa::record::DnsRecord;
fn make_query_wire(domain: &str) -> Vec<u8> {
let mut q = DnsPacket::new();
q.header.id = 0xABCD;
q.header.recursion_desired = true;
q.questions
.push(DnsQuestion::new(domain.to_string(), QueryType::A));
let mut buf = BytePacketBuffer::new();
q.write(&mut buf).unwrap();
buf.filled().to_vec()
}
fn make_response(domain: &str) -> DnsPacket {
let mut pkt = DnsPacket::new();
pkt.header.id = 0xABCD;
pkt.header.response = true;
pkt.header.recursion_desired = true;
pkt.header.recursion_available = true;
pkt.header.rescode = ResultCode::NOERROR;
pkt.questions
.push(DnsQuestion::new(domain.to_string(), QueryType::A));
pkt.answers.push(DnsRecord::A {
domain: domain.to_string(),
addr: Ipv4Addr::new(93, 184, 216, 34),
ttl: 300,
});
pkt
}
/// Simulates the complete cached query pipeline (sans network I/O):
/// parse → cache lookup → TTL adjust → serialize response
fn simulate_cached_pipeline(query_wire: &[u8], cache: &mut numa::cache::DnsCache) -> usize {
let mut buf = BytePacketBuffer::from_bytes(query_wire);
let query = DnsPacket::from_buffer(&mut buf).unwrap();
let q = &query.questions[0];
let mut resp = cache.lookup(&q.name, q.qtype).unwrap();
resp.header.id = query.header.id;
let mut resp_buf = BytePacketBuffer::new();
resp.write(&mut resp_buf).unwrap();
resp_buf.pos()
}
fn bench_pipeline_throughput(c: &mut Criterion) {
let domains: Vec<String> = (0..100)
.map(|i| format!("domain-{i}.example.com"))
.collect();
let mut cache = numa::cache::DnsCache::new(10_000, 60, 86400);
for d in &domains {
cache.insert(d, QueryType::A, &make_response(d));
}
let query_wires: Vec<Vec<u8>> = domains.iter().map(|d| make_query_wire(d)).collect();
let mut group = c.benchmark_group("pipeline_throughput");
for count in [1, 10, 100] {
group.throughput(Throughput::Elements(count));
group.bench_with_input(BenchmarkId::from_parameter(count), &count, |b, &count| {
let mut idx = 0usize;
b.iter(|| {
for _ in 0..count {
let wire = &query_wires[idx % query_wires.len()];
simulate_cached_pipeline(wire, &mut cache);
idx += 1;
}
});
});
}
group.finish();
}
/// Measures the overhead of BytePacketBuffer allocation + zero-init
fn bench_buffer_alloc(c: &mut Criterion) {
c.bench_function("buffer_alloc", |b| {
b.iter(|| {
let buf = BytePacketBuffer::new();
criterion::black_box(buf.pos());
})
});
}
criterion_group!(benches, bench_pipeline_throughput, bench_buffer_alloc,);
criterion_main!(benches);