Files
wifi-densepose/crates/ruQu/benches/memory_bench.rs
ruv d803bfe2b1 Squashed 'vendor/ruvector/' content from commit b64c2172
git-subtree-dir: vendor/ruvector
git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
2026-02-28 14:39:40 -05:00

620 lines
19 KiB
Rust

//! Memory efficiency benchmarks for ruQu Coherence Gate.
//!
//! Memory Targets:
//! - Per-tile memory usage: **<64KB**
//! - Allocation counts per cycle: **0 (steady state)**
//! - Cache line efficiency: **>80%**
//!
//! Run with: `cargo bench -p ruqu --bench memory_bench`
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
use std::alloc::{GlobalAlloc, Layout, System};
use std::sync::atomic::{AtomicUsize, Ordering};
use ruqu::filters::{FilterConfig, FilterPipeline, ShiftFilter, StructuralFilter};
use ruqu::syndrome::{DetectorBitmap, SyndromeBuffer, SyndromeRound};
use ruqu::tile::{
EvidenceAccumulator, GateThresholds, LocalCutState, PatchGraph, ReceiptLog, SyndromBuffer,
SyndromeDelta, TileReport, TileZero, WorkerTile,
};
// ============================================================================
// ALLOCATION TRACKING ALLOCATOR
// ============================================================================
/// Global allocation counter for tracking allocations
static ALLOC_COUNT: AtomicUsize = AtomicUsize::new(0);
static DEALLOC_COUNT: AtomicUsize = AtomicUsize::new(0);
static BYTES_ALLOCATED: AtomicUsize = AtomicUsize::new(0);
static BYTES_DEALLOCATED: AtomicUsize = AtomicUsize::new(0);

/// Counting allocator: delegates to the system allocator while recording
/// every allocation/deallocation event in the atomic counters above.
///
/// Previously the counters existed but no allocator was registered, so they
/// were never updated and `get_allocation_stats` always reported zeros.
struct CountingAllocator;

unsafe impl GlobalAlloc for CountingAllocator {
    unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
        let ptr = System.alloc(layout);
        // Only count successful allocations.
        if !ptr.is_null() {
            ALLOC_COUNT.fetch_add(1, Ordering::SeqCst);
            BYTES_ALLOCATED.fetch_add(layout.size(), Ordering::SeqCst);
        }
        ptr
    }

    unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
        DEALLOC_COUNT.fetch_add(1, Ordering::SeqCst);
        BYTES_DEALLOCATED.fetch_add(layout.size(), Ordering::SeqCst);
        System.dealloc(ptr, layout);
    }
}

/// Route every heap allocation in this benchmark binary through the counting
/// wrapper so the "0 allocations in steady state" target can be observed.
#[global_allocator]
static GLOBAL_ALLOCATOR: CountingAllocator = CountingAllocator;

/// Reset all allocation counters to zero (call before a measured section).
#[allow(dead_code)] // available for ad-hoc inspection from any bench
fn reset_allocation_counters() {
    ALLOC_COUNT.store(0, Ordering::SeqCst);
    DEALLOC_COUNT.store(0, Ordering::SeqCst);
    BYTES_ALLOCATED.store(0, Ordering::SeqCst);
    BYTES_DEALLOCATED.store(0, Ordering::SeqCst);
}

/// Get allocation statistics as
/// `(alloc_count, dealloc_count, bytes_allocated, bytes_deallocated)`.
#[allow(dead_code)] // available for ad-hoc inspection from any bench
fn get_allocation_stats() -> (usize, usize, usize, usize) {
    (
        ALLOC_COUNT.load(Ordering::SeqCst),
        DEALLOC_COUNT.load(Ordering::SeqCst),
        BYTES_ALLOCATED.load(Ordering::SeqCst),
        BYTES_DEALLOCATED.load(Ordering::SeqCst),
    )
}
// ============================================================================
// SIZE VERIFICATION BENCHMARKS
// ============================================================================
/// Benchmark and verify structure sizes
fn bench_structure_sizes(c: &mut Criterion) {
    let mut group = c.benchmark_group("structure_sizes");

    // Informational size report, printed once (not timed). Table-driven so
    // adding a type is a one-line change.
    let sizes = [
        ("WorkerTile", std::mem::size_of::<WorkerTile>()),
        ("PatchGraph", std::mem::size_of::<PatchGraph>()),
        ("SyndromBuffer", std::mem::size_of::<SyndromBuffer>()),
        ("EvidenceAccumulator", std::mem::size_of::<EvidenceAccumulator>()),
        ("LocalCutState", std::mem::size_of::<LocalCutState>()),
        ("TileReport", std::mem::size_of::<TileReport>()),
        ("DetectorBitmap", std::mem::size_of::<DetectorBitmap>()),
        ("SyndromeRound", std::mem::size_of::<SyndromeRound>()),
        ("SyndromeDelta", std::mem::size_of::<SyndromeDelta>()),
    ];
    println!("\n=== Structure Sizes ===");
    for (name, bytes) in sizes.iter() {
        println!("{}: {} bytes", name, bytes);
    }
    println!();

    // Verify 64KB budget
    let tile_bytes = std::mem::size_of::<WorkerTile>();
    let budget = 65536; // 64KB
    println!(
        "WorkerTile size: {} bytes ({:.1}% of 64KB budget)",
        tile_bytes,
        (tile_bytes as f64 / budget as f64) * 100.0
    );

    // Timed size_of computations — keeps the constants observable so the
    // compiler cannot fold the work away.
    group.bench_function("size_of_worker_tile", |b| {
        b.iter(|| black_box(std::mem::size_of::<WorkerTile>()))
    });
    group.bench_function("size_of_patch_graph", |b| {
        b.iter(|| black_box(std::mem::size_of::<PatchGraph>()))
    });
    group.bench_function("size_of_tile_report", |b| {
        b.iter(|| black_box(std::mem::size_of::<TileReport>()))
    });
    group.finish();
}
// ============================================================================
// PER-TILE MEMORY USAGE
// ============================================================================
/// Benchmark per-tile memory usage
fn bench_per_tile_memory(c: &mut Criterion) {
    let mut group = c.benchmark_group("per_tile_memory");

    // Hard gate before timing anything: the tile must fit the padded ceiling.
    let worker_tile_size = std::mem::size_of::<WorkerTile>();
    assert!(
        worker_tile_size <= 131072, // 128KB max (some padding allowed)
        "WorkerTile exceeds memory budget: {} bytes",
        worker_tile_size
    );

    // WorkerTile is large; this times the construction overhead of a fresh tile.
    group.bench_function("create_worker_tile", |b| {
        b.iter(|| {
            let fresh = WorkerTile::new(1);
            black_box(&fresh);
        });
    });

    // Reset of a populated tile — should be allocation-free.
    group.bench_function("reset_worker_tile", |b| {
        let mut warm = WorkerTile::new(1);
        // Give reset some real state to clear.
        for v in 0..50u16 {
            let _ = warm.patch_graph.add_edge(v, v + 1, 1000);
        }
        b.iter(|| {
            warm.reset();
            black_box(&warm);
        });
    });

    // PatchGraph construction + size observation.
    group.bench_function("patch_graph_memory", |b| {
        b.iter(|| {
            let g = PatchGraph::new();
            black_box(&g);
            black_box(std::mem::size_of_val(&g));
        });
    });

    // SyndromBuffer construction + size observation.
    group.bench_function("syndrom_buffer_memory", |b| {
        b.iter(|| {
            let buf = SyndromBuffer::new();
            black_box(&buf);
            black_box(std::mem::size_of_val(&buf));
        });
    });
    group.finish();
}
// ============================================================================
// ALLOCATION-FREE OPERATIONS
// ============================================================================
/// Benchmark operations that should be allocation-free in steady state
fn bench_allocation_free_ops(c: &mut Criterion) {
    let mut group = c.benchmark_group("allocation_free");

    // A steady-state worker tick must not touch the heap.
    group.bench_function("worker_tick_no_alloc", |b| {
        let mut wt = WorkerTile::new(1);
        for v in 0..50u16 {
            let _ = wt.patch_graph.add_edge(v, v + 1, 1000);
        }
        wt.patch_graph.recompute_components();
        let delta = SyndromeDelta::new(0, 1, 100);
        b.iter(|| black_box(wt.tick(&delta)));
    });

    // Cut estimation and boundary scan write into stack storage only.
    group.bench_function("patch_graph_ops_no_alloc", |b| {
        let mut ring = PatchGraph::new();
        for v in 0..100u16 {
            let _ = ring.add_edge(v, (v + 1) % 100, 1000);
        }
        ring.recompute_components();
        b.iter(|| {
            let cut = ring.estimate_local_cut();
            let mut scratch = [0u16; 64];
            let found = ring.identify_boundary_candidates(&mut scratch);
            black_box((cut, found));
        });
    });

    // XOR + popcount over fixed-width bitmaps.
    group.bench_function("bitmap_ops_no_alloc", |b| {
        let mut lhs = DetectorBitmap::new(1024);
        let mut rhs = DetectorBitmap::new(1024);
        for bit in (0..512).step_by(2) {
            lhs.set(bit, true);
        }
        for bit in (256..768).step_by(2) {
            rhs.set(bit, true);
        }
        b.iter(|| {
            let diff = lhs.xor(&rhs);
            black_box(diff.popcount());
        });
    });

    // TileReport is plain-old-data; a copy is a stack memcpy.
    group.bench_function("tile_report_copy_no_alloc", |b| {
        let mut template = TileReport::new(1);
        template.local_cut = 10.0;
        template.shift_score = 0.1;
        template.e_value = 200.0;
        b.iter(|| black_box(template));
    });

    // Evidence accumulation updates in-place state only.
    group.bench_function("evidence_update_no_alloc", |b| {
        let mut acc = EvidenceAccumulator::new();
        b.iter(|| {
            acc.observe(1000);
            black_box(acc.e_value());
        });
    });

    // LocalCutState refresh reads the graph, writes fixed-size state.
    group.bench_function("local_cut_update_no_alloc", |b| {
        let mut ring = PatchGraph::new();
        for v in 0..100u16 {
            let _ = ring.add_edge(v, (v + 1) % 100, 1000);
        }
        ring.recompute_components();
        let mut cut = LocalCutState::new();
        b.iter(|| {
            cut.update_from_graph(&ring);
            black_box(&cut);
        });
    });
    group.finish();
}
// ============================================================================
// CACHE LINE EFFICIENCY
// ============================================================================
/// Benchmark cache line efficiency
///
/// Prints alignment information for the hot structures, then times four
/// access patterns over pre-built data. The indexed loops below are
/// intentional: the access *pattern* (sequential vs. strided vs. word-level)
/// is the artifact under measurement, so they are left in indexed form
/// rather than converted to iterators.
fn bench_cache_efficiency(c: &mut Criterion) {
    let mut group = c.benchmark_group("cache_efficiency");
    // 64-byte lines are standard on x86-64 and most aarch64 cores.
    const CACHE_LINE_SIZE: usize = 64;
    // Informational alignment report (printed once, not timed).
    println!("\n=== Cache Line Alignment ===");
    println!(
        "TileReport alignment: {} bytes (cache line: {})",
        std::mem::align_of::<TileReport>(),
        CACHE_LINE_SIZE
    );
    println!(
        "PatchGraph alignment: {} bytes",
        std::mem::align_of::<PatchGraph>()
    );
    println!(
        "SyndromBuffer alignment: {} bytes",
        std::mem::align_of::<SyndromBuffer>()
    );
    println!(
        "DetectorBitmap alignment: {} bytes",
        std::mem::align_of::<DetectorBitmap>()
    );
    println!();
    // Sequential access pattern (cache-friendly)
    group.bench_function("sequential_access", |b| {
        let mut graph = PatchGraph::new();
        for i in 0..200u16 {
            graph.ensure_vertex(i);
        }
        b.iter(|| {
            let mut sum = 0u32;
            // Walk vertices in order; adjacent elements share cache lines.
            for i in 0..200 {
                if graph.vertices[i].is_active() {
                    sum += graph.vertices[i].degree as u32;
                }
            }
            black_box(sum);
        });
    });
    // Strided access pattern (potential cache misses)
    group.bench_function("strided_access", |b| {
        let mut graph = PatchGraph::new();
        for i in 0..200u16 {
            graph.ensure_vertex(i);
        }
        b.iter(|| {
            let mut sum = 0u32;
            // Access every 8th element (stride across multiple cache lines)
            for i in (0..200).step_by(8) {
                if graph.vertices[i].is_active() {
                    sum += graph.vertices[i].degree as u32;
                }
            }
            black_box(sum);
        });
    });
    // TileReport array access (should be cache-line aligned)
    group.bench_function("tile_report_array_access", |b| {
        let reports: Vec<TileReport> = (1..=255)
            .map(|i| {
                let mut r = TileReport::new(i);
                r.local_cut = i as f64;
                r
            })
            .collect();
        b.iter(|| {
            let mut sum = 0.0f64;
            // Linear scan of one f64 field per report.
            for report in &reports {
                sum += report.local_cut;
            }
            black_box(sum);
        });
    });
    // DetectorBitmap word access (should be aligned)
    group.bench_function("bitmap_word_access", |b| {
        let mut bitmap = DetectorBitmap::new(1024);
        for i in (0..1024).step_by(3) {
            bitmap.set(i, true);
        }
        b.iter(|| {
            // Sum the raw u64 words; wrapping_add keeps overflow defined.
            let raw = bitmap.raw_bits();
            let mut sum = 0u64;
            for word in raw {
                sum = sum.wrapping_add(*word);
            }
            black_box(sum);
        });
    });
    group.finish();
}
// ============================================================================
// MEMORY POOL SIMULATION
// ============================================================================
/// Benchmark simulated memory pool operations
fn bench_memory_pool(c: &mut Criterion) {
    let mut group = c.benchmark_group("memory_pool");

    // A fixed pool of worker tiles is ticked repeatedly; no tile is ever
    // created or dropped inside the measured loop.
    group.bench_function("tile_pool_reuse", |b| {
        let mut pool: Vec<WorkerTile> = (1..=10).map(WorkerTile::new).collect();
        let delta = SyndromeDelta::new(0, 1, 100);
        b.iter(|| {
            for tile in pool.iter_mut() {
                let report = tile.tick(&delta);
                black_box(&report);
            }
        });
    });

    // A fixed report array is overwritten in place each iteration.
    group.bench_function("report_buffer_reuse", |b| {
        let mut reports: [TileReport; 255] = [TileReport::default(); 255];
        b.iter(|| {
            for (idx, slot) in reports.iter_mut().enumerate() {
                slot.tile_id = idx as u8;
                slot.local_cut = 10.0;
                slot.shift_score = 0.1;
                slot.e_value = 200.0;
            }
            black_box(&reports);
        });
    });

    // Pushes into a pre-filled ring recycle existing slots.
    group.bench_function("syndrome_round_reuse", |b| {
        let mut ring = SyndromeBuffer::new(1024);
        let mut next_id = 0u64;
        // Fill to capacity first so subsequent pushes reuse buffer space.
        for seq in 0..1024 {
            ring.push(SyndromeRound::new(seq, seq, seq * 1000, DetectorBitmap::new(64), 0));
        }
        b.iter(|| {
            for _ in 0..100 {
                ring.push(SyndromeRound::new(
                    next_id,
                    next_id,
                    next_id * 1000,
                    DetectorBitmap::new(64),
                    0,
                ));
                next_id += 1;
            }
            black_box(&ring);
        });
    });
    group.finish();
}
// ============================================================================
// HEAP ALLOCATION BENCHMARKS
// ============================================================================
/// Benchmark operations that require heap allocation
fn bench_heap_allocations(c: &mut Criterion) {
    let mut group = c.benchmark_group("heap_allocations");

    // Pipeline construction allocates its internal collections.
    group.bench_function("filter_pipeline_create", |b| {
        b.iter(|| black_box(FilterPipeline::new(FilterConfig::default())))
    });

    // TileZero construction allocates on the heap.
    group.bench_function("tilezero_create", |b| {
        b.iter(|| black_box(TileZero::new(GateThresholds::default())))
    });

    // Appending receipts grows the log's backing storage; a fresh log per
    // batch isolates growth cost from steady-state appends.
    group.bench_function("receipt_log_grow", |b| {
        b.iter_batched(
            ReceiptLog::new,
            |mut log| {
                for seq in 0..100 {
                    log.append(ruqu::tile::GateDecision::Permit, seq, seq * 1000, [0u8; 32]);
                }
                black_box(&log);
            },
            criterion::BatchSize::SmallInput,
        );
    });

    // Baseline buffer construction at the default capacity.
    group.bench_function("syndrome_buffer_create", |b| {
        b.iter(|| black_box(SyndromeBuffer::new(1024)))
    });

    // Scaling behaviour across larger buffer capacities.
    for &capacity in &[1024, 4096, 16384, 65536] {
        group.bench_with_input(
            BenchmarkId::new("syndrome_buffer_create", capacity),
            &capacity,
            |b, &cap| b.iter(|| black_box(SyndromeBuffer::new(cap))),
        );
    }
    group.finish();
}
// ============================================================================
// MEMORY BANDWIDTH BENCHMARKS
// ============================================================================
/// Benchmark memory bandwidth operations
fn bench_memory_bandwidth(c: &mut Criterion) {
    let mut group = c.benchmark_group("memory_bandwidth");

    // Bulk copy of a full 255-tile report set.
    group.throughput(Throughput::Bytes(
        255 * std::mem::size_of::<TileReport>() as u64,
    ));
    group.bench_function("copy_255_reports", |b| {
        let originals: Vec<TileReport> = (1..=255).map(TileReport::new).collect();
        b.iter(|| black_box(originals.clone()));
    });

    // Single bitmap copy (stack memcpy of a Copy type).
    group.throughput(Throughput::Bytes(
        std::mem::size_of::<DetectorBitmap>() as u64
    ));
    group.bench_function("copy_bitmap", |b| {
        let mut src = DetectorBitmap::new(1024);
        for bit in 0..512 {
            src.set(bit, true);
        }
        b.iter(|| black_box(src));
    });

    // Batch copy of 100 bitmaps through Vec::clone.
    group.throughput(Throughput::Bytes(
        100 * std::mem::size_of::<DetectorBitmap>() as u64,
    ));
    group.bench_function("copy_100_bitmaps", |b| {
        let batch: Vec<DetectorBitmap> = (0..100)
            .map(|k| {
                let mut bm = DetectorBitmap::new(1024);
                bm.set(k * 10, true);
                bm
            })
            .collect();
        b.iter(|| black_box(batch.clone()));
    });

    // Single syndrome-round clone (includes its detector bitmap).
    group.throughput(Throughput::Bytes(
        std::mem::size_of::<SyndromeRound>() as u64
    ));
    group.bench_function("copy_syndrome_round", |b| {
        let mut hits = DetectorBitmap::new(256);
        for k in 0..25 {
            hits.set(k * 10, true);
        }
        let template = SyndromeRound::new(12345, 100, 1000000, hits, 0);
        b.iter(|| black_box(template.clone()));
    });
    group.finish();
}
// ============================================================================
// CRITERION GROUPS
// ============================================================================
// Register all memory benchmark groups and generate the harness entry point
// (criterion_main! expands to the binary's `fn main`).
criterion_group!(
    memory_benches,
    bench_structure_sizes,
    bench_per_tile_memory,
    bench_allocation_free_ops,
    bench_cache_efficiency,
    bench_memory_pool,
    bench_heap_allocations,
    bench_memory_bandwidth,
);
criterion_main!(memory_benches);