648 lines
20 KiB
Rust
648 lines
20 KiB
Rust
//! Comprehensive benchmarks for cognitum-gate-kernel
|
|
//!
|
|
//! Target latencies:
|
|
//! - Single edge insert: < 100ns
|
|
//! - Batch 1000 edges: < 100us
|
|
//! - Single tick: < 500us
|
|
//! - Tick under 10K edges: < 5ms
|
|
//! - TileReport serialization: < 1us
|
|
//! - E-value update: < 50ns
|
|
//! - Mixture e-value (SIMD): < 500ns for 16 hypotheses
|
|
|
|
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
|
|
|
|
use cognitum_gate_kernel::{
|
|
delta::{Delta, Observation},
|
|
evidence::{
|
|
f32_to_log_e, EvidenceAccumulator, HypothesisState, LogEValue, LOG_LR_CONNECTIVITY_POS,
|
|
},
|
|
report::TileReport,
|
|
shard::{CompactGraph, MAX_SHARD_VERTICES},
|
|
TileState, MAX_DELTA_BUFFER,
|
|
};
|
|
|
|
// ============================================================================
|
|
// Edge Operations Benchmarks
|
|
// ============================================================================
|
|
|
|
/// Benchmark single edge insertion
|
|
fn bench_edge_insert(c: &mut Criterion) {
|
|
let mut group = c.benchmark_group("edge_operations");
|
|
group.throughput(Throughput::Elements(1));
|
|
|
|
// Benchmark on empty graph
|
|
group.bench_function("insert_single_empty", |b| {
|
|
b.iter_batched(
|
|
CompactGraph::new,
|
|
|mut graph| {
|
|
black_box(graph.add_edge(0, 1, 100));
|
|
graph
|
|
},
|
|
criterion::BatchSize::SmallInput,
|
|
)
|
|
});
|
|
|
|
// Benchmark on partially filled graph
|
|
group.bench_function("insert_single_partial", |b| {
|
|
b.iter_batched(
|
|
|| {
|
|
let mut graph = CompactGraph::new();
|
|
for i in 0..100u16 {
|
|
graph.add_edge(i, i + 1, 100);
|
|
}
|
|
graph
|
|
},
|
|
|mut graph| {
|
|
black_box(graph.add_edge(200, 201, 100));
|
|
graph
|
|
},
|
|
criterion::BatchSize::SmallInput,
|
|
)
|
|
});
|
|
|
|
// Benchmark edge removal
|
|
group.bench_function("remove_single", |b| {
|
|
b.iter_batched(
|
|
|| {
|
|
let mut graph = CompactGraph::new();
|
|
graph.add_edge(0, 1, 100);
|
|
graph.add_edge(1, 2, 100);
|
|
graph.add_edge(2, 3, 100);
|
|
graph
|
|
},
|
|
|mut graph| {
|
|
black_box(graph.remove_edge(1, 2));
|
|
graph
|
|
},
|
|
criterion::BatchSize::SmallInput,
|
|
)
|
|
});
|
|
|
|
// Benchmark edge lookup
|
|
group.bench_function("find_edge", |b| {
|
|
let mut graph = CompactGraph::new();
|
|
for i in 0..200u16 {
|
|
graph.add_edge(i, i + 1, 100);
|
|
}
|
|
b.iter(|| black_box(graph.find_edge(100, 101)))
|
|
});
|
|
|
|
// Benchmark weight update
|
|
group.bench_function("update_weight", |b| {
|
|
let mut graph = CompactGraph::new();
|
|
for i in 0..100u16 {
|
|
graph.add_edge(i, i + 1, 100);
|
|
}
|
|
b.iter(|| {
|
|
black_box(graph.update_weight(50, 51, 200));
|
|
})
|
|
});
|
|
|
|
group.finish();
|
|
}
|
|
|
|
/// Benchmark batch edge insertion (1000 edges)
|
|
fn bench_edge_batch(c: &mut Criterion) {
|
|
let mut group = c.benchmark_group("edge_batch");
|
|
|
|
for batch_size in [100, 500, 1000] {
|
|
group.throughput(Throughput::Elements(batch_size as u64));
|
|
|
|
group.bench_with_input(
|
|
BenchmarkId::new("insert_batch", batch_size),
|
|
&batch_size,
|
|
|b, &size| {
|
|
b.iter_batched(
|
|
CompactGraph::new,
|
|
|mut graph| {
|
|
for i in 0..size as u16 {
|
|
// Use modular arithmetic to create varied edges within bounds
|
|
let src = i % 200;
|
|
let dst = (i % 200) + 1;
|
|
graph.add_edge(src, dst, 100);
|
|
}
|
|
black_box(graph)
|
|
},
|
|
criterion::BatchSize::SmallInput,
|
|
)
|
|
},
|
|
);
|
|
}
|
|
|
|
// Benchmark batch with recompute_components
|
|
group.bench_function("batch_1000_with_components", |b| {
|
|
b.iter_batched(
|
|
CompactGraph::new,
|
|
|mut graph| {
|
|
for i in 0..500u16 {
|
|
let src = i % 200;
|
|
let dst = (i % 200) + 1;
|
|
graph.add_edge(src, dst, 100);
|
|
}
|
|
graph.recompute_components();
|
|
black_box(graph)
|
|
},
|
|
criterion::BatchSize::SmallInput,
|
|
)
|
|
});
|
|
|
|
group.finish();
|
|
}
|
|
|
|
// ============================================================================
|
|
// Tick Cycle Benchmarks
|
|
// ============================================================================
|
|
|
|
/// Benchmark single tick cycle
|
|
fn bench_tick(c: &mut Criterion) {
|
|
let mut group = c.benchmark_group("tick_cycle");
|
|
group.throughput(Throughput::Elements(1));
|
|
|
|
// Empty tick (no deltas)
|
|
group.bench_function("tick_empty", |b| {
|
|
let mut tile = TileState::new(0);
|
|
b.iter(|| black_box(tile.tick(black_box(1))))
|
|
});
|
|
|
|
// Tick with small graph
|
|
group.bench_function("tick_small_graph", |b| {
|
|
let mut tile = TileState::new(0);
|
|
// Add some edges
|
|
for i in 0..10u16 {
|
|
tile.ingest_delta(&Delta::edge_add(i, i + 1, 100));
|
|
}
|
|
tile.tick(0); // Initial tick to process deltas
|
|
|
|
b.iter(|| black_box(tile.tick(black_box(1))))
|
|
});
|
|
|
|
// Tick with pending deltas
|
|
group.bench_function("tick_with_deltas", |b| {
|
|
b.iter_batched(
|
|
|| {
|
|
let mut tile = TileState::new(0);
|
|
for i in 0..10u16 {
|
|
tile.ingest_delta(&Delta::edge_add(i, i + 1, 100));
|
|
}
|
|
tile
|
|
},
|
|
|mut tile| black_box(tile.tick(1)),
|
|
criterion::BatchSize::SmallInput,
|
|
)
|
|
});
|
|
|
|
// Tick with observations
|
|
group.bench_function("tick_with_observations", |b| {
|
|
b.iter_batched(
|
|
|| {
|
|
let mut tile = TileState::new(0);
|
|
tile.evidence.add_connectivity_hypothesis(5);
|
|
for _ in 0..5 {
|
|
let obs = Observation::connectivity(5, true);
|
|
tile.ingest_delta(&Delta::observation(obs));
|
|
}
|
|
tile
|
|
},
|
|
|mut tile| black_box(tile.tick(1)),
|
|
criterion::BatchSize::SmallInput,
|
|
)
|
|
});
|
|
|
|
group.finish();
|
|
}
|
|
|
|
/// Benchmark tick under heavy load (10K edges simulated via max graph)
|
|
fn bench_tick_under_load(c: &mut Criterion) {
|
|
let mut group = c.benchmark_group("tick_under_load");
|
|
group.sample_size(50); // Reduce sample size for expensive benchmarks
|
|
|
|
// Create a densely connected graph (approaching limits)
|
|
for edge_count in [500, 800, 1000] {
|
|
group.throughput(Throughput::Elements(edge_count as u64));
|
|
|
|
group.bench_with_input(
|
|
BenchmarkId::new("edges", edge_count),
|
|
&edge_count,
|
|
|b, &count| {
|
|
b.iter_batched(
|
|
|| {
|
|
let mut tile = TileState::new(0);
|
|
// Create a connected graph
|
|
for i in 0..count.min(1000) as u16 {
|
|
let src = i % 250;
|
|
let dst = (i + 1) % 250;
|
|
if src != dst {
|
|
tile.ingest_delta(&Delta::edge_add(src, dst, 100));
|
|
}
|
|
}
|
|
tile.tick(0); // Process initial deltas
|
|
|
|
// Add some pending work
|
|
tile.ingest_delta(&Delta::edge_add(0, 100, 150));
|
|
tile.ingest_delta(&Delta::observation(Observation::connectivity(0, true)));
|
|
tile
|
|
},
|
|
|mut tile| black_box(tile.tick(1)),
|
|
criterion::BatchSize::SmallInput,
|
|
)
|
|
},
|
|
);
|
|
}
|
|
|
|
// Benchmark connected components recomputation at scale
|
|
group.bench_function("recompute_components_800", |b| {
|
|
b.iter_batched(
|
|
|| {
|
|
let mut graph = CompactGraph::new();
|
|
// Create 4 disconnected clusters of 50 nodes each
|
|
for cluster in 0..4u16 {
|
|
let base = cluster * 60;
|
|
for i in 0..50u16 {
|
|
graph.add_edge(base + i, base + (i + 1) % 50, 100);
|
|
}
|
|
}
|
|
graph
|
|
},
|
|
|mut graph| {
|
|
black_box(graph.recompute_components());
|
|
graph
|
|
},
|
|
criterion::BatchSize::SmallInput,
|
|
)
|
|
});
|
|
|
|
group.finish();
|
|
}
|
|
|
|
// ============================================================================
|
|
// Report Serialization Benchmarks
|
|
// ============================================================================
|
|
|
|
/// Benchmark TileReport serialization
|
|
fn bench_report_serialize(c: &mut Criterion) {
|
|
let mut group = c.benchmark_group("report_serialization");
|
|
group.throughput(Throughput::Elements(1));
|
|
|
|
// Create a populated tile report
|
|
let create_report = || {
|
|
let mut tile = TileState::new(42);
|
|
for i in 0..20u16 {
|
|
tile.ingest_delta(&Delta::edge_add(i, i + 1, 100));
|
|
}
|
|
tile.tick(1)
|
|
};
|
|
|
|
let report = create_report();
|
|
|
|
// Raw memory copy (baseline)
|
|
group.bench_function("raw_copy_64_bytes", |b| {
|
|
let report = create_report();
|
|
b.iter(|| {
|
|
let mut buffer = [0u8; 64];
|
|
unsafe {
|
|
let src = &report as *const TileReport as *const u8;
|
|
core::ptr::copy_nonoverlapping(src, buffer.as_mut_ptr(), 64);
|
|
}
|
|
black_box(buffer)
|
|
})
|
|
});
|
|
|
|
// Report creation from scratch
|
|
group.bench_function("create_new", |b| {
|
|
b.iter(|| black_box(TileReport::new(black_box(42))))
|
|
});
|
|
|
|
// Report field access patterns
|
|
group.bench_function("access_witness", |b| {
|
|
b.iter(|| black_box(report.get_witness()))
|
|
});
|
|
|
|
group.bench_function("access_connected", |b| {
|
|
b.iter(|| black_box(report.is_connected()))
|
|
});
|
|
|
|
group.bench_function("e_value_approx", |b| {
|
|
b.iter(|| black_box(report.e_value_approx()))
|
|
});
|
|
|
|
group.finish();
|
|
}
|
|
|
|
// ============================================================================
|
|
// E-Value Computation Benchmarks
|
|
// ============================================================================
|
|
|
|
/// Benchmark e-value accumulator update
|
|
fn bench_evalue_update(c: &mut Criterion) {
|
|
let mut group = c.benchmark_group("evalue_update");
|
|
group.throughput(Throughput::Elements(1));
|
|
|
|
// Single hypothesis update
|
|
group.bench_function("hypothesis_update_f32", |b| {
|
|
let mut hyp = HypothesisState::new(0, HypothesisState::TYPE_CONNECTIVITY);
|
|
b.iter(|| black_box(hyp.update(black_box(1.5))))
|
|
});
|
|
|
|
// Update with pre-computed log LR (faster path)
|
|
group.bench_function("hypothesis_update_log_lr", |b| {
|
|
let mut hyp = HypothesisState::new(0, HypothesisState::TYPE_CONNECTIVITY);
|
|
b.iter(|| black_box(hyp.update_with_log_lr(black_box(LOG_LR_CONNECTIVITY_POS))))
|
|
});
|
|
|
|
// f32 to log conversion
|
|
group.bench_function("f32_to_log_e", |b| {
|
|
b.iter(|| black_box(f32_to_log_e(black_box(1.5))))
|
|
});
|
|
|
|
// f32 to log with common value (fast path)
|
|
group.bench_function("f32_to_log_e_fast_path", |b| {
|
|
b.iter(|| black_box(f32_to_log_e(black_box(2.0))))
|
|
});
|
|
|
|
// Full accumulator observation processing
|
|
group.bench_function("accumulator_process_obs", |b| {
|
|
let mut acc = EvidenceAccumulator::new();
|
|
acc.add_connectivity_hypothesis(5);
|
|
let obs = Observation::connectivity(5, true);
|
|
|
|
b.iter(|| {
|
|
acc.process_observation(black_box(obs), black_box(1));
|
|
})
|
|
});
|
|
|
|
// Multiple hypotheses
|
|
for hyp_count in [1, 4, 8, 16] {
|
|
group.bench_with_input(
|
|
BenchmarkId::new("process_obs_hypotheses", hyp_count),
|
|
&hyp_count,
|
|
|b, &count| {
|
|
let mut acc = EvidenceAccumulator::new();
|
|
for v in 0..count as u16 {
|
|
acc.add_connectivity_hypothesis(v);
|
|
}
|
|
let obs = Observation::connectivity(0, true);
|
|
|
|
b.iter(|| {
|
|
acc.process_observation(black_box(obs), black_box(1));
|
|
})
|
|
},
|
|
);
|
|
}
|
|
|
|
group.finish();
|
|
}
|
|
|
|
/// Benchmark mixture e-value computation (potential SIMD opportunity)
|
|
fn bench_mixture_evalue(c: &mut Criterion) {
|
|
let mut group = c.benchmark_group("mixture_evalue");
|
|
|
|
// Simulated mixture: aggregate multiple log e-values
|
|
// This is where SIMD can provide significant speedup
|
|
|
|
// Scalar baseline
|
|
group.bench_function("aggregate_16_scalar", |b| {
|
|
let log_e_values: [LogEValue; 16] = [
|
|
65536, 38550, -65536, 65536, 38550, 65536, 38550, -32768, 65536, 65536, 38550, -65536,
|
|
65536, 38550, 65536, 38550,
|
|
];
|
|
|
|
b.iter(|| {
|
|
let sum: LogEValue = log_e_values.iter().copied().sum();
|
|
black_box(sum)
|
|
})
|
|
});
|
|
|
|
// Parallel lanes pattern (SIMD-friendly)
|
|
group.bench_function("aggregate_16_parallel_lanes", |b| {
|
|
let log_e_values: [LogEValue; 16] = [
|
|
65536, 38550, -65536, 65536, 38550, 65536, 38550, -32768, 65536, 65536, 38550, -65536,
|
|
65536, 38550, 65536, 38550,
|
|
];
|
|
|
|
b.iter(|| {
|
|
// Process in 4 lanes (potential SIMD with 128-bit registers)
|
|
let mut lanes = [0i32; 4];
|
|
for (i, &val) in log_e_values.iter().enumerate() {
|
|
lanes[i % 4] = lanes[i % 4].saturating_add(val);
|
|
}
|
|
let sum = lanes.iter().sum::<i32>();
|
|
black_box(sum)
|
|
})
|
|
});
|
|
|
|
// Chunked processing (auto-vectorization friendly)
|
|
group.bench_function("aggregate_16_chunked", |b| {
|
|
let log_e_values: [LogEValue; 16] = [
|
|
65536, 38550, -65536, 65536, 38550, 65536, 38550, -32768, 65536, 65536, 38550, -65536,
|
|
65536, 38550, 65536, 38550,
|
|
];
|
|
|
|
b.iter(|| {
|
|
let mut total = 0i32;
|
|
for chunk in log_e_values.chunks(4) {
|
|
let chunk_sum: i32 = chunk.iter().copied().sum();
|
|
total = total.saturating_add(chunk_sum);
|
|
}
|
|
black_box(total)
|
|
})
|
|
});
|
|
|
|
// Scale to 255 tiles (realistic workload)
|
|
group.bench_function("aggregate_255_tiles", |b| {
|
|
let log_e_values: Vec<LogEValue> = (0..255)
|
|
.map(|i| (i as i32 % 3 - 1) * 65536) // Varying positive/negative evidence
|
|
.collect();
|
|
|
|
b.iter(|| {
|
|
let sum: i64 = log_e_values.iter().map(|&v| v as i64).sum();
|
|
black_box(sum)
|
|
})
|
|
});
|
|
|
|
// Mixture with product (exp-log pattern)
|
|
group.bench_function("mixture_product_16", |b| {
|
|
let log_e_values: [LogEValue; 16] = [
|
|
65536, 38550, -65536, 65536, 38550, 65536, 38550, -32768, 65536, 65536, 38550, -65536,
|
|
65536, 38550, 65536, 38550,
|
|
];
|
|
|
|
b.iter(|| {
|
|
// For product, sum the logs, then exp
|
|
let log_sum: i64 = log_e_values.iter().map(|&v| v as i64).sum();
|
|
// Approximate exp2 for final result
|
|
let approx_result = (log_sum as f64) / 65536.0;
|
|
black_box(approx_result)
|
|
})
|
|
});
|
|
|
|
group.finish();
|
|
}
|
|
|
|
// ============================================================================
|
|
// Additional Performance Benchmarks
|
|
// ============================================================================
|
|
|
|
/// Benchmark delta ingestion
|
|
fn bench_delta_ingestion(c: &mut Criterion) {
|
|
let mut group = c.benchmark_group("delta_ingestion");
|
|
group.throughput(Throughput::Elements(1));
|
|
|
|
group.bench_function("ingest_single", |b| {
|
|
let mut tile = TileState::new(0);
|
|
let delta = Delta::edge_add(0, 1, 100);
|
|
|
|
b.iter(|| {
|
|
tile.reset();
|
|
black_box(tile.ingest_delta(&delta))
|
|
})
|
|
});
|
|
|
|
// Fill buffer benchmark
|
|
group.bench_function("fill_buffer_64", |b| {
|
|
b.iter_batched(
|
|
|| TileState::new(0),
|
|
|mut tile| {
|
|
for i in 0..MAX_DELTA_BUFFER as u16 {
|
|
tile.ingest_delta(&Delta::edge_add(i, i + 1, 100));
|
|
}
|
|
black_box(tile)
|
|
},
|
|
criterion::BatchSize::SmallInput,
|
|
)
|
|
});
|
|
|
|
group.finish();
|
|
}
|
|
|
|
/// Benchmark neighbor iteration
|
|
fn bench_neighbor_iteration(c: &mut Criterion) {
|
|
let mut group = c.benchmark_group("neighbor_iteration");
|
|
|
|
// Create a graph with varying degree vertices
|
|
let mut graph = CompactGraph::new();
|
|
// Create a hub vertex with many neighbors
|
|
for i in 1..25u16 {
|
|
graph.add_edge(0, i, 100);
|
|
}
|
|
// Create a chain
|
|
for i in 30..50u16 {
|
|
graph.add_edge(i, i + 1, 100);
|
|
}
|
|
|
|
group.bench_function("neighbors_hub_24", |b| {
|
|
b.iter(|| {
|
|
let neighbors = graph.neighbors(0);
|
|
black_box(neighbors.len())
|
|
})
|
|
});
|
|
|
|
group.bench_function("neighbors_chain_2", |b| {
|
|
b.iter(|| {
|
|
let neighbors = graph.neighbors(35);
|
|
black_box(neighbors.len())
|
|
})
|
|
});
|
|
|
|
group.bench_function("iterate_all_neighbors", |b| {
|
|
b.iter(|| {
|
|
let mut total = 0usize;
|
|
for v in 0..50u16 {
|
|
total += graph.neighbors(v).len();
|
|
}
|
|
black_box(total)
|
|
})
|
|
});
|
|
|
|
group.finish();
|
|
}
|
|
|
|
// ============================================================================
|
|
// Memory and Cache Benchmarks
|
|
// ============================================================================
|
|
|
|
/// Benchmark memory access patterns
|
|
fn bench_memory_patterns(c: &mut Criterion) {
|
|
let mut group = c.benchmark_group("memory_patterns");
|
|
|
|
// Sequential vertex access
|
|
group.bench_function("sequential_vertex_scan", |b| {
|
|
let mut graph = CompactGraph::new();
|
|
for i in 0..200u16 {
|
|
graph.add_edge(i, i + 1, 100);
|
|
}
|
|
|
|
b.iter(|| {
|
|
let mut active = 0u16;
|
|
for i in 0..256u16 {
|
|
if graph.vertices[i as usize].is_active() {
|
|
active += 1;
|
|
}
|
|
}
|
|
black_box(active)
|
|
})
|
|
});
|
|
|
|
// Random access pattern
|
|
group.bench_function("random_vertex_access", |b| {
|
|
let mut graph = CompactGraph::new();
|
|
for i in 0..200u16 {
|
|
graph.add_edge(i, i + 1, 100);
|
|
}
|
|
|
|
// Pseudo-random access pattern
|
|
let indices: Vec<u16> = (0..100).map(|i| (i * 37) % 256).collect();
|
|
|
|
b.iter(|| {
|
|
let mut sum = 0u8;
|
|
for &i in &indices {
|
|
sum = sum.wrapping_add(graph.vertices[i as usize].degree);
|
|
}
|
|
black_box(sum)
|
|
})
|
|
});
|
|
|
|
// Edge array scan
|
|
group.bench_function("edge_array_scan", |b| {
|
|
let mut graph = CompactGraph::new();
|
|
for i in 0..500u16 {
|
|
let src = i % 200;
|
|
let dst = (i % 200) + 1;
|
|
if src != dst {
|
|
graph.add_edge(src, dst, 100);
|
|
}
|
|
}
|
|
|
|
b.iter(|| {
|
|
let mut active = 0u16;
|
|
for edge in &graph.edges {
|
|
if edge.is_active() {
|
|
active += 1;
|
|
}
|
|
}
|
|
black_box(active)
|
|
})
|
|
});
|
|
|
|
group.finish();
|
|
}
|
|
|
|
// ============================================================================
|
|
// Criterion Groups
|
|
// ============================================================================
|
|
|
|
criterion_group!(edge_benches, bench_edge_insert, bench_edge_batch,);
|
|
|
|
criterion_group!(tick_benches, bench_tick, bench_tick_under_load,);
|
|
|
|
criterion_group!(evidence_benches, bench_evalue_update, bench_mixture_evalue,);
|
|
|
|
criterion_group!(
|
|
misc_benches,
|
|
bench_report_serialize,
|
|
bench_delta_ingestion,
|
|
bench_neighbor_iteration,
|
|
bench_memory_patterns,
|
|
);
|
|
|
|
criterion_main!(edge_benches, tick_benches, evidence_benches, misc_benches);
|