Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'

This commit is contained in:
ruv
2026-02-28 14:39:40 -05:00
7854 changed files with 3522914 additions and 0 deletions

View File

@@ -0,0 +1,15 @@
//! Attention-weighted coherence benchmarks
use criterion::{black_box, criterion_group, criterion_main, Criterion};
/// Placeholder benchmark group for attention-weighted coherence.
///
/// Real attention benchmarks require the `attention` feature; until then this
/// times a trivial `black_box` round-trip so the bench target still builds.
fn attention_benchmark(c: &mut Criterion) {
    let mut bench_group = c.benchmark_group("attention");
    // Placeholder until the attention feature is wired in.
    bench_group.bench_function("placeholder", |bencher| {
        bencher.iter(|| black_box(42))
    });
    bench_group.finish();
}
criterion_group!(benches, attention_benchmark);
criterion_main!(benches);

View File

@@ -0,0 +1,15 @@
//! Coherence engine benchmarks
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
/// Placeholder benchmark group for the coherence engine.
///
/// Will be replaced with real measurements once the coherence module lands;
/// for now it times a trivial `black_box` round-trip so the target builds.
fn coherence_benchmark(c: &mut Criterion) {
    let mut bench_group = c.benchmark_group("coherence");
    // Placeholder until the coherence module is complete.
    bench_group.bench_function("placeholder", |bencher| {
        bencher.iter(|| black_box(42))
    });
    bench_group.finish();
}
criterion_group!(benches, coherence_benchmark);
criterion_main!(benches);

File diff suppressed because it is too large. [Load Diff]

View File

@@ -0,0 +1,546 @@
//! Benchmarks for full graph energy computation
//!
//! ADR-014 Performance Target: < 10ms for 10K nodes
//!
//! Global coherence energy: E(S) = sum(w_e * |r_e|^2)
//! This is the aggregate measure of system incoherence.
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
use std::collections::HashMap;
// ============================================================================
// Graph Types (Simulated for benchmarking)
// ============================================================================
/// Simplified restriction map for energy benchmarks.
///
/// Represents the affine map `y = A * x + b`, with `matrix` stored row-major
/// as `output_dim x input_dim` and `bias` of length `output_dim`.
#[derive(Clone)]
pub struct RestrictionMap {
    pub matrix: Vec<f32>,
    pub bias: Vec<f32>,
    pub input_dim: usize,
    pub output_dim: usize,
}
impl RestrictionMap {
    /// Identity map on a `dim`-dimensional space with zero bias.
    pub fn identity(dim: usize) -> Self {
        let mut matrix = vec![0.0f32; dim * dim];
        // Walk straight down the diagonal: stride dim + 1 in row-major order.
        for diag in matrix.iter_mut().step_by(dim + 1) {
            *diag = 1.0;
        }
        Self {
            matrix,
            bias: vec![0.0; dim],
            input_dim: dim,
            output_dim: dim,
        }
    }
    /// Apply the map to `input`, writing `A * input + bias` into `output`.
    ///
    /// `output` must have length `output_dim`; accumulation order matches a
    /// plain row-by-row dot product.
    #[inline]
    pub fn apply_into(&self, input: &[f32], output: &mut [f32]) {
        output.copy_from_slice(&self.bias);
        for i in 0..self.output_dim {
            let row = &self.matrix[i * self.input_dim..(i + 1) * self.input_dim];
            let acc = &mut output[i];
            for (j, &m) in row.iter().enumerate() {
                *acc += m * input[j];
            }
        }
    }
}
/// Node in sheaf graph
#[derive(Clone)]
pub struct SheafNode {
    /// Stable node identifier; matches the key used in `SheafGraph::nodes`.
    pub id: u64,
    /// Local state vector; length equals the graph's `state_dim`.
    pub state: Vec<f32>,
}
/// Edge with restriction maps
#[derive(Clone)]
pub struct SheafEdge {
    /// Source node id (key into `SheafGraph::nodes`).
    pub source: u64,
    /// Target node id (key into `SheafGraph::nodes`).
    pub target: u64,
    /// Multiplier applied to the squared residual norm.
    pub weight: f32,
    /// Restriction map applied to the source node's state.
    pub rho_source: RestrictionMap,
    /// Restriction map applied to the target node's state.
    pub rho_target: RestrictionMap,
}
impl SheafEdge {
    /// Weighted residual energy of this edge: `w * |rho_s(x_s) - rho_t(x_t)|^2`.
    ///
    /// The caller supplies two scratch buffers (length = the maps' output
    /// dimension) so the hot path performs no allocation.
    #[inline]
    pub fn weighted_residual_energy_into(
        &self,
        source: &[f32],
        target: &[f32],
        source_buf: &mut [f32],
        target_buf: &mut [f32],
    ) -> f32 {
        self.rho_source.apply_into(source, source_buf);
        self.rho_target.apply_into(target, target_buf);
        // Squared Euclidean norm of the residual between the two projections.
        let mut sq_norm = 0.0f32;
        for i in 0..source_buf.len() {
            let residual = source_buf[i] - target_buf[i];
            sq_norm += residual * residual;
        }
        self.weight * sq_norm
    }
}
/// Full sheaf graph for coherence computation
pub struct SheafGraph {
    /// Node states keyed by node id.
    pub nodes: HashMap<u64, SheafNode>,
    /// Edge list; endpoints are keys into `nodes`.
    pub edges: Vec<SheafEdge>,
    /// Length of every node state vector.
    pub state_dim: usize,
}
/// Result of energy computation
pub struct CoherenceEnergy {
    /// Sum of all per-edge energies.
    pub total_energy: f32,
    /// One energy value per edge, in `SheafGraph::edges` order.
    pub edge_energies: Vec<f32>,
}
impl SheafGraph {
/// Generate a random graph for benchmarking
///
/// Node states are deterministic pseudo-random values in [-0.5, 0.5) derived
/// from `seed` via `DefaultHasher`, so repeated runs in the same build see
/// identical graphs. Roughly `num_nodes * avg_degree / 2` edges are drawn;
/// candidates that would be self-loops are dropped, so the realized edge
/// count can be slightly below the target.
pub fn random(num_nodes: usize, avg_degree: usize, state_dim: usize, seed: u64) -> Self {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};
    // Fresh hasher pre-seeded with `seed`; each call site mixes in its own data.
    let mut hasher = || {
        let mut h = DefaultHasher::new();
        seed.hash(&mut h);
        h
    };
    // Generate nodes
    let nodes: HashMap<u64, SheafNode> = (0..num_nodes as u64)
        .map(|id| {
            let state: Vec<f32> = (0..state_dim)
                .map(|i| {
                    let mut h = hasher();
                    (id, i).hash(&mut h);
                    // Map the 64-bit hash onto [-0.5, 0.5) in steps of 1/1000.
                    (h.finish() % 1000) as f32 / 1000.0 - 0.5
                })
                .collect();
            (id, SheafNode { id, state })
        })
        .collect();
    // Generate edges (random graph with target average degree)
    let num_edges = (num_nodes * avg_degree) / 2;
    let mut edges = Vec::with_capacity(num_edges);
    for i in 0..num_edges {
        let mut h = hasher();
        // NOTE(review): the "edge"/"target" labels only decorrelate the two
        // endpoint hashes; `seed` is hashed twice (closure + tuple), which is
        // harmless for determinism.
        (seed, i, "edge").hash(&mut h);
        let source = (h.finish() % num_nodes as u64) as u64;
        let mut h = hasher();
        (seed, i, "target").hash(&mut h);
        let target = (h.finish() % num_nodes as u64) as u64;
        if source != target {
            edges.push(SheafEdge {
                source,
                target,
                weight: 1.0,
                rho_source: RestrictionMap::identity(state_dim),
                rho_target: RestrictionMap::identity(state_dim),
            });
        }
    }
    Self {
        nodes,
        edges,
        state_dim,
    }
}
/// Generate a chain graph (linear topology)
///
/// Nodes 0..n-1 connected as i -> i+1 with identity restriction maps, giving
/// exactly `num_nodes - 1` edges. `num_nodes` must be >= 1: the
/// `num_nodes - 1` edge count underflows at 0.
pub fn chain(num_nodes: usize, state_dim: usize, seed: u64) -> Self {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};
    let nodes: HashMap<u64, SheafNode> = (0..num_nodes as u64)
        .map(|id| {
            let state: Vec<f32> = (0..state_dim)
                .map(|i| {
                    let mut h = DefaultHasher::new();
                    (seed, id, i).hash(&mut h);
                    (h.finish() % 1000) as f32 / 1000.0 - 0.5
                })
                .collect();
            (id, SheafNode { id, state })
        })
        .collect();
    // One edge per consecutive node pair.
    let edges: Vec<SheafEdge> = (0..num_nodes - 1)
        .map(|i| SheafEdge {
            source: i as u64,
            target: (i + 1) as u64,
            weight: 1.0,
            rho_source: RestrictionMap::identity(state_dim),
            rho_target: RestrictionMap::identity(state_dim),
        })
        .collect();
    Self {
        nodes,
        edges,
        state_dim,
    }
}
/// Generate a dense graph (high connectivity)
///
/// Every unordered node pair (i < j) is included with ~30% probability,
/// decided deterministically from `seed`.
pub fn dense(num_nodes: usize, state_dim: usize, seed: u64) -> Self {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};
    let nodes: HashMap<u64, SheafNode> = (0..num_nodes as u64)
        .map(|id| {
            let state: Vec<f32> = (0..state_dim)
                .map(|i| {
                    let mut h = DefaultHasher::new();
                    (seed, id, i).hash(&mut h);
                    (h.finish() % 1000) as f32 / 1000.0 - 0.5
                })
                .collect();
            (id, SheafNode { id, state })
        })
        .collect();
    // Dense: ~30% of possible edges
    let mut edges = Vec::new();
    for i in 0..num_nodes as u64 {
        for j in (i + 1)..num_nodes as u64 {
            let mut h = DefaultHasher::new();
            (seed, i, j).hash(&mut h);
            if h.finish() % 10 < 3 {
                // 30% probability
                edges.push(SheafEdge {
                    source: i,
                    target: j,
                    weight: 1.0,
                    rho_source: RestrictionMap::identity(state_dim),
                    rho_target: RestrictionMap::identity(state_dim),
                });
            }
        }
    }
    Self {
        nodes,
        edges,
        state_dim,
    }
}
/// Compute global coherence energy (sequential).
///
/// Returns the total plus per-edge energies; the two scratch buffers are
/// reused across edges so the loop performs no per-edge allocation.
pub fn compute_energy_sequential(&self) -> CoherenceEnergy {
    let mut buf_a = vec![0.0f32; self.state_dim];
    let mut buf_b = vec![0.0f32; self.state_dim];
    let mut edge_energies = Vec::with_capacity(self.edges.len());
    for edge in &self.edges {
        let src = &self.nodes[&edge.source].state;
        let tgt = &self.nodes[&edge.target].state;
        edge_energies.push(edge.weighted_residual_energy_into(
            src,
            tgt,
            &mut buf_a,
            &mut buf_b,
        ));
    }
    let total_energy = edge_energies.iter().sum();
    CoherenceEnergy {
        total_energy,
        edge_energies,
    }
}
/// Compute global coherence energy (parallel with rayon)
///
/// Unlike the sequential path, each edge task allocates its own pair of
/// scratch buffers, since a single buffer cannot be shared across rayon
/// workers. Only compiled with the `parallel` feature.
#[cfg(feature = "parallel")]
pub fn compute_energy_parallel(&self) -> CoherenceEnergy {
    use rayon::prelude::*;
    let edge_energies: Vec<f32> = self
        .edges
        .par_iter()
        .map(|edge| {
            // Per-task scratch: one allocation pair per edge.
            let mut source_buf = vec![0.0f32; self.state_dim];
            let mut target_buf = vec![0.0f32; self.state_dim];
            let source_state = &self.nodes[&edge.source].state;
            let target_state = &self.nodes[&edge.target].state;
            edge.weighted_residual_energy_into(
                source_state,
                target_state,
                &mut source_buf,
                &mut target_buf,
            )
        })
        .collect();
    let total_energy: f32 = edge_energies.par_iter().sum();
    CoherenceEnergy {
        total_energy,
        edge_energies,
    }
}
/// Compute just the total energy, with no per-edge tracking or allocation
/// beyond the two reused scratch buffers.
pub fn compute_total_energy(&self) -> f32 {
    let mut buf_a = vec![0.0f32; self.state_dim];
    let mut buf_b = vec![0.0f32; self.state_dim];
    self.edges.iter().fold(0.0f32, |acc, edge| {
        let src = &self.nodes[&edge.source].state;
        let tgt = &self.nodes[&edge.target].state;
        acc + edge.weighted_residual_energy_into(src, tgt, &mut buf_a, &mut buf_b)
    })
}
}
// ============================================================================
// Benchmarks
// ============================================================================
/// Benchmark full graph energy at various sizes.
///
/// ADR-014 target: 10K nodes in <10ms. The 100K-node case lives in
/// `bench_large_graph_energy` with a reduced sample count.
fn bench_full_graph_energy(c: &mut Criterion) {
    let mut group = c.benchmark_group("energy_full_graph");
    for &num_nodes in &[100usize, 1_000, 10_000] {
        let graph = SheafGraph::random(num_nodes, 4, 64, 42);
        group.throughput(Throughput::Elements(graph.edges.len() as u64));
        // Full result, including the per-edge energy vector.
        group.bench_with_input(
            BenchmarkId::new("sequential", format!("{}nodes", num_nodes)),
            &num_nodes,
            |b, _| b.iter(|| black_box(graph.compute_energy_sequential())),
        );
        // Total energy only (no per-edge allocation)
        group.bench_with_input(
            BenchmarkId::new("total_only", format!("{}nodes", num_nodes)),
            &num_nodes,
            |b, _| b.iter(|| black_box(graph.compute_total_energy())),
        );
    }
    group.finish();
}
/// Benchmark with 100K nodes; sample size is reduced because each iteration
/// touches ~200K edges.
fn bench_large_graph_energy(c: &mut Criterion) {
    let mut group = c.benchmark_group("energy_large_graph");
    group.sample_size(10);
    let graph = SheafGraph::random(100_000, 4, 64, 42);
    group.throughput(Throughput::Elements(graph.edges.len() as u64));
    group.bench_function("100K_nodes_total_energy", |b| {
        b.iter(|| black_box(graph.compute_total_energy()))
    });
    group.finish();
}
/// Benchmark energy computation for different graph topologies.
fn bench_topology_impact(c: &mut Criterion) {
    let mut group = c.benchmark_group("energy_topology");
    let state_dim = 64;
    // Chain topology: sparse, exactly n-1 edges.
    let chain = SheafGraph::chain(1000, state_dim, 42);
    group.throughput(Throughput::Elements(chain.edges.len() as u64));
    group.bench_function("chain_1000", |b| {
        b.iter(|| black_box(chain.compute_total_energy()))
    });
    // Random topology: average degree 4.
    let random = SheafGraph::random(1000, 4, state_dim, 42);
    group.throughput(Throughput::Elements(random.edges.len() as u64));
    group.bench_function("random_1000_deg4", |b| {
        b.iter(|| black_box(random.compute_total_energy()))
    });
    // Dense topology (~30% of all pairs); smaller node count keeps runtime sane.
    let dense = SheafGraph::dense(100, state_dim, 42);
    group.throughput(Throughput::Elements(dense.edges.len() as u64));
    group.bench_function("dense_100", |b| {
        b.iter(|| black_box(dense.compute_total_energy()))
    });
    group.finish();
}
/// Benchmark impact of the per-node state dimension on energy computation.
fn bench_state_dimension(c: &mut Criterion) {
    let mut group = c.benchmark_group("energy_state_dim");
    for &state_dim in &[8usize, 32, 64, 128, 256] {
        let graph = SheafGraph::random(1000, 4, state_dim, 42);
        group.throughput(Throughput::Elements(graph.edges.len() as u64));
        group.bench_with_input(BenchmarkId::new("dim", state_dim), &state_dim, |b, _| {
            b.iter(|| black_box(graph.compute_total_energy()))
        });
    }
    group.finish();
}
/// Benchmark edge density scaling: node count and state dimension fixed,
/// average degree swept.
fn bench_edge_density(c: &mut Criterion) {
    let mut group = c.benchmark_group("energy_edge_density");
    for &avg_degree in &[2usize, 4, 8, 16, 32] {
        let graph = SheafGraph::random(1000, avg_degree, 64, 42);
        group.throughput(Throughput::Elements(graph.edges.len() as u64));
        group.bench_with_input(
            BenchmarkId::new("avg_degree", avg_degree),
            &avg_degree,
            |b, _| b.iter(|| black_box(graph.compute_total_energy())),
        );
    }
    group.finish();
}
/// Benchmark scope-based energy aggregation.
fn bench_scoped_energy(c: &mut Criterion) {
    let mut group = c.benchmark_group("energy_scoped");
    let state_dim = 64;
    let graph = SheafGraph::random(10_000, 4, state_dim, 42);
    // Simulated scope assignment (e.g. by namespace): edge i -> scope i % 10.
    let num_scopes = 10;
    let scope_assignments: Vec<usize> =
        (0..graph.edges.len()).map(|i| i % num_scopes).collect();
    group.bench_function("aggregate_by_scope", |b| {
        b.iter(|| {
            let mut buf_a = vec![0.0f32; state_dim];
            let mut buf_b = vec![0.0f32; state_dim];
            let mut scope_energies = vec![0.0f32; num_scopes];
            for (edge, &scope) in graph.edges.iter().zip(&scope_assignments) {
                let src = &graph.nodes[&edge.source].state;
                let tgt = &graph.nodes[&edge.target].state;
                scope_energies[scope] +=
                    edge.weighted_residual_energy_into(src, tgt, &mut buf_a, &mut buf_b);
            }
            black_box(scope_energies)
        })
    });
    group.finish();
}
/// Benchmark energy computation combined with a fingerprint over the
/// per-edge energies (rolling XOR + rotate).
fn bench_energy_fingerprint(c: &mut Criterion) {
    let mut group = c.benchmark_group("energy_fingerprint");
    let graph = SheafGraph::random(1000, 4, 64, 42);
    group.bench_function("compute_with_fingerprint", |b| {
        b.iter(|| {
            let energy = graph.compute_energy_sequential();
            // Fold each edge energy's bit pattern into the accumulator.
            let fingerprint = energy
                .edge_energies
                .iter()
                .fold(0u64, |fp, e| (fp ^ e.to_bits() as u64).rotate_left(7));
            black_box((energy.total_energy, fingerprint))
        })
    });
    group.finish();
}
/// Benchmark memory access patterns for energy computation.
///
/// Chain edges visit node states in index order; random edges hop around the
/// node map, stressing cache behavior.
fn bench_memory_patterns(c: &mut Criterion) {
    let mut group = c.benchmark_group("energy_memory");
    let state_dim = 64;
    let chain = SheafGraph::chain(10_000, state_dim, 42);
    group.bench_function("sequential_access", |b| {
        b.iter(|| black_box(chain.compute_total_energy()))
    });
    let random = SheafGraph::random(10_000, 4, state_dim, 42);
    group.bench_function("random_access", |b| {
        b.iter(|| black_box(random.compute_total_energy()))
    });
    group.finish();
}
// Criterion entry point: registers every energy benchmark group above and
// generates main().
criterion_group!(
    benches,
    bench_full_graph_energy,
    bench_large_graph_energy,
    bench_topology_impact,
    bench_state_dimension,
    bench_edge_density,
    bench_scoped_energy,
    bench_energy_fingerprint,
    bench_memory_patterns,
);
criterion_main!(benches);

View File

@@ -0,0 +1,629 @@
//! Benchmarks for coherence gate evaluation
//!
//! ADR-014 Performance Target: < 500us per gate evaluation
//!
//! The gate is a deterministic decision point that:
//! 1. Evaluates current energy against thresholds
//! 2. Checks persistence history
//! 3. Determines compute lane (Reflex/Retrieval/Heavy/Human)
//! 4. Creates witness record
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
use std::collections::VecDeque;
use std::time::Duration;
// ============================================================================
// Types (Simulated for benchmarking)
// ============================================================================
/// Compute lanes for escalating complexity
///
/// The `Ord` derive gives Reflex < Retrieval < Heavy < Human, which the gate
/// relies on for lane comparison and escalation.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum ComputeLane {
    /// Lane 0: Local residual updates (<1ms)
    Reflex = 0,
    /// Lane 1: Evidence fetching (~10ms)
    Retrieval = 1,
    /// Lane 2: Multi-step planning (~100ms)
    Heavy = 2,
    /// Lane 3: Human escalation
    Human = 3,
}
/// Coherence energy snapshot
///
/// Captures a total energy, a per-scope breakdown, and a cheap multiplicative
/// fingerprint that witness records bind into their content hash.
#[derive(Clone)]
pub struct CoherenceEnergy {
    pub total_energy: f32,
    pub scope_energies: Vec<(u64, f32)>, // (scope_id, energy)
    pub timestamp: u64,
    pub fingerprint: u64,
}
impl CoherenceEnergy {
    /// Build a snapshot that splits `total` evenly across `num_scopes` scopes.
    pub fn new(total: f32, num_scopes: usize) -> Self {
        let share = total / num_scopes as f32;
        let scope_energies = (0..num_scopes as u64).map(|id| (id, share)).collect();
        Self {
            total_energy: total,
            scope_energies,
            timestamp: 0,
            fingerprint: (total.to_bits() as u64).wrapping_mul(0x517cc1b727220a95),
        }
    }
    /// Energy for one scope, or 0.0 when the scope is unknown (linear scan).
    pub fn scope_energy(&self, scope_id: u64) -> f32 {
        match self.scope_energies.iter().find(|(id, _)| *id == scope_id) {
            Some(&(_, energy)) => energy,
            None => 0.0,
        }
    }
}
/// Action to be gated
#[derive(Clone)]
pub struct Action {
    /// Caller-assigned action id (not mixed into witness hashes).
    pub id: u64,
    /// Scope whose energy governs this action's lane selection.
    pub scope_id: u64,
    /// Category of the action.
    pub action_type: ActionType,
    /// Hash of the action payload; mixed into witness content hashes.
    pub payload_hash: u64,
}
/// Category of a gated action.
#[derive(Clone, Copy)]
pub enum ActionType {
    Read,
    Write,
    Execute,
    External,
}
/// Threshold configuration
///
/// Energies strictly below `reflex` / `retrieval` / `heavy` select the
/// corresponding lane; anything at or above `heavy` escalates to Human.
/// `persistence_window_ms` bounds the history window used for persistence
/// and trend checks.
#[derive(Clone)]
pub struct ThresholdConfig {
    pub reflex: f32,
    pub retrieval: f32,
    pub heavy: f32,
    pub persistence_window_ms: u64,
}
impl Default for ThresholdConfig {
    fn default() -> Self {
        ThresholdConfig {
            reflex: 0.1,
            retrieval: 0.5,
            heavy: 1.0,
            persistence_window_ms: 5_000,
        }
    }
}
/// Energy history for persistence detection
///
/// Keeps a bounded rolling window of `(timestamp_ms, energy)` samples per
/// scope. Scope ids at or beyond `max_scopes` are silently ignored, matching
/// the gate's best-effort semantics.
pub struct EnergyHistory {
    /// Rolling window of (timestamp_ms, energy) pairs per scope
    history: Vec<VecDeque<(u64, f32)>>,
    max_scopes: usize,
    window_size: usize,
}
impl EnergyHistory {
    /// Create a history covering `max_scopes` scopes, each retaining at most
    /// `window_size` samples.
    pub fn new(max_scopes: usize, window_size: usize) -> Self {
        Self {
            history: (0..max_scopes)
                .map(|_| VecDeque::with_capacity(window_size))
                .collect(),
            max_scopes,
            window_size,
        }
    }
    /// Record a sample, evicting the oldest entry once the window is full.
    /// Out-of-range scope ids are ignored.
    pub fn record(&mut self, scope_id: u64, timestamp_ms: u64, energy: f32) {
        if (scope_id as usize) < self.max_scopes {
            let queue = &mut self.history[scope_id as usize];
            if queue.len() >= self.window_size {
                queue.pop_front();
            }
            queue.push_back((timestamp_ms, energy));
        }
    }
    /// True when every sample inside the trailing `window_ms` window is at or
    /// above `threshold` (and at least one in-window sample exists).
    ///
    /// This sits on the gate hot path (ADR-014: <500us per evaluation), so it
    /// streams over the deque instead of collecting samples into a Vec.
    pub fn is_above_threshold(
        &self,
        scope_id: u64,
        threshold: f32,
        window_ms: u64,
        current_time_ms: u64,
    ) -> bool {
        if (scope_id as usize) >= self.max_scopes {
            return false;
        }
        let cutoff = current_time_ms.saturating_sub(window_ms);
        let mut saw_sample = false;
        for &(ts, energy) in &self.history[scope_id as usize] {
            if ts < cutoff {
                continue;
            }
            if energy < threshold {
                // One sub-threshold sample breaks persistence immediately.
                return false;
            }
            saw_sample = true;
        }
        saw_sample
    }
    /// Simple linear trend over the window: (last - first) / sample_count.
    /// Returns None with fewer than two in-window samples. Allocation-free
    /// for the same hot-path reason as `is_above_threshold`.
    pub fn trend(&self, scope_id: u64, window_ms: u64, current_time_ms: u64) -> Option<f32> {
        if (scope_id as usize) >= self.max_scopes {
            return None;
        }
        let cutoff = current_time_ms.saturating_sub(window_ms);
        let mut first: Option<f32> = None;
        let mut last = 0.0f32;
        let mut count = 0usize;
        for &(ts, energy) in &self.history[scope_id as usize] {
            if ts < cutoff {
                continue;
            }
            if first.is_none() {
                first = Some(energy);
            }
            last = energy;
            count += 1;
        }
        match first {
            Some(first) if count >= 2 => Some((last - first) / count as f32),
            _ => None,
        }
    }
}
/// Witness record for audit
///
/// Immutable evidence of one gate decision; `content_hash` binds the action,
/// the energy fingerprint, and the outcome together.
#[derive(Clone)]
pub struct WitnessRecord {
    pub id: u64,
    pub action_hash: u64,
    pub energy_fingerprint: u64,
    pub lane: ComputeLane,
    pub allowed: bool,
    pub timestamp: u64,
    pub content_hash: u64,
}
impl WitnessRecord {
    /// Assemble a record for one decision; `timestamp` doubles as the id.
    pub fn new(
        action: &Action,
        energy: &CoherenceEnergy,
        lane: ComputeLane,
        allowed: bool,
        timestamp: u64,
    ) -> Self {
        WitnessRecord {
            id: timestamp, // Simplified
            action_hash: action.payload_hash,
            energy_fingerprint: energy.fingerprint,
            lane,
            allowed,
            timestamp,
            content_hash: Self::compute_hash(action, energy, lane, allowed, timestamp),
        }
    }
    /// Mix all decision inputs into one u64 via multiply/XOR rounds
    /// (in production: use Blake3).
    fn compute_hash(
        action: &Action,
        energy: &CoherenceEnergy,
        lane: ComputeLane,
        allowed: bool,
        timestamp: u64,
    ) -> u64 {
        const MIX: u64 = 0x517cc1b727220a95;
        let mut acc = action.payload_hash.wrapping_mul(MIX);
        acc ^= energy.fingerprint;
        acc = acc.wrapping_mul(MIX);
        acc ^= (lane as u64) << 32 | (allowed as u64);
        acc = acc.wrapping_mul(MIX);
        acc ^ timestamp
    }
}
/// Gate decision result
pub struct GateDecision {
    /// Whether the action may proceed (false only for the Human lane).
    pub allow: bool,
    /// Final compute lane after persistence/trend escalation.
    pub lane: ComputeLane,
    /// Audit record created for this decision.
    pub witness: WitnessRecord,
    /// Present when denied, and also when allowed-but-escalated due to
    /// persistent incoherence (see `CoherenceGate::evaluate`).
    pub denial_reason: Option<&'static str>,
}
/// Coherence gate
///
/// Deterministic decision point combining thresholds, per-scope energy
/// history, and a logical clock (`current_time_ms` advances by 1 per
/// evaluation, or via `advance_time`).
pub struct CoherenceGate {
    pub config: ThresholdConfig,
    pub history: EnergyHistory,
    current_time_ms: u64,
}
impl CoherenceGate {
    /// Build a gate with the given thresholds; each scope keeps a
    /// 100-sample energy window.
    pub fn new(config: ThresholdConfig, max_scopes: usize) -> Self {
        Self {
            config,
            history: EnergyHistory::new(max_scopes, 100),
            current_time_ms: 0,
        }
    }
    /// Evaluate whether action should proceed
    ///
    /// Steps: record the scope energy, pick a lane from the thresholds,
    /// escalate to Heavy on persistent or growing incoherence, deny only at
    /// the Human lane, then emit a witness record and advance the logical
    /// clock by 1ms.
    pub fn evaluate(&mut self, action: &Action, energy: &CoherenceEnergy) -> GateDecision {
        let current_energy = energy.scope_energy(action.scope_id);
        // Record in history; the new sample participates in this call's own
        // persistence/trend checks.
        self.history
            .record(action.scope_id, self.current_time_ms, current_energy);
        // Determine lane based on energy
        let lane = if current_energy < self.config.reflex {
            ComputeLane::Reflex
        } else if current_energy < self.config.retrieval {
            ComputeLane::Retrieval
        } else if current_energy < self.config.heavy {
            ComputeLane::Heavy
        } else {
            ComputeLane::Human
        };
        // Check for persistent incoherence: the whole window sat at or above
        // the retrieval threshold.
        let persistent = self.history.is_above_threshold(
            action.scope_id,
            self.config.retrieval,
            self.config.persistence_window_ms,
            self.current_time_ms,
        );
        // Check for growing incoherence (trend)
        let growing = self
            .history
            .trend(
                action.scope_id,
                self.config.persistence_window_ms,
                self.current_time_ms,
            )
            .map(|t| t > 0.01)
            .unwrap_or(false);
        // Escalate if persistent and not already at high lane
        let final_lane = if (persistent || growing) && lane < ComputeLane::Heavy {
            ComputeLane::Heavy
        } else {
            lane
        };
        // Allow unless Human lane
        let allow = final_lane < ComputeLane::Human;
        // NOTE(review): denial_reason is also populated for allowed-but-
        // escalated decisions ("Persistent incoherence"); the field name
        // under-sells that second use — confirm callers treat it as advisory.
        let denial_reason = if !allow {
            Some("Energy exceeds all automatic thresholds")
        } else if persistent {
            Some("Persistent incoherence - escalated")
        } else {
            None
        };
        let witness = WitnessRecord::new(action, energy, final_lane, allow, self.current_time_ms);
        self.current_time_ms += 1;
        GateDecision {
            allow,
            lane: final_lane,
            witness,
            denial_reason,
        }
    }
    /// Fast path evaluation (no history update)
    ///
    /// Pure threshold comparison: no recording, no escalation, no witness.
    #[inline]
    pub fn evaluate_fast(&self, scope_energy: f32) -> ComputeLane {
        if scope_energy < self.config.reflex {
            ComputeLane::Reflex
        } else if scope_energy < self.config.retrieval {
            ComputeLane::Retrieval
        } else if scope_energy < self.config.heavy {
            ComputeLane::Heavy
        } else {
            ComputeLane::Human
        }
    }
    /// Advance time (for benchmarking)
    pub fn advance_time(&mut self, delta_ms: u64) {
        self.current_time_ms += delta_ms;
    }
}
// ============================================================================
// Benchmarks
// ============================================================================
/// Benchmark the full gate evaluation path, one case per compute lane.
fn bench_gate_evaluate(c: &mut Criterion) {
    let mut group = c.benchmark_group("gate_evaluate");
    group.throughput(Throughput::Elements(1));
    let mut gate = CoherenceGate::new(ThresholdConfig::default(), 100);
    let action = Action {
        id: 1,
        scope_id: 0,
        action_type: ActionType::Write,
        payload_hash: 0x12345678,
    };
    // (bench name, total energy driving the lane choice).
    let cases = [
        ("low_energy_reflex", 0.05f32),
        ("medium_energy_retrieval", 0.3),
        ("high_energy_heavy", 0.8),
        ("critical_energy_human", 2.0),
    ];
    for &(name, total) in cases.iter() {
        let energy = CoherenceEnergy::new(total, 10);
        group.bench_function(name, |b| {
            b.iter(|| {
                let decision = gate.evaluate(black_box(&action), black_box(&energy));
                black_box(decision.lane)
            })
        });
    }
    group.finish();
}
/// Benchmark the fast-path lane selection (no history recording).
fn bench_gate_fast_path(c: &mut Criterion) {
    let mut group = c.benchmark_group("gate_fast_path");
    group.throughput(Throughput::Elements(1));
    let gate = CoherenceGate::new(ThresholdConfig::default(), 100);
    // One input per lane region of the threshold space.
    for &energy in &[0.05f32, 0.3, 0.8, 2.0] {
        group.bench_with_input(
            BenchmarkId::new("evaluate_fast", format!("{:.2}", energy)),
            &energy,
            |b, &e| b.iter(|| black_box(gate.evaluate_fast(black_box(e)))),
        );
    }
    group.finish();
}
/// Benchmark witness record creation
///
/// Isolates `WitnessRecord::new` (hash mixing plus struct assembly) from the
/// rest of the gate path.
fn bench_witness_creation(c: &mut Criterion) {
    let mut group = c.benchmark_group("gate_witness");
    group.throughput(Throughput::Elements(1));
    let action = Action {
        id: 1,
        scope_id: 0,
        action_type: ActionType::Write,
        payload_hash: 0x12345678,
    };
    let energy = CoherenceEnergy::new(0.3, 10);
    group.bench_function("create_witness", |b| {
        b.iter(|| {
            WitnessRecord::new(
                black_box(&action),
                black_box(&energy),
                black_box(ComputeLane::Retrieval),
                black_box(true),
                black_box(12345),
            )
        })
    });
    group.finish();
}
/// Benchmark the three EnergyHistory operations against a pre-populated
/// history (10 scopes x 500 samples).
fn bench_history_operations(c: &mut Criterion) {
    let mut group = c.benchmark_group("gate_history");
    let mut history = EnergyHistory::new(100, 1000);
    for scope in 0..10u64 {
        for t in 0..500u64 {
            history.record(scope, t, 0.3 + (t % 10) as f32 * 0.01);
        }
    }
    // Single append (with possible eviction).
    group.bench_function("record_single", |b| {
        let mut t = 1000u64;
        b.iter(|| {
            history.record(black_box(5), black_box(t), black_box(0.35));
            t += 1;
        })
    });
    // Persistence check over a 100ms window.
    group.bench_function("check_threshold", |b| {
        b.iter(|| {
            history.is_above_threshold(black_box(5), black_box(0.3), black_box(100), black_box(500))
        })
    });
    // Linear trend over the same window.
    group.bench_function("compute_trend", |b| {
        b.iter(|| history.trend(black_box(5), black_box(100), black_box(500)))
    });
    group.finish();
}
/// Benchmark persistence detection as the window size grows.
fn bench_persistence_detection(c: &mut Criterion) {
    let mut group = c.benchmark_group("gate_persistence");
    for &window_size in &[10usize, 100, 1000] {
        // Fill one scope with samples above the 0.3 threshold so the check
        // must scan the entire window.
        let mut history = EnergyHistory::new(10, window_size);
        for t in 0..window_size as u64 {
            history.record(0, t, 0.4);
        }
        group.bench_with_input(
            BenchmarkId::new("check_persistent", window_size),
            &window_size,
            |b, &size| {
                b.iter(|| {
                    history.is_above_threshold(
                        black_box(0),
                        black_box(0.3),
                        black_box(size as u64),
                        black_box(size as u64),
                    )
                })
            },
        );
    }
    group.finish();
}
/// Benchmark batch evaluation (multiple actions)
///
/// Actions cycle through 10 scopes; energies sweep 0.1..1.05 so every lane
/// is exercised.
fn bench_batch_evaluation(c: &mut Criterion) {
    let mut group = c.benchmark_group("gate_batch");
    let config = ThresholdConfig::default();
    let mut gate = CoherenceGate::new(config, 100);
    for batch_size in [10, 100, 1000] {
        let actions: Vec<Action> = (0..batch_size)
            .map(|i| Action {
                id: i as u64,
                scope_id: (i % 10) as u64,
                // wrapping_mul: the plain `*` overflows u64 for i >= 4 and
                // panics in debug builds; release semantics are unchanged.
                payload_hash: (i as u64).wrapping_mul(0x517cc1b727220a95),
            action_type: ActionType::Write,
            })
            .collect();
        let energies: Vec<CoherenceEnergy> = (0..batch_size)
            .map(|i| CoherenceEnergy::new(0.1 + (i % 20) as f32 * 0.05, 10))
            .collect();
        group.throughput(Throughput::Elements(batch_size as u64));
        group.bench_with_input(
            BenchmarkId::new("evaluate_batch", batch_size),
            &batch_size,
            |b, _| {
                b.iter(|| {
                    let mut lanes = Vec::with_capacity(actions.len());
                    for (action, energy) in actions.iter().zip(energies.iter()) {
                        let decision = gate.evaluate(action, energy);
                        lanes.push(decision.lane);
                    }
                    black_box(lanes)
                })
            },
        );
    }
    group.finish();
}
/// Benchmark the linear-scan scope energy lookup at increasing scope counts.
fn bench_scope_lookup(c: &mut Criterion) {
    let mut group = c.benchmark_group("gate_scope_lookup");
    for &num_scopes in &[10usize, 100, 1000] {
        let energy = CoherenceEnergy::new(1.0, num_scopes);
        group.bench_with_input(
            BenchmarkId::new("lookup", num_scopes),
            &num_scopes,
            |b, &n| {
                // Probe the middle scope: average cost of the linear scan.
                let scope_id = (n / 2) as u64;
                b.iter(|| black_box(energy.scope_energy(black_box(scope_id))))
            },
        );
    }
    group.finish();
}
/// Benchmark threshold comparison patterns
///
/// Compares the gate's chained `if`/`else` lane selection against a
/// `partition_point` (binary search) formulation over the same 1000-point
/// energy sweep. The two agree on boundaries: an energy exactly equal to a
/// threshold lands in the next lane (`e < t` here, `t <= e` below).
fn bench_threshold_comparison(c: &mut Criterion) {
    let mut group = c.benchmark_group("gate_threshold_cmp");
    let config = ThresholdConfig::default();
    // Sequential if-else (current implementation)
    group.bench_function("sequential_if_else", |b| {
        let energies: Vec<f32> = (0..1000).map(|i| (i as f32) * 0.002).collect();
        b.iter(|| {
            let mut lanes = [0u32; 4];
            for &e in &energies {
                let lane = if e < config.reflex {
                    0
                } else if e < config.retrieval {
                    1
                } else if e < config.heavy {
                    2
                } else {
                    3
                };
                lanes[lane] += 1;
            }
            black_box(lanes)
        })
    });
    // Binary search pattern
    group.bench_function("binary_search", |b| {
        // f32::MAX sentinel keeps partition_point <= 3 for finite energies;
        // `.min(3)` below is a belt-and-braces clamp.
        let thresholds = [config.reflex, config.retrieval, config.heavy, f32::MAX];
        let energies: Vec<f32> = (0..1000).map(|i| (i as f32) * 0.002).collect();
        b.iter(|| {
            let mut lanes = [0u32; 4];
            for &e in &energies {
                let lane = thresholds.partition_point(|&t| t <= e);
                lanes[lane.min(3)] += 1;
            }
            black_box(lanes)
        })
    });
    group.finish();
}
// Criterion entry point: registers every gate benchmark group above and
// generates main().
criterion_group!(
    benches,
    bench_gate_evaluate,
    bench_gate_fast_path,
    bench_witness_creation,
    bench_history_operations,
    bench_persistence_detection,
    bench_batch_evaluation,
    bench_scope_lookup,
    bench_threshold_comparison,
);
criterion_main!(benches);

View File

@@ -0,0 +1,784 @@
//! GPU-Specific Benchmarks for Prime-Radiant Coherence Engine
//!
//! This benchmark suite compares CPU and GPU implementations of core
//! coherence operations. Requires the `gpu` feature to be enabled.
//!
//! ## Benchmark Categories
//! 1. Energy Computation - CPU vs GPU
//! 2. Attention Forward Pass - CPU vs GPU
//! 3. Batch Routing Decisions - CPU vs GPU
//! 4. Memory Transfer Overhead
//!
//! ## GPU Backend Notes
//! - Primary: wgpu (cross-platform WebGPU)
//! - Optional: CUDA (NVIDIA), Metal (Apple), Vulkan
//!
//! ## Running GPU Benchmarks
//! ```bash
//! cargo bench --features gpu --bench gpu_benchmarks
//! ```
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
use std::collections::hash_map::DefaultHasher;
use std::collections::HashMap;
use std::hash::{Hash, Hasher};
// ============================================================================
// TEST DATA GENERATION
// ============================================================================
/// Deterministic pseudo-random vector of `len` values in [-0.5, 0.5),
/// derived from `(seed, index)` via `DefaultHasher`.
fn generate_vec(len: usize, seed: u64) -> Vec<f32> {
    let mut out = Vec::with_capacity(len);
    for i in 0..len {
        let mut hasher = DefaultHasher::new();
        (seed, i).hash(&mut hasher);
        // Quantize the 64-bit hash onto [-0.5, 0.5) in steps of 1/1000.
        out.push((hasher.finish() % 1000) as f32 / 1000.0 - 0.5);
    }
    out
}
fn generate_matrix(rows: usize, cols: usize, seed: u64) -> Vec<f32> {
(0..rows * cols)
.map(|i| {
let mut hasher = DefaultHasher::new();
(seed, i).hash(&mut hasher);
(hasher.finish() % 1000) as f32 / 1000.0 - 0.5
})
.collect()
}
// ============================================================================
// CPU BASELINE IMPLEMENTATIONS
// ============================================================================
/// CPU coherence energy computation
#[derive(Clone)]
struct CpuSheafGraph {
    /// Node state vectors keyed by node id; each has length `state_dim`.
    nodes: HashMap<u64, Vec<f32>>,
    edges: Vec<(u64, u64, f32)>, // (source, target, weight)
    /// Length of every node state vector.
    state_dim: usize,
}
impl CpuSheafGraph {
    /// Build a deterministic random graph; edge candidates that would be
    /// self-loops are dropped, so the realized count can be below the target.
    fn random(num_nodes: usize, avg_degree: usize, state_dim: usize, seed: u64) -> Self {
        let nodes: HashMap<u64, Vec<f32>> = (0..num_nodes as u64)
            .map(|id| (id, generate_vec(state_dim, seed + id)))
            .collect();
        let num_edges = (num_nodes * avg_degree) / 2;
        let edges: Vec<(u64, u64, f32)> = (0..num_edges)
            .filter_map(|i| {
                let mut h = DefaultHasher::new();
                (seed, i, "src").hash(&mut h);
                let source = h.finish() % num_nodes as u64;
                let mut h = DefaultHasher::new();
                (seed, i, "tgt").hash(&mut h);
                let target = h.finish() % num_nodes as u64;
                if source != target {
                    Some((source, target, 1.0))
                } else {
                    None
                }
            })
            .collect();
        Self {
            nodes,
            edges,
            state_dim,
        }
    }
    /// Weighted squared-distance energy of one edge. Shared by both energy
    /// entry points below so the summation logic exists in exactly one place
    /// (it was previously duplicated); accumulation order is unchanged.
    #[inline]
    fn edge_energy(&self, src: u64, tgt: u64, weight: f32) -> f32 {
        let src_state = &self.nodes[&src];
        let tgt_state = &self.nodes[&tgt];
        let mut norm_sq = 0.0f32;
        for i in 0..self.state_dim {
            let diff = src_state[i] - tgt_state[i];
            norm_sq += diff * diff;
        }
        weight * norm_sq
    }
    /// Compute total energy on CPU
    fn compute_energy_cpu(&self) -> f32 {
        let mut total = 0.0f32;
        for &(src, tgt, weight) in &self.edges {
            total += self.edge_energy(src, tgt, weight);
        }
        total
    }
    /// Compute energy with per-edge results on CPU
    fn compute_energy_with_edges_cpu(&self) -> (f32, Vec<f32>) {
        let edge_energies: Vec<f32> = self
            .edges
            .iter()
            .map(|&(src, tgt, weight)| self.edge_energy(src, tgt, weight))
            .collect();
        let total: f32 = edge_energies.iter().sum();
        (total, edge_energies)
    }
}
/// CPU attention forward pass (simplified, single head).
///
/// `queries`, `keys`, and `values` are row-major [seq_len, head_dim]; the
/// result is written into `output` with the same layout. Uses the
/// numerically stable max-subtracted softmax.
fn attention_forward_cpu(
    queries: &[f32],
    keys: &[f32],
    values: &[f32],
    seq_len: usize,
    head_dim: usize,
    output: &mut [f32],
) {
    let scale = 1.0 / (head_dim as f32).sqrt();
    for i in 0..seq_len {
        let q = &queries[i * head_dim..(i + 1) * head_dim];
        // Scaled dot-product score against every key, tracking the max.
        let mut scores = Vec::with_capacity(seq_len);
        let mut max_score = f32::NEG_INFINITY;
        for j in 0..seq_len {
            let k_row = &keys[j * head_dim..(j + 1) * head_dim];
            let dot: f32 = q.iter().zip(k_row).map(|(a, b)| a * b).sum();
            let s = dot * scale;
            if s > max_score {
                max_score = s;
            }
            scores.push(s);
        }
        // Softmax, shifted by the max for numerical stability.
        let mut sum_exp = 0.0f32;
        for s in scores.iter_mut() {
            *s = (*s - max_score).exp();
            sum_exp += *s;
        }
        for s in scores.iter_mut() {
            *s /= sum_exp;
        }
        // Output row i = sum_j scores[j] * values[j].
        for k in 0..head_dim {
            let mut acc = 0.0f32;
            for (j, &w) in scores.iter().enumerate() {
                acc += w * values[j * head_dim + k];
            }
            output[i * head_dim + k] = acc;
        }
    }
}
/// CPU batch routing (expert selection for MoE).
///
/// `token_embeddings` is `[num_tokens, embed_dim]` row-major and
/// `expert_weights` is `[num_experts, embed_dim]`. Experts are ranked per
/// token by dot-product score (descending, stable), and the indices of the
/// `top_k` best are returned together with the token index.
fn batch_routing_cpu(
    token_embeddings: &[f32],
    expert_weights: &[f32],
    num_tokens: usize,
    embed_dim: usize,
    num_experts: usize,
    top_k: usize,
) -> Vec<(usize, Vec<usize>)> {
    (0..num_tokens)
        .map(|t| {
            let token = &token_embeddings[t * embed_dim..(t + 1) * embed_dim];
            // Dot-product affinity of this token with every expert.
            let mut ranked: Vec<(usize, f32)> = (0..num_experts)
                .map(|e| {
                    let expert = &expert_weights[e * embed_dim..(e + 1) * embed_dim];
                    let score: f32 = token.iter().zip(expert).map(|(a, b)| a * b).sum();
                    (e, score)
                })
                .collect();
            // Descending by score; NaN comparisons fall back to Equal to keep
            // the sort total.
            ranked.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
            let chosen: Vec<usize> = ranked.iter().take(top_k).map(|&(e, _)| e).collect();
            (t, chosen)
        })
        .collect()
}
// ============================================================================
// GPU IMPLEMENTATIONS (SIMULATED WITHOUT ACTUAL GPU)
// When gpu feature is enabled, these would use actual GPU code
// ============================================================================
#[cfg(feature = "gpu")]
mod gpu_impl {
    //! GPU implementations using wgpu or similar
    //!
    //! These would contain actual GPU shader code and buffer management.
    //! For now, we simulate the overhead.
    use super::*;
    /// Simulated GPU energy computation
    /// In reality, this would:
    /// 1. Upload node states to GPU buffer
    /// 2. Execute compute shader for parallel residual computation
    /// 3. Reduce edge energies
    /// 4. Read back result
    pub fn compute_energy_gpu(graph: &CpuSheafGraph) -> f32 {
        // Simulate uploading every node state (host -> device, f32 = 4 bytes).
        let _upload_time = simulate_memory_transfer(
            graph.nodes.len() * graph.state_dim * 4, // bytes
            true,
        );
        // Actual computation would happen on GPU; fall back to the CPU path.
        let result = graph.compute_energy_cpu();
        // Simulate reading back the single f32 total (device -> host).
        let _download_time = simulate_memory_transfer(4, false);
        result
    }
    /// Simulated GPU attention forward pass
    pub fn attention_forward_gpu(
        queries: &[f32],
        keys: &[f32],
        values: &[f32],
        seq_len: usize,
        head_dim: usize,
        output: &mut [f32],
    ) {
        // Simulate upload of the Q/K/V buffers.
        let input_bytes = (queries.len() + keys.len() + values.len()) * 4;
        let _upload_time = simulate_memory_transfer(input_bytes, true);
        // CPU fallback
        attention_forward_cpu(queries, keys, values, seq_len, head_dim, output);
        // Simulate download of the output buffer.
        let _download_time = simulate_memory_transfer(output.len() * 4, false);
    }
    /// Simulated GPU batch routing
    pub fn batch_routing_gpu(
        token_embeddings: &[f32],
        expert_weights: &[f32],
        num_tokens: usize,
        embed_dim: usize,
        num_experts: usize,
        top_k: usize,
    ) -> Vec<(usize, Vec<usize>)> {
        // Simulate upload of the token and expert matrices.
        let input_bytes = (token_embeddings.len() + expert_weights.len()) * 4;
        let _upload_time = simulate_memory_transfer(input_bytes, true);
        // CPU fallback
        let result = batch_routing_cpu(
            token_embeddings,
            expert_weights,
            num_tokens,
            embed_dim,
            num_experts,
            top_k,
        );
        // Simulate download of the expert-index table.
        let result_bytes = num_tokens * top_k * 4;
        let _download_time = simulate_memory_transfer(result_bytes, false);
        result
    }
    /// Simulate memory transfer time; returns simulated nanoseconds.
    ///
    /// Models a fixed ~1us launch/latency overhead plus a bandwidth-limited
    /// copy at ~10 GB/s (PCIe 3.0 x16 ballpark) in either direction.
    ///
    /// Fix: the previous formula `bytes * 100 / 1_000_000_000` worked out to
    /// bytes / 10^7 ns (~10 PB/s), a factor of 10^6 faster than the stated
    /// 10 GB/s. At 10 GB/s the cost is 10 bytes per nanosecond.
    fn simulate_memory_transfer(bytes: usize, _host_to_device: bool) -> u64 {
        let base_overhead_ns = 1000; // 1 microsecond base overhead
        let transfer_ns = bytes as u64 / 10; // 10 bytes/ns == 10 GB/s
        base_overhead_ns + transfer_ns
    }
}
// Fallback for non-GPU builds
//
// Mirrors the public API of the `gpu`-feature module above so call sites
// compile unchanged; every function simply delegates to the CPU
// implementation with no simulated transfer overhead.
#[cfg(not(feature = "gpu"))]
mod gpu_impl {
    use super::*;
    /// CPU stand-in for the GPU energy computation.
    pub fn compute_energy_gpu(graph: &CpuSheafGraph) -> f32 {
        graph.compute_energy_cpu()
    }
    /// CPU stand-in for the GPU attention forward pass.
    pub fn attention_forward_gpu(
        queries: &[f32],
        keys: &[f32],
        values: &[f32],
        seq_len: usize,
        head_dim: usize,
        output: &mut [f32],
    ) {
        attention_forward_cpu(queries, keys, values, seq_len, head_dim, output);
    }
    /// CPU stand-in for the GPU batch routing.
    pub fn batch_routing_gpu(
        token_embeddings: &[f32],
        expert_weights: &[f32],
        num_tokens: usize,
        embed_dim: usize,
        num_experts: usize,
        top_k: usize,
    ) -> Vec<(usize, Vec<usize>)> {
        batch_routing_cpu(
            token_embeddings,
            expert_weights,
            num_tokens,
            embed_dim,
            num_experts,
            top_k,
        )
    }
}
// ============================================================================
// ENERGY COMPUTATION BENCHMARKS
// ============================================================================
/// Compare full-graph energy computation on CPU vs the (feature-gated) GPU path.
fn bench_energy_cpu_vs_gpu(c: &mut Criterion) {
    let mut group = c.benchmark_group("gpu_energy");
    // (node count, criterion sample count) — fewer samples for large graphs.
    for &(num_nodes, sample_size) in &[(1_000, 50), (10_000, 30), (100_000, 10)] {
        let graph = CpuSheafGraph::random(num_nodes, 4, 64, 42);
        group.sample_size(sample_size);
        group.throughput(Throughput::Elements(graph.edges.len() as u64));
        group.bench_with_input(BenchmarkId::new("cpu", num_nodes), &num_nodes, |b, _| {
            b.iter(|| black_box(graph.compute_energy_cpu()))
        });
        #[cfg(feature = "gpu")]
        group.bench_with_input(BenchmarkId::new("gpu", num_nodes), &num_nodes, |b, _| {
            b.iter(|| black_box(gpu_impl::compute_energy_gpu(&graph)))
        });
    }
    group.finish();
}
/// Benchmark energy computation with per-edge tracking
///
/// Per-edge energies are what hotspot detection consumes; this measures the
/// extra cost of materializing them alongside the total.
fn bench_energy_with_edges(c: &mut Criterion) {
    let mut group = c.benchmark_group("gpu_energy_with_edges");
    for &num_nodes in &[1_000, 10_000] {
        let g = CpuSheafGraph::random(num_nodes, 4, 64, 42);
        group.throughput(Throughput::Elements(g.edges.len() as u64));
        group.bench_with_input(BenchmarkId::new("cpu", num_nodes), &num_nodes, |b, _| {
            b.iter(|| black_box(g.compute_energy_with_edges_cpu()))
        });
        // A GPU variant would additionally return the per-edge energies.
    }
    group.finish();
}
// ============================================================================
// ATTENTION BENCHMARKS
// ============================================================================
/// Single-head attention forward pass, CPU vs (feature-gated) GPU.
fn bench_attention_cpu_vs_gpu(c: &mut Criterion) {
    let mut group = c.benchmark_group("gpu_attention");
    // (seq_len, head_dim, benchmark label); cost scales as seq_len^2.
    let configs = [(128, 64, "small"), (512, 64, "medium"), (2048, 64, "large")];
    for &(seq_len, head_dim, label) in &configs {
        let n = seq_len * head_dim;
        let q = generate_vec(n, 42);
        let k = generate_vec(n, 123);
        let v = generate_vec(n, 456);
        let mut out = vec![0.0f32; n];
        // Long sequences are quadratic-cost; sample them less often.
        group.sample_size(if seq_len > 1024 { 10 } else { 50 });
        group.throughput(Throughput::Elements((seq_len * seq_len) as u64));
        group.bench_with_input(BenchmarkId::new("cpu", label), &seq_len, |b, _| {
            b.iter(|| {
                attention_forward_cpu(
                    black_box(&q),
                    black_box(&k),
                    black_box(&v),
                    seq_len,
                    head_dim,
                    &mut out,
                );
                black_box(out[0])
            })
        });
        #[cfg(feature = "gpu")]
        group.bench_with_input(BenchmarkId::new("gpu", label), &seq_len, |b, _| {
            b.iter(|| {
                gpu_impl::attention_forward_gpu(
                    black_box(&q),
                    black_box(&k),
                    black_box(&v),
                    seq_len,
                    head_dim,
                    &mut out,
                );
                black_box(out[0])
            })
        });
    }
    group.finish();
}
/// Benchmark multi-head attention
///
/// The CPU path runs the heads back to back; a real GPU backend would batch
/// all heads into one dispatch.
fn bench_multihead_attention(c: &mut Criterion) {
    let mut group = c.benchmark_group("gpu_multihead_attention");
    let (seq_len, head_dim, num_heads) = (512, 64, 8);
    let per_head = seq_len * head_dim;
    let total = per_head * num_heads;
    let q = generate_vec(total, 42);
    let k = generate_vec(total, 123);
    let v = generate_vec(total, 456);
    let mut out = vec![0.0f32; total];
    group.sample_size(20);
    group.throughput(Throughput::Elements((seq_len * seq_len * num_heads) as u64));
    // CPU: sequential over heads.
    group.bench_function("cpu_sequential_heads", |b| {
        b.iter(|| {
            for head in 0..num_heads {
                let lo = head * per_head;
                let hi = lo + per_head;
                attention_forward_cpu(
                    &q[lo..hi],
                    &k[lo..hi],
                    &v[lo..hi],
                    seq_len,
                    head_dim,
                    &mut out[lo..hi],
                );
            }
            black_box(out[0])
        })
    });
    #[cfg(feature = "gpu")]
    group.bench_function("gpu_parallel_heads", |b| {
        b.iter(|| {
            // In reality, the GPU would process all heads in parallel.
            for head in 0..num_heads {
                let lo = head * per_head;
                let hi = lo + per_head;
                gpu_impl::attention_forward_gpu(
                    &q[lo..hi],
                    &k[lo..hi],
                    &v[lo..hi],
                    seq_len,
                    head_dim,
                    &mut out[lo..hi],
                );
            }
            black_box(out[0])
        })
    });
    group.finish();
}
// ============================================================================
// BATCH ROUTING BENCHMARKS (MoE)
// ============================================================================
/// MoE top-k routing throughput at transformer-scale embedding width.
fn bench_batch_routing_cpu_vs_gpu(c: &mut Criterion) {
    let mut group = c.benchmark_group("gpu_routing");
    let embed_dim = 768; // Typical transformer embedding
    let num_experts = 8;
    let top_k = 2;
    for &num_tokens in &[256, 1024, 4096] {
        let tokens = generate_vec(num_tokens * embed_dim, 42);
        let experts = generate_vec(num_experts * embed_dim, 123);
        group.sample_size(if num_tokens > 2048 { 20 } else { 50 });
        group.throughput(Throughput::Elements(num_tokens as u64));
        group.bench_with_input(BenchmarkId::new("cpu", num_tokens), &num_tokens, |b, _| {
            b.iter(|| {
                black_box(batch_routing_cpu(
                    black_box(&tokens),
                    black_box(&experts),
                    num_tokens,
                    embed_dim,
                    num_experts,
                    top_k,
                ))
            })
        });
        #[cfg(feature = "gpu")]
        group.bench_with_input(BenchmarkId::new("gpu", num_tokens), &num_tokens, |b, _| {
            b.iter(|| {
                black_box(gpu_impl::batch_routing_gpu(
                    black_box(&tokens),
                    black_box(&experts),
                    num_tokens,
                    embed_dim,
                    num_experts,
                    top_k,
                ))
            })
        });
    }
    group.finish();
}
// ============================================================================
// MEMORY TRANSFER BENCHMARKS
// ============================================================================
/// CPU-side memory access baseline across the transfer sizes a GPU copy
/// would incur; shows where GPU transfer cost would amortize.
fn bench_memory_transfer_overhead(c: &mut Criterion) {
    let mut group = c.benchmark_group("gpu_memory_transfer");
    // Simulated transfer sizes from latency-bound to bandwidth-bound.
    for &size_kb in &[1, 4, 16, 64, 256, 1024, 4096] {
        let floats = size_kb * 1024 / 4; // f32 = 4 bytes
        let data = generate_vec(floats, 42);
        group.throughput(Throughput::Bytes((size_kb * 1024) as u64));
        // Baseline: touching the same bytes purely on the CPU.
        group.bench_with_input(
            BenchmarkId::new("cpu_access", format!("{}KB", size_kb)),
            &size_kb,
            |b, _| {
                b.iter(|| {
                    let total: f32 = data.iter().sum();
                    black_box(total)
                })
            },
        );
        // A GPU run would add H2D/D2H copy overhead on top of this.
    }
    group.finish();
}
// ============================================================================
// CROSSOVER POINT BENCHMARKS
// ============================================================================
/// Find the problem size where GPU becomes faster than CPU
///
/// Naive dense matmul (O(n^3)) is the classic GPU workload: small sizes are
/// dominated by launch/transfer overhead, large ones by raw FLOPs.
fn bench_gpu_crossover(c: &mut Criterion) {
    let mut group = c.benchmark_group("gpu_crossover");
    for &size in &[32, 64, 128, 256, 512, 1024] {
        let lhs = generate_matrix(size, size, 42);
        let rhs = generate_matrix(size, size, 123);
        let mut out = vec![0.0f32; size * size];
        group.throughput(Throughput::Elements((size * size * size) as u64)); // O(n^3)
        group.sample_size(if size > 512 { 10 } else { 50 });
        // CPU matrix multiply (naive triple loop).
        group.bench_with_input(BenchmarkId::new("cpu_matmul", size), &size, |b, _| {
            b.iter(|| {
                for row in 0..size {
                    for col in 0..size {
                        let mut acc = 0.0f32;
                        for inner in 0..size {
                            acc += lhs[row * size + inner] * rhs[inner * size + col];
                        }
                        out[row * size + col] = acc;
                    }
                }
                black_box(out[0])
            })
        });
        // A GPU would typically win from around size 256.
    }
    group.finish();
}
// ============================================================================
// COHERENCE-SPECIFIC GPU PATTERNS
// ============================================================================
/// Benchmark parallel residual computation pattern
///
/// One residual per edge is the unit of work a GPU kernel would assign to
/// each work item; this measures the sequential CPU baseline.
fn bench_parallel_residual(c: &mut Criterion) {
    let mut group = c.benchmark_group("gpu_parallel_residual");
    let state_dim = 64;
    for &num_edges in &[1_000, 10_000, 100_000] {
        // Edge endpoint states laid out as the GPU would consume them.
        let lhs: Vec<Vec<f32>> = (0..num_edges)
            .map(|i| generate_vec(state_dim, i as u64))
            .collect();
        let rhs: Vec<Vec<f32>> = (0..num_edges)
            .map(|i| generate_vec(state_dim, i as u64 + 1000000))
            .collect();
        group.sample_size(if num_edges > 50000 { 10 } else { 50 });
        group.throughput(Throughput::Elements(num_edges as u64));
        // CPU sequential baseline over all residuals.
        group.bench_with_input(
            BenchmarkId::new("cpu_sequential", num_edges),
            &num_edges,
            |b, _| {
                b.iter(|| {
                    let mut acc = 0.0f32;
                    for (a, z) in lhs.iter().zip(rhs.iter()) {
                        let mut norm_sq = 0.0f32;
                        for d in 0..state_dim {
                            let diff = a[d] - z[d];
                            norm_sq += diff * diff;
                        }
                        acc += norm_sq;
                    }
                    black_box(acc)
                })
            },
        );
        // A GPU dispatch would compute every residual concurrently.
    }
    group.finish();
}
/// Benchmark reduction patterns (sum of energies)
fn bench_gpu_reduction(c: &mut Criterion) {
    let mut group = c.benchmark_group("gpu_reduction");
    for &size in &[1_000, 10_000, 100_000, 1_000_000] {
        let data = generate_vec(size, 42);
        group.sample_size(if size > 100000 { 10 } else { 50 });
        group.throughput(Throughput::Elements(size as u64));
        // Plain sequential sum.
        group.bench_with_input(BenchmarkId::new("cpu_sum", size), &size, |b, _| {
            b.iter(|| {
                let total: f32 = data.iter().sum();
                black_box(total)
            })
        });
        // Chunked partial sums mimic the first stage of a parallel reduction.
        group.bench_with_input(BenchmarkId::new("cpu_parallel", size), &size, |b, _| {
            b.iter(|| {
                let partials: Vec<f32> = data.chunks(1024).map(|c| c.iter().sum()).collect();
                let total: f32 = partials.iter().sum();
                black_box(total)
            })
        });
        // A GPU would instead use a tree-based parallel reduction.
    }
    group.finish();
}
// ============================================================================
// CRITERION CONFIGURATION
// ============================================================================
// Harness registration: each `criterion_group!` bundles related benchmarks,
// and `criterion_main!` generates the binary entry point that runs them all.
criterion_group!(
    energy_benches,
    bench_energy_cpu_vs_gpu,
    bench_energy_with_edges,
);
criterion_group!(
    attention_benches,
    bench_attention_cpu_vs_gpu,
    bench_multihead_attention,
);
criterion_group!(routing_benches, bench_batch_routing_cpu_vs_gpu,);
criterion_group!(
    transfer_benches,
    bench_memory_transfer_overhead,
    bench_gpu_crossover,
);
criterion_group!(
    coherence_gpu_benches,
    bench_parallel_residual,
    bench_gpu_reduction,
);
criterion_main!(
    energy_benches,
    attention_benches,
    routing_benches,
    transfer_benches,
    coherence_gpu_benches
);

View File

@@ -0,0 +1,488 @@
//! Benchmarks for Poincare distance computation
//!
//! ADR-014 Performance Target: < 500ns per Poincare distance
//!
//! Hyperbolic geometry enables hierarchy-aware coherence where
//! deeper nodes (further from origin) have different energy weights.
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
// ============================================================================
// Hyperbolic Geometry Functions
// ============================================================================
/// Compute squared Euclidean norm
#[inline]
fn squared_norm(x: &[f32]) -> f32 {
    x.iter().fold(0.0, |acc, v| acc + v * v)
}
/// Compute Euclidean norm
#[inline]
fn norm(x: &[f32]) -> f32 {
    // Sum of squares inlined (identical accumulation order to squared_norm).
    let sq: f32 = x.iter().map(|v| v * v).sum();
    sq.sqrt()
}
/// Compute squared Euclidean distance
#[inline]
fn squared_distance(x: &[f32], y: &[f32]) -> f32 {
    let mut acc = 0.0f32;
    for (a, b) in x.iter().zip(y) {
        let d = a - b;
        acc += d * d;
    }
    acc
}
/// Poincare distance in the Poincare ball model
///
/// d(x, y) = arcosh(1 + 2 * ||x - y||^2 / ((1 - ||x||^2) * (1 - ||y||^2)))
/// scaled by 1/sqrt(-curvature); `curvature` is expected to be negative.
///
/// where arcosh(z) = ln(z + sqrt(z^2 - 1))
#[inline]
pub fn poincare_distance(x: &[f32], y: &[f32], curvature: f32) -> f32 {
    // Each denominator factor is clamped away from zero so points numerically
    // on the ball boundary do not produce inf/NaN.
    let fx = (1.0 - squared_norm(x)).max(1e-10);
    let fy = (1.0 - squared_norm(y)).max(1e-10);
    let arg = 1.0 + 2.0 * squared_distance(x, y) / (fx * fy);
    // arcosh(arg); max(0.0) guards rounding slightly below 1.
    let arcosh = (arg + (arg * arg - 1.0).max(0.0).sqrt()).ln();
    arcosh / (-curvature).sqrt()
}
/// Optimized Poincare distance with fused operations
///
/// A single pass over both inputs accumulates ||x||^2, ||y||^2 and
/// ||x - y||^2 together before applying the arcosh formula.
#[inline]
pub fn poincare_distance_optimized(x: &[f32], y: &[f32], curvature: f32) -> f32 {
    let (mut nx, mut ny, mut dd) = (0.0f32, 0.0f32, 0.0f32);
    for i in 0..x.len() {
        let (a, b) = (x[i], y[i]);
        nx += a * a;
        ny += b * b;
        let d = a - b;
        dd += d * d;
    }
    let denom = (1.0 - nx).max(1e-10) * (1.0 - ny).max(1e-10);
    let arg = 1.0 + 2.0 * dd / denom;
    let arcosh = (arg + (arg * arg - 1.0).max(0.0).sqrt()).ln();
    arcosh / (-curvature).sqrt()
}
/// SIMD-friendly Poincare distance (chunked)
///
/// Accumulates into four independent lanes so the compiler can
/// auto-vectorize; tail elements (fewer than 4) all fold into lane 0,
/// preserving the exact accumulation order of the reference layout.
#[inline]
pub fn poincare_distance_simd_friendly(x: &[f32], y: &[f32], curvature: f32) -> f32 {
    let mut nx = [0.0f32; 4];
    let mut ny = [0.0f32; 4];
    let mut dd = [0.0f32; 4];
    let full_chunks = x.len() / 4;
    for chunk in 0..full_chunks {
        let base = chunk * 4;
        for lane in 0..4 {
            let a = x[base + lane];
            let b = y[base + lane];
            nx[lane] += a * a;
            ny[lane] += b * b;
            let d = a - b;
            dd[lane] += d * d;
        }
    }
    // Remainder: everything past the last full chunk goes into lane 0.
    for i in full_chunks * 4..x.len() {
        let a = x[i];
        let b = y[i];
        nx[0] += a * a;
        ny[0] += b * b;
        let d = a - b;
        dd[0] += d * d;
    }
    // Reduce the lanes, then finish with the shared arcosh formula.
    let sum_nx: f32 = nx.iter().sum();
    let sum_ny: f32 = ny.iter().sum();
    let sum_dd: f32 = dd.iter().sum();
    let denom = (1.0 - sum_nx).max(1e-10) * (1.0 - sum_ny).max(1e-10);
    let arg = 1.0 + 2.0 * sum_dd / denom;
    let arcosh = (arg + (arg * arg - 1.0).max(0.0).sqrt()).ln();
    arcosh / (-curvature).sqrt()
}
/// Mobius addition in the Poincare ball
///
/// x + y = ((1 + 2c<x,y> + c||y||^2)x + (1 - c||x||^2)y)
///         / (1 + 2c<x,y> + c^2||x||^2||y||^2), with c = -curvature.
pub fn mobius_add(x: &[f32], y: &[f32], curvature: f32) -> Vec<f32> {
    let c = -curvature;
    // Inner products inlined for self-containment (same accumulation order).
    let nx: f32 = x.iter().map(|v| v * v).sum();
    let ny: f32 = y.iter().map(|v| v * v).sum();
    let dot: f32 = x.iter().zip(y.iter()).map(|(a, b)| a * b).sum();
    let coeff_x = 1.0 + 2.0 * c * dot + c * ny;
    let coeff_y = 1.0 - c * nx;
    let denom = 1.0 + 2.0 * c * dot + c * c * nx * ny;
    x.iter()
        .zip(y.iter())
        .map(|(xi, yi)| (coeff_x * xi + coeff_y * yi) / denom)
        .collect()
}
/// Exponential map at point p with tangent vector v
///
/// Maps the tangent vector `v` (in the tangent space at `p`) onto the
/// Poincare ball; `curvature` is expected to be negative. A (near-)zero
/// tangent returns `p` unchanged, avoiding division by ~0 below.
pub fn exp_map(v: &[f32], p: &[f32], curvature: f32) -> Vec<f32> {
    let c = -curvature;
    let v_norm = norm(v);
    if v_norm < 1e-10 {
        return p.to_vec();
    }
    // Conformal factor lambda_p = 2 / (1 - c||p||^2), clamped for stability
    // when p is numerically on the ball boundary.
    let lambda_p = 2.0 / (1.0 - c * squared_norm(p)).max(1e-10);
    let t = (c.sqrt() * lambda_p * v_norm / 2.0).tanh();
    let factor = t / (c.sqrt() * v_norm);
    // Scale v to the geodesic step length, then translate via Mobius addition.
    let v_scaled: Vec<f32> = v.iter().map(|vi| factor * vi).collect();
    mobius_add(p, &v_scaled, curvature)
}
/// Logarithmic map from point p to point q
///
/// Returns the tangent vector at `p` pointing toward `q` (the inverse
/// direction of `exp_map`); coincident points (within ~1e-10) yield the
/// zero vector. `curvature` is expected to be negative.
pub fn log_map(q: &[f32], p: &[f32], curvature: f32) -> Vec<f32> {
    let c = -curvature;
    // Compute -p + q: Mobius-translate q so that p sits at the origin.
    let neg_p: Vec<f32> = p.iter().map(|x| -x).collect();
    let diff = mobius_add(&neg_p, q, curvature);
    let diff_norm = norm(&diff);
    if diff_norm < 1e-10 {
        return vec![0.0; p.len()];
    }
    // Conformal factor at p, clamped to avoid division by ~0 near the boundary.
    let lambda_p = 2.0 / (1.0 - c * squared_norm(p)).max(1e-10);
    let factor = 2.0 / (c.sqrt() * lambda_p) * (c.sqrt() * diff_norm).atanh() / diff_norm;
    diff.iter().map(|d| factor * d).collect()
}
/// Project vector to Poincare ball (ensure ||x|| < 1/sqrt(c))
///
/// Points already inside the (slightly shrunken) ball are copied unchanged;
/// anything on or outside the limit is radially rescaled onto it.
pub fn project_to_ball(x: &[f32], curvature: f32) -> Vec<f32> {
    // Largest admissible norm, pulled a hair inside the open ball.
    let limit = 1.0 / (-curvature).sqrt() - 1e-5;
    let magnitude = norm(x);
    if magnitude < limit {
        x.to_vec()
    } else {
        let shrink = limit / magnitude;
        x.iter().map(|v| v * shrink).collect()
    }
}
/// Compute depth (distance from origin) in Poincare ball
///
/// Equivalent to `poincare_distance(x, &zeros, curvature)` but evaluated in
/// closed form: with y = 0 the distance reduces to
/// `arcosh(1 + 2||x||^2 / (1 - ||x||^2)) / sqrt(-curvature)`.
/// This avoids allocating a zero vector on every call — depth sits inside
/// per-edge energy loops with a < 500ns ADR-014 budget.
#[inline]
pub fn poincare_depth(x: &[f32], curvature: f32) -> f32 {
    let sq_norm: f32 = x.iter().map(|v| v * v).sum();
    // Same boundary clamping as poincare_distance (the y-factor is exactly 1).
    let denom = (1.0 - sq_norm).max(1e-10);
    let arg = 1.0 + 2.0 * sq_norm / denom;
    let arcosh = (arg + (arg * arg - 1.0).max(0.0).sqrt()).ln();
    arcosh / (-curvature).sqrt()
}
// ============================================================================
// Test Data Generation
// ============================================================================
/// Deterministic pseudo-random point inside the Poincare ball.
///
/// Components are hash-derived values in [-0.5, 0.5), then rescaled so the
/// point's Euclidean norm is 90% of `max_norm`; a degenerate all-zero draw
/// is returned unscaled.
fn generate_point(dim: usize, seed: u64, max_norm: f32) -> Vec<f32> {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};
    let raw: Vec<f32> = (0..dim)
        .map(|component| {
            let mut hasher = DefaultHasher::new();
            (seed, component).hash(&mut hasher);
            (hasher.finish() % 1000) as f32 / 1000.0 - 0.5
        })
        .collect();
    // Norm inlined so the generator is self-contained.
    let magnitude = raw.iter().map(|v| v * v).sum::<f32>().sqrt();
    if magnitude > 0.0 {
        let rescale = max_norm / magnitude * 0.9; // 90% of max
        raw.iter().map(|v| v * rescale).collect()
    } else {
        raw
    }
}
// ============================================================================
// Benchmarks
// ============================================================================
/// Benchmark Poincare distance at various dimensions
///
/// Compares the three implementations (naive, fused, lane-chunked) across
/// embedding sizes; ADR-014 targets < 500ns per distance.
fn bench_poincare_distance(c: &mut Criterion) {
    let mut group = c.benchmark_group("hyperbolic_poincare_distance");
    group.throughput(Throughput::Elements(1));
    let curvature = -1.0;
    for &dim in &[8, 32, 64, 128, 256, 512] {
        let p = generate_point(dim, 42, 0.9);
        let q = generate_point(dim, 123, 0.9);
        group.bench_with_input(BenchmarkId::new("standard", dim), &dim, |b, _| {
            b.iter(|| poincare_distance(black_box(&p), black_box(&q), black_box(curvature)))
        });
        group.bench_with_input(BenchmarkId::new("optimized", dim), &dim, |b, _| {
            b.iter(|| {
                poincare_distance_optimized(black_box(&p), black_box(&q), black_box(curvature))
            })
        });
        group.bench_with_input(BenchmarkId::new("simd_friendly", dim), &dim, |b, _| {
            b.iter(|| {
                poincare_distance_simd_friendly(black_box(&p), black_box(&q), black_box(curvature))
            })
        });
    }
    group.finish();
}
/// Benchmark Mobius addition
fn bench_mobius_add(c: &mut Criterion) {
    let mut group = c.benchmark_group("hyperbolic_mobius_add");
    group.throughput(Throughput::Elements(1));
    let curvature = -1.0;
    for &dim in &[8, 32, 64, 128] {
        let p = generate_point(dim, 42, 0.5);
        let q = generate_point(dim, 123, 0.5);
        group.bench_with_input(BenchmarkId::new("dim", dim), &dim, |b, _| {
            b.iter(|| mobius_add(black_box(&p), black_box(&q), black_box(curvature)))
        });
    }
    group.finish();
}
/// Benchmark exp/log maps
fn bench_exp_log_map(c: &mut Criterion) {
    let mut group = c.benchmark_group("hyperbolic_exp_log");
    let dim = 32;
    let curvature = -1.0;
    let base = generate_point(dim, 42, 0.3);
    // Small deterministic tangent vector at `base`.
    let tangent: Vec<f32> = (0..dim).map(|i| ((i as f32 * 0.1).sin() * 0.2)).collect();
    let destination = generate_point(dim, 123, 0.4);
    group.bench_function("exp_map", |b| {
        b.iter(|| exp_map(black_box(&tangent), black_box(&base), black_box(curvature)))
    });
    group.bench_function("log_map", |b| {
        b.iter(|| log_map(black_box(&destination), black_box(&base), black_box(curvature)))
    });
    group.finish();
}
/// Benchmark projection to ball
fn bench_projection(c: &mut Criterion) {
    let mut group = c.benchmark_group("hyperbolic_projection");
    group.throughput(Throughput::Elements(1));
    let curvature = -1.0;
    for &dim in &[8, 32, 64, 128, 256] {
        // Sinusoidal point intended to land outside the ball so the
        // rescaling branch does real work.
        let outside: Vec<f32> = (0..dim).map(|i| ((i as f32 * 0.1).sin())).collect();
        group.bench_with_input(BenchmarkId::new("project", dim), &dim, |b, _| {
            b.iter(|| project_to_ball(black_box(&outside), black_box(curvature)))
        });
    }
    group.finish();
}
/// Benchmark depth computation
fn bench_depth(c: &mut Criterion) {
    let mut group = c.benchmark_group("hyperbolic_depth");
    group.throughput(Throughput::Elements(1));
    let curvature = -1.0;
    for &dim in &[8, 32, 64, 128, 256] {
        let p = generate_point(dim, 42, 0.9);
        group.bench_with_input(BenchmarkId::new("depth", dim), &dim, |b, _| {
            b.iter(|| poincare_depth(black_box(&p), black_box(curvature)))
        });
    }
    group.finish();
}
/// Benchmark batch distance computation
fn bench_batch_distance(c: &mut Criterion) {
    let mut group = c.benchmark_group("hyperbolic_batch_distance");
    let dim = 64;
    let curvature = -1.0;
    for &batch_size in &[10, 100, 1000] {
        let corpus: Vec<Vec<f32>> = (0..batch_size)
            .map(|i| generate_point(dim, i as u64, 0.9))
            .collect();
        let query = generate_point(dim, 999, 0.9);
        group.throughput(Throughput::Elements(batch_size as u64));
        group.bench_with_input(
            BenchmarkId::new("batch", batch_size),
            &batch_size,
            |b, _| {
                b.iter(|| {
                    // Query-to-corpus distances, materialized per iteration.
                    let dists: Vec<f32> = corpus
                        .iter()
                        .map(|p| poincare_distance(&query, p, curvature))
                        .collect();
                    black_box(dists)
                })
            },
        );
    }
    group.finish();
}
/// Benchmark k-nearest in hyperbolic space
///
/// Scores the query against the full corpus, then selects the k smallest
/// distances. Fix: the previous code claimed "partial sort" but fully
/// sorted all 1000 candidates (O(n log n)); `select_nth_unstable_by` is
/// O(n) average and only the k survivors are sorted afterwards, so the
/// returned `(index, distance)` pairs are unchanged for distinct distances.
fn bench_knn_hyperbolic(c: &mut Criterion) {
    let mut group = c.benchmark_group("hyperbolic_knn");
    group.sample_size(50);
    let dim = 64;
    let curvature = -1.0;
    let points: Vec<Vec<f32>> = (0..1000)
        .map(|i| generate_point(dim, i as u64, 0.9))
        .collect();
    let query = generate_point(dim, 999, 0.9);
    for k in [1, 5, 10, 50] {
        group.bench_with_input(BenchmarkId::new("k", k), &k, |b, &k| {
            b.iter(|| {
                // Distance from the query to every corpus point.
                let mut distances: Vec<(usize, f32)> = points
                    .iter()
                    .enumerate()
                    .map(|(i, p)| (i, poincare_distance(&query, p, curvature)))
                    .collect();
                // O(n) selection of the k nearest, then sort just those k.
                distances.select_nth_unstable_by(k - 1, |a, b| a.1.partial_cmp(&b.1).unwrap());
                let nearest = &mut distances[..k];
                nearest.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
                let result: Vec<(usize, f32)> = nearest.to_vec();
                black_box(result)
            })
        });
    }
    group.finish();
}
/// Benchmark hierarchy-weighted energy computation
///
/// Edge energies along a node chain are weighted by 1 + max(0, ln(avg
/// depth)), so deeper (near-boundary) regions contribute more; this
/// compares the cost near the origin vs near the boundary.
fn bench_hierarchy_weighted_energy(c: &mut Criterion) {
    let mut group = c.benchmark_group("hyperbolic_hierarchy_energy");
    let dim = 64;
    let curvature = -1.0;
    // Shallow chain near the origin, deep chain near the boundary.
    let shallow_nodes: Vec<Vec<f32>> = (0..100)
        .map(|i| generate_point(dim, i as u64, 0.3))
        .collect();
    let deep_nodes: Vec<Vec<f32>> = (0..100)
        .map(|i| generate_point(dim, (i + 100) as u64, 0.9))
        .collect();
    // Depth-weighted energy of consecutive pairs along the chain.
    let chain_energy = |nodes: &[Vec<f32>]| -> f32 {
        let mut total = 0.0f32;
        for pair in nodes.windows(2) {
            let depth_a = poincare_depth(&pair[0], curvature);
            let depth_b = poincare_depth(&pair[1], curvature);
            let avg_depth = (depth_a + depth_b) / 2.0;
            let weight = 1.0 + avg_depth.ln().max(0.0);
            let dist = poincare_distance(&pair[0], &pair[1], curvature);
            total += weight * dist * dist;
        }
        total
    };
    group.bench_function("shallow_energy", |b| {
        b.iter(|| black_box(chain_energy(&shallow_nodes)))
    });
    group.bench_function("deep_energy", |b| {
        b.iter(|| black_box(chain_energy(&deep_nodes)))
    });
    group.finish();
}
/// Benchmark curvature impact
///
/// In `poincare_distance` the curvature only enters as a final scalar
/// divide, so timings should be essentially flat across values.
fn bench_curvature_impact(c: &mut Criterion) {
    let mut group = c.benchmark_group("hyperbolic_curvature");
    let dim = 64;
    let p = generate_point(dim, 42, 0.5);
    let q = generate_point(dim, 123, 0.5);
    for &curvature in &[-0.1, -0.5, -1.0, -2.0, -5.0] {
        group.bench_with_input(
            BenchmarkId::new("curvature", format!("{:.1}", curvature)),
            &curvature,
            |b, &c| b.iter(|| poincare_distance(black_box(&p), black_box(&q), black_box(c))),
        );
    }
    group.finish();
}
// Single group covering all hyperbolic-geometry benchmarks; criterion_main!
// generates the harness entry point.
criterion_group!(
    benches,
    bench_poincare_distance,
    bench_mobius_add,
    bench_exp_log_map,
    bench_projection,
    bench_depth,
    bench_batch_distance,
    bench_knn_hyperbolic,
    bench_hierarchy_weighted_energy,
    bench_curvature_impact,
);
criterion_main!(benches);

View File

@@ -0,0 +1,608 @@
//! Benchmarks for incremental coherence updates
//!
//! ADR-014 Performance Target: < 100us for single node update
//!
//! Incremental computation recomputes only affected edges when
//! a single node changes, avoiding full graph recomputation.
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
use std::collections::{HashMap, HashSet};
// ============================================================================
// Types (Simulated for benchmarking)
// ============================================================================
/// Dense affine restriction map: `output = matrix * input + bias`,
/// with `matrix` stored row-major as `[output_dim, input_dim]`.
#[derive(Clone)]
pub struct RestrictionMap {
    pub matrix: Vec<f32>,
    pub bias: Vec<f32>,
    pub input_dim: usize,
    pub output_dim: usize,
}
impl RestrictionMap {
    /// Identity map on a `dim`-dimensional space (zero bias).
    pub fn identity(dim: usize) -> Self {
        let mut matrix = vec![0.0f32; dim * dim];
        for d in 0..dim {
            matrix[d * dim + d] = 1.0;
        }
        Self {
            matrix,
            bias: vec![0.0; dim],
            input_dim: dim,
            output_dim: dim,
        }
    }
    /// Apply the map into a caller-provided buffer (no allocation).
    #[inline]
    pub fn apply_into(&self, input: &[f32], output: &mut [f32]) {
        // Start from the bias, then accumulate each matrix row.
        output.copy_from_slice(&self.bias);
        for row in 0..self.output_dim {
            let coeffs = &self.matrix[row * self.input_dim..(row + 1) * self.input_dim];
            for (w, v) in coeffs.iter().zip(&input[..self.input_dim]) {
                output[row] += w * v;
            }
        }
    }
}
/// A sheaf graph node: an identifier plus its local state vector.
#[derive(Clone)]
pub struct SheafNode {
    // Unique node identifier (used as the key in the graph's node map).
    pub id: u64,
    // Node state; expected to have the graph's `state_dim` length — the
    // energy routines size their scratch buffers to `state_dim` (not enforced here).
    pub state: Vec<f32>,
}
/// A weighted sheaf edge with a restriction map for each endpoint.
#[derive(Clone)]
pub struct SheafEdge {
    pub id: u64,
    pub source: u64,
    pub target: u64,
    pub weight: f32,
    pub rho_source: RestrictionMap,
    pub rho_target: RestrictionMap,
}
impl SheafEdge {
    /// Weighted residual energy w * ||rho_s(source) - rho_t(target)||^2.
    ///
    /// The restricted states are written into the caller's scratch buffers
    /// so repeated calls do not allocate.
    #[inline]
    pub fn weighted_residual_energy_into(
        &self,
        source: &[f32],
        target: &[f32],
        source_buf: &mut [f32],
        target_buf: &mut [f32],
    ) -> f32 {
        self.rho_source.apply_into(source, source_buf);
        self.rho_target.apply_into(target, target_buf);
        let mut residual_sq = 0.0f32;
        for i in 0..source_buf.len() {
            let delta = source_buf[i] - target_buf[i];
            residual_sq += delta * delta;
        }
        self.weight * residual_sq
    }
}
/// Incremental coherence tracker
///
/// Caches per-edge energies and their total so that a single-node update
/// only needs to recompute the node's incident edges (ADR-014 target:
/// < 100us per node update) instead of the whole graph.
pub struct IncrementalCoherence {
    pub nodes: HashMap<u64, SheafNode>,
    pub edges: Vec<SheafEdge>,
    pub state_dim: usize,
    /// Node -> incident edge indices
    pub node_to_edges: HashMap<u64, Vec<usize>>,
    /// Cached per-edge energies (index-aligned with `edges`)
    pub edge_energies: Vec<f32>,
    /// Cached total energy
    pub total_energy: f32,
    /// Fingerprint for staleness detection
    pub fingerprint: u64,
}
impl IncrementalCoherence {
pub fn new(nodes: HashMap<u64, SheafNode>, edges: Vec<SheafEdge>, state_dim: usize) -> Self {
// Build node-to-edge index
let mut node_to_edges: HashMap<u64, Vec<usize>> = HashMap::new();
for (idx, edge) in edges.iter().enumerate() {
node_to_edges.entry(edge.source).or_default().push(idx);
node_to_edges.entry(edge.target).or_default().push(idx);
}
let mut tracker = Self {
nodes,
edges,
state_dim,
node_to_edges,
edge_energies: Vec::new(),
total_energy: 0.0,
fingerprint: 0,
};
tracker.full_recompute();
tracker
}
/// Full recomputation (initial or when needed)
pub fn full_recompute(&mut self) {
let mut source_buf = vec![0.0f32; self.state_dim];
let mut target_buf = vec![0.0f32; self.state_dim];
self.edge_energies = self
.edges
.iter()
.map(|edge| {
let source_state = &self.nodes[&edge.source].state;
let target_state = &self.nodes[&edge.target].state;
edge.weighted_residual_energy_into(
source_state,
target_state,
&mut source_buf,
&mut target_buf,
)
})
.collect();
self.total_energy = self.edge_energies.iter().sum();
self.update_fingerprint();
}
/// Update single node and recompute affected edges only
pub fn update_node(&mut self, node_id: u64, new_state: Vec<f32>) {
// Update node state
if let Some(node) = self.nodes.get_mut(&node_id) {
node.state = new_state;
} else {
return;
}
// Get affected edges
let affected_edges = match self.node_to_edges.get(&node_id) {
Some(edges) => edges.clone(),
None => return,
};
// Recompute only affected edges
let mut source_buf = vec![0.0f32; self.state_dim];
let mut target_buf = vec![0.0f32; self.state_dim];
let mut energy_delta = 0.0f32;
for &edge_idx in &affected_edges {
let edge = &self.edges[edge_idx];
let source_state = &self.nodes[&edge.source].state;
let target_state = &self.nodes[&edge.target].state;
let old_energy = self.edge_energies[edge_idx];
let new_energy = edge.weighted_residual_energy_into(
source_state,
target_state,
&mut source_buf,
&mut target_buf,
);
energy_delta += new_energy - old_energy;
self.edge_energies[edge_idx] = new_energy;
}
self.total_energy += energy_delta;
self.update_fingerprint();
}
/// Update multiple nodes in batch
pub fn update_nodes_batch(&mut self, updates: Vec<(u64, Vec<f32>)>) {
// Collect all affected edges
let mut affected_edges: HashSet<usize> = HashSet::new();
for (node_id, new_state) in updates {
if let Some(node) = self.nodes.get_mut(&node_id) {
node.state = new_state;
}
if let Some(edges) = self.node_to_edges.get(&node_id) {
affected_edges.extend(edges.iter());
}
}
// Recompute affected edges
let mut source_buf = vec![0.0f32; self.state_dim];
let mut target_buf = vec![0.0f32; self.state_dim];
let mut energy_delta = 0.0f32;
for edge_idx in affected_edges {
let edge = &self.edges[edge_idx];
let source_state = &self.nodes[&edge.source].state;
let target_state = &self.nodes[&edge.target].state;
let old_energy = self.edge_energies[edge_idx];
let new_energy = edge.weighted_residual_energy_into(
source_state,
target_state,
&mut source_buf,
&mut target_buf,
);
energy_delta += new_energy - old_energy;
self.edge_energies[edge_idx] = new_energy;
}
self.total_energy += energy_delta;
self.update_fingerprint();
}
fn update_fingerprint(&mut self) {
self.fingerprint = self.fingerprint.wrapping_add(1);
}
/// Get current total energy
pub fn energy(&self) -> f32 {
self.total_energy
}
/// Get energy for specific edge
pub fn edge_energy(&self, edge_idx: usize) -> f32 {
self.edge_energies[edge_idx]
}
/// Check if cache is stale (fingerprint changed)
pub fn is_stale(&self, last_fingerprint: u64) -> bool {
self.fingerprint != last_fingerprint
}
}
// ============================================================================
// Test Data Generation
// ============================================================================
/// Deterministic pseudo-random state vector with entries in [-0.5, 0.5),
/// reproducible for a given `seed`.
fn generate_state(dim: usize, seed: u64) -> Vec<f32> {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};
    let mut state = Vec::with_capacity(dim);
    for idx in 0..dim {
        let mut hasher = DefaultHasher::new();
        (seed, idx).hash(&mut hasher);
        let bucket = hasher.finish() % 1000;
        state.push(bucket as f32 / 1000.0 - 0.5);
    }
    state
}
/// Build a deterministic pseudo-random sheaf graph with roughly `avg_degree`
/// incident edges per node (hash-derived endpoints; self-loops are dropped,
/// so the final edge count may be slightly lower).
fn create_random_graph(
    num_nodes: usize,
    avg_degree: usize,
    state_dim: usize,
) -> IncrementalCoherence {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};
    // One node per id, each with a seed-derived state vector.
    let mut nodes: HashMap<u64, SheafNode> = HashMap::with_capacity(num_nodes);
    for id in 0..num_nodes as u64 {
        nodes.insert(
            id,
            SheafNode {
                id,
                state: generate_state(state_dim, id),
            },
        );
    }
    // Hash-derived endpoint in [0, num_nodes), keyed by edge index and tag.
    let endpoint = |i: usize, tag: &str| -> u64 {
        let mut hasher = DefaultHasher::new();
        (42u64, i, tag).hash(&mut hasher);
        hasher.finish() % num_nodes as u64
    };
    let num_edges = (num_nodes * avg_degree) / 2;
    let mut edges = Vec::with_capacity(num_edges);
    for i in 0..num_edges {
        let source = endpoint(i, "src");
        let target = endpoint(i, "tgt");
        if source == target {
            continue;
        }
        edges.push(SheafEdge {
            id: i as u64,
            source,
            target,
            weight: 1.0,
            rho_source: RestrictionMap::identity(state_dim),
            rho_target: RestrictionMap::identity(state_dim),
        });
    }
    IncrementalCoherence::new(nodes, edges, state_dim)
}
// ============================================================================
// Benchmarks
// ============================================================================
/// Benchmark single node update at various graph sizes
///
/// Measures one incremental `update_node` (plus state generation) on graphs
/// of 100 / 1K / 10K nodes with average degree 4.
fn bench_single_node_update(c: &mut Criterion) {
    let mut group = c.benchmark_group("incremental_single_node");
    group.throughput(Throughput::Elements(1));
    // ADR-014 target: <100us for single node update
    for num_nodes in [100, 1_000, 10_000] {
        let state_dim = 64;
        let avg_degree = 4;
        let mut tracker = create_random_graph(num_nodes, avg_degree, state_dim);
        group.bench_with_input(
            BenchmarkId::new("update", format!("{}nodes", num_nodes)),
            &num_nodes,
            |b, _| {
                let node_id = (num_nodes / 2) as u64; // Update middle node
                b.iter(|| {
                    // State generation runs inside the timed closure, so its
                    // hashing cost is included in the measurement.
                    let new_state = generate_state(state_dim, black_box(rand::random()));
                    tracker.update_node(black_box(node_id), new_state);
                    black_box(tracker.energy())
                })
            },
        );
    }
    group.finish();
}
/// Benchmark incremental vs full recomputation
///
/// Contrasts a single-node incremental update against recomputing every edge
/// energy on the same 10K-node graph (ADR-014's <10ms full-graph target).
fn bench_incremental_vs_full(c: &mut Criterion) {
    let mut group = c.benchmark_group("incremental_vs_full");
    let num_nodes = 10_000;
    let state_dim = 64;
    let avg_degree = 4;
    let mut tracker = create_random_graph(num_nodes, avg_degree, state_dim);
    // Incremental update
    group.bench_function("incremental_single", |b| {
        let node_id = 5000u64;
        b.iter(|| {
            // State generation is part of the incremental measurement only.
            let new_state = generate_state(state_dim, rand::random());
            tracker.update_node(black_box(node_id), new_state);
            black_box(tracker.energy())
        })
    });
    // Full recomputation
    group.bench_function("full_recompute", |b| {
        b.iter(|| {
            tracker.full_recompute();
            black_box(tracker.energy())
        })
    });
    group.finish();
}
/// Benchmark node degree impact on update time
///
/// Incremental update cost scales with the number of incident edges, so a
/// hub node (degree 1000) is compared against a chain node (degree 2).
fn bench_node_degree_impact(c: &mut Criterion) {
    let mut group = c.benchmark_group("incremental_degree_impact");
    let num_nodes = 10_000;
    let state_dim = 64;
    // Create graph with hub node (high degree)
    let nodes: HashMap<u64, SheafNode> = (0..num_nodes as u64)
        .map(|id| {
            (
                id,
                SheafNode {
                    id,
                    state: generate_state(state_dim, id),
                },
            )
        })
        .collect();
    // Hub node 0 connects to many nodes
    let hub_degree = 1000;
    let mut edges: Vec<SheafEdge> = (1..=hub_degree)
        .map(|i| SheafEdge {
            id: i as u64,
            source: 0,
            target: i as u64,
            weight: 1.0,
            rho_source: RestrictionMap::identity(state_dim),
            rho_target: RestrictionMap::identity(state_dim),
        })
        .collect();
    // Regular edges for other nodes (degree ~4)
    // NOTE(review): this actually builds a simple chain, so these nodes have
    // degree 2 (endpoints degree 1), not ~4 as the comment above suggests.
    for i in hub_degree + 1..num_nodes - 1 {
        edges.push(SheafEdge {
            id: i as u64,
            source: i as u64,
            target: (i + 1) as u64,
            weight: 1.0,
            rho_source: RestrictionMap::identity(state_dim),
            rho_target: RestrictionMap::identity(state_dim),
        });
    }
    let mut tracker = IncrementalCoherence::new(nodes, edges, state_dim);
    // Update hub node (high degree)
    group.bench_function("update_hub_1000_edges", |b| {
        b.iter(|| {
            let new_state = generate_state(state_dim, rand::random());
            tracker.update_node(black_box(0), new_state);
            black_box(tracker.energy())
        })
    });
    // Update leaf node (degree 1-2)
    group.bench_function("update_leaf_2_edges", |b| {
        let leaf_id = (hub_degree + 100) as u64;
        b.iter(|| {
            let new_state = generate_state(state_dim, rand::random());
            tracker.update_node(black_box(leaf_id), new_state);
            black_box(tracker.energy())
        })
    });
    group.finish();
}
/// Benchmark batch updates
///
/// Measures amortization through `update_nodes_batch`: a batch recomputes
/// each affected edge once even when several updated nodes share it.
fn bench_batch_updates(c: &mut Criterion) {
    let mut group = c.benchmark_group("incremental_batch");
    let num_nodes = 10_000;
    let state_dim = 64;
    let avg_degree = 4;
    for batch_size in [1, 10, 100, 1000] {
        let mut tracker = create_random_graph(num_nodes, avg_degree, state_dim);
        group.throughput(Throughput::Elements(batch_size as u64));
        group.bench_with_input(
            BenchmarkId::new("batch_update", batch_size),
            &batch_size,
            |b, &size| {
                b.iter(|| {
                    // Building the update vector (allocation + hashing) is
                    // part of the timed region.
                    let updates: Vec<(u64, Vec<f32>)> = (0..size)
                        .map(|i| {
                            let node_id = (i * 10) as u64 % num_nodes as u64;
                            let state = generate_state(state_dim, rand::random());
                            (node_id, state)
                        })
                        .collect();
                    tracker.update_nodes_batch(black_box(updates));
                    black_box(tracker.energy())
                })
            },
        );
    }
    group.finish();
}
/// Benchmark state dimension impact
///
/// Per-edge work is O(state_dim^2) through the restriction maps' dense
/// matrix-vector products, so update time should grow quadratically in dim.
fn bench_state_dim_impact(c: &mut Criterion) {
    let mut group = c.benchmark_group("incremental_state_dim");
    let num_nodes = 10_000;
    let avg_degree = 4;
    for state_dim in [8, 32, 64, 128, 256] {
        let mut tracker = create_random_graph(num_nodes, avg_degree, state_dim);
        group.bench_with_input(
            BenchmarkId::new("update", state_dim),
            &state_dim,
            |b, &dim| {
                let node_id = 5000u64;
                b.iter(|| {
                    let new_state = generate_state(dim, rand::random());
                    tracker.update_node(black_box(node_id), new_state);
                    black_box(tracker.energy())
                })
            },
        );
    }
    group.finish();
}
/// Benchmark index lookup performance
///
/// Isolates the cost of the node -> incident-edge HashMap lookup used by
/// every incremental update, on a 100K-node graph.
fn bench_index_lookup(c: &mut Criterion) {
    let mut group = c.benchmark_group("incremental_index_lookup");
    let num_nodes = 100_000;
    let avg_degree = 4;
    let state_dim = 64;
    let tracker = create_random_graph(num_nodes, avg_degree, state_dim);
    // Lookup incident edges for a node
    group.bench_function("lookup_incident_edges", |b| {
        b.iter(|| {
            let node_id = black_box(50_000u64);
            black_box(tracker.node_to_edges.get(&node_id))
        })
    });
    // Iterate incident edges
    group.bench_function("iterate_incident_edges", |b| {
        let node_id = 50_000u64;
        b.iter(|| {
            // Lookup plus a gather-sum over the cached per-edge energies.
            let sum = if let Some(edges) = tracker.node_to_edges.get(&node_id) {
                edges.iter().map(|&idx| tracker.edge_energies[idx]).sum()
            } else {
                0.0f32
            };
            black_box(sum)
        })
    });
    group.finish();
}
/// Benchmark fingerprint operations
///
/// The fingerprint is a wrapping counter bumped on every mutation; this
/// measures the staleness check alone and an update combined with the check.
fn bench_fingerprint(c: &mut Criterion) {
    let mut group = c.benchmark_group("incremental_fingerprint");
    let num_nodes = 10_000;
    let avg_degree = 4;
    let state_dim = 64;
    let mut tracker = create_random_graph(num_nodes, avg_degree, state_dim);
    group.bench_function("check_staleness", |b| {
        let fp = tracker.fingerprint;
        b.iter(|| black_box(tracker.is_stale(black_box(fp))))
    });
    group.bench_function("update_with_fingerprint_check", |b| {
        let node_id = 5000u64;
        b.iter(|| {
            let old_fp = tracker.fingerprint;
            let new_state = generate_state(state_dim, rand::random());
            tracker.update_node(black_box(node_id), new_state);
            let is_changed = tracker.is_stale(old_fp);
            black_box((tracker.energy(), is_changed))
        })
    });
    group.finish();
}
/// Benchmark worst case: update all nodes sequentially
///
/// Each iteration performs a full sweep of 1000 single-node updates, so the
/// sample size is reduced to keep total runtime reasonable.
fn bench_sequential_all_updates(c: &mut Criterion) {
    let mut group = c.benchmark_group("incremental_sequential_all");
    group.sample_size(10);
    let num_nodes = 1000;
    let avg_degree = 4;
    let state_dim = 64;
    let mut tracker = create_random_graph(num_nodes, avg_degree, state_dim);
    group.bench_function("update_all_1000_sequential", |b| {
        b.iter(|| {
            for node_id in 0..num_nodes as u64 {
                let new_state = generate_state(state_dim, node_id);
                tracker.update_node(node_id, new_state);
            }
            black_box(tracker.energy())
        })
    });
    group.finish();
}
// Registers every benchmark function above with criterion's harness.
criterion_group!(
    benches,
    bench_single_node_update,
    bench_incremental_vs_full,
    bench_node_degree_impact,
    bench_batch_updates,
    bench_state_dim_impact,
    bench_index_lookup,
    bench_fingerprint,
    bench_sequential_all_updates,
);
// Expands to the `main` entry point that runs the `benches` group.
criterion_main!(benches);

View File

@@ -0,0 +1,630 @@
//! Benchmarks for dynamic mincut updates
//!
//! ADR-014 Performance Target: n^o(1) amortized time per update
//!
//! The mincut algorithm isolates incoherent subgraphs using
//! subpolynomial dynamic updates.
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
use std::collections::{HashMap, HashSet, VecDeque};
// ============================================================================
// Dynamic MinCut Types (Simulated for benchmarking)
// ============================================================================
/// Edge in dynamic graph (materialized form; storage is adjacency maps).
#[derive(Clone, Copy)]
pub struct Edge {
    pub source: u64,
    pub target: u64,
    pub weight: f64,
}
/// Dynamic graph with mincut tracking
///
/// Undirected weighted graph stored as symmetric adjacency maps, with a
/// cached connected-components result invalidated on structural change.
pub struct DynamicGraph {
    /// Adjacency lists
    adjacency: HashMap<u64, HashMap<u64, f64>>,
    /// Total edge count
    edge_count: usize,
    /// Vertex count
    vertex_count: usize,
    /// Cached connected components
    components: Option<Vec<HashSet<u64>>>,
    /// Modification counter for cache invalidation
    mod_count: u64,
}
impl DynamicGraph {
    /// Empty graph.
    pub fn new() -> Self {
        Self {
            adjacency: HashMap::new(),
            edge_count: 0,
            vertex_count: 0,
            components: None,
            mod_count: 0,
        }
    }
    /// Empty graph with preallocated vertex capacity (edge hint unused).
    pub fn with_capacity(vertices: usize, _edges: usize) -> Self {
        Self {
            adjacency: HashMap::with_capacity(vertices),
            edge_count: 0,
            vertex_count: 0,
            components: None,
            mod_count: 0,
        }
    }
    /// Insert an undirected edge; returns `false` if it already exists.
    ///
    /// Fix: the component cache was invalidated and `mod_count` bumped even
    /// when the insert was a duplicate no-op; both now change only on an
    /// actual structural modification.
    pub fn insert_edge(&mut self, source: u64, target: u64, weight: f64) -> bool {
        if self.has_edge(source, target) {
            return false;
        }
        self.components = None;
        self.mod_count += 1;
        self.adjacency.entry(source).or_default().insert(target, weight);
        self.adjacency.entry(target).or_default().insert(source, weight);
        self.edge_count += 1;
        self.vertex_count = self.adjacency.len();
        true
    }
    /// Delete an undirected edge; returns `false` if it was not present.
    ///
    /// Fix: caches/counters are now left untouched on a no-op delete.
    pub fn delete_edge(&mut self, source: u64, target: u64) -> bool {
        let removed = self
            .adjacency
            .get_mut(&source)
            .map_or(false, |adj| adj.remove(&target).is_some());
        if removed {
            // Remove the symmetric entry as well.
            if let Some(adj) = self.adjacency.get_mut(&target) {
                adj.remove(&source);
            }
            self.edge_count -= 1;
            self.components = None;
            self.mod_count += 1;
        }
        removed
    }
    /// Check if edge exists (symmetric storage, so one direction suffices).
    pub fn has_edge(&self, source: u64, target: u64) -> bool {
        self.adjacency
            .get(&source)
            .map(|adj| adj.contains_key(&target))
            .unwrap_or(false)
    }
    /// Get vertex degree (0 for unknown vertices).
    pub fn degree(&self, vertex: u64) -> usize {
        self.adjacency
            .get(&vertex)
            .map(|adj| adj.len())
            .unwrap_or(0)
    }
    /// Get neighbors (empty for unknown vertices; arbitrary order).
    pub fn neighbors(&self, vertex: u64) -> Vec<u64> {
        self.adjacency
            .get(&vertex)
            .map(|adj| adj.keys().copied().collect())
            .unwrap_or_default()
    }
    /// Compute (or return cached) connected components using BFS.
    pub fn connected_components(&mut self) -> &Vec<HashSet<u64>> {
        if self.components.is_none() {
            let mut visited = HashSet::new();
            let mut components = Vec::new();
            for &vertex in self.adjacency.keys() {
                if visited.contains(&vertex) {
                    continue;
                }
                let mut component = HashSet::new();
                let mut queue = VecDeque::new();
                queue.push_back(vertex);
                while let Some(v) = queue.pop_front() {
                    // `insert` returns false for already-visited vertices.
                    if visited.insert(v) {
                        component.insert(v);
                        if let Some(neighbors) = self.adjacency.get(&v) {
                            for &neighbor in neighbors.keys() {
                                if !visited.contains(&neighbor) {
                                    queue.push_back(neighbor);
                                }
                            }
                        }
                    }
                }
                components.push(component);
            }
            self.components = Some(components);
        }
        self.components.as_ref().unwrap()
    }
    /// Check if graph is connected (zero or one components).
    pub fn is_connected(&mut self) -> bool {
        self.connected_components().len() <= 1
    }
    /// Materialize the undirected edge list (each edge reported once).
    pub fn edges(&self) -> Vec<Edge> {
        let mut edges = Vec::with_capacity(self.edge_count);
        let mut seen = HashSet::new();
        for (&source, neighbors) in &self.adjacency {
            for (&target, &weight) in neighbors {
                // Canonical (min, max) key dedupes the symmetric storage.
                let key = if source < target {
                    (source, target)
                } else {
                    (target, source)
                };
                if seen.insert(key) {
                    edges.push(Edge {
                        source,
                        target,
                        weight,
                    });
                }
            }
        }
        edges
    }
    /// Summary statistics over the current graph.
    pub fn stats(&self) -> GraphStats {
        GraphStats {
            vertices: self.vertex_count,
            edges: self.edge_count,
            max_degree: self
                .adjacency
                .values()
                .map(|adj| adj.len())
                .max()
                .unwrap_or(0),
            avg_degree: if self.vertex_count > 0 {
                (self.edge_count * 2) as f64 / self.vertex_count as f64
            } else {
                0.0
            },
        }
    }
}
/// Summary counters returned by `DynamicGraph::stats`.
pub struct GraphStats {
    pub vertices: usize,
    pub edges: usize,
    pub max_degree: usize,
    pub avg_degree: f64,
}
/// Subpolynomial MinCut (simplified simulation)
/// Real implementation would use randomized contraction or tree packing
pub struct SubpolynomialMinCut {
    graph: DynamicGraph,
    /// Cached mincut value
    cached_mincut: Option<f64>,
    /// Update count since last computation
    updates_since_compute: usize,
    /// Threshold for recomputation
    recompute_threshold: usize,
}
impl SubpolynomialMinCut {
    /// Empty structure with a small fixed recompute threshold.
    pub fn new() -> Self {
        Self {
            graph: DynamicGraph::new(),
            cached_mincut: None,
            updates_since_compute: 0,
            recompute_threshold: 10,
        }
    }
    /// Preallocated structure; the recompute threshold scales as
    /// sqrt(vertices), floored at 10.
    pub fn with_capacity(vertices: usize, edges: usize) -> Self {
        Self {
            graph: DynamicGraph::with_capacity(vertices, edges),
            cached_mincut: None,
            updates_since_compute: 0,
            recompute_threshold: ((vertices as f64).sqrt() as usize).max(10),
        }
    }
    /// Insert an edge; the cached value is kept and refreshed lazily once
    /// enough updates have accumulated.
    pub fn insert_edge(&mut self, source: u64, target: u64, weight: f64) -> bool {
        let inserted = self.graph.insert_edge(source, target, weight);
        if inserted {
            self.updates_since_compute += 1;
        }
        inserted
    }
    /// Delete an edge; a successful deletion drops the cached value, since
    /// removing capacity may change the cut.
    pub fn delete_edge(&mut self, source: u64, target: u64) -> bool {
        let removed = self.graph.delete_edge(source, target);
        if removed {
            self.updates_since_compute += 1;
            self.cached_mincut = None;
        }
        removed
    }
    /// Lazy mincut: serve the cached value while it is fresh, otherwise
    /// recompute the approximation and reset the staleness counter.
    pub fn min_cut(&mut self) -> f64 {
        let fresh = self
            .cached_mincut
            .filter(|_| self.updates_since_compute < self.recompute_threshold);
        if let Some(value) = fresh {
            return value;
        }
        let value = self.compute_mincut_approximation();
        self.cached_mincut = Some(value);
        self.updates_since_compute = 0;
        value
    }
    /// Minimum weighted degree over all vertices; an empty graph yields 0.
    fn compute_mincut_approximation(&self) -> f64 {
        let min_weighted_degree = self
            .graph
            .adjacency
            .values()
            .map(|neighbors| neighbors.values().sum::<f64>())
            .fold(f64::MAX, f64::min);
        if min_weighted_degree == f64::MAX {
            0.0
        } else {
            min_weighted_degree
        }
    }
    /// Two-way partition: split a single component roughly in half, or
    /// separate the first connected component from all the others.
    pub fn partition(&mut self) -> (HashSet<u64>, HashSet<u64>) {
        let components = self.graph.connected_components();
        match components.len() {
            0 => (HashSet::new(), HashSet::new()),
            1 => {
                let vertices: Vec<u64> = components[0].iter().copied().collect();
                let (front, back) = vertices.split_at(vertices.len() / 2);
                (
                    front.iter().copied().collect(),
                    back.iter().copied().collect(),
                )
            }
            _ => {
                let left = components[0].clone();
                let right = components[1..]
                    .iter()
                    .flat_map(|component| component.iter())
                    .copied()
                    .collect();
                (left, right)
            }
        }
    }
}
// ============================================================================
// Test Data Generation
// ============================================================================
/// Deterministic random edge list: up to `m` unique undirected edges over
/// `n` vertices, endpoints derived by hashing (seed, attempt, tag).
fn generate_random_graph(n: usize, m: usize, seed: u64) -> Vec<(u64, u64, f64)> {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};
    let hash_endpoint = |i: usize, tag: &str| -> u64 {
        let mut hasher = DefaultHasher::new();
        (seed, i, tag).hash(&mut hasher);
        hasher.finish() % n as u64
    };
    let mut edges = Vec::with_capacity(m);
    let mut seen = HashSet::new();
    // At most 2m attempts; self-loops and duplicate pairs are skipped, so the
    // result may hold fewer than m edges.
    for attempt in 0..m * 2 {
        if edges.len() >= m {
            break;
        }
        let u = hash_endpoint(attempt, "source");
        let v = hash_endpoint(attempt, "target");
        if u == v {
            continue;
        }
        let key = if u < v { (u, v) } else { (v, u) };
        if seen.insert(key) {
            edges.push((u, v, 1.0));
        }
    }
    edges
}
// ============================================================================
// Benchmarks
// ============================================================================
/// Benchmark edge insertion
///
/// Pre-populates half the edge list, then times single inserts. Endpoints
/// are offset by `n` so timed inserts target fresh vertex ids.
fn bench_insert_edge(c: &mut Criterion) {
    let mut group = c.benchmark_group("mincut_insert");
    group.throughput(Throughput::Elements(1));
    for size in [100, 1000, 10000] {
        let edges = generate_random_graph(size, size * 2, 42);
        let mut mincut = SubpolynomialMinCut::with_capacity(size, size * 3);
        // Pre-populate
        for (u, v, w) in &edges[..edges.len() / 2] {
            mincut.insert_edge(*u, *v, *w);
        }
        group.bench_with_input(BenchmarkId::new("insert_single", size), &size, |b, &n| {
            let mut i = edges.len() / 2;
            b.iter(|| {
                // NOTE(review): the graph grows across iterations; once `i`
                // wraps past `edges.len()`, repeated (u+n, v+n) pairs become
                // duplicate no-op inserts.
                let (u, v, w) = edges[i % edges.len()];
                black_box(mincut.insert_edge(u + n as u64, v + n as u64, w));
                i += 1;
            })
        });
    }
    group.finish();
}
/// Benchmark edge deletion
///
/// `iter_batched` rebuilds a fully populated structure per batch (setup is
/// untimed); each timed call deletes one present edge from a fresh copy.
fn bench_delete_edge(c: &mut Criterion) {
    let mut group = c.benchmark_group("mincut_delete");
    group.throughput(Throughput::Elements(1));
    for size in [100, 1000, 10000] {
        let edges = generate_random_graph(size, size * 2, 42);
        group.bench_with_input(BenchmarkId::new("delete_single", size), &size, |b, _| {
            b.iter_batched(
                || {
                    let mut mincut = SubpolynomialMinCut::with_capacity(size, size * 3);
                    for (u, v, w) in &edges {
                        mincut.insert_edge(*u, *v, *w);
                    }
                    (mincut, edges.clone())
                },
                |(mut mincut, edges)| {
                    // Always the middle edge, so every sample removes an edge
                    // that is actually present.
                    let (u, v, _) = edges[edges.len() / 2];
                    black_box(mincut.delete_edge(u, v))
                },
                criterion::BatchSize::SmallInput,
            )
        });
    }
    group.finish();
}
/// Benchmark mincut query
///
/// Cold queries rebuild an uncached structure per sample (adjacency cloned
/// in the untimed setup); warm queries return the cached value.
fn bench_mincut_query(c: &mut Criterion) {
    let mut group = c.benchmark_group("mincut_query");
    group.throughput(Throughput::Elements(1));
    for size in [100, 1000, 10000] {
        let edges = generate_random_graph(size, size * 2, 42);
        let mut mincut = SubpolynomialMinCut::with_capacity(size, size * 3);
        for (u, v, w) in &edges {
            mincut.insert_edge(*u, *v, *w);
        }
        // Cold query (no cache)
        group.bench_with_input(BenchmarkId::new("cold_query", size), &size, |b, _| {
            b.iter_batched(
                || {
                    let mc = mincut.graph.adjacency.clone();
                    SubpolynomialMinCut {
                        graph: DynamicGraph {
                            adjacency: mc,
                            edge_count: mincut.graph.edge_count,
                            vertex_count: mincut.graph.vertex_count,
                            components: None,
                            mod_count: 0,
                        },
                        cached_mincut: None,
                        updates_since_compute: 0,
                        recompute_threshold: 10,
                    }
                },
                |mut mc| black_box(mc.min_cut()),
                criterion::BatchSize::SmallInput,
            )
        });
        // Warm query (cached)
        mincut.min_cut(); // Prime cache
        group.bench_with_input(BenchmarkId::new("warm_query", size), &size, |b, _| {
            b.iter(|| black_box(mincut.min_cut()))
        });
    }
    group.finish();
}
/// Benchmark scaling behavior (verify subpolynomial)
///
/// Sizes are spaced roughly half a decade apart (10^2 .. 10^4) so sub-linear
/// growth of the amortized insert+query cost is visible on a log scale.
fn bench_scaling(c: &mut Criterion) {
    let mut group = c.benchmark_group("mincut_scaling");
    group.sample_size(20);
    // Sizes chosen for subpolynomial verification
    // n^(2/3) scaling should show sub-linear growth
    let sizes = vec![100, 316, 1000, 3162, 10000];
    for size in sizes {
        let edges = generate_random_graph(size, size * 2, 42);
        // Measure insert amortized time
        group.throughput(Throughput::Elements(1));
        group.bench_with_input(
            BenchmarkId::new("insert_amortized", size),
            &size,
            |b, &n| {
                b.iter_batched(
                    || {
                        let mut mincut = SubpolynomialMinCut::with_capacity(n, n * 3);
                        for (u, v, w) in &edges[..edges.len() / 2] {
                            mincut.insert_edge(*u, *v, *w);
                        }
                        (mincut, n)
                    },
                    |(mut mincut, n)| {
                        // Ten fresh inserts plus one (possibly cached) mincut
                        // query per timed invocation.
                        for i in 0..10 {
                            let u = (i * 37) as u64 % n as u64;
                            let v = (i * 73 + 1) as u64 % n as u64;
                            if u != v {
                                mincut.insert_edge(u + n as u64, v + n as u64, 1.0);
                            }
                        }
                        black_box(mincut.min_cut())
                    },
                    criterion::BatchSize::SmallInput,
                )
            },
        );
    }
    group.finish();
}
/// Benchmark mixed workload
///
/// Intended mix: 50% inserts, 30% deletes, 20% mincut queries.
///
/// Fix: the operation counter used to be part of the per-iteration batched
/// input `(mincut, 0usize)`, so every timed invocation saw `op_idx == 0` and
/// only ever exercised the insert branch — the declared mix never ran. The
/// counter now lives outside `iter_batched` and is captured mutably by the
/// routine, so successive invocations cycle through all three operations.
fn bench_mixed_workload(c: &mut Criterion) {
    let mut group = c.benchmark_group("mincut_mixed");
    group.throughput(Throughput::Elements(1));
    for size in [100, 1000, 10000] {
        let edges = generate_random_graph(size, size * 2, 42);
        group.bench_with_input(BenchmarkId::new("mixed_ops", size), &size, |b, &n| {
            // Persists across timed iterations; selects the op for each call.
            let mut op_idx = 0usize;
            b.iter_batched(
                || {
                    // Fresh fully-populated structure per batch (untimed).
                    let mut mincut = SubpolynomialMinCut::with_capacity(n, n * 3);
                    for (u, v, w) in &edges {
                        mincut.insert_edge(*u, *v, *w);
                    }
                    mincut
                },
                |mut mincut| {
                    // 50% insert, 30% delete, 20% query
                    match op_idx % 10 {
                        0..=4 => {
                            let u = (op_idx * 37) as u64 % n as u64;
                            let v = (op_idx * 73 + 1) as u64 % n as u64;
                            if u != v {
                                mincut.insert_edge(u + n as u64, v + n as u64, 1.0);
                            }
                        }
                        5..=7 => {
                            if !edges.is_empty() {
                                let (u, v, _) = edges[op_idx % edges.len()];
                                mincut.delete_edge(u, v);
                            }
                        }
                        _ => {
                            let _ = mincut.min_cut();
                        }
                    }
                    op_idx += 1;
                    black_box(op_idx)
                },
                criterion::BatchSize::SmallInput,
            )
        });
    }
    group.finish();
}
/// Benchmark partition computation
///
/// `partition` goes through `connected_components`, which caches its result,
/// so iterations after the first mostly measure the split/copy work.
fn bench_partition(c: &mut Criterion) {
    let mut group = c.benchmark_group("mincut_partition");
    for size in [100, 1000, 10000] {
        let edges = generate_random_graph(size, size * 2, 42);
        let mut mincut = SubpolynomialMinCut::with_capacity(size, size * 3);
        for (u, v, w) in &edges {
            mincut.insert_edge(*u, *v, *w);
        }
        group.bench_with_input(BenchmarkId::new("partition", size), &size, |b, _| {
            b.iter(|| black_box(mincut.partition()))
        });
    }
    group.finish();
}
/// Benchmark connected components
///
/// Builds five disjoint chains, then times a full BFS sweep; the cache is
/// cleared inside the timed closure so every iteration recomputes.
fn bench_components(c: &mut Criterion) {
    let mut group = c.benchmark_group("mincut_components");
    for size in [100, 1000, 10000] {
        // Create graph with multiple components
        let mut mincut = SubpolynomialMinCut::with_capacity(size, size * 2);
        let component_size = size / 5;
        for comp in 0..5 {
            let offset = comp * component_size;
            for i in 0..component_size - 1 {
                let u = (offset + i) as u64;
                let v = (offset + i + 1) as u64;
                mincut.insert_edge(u, v, 1.0);
            }
        }
        group.bench_with_input(BenchmarkId::new("multi_component", size), &size, |b, _| {
            b.iter(|| {
                // Force recomputation
                mincut.graph.components = None;
                let components = mincut.graph.connected_components();
                black_box(components.len())
            })
        });
    }
    group.finish();
}
// Registers every benchmark function above with criterion's harness.
criterion_group!(
    benches,
    bench_insert_edge,
    bench_delete_edge,
    bench_mincut_query,
    bench_scaling,
    bench_mixed_workload,
    bench_partition,
    bench_components,
);
// Expands to the `main` entry point that runs the `benches` group.
criterion_main!(benches);

View File

@@ -0,0 +1,506 @@
//! Benchmarks for single residual calculation
//!
//! ADR-014 Performance Target: < 1us per residual calculation
//!
//! Residual is the core primitive: r_e = rho_u(x_u) - rho_v(x_v)
//! This measures the local constraint violation at each edge.
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
// ============================================================================
// Restriction Map Types (Simulated for benchmarking)
// ============================================================================
/// Linear restriction map: y = Ax + b
/// Maps node state to shared constraint space
#[derive(Clone)]
pub struct RestrictionMap {
    /// Linear transformation matrix (row-major, output_dim x input_dim)
    pub matrix: Vec<f32>,
    /// Bias vector
    pub bias: Vec<f32>,
    /// Input dimension
    pub input_dim: usize,
    /// Output dimension
    pub output_dim: usize,
}
impl RestrictionMap {
    /// Create identity restriction map (square, zero bias): y = x.
    pub fn identity(dim: usize) -> Self {
        let mut matrix = vec![0.0f32; dim * dim];
        for i in 0..dim {
            matrix[i * dim + i] = 1.0;
        }
        Self {
            matrix,
            bias: vec![0.0; dim],
            input_dim: dim,
            output_dim: dim,
        }
    }
    /// Create a deterministic pseudo-random map for testing; matrix and bias
    /// entries are hash-derived values in [-0.5, 0.5), reproducible per seed.
    pub fn random(input_dim: usize, output_dim: usize, seed: u64) -> Self {
        use std::collections::hash_map::DefaultHasher;
        use std::hash::{Hash, Hasher};
        let mut matrix = Vec::with_capacity(output_dim * input_dim);
        let mut bias = Vec::with_capacity(output_dim);
        for i in 0..(output_dim * input_dim) {
            let mut hasher = DefaultHasher::new();
            (seed, i).hash(&mut hasher);
            let val = (hasher.finish() % 1000) as f32 / 1000.0 - 0.5;
            matrix.push(val);
        }
        for i in 0..output_dim {
            let mut hasher = DefaultHasher::new();
            (seed, i, "bias").hash(&mut hasher);
            let val = (hasher.finish() % 1000) as f32 / 1000.0 - 0.5;
            bias.push(val);
        }
        Self {
            matrix,
            bias,
            input_dim,
            output_dim,
        }
    }
    /// Apply restriction map: y = Ax + b (allocating convenience wrapper).
    ///
    /// Fix: previously duplicated the matrix-vector loop of `apply_into`;
    /// it now delegates so the two code paths cannot drift apart.
    #[inline]
    pub fn apply(&self, input: &[f32]) -> Vec<f32> {
        let mut output = vec![0.0f32; self.output_dim];
        self.apply_into(input, &mut output);
        output
    }
    /// Apply restriction map into a caller-provided buffer (zero allocation).
    #[inline]
    pub fn apply_into(&self, input: &[f32], output: &mut [f32]) {
        debug_assert_eq!(input.len(), self.input_dim);
        debug_assert_eq!(output.len(), self.output_dim);
        // Start from the bias, then accumulate the matrix-vector product.
        output.copy_from_slice(&self.bias);
        for i in 0..self.output_dim {
            let row_start = i * self.input_dim;
            for j in 0..self.input_dim {
                output[i] += self.matrix[row_start + j] * input[j];
            }
        }
    }
}
/// Edge with restriction maps
///
/// Carries the two per-endpoint maps plus the weight; the four methods below
/// are the allocating vs. zero-allocation variants compared by the benches.
pub struct SheafEdge {
    pub source: u64,
    pub target: u64,
    /// Edge weight w_e used by the energy methods.
    pub weight: f32,
    /// Restriction map applied to the source state.
    pub rho_source: RestrictionMap,
    /// Restriction map applied to the target state.
    pub rho_target: RestrictionMap,
}
impl SheafEdge {
    /// Calculate the edge residual (local mismatch)
    /// r_e = rho_u(x_u) - rho_v(x_v)
    ///
    /// Allocates two projected vectors plus the result; `residual_into` is
    /// the allocation-free counterpart.
    #[inline]
    pub fn residual(&self, source_state: &[f32], target_state: &[f32]) -> Vec<f32> {
        let projected_source = self.rho_source.apply(source_state);
        let projected_target = self.rho_target.apply(target_state);
        projected_source
            .iter()
            .zip(projected_target.iter())
            .map(|(a, b)| a - b)
            .collect()
    }
    /// Calculate residual with pre-allocated buffers (zero allocation).
    ///
    /// Buffers must match the maps' output dimension; the subtraction loop is
    /// bounded by `residual.len()`, so shorter scratch buffers panic.
    #[inline]
    pub fn residual_into(
        &self,
        source_state: &[f32],
        target_state: &[f32],
        source_buf: &mut [f32],
        target_buf: &mut [f32],
        residual: &mut [f32],
    ) {
        self.rho_source.apply_into(source_state, source_buf);
        self.rho_target.apply_into(target_state, target_buf);
        for i in 0..residual.len() {
            residual[i] = source_buf[i] - target_buf[i];
        }
    }
    /// Calculate weighted residual norm squared: w_e * |r_e|^2
    /// (allocating baseline; materializes the residual vector first).
    #[inline]
    pub fn weighted_residual_energy(&self, source: &[f32], target: &[f32]) -> f32 {
        let r = self.residual(source, target);
        let norm_sq: f32 = r.iter().map(|x| x * x).sum();
        self.weight * norm_sq
    }
    /// Weighted residual energy with pre-allocated buffers
    /// (fused: no intermediate residual vector is materialized).
    #[inline]
    pub fn weighted_residual_energy_into(
        &self,
        source: &[f32],
        target: &[f32],
        source_buf: &mut [f32],
        target_buf: &mut [f32],
    ) -> f32 {
        self.rho_source.apply_into(source, source_buf);
        self.rho_target.apply_into(target, target_buf);
        let mut norm_sq = 0.0f32;
        for i in 0..source_buf.len() {
            let diff = source_buf[i] - target_buf[i];
            norm_sq += diff * diff;
        }
        self.weight * norm_sq
    }
}
// ============================================================================
// Benchmarks
// ============================================================================
/// Deterministic pseudo-random state vector with entries in [-0.5, 0.5),
/// reproducible for a given `seed`.
fn generate_state(dim: usize, seed: u64) -> Vec<f32> {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};
    let mut state = Vec::with_capacity(dim);
    for idx in 0..dim {
        let mut hasher = DefaultHasher::new();
        (seed, idx).hash(&mut hasher);
        let bucket = hasher.finish() % 1000;
        state.push(bucket as f32 / 1000.0 - 0.5);
    }
    state
}
/// Benchmark single residual calculation at various dimensions
///
/// ADR-014 target: < 1us per residual. Identity maps isolate the projection
/// + subtract cost; projection maps add a dense rectangular mat-vec.
fn bench_single_residual(c: &mut Criterion) {
    let mut group = c.benchmark_group("residual_single");
    group.throughput(Throughput::Elements(1));
    // Test dimensions relevant for coherence engine:
    // 8: Minimal state
    // 32: Compact embedding
    // 64: Standard embedding
    // 128: Rich state
    // 256: Large state
    for dim in [8, 32, 64, 128, 256] {
        let rho_source = RestrictionMap::identity(dim);
        let rho_target = RestrictionMap::identity(dim);
        let source_state = generate_state(dim, 42);
        let target_state = generate_state(dim, 123);
        let edge = SheafEdge {
            source: 0,
            target: 1,
            weight: 1.0,
            rho_source,
            rho_target,
        };
        group.bench_with_input(BenchmarkId::new("identity_map", dim), &dim, |b, _| {
            b.iter(|| edge.residual(black_box(&source_state), black_box(&target_state)))
        });
    }
    // Test with projection (non-identity maps)
    for (input_dim, output_dim) in [(64, 32), (128, 64), (256, 128)] {
        let rho_source = RestrictionMap::random(input_dim, output_dim, 42);
        let rho_target = RestrictionMap::random(input_dim, output_dim, 123);
        let source_state = generate_state(input_dim, 42);
        let target_state = generate_state(input_dim, 123);
        let edge = SheafEdge {
            source: 0,
            target: 1,
            weight: 1.0,
            rho_source,
            rho_target,
        };
        group.bench_with_input(
            BenchmarkId::new("projection_map", format!("{}to{}", input_dim, output_dim)),
            &(input_dim, output_dim),
            |b, _| b.iter(|| edge.residual(black_box(&source_state), black_box(&target_state))),
        );
    }
    group.finish();
}
/// Benchmark residual calculation with pre-allocated buffers (zero allocation)
///
/// Counterpart to `residual_single/identity_map`: same math, but the three
/// scratch buffers are allocated once and reused across iterations.
fn bench_residual_zero_alloc(c: &mut Criterion) {
    let mut group = c.benchmark_group("residual_zero_alloc");
    group.throughput(Throughput::Elements(1));
    for dim in [32, 64, 128, 256] {
        let rho_source = RestrictionMap::identity(dim);
        let rho_target = RestrictionMap::identity(dim);
        let source_state = generate_state(dim, 42);
        let target_state = generate_state(dim, 123);
        let edge = SheafEdge {
            source: 0,
            target: 1,
            weight: 1.0,
            rho_source,
            rho_target,
        };
        // Pre-allocate buffers
        let mut source_buf = vec![0.0f32; dim];
        let mut target_buf = vec![0.0f32; dim];
        let mut residual = vec![0.0f32; dim];
        group.bench_with_input(BenchmarkId::new("dim", dim), &dim, |b, _| {
            b.iter(|| {
                edge.residual_into(
                    black_box(&source_state),
                    black_box(&target_state),
                    black_box(&mut source_buf),
                    black_box(&mut target_buf),
                    black_box(&mut residual),
                )
            })
        });
    }
    group.finish();
}
/// Benchmark weighted residual energy computation
///
/// Measures `w_e * |r_e|^2` for one edge (the per-edge term of the global
/// energy E(S)), comparing the allocating and buffer-reusing variants.
fn bench_weighted_energy(c: &mut Criterion) {
    let mut group = c.benchmark_group("residual_weighted_energy");
    group.throughput(Throughput::Elements(1));
    for dim in [32, 64, 128, 256] {
        let rho_source = RestrictionMap::identity(dim);
        let rho_target = RestrictionMap::identity(dim);
        let source_state = generate_state(dim, 42);
        let target_state = generate_state(dim, 123);
        let edge = SheafEdge {
            source: 0,
            target: 1,
            // Non-unit weight so the weighting multiply is exercised.
            weight: 1.5,
            rho_source,
            rho_target,
        };
        group.bench_with_input(BenchmarkId::new("allocating", dim), &dim, |b, _| {
            b.iter(|| {
                edge.weighted_residual_energy(black_box(&source_state), black_box(&target_state))
            })
        });
        // Pre-allocate buffers for zero-alloc version
        let mut source_buf = vec![0.0f32; dim];
        let mut target_buf = vec![0.0f32; dim];
        group.bench_with_input(BenchmarkId::new("zero_alloc", dim), &dim, |b, _| {
            b.iter(|| {
                edge.weighted_residual_energy_into(
                    black_box(&source_state),
                    black_box(&target_state),
                    black_box(&mut source_buf),
                    black_box(&mut target_buf),
                )
            })
        });
    }
    group.finish();
}
/// Benchmark batch residual computation (for parallel evaluation)
///
/// Sums weighted residual energy over a chain of edges (node i -> i+1),
/// providing a sequential baseline for future parallel implementations.
fn bench_batch_residual(c: &mut Criterion) {
    let mut group = c.benchmark_group("residual_batch");
    for batch_size in [10, 100, 1000] {
        let dim = 64;
        // Create batch of edges forming a simple chain.
        let edges: Vec<SheafEdge> = (0..batch_size)
            .map(|i| SheafEdge {
                source: i as u64,
                target: (i + 1) as u64,
                weight: 1.0,
                rho_source: RestrictionMap::identity(dim),
                rho_target: RestrictionMap::identity(dim),
            })
            .collect();
        // batch_size + 1 node states: edge i connects states[i] and states[i+1].
        let states: Vec<Vec<f32>> = (0..batch_size + 1)
            .map(|i| generate_state(dim, i as u64))
            .collect();
        group.throughput(Throughput::Elements(batch_size as u64));
        // Sequential computation
        group.bench_with_input(
            BenchmarkId::new("sequential", batch_size),
            &batch_size,
            |b, _| {
                b.iter(|| {
                    let mut total_energy = 0.0f32;
                    for (i, edge) in edges.iter().enumerate() {
                        total_energy += edge.weighted_residual_energy(
                            black_box(&states[i]),
                            black_box(&states[i + 1]),
                        );
                    }
                    black_box(total_energy)
                })
            },
        );
    }
    group.finish();
}
/// Benchmark restriction map application alone
///
/// Isolates the `rho.apply` cost (identity copy vs dense matrix-vector
/// multiply for projections), in both allocating and `_into` forms.
fn bench_restriction_map(c: &mut Criterion) {
    let mut group = c.benchmark_group("restriction_map");
    group.throughput(Throughput::Elements(1));
    // Identity maps
    for dim in [32, 64, 128, 256] {
        let rho = RestrictionMap::identity(dim);
        let input = generate_state(dim, 42);
        // Scratch buffer used only by the `apply_into` variant.
        let mut output = vec![0.0f32; dim];
        group.bench_with_input(BenchmarkId::new("identity_apply", dim), &dim, |b, _| {
            b.iter(|| rho.apply(black_box(&input)))
        });
        group.bench_with_input(
            BenchmarkId::new("identity_apply_into", dim),
            &dim,
            |b, _| b.iter(|| rho.apply_into(black_box(&input), black_box(&mut output))),
        );
    }
    // Projection maps (dense matrix multiply)
    for (input_dim, output_dim) in [(64, 32), (128, 64), (256, 128), (512, 256)] {
        let rho = RestrictionMap::random(input_dim, output_dim, 42);
        let input = generate_state(input_dim, 42);
        let mut output = vec![0.0f32; output_dim];
        group.bench_with_input(
            BenchmarkId::new("projection_apply", format!("{}x{}", input_dim, output_dim)),
            &(input_dim, output_dim),
            |b, _| b.iter(|| rho.apply(black_box(&input))),
        );
        group.bench_with_input(
            BenchmarkId::new(
                "projection_apply_into",
                format!("{}x{}", input_dim, output_dim),
            ),
            &(input_dim, output_dim),
            |b, _| b.iter(|| rho.apply_into(black_box(&input), black_box(&mut output))),
        );
    }
    group.finish();
}
/// Benchmark SIMD-optimized residual patterns
///
/// Compares three source-level formulations of |a - b|^2 — indexed scalar
/// loop, iterator chain, and 8-wide per-lane accumulators — to see which
/// the compiler auto-vectorizes best at each dimension.
fn bench_simd_patterns(c: &mut Criterion) {
    let mut group = c.benchmark_group("residual_simd_patterns");
    group.throughput(Throughput::Elements(1));
    // Aligned dimensions for SIMD (multiples of 8 for AVX2, 16 for AVX-512)
    for dim in [32, 64, 128, 256, 512] {
        let a = generate_state(dim, 42);
        let b = generate_state(dim, 123);
        // Scalar subtraction and norm
        group.bench_with_input(
            BenchmarkId::new("scalar_diff_norm", dim),
            &dim,
            |b_iter, _| {
                b_iter.iter(|| {
                    let mut norm_sq = 0.0f32;
                    for i in 0..dim {
                        let diff = a[i] - b[i];
                        norm_sq += diff * diff;
                    }
                    black_box(norm_sq)
                })
            },
        );
        // Iterator-based (auto-vectorization friendly)
        group.bench_with_input(
            BenchmarkId::new("iter_diff_norm", dim),
            &dim,
            |b_iter, _| {
                b_iter.iter(|| {
                    let norm_sq: f32 = a
                        .iter()
                        .zip(b.iter())
                        .map(|(x, y)| {
                            let d = x - y;
                            d * d
                        })
                        .sum();
                    black_box(norm_sq)
                })
            },
        );
        // Chunked for explicit SIMD opportunity
        group.bench_with_input(
            BenchmarkId::new("chunked_diff_norm", dim),
            &dim,
            |b_iter, _| {
                b_iter.iter(|| {
                    // Per-lane accumulators; every dim above is a multiple of
                    // 8, so each chunk is full-width.
                    let mut accum = [0.0f32; 8];
                    for (chunk_a, chunk_b) in a.chunks(8).zip(b.chunks(8)) {
                        for i in 0..chunk_a.len() {
                            let d = chunk_a[i] - chunk_b[i];
                            accum[i] += d * d;
                        }
                    }
                    black_box(accum.iter().sum::<f32>())
                })
            },
        );
    }
    group.finish();
}
// Register all residual/energy benchmark functions under one harness entry.
criterion_group!(
    benches,
    bench_single_residual,
    bench_residual_zero_alloc,
    bench_weighted_energy,
    bench_batch_residual,
    bench_restriction_map,
    bench_simd_patterns,
);
criterion_main!(benches);

View File

@@ -0,0 +1,800 @@
//! SIMD-Specific Benchmarks for Prime-Radiant Coherence Engine
//!
//! This benchmark suite compares naive/scalar implementations against
//! SIMD-optimized versions for core coherence operations.
//!
//! ## Benchmark Categories
//! 1. Dense Matrix Multiply - naive vs SIMD
//! 2. Vector Norm Computation - naive vs SIMD
//! 3. Batch Residual Computation - naive vs SIMD
//! 4. Dot Products and Reductions
//!
//! ## Architecture Notes
//! - x86_64: AVX2 (256-bit, f32x8) or AVX-512 (512-bit, f32x16)
//! - aarch64: NEON (128-bit, f32x4)
//! - WASM: SIMD128 (128-bit)
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
// ============================================================================
// TEST DATA GENERATION
// ============================================================================
/// Deterministic pseudo-random test vector with entries in [-0.5, 0.5).
///
/// Each element is derived by hashing `(seed, index)`, so the same
/// `(len, seed)` pair always yields the same vector — reproducible
/// benchmark inputs without an RNG dependency.
fn generate_vec(len: usize, seed: u64) -> Vec<f32> {
    let mut values = Vec::with_capacity(len);
    for idx in 0..len {
        let mut hasher = DefaultHasher::new();
        (seed, idx).hash(&mut hasher);
        let bucket = hasher.finish() % 1000;
        values.push(bucket as f32 / 1000.0 - 0.5);
    }
    values
}
/// Deterministic pseudo-random row-major matrix with entries in [-0.5, 0.5).
///
/// Identical hashing scheme to `generate_vec`, flattened over
/// `rows * cols` indices.
fn generate_matrix(rows: usize, cols: usize, seed: u64) -> Vec<f32> {
    let total = rows * cols;
    let mut data = Vec::with_capacity(total);
    for flat_idx in 0..total {
        let mut hasher = DefaultHasher::new();
        (seed, flat_idx).hash(&mut hasher);
        data.push((hasher.finish() % 1000) as f32 / 1000.0 - 0.5);
    }
    data
}
// ============================================================================
// NAIVE IMPLEMENTATIONS (BASELINE)
// ============================================================================
/// Naive matrix-vector multiply: y = Ax
///
/// `matrix` is row-major with `rows * cols` entries. The functions in this
/// section are deliberately written as indexed scalar loops: they are the
/// unoptimized baselines the SIMD variants are compared against, so do not
/// "modernize" the loop shapes — the generated code is what is measured.
/// `#[inline(never)]` keeps each call from being folded into the bench loop.
#[inline(never)]
fn matmul_naive(matrix: &[f32], x: &[f32], y: &mut [f32], rows: usize, cols: usize) {
    for i in 0..rows {
        let mut sum = 0.0f32;
        let row_start = i * cols;
        for j in 0..cols {
            sum += matrix[row_start + j] * x[j];
        }
        y[i] = sum;
    }
}
/// Naive squared norm: |v|^2
#[inline(never)]
fn norm_sq_naive(v: &[f32]) -> f32 {
    let mut sum = 0.0f32;
    for &x in v {
        sum += x * x;
    }
    sum
}
/// Naive dot product: a . b
///
/// Indexes by `a`'s length; panics if `b` is shorter than `a`.
#[inline(never)]
fn dot_naive(a: &[f32], b: &[f32]) -> f32 {
    let mut sum = 0.0f32;
    for i in 0..a.len() {
        sum += a[i] * b[i];
    }
    sum
}
/// Naive residual norm: |a - b|^2
#[inline(never)]
fn residual_norm_naive(a: &[f32], b: &[f32]) -> f32 {
    let mut sum = 0.0f32;
    for i in 0..a.len() {
        let diff = a[i] - b[i];
        sum += diff * diff;
    }
    sum
}
/// Naive batch residual computation
///
/// Sums |src - tgt|^2 over paired vectors; `zip` truncates to the shorter
/// of the two batches.
#[inline(never)]
fn batch_residual_naive(sources: &[Vec<f32>], targets: &[Vec<f32>]) -> f32 {
    let mut total = 0.0f32;
    for (src, tgt) in sources.iter().zip(targets.iter()) {
        total += residual_norm_naive(src, tgt);
    }
    total
}
// ============================================================================
// SIMD-FRIENDLY IMPLEMENTATIONS
// ============================================================================
/// Unrolled matrix-vector multiply (auto-vectorization friendly)
///
/// The functions in this section compute the same quantities as the naive
/// versions but restructure the loops — independent accumulators and
/// `chunks_exact` — so LLVM can vectorize them. The exact loop shapes are
/// the subject of measurement; floating-point results may differ from the
/// naive versions in the last bits because the summation order differs.
#[inline(never)]
fn matmul_unrolled(matrix: &[f32], x: &[f32], y: &mut [f32], rows: usize, cols: usize) {
    for i in 0..rows {
        let row_start = i * cols;
        // Process in chunks of 8 with one accumulator per lane to break the
        // serial dependency chain of a single `sum +=`.
        let chunks = cols / 8;
        let mut acc0 = 0.0f32;
        let mut acc1 = 0.0f32;
        let mut acc2 = 0.0f32;
        let mut acc3 = 0.0f32;
        let mut acc4 = 0.0f32;
        let mut acc5 = 0.0f32;
        let mut acc6 = 0.0f32;
        let mut acc7 = 0.0f32;
        for c in 0..chunks {
            let base = row_start + c * 8;
            acc0 += matrix[base] * x[c * 8];
            acc1 += matrix[base + 1] * x[c * 8 + 1];
            acc2 += matrix[base + 2] * x[c * 8 + 2];
            acc3 += matrix[base + 3] * x[c * 8 + 3];
            acc4 += matrix[base + 4] * x[c * 8 + 4];
            acc5 += matrix[base + 5] * x[c * 8 + 5];
            acc6 += matrix[base + 6] * x[c * 8 + 6];
            acc7 += matrix[base + 7] * x[c * 8 + 7];
        }
        let mut sum = acc0 + acc1 + acc2 + acc3 + acc4 + acc5 + acc6 + acc7;
        // Handle remainder
        for j in (chunks * 8)..cols {
            sum += matrix[row_start + j] * x[j];
        }
        y[i] = sum;
    }
}
/// Unrolled squared norm with 4 accumulators
#[inline(never)]
fn norm_sq_unrolled(v: &[f32]) -> f32 {
    let chunks = v.chunks_exact(4);
    let remainder = chunks.remainder();
    let mut acc0 = 0.0f32;
    let mut acc1 = 0.0f32;
    let mut acc2 = 0.0f32;
    let mut acc3 = 0.0f32;
    for chunk in chunks {
        acc0 += chunk[0] * chunk[0];
        acc1 += chunk[1] * chunk[1];
        acc2 += chunk[2] * chunk[2];
        acc3 += chunk[3] * chunk[3];
    }
    let mut sum = acc0 + acc1 + acc2 + acc3;
    for &x in remainder {
        sum += x * x;
    }
    sum
}
/// Unrolled squared norm with 8 accumulators (better for wider SIMD)
#[inline(never)]
fn norm_sq_unrolled_8(v: &[f32]) -> f32 {
    let chunks = v.chunks_exact(8);
    let remainder = chunks.remainder();
    let mut acc = [0.0f32; 8];
    for chunk in chunks {
        acc[0] += chunk[0] * chunk[0];
        acc[1] += chunk[1] * chunk[1];
        acc[2] += chunk[2] * chunk[2];
        acc[3] += chunk[3] * chunk[3];
        acc[4] += chunk[4] * chunk[4];
        acc[5] += chunk[5] * chunk[5];
        acc[6] += chunk[6] * chunk[6];
        acc[7] += chunk[7] * chunk[7];
    }
    let mut sum: f32 = acc.iter().sum();
    for &x in remainder {
        sum += x * x;
    }
    sum
}
/// Iterator-based squared norm (relies on auto-vectorization)
#[inline(never)]
fn norm_sq_iter(v: &[f32]) -> f32 {
    v.iter().map(|x| x * x).sum()
}
/// Unrolled dot product
///
/// `zip` pairs chunks, so the result covers `min(a.len(), b.len())`
/// elements (unlike `dot_naive`, which panics on a shorter `b`).
#[inline(never)]
fn dot_unrolled(a: &[f32], b: &[f32]) -> f32 {
    let chunks_a = a.chunks_exact(4);
    let chunks_b = b.chunks_exact(4);
    let rem_a = chunks_a.remainder();
    let rem_b = chunks_b.remainder();
    let mut acc0 = 0.0f32;
    let mut acc1 = 0.0f32;
    let mut acc2 = 0.0f32;
    let mut acc3 = 0.0f32;
    for (ca, cb) in chunks_a.zip(chunks_b) {
        acc0 += ca[0] * cb[0];
        acc1 += ca[1] * cb[1];
        acc2 += ca[2] * cb[2];
        acc3 += ca[3] * cb[3];
    }
    let mut sum = acc0 + acc1 + acc2 + acc3;
    for (&a, &b) in rem_a.iter().zip(rem_b.iter()) {
        sum += a * b;
    }
    sum
}
/// Unrolled residual norm
#[inline(never)]
fn residual_norm_unrolled(a: &[f32], b: &[f32]) -> f32 {
    let chunks_a = a.chunks_exact(4);
    let chunks_b = b.chunks_exact(4);
    let rem_a = chunks_a.remainder();
    let rem_b = chunks_b.remainder();
    let mut acc0 = 0.0f32;
    let mut acc1 = 0.0f32;
    let mut acc2 = 0.0f32;
    let mut acc3 = 0.0f32;
    for (ca, cb) in chunks_a.zip(chunks_b) {
        let d0 = ca[0] - cb[0];
        let d1 = ca[1] - cb[1];
        let d2 = ca[2] - cb[2];
        let d3 = ca[3] - cb[3];
        acc0 += d0 * d0;
        acc1 += d1 * d1;
        acc2 += d2 * d2;
        acc3 += d3 * d3;
    }
    let mut sum = acc0 + acc1 + acc2 + acc3;
    for (&a, &b) in rem_a.iter().zip(rem_b.iter()) {
        let d = a - b;
        sum += d * d;
    }
    sum
}
/// Batch residual with unrolled inner loop
#[inline(never)]
fn batch_residual_unrolled(sources: &[Vec<f32>], targets: &[Vec<f32>]) -> f32 {
    let mut total = 0.0f32;
    for (src, tgt) in sources.iter().zip(targets.iter()) {
        total += residual_norm_unrolled(src, tgt);
    }
    total
}
// ============================================================================
// EXPLICIT SIMD (when wide crate is available)
// ============================================================================
#[cfg(feature = "simd")]
mod simd_impl {
    // Explicit 8-lane SIMD via the `wide` crate; only compiled when the
    // `simd` feature is enabled. In every function the `chunks_exact(8)`
    // iterator guarantees each chunk has exactly 8 elements, so the
    // `try_from` conversions cannot fail and the `unwrap` is safe.
    use wide::f32x8;
    /// SIMD squared norm using f32x8
    #[inline(never)]
    pub fn norm_sq_simd(v: &[f32]) -> f32 {
        let chunks = v.chunks_exact(8);
        let remainder = chunks.remainder();
        let mut acc = f32x8::ZERO;
        for chunk in chunks {
            let vals = f32x8::from(<[f32; 8]>::try_from(chunk).unwrap());
            acc += vals * vals;
        }
        // Horizontal sum of the 8 lanes, then scalar tail.
        let mut sum: f32 = acc.reduce_add();
        for &x in remainder {
            sum += x * x;
        }
        sum
    }
    /// SIMD dot product using f32x8
    #[inline(never)]
    pub fn dot_simd(a: &[f32], b: &[f32]) -> f32 {
        let chunks_a = a.chunks_exact(8);
        let chunks_b = b.chunks_exact(8);
        let rem_a = chunks_a.remainder();
        let rem_b = chunks_b.remainder();
        let mut acc = f32x8::ZERO;
        for (ca, cb) in chunks_a.zip(chunks_b) {
            let va = f32x8::from(<[f32; 8]>::try_from(ca).unwrap());
            let vb = f32x8::from(<[f32; 8]>::try_from(cb).unwrap());
            acc += va * vb;
        }
        let mut sum: f32 = acc.reduce_add();
        for (&a, &b) in rem_a.iter().zip(rem_b.iter()) {
            sum += a * b;
        }
        sum
    }
    /// SIMD residual norm using f32x8
    #[inline(never)]
    pub fn residual_norm_simd(a: &[f32], b: &[f32]) -> f32 {
        let chunks_a = a.chunks_exact(8);
        let chunks_b = b.chunks_exact(8);
        let rem_a = chunks_a.remainder();
        let rem_b = chunks_b.remainder();
        let mut acc = f32x8::ZERO;
        for (ca, cb) in chunks_a.zip(chunks_b) {
            let va = f32x8::from(<[f32; 8]>::try_from(ca).unwrap());
            let vb = f32x8::from(<[f32; 8]>::try_from(cb).unwrap());
            let diff = va - vb;
            acc += diff * diff;
        }
        let mut sum: f32 = acc.reduce_add();
        for (&a, &b) in rem_a.iter().zip(rem_b.iter()) {
            let d = a - b;
            sum += d * d;
        }
        sum
    }
    /// SIMD matrix-vector multiply
    ///
    /// `matrix` is row-major; each output element is one SIMD dot product
    /// of a matrix row with `x`.
    #[inline(never)]
    pub fn matmul_simd(matrix: &[f32], x: &[f32], y: &mut [f32], rows: usize, cols: usize) {
        for i in 0..rows {
            let row_start = i * cols;
            let row = &matrix[row_start..row_start + cols];
            let chunks_m = row.chunks_exact(8);
            let chunks_x = x.chunks_exact(8);
            let rem_m = chunks_m.remainder();
            let rem_x = chunks_x.remainder();
            let mut acc = f32x8::ZERO;
            for (cm, cx) in chunks_m.zip(chunks_x) {
                let vm = f32x8::from(<[f32; 8]>::try_from(cm).unwrap());
                let vx = f32x8::from(<[f32; 8]>::try_from(cx).unwrap());
                acc += vm * vx;
            }
            let mut sum: f32 = acc.reduce_add();
            for (&m, &xv) in rem_m.iter().zip(rem_x.iter()) {
                sum += m * xv;
            }
            y[i] = sum;
        }
    }
    /// SIMD batch residual
    #[inline(never)]
    pub fn batch_residual_simd(sources: &[Vec<f32>], targets: &[Vec<f32>]) -> f32 {
        let mut total = 0.0f32;
        for (src, tgt) in sources.iter().zip(targets.iter()) {
            total += residual_norm_simd(src, tgt);
        }
        total
    }
}
// ============================================================================
// DENSE MATRIX MULTIPLY BENCHMARKS
// ============================================================================
/// Benchmark square matrix-vector multiply: naive vs unrolled vs SIMD.
fn bench_dense_matmul(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_matmul");
    // Test matrix sizes: 64x64, 128x128, 256x256
    for size in [64, 128, 256] {
        let matrix = generate_matrix(size, size, 42);
        let x = generate_vec(size, 123);
        let mut y = vec![0.0f32; size];
        // Throughput counts multiply-accumulate elements (rows * cols).
        group.throughput(Throughput::Elements((size * size) as u64));
        group.bench_with_input(BenchmarkId::new("naive", size), &size, |b, _| {
            b.iter(|| {
                matmul_naive(black_box(&matrix), black_box(&x), &mut y, size, size);
                // Observing one output element keeps the whole multiply live.
                black_box(y[0])
            })
        });
        group.bench_with_input(BenchmarkId::new("unrolled", size), &size, |b, _| {
            b.iter(|| {
                matmul_unrolled(black_box(&matrix), black_box(&x), &mut y, size, size);
                black_box(y[0])
            })
        });
        #[cfg(feature = "simd")]
        group.bench_with_input(BenchmarkId::new("simd", size), &size, |b, _| {
            b.iter(|| {
                simd_impl::matmul_simd(black_box(&matrix), black_box(&x), &mut y, size, size);
                black_box(y[0])
            })
        });
    }
    group.finish();
}
/// Benchmark non-square matrix multiply (projection)
///
/// Mirrors `bench_dense_matmul` for the rectangular (dimension-reducing)
/// shapes that restriction maps use.
fn bench_projection_matmul(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_matmul_projection");
    // Common projection sizes in coherence: 64->32, 128->64, 256->128
    for (in_dim, out_dim) in [(64, 32), (128, 64), (256, 128)] {
        let matrix = generate_matrix(out_dim, in_dim, 42);
        let x = generate_vec(in_dim, 123);
        let mut y = vec![0.0f32; out_dim];
        group.throughput(Throughput::Elements((out_dim * in_dim) as u64));
        group.bench_with_input(
            BenchmarkId::new("naive", format!("{}x{}", in_dim, out_dim)),
            &(in_dim, out_dim),
            |b, _| {
                b.iter(|| {
                    matmul_naive(black_box(&matrix), black_box(&x), &mut y, out_dim, in_dim);
                    black_box(y[0])
                })
            },
        );
        group.bench_with_input(
            BenchmarkId::new("unrolled", format!("{}x{}", in_dim, out_dim)),
            &(in_dim, out_dim),
            |b, _| {
                b.iter(|| {
                    matmul_unrolled(black_box(&matrix), black_box(&x), &mut y, out_dim, in_dim);
                    black_box(y[0])
                })
            },
        );
        #[cfg(feature = "simd")]
        group.bench_with_input(
            BenchmarkId::new("simd", format!("{}x{}", in_dim, out_dim)),
            &(in_dim, out_dim),
            |b, _| {
                b.iter(|| {
                    simd_impl::matmul_simd(
                        black_box(&matrix),
                        black_box(&x),
                        &mut y,
                        out_dim,
                        in_dim,
                    );
                    black_box(y[0])
                })
            },
        );
    }
    group.finish();
}
// ============================================================================
// NORM COMPUTATION BENCHMARKS
// ============================================================================
/// Benchmark |v|^2 across all five formulations (naive, iterator,
/// 4-wide unrolled, 8-wide unrolled, explicit f32x8 SIMD).
fn bench_norm_computation(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_norm");
    // Test dimensions aligned for SIMD
    for dim in [64, 128, 256, 512, 1024] {
        let v = generate_vec(dim, 42);
        group.throughput(Throughput::Elements(dim as u64));
        group.bench_with_input(BenchmarkId::new("naive", dim), &dim, |b, _| {
            b.iter(|| black_box(norm_sq_naive(black_box(&v))))
        });
        group.bench_with_input(BenchmarkId::new("iter", dim), &dim, |b, _| {
            b.iter(|| black_box(norm_sq_iter(black_box(&v))))
        });
        group.bench_with_input(BenchmarkId::new("unrolled_4", dim), &dim, |b, _| {
            b.iter(|| black_box(norm_sq_unrolled(black_box(&v))))
        });
        group.bench_with_input(BenchmarkId::new("unrolled_8", dim), &dim, |b, _| {
            b.iter(|| black_box(norm_sq_unrolled_8(black_box(&v))))
        });
        #[cfg(feature = "simd")]
        group.bench_with_input(BenchmarkId::new("simd_f32x8", dim), &dim, |b, _| {
            b.iter(|| black_box(simd_impl::norm_sq_simd(black_box(&v))))
        });
    }
    group.finish();
}
// ============================================================================
// DOT PRODUCT BENCHMARKS
// ============================================================================
/// Benchmark dot product: naive vs unrolled vs explicit SIMD.
fn bench_dot_product(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_dot");
    for dim in [64, 256, 1024] {
        let a = generate_vec(dim, 42);
        let b = generate_vec(dim, 123);
        group.throughput(Throughput::Elements(dim as u64));
        // Closure parameter is named `b_iter` to avoid shadowing vector `b`.
        group.bench_with_input(BenchmarkId::new("naive", dim), &dim, |b_iter, _| {
            b_iter.iter(|| black_box(dot_naive(black_box(&a), black_box(&b))))
        });
        group.bench_with_input(BenchmarkId::new("unrolled", dim), &dim, |b_iter, _| {
            b_iter.iter(|| black_box(dot_unrolled(black_box(&a), black_box(&b))))
        });
        #[cfg(feature = "simd")]
        group.bench_with_input(BenchmarkId::new("simd", dim), &dim, |b_iter, _| {
            b_iter.iter(|| black_box(simd_impl::dot_simd(black_box(&a), black_box(&b))))
        });
    }
    group.finish();
}
// ============================================================================
// RESIDUAL NORM BENCHMARKS (CORE COHERENCE OPERATION)
// ============================================================================
/// Benchmark |a - b|^2, the core per-edge coherence operation.
fn bench_residual_norm(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_residual_norm");
    for dim in [64, 256, 1024] {
        let a = generate_vec(dim, 42);
        let b = generate_vec(dim, 123);
        group.throughput(Throughput::Elements(dim as u64));
        group.bench_with_input(BenchmarkId::new("naive", dim), &dim, |b_iter, _| {
            b_iter.iter(|| black_box(residual_norm_naive(black_box(&a), black_box(&b))))
        });
        group.bench_with_input(BenchmarkId::new("unrolled", dim), &dim, |b_iter, _| {
            b_iter.iter(|| black_box(residual_norm_unrolled(black_box(&a), black_box(&b))))
        });
        #[cfg(feature = "simd")]
        group.bench_with_input(BenchmarkId::new("simd", dim), &dim, |b_iter, _| {
            b_iter.iter(|| black_box(simd_impl::residual_norm_simd(black_box(&a), black_box(&b))))
        });
    }
    group.finish();
}
// ============================================================================
// BATCH RESIDUAL BENCHMARKS
// ============================================================================
/// Benchmark summed residual energy over whole batches of vector pairs,
/// the shape of a full-graph energy pass (ADR-014: < 10ms for 10K nodes).
fn bench_batch_residual(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_batch_residual");
    let dim = 64;
    for batch_size in [100, 1000, 10000] {
        // Seed offset keeps source/target pairs distinct.
        let sources: Vec<Vec<f32>> = (0..batch_size)
            .map(|i| generate_vec(dim, i as u64))
            .collect();
        let targets: Vec<Vec<f32>> = (0..batch_size)
            .map(|i| generate_vec(dim, i as u64 + 10000))
            .collect();
        group.throughput(Throughput::Elements(batch_size as u64));
        group.bench_with_input(
            BenchmarkId::new("naive", batch_size),
            &batch_size,
            |b, _| {
                b.iter(|| {
                    black_box(batch_residual_naive(
                        black_box(&sources),
                        black_box(&targets),
                    ))
                })
            },
        );
        group.bench_with_input(
            BenchmarkId::new("unrolled", batch_size),
            &batch_size,
            |b, _| {
                b.iter(|| {
                    black_box(batch_residual_unrolled(
                        black_box(&sources),
                        black_box(&targets),
                    ))
                })
            },
        );
        #[cfg(feature = "simd")]
        group.bench_with_input(BenchmarkId::new("simd", batch_size), &batch_size, |b, _| {
            b.iter(|| {
                black_box(simd_impl::batch_residual_simd(
                    black_box(&sources),
                    black_box(&targets),
                ))
            })
        });
    }
    group.finish();
}
// ============================================================================
// MEMORY ALIGNMENT BENCHMARKS
// ============================================================================
/// Benchmark how vector length (SIMD-friendly vs remainder-heavy vs tiny)
/// affects the 8-wide unrolled norm.
fn bench_alignment_impact(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_alignment");
    let dim = 256;
    // Aligned (multiple of 8): no scalar remainder loop runs.
    {
        let v = generate_vec(dim, 42);
        group.bench_function("aligned_256", |b| {
            b.iter(|| black_box(norm_sq_unrolled_8(black_box(&v))))
        });
    }
    // Misaligned (not multiple of 8): 3 elements go through the remainder.
    {
        let v = generate_vec(dim + 3, 42);
        group.bench_function("misaligned_259", |b| {
            b.iter(|| black_box(norm_sq_unrolled_8(black_box(&v))))
        });
    }
    // Small vector (below SIMD threshold): entirely remainder-handled.
    {
        let v = generate_vec(7, 42);
        group.bench_function("small_7", |b| {
            b.iter(|| black_box(norm_sq_unrolled_8(black_box(&v))))
        });
    }
    group.finish();
}
// ============================================================================
// THROUGHPUT SCALING BENCHMARKS
// ============================================================================
/// Benchmark how residual-norm throughput scales with vector size,
/// reported in bytes/s so cache-level transitions are visible.
fn bench_throughput_scaling(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_throughput_scaling");
    // Test how throughput scales with vector size
    let sizes = [16, 32, 64, 128, 256, 512, 1024, 2048, 4096];
    for &size in &sizes {
        let a = generate_vec(size, 42);
        let b = generate_vec(size, 123);
        // Bytes read per call: `size` f32 elements (4 bytes) from each of
        // the 2 input vectors.
        group.throughput(Throughput::Bytes((size * 4 * 2) as u64));
        group.bench_with_input(
            BenchmarkId::new("residual_unrolled", size),
            &size,
            |bench, _| {
                bench.iter(|| black_box(residual_norm_unrolled(black_box(&a), black_box(&b))))
            },
        );
        #[cfg(feature = "simd")]
        group.bench_with_input(
            BenchmarkId::new("residual_simd", size),
            &size,
            |bench, _| {
                bench
                    .iter(|| black_box(simd_impl::residual_norm_simd(black_box(&a), black_box(&b))))
            },
        );
    }
    group.finish();
}
// ============================================================================
// COHERENCE-SPECIFIC SIMD PATTERNS
// ============================================================================
/// Fused multiply-add pattern for coherence energy
///
/// Compares a plain multiply-then-add loop against an explicitly
/// `mul_add`-based, 4-accumulator formulation that the backend can lower
/// to hardware FMA instructions.
fn bench_fma_pattern(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_fma_pattern");
    let dim = 256;
    let a = generate_vec(dim, 42);
    let b = generate_vec(dim, 123);
    let weight = 1.5f32;
    // Without FMA (separate multiply and add)
    group.bench_function("separate_ops", |bench| {
        bench.iter(|| {
            let mut sum = 0.0f32;
            for i in 0..dim {
                let diff = a[i] - b[i];
                let sq = diff * diff;
                sum += sq;
            }
            black_box(weight * sum)
        })
    });
    // With potential FMA (compiler may optimize)
    group.bench_function("fma_friendly", |bench| {
        bench.iter(|| {
            let mut acc0 = 0.0f32;
            let mut acc1 = 0.0f32;
            let mut acc2 = 0.0f32;
            let mut acc3 = 0.0f32;
            // dim = 256 is a multiple of 4, so no remainder loop is needed.
            let chunks = dim / 4;
            for c in 0..chunks {
                let base = c * 4;
                let d0 = a[base] - b[base];
                let d1 = a[base + 1] - b[base + 1];
                let d2 = a[base + 2] - b[base + 2];
                let d3 = a[base + 3] - b[base + 3];
                // These can become FMA operations
                acc0 = d0.mul_add(d0, acc0);
                acc1 = d1.mul_add(d1, acc1);
                acc2 = d2.mul_add(d2, acc2);
                acc3 = d3.mul_add(d3, acc3);
            }
            black_box(weight * (acc0 + acc1 + acc2 + acc3))
        })
    });
    group.finish();
}
// ============================================================================
// CRITERION CONFIGURATION
// ============================================================================
// Benchmark registration, grouped by operation family so suites can be run
// selectively (e.g. `cargo bench simd_matmul`).
criterion_group!(matmul_benches, bench_dense_matmul, bench_projection_matmul,);
criterion_group!(
    vector_ops_benches,
    bench_norm_computation,
    bench_dot_product,
    bench_residual_norm,
);
criterion_group!(batch_benches, bench_batch_residual,);
criterion_group!(
    optimization_benches,
    bench_alignment_impact,
    bench_throughput_scaling,
    bench_fma_pattern,
);
criterion_main!(
    matmul_benches,
    vector_ops_benches,
    batch_benches,
    optimization_benches
);

View File

@@ -0,0 +1,549 @@
//! Benchmarks for SONA Micro-LoRA instant adaptation
//!
//! ADR-014 Performance Target: < 0.05ms (50us) for instant adaptation
//!
//! SONA provides self-optimizing threshold tuning with:
//! - Micro-LoRA: Ultra-low rank (1-2) for instant learning
//! - Base-LoRA: Standard LoRA for background learning
//! - EWC++: Elastic Weight Consolidation to prevent forgetting
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
// ============================================================================
// SONA Types (Simulated for benchmarking)
// ============================================================================
/// Micro-LoRA layer (rank 1-2 for instant adaptation)
///
/// Implements `y = x + scale * B @ A @ x`, where A is `dim x rank` and B is
/// `rank x dim`, both stored as flat row-major `Vec<f32>`s (A is indexed
/// `a[i * rank + r]`, B is indexed `b[r * dim + i]`).
pub struct MicroLoRA {
    /// Low-rank factor A (dim x rank)
    pub a: Vec<f32>,
    /// Low-rank factor B (rank x dim)
    pub b: Vec<f32>,
    /// Scaling factor
    pub scale: f32,
    /// Input dimension
    pub dim: usize,
    /// Rank (typically 1-2)
    pub rank: usize,
}
impl MicroLoRA {
    /// Build a rank-`rank` adapter for `dim`-dimensional inputs.
    ///
    /// Factors are filled with small deterministic sin/cos values so the
    /// benchmarks are reproducible without an RNG dependency.
    pub fn new(dim: usize, rank: usize) -> Self {
        // Initialize with small random values
        let a: Vec<f32> = (0..dim * rank)
            .map(|i| ((i as f32 * 0.1234).sin() * 0.01))
            .collect();
        let b: Vec<f32> = (0..rank * dim)
            .map(|i| ((i as f32 * 0.5678).cos() * 0.01))
            .collect();
        Self {
            a,
            b,
            scale: 0.1,
            dim,
            rank,
        }
    }
    /// Apply micro-LoRA transform: y = x + scale * B @ A @ x
    ///
    /// Allocates the rank-sized hidden buffer per call; see
    /// `apply_zero_alloc` for the allocation-free variant used in the
    /// zero-alloc benchmarks.
    #[inline]
    pub fn apply(&self, input: &[f32], output: &mut [f32]) {
        debug_assert_eq!(input.len(), self.dim);
        debug_assert_eq!(output.len(), self.dim);
        // Copy input to output first (identity component)
        output.copy_from_slice(input);
        // Compute A @ x -> hidden (rank-dimensional)
        let mut hidden = vec![0.0f32; self.rank];
        for r in 0..self.rank {
            for i in 0..self.dim {
                hidden[r] += self.a[i * self.rank + r] * input[i];
            }
        }
        // Compute B @ hidden and add to output
        for i in 0..self.dim {
            let mut delta = 0.0f32;
            for r in 0..self.rank {
                delta += self.b[r * self.dim + i] * hidden[r];
            }
            output[i] += self.scale * delta;
        }
    }
    /// Apply with pre-allocated hidden buffer (zero allocation)
    ///
    /// Same arithmetic as `apply`, but the caller owns the rank-sized
    /// scratch buffer, which is cleared before reuse.
    #[inline]
    pub fn apply_zero_alloc(&self, input: &[f32], hidden: &mut [f32], output: &mut [f32]) {
        debug_assert_eq!(hidden.len(), self.rank);
        // Copy input
        output.copy_from_slice(input);
        // A @ x
        hidden.fill(0.0);
        for r in 0..self.rank {
            for i in 0..self.dim {
                hidden[r] += self.a[i * self.rank + r] * input[i];
            }
        }
        // B @ hidden
        for i in 0..self.dim {
            let mut delta = 0.0f32;
            for r in 0..self.rank {
                delta += self.b[r * self.dim + i] * hidden[r];
            }
            output[i] += self.scale * delta;
        }
    }
    /// Update weights from gradient (instant learning)
    ///
    /// Plain SGD step `w -= learning_rate * grad` on both factors.
    /// Panics if either gradient slice is shorter than its factor.
    #[inline]
    pub fn update(&mut self, grad_a: &[f32], grad_b: &[f32], learning_rate: f32) {
        for i in 0..self.a.len() {
            self.a[i] -= learning_rate * grad_a[i];
        }
        for i in 0..self.b.len() {
            self.b[i] -= learning_rate * grad_b[i];
        }
    }
}
/// Base-LoRA layer (higher rank for background learning)
///
/// Same factor layout and transform as [`MicroLoRA`]; used with a higher
/// rank (8 in the benchmarks) and a smaller scale (0.05).
pub struct BaseLoRA {
    /// Low-rank factor A (dim x rank), flat row-major
    pub a: Vec<f32>,
    /// Low-rank factor B (rank x dim), flat row-major
    pub b: Vec<f32>,
    /// Scaling factor applied to the low-rank delta
    pub scale: f32,
    /// Input/output dimension
    pub dim: usize,
    /// LoRA rank
    pub rank: usize,
}
impl BaseLoRA {
    /// Build a rank-`rank` adapter with deterministic small init values
    /// (different sin/cos constants than `MicroLoRA::new`).
    pub fn new(dim: usize, rank: usize) -> Self {
        let a: Vec<f32> = (0..dim * rank)
            .map(|i| ((i as f32 * 0.3456).sin() * 0.01))
            .collect();
        let b: Vec<f32> = (0..rank * dim)
            .map(|i| ((i as f32 * 0.7890).cos() * 0.01))
            .collect();
        Self {
            a,
            b,
            scale: 0.05,
            dim,
            rank,
        }
    }
    /// Apply `y = x + scale * B @ A @ x`, allocating the rank-sized hidden
    /// buffer per call (this is the background path).
    #[inline]
    pub fn apply(&self, input: &[f32], output: &mut [f32]) {
        output.copy_from_slice(input);
        let mut hidden = vec![0.0f32; self.rank];
        for r in 0..self.rank {
            for i in 0..self.dim {
                hidden[r] += self.a[i * self.rank + r] * input[i];
            }
        }
        for i in 0..self.dim {
            let mut delta = 0.0f32;
            for r in 0..self.rank {
                delta += self.b[r * self.dim + i] * hidden[r];
            }
            output[i] += self.scale * delta;
        }
    }
}
/// EWC++ weight importance tracker.
///
/// Keeps a diagonal Fisher-information estimate plus the weights that were
/// optimal for previous tasks, and scores new weights by how far they
/// drift from those anchors.
pub struct EwcPlusPlus {
    /// Fisher information diagonal
    pub fisher: Vec<f32>,
    /// Optimal weights from previous tasks
    pub optimal_weights: Vec<f32>,
    /// Regularization strength
    pub lambda: f32,
}

impl EwcPlusPlus {
    /// Create a tracker for `param_count` parameters with uniform (1.0)
    /// initial Fisher values and zeroed optimal weights.
    pub fn new(param_count: usize, lambda: f32) -> Self {
        EwcPlusPlus {
            fisher: vec![1.0; param_count],
            optimal_weights: vec![0.0; param_count],
            lambda,
        }
    }
    /// Compute the EWC penalty `lambda/2 * sum_i F_i (w_i - w*_i)^2`
    /// over the first `min(|weights|, |fisher|)` parameters.
    #[inline]
    pub fn penalty(&self, weights: &[f32]) -> f32 {
        let n = weights.len().min(self.fisher.len());
        let mut quad = 0.0f32;
        for idx in 0..n {
            let drift = weights[idx] - self.optimal_weights[idx];
            quad += self.fisher[idx] * drift * drift;
        }
        self.lambda * 0.5 * quad
    }
    /// Fold a fresh Fisher estimate into the running average and anchor
    /// the current weights as the new optimum.
    pub fn consolidate(&mut self, weights: &[f32], new_fisher: &[f32]) {
        let n = self.fisher.len().min(new_fisher.len());
        for idx in 0..n {
            // Exponential moving average preserves old importance info.
            self.fisher[idx] = 0.9 * self.fisher[idx] + 0.1 * new_fisher[idx];
            self.optimal_weights[idx] = weights[idx];
        }
    }
}
/// One recorded step of a learning trajectory.
#[derive(Clone)]
pub struct TrajectoryStep {
    pub state: Vec<f32>,
    pub action_embedding: Vec<f32>,
    pub reward: f32,
}

/// Accumulates trajectory steps starting from an initial state.
pub struct TrajectoryBuilder {
    pub initial_state: Vec<f32>,
    pub steps: Vec<TrajectoryStep>,
}

impl TrajectoryBuilder {
    /// Start an empty trajectory anchored at `initial_state`.
    pub fn new(initial_state: Vec<f32>) -> Self {
        TrajectoryBuilder {
            initial_state,
            steps: Vec::new(),
        }
    }
    /// Append one `(state, action, reward)` observation.
    pub fn add_step(&mut self, state: Vec<f32>, action: Vec<f32>, reward: f32) {
        let step = TrajectoryStep {
            state,
            action_embedding: action,
            reward,
        };
        self.steps.push(step);
    }
}
/// SONA engine (simplified for benchmarking)
///
/// Bundles the instant (micro) and background (base) LoRA adapters with
/// the EWC++ importance tracker over a shared embedding dimension `dim`.
pub struct SonaEngine {
    pub micro_lora: MicroLoRA,
    pub base_lora: BaseLoRA,
    pub ewc: EwcPlusPlus,
    pub dim: usize,
}
impl SonaEngine {
    /// Build an engine with a rank-2 micro-LoRA (instant path), a rank-8
    /// base-LoRA (background path), and an EWC++ tracker sized to cover
    /// the A and B factors of both adapters.
    pub fn new(dim: usize) -> Self {
        let micro_rank = 2;
        let base_rank = 8;
        // Each adapter holds two dim*rank factors (A and B), hence the * 2.
        let param_count = dim * micro_rank * 2 + dim * base_rank * 2;
        Self {
            micro_lora: MicroLoRA::new(dim, micro_rank),
            base_lora: BaseLoRA::new(dim, base_rank),
            ewc: EwcPlusPlus::new(param_count, 0.4),
            dim,
        }
    }
    /// Begin trajectory
    pub fn begin_trajectory(&self, initial_state: Vec<f32>) -> TrajectoryBuilder {
        TrajectoryBuilder::new(initial_state)
    }
    /// End trajectory and trigger learning.
    ///
    /// In this simplified benchmark model only `final_reward` drives the
    /// pseudo-gradient; the trajectory contents are consumed but not yet
    /// used, so the parameter is underscore-prefixed to document that
    /// (this also silences the unused-variable warning).
    pub fn end_trajectory(&mut self, _builder: TrajectoryBuilder, final_reward: f32) {
        // Simplified learning: update micro-LoRA based on reward.
        // Negative rewards clamp to zero, i.e. produce no update.
        let lr = 0.001 * final_reward.max(0.0);
        // Pseudo-gradient (simplified): proportional to current weights.
        let grad_a: Vec<f32> = self.micro_lora.a.iter().map(|w| w * lr).collect();
        let grad_b: Vec<f32> = self.micro_lora.b.iter().map(|w| w * lr).collect();
        self.micro_lora.update(&grad_a, &grad_b, lr);
    }
    /// Apply micro-LoRA (instant path; ADR-014 target < 50us).
    #[inline]
    pub fn apply_micro(&self, input: &[f32], output: &mut [f32]) {
        self.micro_lora.apply(input, output);
    }
    /// Apply base-LoRA (background path).
    pub fn apply_base(&self, input: &[f32], output: &mut [f32]) {
        self.base_lora.apply(input, output);
    }
    /// Apply both LoRAs combined: micro first, then base on its output.
    pub fn apply_combined(&self, input: &[f32], output: &mut [f32]) {
        // Apply micro first
        let mut intermediate = vec![0.0f32; self.dim];
        self.micro_lora.apply(input, &mut intermediate);
        // Then base
        self.base_lora.apply(&intermediate, output);
    }
}
// ============================================================================
// Benchmarks
// ============================================================================
/// Deterministically synthesize a `dim`-length state vector from `seed`
/// (sinusoid over a seed/index mix; no RNG, so benchmarks are reproducible).
fn generate_state(dim: usize, seed: u64) -> Vec<f32> {
    let mut state = Vec::with_capacity(dim);
    for i in 0..dim {
        state.push((seed as f32 * 0.123 + i as f32 * 0.456).sin());
    }
    state
}
/// Benchmark Micro-LoRA application (target: <50us)
///
/// Sweeps embedding dimension at fixed rank 2, then rank at fixed dim 256.
/// Input/output buffers are preallocated outside `b.iter` so only `apply`
/// itself is measured.
fn bench_micro_lora_apply(c: &mut Criterion) {
    let mut group = c.benchmark_group("sona_micro_lora_apply");
    group.throughput(Throughput::Elements(1));
    for dim in [64, 128, 256, 512] {
        let lora = MicroLoRA::new(dim, 2); // Rank 2
        let input = generate_state(dim, 42);
        let mut output = vec![0.0f32; dim];
        group.bench_with_input(BenchmarkId::new("dim", dim), &dim, |b, _| {
            b.iter(|| lora.apply(black_box(&input), black_box(&mut output)))
        });
    }
    // Different ranks
    let dim = 256;
    for rank in [1, 2, 4] {
        let lora = MicroLoRA::new(dim, rank);
        let input = generate_state(dim, 42);
        let mut output = vec![0.0f32; dim];
        group.bench_with_input(BenchmarkId::new("rank", rank), &rank, |b, _| {
            b.iter(|| lora.apply(black_box(&input), black_box(&mut output)))
        });
    }
    group.finish();
}
/// Benchmark zero-allocation Micro-LoRA
///
/// Exercises `apply_zero_alloc`, which takes a caller-owned rank-sized
/// scratch buffer (`hidden`) so the hot path performs no heap allocation.
fn bench_micro_lora_zero_alloc(c: &mut Criterion) {
    let mut group = c.benchmark_group("sona_micro_lora_zero_alloc");
    group.throughput(Throughput::Elements(1));
    for dim in [64, 128, 256, 512] {
        let lora = MicroLoRA::new(dim, 2);
        let input = generate_state(dim, 42);
        // Scratch sized to the LoRA rank (2), reused across iterations.
        let mut hidden = vec![0.0f32; 2];
        let mut output = vec![0.0f32; dim];
        group.bench_with_input(BenchmarkId::new("dim", dim), &dim, |b, _| {
            b.iter(|| {
                lora.apply_zero_alloc(
                    black_box(&input),
                    black_box(&mut hidden),
                    black_box(&mut output),
                )
            })
        });
    }
    group.finish();
}
/// Benchmark Base-LoRA application
///
/// Sweeps dimension at fixed rank 8, then rank at fixed dim 256; buffers
/// are preallocated so only `apply` is timed.
fn bench_base_lora_apply(c: &mut Criterion) {
    let mut group = c.benchmark_group("sona_base_lora_apply");
    group.throughput(Throughput::Elements(1));
    for dim in [64, 128, 256, 512] {
        let lora = BaseLoRA::new(dim, 8); // Rank 8
        let input = generate_state(dim, 42);
        let mut output = vec![0.0f32; dim];
        group.bench_with_input(BenchmarkId::new("dim", dim), &dim, |b, _| {
            b.iter(|| lora.apply(black_box(&input), black_box(&mut output)))
        });
    }
    // Different ranks
    let dim = 256;
    for rank in [4, 8, 16, 32] {
        let lora = BaseLoRA::new(dim, rank);
        let input = generate_state(dim, 42);
        let mut output = vec![0.0f32; dim];
        group.bench_with_input(BenchmarkId::new("rank", rank), &rank, |b, _| {
            b.iter(|| lora.apply(black_box(&input), black_box(&mut output)))
        });
    }
    group.finish();
}
/// Benchmark EWC++ penalty computation
///
/// Measures the read-only penalty over synthetic weight vectors at three
/// parameter-count scales.
fn bench_ewc_penalty(c: &mut Criterion) {
    let mut group = c.benchmark_group("sona_ewc_penalty");
    group.throughput(Throughput::Elements(1));
    for param_count in [1000, 10000, 100000] {
        let ewc = EwcPlusPlus::new(param_count, 0.4);
        let weights: Vec<f32> = (0..param_count).map(|i| (i as f32 * 0.001).sin()).collect();
        group.bench_with_input(
            BenchmarkId::new("params", param_count),
            &param_count,
            |b, _| b.iter(|| black_box(ewc.penalty(black_box(&weights)))),
        );
    }
    group.finish();
}
/// Benchmark EWC++ consolidation
///
/// Measures the mutating consolidation step (folding a new Fisher estimate
/// into the stored importance weights) at three parameter-count scales.
/// Note the same `ewc` instance is consolidated repeatedly across iterations.
fn bench_ewc_consolidate(c: &mut Criterion) {
    let mut group = c.benchmark_group("sona_ewc_consolidate");
    for param_count in [1000, 10000, 100000] {
        let mut ewc = EwcPlusPlus::new(param_count, 0.4);
        let weights: Vec<f32> = (0..param_count).map(|i| (i as f32 * 0.001).sin()).collect();
        let new_fisher: Vec<f32> = (0..param_count)
            .map(|i| (i as f32 * 0.002).cos().abs())
            .collect();
        group.bench_with_input(
            BenchmarkId::new("params", param_count),
            &param_count,
            |b, _| b.iter(|| ewc.consolidate(black_box(&weights), black_box(&new_fisher))),
        );
    }
    group.finish();
}
/// Benchmark full trajectory learning cycle
///
/// Each iteration measures begin-trajectory + step recording + the
/// end-of-trajectory micro-LoRA update, including the `generate_state`
/// calls inside the closure.
fn bench_trajectory_learning(c: &mut Criterion) {
    let mut group = c.benchmark_group("sona_trajectory_learning");
    let dim = 256;
    let mut engine = SonaEngine::new(dim);
    // Single step trajectory
    group.bench_function("single_step_trajectory", |b| {
        b.iter(|| {
            let mut builder = engine.begin_trajectory(generate_state(dim, 42));
            builder.add_step(generate_state(dim, 43), vec![], 0.8);
            engine.end_trajectory(builder, black_box(0.85));
        })
    });
    // Multi-step trajectory
    group.bench_function("10_step_trajectory", |b| {
        b.iter(|| {
            let mut builder = engine.begin_trajectory(generate_state(dim, 42));
            for i in 0..10 {
                builder.add_step(generate_state(dim, 43 + i), vec![], 0.5 + (i as f32) * 0.05);
            }
            engine.end_trajectory(builder, black_box(0.9));
        })
    });
    group.finish();
}
/// Benchmark combined LoRA application
///
/// Compares micro-only, base-only, and the sequential micro-then-base
/// composition at several dimensions. `apply_combined` allocates an
/// intermediate buffer per call, so that allocation is part of the
/// measurement by design.
fn bench_combined_lora(c: &mut Criterion) {
    let mut group = c.benchmark_group("sona_combined_lora");
    for dim in [64, 128, 256, 512] {
        let engine = SonaEngine::new(dim);
        let input = generate_state(dim, 42);
        let mut output = vec![0.0f32; dim];
        // Micro only
        group.bench_with_input(BenchmarkId::new("micro_only", dim), &dim, |b, _| {
            b.iter(|| engine.apply_micro(black_box(&input), black_box(&mut output)))
        });
        // Base only
        group.bench_with_input(BenchmarkId::new("base_only", dim), &dim, |b, _| {
            b.iter(|| engine.apply_base(black_box(&input), black_box(&mut output)))
        });
        // Combined
        group.bench_with_input(BenchmarkId::new("combined", dim), &dim, |b, _| {
            b.iter(|| engine.apply_combined(black_box(&input), black_box(&mut output)))
        });
    }
    group.finish();
}
/// Benchmark batch inference
///
/// Applies the micro-LoRA across whole batches of preallocated inputs and
/// reused output buffers; throughput is reported per element so batch sizes
/// are directly comparable.
fn bench_batch_inference(c: &mut Criterion) {
    let mut group = c.benchmark_group("sona_batch_inference");
    let dim = 256;
    let engine = SonaEngine::new(dim);
    for batch_size in [1, 10, 100, 1000] {
        let inputs: Vec<Vec<f32>> = (0..batch_size)
            .map(|i| generate_state(dim, i as u64))
            .collect();
        let mut outputs: Vec<Vec<f32>> = (0..batch_size).map(|_| vec![0.0f32; dim]).collect();
        group.throughput(Throughput::Elements(batch_size as u64));
        group.bench_with_input(
            BenchmarkId::new("batch", batch_size),
            &batch_size,
            |b, _| {
                b.iter(|| {
                    for (input, output) in inputs.iter().zip(outputs.iter_mut()) {
                        engine.apply_micro(input, output);
                    }
                    black_box(outputs.len())
                })
            },
        );
    }
    group.finish();
}
/// Benchmark weight update (instant learning)
///
/// Times a single micro-LoRA gradient update with precomputed gradients
/// shaped for the rank-2 A (dim x 2) and B (2 x dim) factors. The same
/// adapter is updated repeatedly across iterations.
fn bench_weight_update(c: &mut Criterion) {
    let mut group = c.benchmark_group("sona_weight_update");
    for dim in [64, 128, 256, 512] {
        let mut lora = MicroLoRA::new(dim, 2);
        let grad_a: Vec<f32> = (0..dim * 2).map(|i| (i as f32 * 0.001).sin()).collect();
        let grad_b: Vec<f32> = (0..2 * dim).map(|i| (i as f32 * 0.002).cos()).collect();
        group.bench_with_input(BenchmarkId::new("dim", dim), &dim, |b, _| {
            b.iter(|| {
                lora.update(black_box(&grad_a), black_box(&grad_b), black_box(0.001));
            })
        });
    }
    group.finish();
}
// Register all SONA benchmarks with criterion and generate the harness `main`.
criterion_group!(
    benches,
    bench_micro_lora_apply,
    bench_micro_lora_zero_alloc,
    bench_base_lora_apply,
    bench_ewc_penalty,
    bench_ewc_consolidate,
    bench_trajectory_learning,
    bench_combined_lora,
    bench_batch_inference,
    bench_weight_update,
);
criterion_main!(benches);

View File

@@ -0,0 +1,663 @@
//! Benchmarks for 256-tile parallel tick
//!
//! ADR-014 Performance Target: < 1ms for 256-tile parallel tick
//!
//! The cognitum-gate-kernel provides 256 WASM tiles, each maintaining
//! a local graph shard with E-value accumulation and witness fragments.
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
// ============================================================================
// Tile Types (Simulated, matching cognitum-gate-kernel structure)
// ============================================================================
/// Maximum delta buffer per tile; `ingest_delta` rejects deltas beyond this
pub const MAX_DELTA_BUFFER: usize = 64;
/// Number of tiles in fabric
pub const NUM_TILES: usize = 256;
/// Maximum vertices per shard (vertex ids must stay below this bound)
pub const MAX_SHARD_VERTICES: usize = 256;
/// Maximum edges per shard; `add_edge` fails once this is reached
pub const MAX_SHARD_EDGES: usize = 1024;
/// Delta operation type
#[derive(Clone, Copy)]
pub enum DeltaType {
    /// Insert an edge (source, target, weight).
    EdgeAdd,
    /// Remove an edge (currently ignored by `TileState::tick`).
    EdgeRemove,
    /// Evidence observation for a vertex; payload carries positive/negative.
    Observation,
    /// Edge weight change (currently ignored by `TileState::tick`).
    WeightUpdate,
}
/// Delta (change event) for tile
#[derive(Clone, Copy)]
pub struct Delta {
    // Which operation this delta encodes.
    pub delta_type: DeltaType,
    // Source vertex (or observed vertex for `Observation`).
    pub source: u16,
    // Target vertex; unused (0) for observations.
    pub target: u16,
    // Edge weight; unused (0) for observations.
    pub weight: u16,
    // Operation-specific payload; observations pack a bool as 0/1 here.
    pub payload: u32,
}
impl Delta {
    /// Build an edge-insertion delta from `src` to `tgt` with `weight`.
    pub fn edge_add(src: u16, tgt: u16, weight: u16) -> Self {
        Self {
            delta_type: DeltaType::EdgeAdd,
            source: src,
            target: tgt,
            weight,
            payload: 0,
        }
    }

    /// Build an observation delta for `vertex`; `positive` is packed into
    /// `payload` as 0/1.
    pub fn observation(vertex: u16, positive: bool) -> Self {
        Self {
            delta_type: DeltaType::Observation,
            source: vertex,
            target: 0,
            weight: 0,
            payload: u32::from(positive),
        }
    }
}
/// Compact vertex state
#[derive(Clone, Copy, Default)]
pub struct VertexState {
    // Number of incident edge endpoints recorded for this vertex.
    pub degree: u8,
    // Connected-component label assigned by `recompute_components`.
    pub component_id: u8,
    // True once the vertex participates in at least one edge.
    pub active: bool,
    // Per-vertex energy term (not updated by the simulated graph here).
    pub energy_contrib: f32,
}
impl VertexState {
    /// Whether this vertex has been touched by any edge.
    pub fn is_active(&self) -> bool {
        self.active
    }
}
/// Compact edge
#[derive(Clone, Copy, Default)]
pub struct CompactEdge {
    // Endpoint vertex ids within the shard.
    pub source: u16,
    pub target: u16,
    // Edge weight; energy treats this as weight / 100.
    pub weight: u16,
    // False for default-initialized / logically removed slots.
    pub active: bool,
}
impl CompactEdge {
    /// Whether this edge slot holds a live edge.
    pub fn is_active(&self) -> bool {
        self.active
    }
}
/// Compact graph for single tile
pub struct CompactGraph {
    // Fixed vertex table; index == vertex id.
    pub vertices: [VertexState; MAX_SHARD_VERTICES],
    // Fixed edge table; only the first `edge_count` slots are meaningful.
    pub edges: [CompactEdge; MAX_SHARD_EDGES],
    // Number of edge slots in use.
    pub edge_count: usize,
    // Number of active (edge-touched) vertices.
    pub vertex_count: usize,
    // Connected-component count from the last `recompute_components`.
    pub component_count: u8,
}
impl CompactGraph {
    /// Create an empty shard graph with every vertex inactive and no edges.
    pub fn new() -> Self {
        Self {
            vertices: [VertexState::default(); MAX_SHARD_VERTICES],
            edges: [CompactEdge::default(); MAX_SHARD_EDGES],
            edge_count: 0,
            vertex_count: 0,
            component_count: 0,
        }
    }

    /// Insert an edge, activating its endpoints. Returns `false` (graph
    /// untouched) when the edge table is full or either endpoint id is
    /// outside the shard's vertex range — the previous version indexed
    /// unchecked and could panic for ids >= MAX_SHARD_VERTICES. Degrees
    /// saturate instead of overflowing their `u8`.
    pub fn add_edge(&mut self, src: u16, tgt: u16, weight: u16) -> bool {
        if self.edge_count >= MAX_SHARD_EDGES
            || src as usize >= MAX_SHARD_VERTICES
            || tgt as usize >= MAX_SHARD_VERTICES
        {
            return false;
        }
        // Activate endpoints, maintaining `vertex_count` (previously declared
        // but never updated). A self-loop still adds 2 to the degree, matching
        // the old per-endpoint increments.
        for &v in &[src, tgt] {
            let vertex = &mut self.vertices[v as usize];
            if !vertex.active {
                vertex.active = true;
                self.vertex_count += 1;
            }
            vertex.degree = vertex.degree.saturating_add(1);
        }
        // Append the edge record.
        self.edges[self.edge_count] = CompactEdge {
            source: src,
            target: tgt,
            weight,
            active: true,
        };
        self.edge_count += 1;
        true
    }

    /// Recompute `component_count` over the active vertices.
    ///
    /// Uses a correct union-find with path compression. The previous
    /// single-pass `parent[s] = parent[t]` assignment never merged
    /// transitive chains (edges 0-1, 1-2 reported two components), which
    /// also corrupted the `connected` flag tiles derive from this count.
    pub fn recompute_components(&mut self) {
        // Iterative find with path halving; parent fits in u16 since
        // MAX_SHARD_VERTICES <= u16::MAX.
        fn find(parent: &mut [u16; MAX_SHARD_VERTICES], mut x: usize) -> usize {
            while parent[x] as usize != x {
                let grandparent = parent[parent[x] as usize];
                parent[x] = grandparent;
                x = grandparent as usize;
            }
            x
        }
        let mut parent: [u16; MAX_SHARD_VERTICES] = [0; MAX_SHARD_VERTICES];
        for (i, p) in parent.iter_mut().enumerate() {
            *p = i as u16;
        }
        // Union the endpoints of every active edge.
        for edge in &self.edges[..self.edge_count] {
            if edge.active {
                let root_s = find(&mut parent, edge.source as usize);
                let root_t = find(&mut parent, edge.target as usize);
                if root_s != root_t {
                    parent[root_s] = root_t as u16;
                }
            }
        }
        // Count distinct roots among active vertices; saturate rather than
        // overflow the u8 if all 256 vertices were isolated components.
        let mut seen = [false; MAX_SHARD_VERTICES];
        let mut count = 0u8;
        for i in 0..MAX_SHARD_VERTICES {
            if self.vertices[i].active {
                let root = find(&mut parent, i);
                if !seen[root] {
                    seen[root] = true;
                    count = count.saturating_add(1);
                }
            }
        }
        self.component_count = count;
    }

    /// Sum active-edge weights as a coarse energy proxy (weight / 100).
    pub fn compute_total_energy(&self) -> f32 {
        self.edges[..self.edge_count]
            .iter()
            .filter(|e| e.active)
            .map(|e| e.weight as f32 / 100.0)
            .sum()
    }
}
/// E-value accumulator (log-space evidence)
pub struct EvidenceAccumulator {
    /// Log e-value (fixed-point: value / 65536 = log2(e-value))
    pub log_e_values: Vec<i32>,
    // Number of hypothesis slots currently in use (prefix of log_e_values).
    pub hypothesis_count: usize,
}
impl EvidenceAccumulator {
    /// Create an accumulator with room for `capacity` hypotheses, all
    /// starting at log e-value 0 (e-value 1, i.e. no evidence).
    pub fn new(capacity: usize) -> Self {
        Self {
            log_e_values: vec![0; capacity],
            hypothesis_count: 0,
        }
    }
    /// Reserve the next hypothesis slot and return its index.
    ///
    /// NOTE: when the accumulator is already at capacity the count is not
    /// incremented and the returned index equals `capacity`; `update` will
    /// silently ignore that index, so the caller cannot distinguish failure.
    pub fn add_hypothesis(&mut self) -> usize {
        let idx = self.hypothesis_count;
        if idx < self.log_e_values.len() {
            self.hypothesis_count += 1;
        }
        idx
    }
    /// Add a fixed-point log-likelihood ratio to hypothesis `idx`
    /// (saturating, so repeated strong evidence cannot wrap around).
    /// Out-of-range indices are ignored.
    #[inline]
    pub fn update(&mut self, idx: usize, log_lr: i32) {
        if idx < self.hypothesis_count {
            self.log_e_values[idx] = self.log_e_values[idx].saturating_add(log_lr);
        }
    }
    /// Sum of all active hypotheses' log e-values, widened to i64 so the
    /// total cannot overflow the per-hypothesis i32 range.
    pub fn global_log_e(&self) -> i64 {
        self.log_e_values[..self.hypothesis_count]
            .iter()
            .map(|&v| v as i64)
            .sum()
    }
}
/// Tile report (output of tick)
#[derive(Clone, Copy)]
pub struct TileReport {
    // Which tile produced this report.
    pub tile_id: u8,
    // Tick number the report corresponds to.
    pub tick: u32,
    // True when the tile's shard has at most one connected component.
    pub connected: bool,
    // Connected-component count from the shard graph.
    pub component_count: u8,
    // Aggregate log e-value across the tile's hypotheses.
    pub log_e_value: i64,
    // Shard energy at this tick.
    pub energy: f32,
    // Deterministic hash witnessing the tile's state (id, tick, edge count).
    pub witness_hash: u64,
}
impl TileReport {
    /// Fresh report for `tile_id`: tick 0, a single connected component,
    /// and zeroed evidence/energy/witness fields.
    pub fn new(tile_id: u8) -> Self {
        Self {
            tick: 0,
            connected: true,
            component_count: 1,
            log_e_value: 0,
            energy: 0.0,
            witness_hash: 0,
            tile_id,
        }
    }
}
/// Single tile state
pub struct TileState {
    // Identity of this tile within the 256-tile fabric.
    pub tile_id: u8,
    // Local graph shard.
    pub graph: CompactGraph,
    // Local evidence accumulator (64 hypothesis slots).
    pub evidence: EvidenceAccumulator,
    // Pending deltas, drained on each tick (capped at MAX_DELTA_BUFFER).
    pub delta_buffer: Vec<Delta>,
    // Last tick number processed.
    pub tick_count: u32,
}
impl TileState {
    /// Create an empty tile with id `tile_id` and a preallocated delta buffer.
    pub fn new(tile_id: u8) -> Self {
        Self {
            tile_id,
            graph: CompactGraph::new(),
            evidence: EvidenceAccumulator::new(64),
            delta_buffer: Vec::with_capacity(MAX_DELTA_BUFFER),
            tick_count: 0,
        }
    }
    /// Queue a delta for the next tick. Returns `false` (delta dropped)
    /// when the buffer already holds MAX_DELTA_BUFFER entries.
    pub fn ingest_delta(&mut self, delta: &Delta) -> bool {
        if self.delta_buffer.len() >= MAX_DELTA_BUFFER {
            return false;
        }
        self.delta_buffer.push(*delta);
        true
    }
    /// Run one tick: drain and apply pending deltas, refresh connectivity
    /// and energy, and emit a report for `tick_number`.
    pub fn tick(&mut self, tick_number: u32) -> TileReport {
        // Process pending deltas. EdgeRemove and WeightUpdate are currently
        // no-ops (the `_` arm below).
        for delta in self.delta_buffer.drain(..) {
            match delta.delta_type {
                DeltaType::EdgeAdd => {
                    self.graph
                        .add_edge(delta.source, delta.target, delta.weight);
                }
                DeltaType::Observation => {
                    // Fixed-point log-likelihood ratio: +/-1.0 in Q16.16.
                    // All observations are credited to hypothesis 0 here.
                    let log_lr = if delta.payload != 0 { 65536 } else { -65536 };
                    if self.evidence.hypothesis_count > 0 {
                        self.evidence.update(0, log_lr);
                    }
                }
                _ => {}
            }
        }
        // Recompute components if needed
        self.graph.recompute_components();
        // Compute energy
        let energy = self.graph.compute_total_energy();
        // Build report
        self.tick_count = tick_number;
        TileReport {
            tile_id: self.tile_id,
            tick: tick_number,
            connected: self.graph.component_count <= 1,
            component_count: self.graph.component_count,
            log_e_value: self.evidence.global_log_e(),
            energy,
            witness_hash: self.compute_witness_hash(),
        }
    }
    // Deterministic state fingerprint: multiply-xor mix of tile id, tick,
    // and edge count. The constant is an arbitrary odd 64-bit multiplier;
    // wrapping ops keep the hash well-defined on overflow.
    fn compute_witness_hash(&self) -> u64 {
        let mut hash = self.tile_id as u64;
        hash = hash.wrapping_mul(0x517cc1b727220a95);
        hash ^= self.tick_count as u64;
        hash = hash.wrapping_mul(0x517cc1b727220a95);
        hash ^= self.graph.edge_count as u64;
        hash
    }
    /// Clear the graph, pending deltas, and tick counter. Note the evidence
    /// accumulator is intentionally left as-is (hypotheses persist).
    pub fn reset(&mut self) {
        self.graph = CompactGraph::new();
        self.delta_buffer.clear();
        self.tick_count = 0;
    }
}
/// 256-tile coherence fabric
pub struct CoherenceFabric {
    // One TileState per tile, indexed by tile id (0..NUM_TILES).
    pub tiles: Vec<TileState>,
}
impl CoherenceFabric {
    /// Build a fabric of NUM_TILES empty tiles with ids 0..NUM_TILES.
    pub fn new() -> Self {
        Self {
            tiles: (0..NUM_TILES).map(|i| TileState::new(i as u8)).collect(),
        }
    }
    /// Execute tick on all tiles sequentially (single-threaded baseline for
    /// the parallel-tick performance target).
    pub fn tick_sequential(&mut self, tick_number: u32) -> Vec<TileReport> {
        self.tiles.iter_mut().map(|t| t.tick(tick_number)).collect()
    }
    /// Aggregate per-tile reports into a global coherence summary: summed
    /// energy and log e-value, AND of connectivity, and an order-sensitive
    /// multiply-xor fold of the per-tile witness hashes.
    pub fn aggregate_reports(reports: &[TileReport]) -> FabricReport {
        let total_energy: f32 = reports.iter().map(|r| r.energy).sum();
        let total_log_e: i64 = reports.iter().map(|r| r.log_e_value).sum();
        let all_connected = reports.iter().all(|r| r.connected);
        // Compute global witness hash
        let mut global_hash = 0u64;
        for r in reports {
            global_hash = global_hash.wrapping_mul(0x517cc1b727220a95);
            global_hash ^= r.witness_hash;
        }
        FabricReport {
            // Tick taken from the first report; 0 for an empty slice.
            tick: reports.first().map(|r| r.tick).unwrap_or(0),
            total_energy,
            total_log_e,
            all_connected,
            global_witness_hash: global_hash,
        }
    }
    /// Route a delta to the tile owning `node_id` (simple modulo sharding).
    /// NOTE: the ingest result is discarded, so deltas are silently dropped
    /// when the target tile's buffer is full.
    pub fn distribute_delta(&mut self, node_id: u64, delta: &Delta) {
        let tile_id = (node_id % NUM_TILES as u64) as usize;
        self.tiles[tile_id].ingest_delta(delta);
    }
}
/// Aggregated fabric report
pub struct FabricReport {
    // Tick number the aggregation corresponds to.
    pub tick: u32,
    // Sum of per-tile energies.
    pub total_energy: f32,
    // Sum of per-tile log e-values.
    pub total_log_e: i64,
    // True only if every tile reported a connected shard.
    pub all_connected: bool,
    // Order-sensitive fold of the per-tile witness hashes.
    pub global_witness_hash: u64,
}
// ============================================================================
// Benchmarks
// ============================================================================
/// Benchmark single tile tick
///
/// Covers an empty tick, a steady-state tick over a small graph, and
/// delta-heavy ticks. The delta cases use `iter_batched` so each iteration
/// gets a freshly populated tile (a drained buffer would otherwise make
/// later iterations measure an empty tick).
fn bench_single_tile_tick(c: &mut Criterion) {
    let mut group = c.benchmark_group("tile_single_tick");
    group.throughput(Throughput::Elements(1));
    // Empty tick
    let mut tile = TileState::new(0);
    group.bench_function("empty", |b| b.iter(|| black_box(tile.tick(black_box(1)))));
    // Tick with small graph
    let mut tile = TileState::new(0);
    for i in 0..20u16 {
        tile.ingest_delta(&Delta::edge_add(i, i + 1, 100));
    }
    tile.tick(0);
    group.bench_function("small_graph_20_edges", |b| {
        b.iter(|| black_box(tile.tick(black_box(1))))
    });
    // Tick with pending deltas
    group.bench_function("with_10_deltas", |b| {
        b.iter_batched(
            || {
                let mut t = TileState::new(0);
                for i in 0..10u16 {
                    t.ingest_delta(&Delta::edge_add(i, i + 1, 100));
                }
                t
            },
            |mut t| black_box(t.tick(1)),
            criterion::BatchSize::SmallInput,
        )
    });
    // Tick with full delta buffer
    group.bench_function("with_64_deltas", |b| {
        b.iter_batched(
            || {
                let mut t = TileState::new(0);
                for i in 0..MAX_DELTA_BUFFER as u16 {
                    t.ingest_delta(&Delta::edge_add(i % 200, (i + 1) % 200, 100));
                }
                t
            },
            |mut t| black_box(t.tick(1)),
            criterion::BatchSize::SmallInput,
        )
    });
    group.finish();
}
/// Benchmark 256-tile parallel tick (sequential baseline)
///
/// Measures one full sequential pass over all NUM_TILES tiles, both empty
/// and pre-populated — the single-threaded baseline for the ADR-014 <1ms
/// parallel-tick target.
fn bench_256_tile_tick_sequential(c: &mut Criterion) {
    let mut group = c.benchmark_group("tile_256_sequential");
    group.throughput(Throughput::Elements(NUM_TILES as u64));
    // Empty fabric
    let mut fabric = CoherenceFabric::new();
    group.bench_function("empty_fabric", |b| {
        b.iter(|| black_box(fabric.tick_sequential(black_box(1))))
    });
    // Fabric with some data per tile
    let mut fabric = CoherenceFabric::new();
    for i in 0..NUM_TILES {
        for j in 0..10u16 {
            fabric.tiles[i].ingest_delta(&Delta::edge_add(j, j + 1, 100));
        }
        fabric.tiles[i].tick(0);
    }
    group.bench_function("populated_10_edges_per_tile", |b| {
        b.iter(|| black_box(fabric.tick_sequential(black_box(1))))
    });
    group.finish();
}
/// Benchmark report aggregation
///
/// Times the fold of 256 synthetic per-tile reports into a FabricReport;
/// report construction happens once outside the measured closure.
fn bench_report_aggregation(c: &mut Criterion) {
    let mut group = c.benchmark_group("tile_report_aggregation");
    group.throughput(Throughput::Elements(NUM_TILES as u64));
    // Generate 256 reports
    let reports: Vec<TileReport> = (0..NUM_TILES)
        .map(|i| TileReport {
            tile_id: i as u8,
            tick: 1,
            connected: i % 10 != 0, // every 10th tile reports disconnected
            component_count: (i % 5) as u8 + 1,
            log_e_value: (i as i64) * 1000 - 128000,
            energy: (i as f32) * 0.1,
            witness_hash: i as u64 * 0x517cc1b727220a95,
        })
        .collect();
    group.bench_function("aggregate_256_reports", |b| {
        b.iter(|| black_box(CoherenceFabric::aggregate_reports(black_box(&reports))))
    });
    group.finish();
}
/// Benchmark delta distribution
///
/// Measures modulo-sharded routing of deltas into tiles, singly and in
/// batches. Tile buffers are never drained here, so after the first
/// iterations they saturate and later ingests are cheap rejected pushes —
/// this benchmark times routing, not sustained ingestion.
fn bench_delta_distribution(c: &mut Criterion) {
    let mut group = c.benchmark_group("tile_delta_distribution");
    let mut fabric = CoherenceFabric::new();
    // Single delta
    let delta = Delta::edge_add(0, 1, 100);
    group.bench_function("distribute_single", |b| {
        b.iter(|| fabric.distribute_delta(black_box(12345), black_box(&delta)))
    });
    // Batch distribution
    for batch_size in [100, 1000, 10000] {
        let deltas: Vec<(u64, Delta)> = (0..batch_size)
            .map(|i| {
                (
                    i as u64,
                    Delta::edge_add((i % 200) as u16, ((i + 1) % 200) as u16, 100),
                )
            })
            .collect();
        group.throughput(Throughput::Elements(batch_size as u64));
        group.bench_with_input(
            BenchmarkId::new("distribute_batch", batch_size),
            &deltas,
            |b, deltas| {
                b.iter(|| {
                    for (node_id, delta) in deltas {
                        fabric.distribute_delta(*node_id, delta);
                    }
                })
            },
        );
    }
    group.finish();
}
/// Benchmark evidence accumulator
///
/// Times a single saturating e-value update and the global log-e sum at
/// 16 and 64 active hypotheses.
fn bench_evidence_accumulator(c: &mut Criterion) {
    let mut group = c.benchmark_group("tile_evidence");
    let mut acc = EvidenceAccumulator::new(64);
    for _ in 0..16 {
        acc.add_hypothesis();
    }
    // Single update
    group.bench_function("update_single", |b| {
        b.iter(|| acc.update(black_box(5), black_box(65536)))
    });
    // Global e-value computation
    group.bench_function("global_log_e_16_hyp", |b| {
        b.iter(|| black_box(acc.global_log_e()))
    });
    // 64 hypotheses
    let mut acc64 = EvidenceAccumulator::new(64);
    for _ in 0..64 {
        acc64.add_hypothesis();
    }
    // Spread values across positive and negative fixed-point log e-values.
    for i in 0..64 {
        acc64.log_e_values[i] = (i as i32 - 32) * 1000;
    }
    group.bench_function("global_log_e_64_hyp", |b| {
        b.iter(|| black_box(acc64.global_log_e()))
    });
    group.finish();
}
/// Benchmark component recomputation
fn bench_component_recompute(c: &mut Criterion) {
let mut group = c.benchmark_group("tile_component_recompute");
for edge_count in [50, 200, 500, 1000] {
let mut graph = CompactGraph::new();
for i in 0..edge_count.min(MAX_SHARD_EDGES) {
let src = (i % 200) as u16;
let tgt = ((i + 1) % 200) as u16;
if src != tgt {
graph.add_edge(src, tgt, 100);
}
}
group.bench_with_input(
BenchmarkId::new("recompute", edge_count),
&edge_count,
|b, _| {
b.iter(|| {
graph.recompute_components();
black_box(graph.component_count)
})
},
);
}
group.finish();
}
/// Benchmark full tick + aggregate cycle
///
/// End-to-end cost of one fabric step: tick all 256 populated tiles and
/// aggregate their reports. The tick counter advances each iteration so
/// the witness hashes keep changing.
fn bench_full_cycle(c: &mut Criterion) {
    let mut group = c.benchmark_group("tile_full_cycle");
    group.sample_size(50);
    // Populate fabric
    let mut fabric = CoherenceFabric::new();
    for i in 0..NUM_TILES {
        for j in 0..50u16 {
            fabric.tiles[i].ingest_delta(&Delta::edge_add(j, (j + 1) % 200, 100));
        }
        fabric.tiles[i].tick(0);
    }
    group.bench_function("tick_and_aggregate_256_tiles", |b| {
        let mut tick = 1u32;
        b.iter(|| {
            let reports = fabric.tick_sequential(tick);
            let fabric_report = CoherenceFabric::aggregate_reports(&reports);
            tick += 1;
            black_box(fabric_report)
        })
    });
    group.finish();
}
/// Benchmark memory access patterns
///
/// Compares a sequential scan of all tiles' edge counts against a strided
/// scan (stride 7, coprime with 256 so all tiles are still visited once)
/// to expose cache-locality effects of the tile layout.
fn bench_memory_patterns(c: &mut Criterion) {
    let mut group = c.benchmark_group("tile_memory");
    // Sequential tile access
    let fabric = CoherenceFabric::new();
    group.bench_function("sequential_tile_scan", |b| {
        b.iter(|| {
            let mut total = 0usize;
            for tile in &fabric.tiles {
                total += tile.graph.edge_count;
            }
            black_box(total)
        })
    });
    // Strided tile access
    group.bench_function("strided_tile_scan", |b| {
        let stride = 7;
        b.iter(|| {
            let mut total = 0usize;
            let mut idx = 0;
            for _ in 0..NUM_TILES {
                total += fabric.tiles[idx % NUM_TILES].graph.edge_count;
                idx += stride;
            }
            black_box(total)
        })
    });
    group.finish();
}
// Register all tile-fabric benchmarks and generate the harness `main`.
criterion_group!(
    benches,
    bench_single_tile_tick,
    bench_256_tile_tick_sequential,
    bench_report_aggregation,
    bench_delta_distribution,
    bench_evidence_accumulator,
    bench_component_recompute,
    bench_full_cycle,
    bench_memory_patterns,
);
criterion_main!(benches);