Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'
This commit is contained in:
15
vendor/ruvector/crates/prime-radiant/benches/attention_bench.rs
vendored
Normal file
15
vendor/ruvector/crates/prime-radiant/benches/attention_bench.rs
vendored
Normal file
@@ -0,0 +1,15 @@
|
||||
//! Attention-weighted coherence benchmarks
|
||||
|
||||
use criterion::{black_box, criterion_group, criterion_main, Criterion};
|
||||
|
||||
fn attention_benchmark(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("attention");
|
||||
|
||||
// Placeholder benchmark - requires attention feature
|
||||
group.bench_function("placeholder", |b| b.iter(|| black_box(42)));
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
criterion_group!(benches, attention_benchmark);
|
||||
criterion_main!(benches);
|
||||
15
vendor/ruvector/crates/prime-radiant/benches/coherence_bench.rs
vendored
Normal file
15
vendor/ruvector/crates/prime-radiant/benches/coherence_bench.rs
vendored
Normal file
@@ -0,0 +1,15 @@
|
||||
//! Coherence engine benchmarks
|
||||
|
||||
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
|
||||
|
||||
fn coherence_benchmark(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("coherence");
|
||||
|
||||
// Placeholder benchmark - will be implemented when coherence module is complete
|
||||
group.bench_function("placeholder", |b| b.iter(|| black_box(42)));
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
criterion_group!(benches, coherence_benchmark);
|
||||
criterion_main!(benches);
|
||||
1063
vendor/ruvector/crates/prime-radiant/benches/coherence_benchmarks.rs
vendored
Normal file
1063
vendor/ruvector/crates/prime-radiant/benches/coherence_benchmarks.rs
vendored
Normal file
File diff suppressed because it is too large
Load Diff
546
vendor/ruvector/crates/prime-radiant/benches/energy_bench.rs
vendored
Normal file
546
vendor/ruvector/crates/prime-radiant/benches/energy_bench.rs
vendored
Normal file
@@ -0,0 +1,546 @@
|
||||
//! Benchmarks for full graph energy computation
|
||||
//!
|
||||
//! ADR-014 Performance Target: < 10ms for 10K nodes
|
||||
//!
|
||||
//! Global coherence energy: E(S) = sum(w_e * |r_e|^2)
|
||||
//! This is the aggregate measure of system incoherence.
|
||||
|
||||
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
|
||||
use std::collections::HashMap;
|
||||
|
||||
// ============================================================================
|
||||
// Graph Types (Simulated for benchmarking)
|
||||
// ============================================================================
|
||||
|
||||
/// Simplified restriction map for energy benchmarks.
///
/// A dense affine map `y = A x + b` with `A` stored row-major in `matrix`.
#[derive(Clone)]
pub struct RestrictionMap {
    /// Row-major `output_dim x input_dim` matrix.
    pub matrix: Vec<f32>,
    /// Additive bias, length `output_dim`.
    pub bias: Vec<f32>,
    pub input_dim: usize,
    pub output_dim: usize,
}

impl RestrictionMap {
    /// Builds the identity map on `dim`-dimensional states (zero bias).
    pub fn identity(dim: usize) -> Self {
        // Diagonal entries of a row-major dim x dim matrix sit at
        // indices k * (dim + 1).
        let matrix: Vec<f32> = (0..dim * dim)
            .map(|idx| if idx % (dim + 1) == 0 { 1.0 } else { 0.0 })
            .collect();

        Self {
            matrix,
            bias: vec![0.0; dim],
            input_dim: dim,
            output_dim: dim,
        }
    }

    /// Computes `output = matrix * input + bias` into the caller's buffer,
    /// allocating nothing.
    #[inline]
    pub fn apply_into(&self, input: &[f32], output: &mut [f32]) {
        // Start from the bias, then accumulate one row-dot-input per output.
        output.copy_from_slice(&self.bias);
        for (i, out) in output.iter_mut().enumerate().take(self.output_dim) {
            let row = &self.matrix[i * self.input_dim..(i + 1) * self.input_dim];
            // Fold keeps the original left-to-right accumulation order,
            // so float rounding matches the index-loop formulation exactly.
            *out = row
                .iter()
                .zip(input.iter())
                .fold(*out, |acc, (m, x)| acc + m * x);
        }
    }
}
|
||||
|
||||
/// Node in the sheaf graph.
///
/// Carries only an identifier and its local state vector; all structure
/// (edges, restriction maps) lives on `SheafEdge`.
#[derive(Clone)]
pub struct SheafNode {
    /// Unique node identifier, also the key in `SheafGraph::nodes`.
    pub id: u64,
    /// Local state vector, length `SheafGraph::state_dim`.
    pub state: Vec<f32>,
}
|
||||
|
||||
/// Edge with restriction maps
|
||||
#[derive(Clone)]
|
||||
pub struct SheafEdge {
|
||||
pub source: u64,
|
||||
pub target: u64,
|
||||
pub weight: f32,
|
||||
pub rho_source: RestrictionMap,
|
||||
pub rho_target: RestrictionMap,
|
||||
}
|
||||
|
||||
impl SheafEdge {
|
||||
#[inline]
|
||||
pub fn weighted_residual_energy_into(
|
||||
&self,
|
||||
source: &[f32],
|
||||
target: &[f32],
|
||||
source_buf: &mut [f32],
|
||||
target_buf: &mut [f32],
|
||||
) -> f32 {
|
||||
self.rho_source.apply_into(source, source_buf);
|
||||
self.rho_target.apply_into(target, target_buf);
|
||||
|
||||
let mut norm_sq = 0.0f32;
|
||||
for i in 0..source_buf.len() {
|
||||
let diff = source_buf[i] - target_buf[i];
|
||||
norm_sq += diff * diff;
|
||||
}
|
||||
|
||||
self.weight * norm_sq
|
||||
}
|
||||
}
|
||||
|
||||
/// Full sheaf graph for coherence computation
|
||||
pub struct SheafGraph {
|
||||
pub nodes: HashMap<u64, SheafNode>,
|
||||
pub edges: Vec<SheafEdge>,
|
||||
pub state_dim: usize,
|
||||
}
|
||||
|
||||
/// Result of energy computation.
pub struct CoherenceEnergy {
    /// Sum of all weighted per-edge residual energies.
    pub total_energy: f32,
    /// Each edge's contribution, in edge order.
    pub edge_energies: Vec<f32>,
}
|
||||
|
||||
impl SheafGraph {
|
||||
/// Generate a random graph for benchmarking
|
||||
pub fn random(num_nodes: usize, avg_degree: usize, state_dim: usize, seed: u64) -> Self {
|
||||
use std::collections::hash_map::DefaultHasher;
|
||||
use std::hash::{Hash, Hasher};
|
||||
|
||||
let mut hasher = || {
|
||||
let mut h = DefaultHasher::new();
|
||||
seed.hash(&mut h);
|
||||
h
|
||||
};
|
||||
|
||||
// Generate nodes
|
||||
let nodes: HashMap<u64, SheafNode> = (0..num_nodes as u64)
|
||||
.map(|id| {
|
||||
let state: Vec<f32> = (0..state_dim)
|
||||
.map(|i| {
|
||||
let mut h = hasher();
|
||||
(id, i).hash(&mut h);
|
||||
(h.finish() % 1000) as f32 / 1000.0 - 0.5
|
||||
})
|
||||
.collect();
|
||||
(id, SheafNode { id, state })
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Generate edges (random graph with target average degree)
|
||||
let num_edges = (num_nodes * avg_degree) / 2;
|
||||
let mut edges = Vec::with_capacity(num_edges);
|
||||
|
||||
for i in 0..num_edges {
|
||||
let mut h = hasher();
|
||||
(seed, i, "edge").hash(&mut h);
|
||||
let source = (h.finish() % num_nodes as u64) as u64;
|
||||
|
||||
let mut h = hasher();
|
||||
(seed, i, "target").hash(&mut h);
|
||||
let target = (h.finish() % num_nodes as u64) as u64;
|
||||
|
||||
if source != target {
|
||||
edges.push(SheafEdge {
|
||||
source,
|
||||
target,
|
||||
weight: 1.0,
|
||||
rho_source: RestrictionMap::identity(state_dim),
|
||||
rho_target: RestrictionMap::identity(state_dim),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
Self {
|
||||
nodes,
|
||||
edges,
|
||||
state_dim,
|
||||
}
|
||||
}
|
||||
|
||||
/// Generate a chain graph (linear topology)
|
||||
pub fn chain(num_nodes: usize, state_dim: usize, seed: u64) -> Self {
|
||||
use std::collections::hash_map::DefaultHasher;
|
||||
use std::hash::{Hash, Hasher};
|
||||
|
||||
let nodes: HashMap<u64, SheafNode> = (0..num_nodes as u64)
|
||||
.map(|id| {
|
||||
let state: Vec<f32> = (0..state_dim)
|
||||
.map(|i| {
|
||||
let mut h = DefaultHasher::new();
|
||||
(seed, id, i).hash(&mut h);
|
||||
(h.finish() % 1000) as f32 / 1000.0 - 0.5
|
||||
})
|
||||
.collect();
|
||||
(id, SheafNode { id, state })
|
||||
})
|
||||
.collect();
|
||||
|
||||
let edges: Vec<SheafEdge> = (0..num_nodes - 1)
|
||||
.map(|i| SheafEdge {
|
||||
source: i as u64,
|
||||
target: (i + 1) as u64,
|
||||
weight: 1.0,
|
||||
rho_source: RestrictionMap::identity(state_dim),
|
||||
rho_target: RestrictionMap::identity(state_dim),
|
||||
})
|
||||
.collect();
|
||||
|
||||
Self {
|
||||
nodes,
|
||||
edges,
|
||||
state_dim,
|
||||
}
|
||||
}
|
||||
|
||||
/// Generate a dense graph (high connectivity)
|
||||
pub fn dense(num_nodes: usize, state_dim: usize, seed: u64) -> Self {
|
||||
use std::collections::hash_map::DefaultHasher;
|
||||
use std::hash::{Hash, Hasher};
|
||||
|
||||
let nodes: HashMap<u64, SheafNode> = (0..num_nodes as u64)
|
||||
.map(|id| {
|
||||
let state: Vec<f32> = (0..state_dim)
|
||||
.map(|i| {
|
||||
let mut h = DefaultHasher::new();
|
||||
(seed, id, i).hash(&mut h);
|
||||
(h.finish() % 1000) as f32 / 1000.0 - 0.5
|
||||
})
|
||||
.collect();
|
||||
(id, SheafNode { id, state })
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Dense: ~30% of possible edges
|
||||
let mut edges = Vec::new();
|
||||
for i in 0..num_nodes as u64 {
|
||||
for j in (i + 1)..num_nodes as u64 {
|
||||
let mut h = DefaultHasher::new();
|
||||
(seed, i, j).hash(&mut h);
|
||||
if h.finish() % 10 < 3 {
|
||||
// 30% probability
|
||||
edges.push(SheafEdge {
|
||||
source: i,
|
||||
target: j,
|
||||
weight: 1.0,
|
||||
rho_source: RestrictionMap::identity(state_dim),
|
||||
rho_target: RestrictionMap::identity(state_dim),
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Self {
|
||||
nodes,
|
||||
edges,
|
||||
state_dim,
|
||||
}
|
||||
}
|
||||
|
||||
/// Compute global coherence energy (sequential)
|
||||
pub fn compute_energy_sequential(&self) -> CoherenceEnergy {
|
||||
let mut source_buf = vec![0.0f32; self.state_dim];
|
||||
let mut target_buf = vec![0.0f32; self.state_dim];
|
||||
|
||||
let edge_energies: Vec<f32> = self
|
||||
.edges
|
||||
.iter()
|
||||
.map(|edge| {
|
||||
let source_state = &self.nodes[&edge.source].state;
|
||||
let target_state = &self.nodes[&edge.target].state;
|
||||
edge.weighted_residual_energy_into(
|
||||
source_state,
|
||||
target_state,
|
||||
&mut source_buf,
|
||||
&mut target_buf,
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
|
||||
let total_energy: f32 = edge_energies.iter().sum();
|
||||
|
||||
CoherenceEnergy {
|
||||
total_energy,
|
||||
edge_energies,
|
||||
}
|
||||
}
|
||||
|
||||
/// Compute global coherence energy (parallel with rayon)
|
||||
#[cfg(feature = "parallel")]
|
||||
pub fn compute_energy_parallel(&self) -> CoherenceEnergy {
|
||||
use rayon::prelude::*;
|
||||
|
||||
let edge_energies: Vec<f32> = self
|
||||
.edges
|
||||
.par_iter()
|
||||
.map(|edge| {
|
||||
let mut source_buf = vec![0.0f32; self.state_dim];
|
||||
let mut target_buf = vec![0.0f32; self.state_dim];
|
||||
let source_state = &self.nodes[&edge.source].state;
|
||||
let target_state = &self.nodes[&edge.target].state;
|
||||
edge.weighted_residual_energy_into(
|
||||
source_state,
|
||||
target_state,
|
||||
&mut source_buf,
|
||||
&mut target_buf,
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
|
||||
let total_energy: f32 = edge_energies.par_iter().sum();
|
||||
|
||||
CoherenceEnergy {
|
||||
total_energy,
|
||||
edge_energies,
|
||||
}
|
||||
}
|
||||
|
||||
/// Compute just total energy (no per-edge tracking)
|
||||
pub fn compute_total_energy(&self) -> f32 {
|
||||
let mut source_buf = vec![0.0f32; self.state_dim];
|
||||
let mut target_buf = vec![0.0f32; self.state_dim];
|
||||
let mut total = 0.0f32;
|
||||
|
||||
for edge in &self.edges {
|
||||
let source_state = &self.nodes[&edge.source].state;
|
||||
let target_state = &self.nodes[&edge.target].state;
|
||||
total += edge.weighted_residual_energy_into(
|
||||
source_state,
|
||||
target_state,
|
||||
&mut source_buf,
|
||||
&mut target_buf,
|
||||
);
|
||||
}
|
||||
|
||||
total
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Benchmarks
|
||||
// ============================================================================
|
||||
|
||||
/// Benchmark full graph energy at various sizes
|
||||
fn bench_full_graph_energy(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("energy_full_graph");
|
||||
|
||||
// ADR-014 target: 10K nodes in <10ms
|
||||
// Test progression: 100, 1K, 10K, 100K
|
||||
for num_nodes in [100, 1_000, 10_000] {
|
||||
let avg_degree = 4;
|
||||
let state_dim = 64;
|
||||
let graph = SheafGraph::random(num_nodes, avg_degree, state_dim, 42);
|
||||
|
||||
group.throughput(Throughput::Elements(graph.edges.len() as u64));
|
||||
|
||||
group.bench_with_input(
|
||||
BenchmarkId::new("sequential", format!("{}nodes", num_nodes)),
|
||||
&num_nodes,
|
||||
|b, _| b.iter(|| black_box(graph.compute_energy_sequential())),
|
||||
);
|
||||
|
||||
// Total energy only (no per-edge allocation)
|
||||
group.bench_with_input(
|
||||
BenchmarkId::new("total_only", format!("{}nodes", num_nodes)),
|
||||
&num_nodes,
|
||||
|b, _| b.iter(|| black_box(graph.compute_total_energy())),
|
||||
);
|
||||
}
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
/// Benchmark with 100K nodes (reduced sample size due to runtime)
|
||||
fn bench_large_graph_energy(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("energy_large_graph");
|
||||
group.sample_size(10);
|
||||
|
||||
let num_nodes = 100_000;
|
||||
let avg_degree = 4;
|
||||
let state_dim = 64;
|
||||
let graph = SheafGraph::random(num_nodes, avg_degree, state_dim, 42);
|
||||
|
||||
group.throughput(Throughput::Elements(graph.edges.len() as u64));
|
||||
|
||||
group.bench_function("100K_nodes_total_energy", |b| {
|
||||
b.iter(|| black_box(graph.compute_total_energy()))
|
||||
});
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
/// Benchmark energy computation for different graph topologies
|
||||
fn bench_topology_impact(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("energy_topology");
|
||||
|
||||
let num_nodes = 1000;
|
||||
let state_dim = 64;
|
||||
|
||||
// Chain topology (sparse, n-1 edges)
|
||||
let chain = SheafGraph::chain(num_nodes, state_dim, 42);
|
||||
group.throughput(Throughput::Elements(chain.edges.len() as u64));
|
||||
group.bench_function("chain_1000", |b| {
|
||||
b.iter(|| black_box(chain.compute_total_energy()))
|
||||
});
|
||||
|
||||
// Random topology (avg degree 4)
|
||||
let random = SheafGraph::random(num_nodes, 4, state_dim, 42);
|
||||
group.throughput(Throughput::Elements(random.edges.len() as u64));
|
||||
group.bench_function("random_1000_deg4", |b| {
|
||||
b.iter(|| black_box(random.compute_total_energy()))
|
||||
});
|
||||
|
||||
// Dense topology (~30% edges)
|
||||
let dense = SheafGraph::dense(100, state_dim, 42); // Smaller for dense
|
||||
group.throughput(Throughput::Elements(dense.edges.len() as u64));
|
||||
group.bench_function("dense_100", |b| {
|
||||
b.iter(|| black_box(dense.compute_total_energy()))
|
||||
});
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
/// Benchmark impact of state dimension on energy computation
|
||||
fn bench_state_dimension(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("energy_state_dim");
|
||||
|
||||
let num_nodes = 1000;
|
||||
let avg_degree = 4;
|
||||
|
||||
for state_dim in [8, 32, 64, 128, 256] {
|
||||
let graph = SheafGraph::random(num_nodes, avg_degree, state_dim, 42);
|
||||
|
||||
group.throughput(Throughput::Elements(graph.edges.len() as u64));
|
||||
group.bench_with_input(BenchmarkId::new("dim", state_dim), &state_dim, |b, _| {
|
||||
b.iter(|| black_box(graph.compute_total_energy()))
|
||||
});
|
||||
}
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
/// Benchmark edge density scaling
|
||||
fn bench_edge_density(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("energy_edge_density");
|
||||
|
||||
let num_nodes = 1000;
|
||||
let state_dim = 64;
|
||||
|
||||
// Varying average degree
|
||||
for avg_degree in [2, 4, 8, 16, 32] {
|
||||
let graph = SheafGraph::random(num_nodes, avg_degree, state_dim, 42);
|
||||
|
||||
group.throughput(Throughput::Elements(graph.edges.len() as u64));
|
||||
group.bench_with_input(
|
||||
BenchmarkId::new("avg_degree", avg_degree),
|
||||
&avg_degree,
|
||||
|b, _| b.iter(|| black_box(graph.compute_total_energy())),
|
||||
);
|
||||
}
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
/// Benchmark scope-based energy aggregation
|
||||
fn bench_scoped_energy(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("energy_scoped");
|
||||
|
||||
let num_nodes = 10_000;
|
||||
let avg_degree = 4;
|
||||
let state_dim = 64;
|
||||
let graph = SheafGraph::random(num_nodes, avg_degree, state_dim, 42);
|
||||
|
||||
// Simulate scope-based aggregation (e.g., by namespace)
|
||||
let num_scopes = 10;
|
||||
let scope_assignments: Vec<usize> = graph
|
||||
.edges
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, _)| i % num_scopes)
|
||||
.collect();
|
||||
|
||||
group.bench_function("aggregate_by_scope", |b| {
|
||||
b.iter(|| {
|
||||
let mut source_buf = vec![0.0f32; state_dim];
|
||||
let mut target_buf = vec![0.0f32; state_dim];
|
||||
let mut scope_energies = vec![0.0f32; num_scopes];
|
||||
|
||||
for (i, edge) in graph.edges.iter().enumerate() {
|
||||
let source_state = &graph.nodes[&edge.source].state;
|
||||
let target_state = &graph.nodes[&edge.target].state;
|
||||
let energy = edge.weighted_residual_energy_into(
|
||||
source_state,
|
||||
target_state,
|
||||
&mut source_buf,
|
||||
&mut target_buf,
|
||||
);
|
||||
scope_energies[scope_assignments[i]] += energy;
|
||||
}
|
||||
|
||||
black_box(scope_energies)
|
||||
})
|
||||
});
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
/// Benchmark energy fingerprint computation
|
||||
fn bench_energy_fingerprint(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("energy_fingerprint");
|
||||
|
||||
let num_nodes = 1000;
|
||||
let avg_degree = 4;
|
||||
let state_dim = 64;
|
||||
let graph = SheafGraph::random(num_nodes, avg_degree, state_dim, 42);
|
||||
|
||||
group.bench_function("compute_with_fingerprint", |b| {
|
||||
b.iter(|| {
|
||||
let energy = graph.compute_energy_sequential();
|
||||
|
||||
// Compute fingerprint from edge energies
|
||||
let mut fingerprint = 0u64;
|
||||
for e in &energy.edge_energies {
|
||||
fingerprint ^= e.to_bits() as u64;
|
||||
fingerprint = fingerprint.rotate_left(7);
|
||||
}
|
||||
|
||||
black_box((energy.total_energy, fingerprint))
|
||||
})
|
||||
});
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
/// Benchmark memory access patterns for energy computation
|
||||
fn bench_memory_patterns(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("energy_memory");
|
||||
|
||||
let num_nodes = 10_000;
|
||||
let state_dim = 64;
|
||||
|
||||
// Sequential node access (chain)
|
||||
let chain = SheafGraph::chain(num_nodes, state_dim, 42);
|
||||
group.bench_function("sequential_access", |b| {
|
||||
b.iter(|| black_box(chain.compute_total_energy()))
|
||||
});
|
||||
|
||||
// Random node access
|
||||
let random = SheafGraph::random(num_nodes, 4, state_dim, 42);
|
||||
group.bench_function("random_access", |b| {
|
||||
b.iter(|| black_box(random.compute_total_energy()))
|
||||
});
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
criterion_group!(
|
||||
benches,
|
||||
bench_full_graph_energy,
|
||||
bench_large_graph_energy,
|
||||
bench_topology_impact,
|
||||
bench_state_dimension,
|
||||
bench_edge_density,
|
||||
bench_scoped_energy,
|
||||
bench_energy_fingerprint,
|
||||
bench_memory_patterns,
|
||||
);
|
||||
|
||||
criterion_main!(benches);
|
||||
629
vendor/ruvector/crates/prime-radiant/benches/gate_bench.rs
vendored
Normal file
629
vendor/ruvector/crates/prime-radiant/benches/gate_bench.rs
vendored
Normal file
@@ -0,0 +1,629 @@
|
||||
//! Benchmarks for coherence gate evaluation
|
||||
//!
|
||||
//! ADR-014 Performance Target: < 500us per gate evaluation
|
||||
//!
|
||||
//! The gate is a deterministic decision point that:
|
||||
//! 1. Evaluates current energy against thresholds
|
||||
//! 2. Checks persistence history
|
||||
//! 3. Determines compute lane (Reflex/Retrieval/Heavy/Human)
|
||||
//! 4. Creates witness record
|
||||
|
||||
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
|
||||
use std::collections::VecDeque;
|
||||
use std::time::Duration;
|
||||
|
||||
// ============================================================================
|
||||
// Types (Simulated for benchmarking)
|
||||
// ============================================================================
|
||||
|
||||
/// Compute lanes for escalating complexity.
///
/// Ordered so comparisons follow escalation severity:
/// `Reflex < Retrieval < Heavy < Human`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum ComputeLane {
    /// Lane 0: local residual updates (<1ms).
    Reflex = 0,
    /// Lane 1: evidence fetching (~10ms).
    Retrieval = 1,
    /// Lane 2: multi-step planning (~100ms).
    Heavy = 2,
    /// Lane 3: human escalation.
    Human = 3,
}
|
||||
|
||||
/// Coherence energy snapshot.
#[derive(Clone)]
pub struct CoherenceEnergy {
    /// Aggregate energy across all scopes.
    pub total_energy: f32,
    /// Per-scope breakdown as (scope_id, energy) pairs.
    pub scope_energies: Vec<(u64, f32)>,
    /// Snapshot timestamp (set to 0 by `new`).
    pub timestamp: u64,
    /// Deterministic digest of the snapshot, derived from `total_energy`.
    pub fingerprint: u64,
}

impl CoherenceEnergy {
    /// Builds a snapshot with `total` spread evenly over `num_scopes` scopes.
    pub fn new(total: f32, num_scopes: usize) -> Self {
        let per_scope = total / num_scopes as f32;
        let scope_energies: Vec<(u64, f32)> =
            (0..num_scopes as u64).map(|id| (id, per_scope)).collect();

        Self {
            total_energy: total,
            scope_energies,
            timestamp: 0,
            fingerprint: (total.to_bits() as u64).wrapping_mul(0x517cc1b727220a95),
        }
    }

    /// Energy recorded for `scope_id`, or 0.0 for unknown scopes.
    pub fn scope_energy(&self, scope_id: u64) -> f32 {
        for &(id, e) in &self.scope_energies {
            if id == scope_id {
                return e;
            }
        }
        0.0
    }
}
|
||||
|
||||
/// Action to be gated.
#[derive(Clone)]
pub struct Action {
    /// Action identifier.
    pub id: u64,
    /// Scope whose energy gates this action.
    pub scope_id: u64,
    /// Broad category of the action.
    pub action_type: ActionType,
    /// Hash of the action payload; folded into witness records.
    pub payload_hash: u64,
}

/// Broad action categories recognized by the gate.
#[derive(Clone, Copy)]
pub enum ActionType {
    Read,
    Write,
    Execute,
    External,
}
|
||||
|
||||
/// Threshold configuration.
#[derive(Clone)]
pub struct ThresholdConfig {
    /// Below this energy the gate stays in the Reflex lane.
    pub reflex: f32,
    /// Below this energy the gate routes to Retrieval.
    pub retrieval: f32,
    /// Below this energy the gate routes to Heavy; at or above, Human.
    pub heavy: f32,
    /// Window over which persistence/trend of incoherence is checked.
    pub persistence_window_ms: u64,
}

impl Default for ThresholdConfig {
    fn default() -> Self {
        // Escalation ladder 0.1 / 0.5 / 1.0 over a 5-second window.
        Self {
            reflex: 0.1,
            retrieval: 0.5,
            heavy: 1.0,
            persistence_window_ms: 5000,
        }
    }
}
|
||||
|
||||
/// Energy history for persistence detection.
pub struct EnergyHistory {
    /// Rolling window of (timestamp_ms, energy) pairs per scope;
    /// `history.len() == max_scopes` always.
    history: Vec<VecDeque<(u64, f32)>>,
    /// Number of scopes tracked; samples for ids >= this are dropped.
    max_scopes: usize,
    /// Maximum samples kept per scope before the oldest is evicted.
    window_size: usize,
}

impl EnergyHistory {
    /// Creates a history tracking `max_scopes` scopes, `window_size` samples each.
    pub fn new(max_scopes: usize, window_size: usize) -> Self {
        Self {
            history: (0..max_scopes)
                .map(|_| VecDeque::with_capacity(window_size))
                .collect(),
            max_scopes,
            window_size,
        }
    }

    /// Records a sample, evicting the oldest once the window is full.
    /// Samples for out-of-range scope ids are silently ignored.
    pub fn record(&mut self, scope_id: u64, timestamp_ms: u64, energy: f32) {
        // `get_mut` replaces the manual bound check against `max_scopes`;
        // the two are equivalent because history.len() == max_scopes.
        if let Some(queue) = self.history.get_mut(scope_id as usize) {
            if queue.len() >= self.window_size {
                queue.pop_front();
            }
            queue.push_back((timestamp_ms, energy));
        }
    }

    /// True iff at least one sample falls inside the window ending at
    /// `current_time_ms` and every such sample is >= `threshold`.
    /// Unknown scopes and empty windows return false.
    pub fn is_above_threshold(
        &self,
        scope_id: u64,
        threshold: f32,
        window_ms: u64,
        current_time_ms: u64,
    ) -> bool {
        let queue = match self.history.get(scope_id as usize) {
            Some(q) => q,
            None => return false,
        };
        let cutoff = current_time_ms.saturating_sub(window_ms);

        // Single pass with early exit; the original collected all in-window
        // samples into a Vec first (needless allocation per call).
        let mut saw_sample = false;
        for &(ts, e) in queue {
            if ts >= cutoff {
                if e < threshold {
                    return false;
                }
                saw_sample = true;
            }
        }
        saw_sample
    }

    /// Coarse trend over the window: (last - first) / sample_count.
    /// Returns None for unknown scopes or fewer than two in-window samples.
    pub fn trend(&self, scope_id: u64, window_ms: u64, current_time_ms: u64) -> Option<f32> {
        let queue = match self.history.get(scope_id as usize) {
            Some(q) => q,
            None => return None,
        };
        let cutoff = current_time_ms.saturating_sub(window_ms);

        // Track first/last/count in one pass; the original allocated a Vec
        // of every in-window sample just to read its endpoints.
        let mut first: Option<f32> = None;
        let mut last = 0.0f32;
        let mut count = 0usize;
        for &(ts, e) in queue {
            if ts >= cutoff {
                if first.is_none() {
                    first = Some(e);
                }
                last = e;
                count += 1;
            }
        }

        if count < 2 {
            return None;
        }
        // `first` is Some whenever count >= 2.
        Some((last - first.unwrap()) / count as f32)
    }
}
|
||||
|
||||
/// Witness record for audit
///
/// Immutable snapshot of a single gate decision, sealed by `content_hash`
/// so decisions can be checked for tampering after the fact.
#[derive(Clone)]
pub struct WitnessRecord {
    /// Record id (currently just the decision timestamp — see `new`).
    pub id: u64,
    /// Copied from `Action::payload_hash`.
    pub action_hash: u64,
    /// Copied from `CoherenceEnergy::fingerprint`.
    pub energy_fingerprint: u64,
    /// Lane the gate ultimately selected.
    pub lane: ComputeLane,
    /// Whether the action was allowed to proceed.
    pub allowed: bool,
    /// Decision timestamp supplied by the caller.
    pub timestamp: u64,
    /// Digest over all of the above; see `compute_hash`.
    pub content_hash: u64,
}

impl WitnessRecord {
    /// Builds a witness for one gate decision and seals it with `content_hash`.
    pub fn new(
        action: &Action,
        energy: &CoherenceEnergy,
        lane: ComputeLane,
        allowed: bool,
        timestamp: u64,
    ) -> Self {
        let content_hash = Self::compute_hash(action, energy, lane, allowed, timestamp);

        Self {
            id: timestamp, // Simplified
            action_hash: action.payload_hash,
            energy_fingerprint: energy.fingerprint,
            lane,
            allowed,
            timestamp,
            content_hash,
        }
    }

    /// Mixes the decision's inputs into one u64 via a multiply-xor chain
    /// with a fixed odd constant. Deterministic and order-sensitive, but
    /// NOT cryptographic.
    fn compute_hash(
        action: &Action,
        energy: &CoherenceEnergy,
        lane: ComputeLane,
        allowed: bool,
        timestamp: u64,
    ) -> u64 {
        // Simplified hash computation (in production: use Blake3)
        let mut h = action.payload_hash;
        h = h.wrapping_mul(0x517cc1b727220a95);
        h ^= energy.fingerprint;
        h = h.wrapping_mul(0x517cc1b727220a95);
        // `<<` binds tighter than `|`: lane lands in the high word,
        // `allowed` in bit 0.
        h ^= (lane as u64) << 32 | (allowed as u64);
        h = h.wrapping_mul(0x517cc1b727220a95);
        h ^= timestamp;
        h
    }
}
|
||||
|
||||
/// Gate decision result
|
||||
pub struct GateDecision {
|
||||
pub allow: bool,
|
||||
pub lane: ComputeLane,
|
||||
pub witness: WitnessRecord,
|
||||
pub denial_reason: Option<&'static str>,
|
||||
}
|
||||
|
||||
/// Coherence gate
|
||||
pub struct CoherenceGate {
|
||||
pub config: ThresholdConfig,
|
||||
pub history: EnergyHistory,
|
||||
current_time_ms: u64,
|
||||
}
|
||||
|
||||
impl CoherenceGate {
|
||||
pub fn new(config: ThresholdConfig, max_scopes: usize) -> Self {
|
||||
Self {
|
||||
config,
|
||||
history: EnergyHistory::new(max_scopes, 100),
|
||||
current_time_ms: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Evaluate whether action should proceed
|
||||
pub fn evaluate(&mut self, action: &Action, energy: &CoherenceEnergy) -> GateDecision {
|
||||
let current_energy = energy.scope_energy(action.scope_id);
|
||||
|
||||
// Record in history
|
||||
self.history
|
||||
.record(action.scope_id, self.current_time_ms, current_energy);
|
||||
|
||||
// Determine lane based on energy
|
||||
let lane = if current_energy < self.config.reflex {
|
||||
ComputeLane::Reflex
|
||||
} else if current_energy < self.config.retrieval {
|
||||
ComputeLane::Retrieval
|
||||
} else if current_energy < self.config.heavy {
|
||||
ComputeLane::Heavy
|
||||
} else {
|
||||
ComputeLane::Human
|
||||
};
|
||||
|
||||
// Check for persistent incoherence
|
||||
let persistent = self.history.is_above_threshold(
|
||||
action.scope_id,
|
||||
self.config.retrieval,
|
||||
self.config.persistence_window_ms,
|
||||
self.current_time_ms,
|
||||
);
|
||||
|
||||
// Check for growing incoherence (trend)
|
||||
let growing = self
|
||||
.history
|
||||
.trend(
|
||||
action.scope_id,
|
||||
self.config.persistence_window_ms,
|
||||
self.current_time_ms,
|
||||
)
|
||||
.map(|t| t > 0.01)
|
||||
.unwrap_or(false);
|
||||
|
||||
// Escalate if persistent and not already at high lane
|
||||
let final_lane = if (persistent || growing) && lane < ComputeLane::Heavy {
|
||||
ComputeLane::Heavy
|
||||
} else {
|
||||
lane
|
||||
};
|
||||
|
||||
// Allow unless Human lane
|
||||
let allow = final_lane < ComputeLane::Human;
|
||||
|
||||
let denial_reason = if !allow {
|
||||
Some("Energy exceeds all automatic thresholds")
|
||||
} else if persistent {
|
||||
Some("Persistent incoherence - escalated")
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let witness = WitnessRecord::new(action, energy, final_lane, allow, self.current_time_ms);
|
||||
|
||||
self.current_time_ms += 1;
|
||||
|
||||
GateDecision {
|
||||
allow,
|
||||
lane: final_lane,
|
||||
witness,
|
||||
denial_reason,
|
||||
}
|
||||
}
|
||||
|
||||
/// Fast path evaluation (no history update)
|
||||
#[inline]
|
||||
pub fn evaluate_fast(&self, scope_energy: f32) -> ComputeLane {
|
||||
if scope_energy < self.config.reflex {
|
||||
ComputeLane::Reflex
|
||||
} else if scope_energy < self.config.retrieval {
|
||||
ComputeLane::Retrieval
|
||||
} else if scope_energy < self.config.heavy {
|
||||
ComputeLane::Heavy
|
||||
} else {
|
||||
ComputeLane::Human
|
||||
}
|
||||
}
|
||||
|
||||
/// Advance time (for benchmarking)
|
||||
pub fn advance_time(&mut self, delta_ms: u64) {
|
||||
self.current_time_ms += delta_ms;
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Benchmarks
|
||||
// ============================================================================
|
||||
|
||||
/// Benchmark full gate evaluation
|
||||
fn bench_gate_evaluate(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("gate_evaluate");
|
||||
group.throughput(Throughput::Elements(1));
|
||||
|
||||
let config = ThresholdConfig::default();
|
||||
let mut gate = CoherenceGate::new(config, 100);
|
||||
|
||||
let action = Action {
|
||||
id: 1,
|
||||
scope_id: 0,
|
||||
action_type: ActionType::Write,
|
||||
payload_hash: 0x12345678,
|
||||
};
|
||||
|
||||
// Low energy (Reflex lane)
|
||||
let low_energy = CoherenceEnergy::new(0.05, 10);
|
||||
group.bench_function("low_energy_reflex", |b| {
|
||||
b.iter(|| {
|
||||
let decision = gate.evaluate(black_box(&action), black_box(&low_energy));
|
||||
black_box(decision.lane)
|
||||
})
|
||||
});
|
||||
|
||||
// Medium energy (Retrieval lane)
|
||||
let med_energy = CoherenceEnergy::new(0.3, 10);
|
||||
group.bench_function("medium_energy_retrieval", |b| {
|
||||
b.iter(|| {
|
||||
let decision = gate.evaluate(black_box(&action), black_box(&med_energy));
|
||||
black_box(decision.lane)
|
||||
})
|
||||
});
|
||||
|
||||
// High energy (Heavy lane)
|
||||
let high_energy = CoherenceEnergy::new(0.8, 10);
|
||||
group.bench_function("high_energy_heavy", |b| {
|
||||
b.iter(|| {
|
||||
let decision = gate.evaluate(black_box(&action), black_box(&high_energy));
|
||||
black_box(decision.lane)
|
||||
})
|
||||
});
|
||||
|
||||
// Critical energy (Human lane)
|
||||
let critical_energy = CoherenceEnergy::new(2.0, 10);
|
||||
group.bench_function("critical_energy_human", |b| {
|
||||
b.iter(|| {
|
||||
let decision = gate.evaluate(black_box(&action), black_box(&critical_energy));
|
||||
black_box(decision.lane)
|
||||
})
|
||||
});
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
/// Benchmark fast path evaluation (no history)
|
||||
fn bench_gate_fast_path(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("gate_fast_path");
|
||||
group.throughput(Throughput::Elements(1));
|
||||
|
||||
let config = ThresholdConfig::default();
|
||||
let gate = CoherenceGate::new(config, 100);
|
||||
|
||||
for energy in [0.05, 0.3, 0.8, 2.0] {
|
||||
group.bench_with_input(
|
||||
BenchmarkId::new("evaluate_fast", format!("{:.2}", energy)),
|
||||
&energy,
|
||||
|b, &e| b.iter(|| black_box(gate.evaluate_fast(black_box(e)))),
|
||||
);
|
||||
}
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
/// Benchmark witness record creation
///
/// Measures the cost of constructing a `WitnessRecord` from a fixed
/// action/energy pair; no gate evaluation is involved.
fn bench_witness_creation(c: &mut Criterion) {
    let mut group = c.benchmark_group("gate_witness");
    group.throughput(Throughput::Elements(1));

    // Fixed inputs so only WitnessRecord::new is on the measured path.
    let action = Action {
        id: 1,
        scope_id: 0,
        action_type: ActionType::Write,
        payload_hash: 0x12345678,
    };
    let energy = CoherenceEnergy::new(0.3, 10);

    group.bench_function("create_witness", |b| {
        b.iter(|| {
            WitnessRecord::new(
                black_box(&action),
                black_box(&energy),
                black_box(ComputeLane::Retrieval),
                black_box(true),
                // Presumably a timestamp/sequence argument — confirm with
                // WitnessRecord::new's signature.
                black_box(12345),
            )
        })
    });

    group.finish();
}
|
||||
|
||||
/// Benchmark history operations
///
/// Measures `EnergyHistory` record/query performance on a history
/// pre-populated with 500 ticks across 10 scopes.
fn bench_history_operations(c: &mut Criterion) {
    let mut group = c.benchmark_group("gate_history");

    let mut history = EnergyHistory::new(100, 1000);

    // Pre-populate with some history
    for t in 0..500 {
        for scope in 0..10u64 {
            history.record(scope, t, 0.3 + (t % 10) as f32 * 0.01);
        }
    }

    // Record operation — timestamps increase monotonically across iterations
    // so each record is a fresh, later entry.
    group.bench_function("record_single", |b| {
        let mut t = 1000u64;
        b.iter(|| {
            history.record(black_box(5), black_box(t), black_box(0.35));
            t += 1;
        })
    });

    // Check threshold
    group.bench_function("check_threshold", |b| {
        b.iter(|| {
            history.is_above_threshold(black_box(5), black_box(0.3), black_box(100), black_box(500))
        })
    });

    // Compute trend
    group.bench_function("compute_trend", |b| {
        b.iter(|| history.trend(black_box(5), black_box(100), black_box(500)))
    });

    group.finish();
}
|
||||
|
||||
/// Benchmark persistence detection with various window sizes
///
/// Scales `EnergyHistory::is_above_threshold` over window sizes of
/// 10/100/1000 entries, with the window fully populated above threshold.
fn bench_persistence_detection(c: &mut Criterion) {
    let mut group = c.benchmark_group("gate_persistence");

    for window_size in [10, 100, 1000] {
        let mut history = EnergyHistory::new(10, window_size);

        // Fill history
        for t in 0..window_size as u64 {
            history.record(0, t, 0.4); // Consistently above retrieval threshold
        }

        group.bench_with_input(
            BenchmarkId::new("check_persistent", window_size),
            &window_size,
            |b, &size| {
                b.iter(|| {
                    // Query spans the entire recorded window, so this is the
                    // worst case for a scan-based implementation.
                    history.is_above_threshold(
                        black_box(0),
                        black_box(0.3),
                        black_box(size as u64),
                        black_box(size as u64),
                    )
                })
            },
        );
    }

    group.finish();
}
|
||||
|
||||
/// Benchmark batch evaluation (multiple actions)
///
/// Evaluates batches of 10/100/1000 pre-built action/energy pairs through
/// a single gate; throughput is reported per action.
fn bench_batch_evaluation(c: &mut Criterion) {
    let mut group = c.benchmark_group("gate_batch");

    let config = ThresholdConfig::default();
    let mut gate = CoherenceGate::new(config, 100);

    for batch_size in [10, 100, 1000] {
        // Actions spread across 10 scopes; payload hashes derived from a
        // Fibonacci-hashing-style multiplier to vary the payload field.
        let actions: Vec<Action> = (0..batch_size)
            .map(|i| Action {
                id: i as u64,
                scope_id: (i % 10) as u64,
                action_type: ActionType::Write,
                payload_hash: i as u64 * 0x517cc1b727220a95,
            })
            .collect();

        // Energies cycle through 0.1..1.05 so all lanes are exercised.
        let energies: Vec<CoherenceEnergy> = (0..batch_size)
            .map(|i| CoherenceEnergy::new(0.1 + (i % 20) as f32 * 0.05, 10))
            .collect();

        group.throughput(Throughput::Elements(batch_size as u64));
        group.bench_with_input(
            BenchmarkId::new("evaluate_batch", batch_size),
            &batch_size,
            |b, _| {
                b.iter(|| {
                    let mut lanes = Vec::with_capacity(actions.len());
                    for (action, energy) in actions.iter().zip(energies.iter()) {
                        let decision = gate.evaluate(action, energy);
                        lanes.push(decision.lane);
                    }
                    black_box(lanes)
                })
            },
        );
    }

    group.finish();
}
|
||||
|
||||
/// Benchmark scope energy lookup
///
/// Measures `CoherenceEnergy::scope_energy` against energies holding
/// 10/100/1000 scopes; the probed scope id sits in the middle of the range.
fn bench_scope_lookup(c: &mut Criterion) {
    let mut group = c.benchmark_group("gate_scope_lookup");

    for num_scopes in [10, 100, 1000] {
        let energy = CoherenceEnergy::new(1.0, num_scopes);

        group.bench_with_input(
            BenchmarkId::new("lookup", num_scopes),
            &num_scopes,
            |b, &n| {
                // Mid-range scope id; hoisted outside the timed closure.
                let scope_id = (n / 2) as u64;
                b.iter(|| black_box(energy.scope_energy(black_box(scope_id))))
            },
        );
    }

    group.finish();
}
|
||||
|
||||
/// Benchmark threshold comparison patterns
///
/// Compares two ways of classifying 1000 energies into 4 lanes:
/// a sequential if-else chain vs `partition_point` over a sorted
/// threshold array. Both count how many energies land in each lane.
fn bench_threshold_comparison(c: &mut Criterion) {
    let mut group = c.benchmark_group("gate_threshold_cmp");

    let config = ThresholdConfig::default();

    // Sequential if-else (current implementation)
    group.bench_function("sequential_if_else", |b| {
        // Energies 0.0..2.0 in 0.002 steps, covering all bands.
        let energies: Vec<f32> = (0..1000).map(|i| (i as f32) * 0.002).collect();
        b.iter(|| {
            let mut lanes = [0u32; 4];
            for &e in &energies {
                let lane = if e < config.reflex {
                    0
                } else if e < config.retrieval {
                    1
                } else if e < config.heavy {
                    2
                } else {
                    3
                };
                lanes[lane] += 1;
            }
            black_box(lanes)
        })
    });

    // Binary search pattern
    group.bench_function("binary_search", |b| {
        // f32::MAX sentinel keeps partition_point in range; lane index is
        // the count of thresholds <= e, matching the if-else chain above.
        let thresholds = [config.reflex, config.retrieval, config.heavy, f32::MAX];
        let energies: Vec<f32> = (0..1000).map(|i| (i as f32) * 0.002).collect();
        b.iter(|| {
            let mut lanes = [0u32; 4];
            for &e in &energies {
                let lane = thresholds.partition_point(|&t| t <= e);
                lanes[lane.min(3)] += 1;
            }
            black_box(lanes)
        })
    });

    group.finish();
}
|
||||
|
||||
// Register all gate benchmarks under a single criterion group and entry point.
criterion_group!(
    benches,
    bench_gate_evaluate,
    bench_gate_fast_path,
    bench_witness_creation,
    bench_history_operations,
    bench_persistence_detection,
    bench_batch_evaluation,
    bench_scope_lookup,
    bench_threshold_comparison,
);

criterion_main!(benches);
|
||||
784
vendor/ruvector/crates/prime-radiant/benches/gpu_benchmarks.rs
vendored
Normal file
784
vendor/ruvector/crates/prime-radiant/benches/gpu_benchmarks.rs
vendored
Normal file
@@ -0,0 +1,784 @@
|
||||
//! GPU-Specific Benchmarks for Prime-Radiant Coherence Engine
|
||||
//!
|
||||
//! This benchmark suite compares CPU and GPU implementations of core
|
||||
//! coherence operations. Requires the `gpu` feature to be enabled.
|
||||
//!
|
||||
//! ## Benchmark Categories
|
||||
//! 1. Energy Computation - CPU vs GPU
|
||||
//! 2. Attention Forward Pass - CPU vs GPU
|
||||
//! 3. Batch Routing Decisions - CPU vs GPU
|
||||
//! 4. Memory Transfer Overhead
|
||||
//!
|
||||
//! ## GPU Backend Notes
|
||||
//! - Primary: wgpu (cross-platform WebGPU)
|
||||
//! - Optional: CUDA (NVIDIA), Metal (Apple), Vulkan
|
||||
//!
|
||||
//! ## Running GPU Benchmarks
|
||||
//! ```bash
|
||||
//! cargo bench --features gpu --bench gpu_benchmarks
|
||||
//! ```
|
||||
|
||||
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
|
||||
use std::collections::hash_map::DefaultHasher;
|
||||
use std::collections::HashMap;
|
||||
use std::hash::{Hash, Hasher};
|
||||
|
||||
// ============================================================================
|
||||
// TEST DATA GENERATION
|
||||
// ============================================================================
|
||||
|
||||
/// Deterministic pseudo-random vector of `len` values in [-0.5, 0.5).
///
/// Each element is derived by hashing `(seed, index)`, so the same seed
/// always reproduces the same data for stable benchmark inputs.
fn generate_vec(len: usize, seed: u64) -> Vec<f32> {
    let mut out = Vec::with_capacity(len);
    for i in 0..len {
        let mut hasher = DefaultHasher::new();
        (seed, i).hash(&mut hasher);
        // Fold the 64-bit hash into a small fixed range, then center on 0.
        out.push((hasher.finish() % 1000) as f32 / 1000.0 - 0.5);
    }
    out
}
|
||||
|
||||
/// Deterministic pseudo-random row-major `rows x cols` matrix with entries
/// in [-0.5, 0.5), derived by hashing `(seed, flat_index)`.
fn generate_matrix(rows: usize, cols: usize, seed: u64) -> Vec<f32> {
    let total = rows * cols;
    let mut out = Vec::with_capacity(total);
    for i in 0..total {
        let mut hasher = DefaultHasher::new();
        (seed, i).hash(&mut hasher);
        out.push((hasher.finish() % 1000) as f32 / 1000.0 - 0.5);
    }
    out
}
|
||||
|
||||
// ============================================================================
|
||||
// CPU BASELINE IMPLEMENTATIONS
|
||||
// ============================================================================
|
||||
|
||||
/// CPU coherence energy computation over a simple weighted graph.
#[derive(Clone)]
struct CpuSheafGraph {
    /// Per-node state vectors, each `state_dim` long.
    nodes: HashMap<u64, Vec<f32>>,
    /// (source, target, weight) triples.
    edges: Vec<(u64, u64, f32)>, // (source, target, weight)
    /// Dimensionality of every node state.
    state_dim: usize,
}

impl CpuSheafGraph {
    /// Build a deterministic pseudo-random graph with `num_nodes` nodes,
    /// roughly `num_nodes * avg_degree / 2` edges (self-loops dropped) and
    /// `state_dim`-dimensional node states. Same `seed` → same graph.
    fn random(num_nodes: usize, avg_degree: usize, state_dim: usize, seed: u64) -> Self {
        // Deterministic state generator: hash (node_seed, index) into
        // [-0.5, 0.5) — same scheme as the file's generate_vec helper.
        let make_state = |node_seed: u64| -> Vec<f32> {
            (0..state_dim)
                .map(|i| {
                    let mut h = DefaultHasher::new();
                    (node_seed, i).hash(&mut h);
                    (h.finish() % 1000) as f32 / 1000.0 - 0.5
                })
                .collect()
        };

        let mut nodes = HashMap::new();
        for id in 0..num_nodes as u64 {
            nodes.insert(id, make_state(seed + id));
        }

        // Draw endpoint pairs from independent hash streams; discard
        // self-loops, so the final edge count may be slightly lower.
        let requested = (num_nodes * avg_degree) / 2;
        let mut edges = Vec::with_capacity(requested);
        for i in 0..requested {
            let mut h = DefaultHasher::new();
            (seed, i, "src").hash(&mut h);
            let source = h.finish() % num_nodes as u64;

            let mut h = DefaultHasher::new();
            (seed, i, "tgt").hash(&mut h);
            let target = h.finish() % num_nodes as u64;

            if source != target {
                edges.push((source, target, 1.0));
            }
        }

        Self {
            nodes,
            edges,
            state_dim,
        }
    }

    /// Compute total energy on CPU: sum over edges of
    /// `weight * ||src_state - tgt_state||^2`.
    fn compute_energy_cpu(&self) -> f32 {
        let mut total = 0.0f32;
        for &(src, tgt, weight) in &self.edges {
            let (a, b) = (&self.nodes[&src], &self.nodes[&tgt]);
            // Squared residual norm across the state dimensions.
            let norm_sq: f32 = (0..self.state_dim)
                .map(|i| {
                    let diff = a[i] - b[i];
                    diff * diff
                })
                .sum();
            total += weight * norm_sq;
        }
        total
    }

    /// Compute energy with per-edge results on CPU.
    ///
    /// Returns the total and the individual edge energies (same order as
    /// `self.edges`), useful for hotspot detection.
    fn compute_energy_with_edges_cpu(&self) -> (f32, Vec<f32>) {
        let mut edge_energies = Vec::with_capacity(self.edges.len());
        for &(src, tgt, weight) in &self.edges {
            let (a, b) = (&self.nodes[&src], &self.nodes[&tgt]);
            let mut norm_sq = 0.0f32;
            for i in 0..self.state_dim {
                let diff = a[i] - b[i];
                norm_sq += diff * diff;
            }
            edge_energies.push(weight * norm_sq);
        }
        let total: f32 = edge_energies.iter().sum();
        (total, edge_energies)
    }
}
|
||||
|
||||
/// CPU single-head scaled dot-product attention (simplified).
///
/// `queries`, `keys`, `values` are flat row-major `[seq_len, head_dim]`
/// buffers; the result is written into `output` with the same layout.
/// Runs in O(seq_len^2 * head_dim).
fn attention_forward_cpu(
    queries: &[f32],
    keys: &[f32],
    values: &[f32],
    seq_len: usize,
    head_dim: usize,
    output: &mut [f32],
) {
    let scale = 1.0 / (head_dim as f32).sqrt();

    for row in 0..seq_len {
        let q_base = row * head_dim;

        // Scaled dot-product score against every key, tracking the running
        // maximum for a numerically stable softmax.
        let mut scores = vec![0.0f32; seq_len];
        let mut max_score = f32::NEG_INFINITY;
        for col in 0..seq_len {
            let k_base = col * head_dim;
            let mut dot = 0.0f32;
            for d in 0..head_dim {
                dot += queries[q_base + d] * keys[k_base + d];
            }
            scores[col] = dot * scale;
            if scores[col] > max_score {
                max_score = scores[col];
            }
        }

        // Softmax in place: shift by the max, exponentiate, normalize.
        let mut denom = 0.0f32;
        for s in scores.iter_mut() {
            *s = (*s - max_score).exp();
            denom += *s;
        }
        for s in scores.iter_mut() {
            *s /= denom;
        }

        // Attention-weighted sum over the value rows.
        let out_base = row * head_dim;
        for d in 0..head_dim {
            let mut acc = 0.0f32;
            for col in 0..seq_len {
                acc += scores[col] * values[col * head_dim + d];
            }
            output[out_base + d] = acc;
        }
    }
}
|
||||
|
||||
/// CPU batch routing (expert selection for MoE).
///
/// `token_embeddings` is `[num_tokens, embed_dim]`, `expert_weights` is
/// `[num_experts, embed_dim]`, both flat row-major. For each token, returns
/// `(token_index, top_k expert indices)` ranked by dot-product score
/// (descending; ties keep lower expert indices first via stable sort).
fn batch_routing_cpu(
    token_embeddings: &[f32],
    expert_weights: &[f32],
    num_tokens: usize,
    embed_dim: usize,
    num_experts: usize,
    top_k: usize,
) -> Vec<(usize, Vec<usize>)> {
    let mut assignments = Vec::with_capacity(num_tokens);

    for tok in 0..num_tokens {
        let base = tok * embed_dim;
        let embedding = &token_embeddings[base..base + embed_dim];

        // Dot-product affinity of this token with every expert.
        let mut scored: Vec<(usize, f32)> = Vec::with_capacity(num_experts);
        for ex in 0..num_experts {
            let w = &expert_weights[ex * embed_dim..(ex + 1) * embed_dim];
            let mut dot = 0.0f32;
            for d in 0..embed_dim {
                dot += embedding[d] * w[d];
            }
            scored.push((ex, dot));
        }

        // Stable descending sort; NaN scores compare as equal and keep
        // their relative order.
        scored.sort_by(|l, r| r.1.partial_cmp(&l.1).unwrap_or(std::cmp::Ordering::Equal));
        let chosen: Vec<usize> = scored.iter().take(top_k).map(|&(ex, _)| ex).collect();

        assignments.push((tok, chosen));
    }

    assignments
}
|
||||
|
||||
// ============================================================================
|
||||
// GPU IMPLEMENTATIONS (SIMULATED WITHOUT ACTUAL GPU)
|
||||
// When gpu feature is enabled, these would use actual GPU code
|
||||
// ============================================================================
|
||||
|
||||
#[cfg(feature = "gpu")]
mod gpu_impl {
    //! GPU implementations using wgpu or similar
    //!
    //! These would contain actual GPU shader code and buffer management.
    //! For now, we simulate the overhead.

    use super::*;

    /// Simulated GPU energy computation
    /// In reality, this would:
    /// 1. Upload node states to GPU buffer
    /// 2. Execute compute shader for parallel residual computation
    /// 3. Reduce edge energies
    /// 4. Read back result
    pub fn compute_energy_gpu(graph: &CpuSheafGraph) -> f32 {
        // Simulate GPU overhead (upload of all node states; f32 = 4 bytes).
        let _upload_time = simulate_memory_transfer(
            graph.nodes.len() * graph.state_dim * 4, // bytes
            true, // host to device
        );

        // Actual computation would happen on GPU
        // Here we just call CPU version
        let result = graph.compute_energy_cpu();

        let _download_time = simulate_memory_transfer(
            4, // single f32 result
            false,
        );

        result
    }

    /// Simulated GPU attention forward pass
    pub fn attention_forward_gpu(
        queries: &[f32],
        keys: &[f32],
        values: &[f32],
        seq_len: usize,
        head_dim: usize,
        output: &mut [f32],
    ) {
        // Simulate upload
        let input_bytes = (queries.len() + keys.len() + values.len()) * 4;
        let _upload_time = simulate_memory_transfer(input_bytes, true);

        // CPU fallback
        attention_forward_cpu(queries, keys, values, seq_len, head_dim, output);

        // Simulate download
        let _download_time = simulate_memory_transfer(output.len() * 4, false);
    }

    /// Simulated GPU batch routing
    pub fn batch_routing_gpu(
        token_embeddings: &[f32],
        expert_weights: &[f32],
        num_tokens: usize,
        embed_dim: usize,
        num_experts: usize,
        top_k: usize,
    ) -> Vec<(usize, Vec<usize>)> {
        // Simulate upload
        let input_bytes = (token_embeddings.len() + expert_weights.len()) * 4;
        let _upload_time = simulate_memory_transfer(input_bytes, true);

        // CPU fallback
        let result = batch_routing_cpu(
            token_embeddings,
            expert_weights,
            num_tokens,
            embed_dim,
            num_experts,
            top_k,
        );

        // Simulate download
        let result_bytes = num_tokens * top_k * 4;
        let _download_time = simulate_memory_transfer(result_bytes, false);

        result
    }

    /// Simulate memory transfer time.
    /// Returns simulated nanoseconds.
    ///
    /// Model: a fixed launch overhead plus payload time at ~10 GB/s
    /// (PCIe 3.0 x16 ballpark). At 10 GB/s each byte costs 0.1 ns, so the
    /// payload term is `bytes / 10` ns.
    fn simulate_memory_transfer(bytes: usize, _host_to_device: bool) -> u64 {
        // In practice, smaller transfers have higher fixed overhead.
        let base_overhead_ns = 1000; // 1 microsecond base overhead
        // FIX: the previous formula, (bytes * 100) / 1_000_000_000, yielded
        // bytes * 1e-7 ns — i.e. ~10^7 GB/s, a factor of 1e6 faster than the
        // documented ~10 GB/s link.
        let transfer_ns = bytes as u64 / 10; // ~10 GB/s
        base_overhead_ns + transfer_ns
    }
}
|
||||
|
||||
// Fallback for non-GPU builds
|
||||
#[cfg(not(feature = "gpu"))]
mod gpu_impl {
    //! CPU fallback shims for non-GPU builds.
    //!
    //! Mirrors the public API of the gpu-feature module, but each function
    //! simply delegates to the CPU baseline with no simulated transfer
    //! overhead.

    use super::*;

    /// Fallback: total coherence energy via the CPU implementation.
    pub fn compute_energy_gpu(graph: &CpuSheafGraph) -> f32 {
        graph.compute_energy_cpu()
    }

    /// Fallback: single-head attention via the CPU implementation.
    pub fn attention_forward_gpu(
        queries: &[f32],
        keys: &[f32],
        values: &[f32],
        seq_len: usize,
        head_dim: usize,
        output: &mut [f32],
    ) {
        attention_forward_cpu(queries, keys, values, seq_len, head_dim, output);
    }

    /// Fallback: top-k expert routing via the CPU implementation.
    pub fn batch_routing_gpu(
        token_embeddings: &[f32],
        expert_weights: &[f32],
        num_tokens: usize,
        embed_dim: usize,
        num_experts: usize,
        top_k: usize,
    ) -> Vec<(usize, Vec<usize>)> {
        batch_routing_cpu(
            token_embeddings,
            expert_weights,
            num_tokens,
            embed_dim,
            num_experts,
            top_k,
        )
    }
}
|
||||
|
||||
// ============================================================================
|
||||
// ENERGY COMPUTATION BENCHMARKS
|
||||
// ============================================================================
|
||||
|
||||
/// Compare CPU and (feature-gated) GPU total-energy computation across
/// graph sizes; throughput is reported per edge.
fn bench_energy_cpu_vs_gpu(c: &mut Criterion) {
    let mut group = c.benchmark_group("gpu_energy");

    // Test at various graph sizes — larger graphs use fewer samples to
    // keep total benchmark time bounded.
    let sizes = [(1_000, 50), (10_000, 30), (100_000, 10)];

    for (num_nodes, sample_size) in sizes {
        let graph = CpuSheafGraph::random(num_nodes, 4, 64, 42);

        group.sample_size(sample_size);
        group.throughput(Throughput::Elements(graph.edges.len() as u64));

        group.bench_with_input(BenchmarkId::new("cpu", num_nodes), &num_nodes, |b, _| {
            b.iter(|| black_box(graph.compute_energy_cpu()))
        });

        // GPU variant only compiled with the `gpu` feature.
        #[cfg(feature = "gpu")]
        group.bench_with_input(BenchmarkId::new("gpu", num_nodes), &num_nodes, |b, _| {
            b.iter(|| black_box(gpu_impl::compute_energy_gpu(&graph)))
        });
    }

    group.finish();
}
|
||||
|
||||
/// Benchmark energy computation with per-edge tracking
///
/// Measures the variant that also materializes each edge's energy (used
/// for hotspot detection), at two graph sizes.
fn bench_energy_with_edges(c: &mut Criterion) {
    let mut group = c.benchmark_group("gpu_energy_with_edges");

    for num_nodes in [1_000, 10_000] {
        let graph = CpuSheafGraph::random(num_nodes, 4, 64, 42);

        group.throughput(Throughput::Elements(graph.edges.len() as u64));

        group.bench_with_input(BenchmarkId::new("cpu", num_nodes), &num_nodes, |b, _| {
            b.iter(|| black_box(graph.compute_energy_with_edges_cpu()))
        });

        // GPU version would return per-edge results
        // Useful for hotspot detection
    }

    group.finish();
}
|
||||
|
||||
// ============================================================================
|
||||
// ATTENTION BENCHMARKS
|
||||
// ============================================================================
|
||||
|
||||
/// Compare CPU and (feature-gated) GPU single-head attention across
/// sequence lengths; throughput is the seq_len^2 score count.
fn bench_attention_cpu_vs_gpu(c: &mut Criterion) {
    let mut group = c.benchmark_group("gpu_attention");

    // Typical attention configurations
    let configs = [
        (128, 64, "small"),  // seq_len=128, head_dim=64
        (512, 64, "medium"), // seq_len=512, head_dim=64
        (2048, 64, "large"), // seq_len=2048, head_dim=64
    ];

    for (seq_len, head_dim, label) in configs {
        let queries = generate_vec(seq_len * head_dim, 42);
        let keys = generate_vec(seq_len * head_dim, 123);
        let values = generate_vec(seq_len * head_dim, 456);
        // Output buffer reused across iterations; overwritten every pass.
        let mut output = vec![0.0f32; seq_len * head_dim];

        // Attention is O(n^2) in sequence length
        let sample_size = if seq_len > 1024 { 10 } else { 50 };
        group.sample_size(sample_size);
        group.throughput(Throughput::Elements((seq_len * seq_len) as u64));

        group.bench_with_input(BenchmarkId::new("cpu", label), &seq_len, |b, _| {
            b.iter(|| {
                attention_forward_cpu(
                    black_box(&queries),
                    black_box(&keys),
                    black_box(&values),
                    seq_len,
                    head_dim,
                    &mut output,
                );
                // Touch one output element so the work is observable.
                black_box(output[0])
            })
        });

        #[cfg(feature = "gpu")]
        group.bench_with_input(BenchmarkId::new("gpu", label), &seq_len, |b, _| {
            b.iter(|| {
                gpu_impl::attention_forward_gpu(
                    black_box(&queries),
                    black_box(&keys),
                    black_box(&values),
                    seq_len,
                    head_dim,
                    &mut output,
                );
                black_box(output[0])
            })
        });
    }

    group.finish();
}
|
||||
|
||||
/// Benchmark multi-head attention
///
/// Runs 8 heads of seq_len=512 / head_dim=64 attention; the CPU path
/// processes heads sequentially, while a real GPU path would run them
/// in parallel (here the gpu variant still loops, simulation only).
fn bench_multihead_attention(c: &mut Criterion) {
    let mut group = c.benchmark_group("gpu_multihead_attention");

    let seq_len = 512;
    let head_dim = 64;
    let num_heads = 8;

    // Flat buffers laid out as num_heads consecutive [seq_len, head_dim]
    // slabs; each head reads/writes its own slab.
    let queries = generate_vec(seq_len * head_dim * num_heads, 42);
    let keys = generate_vec(seq_len * head_dim * num_heads, 123);
    let values = generate_vec(seq_len * head_dim * num_heads, 456);
    let mut output = vec![0.0f32; seq_len * head_dim * num_heads];

    group.sample_size(20);
    group.throughput(Throughput::Elements((seq_len * seq_len * num_heads) as u64));

    // CPU: sequential over heads
    group.bench_function("cpu_sequential_heads", |b| {
        b.iter(|| {
            for h in 0..num_heads {
                let offset = h * seq_len * head_dim;
                let q = &queries[offset..offset + seq_len * head_dim];
                let k = &keys[offset..offset + seq_len * head_dim];
                let v = &values[offset..offset + seq_len * head_dim];
                let out = &mut output[offset..offset + seq_len * head_dim];

                attention_forward_cpu(q, k, v, seq_len, head_dim, out);
            }
            black_box(output[0])
        })
    });

    // GPU would parallelize across heads
    #[cfg(feature = "gpu")]
    group.bench_function("gpu_parallel_heads", |b| {
        b.iter(|| {
            // In reality, GPU would process all heads in parallel
            for h in 0..num_heads {
                let offset = h * seq_len * head_dim;
                let q = &queries[offset..offset + seq_len * head_dim];
                let k = &keys[offset..offset + seq_len * head_dim];
                let v = &values[offset..offset + seq_len * head_dim];
                let out = &mut output[offset..offset + seq_len * head_dim];

                gpu_impl::attention_forward_gpu(q, k, v, seq_len, head_dim, out);
            }
            black_box(output[0])
        })
    });

    group.finish();
}
|
||||
|
||||
// ============================================================================
|
||||
// BATCH ROUTING BENCHMARKS (MoE)
|
||||
// ============================================================================
|
||||
|
||||
/// Compare CPU and (feature-gated) GPU MoE top-k routing for batches of
/// 256/1024/4096 tokens; throughput is reported per token.
fn bench_batch_routing_cpu_vs_gpu(c: &mut Criterion) {
    let mut group = c.benchmark_group("gpu_routing");

    let embed_dim = 768; // Typical transformer embedding
    let num_experts = 8;
    let top_k = 2;

    for num_tokens in [256, 1024, 4096] {
        let token_embeddings = generate_vec(num_tokens * embed_dim, 42);
        let expert_weights = generate_vec(num_experts * embed_dim, 123);

        let sample_size = if num_tokens > 2048 { 20 } else { 50 };
        group.sample_size(sample_size);
        group.throughput(Throughput::Elements(num_tokens as u64));

        group.bench_with_input(BenchmarkId::new("cpu", num_tokens), &num_tokens, |b, _| {
            b.iter(|| {
                black_box(batch_routing_cpu(
                    black_box(&token_embeddings),
                    black_box(&expert_weights),
                    num_tokens,
                    embed_dim,
                    num_experts,
                    top_k,
                ))
            })
        });

        #[cfg(feature = "gpu")]
        group.bench_with_input(BenchmarkId::new("gpu", num_tokens), &num_tokens, |b, _| {
            b.iter(|| {
                black_box(gpu_impl::batch_routing_gpu(
                    black_box(&token_embeddings),
                    black_box(&expert_weights),
                    num_tokens,
                    embed_dim,
                    num_experts,
                    top_k,
                ))
            })
        });
    }

    group.finish();
}
|
||||
|
||||
// ============================================================================
|
||||
// MEMORY TRANSFER BENCHMARKS
|
||||
// ============================================================================
|
||||
|
||||
/// CPU-side memory-access baseline across payload sizes (1 KB – 4 MB),
/// used to gauge where GPU transfer overhead would amortize.
fn bench_memory_transfer_overhead(c: &mut Criterion) {
    let mut group = c.benchmark_group("gpu_memory_transfer");

    // Simulate different transfer sizes
    let sizes_kb = [1, 4, 16, 64, 256, 1024, 4096];

    for &size_kb in &sizes_kb {
        let data = generate_vec(size_kb * 1024 / 4, 42); // f32 = 4 bytes

        group.throughput(Throughput::Bytes((size_kb * 1024) as u64));

        // Baseline: just accessing memory on CPU (sum forces a full read).
        group.bench_with_input(
            BenchmarkId::new("cpu_access", format!("{}KB", size_kb)),
            &size_kb,
            |b, _| {
                b.iter(|| {
                    let sum: f32 = data.iter().sum();
                    black_box(sum)
                })
            },
        );

        // GPU would have additional transfer overhead
        // This benchmark shows the amortization point
    }

    group.finish();
}
|
||||
|
||||
// ============================================================================
|
||||
// CROSSOVER POINT BENCHMARKS
|
||||
// ============================================================================
|
||||
|
||||
/// Find the problem size where GPU becomes faster than CPU
|
||||
fn bench_gpu_crossover(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("gpu_crossover");
|
||||
|
||||
// Matrix multiply is a classic GPU workload
|
||||
// Test different sizes to find crossover
|
||||
|
||||
let sizes = [32, 64, 128, 256, 512, 1024];
|
||||
|
||||
for &size in &sizes {
|
||||
let a = generate_matrix(size, size, 42);
|
||||
let b = generate_matrix(size, size, 123);
|
||||
let mut c = vec![0.0f32; size * size];
|
||||
|
||||
group.throughput(Throughput::Elements((size * size * size) as u64)); // O(n^3)
|
||||
|
||||
let sample_size = if size > 512 { 10 } else { 50 };
|
||||
group.sample_size(sample_size);
|
||||
|
||||
// CPU matrix multiply (naive)
|
||||
group.bench_with_input(BenchmarkId::new("cpu_matmul", size), &size, |b_iter, _| {
|
||||
b_iter.iter(|| {
|
||||
for i in 0..size {
|
||||
for j in 0..size {
|
||||
let mut sum = 0.0f32;
|
||||
for k in 0..size {
|
||||
sum += a[i * size + k] * b[k * size + j];
|
||||
}
|
||||
c[i * size + j] = sum;
|
||||
}
|
||||
}
|
||||
black_box(c[0])
|
||||
})
|
||||
});
|
||||
|
||||
// GPU would win for size >= 256 typically
|
||||
}
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// COHERENCE-SPECIFIC GPU PATTERNS
|
||||
// ============================================================================
|
||||
|
||||
/// Benchmark parallel residual computation pattern
///
/// Computes the sum of squared source/target residuals over many edges —
/// the per-work-item pattern a GPU kernel would parallelize.
fn bench_parallel_residual(c: &mut Criterion) {
    let mut group = c.benchmark_group("gpu_parallel_residual");

    let state_dim = 64;

    for num_edges in [1_000, 10_000, 100_000] {
        // Prepare edge data in GPU-friendly format — distinct seeds keep
        // source and target states decorrelated.
        let sources: Vec<Vec<f32>> = (0..num_edges)
            .map(|i| generate_vec(state_dim, i as u64))
            .collect();
        let targets: Vec<Vec<f32>> = (0..num_edges)
            .map(|i| generate_vec(state_dim, i as u64 + 1000000))
            .collect();

        let sample_size = if num_edges > 50000 { 10 } else { 50 };
        group.sample_size(sample_size);
        group.throughput(Throughput::Elements(num_edges as u64));

        // CPU sequential
        group.bench_with_input(
            BenchmarkId::new("cpu_sequential", num_edges),
            &num_edges,
            |b, _| {
                b.iter(|| {
                    let mut total = 0.0f32;
                    for (src, tgt) in sources.iter().zip(targets.iter()) {
                        let mut norm_sq = 0.0f32;
                        for i in 0..state_dim {
                            let diff = src[i] - tgt[i];
                            norm_sq += diff * diff;
                        }
                        total += norm_sq;
                    }
                    black_box(total)
                })
            },
        );

        // GPU would parallelize all edges
        // Each work item computes one residual
    }

    group.finish();
}
|
||||
|
||||
/// Benchmark reduction patterns (sum of energies)
///
/// Compares a straight sequential sum with a chunked two-level sum
/// (1024-element partials), the CPU analogue of a GPU tree reduction.
fn bench_gpu_reduction(c: &mut Criterion) {
    let mut group = c.benchmark_group("gpu_reduction");

    for size in [1_000, 10_000, 100_000, 1_000_000] {
        let data = generate_vec(size, 42);

        let sample_size = if size > 100000 { 10 } else { 50 };
        group.sample_size(sample_size);
        group.throughput(Throughput::Elements(size as u64));

        // CPU sequential sum
        group.bench_with_input(BenchmarkId::new("cpu_sum", size), &size, |b, _| {
            b.iter(|| {
                let sum: f32 = data.iter().sum();
                black_box(sum)
            })
        });

        // CPU parallel reduction would use multiple accumulators
        group.bench_with_input(BenchmarkId::new("cpu_parallel", size), &size, |b, _| {
            b.iter(|| {
                let chunks = data.chunks(1024);
                let partial_sums: Vec<f32> = chunks.map(|c| c.iter().sum()).collect();
                let sum: f32 = partial_sums.iter().sum();
                black_box(sum)
            })
        });

        // GPU reduction uses tree-based parallel reduction
    }

    group.finish();
}
|
||||
|
||||
// ============================================================================
|
||||
// CRITERION CONFIGURATION
|
||||
// ============================================================================
|
||||
|
||||
// One criterion group per benchmark category; criterion_main runs them all.
criterion_group!(
    energy_benches,
    bench_energy_cpu_vs_gpu,
    bench_energy_with_edges,
);

criterion_group!(
    attention_benches,
    bench_attention_cpu_vs_gpu,
    bench_multihead_attention,
);

criterion_group!(routing_benches, bench_batch_routing_cpu_vs_gpu,);

criterion_group!(
    transfer_benches,
    bench_memory_transfer_overhead,
    bench_gpu_crossover,
);

criterion_group!(
    coherence_gpu_benches,
    bench_parallel_residual,
    bench_gpu_reduction,
);

criterion_main!(
    energy_benches,
    attention_benches,
    routing_benches,
    transfer_benches,
    coherence_gpu_benches
);
|
||||
488
vendor/ruvector/crates/prime-radiant/benches/hyperbolic_bench.rs
vendored
Normal file
488
vendor/ruvector/crates/prime-radiant/benches/hyperbolic_bench.rs
vendored
Normal file
@@ -0,0 +1,488 @@
|
||||
//! Benchmarks for Poincare distance computation
|
||||
//!
|
||||
//! ADR-014 Performance Target: < 500ns per Poincare distance
|
||||
//!
|
||||
//! Hyperbolic geometry enables hierarchy-aware coherence where
|
||||
//! deeper nodes (further from origin) have different energy weights.
|
||||
|
||||
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
|
||||
|
||||
// ============================================================================
|
||||
// Hyperbolic Geometry Functions
|
||||
// ============================================================================
|
||||
|
||||
/// Compute squared Euclidean norm (||x||^2).
#[inline]
fn squared_norm(x: &[f32]) -> f32 {
    let mut acc = 0.0f32;
    for &v in x {
        acc += v * v;
    }
    acc
}
|
||||
|
||||
/// Euclidean length of `x` (square root of the sum of squares).
#[inline]
fn norm(x: &[f32]) -> f32 {
    let sum_sq: f32 = x.iter().map(|v| v * v).sum();
    sum_sq.sqrt()
}
|
||||
|
||||
/// Squared Euclidean distance between `x` and `y` (no square root taken).
#[inline]
fn squared_distance(x: &[f32], y: &[f32]) -> f32 {
    x.iter().zip(y).fold(0.0, |acc, (a, b)| {
        let d = a - b;
        acc + d * d
    })
}
|
||||
|
||||
/// Poincare distance in the Poincare ball model
///
/// d(x, y) = arcosh(1 + 2 * ||x - y||^2 / ((1 - ||x||^2) * (1 - ||y||^2)))
///
/// where arcosh(z) = ln(z + sqrt(z^2 - 1))
///
/// The result is additionally scaled by 1/sqrt(-curvature), so `curvature`
/// must be strictly negative (a non-negative value yields NaN).
#[inline]
pub fn poincare_distance(x: &[f32], y: &[f32], curvature: f32) -> f32 {
    let sq_norm_x = squared_norm(x);
    let sq_norm_y = squared_norm(y);
    let sq_dist = squared_distance(x, y);

    // Clamp to valid range for numerical stability: points on or outside the
    // unit sphere would otherwise drive the denominator to zero or negative.
    let denom = (1.0 - sq_norm_x).max(1e-10) * (1.0 - sq_norm_y).max(1e-10);
    let arg = 1.0 + 2.0 * sq_dist / denom;

    // arcosh(arg) = ln(arg + sqrt(arg^2 - 1)); the max(0.0) absorbs tiny
    // negative values of arg^2 - 1 caused by floating-point rounding.
    let arcosh = (arg + (arg * arg - 1.0).max(0.0).sqrt()).ln();

    // Scale by curvature
    arcosh / (-curvature).sqrt()
}
|
||||
|
||||
/// Optimized Poincare distance with fused operations
///
/// A single pass over both slices accumulates ||x||^2, ||y||^2 and
/// ||x - y||^2 together. Iterating with `zip` (instead of indexing) lets the
/// compiler elide the per-element bounds checks the original indexed loop
/// paid on every access — relevant since this function exists for speed.
/// `curvature` must be strictly negative.
#[inline]
pub fn poincare_distance_optimized(x: &[f32], y: &[f32], curvature: f32) -> f32 {
    // The original indexed loop panicked on mismatched lengths; keep that
    // invariant visible in debug builds.
    debug_assert_eq!(x.len(), y.len());

    let mut sq_norm_x = 0.0f32;
    let mut sq_norm_y = 0.0f32;
    let mut sq_dist = 0.0f32;

    // Same accumulation order as before, so results are bit-identical.
    for (&xi, &yi) in x.iter().zip(y.iter()) {
        sq_norm_x += xi * xi;
        sq_norm_y += yi * yi;
        let d = xi - yi;
        sq_dist += d * d;
    }

    // Clamp denominators away from zero for points near the ball boundary.
    let denom = (1.0 - sq_norm_x).max(1e-10) * (1.0 - sq_norm_y).max(1e-10);
    let arg = 1.0 + 2.0 * sq_dist / denom;
    // arcosh(arg) = ln(arg + sqrt(arg^2 - 1)); max(0.0) guards fp rounding.
    let arcosh = (arg + (arg * arg - 1.0).max(0.0).sqrt()).ln();

    arcosh / (-curvature).sqrt()
}
|
||||
|
||||
/// SIMD-friendly Poincare distance (chunked)
///
/// Accumulates the three sums (||x||^2, ||y||^2, ||x - y||^2) into four
/// independent lanes so the inner loop has no loop-carried dependency on a
/// single scalar, encouraging auto-vectorization. Note that the lane-wise
/// accumulation order differs from the scalar variants, so results may
/// differ from `poincare_distance` in the last bits.
#[inline]
pub fn poincare_distance_simd_friendly(x: &[f32], y: &[f32], curvature: f32) -> f32 {
    // Process in chunks of 4 for potential auto-vectorization
    let mut sq_norm_x = [0.0f32; 4];
    let mut sq_norm_y = [0.0f32; 4];
    let mut sq_dist = [0.0f32; 4];

    let chunks = x.len() / 4;
    for c in 0..chunks {
        let base = c * 4;
        for i in 0..4 {
            let xi = x[base + i];
            let yi = y[base + i];
            sq_norm_x[i] += xi * xi;
            sq_norm_y[i] += yi * yi;
            let d = xi - yi;
            sq_dist[i] += d * d;
        }
    }

    // Handle remainder: leftover elements all fold into lane 0.
    let remainder = x.len() % 4;
    let base = chunks * 4;
    for i in 0..remainder {
        let xi = x[base + i];
        let yi = y[base + i];
        sq_norm_x[0] += xi * xi;
        sq_norm_y[0] += yi * yi;
        let d = xi - yi;
        sq_dist[0] += d * d;
    }

    // Reduce the four lanes back to scalars.
    let total_sq_norm_x: f32 = sq_norm_x.iter().sum();
    let total_sq_norm_y: f32 = sq_norm_y.iter().sum();
    let total_sq_dist: f32 = sq_dist.iter().sum();

    // Same closed form as poincare_distance: clamp, arcosh, curvature scale.
    let denom = (1.0 - total_sq_norm_x).max(1e-10) * (1.0 - total_sq_norm_y).max(1e-10);
    let arg = 1.0 + 2.0 * total_sq_dist / denom;
    let arcosh = (arg + (arg * arg - 1.0).max(0.0).sqrt()).ln();

    arcosh / (-curvature).sqrt()
}
|
||||
|
||||
/// Mobius addition in the Poincare ball
///
/// x + y = ((1 + 2<x,y> + ||y||^2)x + (1 - ||x||^2)y) / (1 + 2<x,y> + ||x||^2||y||^2)
///
/// (each inner-product term carries the appropriate power of c = -curvature).
pub fn mobius_add(x: &[f32], y: &[f32], curvature: f32) -> Vec<f32> {
    let c = -curvature;
    let xx: f32 = x.iter().map(|v| v * v).sum();
    let yy: f32 = y.iter().map(|v| v * v).sum();
    let dot: f32 = x.iter().zip(y).map(|(a, b)| a * b).sum();

    // Scalar coefficients of the gyrovector sum.
    let coeff_x = 1.0 + 2.0 * c * dot + c * yy;
    let coeff_y = 1.0 - c * xx;
    let scale = 1.0 + 2.0 * c * dot + c * c * xx * yy;

    x.iter()
        .zip(y)
        .map(|(xi, yi)| (coeff_x * xi + coeff_y * yi) / scale)
        .collect()
}
|
||||
|
||||
/// Exponential map at point p with tangent vector v
///
/// Maps a tangent vector at `p` into the Poincare ball: scale `v` by
/// tanh(sqrt(c) * lambda_p * ||v|| / 2) / (sqrt(c) * ||v||) and Mobius-add
/// it to `p`. `curvature` must be strictly negative.
pub fn exp_map(v: &[f32], p: &[f32], curvature: f32) -> Vec<f32> {
    let c = -curvature;
    let v_norm = norm(v);

    // A (near-)zero tangent vector maps to the base point itself.
    if v_norm < 1e-10 {
        return p.to_vec();
    }

    // Conformal factor lambda_p = 2 / (1 - c * ||p||^2), clamped so points
    // at the boundary do not divide by zero.
    let lambda_p = 2.0 / (1.0 - c * squared_norm(p)).max(1e-10);
    let t = (c.sqrt() * lambda_p * v_norm / 2.0).tanh();
    let factor = t / (c.sqrt() * v_norm);

    let v_scaled: Vec<f32> = v.iter().map(|vi| factor * vi).collect();
    mobius_add(p, &v_scaled, curvature)
}
|
||||
|
||||
/// Logarithmic map from point p to point q
///
/// Inverse of [`exp_map`]: returns the tangent vector at `p` that the
/// exponential map would carry to `q`. `curvature` must be strictly negative.
pub fn log_map(q: &[f32], p: &[f32], curvature: f32) -> Vec<f32> {
    let c = -curvature;

    // Compute -p + q (Mobius difference: translate q into p's frame)
    let neg_p: Vec<f32> = p.iter().map(|x| -x).collect();
    let diff = mobius_add(&neg_p, q, curvature);

    let diff_norm = norm(&diff);
    // Coincident points map to the zero tangent vector.
    if diff_norm < 1e-10 {
        return vec![0.0; p.len()];
    }

    // Conformal factor at p, clamped for boundary stability (mirrors exp_map).
    let lambda_p = 2.0 / (1.0 - c * squared_norm(p)).max(1e-10);
    let factor = 2.0 / (c.sqrt() * lambda_p) * (c.sqrt() * diff_norm).atanh() / diff_norm;

    diff.iter().map(|d| factor * d).collect()
}
|
||||
|
||||
/// Project vector to Poincare ball (ensure ||x|| < 1/sqrt(c))
///
/// Points already strictly inside the (slightly shrunk) ball are returned
/// unchanged; everything else is radially rescaled onto the margin.
pub fn project_to_ball(x: &[f32], curvature: f32) -> Vec<f32> {
    // Open-ball radius 1/sqrt(-curvature), shrunk by a small safety margin.
    let limit = 1.0 / (-curvature).sqrt() - 1e-5;
    let len = x.iter().map(|v| v * v).sum::<f32>().sqrt();

    if len < limit {
        return x.to_vec();
    }
    let shrink = limit / len;
    x.iter().map(|v| v * shrink).collect()
}
|
||||
|
||||
/// Compute depth (distance from origin) in Poincare ball
|
||||
#[inline]
|
||||
pub fn poincare_depth(x: &[f32], curvature: f32) -> f32 {
|
||||
let origin = vec![0.0f32; x.len()];
|
||||
poincare_distance(x, &origin, curvature)
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Test Data Generation
|
||||
// ============================================================================
|
||||
|
||||
/// Deterministic pseudo-random point inside the Poincare ball.
///
/// Each component is derived by hashing `(seed, index)` into [-0.5, 0.5),
/// then the whole vector is rescaled to 90% of `max_norm`.
fn generate_point(dim: usize, seed: u64, max_norm: f32) -> Vec<f32> {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    let raw: Vec<f32> = (0..dim)
        .map(|i| {
            let mut hasher = DefaultHasher::new();
            (seed, i).hash(&mut hasher);
            // Map the hash onto [-0.5, 0.5) in steps of 1/1000.
            (hasher.finish() % 1000) as f32 / 1000.0 - 0.5
        })
        .collect();

    // Scale to be within ball
    let n = norm(&raw);
    if n > 0.0 {
        let scale = max_norm / n * 0.9; // 90% of max
        raw.iter().map(|v| v * scale).collect()
    } else {
        // All-zero vector (only possible if every hash lands on 500): nothing to scale.
        raw
    }
}
|
||||
|
||||
// ============================================================================
|
||||
// Benchmarks
|
||||
// ============================================================================
|
||||
|
||||
/// Benchmark Poincare distance at various dimensions
///
/// Compares the three implementations (standard, fused single-pass, and
/// 4-lane chunked) across dimensions 8..512; throughput is one distance
/// per iteration. ADR-014 target: < 500ns per distance.
fn bench_poincare_distance(c: &mut Criterion) {
    let mut group = c.benchmark_group("hyperbolic_poincare_distance");
    group.throughput(Throughput::Elements(1));

    let curvature = -1.0;

    for dim in [8, 32, 64, 128, 256, 512] {
        // Deterministic points at 90% of the unit-ball radius.
        let x = generate_point(dim, 42, 0.9);
        let y = generate_point(dim, 123, 0.9);

        // Standard implementation
        group.bench_with_input(BenchmarkId::new("standard", dim), &dim, |b, _| {
            b.iter(|| poincare_distance(black_box(&x), black_box(&y), black_box(curvature)))
        });

        // Optimized implementation
        group.bench_with_input(BenchmarkId::new("optimized", dim), &dim, |b, _| {
            b.iter(|| {
                poincare_distance_optimized(black_box(&x), black_box(&y), black_box(curvature))
            })
        });

        // SIMD-friendly implementation
        group.bench_with_input(BenchmarkId::new("simd_friendly", dim), &dim, |b, _| {
            b.iter(|| {
                poincare_distance_simd_friendly(black_box(&x), black_box(&y), black_box(curvature))
            })
        });
    }

    group.finish();
}
|
||||
|
||||
/// Benchmark Mobius addition
///
/// Measures gyrovector addition cost as the dimension grows. Operands are
/// kept well inside the ball (max norm 0.5).
fn bench_mobius_add(c: &mut Criterion) {
    let mut group = c.benchmark_group("hyperbolic_mobius_add");
    group.throughput(Throughput::Elements(1));

    let curvature = -1.0;

    for dim in [8, 32, 64, 128] {
        let x = generate_point(dim, 42, 0.5);
        let y = generate_point(dim, 123, 0.5);

        group.bench_with_input(BenchmarkId::new("dim", dim), &dim, |b, _| {
            b.iter(|| mobius_add(black_box(&x), black_box(&y), black_box(curvature)))
        });
    }

    group.finish();
}
|
||||
|
||||
/// Benchmark exp/log maps
///
/// Fixed dimension (32): exp_map carries a tangent vector away from `p`,
/// log_map recovers the tangent vector from `p` to `q`.
fn bench_exp_log_map(c: &mut Criterion) {
    let mut group = c.benchmark_group("hyperbolic_exp_log");

    let dim = 32;
    let curvature = -1.0;

    // Base point near the origin, a small smooth tangent vector, and a target.
    let p = generate_point(dim, 42, 0.3);
    let v: Vec<f32> = (0..dim).map(|i| ((i as f32 * 0.1).sin() * 0.2)).collect();
    let q = generate_point(dim, 123, 0.4);

    group.bench_function("exp_map", |b| {
        b.iter(|| exp_map(black_box(&v), black_box(&p), black_box(curvature)))
    });

    group.bench_function("log_map", |b| {
        b.iter(|| log_map(black_box(&q), black_box(&p), black_box(curvature)))
    });

    group.finish();
}
|
||||
|
||||
/// Benchmark projection to ball
///
/// Uses unnormalized sine points (norm typically > 1) so the rescaling
/// branch of `project_to_ball` is exercised.
fn bench_projection(c: &mut Criterion) {
    let mut group = c.benchmark_group("hyperbolic_projection");
    group.throughput(Throughput::Elements(1));

    let curvature = -1.0;

    for dim in [8, 32, 64, 128, 256] {
        // Point that needs projection (outside ball)
        let x: Vec<f32> = (0..dim).map(|i| ((i as f32 * 0.1).sin())).collect();

        group.bench_with_input(BenchmarkId::new("project", dim), &dim, |b, _| {
            b.iter(|| project_to_ball(black_box(&x), black_box(curvature)))
        });
    }

    group.finish();
}
|
||||
|
||||
/// Benchmark depth computation
///
/// poincare_depth allocates an origin vector per call, so this also
/// measures that allocation, not just the distance math.
fn bench_depth(c: &mut Criterion) {
    let mut group = c.benchmark_group("hyperbolic_depth");
    group.throughput(Throughput::Elements(1));

    let curvature = -1.0;

    for dim in [8, 32, 64, 128, 256] {
        let x = generate_point(dim, 42, 0.9);

        group.bench_with_input(BenchmarkId::new("depth", dim), &dim, |b, _| {
            b.iter(|| poincare_depth(black_box(&x), black_box(curvature)))
        });
    }

    group.finish();
}
|
||||
|
||||
/// Benchmark batch distance computation
///
/// One query against 10/100/1000 stored points; throughput counts one
/// element per distance so criterion reports per-distance cost.
fn bench_batch_distance(c: &mut Criterion) {
    let mut group = c.benchmark_group("hyperbolic_batch_distance");

    let dim = 64;
    let curvature = -1.0;

    for batch_size in [10, 100, 1000] {
        let points: Vec<Vec<f32>> = (0..batch_size)
            .map(|i| generate_point(dim, i as u64, 0.9))
            .collect();
        let query = generate_point(dim, 999, 0.9);

        group.throughput(Throughput::Elements(batch_size as u64));
        group.bench_with_input(
            BenchmarkId::new("batch", batch_size),
            &batch_size,
            |b, _| {
                b.iter(|| {
                    // Collects into a fresh Vec each iteration; the allocation
                    // is part of the measured batch cost.
                    let distances: Vec<f32> = points
                        .iter()
                        .map(|p| poincare_distance(&query, p, curvature))
                        .collect();
                    black_box(distances)
                })
            },
        );
    }

    group.finish();
}
|
||||
|
||||
/// Benchmark k-nearest in hyperbolic space
|
||||
fn bench_knn_hyperbolic(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("hyperbolic_knn");
|
||||
group.sample_size(50);
|
||||
|
||||
let dim = 64;
|
||||
let curvature = -1.0;
|
||||
|
||||
let points: Vec<Vec<f32>> = (0..1000)
|
||||
.map(|i| generate_point(dim, i as u64, 0.9))
|
||||
.collect();
|
||||
let query = generate_point(dim, 999, 0.9);
|
||||
|
||||
for k in [1, 5, 10, 50] {
|
||||
group.bench_with_input(BenchmarkId::new("k", k), &k, |b, &k| {
|
||||
b.iter(|| {
|
||||
// Compute all distances
|
||||
let mut distances: Vec<(usize, f32)> = points
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, p)| (i, poincare_distance(&query, p, curvature)))
|
||||
.collect();
|
||||
|
||||
// Partial sort for k-nearest
|
||||
distances.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
|
||||
let result = distances[..k]
|
||||
.iter()
|
||||
.map(|(i, d)| (*i, *d))
|
||||
.collect::<Vec<_>>();
|
||||
black_box(result)
|
||||
})
|
||||
});
|
||||
}
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
/// Benchmark hierarchy-weighted energy computation
|
||||
fn bench_hierarchy_weighted_energy(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("hyperbolic_hierarchy_energy");
|
||||
|
||||
let dim = 64;
|
||||
let curvature = -1.0;
|
||||
|
||||
// Create hierarchy: shallow and deep nodes
|
||||
let shallow_nodes: Vec<Vec<f32>> = (0..100)
|
||||
.map(|i| generate_point(dim, i as u64, 0.3)) // Near origin
|
||||
.collect();
|
||||
let deep_nodes: Vec<Vec<f32>> = (0..100)
|
||||
.map(|i| generate_point(dim, (i + 100) as u64, 0.9)) // Far from origin
|
||||
.collect();
|
||||
|
||||
group.bench_function("shallow_energy", |b| {
|
||||
b.iter(|| {
|
||||
let mut total_energy = 0.0f32;
|
||||
for i in 0..shallow_nodes.len() - 1 {
|
||||
let depth_a = poincare_depth(&shallow_nodes[i], curvature);
|
||||
let depth_b = poincare_depth(&shallow_nodes[i + 1], curvature);
|
||||
let avg_depth = (depth_a + depth_b) / 2.0;
|
||||
let weight = 1.0 + avg_depth.ln().max(0.0);
|
||||
|
||||
let dist = poincare_distance(&shallow_nodes[i], &shallow_nodes[i + 1], curvature);
|
||||
total_energy += weight * dist * dist;
|
||||
}
|
||||
black_box(total_energy)
|
||||
})
|
||||
});
|
||||
|
||||
group.bench_function("deep_energy", |b| {
|
||||
b.iter(|| {
|
||||
let mut total_energy = 0.0f32;
|
||||
for i in 0..deep_nodes.len() - 1 {
|
||||
let depth_a = poincare_depth(&deep_nodes[i], curvature);
|
||||
let depth_b = poincare_depth(&deep_nodes[i + 1], curvature);
|
||||
let avg_depth = (depth_a + depth_b) / 2.0;
|
||||
let weight = 1.0 + avg_depth.ln().max(0.0);
|
||||
|
||||
let dist = poincare_distance(&deep_nodes[i], &deep_nodes[i + 1], curvature);
|
||||
total_energy += weight * dist * dist;
|
||||
}
|
||||
black_box(total_energy)
|
||||
})
|
||||
});
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
/// Benchmark curvature impact
///
/// The curvature only affects the final scalar scale factor, so timings
/// should be flat across values; this confirms it.
fn bench_curvature_impact(c: &mut Criterion) {
    let mut group = c.benchmark_group("hyperbolic_curvature");

    let dim = 64;
    let x = generate_point(dim, 42, 0.5);
    let y = generate_point(dim, 123, 0.5);

    for curvature in [-0.1, -0.5, -1.0, -2.0, -5.0] {
        group.bench_with_input(
            BenchmarkId::new("curvature", format!("{:.1}", curvature)),
            &curvature,
            |b, &c| b.iter(|| poincare_distance(black_box(&x), black_box(&y), black_box(c))),
        );
    }

    group.finish();
}
|
||||
|
||||
criterion_group!(
|
||||
benches,
|
||||
bench_poincare_distance,
|
||||
bench_mobius_add,
|
||||
bench_exp_log_map,
|
||||
bench_projection,
|
||||
bench_depth,
|
||||
bench_batch_distance,
|
||||
bench_knn_hyperbolic,
|
||||
bench_hierarchy_weighted_energy,
|
||||
bench_curvature_impact,
|
||||
);
|
||||
|
||||
criterion_main!(benches);
|
||||
608
vendor/ruvector/crates/prime-radiant/benches/incremental_bench.rs
vendored
Normal file
608
vendor/ruvector/crates/prime-radiant/benches/incremental_bench.rs
vendored
Normal file
@@ -0,0 +1,608 @@
|
||||
//! Benchmarks for incremental coherence updates
|
||||
//!
|
||||
//! ADR-014 Performance Target: < 100us for single node update
|
||||
//!
|
||||
//! Incremental computation recomputes only affected edges when
|
||||
//! a single node changes, avoiding full graph recomputation.
|
||||
|
||||
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
|
||||
use std::collections::{HashMap, HashSet};
|
||||
|
||||
// ============================================================================
|
||||
// Types (Simulated for benchmarking)
|
||||
// ============================================================================
|
||||
|
||||
/// Dense affine restriction map: `output = matrix * input + bias`.
#[derive(Clone)]
pub struct RestrictionMap {
    /// Row-major `output_dim x input_dim` matrix.
    pub matrix: Vec<f32>,
    /// Additive bias, length `output_dim`.
    pub bias: Vec<f32>,
    pub input_dim: usize,
    pub output_dim: usize,
}

impl RestrictionMap {
    /// Identity map on `dim`-dimensional states (unit diagonal, zero bias).
    pub fn identity(dim: usize) -> Self {
        let mut matrix = vec![0.0f32; dim * dim];
        for i in 0..dim {
            matrix[i * dim + i] = 1.0;
        }
        Self {
            matrix,
            bias: vec![0.0; dim],
            input_dim: dim,
            output_dim: dim,
        }
    }

    /// Apply the affine map into a caller-provided buffer (no allocation).
    ///
    /// `input` must have length `input_dim`; `output` length `output_dim`.
    #[inline]
    pub fn apply_into(&self, input: &[f32], output: &mut [f32]) {
        output.copy_from_slice(&self.bias);
        // Row-wise dot products. chunks_exact + zip replace the old
        // double-indexed loop, eliding per-element bounds checks while
        // keeping the same accumulation order.
        for (out, row) in output
            .iter_mut()
            .zip(self.matrix.chunks_exact(self.input_dim))
        {
            for (&m, &x) in row.iter().zip(input) {
                *out += m * x;
            }
        }
    }
}
|
||||
|
||||
/// A node in the sheaf graph: an identifier plus its state vector.
#[derive(Clone)]
pub struct SheafNode {
    pub id: u64,
    // State vector; length matches the tracker's `state_dim`.
    pub state: Vec<f32>,
}
|
||||
|
||||
/// A weighted sheaf edge: the two endpoint states are compared after each
/// passes through its own restriction map.
#[derive(Clone)]
pub struct SheafEdge {
    pub id: u64,
    pub source: u64,
    pub target: u64,
    // Multiplier applied to this edge's squared residual.
    pub weight: f32,
    // Restriction map applied to the source state.
    pub rho_source: RestrictionMap,
    // Restriction map applied to the target state.
    pub rho_target: RestrictionMap,
}
|
||||
|
||||
impl SheafEdge {
|
||||
#[inline]
|
||||
pub fn weighted_residual_energy_into(
|
||||
&self,
|
||||
source: &[f32],
|
||||
target: &[f32],
|
||||
source_buf: &mut [f32],
|
||||
target_buf: &mut [f32],
|
||||
) -> f32 {
|
||||
self.rho_source.apply_into(source, source_buf);
|
||||
self.rho_target.apply_into(target, target_buf);
|
||||
|
||||
let mut norm_sq = 0.0f32;
|
||||
for i in 0..source_buf.len() {
|
||||
let diff = source_buf[i] - target_buf[i];
|
||||
norm_sq += diff * diff;
|
||||
}
|
||||
|
||||
self.weight * norm_sq
|
||||
}
|
||||
}
|
||||
|
||||
/// Incremental coherence tracker
///
/// Caches each edge's energy and their running sum so that updating one
/// node only recomputes the edges incident to that node instead of the
/// whole graph.
pub struct IncrementalCoherence {
    pub nodes: HashMap<u64, SheafNode>,
    pub edges: Vec<SheafEdge>,
    // Dimension of every node state; scratch buffers are sized from this.
    pub state_dim: usize,
    /// Node -> incident edge indices
    pub node_to_edges: HashMap<u64, Vec<usize>>,
    /// Cached per-edge energies
    pub edge_energies: Vec<f32>,
    /// Cached total energy
    pub total_energy: f32,
    /// Fingerprint for staleness detection
    pub fingerprint: u64,
}
|
||||
|
||||
impl IncrementalCoherence {
    /// Build the tracker: index edges by endpoint, then seed the caches with
    /// a full recomputation.
    pub fn new(nodes: HashMap<u64, SheafNode>, edges: Vec<SheafEdge>, state_dim: usize) -> Self {
        // Build node-to-edge index
        let mut node_to_edges: HashMap<u64, Vec<usize>> = HashMap::new();
        for (idx, edge) in edges.iter().enumerate() {
            node_to_edges.entry(edge.source).or_default().push(idx);
            node_to_edges.entry(edge.target).or_default().push(idx);
        }

        let mut tracker = Self {
            nodes,
            edges,
            state_dim,
            node_to_edges,
            edge_energies: Vec::new(),
            total_energy: 0.0,
            fingerprint: 0,
        };

        tracker.full_recompute();
        tracker
    }

    /// Full recomputation (initial or when needed)
    ///
    /// Recomputes every edge energy and the exact total; also the reference
    /// point against which incremental updates may accumulate float drift.
    pub fn full_recompute(&mut self) {
        // Scratch buffers reused across all edges — no per-edge allocation.
        let mut source_buf = vec![0.0f32; self.state_dim];
        let mut target_buf = vec![0.0f32; self.state_dim];

        self.edge_energies = self
            .edges
            .iter()
            .map(|edge| {
                let source_state = &self.nodes[&edge.source].state;
                let target_state = &self.nodes[&edge.target].state;
                edge.weighted_residual_energy_into(
                    source_state,
                    target_state,
                    &mut source_buf,
                    &mut target_buf,
                )
            })
            .collect();

        self.total_energy = self.edge_energies.iter().sum();
        self.update_fingerprint();
    }

    /// Update single node and recompute affected edges only
    ///
    /// Unknown node ids are ignored. The total is adjusted by the sum of
    /// per-edge deltas rather than re-summed.
    /// NOTE(review): delta accumulation in f32 can drift from a fresh
    /// full_recompute over many updates — presumably acceptable for a
    /// benchmark harness; confirm if reused elsewhere.
    pub fn update_node(&mut self, node_id: u64, new_state: Vec<f32>) {
        // Update node state
        if let Some(node) = self.nodes.get_mut(&node_id) {
            node.state = new_state;
        } else {
            return;
        }

        // Get affected edges (cloned so `self` is free to be borrowed below)
        let affected_edges = match self.node_to_edges.get(&node_id) {
            Some(edges) => edges.clone(),
            None => return,
        };

        // Recompute only affected edges
        let mut source_buf = vec![0.0f32; self.state_dim];
        let mut target_buf = vec![0.0f32; self.state_dim];

        let mut energy_delta = 0.0f32;

        for &edge_idx in &affected_edges {
            let edge = &self.edges[edge_idx];
            let source_state = &self.nodes[&edge.source].state;
            let target_state = &self.nodes[&edge.target].state;

            let old_energy = self.edge_energies[edge_idx];
            let new_energy = edge.weighted_residual_energy_into(
                source_state,
                target_state,
                &mut source_buf,
                &mut target_buf,
            );

            energy_delta += new_energy - old_energy;
            self.edge_energies[edge_idx] = new_energy;
        }

        self.total_energy += energy_delta;
        self.update_fingerprint();
    }

    /// Update multiple nodes in batch
    ///
    /// All states are written first, then each affected edge is recomputed
    /// exactly once (HashSet dedupes edges shared by several updated nodes).
    pub fn update_nodes_batch(&mut self, updates: Vec<(u64, Vec<f32>)>) {
        // Collect all affected edges
        let mut affected_edges: HashSet<usize> = HashSet::new();

        for (node_id, new_state) in updates {
            if let Some(node) = self.nodes.get_mut(&node_id) {
                node.state = new_state;
            }
            if let Some(edges) = self.node_to_edges.get(&node_id) {
                affected_edges.extend(edges.iter());
            }
        }

        // Recompute affected edges
        let mut source_buf = vec![0.0f32; self.state_dim];
        let mut target_buf = vec![0.0f32; self.state_dim];

        let mut energy_delta = 0.0f32;

        for edge_idx in affected_edges {
            let edge = &self.edges[edge_idx];
            let source_state = &self.nodes[&edge.source].state;
            let target_state = &self.nodes[&edge.target].state;

            let old_energy = self.edge_energies[edge_idx];
            let new_energy = edge.weighted_residual_energy_into(
                source_state,
                target_state,
                &mut source_buf,
                &mut target_buf,
            );

            energy_delta += new_energy - old_energy;
            self.edge_energies[edge_idx] = new_energy;
        }

        self.total_energy += energy_delta;
        self.update_fingerprint();
    }

    // Bump the fingerprint on every mutation; wrapping add so it never panics.
    fn update_fingerprint(&mut self) {
        self.fingerprint = self.fingerprint.wrapping_add(1);
    }

    /// Get current total energy
    pub fn energy(&self) -> f32 {
        self.total_energy
    }

    /// Get energy for specific edge
    ///
    /// Panics if `edge_idx` is out of range.
    pub fn edge_energy(&self, edge_idx: usize) -> f32 {
        self.edge_energies[edge_idx]
    }

    /// Check if cache is stale (fingerprint changed)
    pub fn is_stale(&self, last_fingerprint: u64) -> bool {
        self.fingerprint != last_fingerprint
    }
}
|
||||
|
||||
// ============================================================================
|
||||
// Test Data Generation
|
||||
// ============================================================================
|
||||
|
||||
/// Deterministic pseudo-random state vector: each component is derived by
/// hashing `(seed, index)`, giving values in [-0.5, 0.5).
fn generate_state(dim: usize, seed: u64) -> Vec<f32> {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    (0..dim)
        .map(|i| {
            let mut hasher = DefaultHasher::new();
            (seed, i).hash(&mut hasher);
            // Map the hash onto [-0.5, 0.5) in steps of 1/1000.
            (hasher.finish() % 1000) as f32 / 1000.0 - 0.5
        })
        .collect()
}
|
||||
|
||||
/// Build a deterministic pseudo-random graph with roughly `avg_degree`
/// incident edges per node and identity restriction maps.
///
/// Self-loops are dropped, so the realized edge count can be slightly
/// below `num_nodes * avg_degree / 2`.
fn create_random_graph(
    num_nodes: usize,
    avg_degree: usize,
    state_dim: usize,
) -> IncrementalCoherence {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    let nodes: HashMap<u64, SheafNode> = (0..num_nodes as u64)
        .map(|id| {
            (
                id,
                SheafNode {
                    id,
                    state: generate_state(state_dim, id),
                },
            )
        })
        .collect();

    let num_edges = (num_nodes * avg_degree) / 2;
    let edges: Vec<SheafEdge> = (0..num_edges)
        .filter_map(|i| {
            // Derive deterministic endpoints from the edge index; the fixed
            // 42 seed keeps graphs reproducible across runs.
            let mut hasher = DefaultHasher::new();
            (42u64, i, "src").hash(&mut hasher);
            let source = hasher.finish() % num_nodes as u64;

            let mut hasher = DefaultHasher::new();
            (42u64, i, "tgt").hash(&mut hasher);
            let target = hasher.finish() % num_nodes as u64;

            if source != target {
                Some(SheafEdge {
                    id: i as u64,
                    source,
                    target,
                    weight: 1.0,
                    rho_source: RestrictionMap::identity(state_dim),
                    rho_target: RestrictionMap::identity(state_dim),
                })
            } else {
                // Skip self-loops rather than re-rolling.
                None
            }
        })
        .collect();

    IncrementalCoherence::new(nodes, edges, state_dim)
}
|
||||
|
||||
// ============================================================================
|
||||
// Benchmarks
|
||||
// ============================================================================
|
||||
|
||||
/// Benchmark single node update at various graph sizes
///
/// With a fixed average degree the per-update work should stay roughly
/// constant as the graph grows — that scaling is what this measures.
fn bench_single_node_update(c: &mut Criterion) {
    let mut group = c.benchmark_group("incremental_single_node");
    group.throughput(Throughput::Elements(1));

    // ADR-014 target: <100us for single node update
    for num_nodes in [100, 1_000, 10_000] {
        let state_dim = 64;
        let avg_degree = 4;
        let mut tracker = create_random_graph(num_nodes, avg_degree, state_dim);

        group.bench_with_input(
            BenchmarkId::new("update", format!("{}nodes", num_nodes)),
            &num_nodes,
            |b, _| {
                let node_id = (num_nodes / 2) as u64; // Update middle node
                b.iter(|| {
                    // Fresh random seed each iteration so the state actually changes;
                    // state generation cost is included in the measurement.
                    let new_state = generate_state(state_dim, black_box(rand::random()));
                    tracker.update_node(black_box(node_id), new_state);
                    black_box(tracker.energy())
                })
            },
        );
    }

    group.finish();
}
|
||||
|
||||
/// Benchmark incremental vs full recomputation
///
/// Head-to-head on a 10k-node graph: one incremental single-node update
/// vs recomputing every edge from scratch.
fn bench_incremental_vs_full(c: &mut Criterion) {
    let mut group = c.benchmark_group("incremental_vs_full");

    let num_nodes = 10_000;
    let state_dim = 64;
    let avg_degree = 4;
    let mut tracker = create_random_graph(num_nodes, avg_degree, state_dim);

    // Incremental update
    group.bench_function("incremental_single", |b| {
        let node_id = 5000u64;
        b.iter(|| {
            let new_state = generate_state(state_dim, rand::random());
            tracker.update_node(black_box(node_id), new_state);
            black_box(tracker.energy())
        })
    });

    // Full recomputation
    group.bench_function("full_recompute", |b| {
        b.iter(|| {
            tracker.full_recompute();
            black_box(tracker.energy())
        })
    });

    group.finish();
}
|
||||
|
||||
/// Benchmark node degree impact on update time
///
/// Incremental update cost is proportional to the updated node's degree:
/// a hub with 1000 incident edges vs a chain node with at most 2.
fn bench_node_degree_impact(c: &mut Criterion) {
    let mut group = c.benchmark_group("incremental_degree_impact");

    let num_nodes = 10_000;
    let state_dim = 64;

    // Create graph with hub node (high degree)
    let nodes: HashMap<u64, SheafNode> = (0..num_nodes as u64)
        .map(|id| {
            (
                id,
                SheafNode {
                    id,
                    state: generate_state(state_dim, id),
                },
            )
        })
        .collect();

    // Hub node 0 connects to many nodes
    let hub_degree = 1000;
    let mut edges: Vec<SheafEdge> = (1..=hub_degree)
        .map(|i| SheafEdge {
            id: i as u64,
            source: 0,
            target: i as u64,
            weight: 1.0,
            rho_source: RestrictionMap::identity(state_dim),
            rho_target: RestrictionMap::identity(state_dim),
        })
        .collect();

    // Chain edges for the remaining nodes: each interior node gets degree <= 2.
    for i in hub_degree + 1..num_nodes - 1 {
        edges.push(SheafEdge {
            id: i as u64,
            source: i as u64,
            target: (i + 1) as u64,
            weight: 1.0,
            rho_source: RestrictionMap::identity(state_dim),
            rho_target: RestrictionMap::identity(state_dim),
        });
    }

    let mut tracker = IncrementalCoherence::new(nodes, edges, state_dim);

    // Update hub node (high degree)
    group.bench_function("update_hub_1000_edges", |b| {
        b.iter(|| {
            let new_state = generate_state(state_dim, rand::random());
            tracker.update_node(black_box(0), new_state);
            black_box(tracker.energy())
        })
    });

    // Update leaf node (degree 1-2)
    group.bench_function("update_leaf_2_edges", |b| {
        // A node inside the chain region, well clear of the hub's targets.
        let leaf_id = (hub_degree + 100) as u64;
        b.iter(|| {
            let new_state = generate_state(state_dim, rand::random());
            tracker.update_node(black_box(leaf_id), new_state);
            black_box(tracker.energy())
        })
    });

    group.finish();
}
|
||||
|
||||
/// Benchmark batch updates
///
/// Batched updates dedupe shared edges, so per-node cost should fall as
/// the batch grows; throughput is reported per updated node.
fn bench_batch_updates(c: &mut Criterion) {
    let mut group = c.benchmark_group("incremental_batch");

    let num_nodes = 10_000;
    let state_dim = 64;
    let avg_degree = 4;

    for batch_size in [1, 10, 100, 1000] {
        let mut tracker = create_random_graph(num_nodes, avg_degree, state_dim);

        group.throughput(Throughput::Elements(batch_size as u64));
        group.bench_with_input(
            BenchmarkId::new("batch_update", batch_size),
            &batch_size,
            |b, &size| {
                b.iter(|| {
                    // Node ids stride by 10 (mod num_nodes); building the
                    // update vec is part of the measured cost.
                    let updates: Vec<(u64, Vec<f32>)> = (0..size)
                        .map(|i| {
                            let node_id = (i * 10) as u64 % num_nodes as u64;
                            let state = generate_state(state_dim, rand::random());
                            (node_id, state)
                        })
                        .collect();

                    tracker.update_nodes_batch(black_box(updates));
                    black_box(tracker.energy())
                })
            },
        );
    }

    group.finish();
}
|
||||
|
||||
/// Benchmark state dimension impact
///
/// With identity restriction maps, apply_into is O(dim^2) per edge, so the
/// per-update cost should grow quadratically with the state dimension.
fn bench_state_dim_impact(c: &mut Criterion) {
    let mut group = c.benchmark_group("incremental_state_dim");

    let num_nodes = 10_000;
    let avg_degree = 4;

    for state_dim in [8, 32, 64, 128, 256] {
        let mut tracker = create_random_graph(num_nodes, avg_degree, state_dim);

        group.bench_with_input(
            BenchmarkId::new("update", state_dim),
            &state_dim,
            |b, &dim| {
                let node_id = 5000u64;
                b.iter(|| {
                    let new_state = generate_state(dim, rand::random());
                    tracker.update_node(black_box(node_id), new_state);
                    black_box(tracker.energy())
                })
            },
        );
    }

    group.finish();
}
|
||||
|
||||
/// Benchmark index lookup performance
///
/// Isolates the node->edges HashMap lookup and the subsequent iteration
/// over cached edge energies, on a 100k-node graph.
fn bench_index_lookup(c: &mut Criterion) {
    let mut group = c.benchmark_group("incremental_index_lookup");

    let num_nodes = 100_000;
    let avg_degree = 4;
    let state_dim = 64;
    let tracker = create_random_graph(num_nodes, avg_degree, state_dim);

    // Lookup incident edges for a node
    group.bench_function("lookup_incident_edges", |b| {
        b.iter(|| {
            let node_id = black_box(50_000u64);
            black_box(tracker.node_to_edges.get(&node_id))
        })
    });

    // Iterate incident edges
    group.bench_function("iterate_incident_edges", |b| {
        let node_id = 50_000u64;
        b.iter(|| {
            // Sum cached energies; 0.0 if the node happens to have no edges.
            let sum = if let Some(edges) = tracker.node_to_edges.get(&node_id) {
                edges.iter().map(|&idx| tracker.edge_energies[idx]).sum()
            } else {
                0.0f32
            };
            black_box(sum)
        })
    });

    group.finish();
}
|
||||
|
||||
/// Benchmark fingerprint operations
///
/// Measures (1) the cost of a bare staleness check against a snapshotted
/// fingerprint and (2) a full update-then-check cycle.
fn bench_fingerprint(c: &mut Criterion) {
    let mut group = c.benchmark_group("incremental_fingerprint");

    let num_nodes = 10_000;
    let avg_degree = 4;
    let state_dim = 64;
    let mut tracker = create_random_graph(num_nodes, avg_degree, state_dim);

    group.bench_function("check_staleness", |b| {
        // Snapshot the current fingerprint once; the check itself should be
        // a cheap comparison, independent of graph size.
        let fp = tracker.fingerprint;
        b.iter(|| black_box(tracker.is_stale(black_box(fp))))
    });

    group.bench_function("update_with_fingerprint_check", |b| {
        let node_id = 5000u64;
        b.iter(|| {
            // Capture the fingerprint before mutating, then verify the
            // update was observable through the staleness check.
            let old_fp = tracker.fingerprint;
            let new_state = generate_state(state_dim, rand::random());
            tracker.update_node(black_box(node_id), new_state);
            let is_changed = tracker.is_stale(old_fp);
            black_box((tracker.energy(), is_changed))
        })
    });

    group.finish();
}
|
||||
|
||||
/// Benchmark worst case: update all nodes sequentially
///
/// Touches every node once per iteration, so each measurement covers
/// 1000 incremental updates plus one energy read. `sample_size(10)`
/// keeps total wall time bounded.
fn bench_sequential_all_updates(c: &mut Criterion) {
    let mut group = c.benchmark_group("incremental_sequential_all");
    group.sample_size(10);

    let num_nodes = 1000;
    let avg_degree = 4;
    let state_dim = 64;

    let mut tracker = create_random_graph(num_nodes, avg_degree, state_dim);

    group.bench_function("update_all_1000_sequential", |b| {
        b.iter(|| {
            for node_id in 0..num_nodes as u64 {
                // Seeding by node_id keeps this deterministic across runs.
                let new_state = generate_state(state_dim, node_id);
                tracker.update_node(node_id, new_state);
            }
            black_box(tracker.energy())
        })
    });

    group.finish();
}
|
||||
|
||||
// Register all incremental-energy benchmarks with criterion's harness.
criterion_group!(
    benches,
    bench_single_node_update,
    bench_incremental_vs_full,
    bench_node_degree_impact,
    bench_batch_updates,
    bench_state_dim_impact,
    bench_index_lookup,
    bench_fingerprint,
    bench_sequential_all_updates,
);

criterion_main!(benches);
|
||||
630
vendor/ruvector/crates/prime-radiant/benches/mincut_bench.rs
vendored
Normal file
630
vendor/ruvector/crates/prime-radiant/benches/mincut_bench.rs
vendored
Normal file
@@ -0,0 +1,630 @@
|
||||
//! Benchmarks for dynamic mincut updates
|
||||
//!
|
||||
//! ADR-014 Performance Target: n^o(1) amortized time per update
|
||||
//!
|
||||
//! The mincut algorithm isolates incoherent subgraphs using
|
||||
//! subpolynomial dynamic updates.
|
||||
|
||||
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
|
||||
use std::collections::{HashMap, HashSet, VecDeque};
|
||||
|
||||
// ============================================================================
|
||||
// Dynamic MinCut Types (Simulated for benchmarking)
|
||||
// ============================================================================
|
||||
|
||||
/// Edge in dynamic graph
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct Edge {
    /// Source vertex id
    pub source: u64,
    /// Target vertex id
    pub target: u64,
    /// Edge weight (capacity)
    pub weight: f64,
}

/// Dynamic graph with mincut tracking
///
/// The graph is undirected: every edge is mirrored in both endpoints'
/// adjacency maps. Connected components are cached and invalidated only
/// when the edge set actually changes.
pub struct DynamicGraph {
    /// Adjacency lists: vertex -> (neighbor -> edge weight)
    adjacency: HashMap<u64, HashMap<u64, f64>>,
    /// Total edge count (each undirected edge counted once)
    edge_count: usize,
    /// Vertex count (vertices are created implicitly by inserts and are
    /// NOT removed when their last incident edge is deleted)
    vertex_count: usize,
    /// Cached connected components; `None` when stale
    components: Option<Vec<HashSet<u64>>>,
    /// Modification counter for cache invalidation
    mod_count: u64,
}

impl DynamicGraph {
    /// Create an empty graph.
    pub fn new() -> Self {
        Self {
            adjacency: HashMap::new(),
            edge_count: 0,
            vertex_count: 0,
            components: None,
            mod_count: 0,
        }
    }

    /// Create an empty graph with pre-sized vertex storage.
    pub fn with_capacity(vertices: usize, _edges: usize) -> Self {
        Self {
            adjacency: HashMap::with_capacity(vertices),
            edge_count: 0,
            vertex_count: 0,
            components: None,
            mod_count: 0,
        }
    }

    /// Insert an undirected edge. Returns `false` if the edge already exists.
    ///
    /// Fix: duplicates are detected BEFORE touching any state, so a no-op
    /// insert no longer invalidates the component cache or bumps `mod_count`
    /// (the original invalidated unconditionally, even when returning false).
    pub fn insert_edge(&mut self, source: u64, target: u64, weight: f64) -> bool {
        if self.has_edge(source, target) {
            return false;
        }

        self.components = None;
        self.mod_count += 1;

        // Mirror the edge in both adjacency maps (undirected).
        self.adjacency.entry(source).or_default().insert(target, weight);
        self.adjacency.entry(target).or_default().insert(source, weight);

        self.edge_count += 1;
        self.vertex_count = self.adjacency.len();
        true
    }

    /// Delete an undirected edge. Returns `false` if the edge was absent.
    ///
    /// Fix: the component cache is only invalidated (and `mod_count` bumped)
    /// when an edge was actually removed.
    pub fn delete_edge(&mut self, source: u64, target: u64) -> bool {
        let removed = self
            .adjacency
            .get_mut(&source)
            .map_or(false, |adj| adj.remove(&target).is_some());

        if removed {
            self.components = None;
            self.mod_count += 1;
            // Remove the mirrored entry; vertices themselves are retained.
            if let Some(adj) = self.adjacency.get_mut(&target) {
                adj.remove(&source);
            }
            self.edge_count -= 1;
        }

        removed
    }

    /// Check if edge exists (direction-agnostic; edges are mirrored).
    pub fn has_edge(&self, source: u64, target: u64) -> bool {
        self.adjacency
            .get(&source)
            .map_or(false, |adj| adj.contains_key(&target))
    }

    /// Get vertex degree (0 for unknown vertices).
    pub fn degree(&self, vertex: u64) -> usize {
        self.adjacency.get(&vertex).map_or(0, |adj| adj.len())
    }

    /// Get neighbors (empty for unknown vertices).
    pub fn neighbors(&self, vertex: u64) -> Vec<u64> {
        self.adjacency
            .get(&vertex)
            .map(|adj| adj.keys().copied().collect())
            .unwrap_or_default()
    }

    /// Compute connected components using BFS, caching the result until the
    /// next mutation.
    pub fn connected_components(&mut self) -> &Vec<HashSet<u64>> {
        if self.components.is_none() {
            let mut visited = HashSet::new();
            let mut components = Vec::new();

            for &start in self.adjacency.keys() {
                if visited.contains(&start) {
                    continue;
                }

                // BFS flood-fill from `start`.
                let mut component = HashSet::new();
                let mut queue = VecDeque::new();
                queue.push_back(start);

                while let Some(v) = queue.pop_front() {
                    if visited.insert(v) {
                        component.insert(v);
                        if let Some(neighbors) = self.adjacency.get(&v) {
                            for &neighbor in neighbors.keys() {
                                if !visited.contains(&neighbor) {
                                    queue.push_back(neighbor);
                                }
                            }
                        }
                    }
                }

                components.push(component);
            }

            self.components = Some(components);
        }

        // Populated just above when it was None.
        self.components.as_ref().unwrap()
    }

    /// Check if graph is connected (an empty graph counts as connected).
    pub fn is_connected(&mut self) -> bool {
        self.connected_components().len() <= 1
    }

    /// Get edges as a deduplicated list (each undirected edge appears once).
    pub fn edges(&self) -> Vec<Edge> {
        let mut edges = Vec::with_capacity(self.edge_count);
        let mut seen = HashSet::new();

        for (&source, neighbors) in &self.adjacency {
            for (&target, &weight) in neighbors {
                // Canonical (min, max) key dedupes the mirrored entries.
                let key = if source < target {
                    (source, target)
                } else {
                    (target, source)
                };
                if seen.insert(key) {
                    edges.push(Edge {
                        source,
                        target,
                        weight,
                    });
                }
            }
        }

        edges
    }

    /// Get graph statistics.
    pub fn stats(&self) -> GraphStats {
        GraphStats {
            vertices: self.vertex_count,
            edges: self.edge_count,
            max_degree: self
                .adjacency
                .values()
                .map(|adj| adj.len())
                .max()
                .unwrap_or(0),
            // Each undirected edge contributes to two vertex degrees.
            avg_degree: if self.vertex_count > 0 {
                (self.edge_count * 2) as f64 / self.vertex_count as f64
            } else {
                0.0
            },
        }
    }
}

/// Summary statistics for a [`DynamicGraph`].
pub struct GraphStats {
    /// Number of vertices ever touched by an insert
    pub vertices: usize,
    /// Current undirected edge count
    pub edges: usize,
    /// Largest adjacency-list length
    pub max_degree: usize,
    /// Mean degree: 2 * edges / vertices (0.0 for an empty graph)
    pub avg_degree: f64,
}
|
||||
|
||||
/// Subpolynomial MinCut (simplified simulation)
/// Real implementation would use randomized contraction or tree packing
///
/// Wraps a [`DynamicGraph`] and maintains a lazily-recomputed approximation
/// of the global minimum cut. Updates are cheap; the (approximate) mincut is
/// refreshed only when the cache is invalid or enough updates accumulated.
pub struct SubpolynomialMinCut {
    graph: DynamicGraph,
    /// Cached mincut value; `None` forces recomputation on the next query
    cached_mincut: Option<f64>,
    /// Update count since last computation
    updates_since_compute: usize,
    /// Threshold for recomputation (number of updates tolerated before the
    /// cached value is considered too stale)
    recompute_threshold: usize,
}

impl SubpolynomialMinCut {
    /// Create an empty instance with a fixed staleness threshold of 10.
    pub fn new() -> Self {
        Self {
            graph: DynamicGraph::new(),
            cached_mincut: None,
            updates_since_compute: 0,
            recompute_threshold: 10,
        }
    }

    /// Create with pre-sized storage; the staleness threshold scales as
    /// sqrt(vertices), floored at 10.
    pub fn with_capacity(vertices: usize, edges: usize) -> Self {
        Self {
            graph: DynamicGraph::with_capacity(vertices, edges),
            cached_mincut: None,
            updates_since_compute: 0,
            recompute_threshold: ((vertices as f64).sqrt() as usize).max(10),
        }
    }

    /// Insert edge with lazy mincut update
    pub fn insert_edge(&mut self, source: u64, target: u64, weight: f64) -> bool {
        let result = self.graph.insert_edge(source, target, weight);
        if result {
            self.updates_since_compute += 1;
            // Inserting an edge adds capacity across every cut it crosses, so
            // the true mincut can only stay the same or increase. The cached
            // value therefore remains a valid lower bound and is kept until
            // `recompute_threshold` updates force a refresh in `min_cut`.
            // (The original comment claimed the opposite direction.)
        }
        result
    }

    /// Delete edge with lazy mincut update
    pub fn delete_edge(&mut self, source: u64, target: u64) -> bool {
        let result = self.graph.delete_edge(source, target);
        if result {
            self.updates_since_compute += 1;
            // Deleting an edge can lower the mincut, so the cached value may
            // now be too large — invalidate it unconditionally.
            self.cached_mincut = None;
        }
        result
    }

    /// Compute mincut (lazy - uses cache if available)
    ///
    /// Returns the cached approximation while fewer than
    /// `recompute_threshold` updates have occurred since it was computed;
    /// otherwise recomputes and resets the update counter.
    pub fn min_cut(&mut self) -> f64 {
        if let Some(cached) = self.cached_mincut {
            if self.updates_since_compute < self.recompute_threshold {
                return cached;
            }
        }

        // Simplified approximation stand-in for the real algorithm
        // (Karger's contraction or tree packing).
        let mincut = self.compute_mincut_approximation();
        self.cached_mincut = Some(mincut);
        self.updates_since_compute = 0;
        mincut
    }

    /// Approximate mincut using min degree heuristic
    ///
    /// The minimum weighted degree is the value of the cut isolating that
    /// single vertex, hence an UPPER bound on the true mincut. Returns 0.0
    /// for an empty graph.
    fn compute_mincut_approximation(&self) -> f64 {
        let mut min_cut = f64::MAX;

        for (_vertex, neighbors) in &self.graph.adjacency {
            let weighted_degree: f64 = neighbors.values().sum();
            if weighted_degree < min_cut {
                min_cut = weighted_degree;
            }
        }

        if min_cut == f64::MAX {
            0.0
        } else {
            min_cut
        }
    }

    /// Get partition (simplified: just split by component)
    ///
    /// NOTE(review): for a single connected component the split is by
    /// HashSet iteration order, i.e. arbitrary and not a mincut partition —
    /// acceptable only because this is benchmark scaffolding.
    pub fn partition(&mut self) -> (HashSet<u64>, HashSet<u64>) {
        let components = self.graph.connected_components();

        if components.is_empty() {
            return (HashSet::new(), HashSet::new());
        }

        if components.len() == 1 {
            // Single component - split roughly in half
            let vertices: Vec<_> = components[0].iter().copied().collect();
            let mid = vertices.len() / 2;
            let left: HashSet<_> = vertices[..mid].iter().copied().collect();
            let right: HashSet<_> = vertices[mid..].iter().copied().collect();
            (left, right)
        } else {
            // Multiple components - use first vs rest
            let left = components[0].clone();
            let right: HashSet<_> = components[1..]
                .iter()
                .flat_map(|c| c.iter())
                .copied()
                .collect();
            (left, right)
        }
    }
}
|
||||
|
||||
// ============================================================================
|
||||
// Test Data Generation
|
||||
// ============================================================================
|
||||
|
||||
/// Build a deterministic pseudo-random edge list: up to `m` distinct
/// undirected edges (weight 1.0) over vertex ids in `[0, n)`, fully
/// determined by `seed`.
fn generate_random_graph(n: usize, m: usize, seed: u64) -> Vec<(u64, u64, f64)> {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    // Hash-derived vertex id in [0, n) for attempt `i` playing `role`.
    let vertex = |i: usize, role: &str| -> u64 {
        let mut hasher = DefaultHasher::new();
        (seed, i, role).hash(&mut hasher);
        hasher.finish() % n as u64
    };

    let mut edges = Vec::with_capacity(m);
    let mut edge_set = HashSet::new();

    // Allow up to 2m attempts; stop early once m edges have been produced.
    for attempt in 0..m * 2 {
        if edges.len() >= m {
            break;
        }

        let u = vertex(attempt, "source");
        let v = vertex(attempt, "target");
        if u == v {
            continue; // reject self-loops
        }

        // Canonical (min, max) key rejects duplicate undirected edges.
        let key = if u < v { (u, v) } else { (v, u) };
        if edge_set.insert(key) {
            edges.push((u, v, 1.0));
        }
    }

    edges
}
|
||||
|
||||
// ============================================================================
|
||||
// Benchmarks
|
||||
// ============================================================================
|
||||
|
||||
/// Benchmark edge insertion
///
/// Pre-populates half of the generated edges, then times single inserts.
/// Measured inserts shift both endpoints by `n`, targeting fresh vertex
/// ids so every call is a real insertion rather than a duplicate no-op.
fn bench_insert_edge(c: &mut Criterion) {
    let mut group = c.benchmark_group("mincut_insert");
    group.throughput(Throughput::Elements(1));

    for size in [100, 1000, 10000] {
        let edges = generate_random_graph(size, size * 2, 42);
        let mut mincut = SubpolynomialMinCut::with_capacity(size, size * 3);

        // Pre-populate
        for (u, v, w) in &edges[..edges.len() / 2] {
            mincut.insert_edge(*u, *v, *w);
        }

        group.bench_with_input(BenchmarkId::new("insert_single", size), &size, |b, &n| {
            // Walk the remaining edge list, wrapping around if exhausted.
            let mut i = edges.len() / 2;
            b.iter(|| {
                let (u, v, w) = edges[i % edges.len()];
                black_box(mincut.insert_edge(u + n as u64, v + n as u64, w));
                i += 1;
            })
        });
    }

    group.finish();
}
|
||||
|
||||
/// Benchmark edge deletion
///
/// Deletion is destructive, so `iter_batched` rebuilds a fully-populated
/// structure in (untimed) setup before every measured deletion.
fn bench_delete_edge(c: &mut Criterion) {
    let mut group = c.benchmark_group("mincut_delete");
    group.throughput(Throughput::Elements(1));

    for size in [100, 1000, 10000] {
        let edges = generate_random_graph(size, size * 2, 42);

        group.bench_with_input(BenchmarkId::new("delete_single", size), &size, |b, _| {
            b.iter_batched(
                || {
                    // Setup (untimed): rebuild graph and clone the edge list
                    // so the routine owns its input.
                    let mut mincut = SubpolynomialMinCut::with_capacity(size, size * 3);
                    for (u, v, w) in &edges {
                        mincut.insert_edge(*u, *v, *w);
                    }
                    (mincut, edges.clone())
                },
                |(mut mincut, edges)| {
                    // Timed: delete one edge known to exist.
                    let (u, v, _) = edges[edges.len() / 2];
                    black_box(mincut.delete_edge(u, v))
                },
                criterion::BatchSize::SmallInput,
            )
        });
    }

    group.finish();
}
|
||||
|
||||
/// Benchmark mincut query
///
/// Measures both the cold path (cache empty, full approximation pass) and
/// the warm path (cached value returned without recomputation).
fn bench_mincut_query(c: &mut Criterion) {
    let mut group = c.benchmark_group("mincut_query");
    group.throughput(Throughput::Elements(1));

    for size in [100, 1000, 10000] {
        let edges = generate_random_graph(size, size * 2, 42);
        let mut mincut = SubpolynomialMinCut::with_capacity(size, size * 3);

        for (u, v, w) in &edges {
            mincut.insert_edge(*u, *v, *w);
        }

        // Cold query (no cache)
        group.bench_with_input(BenchmarkId::new("cold_query", size), &size, |b, _| {
            b.iter_batched(
                || {
                    // Setup (untimed): hand-build a copy with an empty cache
                    // so the timed call always takes the recompute path.
                    let mc = mincut.graph.adjacency.clone();
                    SubpolynomialMinCut {
                        graph: DynamicGraph {
                            adjacency: mc,
                            edge_count: mincut.graph.edge_count,
                            vertex_count: mincut.graph.vertex_count,
                            components: None,
                            mod_count: 0,
                        },
                        cached_mincut: None,
                        updates_since_compute: 0,
                        recompute_threshold: 10,
                    }
                },
                |mut mc| black_box(mc.min_cut()),
                criterion::BatchSize::SmallInput,
            )
        });

        // Warm query (cached)
        mincut.min_cut(); // Prime cache
        group.bench_with_input(BenchmarkId::new("warm_query", size), &size, |b, _| {
            b.iter(|| black_box(mincut.min_cut()))
        });
    }

    group.finish();
}
|
||||
|
||||
/// Benchmark scaling behavior (verify subpolynomial)
///
/// Runs the same 10-insert-plus-query routine at geometrically spaced
/// sizes; comparing per-element times across sizes indicates whether
/// amortized update cost grows sub-linearly (ADR-014 target).
fn bench_scaling(c: &mut Criterion) {
    let mut group = c.benchmark_group("mincut_scaling");
    group.sample_size(20);

    // Sizes chosen for subpolynomial verification
    // n^(2/3) scaling should show sub-linear growth
    let sizes = vec![100, 316, 1000, 3162, 10000];

    for size in sizes {
        let edges = generate_random_graph(size, size * 2, 42);

        // Measure insert amortized time
        group.throughput(Throughput::Elements(1));
        group.bench_with_input(
            BenchmarkId::new("insert_amortized", size),
            &size,
            |b, &n| {
                b.iter_batched(
                    || {
                        // Setup (untimed): half-populated graph.
                        let mut mincut = SubpolynomialMinCut::with_capacity(n, n * 3);
                        for (u, v, w) in &edges[..edges.len() / 2] {
                            mincut.insert_edge(*u, *v, *w);
                        }
                        (mincut, n)
                    },
                    |(mut mincut, n)| {
                        // Timed: 10 inserts on fresh vertex ids (offset by n),
                        // then one lazy mincut query.
                        for i in 0..10 {
                            let u = (i * 37) as u64 % n as u64;
                            let v = (i * 73 + 1) as u64 % n as u64;
                            if u != v {
                                mincut.insert_edge(u + n as u64, v + n as u64, 1.0);
                            }
                        }
                        black_box(mincut.min_cut())
                    },
                    criterion::BatchSize::SmallInput,
                )
            },
        );
    }

    group.finish();
}
|
||||
|
||||
/// Benchmark mixed workload
|
||||
fn bench_mixed_workload(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("mincut_mixed");
|
||||
group.throughput(Throughput::Elements(1));
|
||||
|
||||
for size in [100, 1000, 10000] {
|
||||
let edges = generate_random_graph(size, size * 2, 42);
|
||||
|
||||
group.bench_with_input(BenchmarkId::new("mixed_ops", size), &size, |b, &n| {
|
||||
b.iter_batched(
|
||||
|| {
|
||||
let mut mincut = SubpolynomialMinCut::with_capacity(n, n * 3);
|
||||
for (u, v, w) in &edges {
|
||||
mincut.insert_edge(*u, *v, *w);
|
||||
}
|
||||
(mincut, 0usize)
|
||||
},
|
||||
|(mut mincut, mut op_idx)| {
|
||||
// 50% insert, 30% delete, 20% query
|
||||
match op_idx % 10 {
|
||||
0..=4 => {
|
||||
let u = (op_idx * 37) as u64 % n as u64;
|
||||
let v = (op_idx * 73 + 1) as u64 % n as u64;
|
||||
if u != v {
|
||||
mincut.insert_edge(u + n as u64, v + n as u64, 1.0);
|
||||
}
|
||||
}
|
||||
5..=7 => {
|
||||
if !edges.is_empty() {
|
||||
let (u, v, _) = edges[op_idx % edges.len()];
|
||||
mincut.delete_edge(u, v);
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
let _ = mincut.min_cut();
|
||||
}
|
||||
}
|
||||
op_idx += 1;
|
||||
black_box(op_idx)
|
||||
},
|
||||
criterion::BatchSize::SmallInput,
|
||||
)
|
||||
});
|
||||
}
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
/// Benchmark partition computation
///
/// Times the (component-based, simplified) two-way partition on graphs of
/// increasing size. The first call populates the component cache; later
/// iterations reuse it, so this mostly measures the clone/split cost.
fn bench_partition(c: &mut Criterion) {
    let mut group = c.benchmark_group("mincut_partition");

    for size in [100, 1000, 10000] {
        let edges = generate_random_graph(size, size * 2, 42);
        let mut mincut = SubpolynomialMinCut::with_capacity(size, size * 3);

        for (u, v, w) in &edges {
            mincut.insert_edge(*u, *v, *w);
        }

        group.bench_with_input(BenchmarkId::new("partition", size), &size, |b, _| {
            b.iter(|| black_box(mincut.partition()))
        });
    }

    group.finish();
}
|
||||
|
||||
/// Benchmark connected components
///
/// Builds five disjoint path-graph components and times a full BFS
/// recomputation (the cache is cleared each iteration on purpose).
fn bench_components(c: &mut Criterion) {
    let mut group = c.benchmark_group("mincut_components");

    for size in [100, 1000, 10000] {
        // Create graph with multiple components
        let mut mincut = SubpolynomialMinCut::with_capacity(size, size * 2);

        // Five disjoint chains: vertices [offset, offset + component_size).
        let component_size = size / 5;
        for comp in 0..5 {
            let offset = comp * component_size;
            for i in 0..component_size - 1 {
                let u = (offset + i) as u64;
                let v = (offset + i + 1) as u64;
                mincut.insert_edge(u, v, 1.0);
            }
        }

        group.bench_with_input(BenchmarkId::new("multi_component", size), &size, |b, _| {
            b.iter(|| {
                // Force recomputation
                mincut.graph.components = None;
                let components = mincut.graph.connected_components();
                black_box(components.len())
            })
        });
    }

    group.finish();
}
|
||||
|
||||
// Register all mincut benchmarks with criterion's harness.
criterion_group!(
    benches,
    bench_insert_edge,
    bench_delete_edge,
    bench_mincut_query,
    bench_scaling,
    bench_mixed_workload,
    bench_partition,
    bench_components,
);

criterion_main!(benches);
|
||||
506
vendor/ruvector/crates/prime-radiant/benches/residual_bench.rs
vendored
Normal file
506
vendor/ruvector/crates/prime-radiant/benches/residual_bench.rs
vendored
Normal file
@@ -0,0 +1,506 @@
|
||||
//! Benchmarks for single residual calculation
|
||||
//!
|
||||
//! ADR-014 Performance Target: < 1us per residual calculation
|
||||
//!
|
||||
//! Residual is the core primitive: r_e = rho_u(x_u) - rho_v(x_v)
|
||||
//! This measures the local constraint violation at each edge.
|
||||
|
||||
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
|
||||
|
||||
// ============================================================================
|
||||
// Restriction Map Types (Simulated for benchmarking)
|
||||
// ============================================================================
|
||||
|
||||
/// Linear restriction map: y = Ax + b
/// Maps node state to shared constraint space
#[derive(Clone)]
pub struct RestrictionMap {
    /// Linear transformation matrix (row-major, output_dim x input_dim)
    pub matrix: Vec<f32>,
    /// Bias vector
    pub bias: Vec<f32>,
    /// Input dimension
    pub input_dim: usize,
    /// Output dimension
    pub output_dim: usize,
}

impl RestrictionMap {
    /// Create identity restriction map (square matrix, zero bias).
    pub fn identity(dim: usize) -> Self {
        let mut matrix = vec![0.0f32; dim * dim];
        for i in 0..dim {
            matrix[i * dim + i] = 1.0;
        }
        Self {
            matrix,
            bias: vec![0.0; dim],
            input_dim: dim,
            output_dim: dim,
        }
    }

    /// Create random restriction map for testing
    ///
    /// Entries are hash-derived values in [-0.5, 0.5); the same seed always
    /// produces the same map (deterministic, no RNG dependency).
    pub fn random(input_dim: usize, output_dim: usize, seed: u64) -> Self {
        use std::collections::hash_map::DefaultHasher;
        use std::hash::{Hash, Hasher};

        // Hash-based pseudo-random value in [-0.5, 0.5).
        fn hashed(key: impl Hash) -> f32 {
            let mut hasher = DefaultHasher::new();
            key.hash(&mut hasher);
            (hasher.finish() % 1000) as f32 / 1000.0 - 0.5
        }

        let matrix = (0..output_dim * input_dim)
            .map(|i| hashed((seed, i)))
            .collect();
        let bias = (0..output_dim).map(|i| hashed((seed, i, "bias"))).collect();

        Self {
            matrix,
            bias,
            input_dim,
            output_dim,
        }
    }

    /// Apply restriction map: y = Ax + b
    ///
    /// Uses `chunks_exact` so the row length is fixed, letting LLVM drop
    /// per-element bounds checks and vectorize the dot products. The fold
    /// starts from the bias, preserving the original accumulation order.
    #[inline]
    pub fn apply(&self, input: &[f32]) -> Vec<f32> {
        debug_assert_eq!(input.len(), self.input_dim);
        // chunks_exact(0) would panic; a 0-input map is just the bias.
        if self.input_dim == 0 {
            return self.bias.clone();
        }

        self.matrix
            .chunks_exact(self.input_dim)
            .zip(&self.bias)
            .map(|(row, &b)| row.iter().zip(input).fold(b, |acc, (&m, &x)| acc + m * x))
            .collect()
    }

    /// Apply restriction map with SIMD-friendly layout (output buffer provided)
    ///
    /// Zero-allocation variant of [`RestrictionMap::apply`]; `output` must
    /// have length `output_dim`.
    #[inline]
    pub fn apply_into(&self, input: &[f32], output: &mut [f32]) {
        debug_assert_eq!(input.len(), self.input_dim);
        debug_assert_eq!(output.len(), self.output_dim);

        // chunks_exact(0) would panic; a 0-input map is just the bias.
        if self.input_dim == 0 {
            output.copy_from_slice(&self.bias);
            return;
        }

        for ((out, row), &b) in output
            .iter_mut()
            .zip(self.matrix.chunks_exact(self.input_dim))
            .zip(&self.bias)
        {
            *out = row.iter().zip(input).fold(b, |acc, (&m, &x)| acc + m * x);
        }
    }
}
|
||||
|
||||
/// Edge with restriction maps
///
/// Connects two nodes whose states live in (possibly different) local
/// spaces; `rho_source`/`rho_target` project both states into a shared
/// constraint space where their mismatch (the residual) is measured.
pub struct SheafEdge {
    /// Source node id
    pub source: u64,
    /// Target node id
    pub target: u64,
    /// Weight applied to the squared residual norm
    pub weight: f32,
    /// Restriction map projecting the source node's state
    pub rho_source: RestrictionMap,
    /// Restriction map projecting the target node's state
    pub rho_target: RestrictionMap,
}
|
||||
|
||||
impl SheafEdge {
|
||||
/// Calculate the edge residual (local mismatch)
|
||||
/// r_e = rho_u(x_u) - rho_v(x_v)
|
||||
#[inline]
|
||||
pub fn residual(&self, source_state: &[f32], target_state: &[f32]) -> Vec<f32> {
|
||||
let projected_source = self.rho_source.apply(source_state);
|
||||
let projected_target = self.rho_target.apply(target_state);
|
||||
|
||||
projected_source
|
||||
.iter()
|
||||
.zip(projected_target.iter())
|
||||
.map(|(a, b)| a - b)
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Calculate residual with pre-allocated buffers (zero allocation)
|
||||
#[inline]
|
||||
pub fn residual_into(
|
||||
&self,
|
||||
source_state: &[f32],
|
||||
target_state: &[f32],
|
||||
source_buf: &mut [f32],
|
||||
target_buf: &mut [f32],
|
||||
residual: &mut [f32],
|
||||
) {
|
||||
self.rho_source.apply_into(source_state, source_buf);
|
||||
self.rho_target.apply_into(target_state, target_buf);
|
||||
|
||||
for i in 0..residual.len() {
|
||||
residual[i] = source_buf[i] - target_buf[i];
|
||||
}
|
||||
}
|
||||
|
||||
/// Calculate weighted residual norm squared: w_e * |r_e|^2
|
||||
#[inline]
|
||||
pub fn weighted_residual_energy(&self, source: &[f32], target: &[f32]) -> f32 {
|
||||
let r = self.residual(source, target);
|
||||
let norm_sq: f32 = r.iter().map(|x| x * x).sum();
|
||||
self.weight * norm_sq
|
||||
}
|
||||
|
||||
/// Weighted residual energy with pre-allocated buffers
|
||||
#[inline]
|
||||
pub fn weighted_residual_energy_into(
|
||||
&self,
|
||||
source: &[f32],
|
||||
target: &[f32],
|
||||
source_buf: &mut [f32],
|
||||
target_buf: &mut [f32],
|
||||
) -> f32 {
|
||||
self.rho_source.apply_into(source, source_buf);
|
||||
self.rho_target.apply_into(target, target_buf);
|
||||
|
||||
let mut norm_sq = 0.0f32;
|
||||
for i in 0..source_buf.len() {
|
||||
let diff = source_buf[i] - target_buf[i];
|
||||
norm_sq += diff * diff;
|
||||
}
|
||||
|
||||
self.weight * norm_sq
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Benchmarks
|
||||
// ============================================================================
|
||||
|
||||
/// Produce a deterministic pseudo-random state vector of length `dim`,
/// with each entry in [-0.5, 0.5), fully determined by `seed`.
fn generate_state(dim: usize, seed: u64) -> Vec<f32> {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    let mut state = Vec::with_capacity(dim);
    for i in 0..dim {
        // Hash (seed, index) so repeated calls with the same seed agree.
        let mut hasher = DefaultHasher::new();
        (seed, i).hash(&mut hasher);
        state.push((hasher.finish() % 1000) as f32 / 1000.0 - 0.5);
    }
    state
}
|
||||
|
||||
/// Benchmark single residual calculation at various dimensions
///
/// ADR-014 target: < 1us per residual. Covers identity maps (pure
/// subtraction cost) and rectangular projection maps (matvec cost).
fn bench_single_residual(c: &mut Criterion) {
    let mut group = c.benchmark_group("residual_single");
    group.throughput(Throughput::Elements(1));

    // Test dimensions relevant for coherence engine:
    // 8: Minimal state
    // 32: Compact embedding
    // 64: Standard embedding
    // 128: Rich state
    // 256: Large state
    for dim in [8, 32, 64, 128, 256] {
        let rho_source = RestrictionMap::identity(dim);
        let rho_target = RestrictionMap::identity(dim);
        let source_state = generate_state(dim, 42);
        let target_state = generate_state(dim, 123);

        let edge = SheafEdge {
            source: 0,
            target: 1,
            weight: 1.0,
            rho_source,
            rho_target,
        };

        group.bench_with_input(BenchmarkId::new("identity_map", dim), &dim, |b, _| {
            b.iter(|| edge.residual(black_box(&source_state), black_box(&target_state)))
        });
    }

    // Test with projection (non-identity maps)
    // Rectangular maps project into a smaller shared constraint space.
    for (input_dim, output_dim) in [(64, 32), (128, 64), (256, 128)] {
        let rho_source = RestrictionMap::random(input_dim, output_dim, 42);
        let rho_target = RestrictionMap::random(input_dim, output_dim, 123);
        let source_state = generate_state(input_dim, 42);
        let target_state = generate_state(input_dim, 123);

        let edge = SheafEdge {
            source: 0,
            target: 1,
            weight: 1.0,
            rho_source,
            rho_target,
        };

        group.bench_with_input(
            BenchmarkId::new("projection_map", format!("{}to{}", input_dim, output_dim)),
            &(input_dim, output_dim),
            |b, _| b.iter(|| edge.residual(black_box(&source_state), black_box(&target_state))),
        );
    }

    group.finish();
}
|
||||
|
||||
/// Benchmark residual calculation with pre-allocated buffers (zero allocation)
///
/// Same computation as `bench_single_residual` but via `residual_into`,
/// reusing three buffers across iterations to expose allocation overhead.
fn bench_residual_zero_alloc(c: &mut Criterion) {
    let mut group = c.benchmark_group("residual_zero_alloc");
    group.throughput(Throughput::Elements(1));

    for dim in [32, 64, 128, 256] {
        let rho_source = RestrictionMap::identity(dim);
        let rho_target = RestrictionMap::identity(dim);
        let source_state = generate_state(dim, 42);
        let target_state = generate_state(dim, 123);

        let edge = SheafEdge {
            source: 0,
            target: 1,
            weight: 1.0,
            rho_source,
            rho_target,
        };

        // Pre-allocate buffers (reused by every iteration)
        let mut source_buf = vec![0.0f32; dim];
        let mut target_buf = vec![0.0f32; dim];
        let mut residual = vec![0.0f32; dim];

        group.bench_with_input(BenchmarkId::new("dim", dim), &dim, |b, _| {
            b.iter(|| {
                edge.residual_into(
                    black_box(&source_state),
                    black_box(&target_state),
                    black_box(&mut source_buf),
                    black_box(&mut target_buf),
                    black_box(&mut residual),
                )
            })
        });
    }

    group.finish();
}
|
||||
|
||||
/// Benchmark weighted residual energy computation
///
/// Compares the allocating path (`weighted_residual_energy`, which builds
/// the residual Vec) against the buffer-reusing zero-alloc path.
fn bench_weighted_energy(c: &mut Criterion) {
    let mut group = c.benchmark_group("residual_weighted_energy");
    group.throughput(Throughput::Elements(1));

    for dim in [32, 64, 128, 256] {
        let rho_source = RestrictionMap::identity(dim);
        let rho_target = RestrictionMap::identity(dim);
        let source_state = generate_state(dim, 42);
        let target_state = generate_state(dim, 123);

        let edge = SheafEdge {
            source: 0,
            target: 1,
            weight: 1.5,
            rho_source,
            rho_target,
        };

        group.bench_with_input(BenchmarkId::new("allocating", dim), &dim, |b, _| {
            b.iter(|| {
                edge.weighted_residual_energy(black_box(&source_state), black_box(&target_state))
            })
        });

        // Pre-allocate buffers for zero-alloc version
        let mut source_buf = vec![0.0f32; dim];
        let mut target_buf = vec![0.0f32; dim];

        group.bench_with_input(BenchmarkId::new("zero_alloc", dim), &dim, |b, _| {
            b.iter(|| {
                edge.weighted_residual_energy_into(
                    black_box(&source_state),
                    black_box(&target_state),
                    black_box(&mut source_buf),
                    black_box(&mut target_buf),
                )
            })
        });
    }

    group.finish();
}
|
||||
|
||||
/// Benchmark batch residual computation (for parallel evaluation)
///
/// Builds a chain topology: edge `i` connects state `i` to state `i + 1`,
/// so `batch_size` edges need `batch_size + 1` states. All restriction maps
/// are identity and all edges have unit weight; throughput is reported in
/// edges (elements) per second.
fn bench_batch_residual(c: &mut Criterion) {
    let mut group = c.benchmark_group("residual_batch");

    for batch_size in [10, 100, 1000] {
        let dim = 64;

        // Create batch of edges
        let edges: Vec<SheafEdge> = (0..batch_size)
            .map(|i| SheafEdge {
                source: i as u64,
                target: (i + 1) as u64,
                weight: 1.0,
                rho_source: RestrictionMap::identity(dim),
                rho_target: RestrictionMap::identity(dim),
            })
            .collect();

        // One state per chain node (batch_size + 1 of them).
        let states: Vec<Vec<f32>> = (0..batch_size + 1)
            .map(|i| generate_state(dim, i as u64))
            .collect();

        group.throughput(Throughput::Elements(batch_size as u64));

        // Sequential computation
        group.bench_with_input(
            BenchmarkId::new("sequential", batch_size),
            &batch_size,
            |b, _| {
                b.iter(|| {
                    let mut total_energy = 0.0f32;
                    for (i, edge) in edges.iter().enumerate() {
                        total_energy += edge.weighted_residual_energy(
                            black_box(&states[i]),
                            black_box(&states[i + 1]),
                        );
                    }
                    // black_box keeps the accumulated sum from being
                    // optimized away as dead code.
                    black_box(total_energy)
                })
            },
        );
    }

    group.finish();
}
|
||||
|
||||
/// Benchmark restriction map application alone
///
/// Two families are measured: identity maps (square, should be near-memcpy)
/// and random projection maps (rectangular dense matrix-vector multiply).
/// Each family compares the allocating `apply` against the buffer-reusing
/// `apply_into`.
fn bench_restriction_map(c: &mut Criterion) {
    let mut group = c.benchmark_group("restriction_map");
    group.throughput(Throughput::Elements(1));

    // Identity maps
    for dim in [32, 64, 128, 256] {
        let rho = RestrictionMap::identity(dim);
        let input = generate_state(dim, 42);
        let mut output = vec![0.0f32; dim];

        group.bench_with_input(BenchmarkId::new("identity_apply", dim), &dim, |b, _| {
            b.iter(|| rho.apply(black_box(&input)))
        });

        group.bench_with_input(
            BenchmarkId::new("identity_apply_into", dim),
            &dim,
            |b, _| b.iter(|| rho.apply_into(black_box(&input), black_box(&mut output))),
        );
    }

    // Projection maps (dense matrix multiply)
    // Dimension pairs shrink by half, matching typical coarsening steps.
    for (input_dim, output_dim) in [(64, 32), (128, 64), (256, 128), (512, 256)] {
        let rho = RestrictionMap::random(input_dim, output_dim, 42);
        let input = generate_state(input_dim, 42);
        let mut output = vec![0.0f32; output_dim];

        group.bench_with_input(
            BenchmarkId::new("projection_apply", format!("{}x{}", input_dim, output_dim)),
            &(input_dim, output_dim),
            |b, _| b.iter(|| rho.apply(black_box(&input))),
        );

        group.bench_with_input(
            BenchmarkId::new(
                "projection_apply_into",
                format!("{}x{}", input_dim, output_dim),
            ),
            &(input_dim, output_dim),
            |b, _| b.iter(|| rho.apply_into(black_box(&input), black_box(&mut output))),
        );
    }

    group.finish();
}
|
||||
|
||||
/// Benchmark SIMD-optimized residual patterns
///
/// Compares three source-level shapes of the same reduction
/// `sum((a[i] - b[i])^2)` to see which the compiler auto-vectorizes best:
/// an indexed scalar loop, an iterator chain, and an 8-lane chunked loop
/// with per-lane accumulators. All three compute the same value (up to
/// float summation order); only the code shape differs.
fn bench_simd_patterns(c: &mut Criterion) {
    let mut group = c.benchmark_group("residual_simd_patterns");
    group.throughput(Throughput::Elements(1));

    // Aligned dimensions for SIMD (multiples of 8 for AVX2, 16 for AVX-512)
    for dim in [32, 64, 128, 256, 512] {
        let a = generate_state(dim, 42);
        let b = generate_state(dim, 123);

        // Scalar subtraction and norm
        group.bench_with_input(
            BenchmarkId::new("scalar_diff_norm", dim),
            &dim,
            // `b_iter` to avoid shadowing the data vector `b` above.
            |b_iter, _| {
                b_iter.iter(|| {
                    let mut norm_sq = 0.0f32;
                    for i in 0..dim {
                        let diff = a[i] - b[i];
                        norm_sq += diff * diff;
                    }
                    black_box(norm_sq)
                })
            },
        );

        // Iterator-based (auto-vectorization friendly)
        group.bench_with_input(
            BenchmarkId::new("iter_diff_norm", dim),
            &dim,
            |b_iter, _| {
                b_iter.iter(|| {
                    let norm_sq: f32 = a
                        .iter()
                        .zip(b.iter())
                        .map(|(x, y)| {
                            let d = x - y;
                            d * d
                        })
                        .sum();
                    black_box(norm_sq)
                })
            },
        );

        // Chunked for explicit SIMD opportunity
        group.bench_with_input(
            BenchmarkId::new("chunked_diff_norm", dim),
            &dim,
            |b_iter, _| {
                b_iter.iter(|| {
                    // Eight independent accumulators mirror an 8-lane
                    // vector register; the inner loop is bounded by
                    // chunk_a.len() so a short final chunk is handled.
                    let mut accum = [0.0f32; 8];
                    for (chunk_a, chunk_b) in a.chunks(8).zip(b.chunks(8)) {
                        for i in 0..chunk_a.len() {
                            let d = chunk_a[i] - chunk_b[i];
                            accum[i] += d * d;
                        }
                    }
                    black_box(accum.iter().sum::<f32>())
                })
            },
        );
    }

    group.finish();
}
|
||||
|
||||
// Register every residual-energy benchmark under one harness entry point.
criterion_group!(
    benches,
    bench_single_residual,
    bench_residual_zero_alloc,
    bench_weighted_energy,
    bench_batch_residual,
    bench_restriction_map,
    bench_simd_patterns,
);

criterion_main!(benches);
|
||||
800
vendor/ruvector/crates/prime-radiant/benches/simd_benchmarks.rs
vendored
Normal file
800
vendor/ruvector/crates/prime-radiant/benches/simd_benchmarks.rs
vendored
Normal file
@@ -0,0 +1,800 @@
|
||||
//! SIMD-Specific Benchmarks for Prime-Radiant Coherence Engine
|
||||
//!
|
||||
//! This benchmark suite compares naive/scalar implementations against
|
||||
//! SIMD-optimized versions for core coherence operations.
|
||||
//!
|
||||
//! ## Benchmark Categories
|
||||
//! 1. Dense Matrix Multiply - naive vs SIMD
|
||||
//! 2. Vector Norm Computation - naive vs SIMD
|
||||
//! 3. Batch Residual Computation - naive vs SIMD
|
||||
//! 4. Dot Products and Reductions
|
||||
//!
|
||||
//! ## Architecture Notes
|
||||
//! - x86_64: AVX2 (256-bit, f32x8) or AVX-512 (512-bit, f32x16)
|
||||
//! - aarch64: NEON (128-bit, f32x4)
|
||||
//! - WASM: SIMD128 (128-bit)
|
||||
|
||||
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
|
||||
use std::collections::hash_map::DefaultHasher;
|
||||
use std::hash::{Hash, Hasher};
|
||||
|
||||
// ============================================================================
|
||||
// TEST DATA GENERATION
|
||||
// ============================================================================
|
||||
|
||||
/// Deterministically generate a pseudo-random `f32` vector of `len` values
/// in `[-0.5, 0.5)`, derived from `(seed, index)` via the std hasher.
/// Same `(len, seed)` always yields the same vector within one build.
fn generate_vec(len: usize, seed: u64) -> Vec<f32> {
    let mut values = Vec::with_capacity(len);
    for idx in 0..len {
        let mut hasher = DefaultHasher::new();
        (seed, idx).hash(&mut hasher);
        let bucket = hasher.finish() % 1000;
        values.push(bucket as f32 / 1000.0 - 0.5);
    }
    values
}
|
||||
|
||||
/// Deterministically generate a row-major `rows x cols` matrix of `f32`
/// values in `[-0.5, 0.5)`, seeded the same way as `generate_vec` (hash of
/// `(seed, flat_index)` bucketed into thousandths).
fn generate_matrix(rows: usize, cols: usize, seed: u64) -> Vec<f32> {
    let total = rows * cols;
    let mut values = Vec::with_capacity(total);
    for idx in 0..total {
        let mut hasher = DefaultHasher::new();
        (seed, idx).hash(&mut hasher);
        let bucket = hasher.finish() % 1000;
        values.push(bucket as f32 / 1000.0 - 0.5);
    }
    values
}
|
||||
|
||||
// ============================================================================
|
||||
// NAIVE IMPLEMENTATIONS (BASELINE)
|
||||
// ============================================================================
|
||||
|
||||
/// Naive matrix-vector multiply: y = Ax
///
/// `matrix` is row-major `rows x cols`; `x` has `cols` elements and `y` has
/// `rows`. Baseline form under measurement — `#[inline(never)]` keeps it a
/// distinct symbol so its codegen can be compared against the unrolled and
/// SIMD variants; do not restructure.
#[inline(never)]
fn matmul_naive(matrix: &[f32], x: &[f32], y: &mut [f32], rows: usize, cols: usize) {
    for i in 0..rows {
        let mut sum = 0.0f32;
        let row_start = i * cols;
        for j in 0..cols {
            sum += matrix[row_start + j] * x[j];
        }
        y[i] = sum;
    }
}
|
||||
|
||||
/// Naive squared norm: |v|^2
///
/// Single-accumulator scalar loop — the sequential dependence on `sum` is
/// the baseline being measured against the multi-accumulator variants.
#[inline(never)]
fn norm_sq_naive(v: &[f32]) -> f32 {
    let mut sum = 0.0f32;
    for &x in v {
        sum += x * x;
    }
    sum
}
|
||||
|
||||
/// Naive dot product: a . b
///
/// Indexed scalar loop over `a.len()` elements; `b` must be at least as
/// long as `a` or the `b[i]` access panics. Baseline form — keep as-is for
/// codegen comparison.
#[inline(never)]
fn dot_naive(a: &[f32], b: &[f32]) -> f32 {
    let mut sum = 0.0f32;
    for i in 0..a.len() {
        sum += a[i] * b[i];
    }
    sum
}
|
||||
|
||||
/// Naive residual norm: |a - b|^2
///
/// Core coherence operation in its simplest form; iterates `a.len()`
/// elements (so `b` must be at least as long). Baseline for the unrolled
/// and SIMD variants below.
#[inline(never)]
fn residual_norm_naive(a: &[f32], b: &[f32]) -> f32 {
    let mut sum = 0.0f32;
    for i in 0..a.len() {
        let diff = a[i] - b[i];
        sum += diff * diff;
    }
    sum
}
|
||||
|
||||
/// Naive batch residual computation
///
/// Sums `residual_norm_naive` over paired source/target vectors; pairs
/// beyond the shorter of the two batches are ignored by `zip`.
#[inline(never)]
fn batch_residual_naive(sources: &[Vec<f32>], targets: &[Vec<f32>]) -> f32 {
    let mut total = 0.0f32;
    for (src, tgt) in sources.iter().zip(targets.iter()) {
        total += residual_norm_naive(src, tgt);
    }
    total
}
|
||||
|
||||
// ============================================================================
|
||||
// SIMD-FRIENDLY IMPLEMENTATIONS
|
||||
// ============================================================================
|
||||
|
||||
/// Unrolled matrix-vector multiply (auto-vectorization friendly)
///
/// Same contract as `matmul_naive` (row-major `rows x cols` matrix), but the
/// inner dot product is manually unrolled 8-wide with independent
/// accumulators, breaking the serial dependence so the compiler can map the
/// eight partial sums onto vector lanes. The unroll shape is the artifact
/// under measurement — do not restructure.
#[inline(never)]
fn matmul_unrolled(matrix: &[f32], x: &[f32], y: &mut [f32], rows: usize, cols: usize) {
    for i in 0..rows {
        let row_start = i * cols;

        // Process in chunks of 8
        let chunks = cols / 8;
        let mut acc0 = 0.0f32;
        let mut acc1 = 0.0f32;
        let mut acc2 = 0.0f32;
        let mut acc3 = 0.0f32;
        let mut acc4 = 0.0f32;
        let mut acc5 = 0.0f32;
        let mut acc6 = 0.0f32;
        let mut acc7 = 0.0f32;

        for c in 0..chunks {
            let base = row_start + c * 8;
            acc0 += matrix[base] * x[c * 8];
            acc1 += matrix[base + 1] * x[c * 8 + 1];
            acc2 += matrix[base + 2] * x[c * 8 + 2];
            acc3 += matrix[base + 3] * x[c * 8 + 3];
            acc4 += matrix[base + 4] * x[c * 8 + 4];
            acc5 += matrix[base + 5] * x[c * 8 + 5];
            acc6 += matrix[base + 6] * x[c * 8 + 6];
            acc7 += matrix[base + 7] * x[c * 8 + 7];
        }

        let mut sum = acc0 + acc1 + acc2 + acc3 + acc4 + acc5 + acc6 + acc7;

        // Handle remainder
        for j in (chunks * 8)..cols {
            sum += matrix[row_start + j] * x[j];
        }

        y[i] = sum;
    }
}
|
||||
|
||||
/// Unrolled squared norm with 4 accumulators
///
/// `chunks_exact(4)` guarantees full chunks (letting LLVM drop bounds
/// checks and vectorize); the sub-4 tail from `.remainder()` is folded in
/// scalar-wise. Note: float summation order differs from `norm_sq_naive`,
/// so results may differ in the last ulps.
#[inline(never)]
fn norm_sq_unrolled(v: &[f32]) -> f32 {
    let chunks = v.chunks_exact(4);
    let remainder = chunks.remainder();

    let mut acc0 = 0.0f32;
    let mut acc1 = 0.0f32;
    let mut acc2 = 0.0f32;
    let mut acc3 = 0.0f32;

    for chunk in chunks {
        acc0 += chunk[0] * chunk[0];
        acc1 += chunk[1] * chunk[1];
        acc2 += chunk[2] * chunk[2];
        acc3 += chunk[3] * chunk[3];
    }

    let mut sum = acc0 + acc1 + acc2 + acc3;
    for &x in remainder {
        sum += x * x;
    }
    sum
}
|
||||
|
||||
/// Unrolled squared norm with 8 accumulators (better for wider SIMD)
///
/// Same scheme as `norm_sq_unrolled` but 8-wide, matching AVX2's f32x8
/// lane count; the accumulator array mirrors one vector register.
#[inline(never)]
fn norm_sq_unrolled_8(v: &[f32]) -> f32 {
    let chunks = v.chunks_exact(8);
    let remainder = chunks.remainder();

    let mut acc = [0.0f32; 8];

    for chunk in chunks {
        acc[0] += chunk[0] * chunk[0];
        acc[1] += chunk[1] * chunk[1];
        acc[2] += chunk[2] * chunk[2];
        acc[3] += chunk[3] * chunk[3];
        acc[4] += chunk[4] * chunk[4];
        acc[5] += chunk[5] * chunk[5];
        acc[6] += chunk[6] * chunk[6];
        acc[7] += chunk[7] * chunk[7];
    }

    let mut sum: f32 = acc.iter().sum();
    for &x in remainder {
        sum += x * x;
    }
    sum
}
|
||||
|
||||
/// Iterator-based squared norm (relies on auto-vectorization)
///
/// The idiomatic one-liner form — kept exactly as written so the benchmark
/// measures what the optimizer does with it unaided.
#[inline(never)]
fn norm_sq_iter(v: &[f32]) -> f32 {
    v.iter().map(|x| x * x).sum()
}
|
||||
|
||||
/// Unrolled dot product
///
/// 4-wide unroll with independent accumulators over exact chunks of both
/// inputs; the tails are zipped so mismatched lengths truncate rather than
/// panic. Unroll shape is the measured artifact — do not restructure.
#[inline(never)]
fn dot_unrolled(a: &[f32], b: &[f32]) -> f32 {
    let chunks_a = a.chunks_exact(4);
    let chunks_b = b.chunks_exact(4);
    let rem_a = chunks_a.remainder();
    let rem_b = chunks_b.remainder();

    let mut acc0 = 0.0f32;
    let mut acc1 = 0.0f32;
    let mut acc2 = 0.0f32;
    let mut acc3 = 0.0f32;

    for (ca, cb) in chunks_a.zip(chunks_b) {
        acc0 += ca[0] * cb[0];
        acc1 += ca[1] * cb[1];
        acc2 += ca[2] * cb[2];
        acc3 += ca[3] * cb[3];
    }

    let mut sum = acc0 + acc1 + acc2 + acc3;
    for (&a, &b) in rem_a.iter().zip(rem_b.iter()) {
        sum += a * b;
    }
    sum
}
|
||||
|
||||
/// Unrolled residual norm
///
/// |a - b|^2 with a 4-wide unroll: four differences are computed, then four
/// independent squared accumulations, which the compiler can fuse into
/// vector sub + FMA. Tails are zipped (truncating on length mismatch).
#[inline(never)]
fn residual_norm_unrolled(a: &[f32], b: &[f32]) -> f32 {
    let chunks_a = a.chunks_exact(4);
    let chunks_b = b.chunks_exact(4);
    let rem_a = chunks_a.remainder();
    let rem_b = chunks_b.remainder();

    let mut acc0 = 0.0f32;
    let mut acc1 = 0.0f32;
    let mut acc2 = 0.0f32;
    let mut acc3 = 0.0f32;

    for (ca, cb) in chunks_a.zip(chunks_b) {
        let d0 = ca[0] - cb[0];
        let d1 = ca[1] - cb[1];
        let d2 = ca[2] - cb[2];
        let d3 = ca[3] - cb[3];
        acc0 += d0 * d0;
        acc1 += d1 * d1;
        acc2 += d2 * d2;
        acc3 += d3 * d3;
    }

    let mut sum = acc0 + acc1 + acc2 + acc3;
    for (&a, &b) in rem_a.iter().zip(rem_b.iter()) {
        let d = a - b;
        sum += d * d;
    }
    sum
}
|
||||
|
||||
/// Batch residual with unrolled inner loop
///
/// Same contract as `batch_residual_naive`, delegating the per-pair work to
/// `residual_norm_unrolled`.
#[inline(never)]
fn batch_residual_unrolled(sources: &[Vec<f32>], targets: &[Vec<f32>]) -> f32 {
    let mut total = 0.0f32;
    for (src, tgt) in sources.iter().zip(targets.iter()) {
        total += residual_norm_unrolled(src, tgt);
    }
    total
}
|
||||
|
||||
// ============================================================================
|
||||
// EXPLICIT SIMD (when wide crate is available)
|
||||
// ============================================================================
|
||||
|
||||
#[cfg(feature = "simd")]
mod simd_impl {
    //! Explicit 8-lane SIMD variants built on the `wide` crate's `f32x8`.
    //! Each mirrors the contract of its scalar counterpart; full 8-element
    //! chunks go through vector ops and the tail is handled scalar-wise.
    use wide::f32x8;

    /// SIMD squared norm using f32x8
    #[inline(never)]
    pub fn norm_sq_simd(v: &[f32]) -> f32 {
        let chunks = v.chunks_exact(8);
        let remainder = chunks.remainder();

        let mut acc = f32x8::ZERO;

        for chunk in chunks {
            // chunks_exact guarantees len == 8, so the array conversion
            // cannot fail.
            let vals = f32x8::from(<[f32; 8]>::try_from(chunk).unwrap());
            acc += vals * vals;
        }

        // Horizontal sum of the 8 lanes, then fold in the scalar tail.
        let mut sum: f32 = acc.reduce_add();
        for &x in remainder {
            sum += x * x;
        }
        sum
    }

    /// SIMD dot product using f32x8
    #[inline(never)]
    pub fn dot_simd(a: &[f32], b: &[f32]) -> f32 {
        let chunks_a = a.chunks_exact(8);
        let chunks_b = b.chunks_exact(8);
        let rem_a = chunks_a.remainder();
        let rem_b = chunks_b.remainder();

        let mut acc = f32x8::ZERO;

        for (ca, cb) in chunks_a.zip(chunks_b) {
            let va = f32x8::from(<[f32; 8]>::try_from(ca).unwrap());
            let vb = f32x8::from(<[f32; 8]>::try_from(cb).unwrap());
            acc += va * vb;
        }

        let mut sum: f32 = acc.reduce_add();
        for (&a, &b) in rem_a.iter().zip(rem_b.iter()) {
            sum += a * b;
        }
        sum
    }

    /// SIMD residual norm using f32x8
    #[inline(never)]
    pub fn residual_norm_simd(a: &[f32], b: &[f32]) -> f32 {
        let chunks_a = a.chunks_exact(8);
        let chunks_b = b.chunks_exact(8);
        let rem_a = chunks_a.remainder();
        let rem_b = chunks_b.remainder();

        let mut acc = f32x8::ZERO;

        for (ca, cb) in chunks_a.zip(chunks_b) {
            let va = f32x8::from(<[f32; 8]>::try_from(ca).unwrap());
            let vb = f32x8::from(<[f32; 8]>::try_from(cb).unwrap());
            let diff = va - vb;
            acc += diff * diff;
        }

        let mut sum: f32 = acc.reduce_add();
        for (&a, &b) in rem_a.iter().zip(rem_b.iter()) {
            let d = a - b;
            sum += d * d;
        }
        sum
    }

    /// SIMD matrix-vector multiply
    ///
    /// Row-major `rows x cols` matrix; each row's dot product with `x` is
    /// vectorized 8 lanes at a time.
    #[inline(never)]
    pub fn matmul_simd(matrix: &[f32], x: &[f32], y: &mut [f32], rows: usize, cols: usize) {
        for i in 0..rows {
            let row_start = i * cols;
            let row = &matrix[row_start..row_start + cols];

            let chunks_m = row.chunks_exact(8);
            let chunks_x = x.chunks_exact(8);
            let rem_m = chunks_m.remainder();
            let rem_x = chunks_x.remainder();

            let mut acc = f32x8::ZERO;

            for (cm, cx) in chunks_m.zip(chunks_x) {
                let vm = f32x8::from(<[f32; 8]>::try_from(cm).unwrap());
                let vx = f32x8::from(<[f32; 8]>::try_from(cx).unwrap());
                acc += vm * vx;
            }

            let mut sum: f32 = acc.reduce_add();
            for (&m, &xv) in rem_m.iter().zip(rem_x.iter()) {
                sum += m * xv;
            }

            y[i] = sum;
        }
    }

    /// SIMD batch residual
    #[inline(never)]
    pub fn batch_residual_simd(sources: &[Vec<f32>], targets: &[Vec<f32>]) -> f32 {
        let mut total = 0.0f32;
        for (src, tgt) in sources.iter().zip(targets.iter()) {
            total += residual_norm_simd(src, tgt);
        }
        total
    }
}
|
||||
|
||||
// ============================================================================
|
||||
// DENSE MATRIX MULTIPLY BENCHMARKS
|
||||
// ============================================================================
|
||||
|
||||
/// Benchmark square dense matrix-vector multiply: naive vs unrolled vs
/// (feature-gated) explicit SIMD. Throughput is reported as matrix elements
/// (rows * cols) processed per iteration; `black_box(y[0])` keeps the
/// result live so the multiply is not optimized away.
fn bench_dense_matmul(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_matmul");

    // Test matrix sizes: 64x64, 128x128, 256x256
    for size in [64, 128, 256] {
        let matrix = generate_matrix(size, size, 42);
        let x = generate_vec(size, 123);
        let mut y = vec![0.0f32; size];

        group.throughput(Throughput::Elements((size * size) as u64));

        group.bench_with_input(BenchmarkId::new("naive", size), &size, |b, _| {
            b.iter(|| {
                matmul_naive(black_box(&matrix), black_box(&x), &mut y, size, size);
                black_box(y[0])
            })
        });

        group.bench_with_input(BenchmarkId::new("unrolled", size), &size, |b, _| {
            b.iter(|| {
                matmul_unrolled(black_box(&matrix), black_box(&x), &mut y, size, size);
                black_box(y[0])
            })
        });

        // Only compiled when the `wide`-backed SIMD module is enabled.
        #[cfg(feature = "simd")]
        group.bench_with_input(BenchmarkId::new("simd", size), &size, |b, _| {
            b.iter(|| {
                simd_impl::matmul_simd(black_box(&matrix), black_box(&x), &mut y, size, size);
                black_box(y[0])
            })
        });
    }

    group.finish();
}
|
||||
|
||||
/// Benchmark non-square matrix multiply (projection)
///
/// Rectangular matmul matching restriction-map projections: the matrix is
/// `out_dim x in_dim` (row-major), so `matmul_*` is called with
/// `rows = out_dim, cols = in_dim`. Benchmark IDs are labeled
/// "{in_dim}x{out_dim}".
fn bench_projection_matmul(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_matmul_projection");

    // Common projection sizes in coherence: 64->32, 128->64, 256->128
    for (in_dim, out_dim) in [(64, 32), (128, 64), (256, 128)] {
        let matrix = generate_matrix(out_dim, in_dim, 42);
        let x = generate_vec(in_dim, 123);
        let mut y = vec![0.0f32; out_dim];

        group.throughput(Throughput::Elements((out_dim * in_dim) as u64));

        group.bench_with_input(
            BenchmarkId::new("naive", format!("{}x{}", in_dim, out_dim)),
            &(in_dim, out_dim),
            |b, _| {
                b.iter(|| {
                    matmul_naive(black_box(&matrix), black_box(&x), &mut y, out_dim, in_dim);
                    black_box(y[0])
                })
            },
        );

        group.bench_with_input(
            BenchmarkId::new("unrolled", format!("{}x{}", in_dim, out_dim)),
            &(in_dim, out_dim),
            |b, _| {
                b.iter(|| {
                    matmul_unrolled(black_box(&matrix), black_box(&x), &mut y, out_dim, in_dim);
                    black_box(y[0])
                })
            },
        );

        #[cfg(feature = "simd")]
        group.bench_with_input(
            BenchmarkId::new("simd", format!("{}x{}", in_dim, out_dim)),
            &(in_dim, out_dim),
            |b, _| {
                b.iter(|| {
                    simd_impl::matmul_simd(
                        black_box(&matrix),
                        black_box(&x),
                        &mut y,
                        out_dim,
                        in_dim,
                    );
                    black_box(y[0])
                })
            },
        );
    }

    group.finish();
}
|
||||
|
||||
// ============================================================================
|
||||
// NORM COMPUTATION BENCHMARKS
|
||||
// ============================================================================
|
||||
|
||||
/// Benchmark squared-norm computation across all implementation shapes:
/// naive scalar loop, iterator one-liner, 4- and 8-wide manual unrolls, and
/// (feature-gated) explicit f32x8 SIMD. Throughput is elements per second.
fn bench_norm_computation(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_norm");

    // Test dimensions aligned for SIMD
    for dim in [64, 128, 256, 512, 1024] {
        let v = generate_vec(dim, 42);

        group.throughput(Throughput::Elements(dim as u64));

        group.bench_with_input(BenchmarkId::new("naive", dim), &dim, |b, _| {
            b.iter(|| black_box(norm_sq_naive(black_box(&v))))
        });

        group.bench_with_input(BenchmarkId::new("iter", dim), &dim, |b, _| {
            b.iter(|| black_box(norm_sq_iter(black_box(&v))))
        });

        group.bench_with_input(BenchmarkId::new("unrolled_4", dim), &dim, |b, _| {
            b.iter(|| black_box(norm_sq_unrolled(black_box(&v))))
        });

        group.bench_with_input(BenchmarkId::new("unrolled_8", dim), &dim, |b, _| {
            b.iter(|| black_box(norm_sq_unrolled_8(black_box(&v))))
        });

        #[cfg(feature = "simd")]
        group.bench_with_input(BenchmarkId::new("simd_f32x8", dim), &dim, |b, _| {
            b.iter(|| black_box(simd_impl::norm_sq_simd(black_box(&v))))
        });
    }

    group.finish();
}
|
||||
|
||||
// ============================================================================
|
||||
// DOT PRODUCT BENCHMARKS
|
||||
// ============================================================================
|
||||
|
||||
/// Benchmark dot product: naive vs 4-wide unrolled vs (feature-gated) SIMD.
/// `b_iter` names the Bencher to avoid shadowing the data vector `b`.
fn bench_dot_product(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_dot");

    for dim in [64, 256, 1024] {
        let a = generate_vec(dim, 42);
        let b = generate_vec(dim, 123);

        group.throughput(Throughput::Elements(dim as u64));

        group.bench_with_input(BenchmarkId::new("naive", dim), &dim, |b_iter, _| {
            b_iter.iter(|| black_box(dot_naive(black_box(&a), black_box(&b))))
        });

        group.bench_with_input(BenchmarkId::new("unrolled", dim), &dim, |b_iter, _| {
            b_iter.iter(|| black_box(dot_unrolled(black_box(&a), black_box(&b))))
        });

        #[cfg(feature = "simd")]
        group.bench_with_input(BenchmarkId::new("simd", dim), &dim, |b_iter, _| {
            b_iter.iter(|| black_box(simd_impl::dot_simd(black_box(&a), black_box(&b))))
        });
    }

    group.finish();
}
|
||||
|
||||
// ============================================================================
|
||||
// RESIDUAL NORM BENCHMARKS (CORE COHERENCE OPERATION)
|
||||
// ============================================================================
|
||||
|
||||
/// Benchmark the core coherence operation |a - b|^2 in its three shapes:
/// naive scalar loop, 4-wide unrolled, and (feature-gated) f32x8 SIMD.
fn bench_residual_norm(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_residual_norm");

    for dim in [64, 256, 1024] {
        let a = generate_vec(dim, 42);
        let b = generate_vec(dim, 123);

        group.throughput(Throughput::Elements(dim as u64))

;

        group.bench_with_input(BenchmarkId::new("naive", dim), &dim, |b_iter, _| {
            b_iter.iter(|| black_box(residual_norm_naive(black_box(&a), black_box(&b))))
        });

        group.bench_with_input(BenchmarkId::new("unrolled", dim), &dim, |b_iter, _| {
            b_iter.iter(|| black_box(residual_norm_unrolled(black_box(&a), black_box(&b))))
        });

        #[cfg(feature = "simd")]
        group.bench_with_input(BenchmarkId::new("simd", dim), &dim, |b_iter, _| {
            b_iter.iter(|| black_box(simd_impl::residual_norm_simd(black_box(&a), black_box(&b))))
        });
    }

    group.finish();
}
|
||||
|
||||
// ============================================================================
|
||||
// BATCH RESIDUAL BENCHMARKS
|
||||
// ============================================================================
|
||||
|
||||
/// Benchmark batched residual reduction over `batch_size` vector pairs of
/// fixed dimension 64: naive vs unrolled vs (feature-gated) SIMD. Sources
/// and targets use disjoint seed ranges so the pairs differ. Throughput is
/// pairs (elements) per second.
fn bench_batch_residual(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_batch_residual");

    let dim = 64;

    for batch_size in [100, 1000, 10000] {
        let sources: Vec<Vec<f32>> = (0..batch_size)
            .map(|i| generate_vec(dim, i as u64))
            .collect();
        // Offset the seed so targets never alias a source vector.
        let targets: Vec<Vec<f32>> = (0..batch_size)
            .map(|i| generate_vec(dim, i as u64 + 10000))
            .collect();

        group.throughput(Throughput::Elements(batch_size as u64));

        group.bench_with_input(
            BenchmarkId::new("naive", batch_size),
            &batch_size,
            |b, _| {
                b.iter(|| {
                    black_box(batch_residual_naive(
                        black_box(&sources),
                        black_box(&targets),
                    ))
                })
            },
        );

        group.bench_with_input(
            BenchmarkId::new("unrolled", batch_size),
            &batch_size,
            |b, _| {
                b.iter(|| {
                    black_box(batch_residual_unrolled(
                        black_box(&sources),
                        black_box(&targets),
                    ))
                })
            },
        );

        #[cfg(feature = "simd")]
        group.bench_with_input(BenchmarkId::new("simd", batch_size), &batch_size, |b, _| {
            b.iter(|| {
                black_box(simd_impl::batch_residual_simd(
                    black_box(&sources),
                    black_box(&targets),
                ))
            })
        });
    }

    group.finish();
}
|
||||
|
||||
// ============================================================================
|
||||
// MEMORY ALIGNMENT BENCHMARKS
|
||||
// ============================================================================
|
||||
|
||||
/// Measure how vector length relative to the 8-wide chunking affects
/// `norm_sq_unrolled_8`: an exact multiple of 8 (256), a length with a
/// 3-element scalar tail (259), and a vector shorter than one chunk (7,
/// which exercises only the remainder path).
fn bench_alignment_impact(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_alignment");

    let dim = 256;

    // Aligned (multiple of 8)
    {
        let v = generate_vec(dim, 42);
        group.bench_function("aligned_256", |b| {
            b.iter(|| black_box(norm_sq_unrolled_8(black_box(&v))))
        });
    }

    // Misaligned (not multiple of 8)
    {
        let v = generate_vec(dim + 3, 42);
        group.bench_function("misaligned_259", |b| {
            b.iter(|| black_box(norm_sq_unrolled_8(black_box(&v))))
        });
    }

    // Small vector (below SIMD threshold)
    {
        let v = generate_vec(7, 42);
        group.bench_function("small_7", |b| {
            b.iter(|| black_box(norm_sq_unrolled_8(black_box(&v))))
        });
    }

    group.finish();
}
|
||||
|
||||
// ============================================================================
|
||||
// THROUGHPUT SCALING BENCHMARKS
|
||||
// ============================================================================
|
||||
|
||||
/// Sweep residual-norm throughput over vector sizes 16..4096 to find where
/// SIMD wins over the unrolled form. Throughput is reported in bytes read
/// (two f32 vectors per call).
fn bench_throughput_scaling(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_throughput_scaling");

    // Test how throughput scales with vector size
    let sizes = [16, 32, 64, 128, 256, 512, 1024, 2048, 4096];

    for &size in &sizes {
        let a = generate_vec(size, 42);
        let b = generate_vec(size, 123);

        group.throughput(Throughput::Bytes((size * 4 * 2) as u64)); // 2 vectors, 4 bytes each

        group.bench_with_input(
            BenchmarkId::new("residual_unrolled", size),
            &size,
            |bench, _| {
                bench.iter(|| black_box(residual_norm_unrolled(black_box(&a), black_box(&b))))
            },
        );

        #[cfg(feature = "simd")]
        group.bench_with_input(
            BenchmarkId::new("residual_simd", size),
            &size,
            |bench, _| {
                bench
                    .iter(|| black_box(simd_impl::residual_norm_simd(black_box(&a), black_box(&b))))
            },
        );
    }

    group.finish();
}
|
||||
|
||||
// ============================================================================
|
||||
// COHERENCE-SPECIFIC SIMD PATTERNS
|
||||
// ============================================================================
|
||||
|
||||
/// Fused multiply-add pattern for coherence energy
|
||||
fn bench_fma_pattern(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("simd_fma_pattern");
|
||||
|
||||
let dim = 256;
|
||||
let a = generate_vec(dim, 42);
|
||||
let b = generate_vec(dim, 123);
|
||||
let weight = 1.5f32;
|
||||
|
||||
// Without FMA (separate multiply and add)
|
||||
group.bench_function("separate_ops", |bench| {
|
||||
bench.iter(|| {
|
||||
let mut sum = 0.0f32;
|
||||
for i in 0..dim {
|
||||
let diff = a[i] - b[i];
|
||||
let sq = diff * diff;
|
||||
sum += sq;
|
||||
}
|
||||
black_box(weight * sum)
|
||||
})
|
||||
});
|
||||
|
||||
// With potential FMA (compiler may optimize)
|
||||
group.bench_function("fma_friendly", |bench| {
|
||||
bench.iter(|| {
|
||||
let mut acc0 = 0.0f32;
|
||||
let mut acc1 = 0.0f32;
|
||||
let mut acc2 = 0.0f32;
|
||||
let mut acc3 = 0.0f32;
|
||||
|
||||
let chunks = dim / 4;
|
||||
for c in 0..chunks {
|
||||
let base = c * 4;
|
||||
let d0 = a[base] - b[base];
|
||||
let d1 = a[base + 1] - b[base + 1];
|
||||
let d2 = a[base + 2] - b[base + 2];
|
||||
let d3 = a[base + 3] - b[base + 3];
|
||||
|
||||
// These can become FMA operations
|
||||
acc0 = d0.mul_add(d0, acc0);
|
||||
acc1 = d1.mul_add(d1, acc1);
|
||||
acc2 = d2.mul_add(d2, acc2);
|
||||
acc3 = d3.mul_add(d3, acc3);
|
||||
}
|
||||
|
||||
black_box(weight * (acc0 + acc1 + acc2 + acc3))
|
||||
})
|
||||
});
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// CRITERION CONFIGURATION
|
||||
// ============================================================================
|
||||
|
||||
// Benchmarks are grouped by theme; all four groups run under one harness.
criterion_group!(matmul_benches, bench_dense_matmul, bench_projection_matmul,);

criterion_group!(
    vector_ops_benches,
    bench_norm_computation,
    bench_dot_product,
    bench_residual_norm,
);

criterion_group!(batch_benches, bench_batch_residual,);

criterion_group!(
    optimization_benches,
    bench_alignment_impact,
    bench_throughput_scaling,
    bench_fma_pattern,
);

criterion_main!(
    matmul_benches,
    vector_ops_benches,
    batch_benches,
    optimization_benches
);
|
||||
549
vendor/ruvector/crates/prime-radiant/benches/sona_bench.rs
vendored
Normal file
549
vendor/ruvector/crates/prime-radiant/benches/sona_bench.rs
vendored
Normal file
@@ -0,0 +1,549 @@
|
||||
//! Benchmarks for SONA Micro-LoRA instant adaptation
|
||||
//!
|
||||
//! ADR-014 Performance Target: < 0.05ms (50us) for instant adaptation
|
||||
//!
|
||||
//! SONA provides self-optimizing threshold tuning with:
|
||||
//! - Micro-LoRA: Ultra-low rank (1-2) for instant learning
|
||||
//! - Base-LoRA: Standard LoRA for background learning
|
||||
//! - EWC++: Elastic Weight Consolidation to prevent forgetting
|
||||
|
||||
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
|
||||
|
||||
// ============================================================================
|
||||
// SONA Types (Simulated for benchmarking)
|
||||
// ============================================================================
|
||||
|
||||
/// Micro-LoRA layer (rank 1-2 for instant adaptation)
///
/// Holds a factored weight delta `scale * B @ A` applied additively to the
/// identity (see `apply`: `y = x + scale * B @ A @ x`).
pub struct MicroLoRA {
    /// Low-rank factor A (dim x rank)
    // Flattened row-major: element (i, r) lives at a[i * rank + r]
    // (indexing as used in `apply`).
    pub a: Vec<f32>,
    /// Low-rank factor B (rank x dim)
    // Flattened row-major: element (r, i) lives at b[r * dim + i].
    pub b: Vec<f32>,
    /// Scaling factor
    // Multiplies the low-rank delta before it is added to the input.
    pub scale: f32,
    /// Input dimension
    pub dim: usize,
    /// Rank (typically 1-2)
    pub rank: usize,
}
|
||||
|
||||
impl MicroLoRA {
|
||||
pub fn new(dim: usize, rank: usize) -> Self {
|
||||
// Initialize with small random values
|
||||
let a: Vec<f32> = (0..dim * rank)
|
||||
.map(|i| ((i as f32 * 0.1234).sin() * 0.01))
|
||||
.collect();
|
||||
let b: Vec<f32> = (0..rank * dim)
|
||||
.map(|i| ((i as f32 * 0.5678).cos() * 0.01))
|
||||
.collect();
|
||||
|
||||
Self {
|
||||
a,
|
||||
b,
|
||||
scale: 0.1,
|
||||
dim,
|
||||
rank,
|
||||
}
|
||||
}
|
||||
|
||||
/// Apply micro-LoRA transform: y = x + scale * B @ A @ x
|
||||
#[inline]
|
||||
pub fn apply(&self, input: &[f32], output: &mut [f32]) {
|
||||
debug_assert_eq!(input.len(), self.dim);
|
||||
debug_assert_eq!(output.len(), self.dim);
|
||||
|
||||
// Copy input to output first (identity component)
|
||||
output.copy_from_slice(input);
|
||||
|
||||
// Compute A @ x -> hidden (rank-dimensional)
|
||||
let mut hidden = vec![0.0f32; self.rank];
|
||||
for r in 0..self.rank {
|
||||
for i in 0..self.dim {
|
||||
hidden[r] += self.a[i * self.rank + r] * input[i];
|
||||
}
|
||||
}
|
||||
|
||||
// Compute B @ hidden and add to output
|
||||
for i in 0..self.dim {
|
||||
let mut delta = 0.0f32;
|
||||
for r in 0..self.rank {
|
||||
delta += self.b[r * self.dim + i] * hidden[r];
|
||||
}
|
||||
output[i] += self.scale * delta;
|
||||
}
|
||||
}
|
||||
|
||||
/// Apply with pre-allocated hidden buffer (zero allocation)
|
||||
#[inline]
|
||||
pub fn apply_zero_alloc(&self, input: &[f32], hidden: &mut [f32], output: &mut [f32]) {
|
||||
debug_assert_eq!(hidden.len(), self.rank);
|
||||
|
||||
// Copy input
|
||||
output.copy_from_slice(input);
|
||||
|
||||
// A @ x
|
||||
hidden.fill(0.0);
|
||||
for r in 0..self.rank {
|
||||
for i in 0..self.dim {
|
||||
hidden[r] += self.a[i * self.rank + r] * input[i];
|
||||
}
|
||||
}
|
||||
|
||||
// B @ hidden
|
||||
for i in 0..self.dim {
|
||||
let mut delta = 0.0f32;
|
||||
for r in 0..self.rank {
|
||||
delta += self.b[r * self.dim + i] * hidden[r];
|
||||
}
|
||||
output[i] += self.scale * delta;
|
||||
}
|
||||
}
|
||||
|
||||
/// Update weights from gradient (instant learning)
|
||||
#[inline]
|
||||
pub fn update(&mut self, grad_a: &[f32], grad_b: &[f32], learning_rate: f32) {
|
||||
for i in 0..self.a.len() {
|
||||
self.a[i] -= learning_rate * grad_a[i];
|
||||
}
|
||||
for i in 0..self.b.len() {
|
||||
self.b[i] -= learning_rate * grad_b[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Base-LoRA layer (higher rank for background learning)
///
/// Same residual form as `MicroLoRA` (`y = x + scale * B @ A @ x`)
/// but intended for larger ranks and a smaller default scale.
pub struct BaseLoRA {
    pub a: Vec<f32>,
    pub b: Vec<f32>,
    pub scale: f32,
    pub dim: usize,
    pub rank: usize,
}

impl BaseLoRA {
    /// Build a layer with deterministically seeded small weights.
    pub fn new(dim: usize, rank: usize) -> Self {
        Self {
            a: (0..dim * rank)
                .map(|i| (i as f32 * 0.3456).sin() * 0.01)
                .collect(),
            b: (0..rank * dim)
                .map(|i| (i as f32 * 0.7890).cos() * 0.01)
                .collect(),
            scale: 0.05,
            dim,
            rank,
        }
    }

    /// Apply the residual transform: `y = x + scale * B @ A @ x`.
    #[inline]
    pub fn apply(&self, input: &[f32], output: &mut [f32]) {
        // Identity path.
        output.copy_from_slice(input);

        // Down-projection: hidden = A @ x.
        let mut hidden = vec![0.0f32; self.rank];
        for (r, h) in hidden.iter_mut().enumerate() {
            *h = input
                .iter()
                .enumerate()
                .map(|(i, &x)| self.a[i * self.rank + r] * x)
                .sum();
        }

        // Up-projection + residual add.
        for (i, out) in output.iter_mut().enumerate() {
            let delta: f32 = hidden
                .iter()
                .enumerate()
                .map(|(r, &h)| self.b[r * self.dim + i] * h)
                .sum();
            *out += self.scale * delta;
        }
    }
}
|
||||
|
||||
/// EWC++ weight importance
///
/// Diagonal Fisher approximation plus the anchor weights from previously
/// consolidated tasks; `penalty` is the quadratic cost of drifting away
/// from those anchors.
pub struct EwcPlusPlus {
    /// Fisher information diagonal
    pub fisher: Vec<f32>,
    /// Optimal weights from previous tasks
    pub optimal_weights: Vec<f32>,
    /// Regularization strength
    pub lambda: f32,
}

impl EwcPlusPlus {
    /// Start with uniform importance (1.0) and zero anchor weights.
    pub fn new(param_count: usize, lambda: f32) -> Self {
        Self {
            fisher: vec![1.0; param_count],
            optimal_weights: vec![0.0; param_count],
            lambda,
        }
    }

    /// Quadratic penalty: lambda/2 * sum_i F_i * (w_i - w*_i)^2.
    /// Trailing entries beyond the shorter side are ignored.
    #[inline]
    pub fn penalty(&self, weights: &[f32]) -> f32 {
        let acc: f32 = weights
            .iter()
            .zip(self.fisher.iter().zip(self.optimal_weights.iter()))
            .map(|(&w, (&f, &opt))| {
                let diff = w - opt;
                f * diff * diff
            })
            .sum();
        self.lambda * 0.5 * acc
    }

    /// Online consolidation: blend new Fisher estimates into the running
    /// average (0.9 old / 0.1 new) and re-anchor the optimal weights.
    pub fn consolidate(&mut self, weights: &[f32], new_fisher: &[f32]) {
        for ((f, opt), (&nf, &w)) in self
            .fisher
            .iter_mut()
            .zip(self.optimal_weights.iter_mut())
            .zip(new_fisher.iter().zip(weights.iter()))
        {
            *f = 0.9 * *f + 0.1 * nf;
            *opt = w;
        }
    }
}
|
||||
|
||||
/// Trajectory step for learning
|
||||
/// One recorded step of a learning trajectory.
#[derive(Clone)]
pub struct TrajectoryStep {
    pub state: Vec<f32>,
    pub action_embedding: Vec<f32>,
    pub reward: f32,
}

/// Accumulates steps between `begin_trajectory` and `end_trajectory`.
pub struct TrajectoryBuilder {
    pub initial_state: Vec<f32>,
    pub steps: Vec<TrajectoryStep>,
}

impl TrajectoryBuilder {
    /// Start an empty trajectory anchored at `initial_state`.
    pub fn new(initial_state: Vec<f32>) -> Self {
        Self {
            initial_state,
            steps: Vec::new(),
        }
    }

    /// Record one (state, action, reward) observation.
    pub fn add_step(&mut self, state: Vec<f32>, action: Vec<f32>, reward: f32) {
        let step = TrajectoryStep {
            state,
            action_embedding: action,
            reward,
        };
        self.steps.push(step);
    }
}
|
||||
|
||||
/// SONA engine (simplified for benchmarking)
///
/// Bundles the instant-adaptation path (micro-LoRA, rank 2), the
/// background path (base-LoRA, rank 8), and EWC++ regularization
/// state behind a single facade.
pub struct SonaEngine {
    pub micro_lora: MicroLoRA,
    pub base_lora: BaseLoRA,
    pub ewc: EwcPlusPlus,
    // Shared input/output dimensionality of both LoRA layers.
    pub dim: usize,
}

impl SonaEngine {
    /// Build an engine with fixed ranks (micro = 2, base = 8) and an
    /// EWC parameter budget covering both LoRA A/B factor pairs.
    pub fn new(dim: usize) -> Self {
        let micro_rank = 2;
        let base_rank = 8;
        // A and B factors for each layer: dim * rank floats apiece.
        let param_count = dim * micro_rank * 2 + dim * base_rank * 2;

        Self {
            micro_lora: MicroLoRA::new(dim, micro_rank),
            base_lora: BaseLoRA::new(dim, base_rank),
            ewc: EwcPlusPlus::new(param_count, 0.4),
            dim,
        }
    }

    /// Begin trajectory
    pub fn begin_trajectory(&self, initial_state: Vec<f32>) -> TrajectoryBuilder {
        TrajectoryBuilder::new(initial_state)
    }

    /// End trajectory and trigger learning
    ///
    /// NOTE(review): the pseudo-gradient is already scaled by `lr`, and
    /// `update` multiplies by `lr` again, so the effective step is
    /// lr^2 * w — confirm the double scaling is intended. The builder's
    /// recorded steps are not consulted by this simplified update;
    /// negative rewards clamp `lr` to 0 (no-op update).
    pub fn end_trajectory(&mut self, builder: TrajectoryBuilder, final_reward: f32) {
        // Simplified learning: update micro-LoRA based on reward
        let lr = 0.001 * final_reward.max(0.0);

        // Pseudo-gradient (simplified)
        let grad_a: Vec<f32> = self.micro_lora.a.iter().map(|w| w * lr).collect();
        let grad_b: Vec<f32> = self.micro_lora.b.iter().map(|w| w * lr).collect();

        self.micro_lora.update(&grad_a, &grad_b, lr);
    }

    /// Apply micro-LoRA (instant)
    #[inline]
    pub fn apply_micro(&self, input: &[f32], output: &mut [f32]) {
        self.micro_lora.apply(input, output);
    }

    /// Apply base-LoRA (background)
    pub fn apply_base(&self, input: &[f32], output: &mut [f32]) {
        self.base_lora.apply(input, output);
    }

    /// Apply both LoRAs combined
    ///
    /// Sequential composition: base-LoRA sees the micro-LoRA output.
    /// Note this allocates a dim-sized intermediate on every call.
    pub fn apply_combined(&self, input: &[f32], output: &mut [f32]) {
        // Apply micro first
        let mut intermediate = vec![0.0f32; self.dim];
        self.micro_lora.apply(input, &mut intermediate);
        // Then base
        self.base_lora.apply(&intermediate, output);
    }
}
|
||||
|
||||
// ============================================================================
|
||||
// Benchmarks
|
||||
// ============================================================================
|
||||
|
||||
/// Deterministic pseudo-random state vector: element i is
/// sin(seed * 0.123 + i * 0.456), so identical (dim, seed) pairs
/// always produce identical inputs across benchmark runs.
fn generate_state(dim: usize, seed: u64) -> Vec<f32> {
    let mut state = Vec::with_capacity(dim);
    for i in 0..dim {
        state.push((seed as f32 * 0.123 + i as f32 * 0.456).sin());
    }
    state
}
|
||||
|
||||
/// Benchmark Micro-LoRA application (target: <50us)
fn bench_micro_lora_apply(c: &mut Criterion) {
    let mut group = c.benchmark_group("sona_micro_lora_apply");
    group.throughput(Throughput::Elements(1));

    // Scaling with input dimension at fixed rank 2.
    for dim in [64, 128, 256, 512] {
        let lora = MicroLoRA::new(dim, 2); // Rank 2
        let input = generate_state(dim, 42);
        let mut output = vec![0.0f32; dim];

        group.bench_with_input(BenchmarkId::new("dim", dim), &dim, |b, _| {
            b.iter(|| lora.apply(black_box(&input), black_box(&mut output)))
        });
    }

    // Different ranks
    let dim = 256;
    for rank in [1, 2, 4] {
        let lora = MicroLoRA::new(dim, rank);
        let input = generate_state(dim, 42);
        let mut output = vec![0.0f32; dim];

        group.bench_with_input(BenchmarkId::new("rank", rank), &rank, |b, _| {
            b.iter(|| lora.apply(black_box(&input), black_box(&mut output)))
        });
    }

    group.finish();
}

/// Benchmark zero-allocation Micro-LoRA
///
/// Same math as `bench_micro_lora_apply` but with the hidden buffer
/// pre-allocated, isolating the per-call allocation cost.
fn bench_micro_lora_zero_alloc(c: &mut Criterion) {
    let mut group = c.benchmark_group("sona_micro_lora_zero_alloc");
    group.throughput(Throughput::Elements(1));

    for dim in [64, 128, 256, 512] {
        let lora = MicroLoRA::new(dim, 2);
        let input = generate_state(dim, 42);
        let mut hidden = vec![0.0f32; 2];
        let mut output = vec![0.0f32; dim];

        group.bench_with_input(BenchmarkId::new("dim", dim), &dim, |b, _| {
            b.iter(|| {
                lora.apply_zero_alloc(
                    black_box(&input),
                    black_box(&mut hidden),
                    black_box(&mut output),
                )
            })
        });
    }

    group.finish();
}

/// Benchmark Base-LoRA application
fn bench_base_lora_apply(c: &mut Criterion) {
    let mut group = c.benchmark_group("sona_base_lora_apply");
    group.throughput(Throughput::Elements(1));

    // Scaling with input dimension at fixed rank 8.
    for dim in [64, 128, 256, 512] {
        let lora = BaseLoRA::new(dim, 8); // Rank 8
        let input = generate_state(dim, 42);
        let mut output = vec![0.0f32; dim];

        group.bench_with_input(BenchmarkId::new("dim", dim), &dim, |b, _| {
            b.iter(|| lora.apply(black_box(&input), black_box(&mut output)))
        });
    }

    // Different ranks
    let dim = 256;
    for rank in [4, 8, 16, 32] {
        let lora = BaseLoRA::new(dim, rank);
        let input = generate_state(dim, 42);
        let mut output = vec![0.0f32; dim];

        group.bench_with_input(BenchmarkId::new("rank", rank), &rank, |b, _| {
            b.iter(|| lora.apply(black_box(&input), black_box(&mut output)))
        });
    }

    group.finish();
}

/// Benchmark EWC++ penalty computation
fn bench_ewc_penalty(c: &mut Criterion) {
    let mut group = c.benchmark_group("sona_ewc_penalty");
    group.throughput(Throughput::Elements(1));

    for param_count in [1000, 10000, 100000] {
        let ewc = EwcPlusPlus::new(param_count, 0.4);
        let weights: Vec<f32> = (0..param_count).map(|i| (i as f32 * 0.001).sin()).collect();

        group.bench_with_input(
            BenchmarkId::new("params", param_count),
            &param_count,
            |b, _| b.iter(|| black_box(ewc.penalty(black_box(&weights)))),
        );
    }

    group.finish();
}

/// Benchmark EWC++ consolidation
fn bench_ewc_consolidate(c: &mut Criterion) {
    let mut group = c.benchmark_group("sona_ewc_consolidate");

    for param_count in [1000, 10000, 100000] {
        let mut ewc = EwcPlusPlus::new(param_count, 0.4);
        let weights: Vec<f32> = (0..param_count).map(|i| (i as f32 * 0.001).sin()).collect();
        let new_fisher: Vec<f32> = (0..param_count)
            .map(|i| (i as f32 * 0.002).cos().abs())
            .collect();

        group.bench_with_input(
            BenchmarkId::new("params", param_count),
            &param_count,
            |b, _| b.iter(|| ewc.consolidate(black_box(&weights), black_box(&new_fisher))),
        );
    }

    group.finish();
}

/// Benchmark full trajectory learning cycle
///
/// NOTE: the engine mutates across iterations (each end_trajectory
/// updates micro-LoRA weights); the workload stays constant in size.
fn bench_trajectory_learning(c: &mut Criterion) {
    let mut group = c.benchmark_group("sona_trajectory_learning");

    let dim = 256;
    let mut engine = SonaEngine::new(dim);

    // Single step trajectory
    group.bench_function("single_step_trajectory", |b| {
        b.iter(|| {
            let mut builder = engine.begin_trajectory(generate_state(dim, 42));
            builder.add_step(generate_state(dim, 43), vec![], 0.8);
            engine.end_trajectory(builder, black_box(0.85));
        })
    });

    // Multi-step trajectory
    group.bench_function("10_step_trajectory", |b| {
        b.iter(|| {
            let mut builder = engine.begin_trajectory(generate_state(dim, 42));
            for i in 0..10 {
                builder.add_step(generate_state(dim, 43 + i), vec![], 0.5 + (i as f32) * 0.05);
            }
            engine.end_trajectory(builder, black_box(0.9));
        })
    });

    group.finish();
}

/// Benchmark combined LoRA application
///
/// Compares micro-only, base-only, and sequential micro-then-base paths
/// at each dimension.
fn bench_combined_lora(c: &mut Criterion) {
    let mut group = c.benchmark_group("sona_combined_lora");

    for dim in [64, 128, 256, 512] {
        let engine = SonaEngine::new(dim);
        let input = generate_state(dim, 42);
        let mut output = vec![0.0f32; dim];

        // Micro only
        group.bench_with_input(BenchmarkId::new("micro_only", dim), &dim, |b, _| {
            b.iter(|| engine.apply_micro(black_box(&input), black_box(&mut output)))
        });

        // Base only
        group.bench_with_input(BenchmarkId::new("base_only", dim), &dim, |b, _| {
            b.iter(|| engine.apply_base(black_box(&input), black_box(&mut output)))
        });

        // Combined
        group.bench_with_input(BenchmarkId::new("combined", dim), &dim, |b, _| {
            b.iter(|| engine.apply_combined(black_box(&input), black_box(&mut output)))
        });
    }

    group.finish();
}

/// Benchmark batch inference
///
/// Throughput of repeated micro-LoRA application over pre-generated
/// inputs; elements/sec is normalized by batch size.
fn bench_batch_inference(c: &mut Criterion) {
    let mut group = c.benchmark_group("sona_batch_inference");

    let dim = 256;
    let engine = SonaEngine::new(dim);

    for batch_size in [1, 10, 100, 1000] {
        let inputs: Vec<Vec<f32>> = (0..batch_size)
            .map(|i| generate_state(dim, i as u64))
            .collect();
        let mut outputs: Vec<Vec<f32>> = (0..batch_size).map(|_| vec![0.0f32; dim]).collect();

        group.throughput(Throughput::Elements(batch_size as u64));
        group.bench_with_input(
            BenchmarkId::new("batch", batch_size),
            &batch_size,
            |b, _| {
                b.iter(|| {
                    for (input, output) in inputs.iter().zip(outputs.iter_mut()) {
                        engine.apply_micro(input, output);
                    }
                    black_box(outputs.len())
                })
            },
        );
    }

    group.finish();
}

/// Benchmark weight update (instant learning)
///
/// NOTE: weights drift across iterations (each update subtracts the
/// same gradients again); the per-iteration work is unaffected.
fn bench_weight_update(c: &mut Criterion) {
    let mut group = c.benchmark_group("sona_weight_update");

    for dim in [64, 128, 256, 512] {
        let mut lora = MicroLoRA::new(dim, 2);
        let grad_a: Vec<f32> = (0..dim * 2).map(|i| (i as f32 * 0.001).sin()).collect();
        let grad_b: Vec<f32> = (0..2 * dim).map(|i| (i as f32 * 0.002).cos()).collect();

        group.bench_with_input(BenchmarkId::new("dim", dim), &dim, |b, _| {
            b.iter(|| {
                lora.update(black_box(&grad_a), black_box(&grad_b), black_box(0.001));
            })
        });
    }

    group.finish();
}

criterion_group!(
    benches,
    bench_micro_lora_apply,
    bench_micro_lora_zero_alloc,
    bench_base_lora_apply,
    bench_ewc_penalty,
    bench_ewc_consolidate,
    bench_trajectory_learning,
    bench_combined_lora,
    bench_batch_inference,
    bench_weight_update,
);

criterion_main!(benches);
|
||||
663
vendor/ruvector/crates/prime-radiant/benches/tile_bench.rs
vendored
Normal file
663
vendor/ruvector/crates/prime-radiant/benches/tile_bench.rs
vendored
Normal file
@@ -0,0 +1,663 @@
|
||||
//! Benchmarks for 256-tile parallel tick
|
||||
//!
|
||||
//! ADR-014 Performance Target: < 1ms for 256-tile parallel tick
|
||||
//!
|
||||
//! The cognitum-gate-kernel provides 256 WASM tiles, each maintaining
|
||||
//! a local graph shard with E-value accumulation and witness fragments.
|
||||
|
||||
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
|
||||
|
||||
// ============================================================================
|
||||
// Tile Types (Simulated, matching cognitum-gate-kernel structure)
|
||||
// ============================================================================
|
||||
|
||||
/// Maximum delta buffer per tile
pub const MAX_DELTA_BUFFER: usize = 64;
/// Number of tiles in fabric
pub const NUM_TILES: usize = 256;
/// Maximum vertices per shard
pub const MAX_SHARD_VERTICES: usize = 256;
/// Maximum edges per shard
pub const MAX_SHARD_EDGES: usize = 1024;

/// Delta operation type
#[derive(Clone, Copy)]
pub enum DeltaType {
    /// Insert an edge between `source` and `target`.
    EdgeAdd,
    /// Remove an edge (ignored by the simulated tick).
    EdgeRemove,
    /// Evidence observation for a vertex; payload 1 = positive.
    Observation,
    /// Adjust an edge weight (ignored by the simulated tick).
    WeightUpdate,
}

/// Delta (change event) for tile
#[derive(Clone, Copy)]
pub struct Delta {
    pub delta_type: DeltaType,
    pub source: u16,
    pub target: u16,
    pub weight: u16,
    pub payload: u32,
}

impl Delta {
    /// Edge-insertion delta.
    pub fn edge_add(src: u16, tgt: u16, weight: u16) -> Self {
        Self {
            delta_type: DeltaType::EdgeAdd,
            source: src,
            target: tgt,
            weight,
            payload: 0,
        }
    }

    /// Observation delta; `positive` is encoded in `payload` (1/0).
    pub fn observation(vertex: u16, positive: bool) -> Self {
        Self {
            delta_type: DeltaType::Observation,
            source: vertex,
            target: 0,
            weight: 0,
            payload: positive as u32,
        }
    }
}

/// Compact vertex state
#[derive(Clone, Copy, Default)]
pub struct VertexState {
    pub degree: u8,
    pub component_id: u8,
    pub active: bool,
    pub energy_contrib: f32,
}

impl VertexState {
    pub fn is_active(&self) -> bool {
        self.active
    }
}

/// Compact edge
#[derive(Clone, Copy, Default)]
pub struct CompactEdge {
    pub source: u16,
    pub target: u16,
    pub weight: u16,
    pub active: bool,
}

impl CompactEdge {
    pub fn is_active(&self) -> bool {
        self.active
    }
}

/// Compact graph for single tile
///
/// Fixed-capacity vertex/edge arrays; vertices become active the first
/// time an edge touches them.
pub struct CompactGraph {
    pub vertices: [VertexState; MAX_SHARD_VERTICES],
    pub edges: [CompactEdge; MAX_SHARD_EDGES],
    pub edge_count: usize,
    // NOTE(review): never updated by add_edge — confirm whether a live
    // vertex count is needed or the field can be dropped.
    pub vertex_count: usize,
    pub component_count: u8,
}

impl CompactGraph {
    pub fn new() -> Self {
        Self {
            vertices: [VertexState::default(); MAX_SHARD_VERTICES],
            edges: [CompactEdge::default(); MAX_SHARD_EDGES],
            edge_count: 0,
            vertex_count: 0,
            component_count: 0,
        }
    }

    /// Insert an edge, activating both endpoints.
    ///
    /// Returns `false` (instead of panicking, as the previous version
    /// did) when the edge table is full or either endpoint index is
    /// outside the shard's vertex range. Vertex degrees saturate at
    /// `u8::MAX` rather than overflowing.
    pub fn add_edge(&mut self, src: u16, tgt: u16, weight: u16) -> bool {
        if self.edge_count >= MAX_SHARD_EDGES
            || src as usize >= MAX_SHARD_VERTICES
            || tgt as usize >= MAX_SHARD_VERTICES
        {
            return false;
        }

        // Activate vertices; saturate degree instead of overflowing u8.
        for &v in &[src, tgt] {
            let vs = &mut self.vertices[v as usize];
            vs.active = true;
            vs.degree = vs.degree.saturating_add(1);
        }

        // Add edge
        self.edges[self.edge_count] = CompactEdge {
            source: src,
            target: tgt,
            weight,
            active: true,
        };
        self.edge_count += 1;

        true
    }

    /// Recompute `component_count`: the number of connected components
    /// among the active vertices.
    ///
    /// Uses union-find with path halving. (The previous implementation
    /// only assigned `parent[s] = parent[t]` per edge, which is not a
    /// union of roots and over-counted components for chains such as
    /// 0-1, 1-2.)
    pub fn recompute_components(&mut self) {
        let mut parent = [0u16; MAX_SHARD_VERTICES];
        for (i, p) in parent.iter_mut().enumerate() {
            *p = i as u16;
        }

        // Root lookup with path halving to keep trees shallow.
        fn find(parent: &mut [u16; MAX_SHARD_VERTICES], mut x: usize) -> usize {
            while parent[x] as usize != x {
                parent[x] = parent[parent[x] as usize];
                x = parent[x] as usize;
            }
            x
        }

        // Union the endpoints of every active edge.
        for e in 0..self.edge_count {
            let edge = self.edges[e];
            if edge.active {
                let root_s = find(&mut parent, edge.source as usize);
                let root_t = find(&mut parent, edge.target as usize);
                if root_s != root_t {
                    parent[root_s] = root_t as u16;
                }
            }
        }

        // One component per distinct root among active vertices.
        let mut seen = [false; MAX_SHARD_VERTICES];
        let mut count = 0u8;
        for i in 0..MAX_SHARD_VERTICES {
            if self.vertices[i].active {
                let root = find(&mut parent, i);
                if !seen[root] {
                    seen[root] = true;
                    count = count.saturating_add(1);
                }
            }
        }
        self.component_count = count;
    }

    /// Sum of active edge weights scaled to energy units (weight/100).
    pub fn compute_total_energy(&self) -> f32 {
        let mut energy = 0.0f32;
        for edge in &self.edges[..self.edge_count] {
            if edge.active {
                // Simplified: weight as energy contribution
                energy += edge.weight as f32 / 100.0;
            }
        }
        energy
    }
}
|
||||
|
||||
/// E-value accumulator (log-space evidence)
///
/// Stores fixed-point log2 e-values (stored value / 65536 = log2 of the
/// e-value) for up to `capacity` hypotheses; updates saturate instead
/// of wrapping.
pub struct EvidenceAccumulator {
    /// Log e-value (fixed-point: value / 65536 = log2(e-value))
    pub log_e_values: Vec<i32>,
    pub hypothesis_count: usize,
}

impl EvidenceAccumulator {
    /// Accumulator with room for `capacity` hypotheses, all at log-e 0.
    pub fn new(capacity: usize) -> Self {
        Self {
            log_e_values: vec![0; capacity],
            hypothesis_count: 0,
        }
    }

    /// Register a new hypothesis slot and return its index.
    ///
    /// At capacity the count stays put and the (out-of-range) index is
    /// still returned; `update` ignores such indices.
    pub fn add_hypothesis(&mut self) -> usize {
        let idx = self.hypothesis_count;
        if idx < self.log_e_values.len() {
            self.hypothesis_count += 1;
        }
        idx
    }

    /// Add a log likelihood-ratio to hypothesis `idx` (saturating).
    /// Indices past the registered count are silently ignored.
    #[inline]
    pub fn update(&mut self, idx: usize, log_lr: i32) {
        if idx >= self.hypothesis_count {
            return;
        }
        let slot = &mut self.log_e_values[idx];
        *slot = slot.saturating_add(log_lr);
    }

    /// Total log e-value across all registered hypotheses.
    pub fn global_log_e(&self) -> i64 {
        let mut total = 0i64;
        for &v in &self.log_e_values[..self.hypothesis_count] {
            total += v as i64;
        }
        total
    }
}
|
||||
|
||||
/// Tile report (output of tick)
///
/// Per-tile snapshot produced by `TileState::tick`; aggregated across
/// tiles by `CoherenceFabric::aggregate_reports`.
#[derive(Clone, Copy)]
pub struct TileReport {
    pub tile_id: u8,
    pub tick: u32,
    // True when the shard has at most one connected component.
    pub connected: bool,
    pub component_count: u8,
    // Fixed-point sum of the tile's hypothesis log e-values.
    pub log_e_value: i64,
    pub energy: f32,
    // Order-sensitive hash derived from (tile_id, tick, edge_count).
    pub witness_hash: u64,
}

impl TileReport {
    /// Default report for a fresh, untouched tile: connected, one
    /// component, zero evidence and energy.
    pub fn new(tile_id: u8) -> Self {
        Self {
            tile_id,
            tick: 0,
            connected: true,
            component_count: 1,
            log_e_value: 0,
            energy: 0.0,
            witness_hash: 0,
        }
    }
}
|
||||
|
||||
/// Single tile state
///
/// Owns one graph shard, an evidence accumulator, and a bounded buffer
/// of pending deltas that is drained on each `tick`.
pub struct TileState {
    pub tile_id: u8,
    pub graph: CompactGraph,
    pub evidence: EvidenceAccumulator,
    // Pending change events; capped at MAX_DELTA_BUFFER.
    pub delta_buffer: Vec<Delta>,
    pub tick_count: u32,
}

impl TileState {
    /// Fresh tile with an empty shard and room for 64 hypotheses.
    pub fn new(tile_id: u8) -> Self {
        Self {
            tile_id,
            graph: CompactGraph::new(),
            evidence: EvidenceAccumulator::new(64),
            delta_buffer: Vec::with_capacity(MAX_DELTA_BUFFER),
            tick_count: 0,
        }
    }

    /// Queue a delta for the next tick; returns false (drops the delta)
    /// when the buffer is already full.
    pub fn ingest_delta(&mut self, delta: &Delta) -> bool {
        if self.delta_buffer.len() >= MAX_DELTA_BUFFER {
            return false;
        }
        self.delta_buffer.push(*delta);
        true
    }

    /// Drain pending deltas, refresh derived state, and emit a report.
    ///
    /// EdgeRemove and WeightUpdate deltas are currently ignored;
    /// Observation deltas only update hypothesis 0 (and only if at
    /// least one hypothesis is registered).
    pub fn tick(&mut self, tick_number: u32) -> TileReport {
        // Process pending deltas
        for delta in self.delta_buffer.drain(..) {
            match delta.delta_type {
                DeltaType::EdgeAdd => {
                    self.graph
                        .add_edge(delta.source, delta.target, delta.weight);
                }
                DeltaType::Observation => {
                    // Update evidence accumulator
                    // +/- 1.0 in the fixed-point log scale (65536 = 1.0).
                    let log_lr = if delta.payload != 0 { 65536 } else { -65536 };
                    if self.evidence.hypothesis_count > 0 {
                        self.evidence.update(0, log_lr);
                    }
                }
                _ => {}
            }
        }

        // Recompute components if needed
        self.graph.recompute_components();

        // Compute energy
        let energy = self.graph.compute_total_energy();

        // Build report
        self.tick_count = tick_number;
        TileReport {
            tile_id: self.tile_id,
            tick: tick_number,
            connected: self.graph.component_count <= 1,
            component_count: self.graph.component_count,
            log_e_value: self.evidence.global_log_e(),
            energy,
            witness_hash: self.compute_witness_hash(),
        }
    }

    // FNV-style mix over (tile_id, tick_count, edge_count); evidence
    // and weights deliberately do not feed the hash.
    fn compute_witness_hash(&self) -> u64 {
        let mut hash = self.tile_id as u64;
        hash = hash.wrapping_mul(0x517cc1b727220a95);
        hash ^= self.tick_count as u64;
        hash = hash.wrapping_mul(0x517cc1b727220a95);
        hash ^= self.graph.edge_count as u64;
        hash
    }

    /// Clear graph, pending deltas, and tick counter.
    /// Note the evidence accumulator is intentionally left untouched.
    pub fn reset(&mut self) {
        self.graph = CompactGraph::new();
        self.delta_buffer.clear();
        self.tick_count = 0;
    }
}
|
||||
|
||||
/// 256-tile coherence fabric
///
/// Owns NUM_TILES independent tile shards; deltas are routed to tiles
/// by node id modulo the tile count.
pub struct CoherenceFabric {
    pub tiles: Vec<TileState>,
}

impl CoherenceFabric {
    /// Fabric of NUM_TILES fresh tiles with ids 0..NUM_TILES.
    pub fn new() -> Self {
        Self {
            tiles: (0..NUM_TILES).map(|i| TileState::new(i as u8)).collect(),
        }
    }

    /// Execute tick on all tiles sequentially
    ///
    /// Serves as the single-threaded baseline for the parallel-tick
    /// performance target; returns one report per tile, in tile order.
    pub fn tick_sequential(&mut self, tick_number: u32) -> Vec<TileReport> {
        self.tiles.iter_mut().map(|t| t.tick(tick_number)).collect()
    }

    /// Aggregate reports into global coherence
    ///
    /// Sums energy and log e-values, ANDs connectivity, and folds the
    /// per-tile witness hashes in report order (the fold is
    /// order-sensitive by construction).
    pub fn aggregate_reports(reports: &[TileReport]) -> FabricReport {
        let total_energy: f32 = reports.iter().map(|r| r.energy).sum();
        let total_log_e: i64 = reports.iter().map(|r| r.log_e_value).sum();
        let all_connected = reports.iter().all(|r| r.connected);

        // Compute global witness hash
        let mut global_hash = 0u64;
        for r in reports {
            global_hash = global_hash.wrapping_mul(0x517cc1b727220a95);
            global_hash ^= r.witness_hash;
        }

        FabricReport {
            tick: reports.first().map(|r| r.tick).unwrap_or(0),
            total_energy,
            total_log_e,
            all_connected,
            global_witness_hash: global_hash,
        }
    }

    /// Distribute delta to appropriate tile
    ///
    /// Routing is node_id % NUM_TILES; a full target buffer silently
    /// drops the delta (ingest_delta's return value is discarded).
    pub fn distribute_delta(&mut self, node_id: u64, delta: &Delta) {
        let tile_id = (node_id % NUM_TILES as u64) as usize;
        self.tiles[tile_id].ingest_delta(delta);
    }
}

/// Aggregated fabric report
///
/// Global rollup of one tick across every tile in the fabric.
pub struct FabricReport {
    pub tick: u32,
    pub total_energy: f32,
    pub total_log_e: i64,
    pub all_connected: bool,
    pub global_witness_hash: u64,
}
|
||||
|
||||
// ============================================================================
|
||||
// Benchmarks
|
||||
// ============================================================================
|
||||
|
||||
/// Benchmark single tile tick
fn bench_single_tile_tick(c: &mut Criterion) {
    let mut group = c.benchmark_group("tile_single_tick");
    group.throughput(Throughput::Elements(1));

    // Empty tick
    let mut tile = TileState::new(0);
    group.bench_function("empty", |b| b.iter(|| black_box(tile.tick(black_box(1)))));

    // Tick with small graph
    let mut tile = TileState::new(0);
    for i in 0..20u16 {
        tile.ingest_delta(&Delta::edge_add(i, i + 1, 100));
    }
    tile.tick(0);

    group.bench_function("small_graph_20_edges", |b| {
        b.iter(|| black_box(tile.tick(black_box(1))))
    });

    // Tick with pending deltas
    // iter_batched rebuilds the tile each iteration so every measured
    // tick drains a fresh, identically sized delta buffer.
    group.bench_function("with_10_deltas", |b| {
        b.iter_batched(
            || {
                let mut t = TileState::new(0);
                for i in 0..10u16 {
                    t.ingest_delta(&Delta::edge_add(i, i + 1, 100));
                }
                t
            },
            |mut t| black_box(t.tick(1)),
            criterion::BatchSize::SmallInput,
        )
    });

    // Tick with full delta buffer
    group.bench_function("with_64_deltas", |b| {
        b.iter_batched(
            || {
                let mut t = TileState::new(0);
                for i in 0..MAX_DELTA_BUFFER as u16 {
                    t.ingest_delta(&Delta::edge_add(i % 200, (i + 1) % 200, 100));
                }
                t
            },
            |mut t| black_box(t.tick(1)),
            criterion::BatchSize::SmallInput,
        )
    });

    group.finish();
}

/// Benchmark 256-tile parallel tick (sequential baseline)
fn bench_256_tile_tick_sequential(c: &mut Criterion) {
    let mut group = c.benchmark_group("tile_256_sequential");
    group.throughput(Throughput::Elements(NUM_TILES as u64));

    // Empty fabric
    let mut fabric = CoherenceFabric::new();
    group.bench_function("empty_fabric", |b| {
        b.iter(|| black_box(fabric.tick_sequential(black_box(1))))
    });

    // Fabric with some data per tile
    let mut fabric = CoherenceFabric::new();
    for i in 0..NUM_TILES {
        for j in 0..10u16 {
            fabric.tiles[i].ingest_delta(&Delta::edge_add(j, j + 1, 100));
        }
        fabric.tiles[i].tick(0);
    }

    group.bench_function("populated_10_edges_per_tile", |b| {
        b.iter(|| black_box(fabric.tick_sequential(black_box(1))))
    });

    group.finish();
}

/// Benchmark report aggregation
fn bench_report_aggregation(c: &mut Criterion) {
    let mut group = c.benchmark_group("tile_report_aggregation");
    group.throughput(Throughput::Elements(NUM_TILES as u64));

    // Generate 256 reports
    // Synthetic but varied values so the fold can't be const-folded.
    let reports: Vec<TileReport> = (0..NUM_TILES)
        .map(|i| TileReport {
            tile_id: i as u8,
            tick: 1,
            connected: i % 10 != 0,
            component_count: (i % 5) as u8 + 1,
            log_e_value: (i as i64) * 1000 - 128000,
            energy: (i as f32) * 0.1,
            witness_hash: i as u64 * 0x517cc1b727220a95,
        })
        .collect();

    group.bench_function("aggregate_256_reports", |b| {
        b.iter(|| black_box(CoherenceFabric::aggregate_reports(black_box(&reports))))
    });

    group.finish();
}

/// Benchmark delta distribution
///
/// NOTE: tiles are never ticked here, so buffers fill and later
/// distributions measure the cheap drop path as well.
fn bench_delta_distribution(c: &mut Criterion) {
    let mut group = c.benchmark_group("tile_delta_distribution");

    let mut fabric = CoherenceFabric::new();

    // Single delta
    let delta = Delta::edge_add(0, 1, 100);
    group.bench_function("distribute_single", |b| {
        b.iter(|| fabric.distribute_delta(black_box(12345), black_box(&delta)))
    });

    // Batch distribution
    for batch_size in [100, 1000, 10000] {
        let deltas: Vec<(u64, Delta)> = (0..batch_size)
            .map(|i| {
                (
                    i as u64,
                    Delta::edge_add((i % 200) as u16, ((i + 1) % 200) as u16, 100),
                )
            })
            .collect();

        group.throughput(Throughput::Elements(batch_size as u64));
        group.bench_with_input(
            BenchmarkId::new("distribute_batch", batch_size),
            &deltas,
            |b, deltas| {
                b.iter(|| {
                    for (node_id, delta) in deltas {
                        fabric.distribute_delta(*node_id, delta);
                    }
                })
            },
        );
    }

    group.finish();
}

/// Benchmark evidence accumulator
fn bench_evidence_accumulator(c: &mut Criterion) {
    let mut group = c.benchmark_group("tile_evidence");

    let mut acc = EvidenceAccumulator::new(64);
    for _ in 0..16 {
        acc.add_hypothesis();
    }

    // Single update
    group.bench_function("update_single", |b| {
        b.iter(|| acc.update(black_box(5), black_box(65536)))
    });

    // Global e-value computation
    group.bench_function("global_log_e_16_hyp", |b| {
        b.iter(|| black_box(acc.global_log_e()))
    });

    // 64 hypotheses
    let mut acc64 = EvidenceAccumulator::new(64);
    for _ in 0..64 {
        acc64.add_hypothesis();
    }
    for i in 0..64 {
        acc64.log_e_values[i] = (i as i32 - 32) * 1000;
    }

    group.bench_function("global_log_e_64_hyp", |b| {
        b.iter(|| black_box(acc64.global_log_e()))
    });

    group.finish();
}

/// Benchmark component recomputation
///
/// Vertex ids wrap at 200, so larger edge counts mostly add parallel
/// edges rather than new vertices.
fn bench_component_recompute(c: &mut Criterion) {
    let mut group = c.benchmark_group("tile_component_recompute");

    for edge_count in [50, 200, 500, 1000] {
        let mut graph = CompactGraph::new();
        for i in 0..edge_count.min(MAX_SHARD_EDGES) {
            let src = (i % 200) as u16;
            let tgt = ((i + 1) % 200) as u16;
            if src != tgt {
                graph.add_edge(src, tgt, 100);
            }
        }

        group.bench_with_input(
            BenchmarkId::new("recompute", edge_count),
            &edge_count,
            |b, _| {
                b.iter(|| {
                    graph.recompute_components();
                    black_box(graph.component_count)
                })
            },
        );
    }

    group.finish();
}
|
||||
|
||||
/// Benchmark full tick + aggregate cycle
|
||||
fn bench_full_cycle(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("tile_full_cycle");
|
||||
group.sample_size(50);
|
||||
|
||||
// Populate fabric
|
||||
let mut fabric = CoherenceFabric::new();
|
||||
for i in 0..NUM_TILES {
|
||||
for j in 0..50u16 {
|
||||
fabric.tiles[i].ingest_delta(&Delta::edge_add(j, (j + 1) % 200, 100));
|
||||
}
|
||||
fabric.tiles[i].tick(0);
|
||||
}
|
||||
|
||||
group.bench_function("tick_and_aggregate_256_tiles", |b| {
|
||||
let mut tick = 1u32;
|
||||
b.iter(|| {
|
||||
let reports = fabric.tick_sequential(tick);
|
||||
let fabric_report = CoherenceFabric::aggregate_reports(&reports);
|
||||
tick += 1;
|
||||
black_box(fabric_report)
|
||||
})
|
||||
});
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
/// Benchmark memory access patterns
|
||||
fn bench_memory_patterns(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("tile_memory");
|
||||
|
||||
// Sequential tile access
|
||||
let fabric = CoherenceFabric::new();
|
||||
group.bench_function("sequential_tile_scan", |b| {
|
||||
b.iter(|| {
|
||||
let mut total = 0usize;
|
||||
for tile in &fabric.tiles {
|
||||
total += tile.graph.edge_count;
|
||||
}
|
||||
black_box(total)
|
||||
})
|
||||
});
|
||||
|
||||
// Strided tile access
|
||||
group.bench_function("strided_tile_scan", |b| {
|
||||
let stride = 7;
|
||||
b.iter(|| {
|
||||
let mut total = 0usize;
|
||||
let mut idx = 0;
|
||||
for _ in 0..NUM_TILES {
|
||||
total += fabric.tiles[idx % NUM_TILES].graph.edge_count;
|
||||
idx += stride;
|
||||
}
|
||||
black_box(total)
|
||||
})
|
||||
});
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
// Register every tile benchmark with criterion (run order follows this list)
// and emit the harness entry point.
criterion_group!(
    benches,
    bench_single_tile_tick,
    bench_256_tile_tick_sequential,
    bench_report_aggregation,
    bench_delta_distribution,
    bench_evidence_accumulator,
    bench_component_recompute,
    bench_full_cycle,
    bench_memory_patterns,
);

criterion_main!(benches);
|
||||
Reference in New Issue
Block a user