Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'

This commit is contained in:
ruv
2026-02-28 14:39:40 -05:00
7854 changed files with 3522914 additions and 0 deletions

View File

@@ -0,0 +1,15 @@
//! Attention-weighted coherence benchmarks
use criterion::{black_box, criterion_group, criterion_main, Criterion};
/// Placeholder benchmark group for attention-weighted coherence.
///
/// Real attention benchmarks require the `attention` feature; until then this
/// times a trivial `black_box` round-trip so the bench target still builds.
fn attention_benchmark(c: &mut Criterion) {
    let mut bench_group = c.benchmark_group("attention");
    // Placeholder until the attention feature is wired in.
    bench_group.bench_function("placeholder", |bencher| {
        bencher.iter(|| black_box(42))
    });
    bench_group.finish();
}
criterion_group!(benches, attention_benchmark);
criterion_main!(benches);

View File

@@ -0,0 +1,15 @@
//! Coherence engine benchmarks
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
/// Placeholder benchmark group for the coherence engine.
///
/// Will be replaced with real measurements once the coherence module lands;
/// for now it times a trivial `black_box` round-trip so the target builds.
fn coherence_benchmark(c: &mut Criterion) {
    let mut bench_group = c.benchmark_group("coherence");
    // Placeholder until the coherence module is complete.
    bench_group.bench_function("placeholder", |bencher| {
        bencher.iter(|| black_box(42))
    });
    bench_group.finish();
}
criterion_group!(benches, coherence_benchmark);
criterion_main!(benches);

File diff suppressed because it is too large. [Load Diff]

View File

@@ -0,0 +1,546 @@
//! Benchmarks for full graph energy computation
//!
//! ADR-014 Performance Target: < 10ms for 10K nodes
//!
//! Global coherence energy: E(S) = sum(w_e * |r_e|^2)
//! This is the aggregate measure of system incoherence.
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
use std::collections::HashMap;
// ============================================================================
// Graph Types (Simulated for benchmarking)
// ============================================================================
/// Simplified restriction map for energy benchmarks.
///
/// Represents the affine map `y = A * x + b`, with `matrix` stored row-major
/// as `output_dim x input_dim` and `bias` of length `output_dim`.
#[derive(Clone)]
pub struct RestrictionMap {
    pub matrix: Vec<f32>,
    pub bias: Vec<f32>,
    pub input_dim: usize,
    pub output_dim: usize,
}
impl RestrictionMap {
    /// Identity map on a `dim`-dimensional space with zero bias.
    pub fn identity(dim: usize) -> Self {
        let mut matrix = vec![0.0f32; dim * dim];
        // Walk straight down the diagonal: stride dim + 1 in row-major order.
        for diag in matrix.iter_mut().step_by(dim + 1) {
            *diag = 1.0;
        }
        Self {
            matrix,
            bias: vec![0.0; dim],
            input_dim: dim,
            output_dim: dim,
        }
    }
    /// Apply the map to `input`, writing `A * input + bias` into `output`.
    ///
    /// `output` must have length `output_dim`; accumulation order matches a
    /// plain row-by-row dot product.
    #[inline]
    pub fn apply_into(&self, input: &[f32], output: &mut [f32]) {
        output.copy_from_slice(&self.bias);
        for i in 0..self.output_dim {
            let row = &self.matrix[i * self.input_dim..(i + 1) * self.input_dim];
            let acc = &mut output[i];
            for (j, &m) in row.iter().enumerate() {
                *acc += m * input[j];
            }
        }
    }
}
/// Node in sheaf graph
#[derive(Clone)]
pub struct SheafNode {
    /// Stable node identifier; matches the key used in `SheafGraph::nodes`.
    pub id: u64,
    /// Local state vector; length equals the graph's `state_dim`.
    pub state: Vec<f32>,
}
/// Edge with restriction maps
#[derive(Clone)]
pub struct SheafEdge {
    /// Source node id (key into `SheafGraph::nodes`).
    pub source: u64,
    /// Target node id (key into `SheafGraph::nodes`).
    pub target: u64,
    /// Multiplier applied to the squared residual norm.
    pub weight: f32,
    /// Restriction map applied to the source node's state.
    pub rho_source: RestrictionMap,
    /// Restriction map applied to the target node's state.
    pub rho_target: RestrictionMap,
}
impl SheafEdge {
    /// Weighted residual energy of this edge: `w * |rho_s(x_s) - rho_t(x_t)|^2`.
    ///
    /// The caller supplies two scratch buffers (length = the maps' output
    /// dimension) so the hot path performs no allocation.
    #[inline]
    pub fn weighted_residual_energy_into(
        &self,
        source: &[f32],
        target: &[f32],
        source_buf: &mut [f32],
        target_buf: &mut [f32],
    ) -> f32 {
        self.rho_source.apply_into(source, source_buf);
        self.rho_target.apply_into(target, target_buf);
        // Squared Euclidean norm of the residual between the two projections.
        let mut sq_norm = 0.0f32;
        for i in 0..source_buf.len() {
            let residual = source_buf[i] - target_buf[i];
            sq_norm += residual * residual;
        }
        self.weight * sq_norm
    }
}
/// Full sheaf graph for coherence computation
pub struct SheafGraph {
    /// Node states keyed by node id.
    pub nodes: HashMap<u64, SheafNode>,
    /// Edge list; endpoints are keys into `nodes`.
    pub edges: Vec<SheafEdge>,
    /// Length of every node state vector.
    pub state_dim: usize,
}
/// Result of energy computation
pub struct CoherenceEnergy {
    /// Sum of all per-edge energies.
    pub total_energy: f32,
    /// One energy value per edge, in `SheafGraph::edges` order.
    pub edge_energies: Vec<f32>,
}
impl SheafGraph {
/// Generate a random graph for benchmarking
///
/// Node states are deterministic pseudo-random values in [-0.5, 0.5) derived
/// from `seed` via `DefaultHasher`, so repeated runs in the same build see
/// identical graphs. Roughly `num_nodes * avg_degree / 2` edges are drawn;
/// candidates that would be self-loops are dropped, so the realized edge
/// count can be slightly below the target.
pub fn random(num_nodes: usize, avg_degree: usize, state_dim: usize, seed: u64) -> Self {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};
    // Fresh hasher pre-seeded with `seed`; each call site mixes in its own data.
    let mut hasher = || {
        let mut h = DefaultHasher::new();
        seed.hash(&mut h);
        h
    };
    // Generate nodes
    let nodes: HashMap<u64, SheafNode> = (0..num_nodes as u64)
        .map(|id| {
            let state: Vec<f32> = (0..state_dim)
                .map(|i| {
                    let mut h = hasher();
                    (id, i).hash(&mut h);
                    // Map the 64-bit hash onto [-0.5, 0.5) in steps of 1/1000.
                    (h.finish() % 1000) as f32 / 1000.0 - 0.5
                })
                .collect();
            (id, SheafNode { id, state })
        })
        .collect();
    // Generate edges (random graph with target average degree)
    let num_edges = (num_nodes * avg_degree) / 2;
    let mut edges = Vec::with_capacity(num_edges);
    for i in 0..num_edges {
        let mut h = hasher();
        // NOTE(review): the "edge"/"target" labels only decorrelate the two
        // endpoint hashes; `seed` is hashed twice (closure + tuple), which is
        // harmless for determinism.
        (seed, i, "edge").hash(&mut h);
        let source = (h.finish() % num_nodes as u64) as u64;
        let mut h = hasher();
        (seed, i, "target").hash(&mut h);
        let target = (h.finish() % num_nodes as u64) as u64;
        if source != target {
            edges.push(SheafEdge {
                source,
                target,
                weight: 1.0,
                rho_source: RestrictionMap::identity(state_dim),
                rho_target: RestrictionMap::identity(state_dim),
            });
        }
    }
    Self {
        nodes,
        edges,
        state_dim,
    }
}
/// Generate a chain graph (linear topology)
///
/// Nodes 0..n-1 connected as i -> i+1 with identity restriction maps, giving
/// exactly `num_nodes - 1` edges. `num_nodes` must be >= 1: the
/// `num_nodes - 1` edge count underflows at 0.
pub fn chain(num_nodes: usize, state_dim: usize, seed: u64) -> Self {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};
    let nodes: HashMap<u64, SheafNode> = (0..num_nodes as u64)
        .map(|id| {
            let state: Vec<f32> = (0..state_dim)
                .map(|i| {
                    let mut h = DefaultHasher::new();
                    (seed, id, i).hash(&mut h);
                    (h.finish() % 1000) as f32 / 1000.0 - 0.5
                })
                .collect();
            (id, SheafNode { id, state })
        })
        .collect();
    // One edge per consecutive node pair.
    let edges: Vec<SheafEdge> = (0..num_nodes - 1)
        .map(|i| SheafEdge {
            source: i as u64,
            target: (i + 1) as u64,
            weight: 1.0,
            rho_source: RestrictionMap::identity(state_dim),
            rho_target: RestrictionMap::identity(state_dim),
        })
        .collect();
    Self {
        nodes,
        edges,
        state_dim,
    }
}
/// Generate a dense graph (high connectivity)
///
/// Every unordered node pair (i < j) is included with ~30% probability,
/// decided deterministically from `seed`.
pub fn dense(num_nodes: usize, state_dim: usize, seed: u64) -> Self {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};
    let nodes: HashMap<u64, SheafNode> = (0..num_nodes as u64)
        .map(|id| {
            let state: Vec<f32> = (0..state_dim)
                .map(|i| {
                    let mut h = DefaultHasher::new();
                    (seed, id, i).hash(&mut h);
                    (h.finish() % 1000) as f32 / 1000.0 - 0.5
                })
                .collect();
            (id, SheafNode { id, state })
        })
        .collect();
    // Dense: ~30% of possible edges
    let mut edges = Vec::new();
    for i in 0..num_nodes as u64 {
        for j in (i + 1)..num_nodes as u64 {
            let mut h = DefaultHasher::new();
            (seed, i, j).hash(&mut h);
            if h.finish() % 10 < 3 {
                // 30% probability
                edges.push(SheafEdge {
                    source: i,
                    target: j,
                    weight: 1.0,
                    rho_source: RestrictionMap::identity(state_dim),
                    rho_target: RestrictionMap::identity(state_dim),
                });
            }
        }
    }
    Self {
        nodes,
        edges,
        state_dim,
    }
}
/// Compute global coherence energy (sequential).
///
/// Returns the total plus per-edge energies; the two scratch buffers are
/// reused across edges so the loop performs no per-edge allocation.
pub fn compute_energy_sequential(&self) -> CoherenceEnergy {
    let mut buf_a = vec![0.0f32; self.state_dim];
    let mut buf_b = vec![0.0f32; self.state_dim];
    let mut edge_energies = Vec::with_capacity(self.edges.len());
    for edge in &self.edges {
        let src = &self.nodes[&edge.source].state;
        let tgt = &self.nodes[&edge.target].state;
        edge_energies.push(edge.weighted_residual_energy_into(
            src,
            tgt,
            &mut buf_a,
            &mut buf_b,
        ));
    }
    let total_energy = edge_energies.iter().sum();
    CoherenceEnergy {
        total_energy,
        edge_energies,
    }
}
/// Compute global coherence energy (parallel with rayon)
///
/// Unlike the sequential path, each edge task allocates its own pair of
/// scratch buffers, since a single buffer cannot be shared across rayon
/// workers. Only compiled with the `parallel` feature.
#[cfg(feature = "parallel")]
pub fn compute_energy_parallel(&self) -> CoherenceEnergy {
    use rayon::prelude::*;
    let edge_energies: Vec<f32> = self
        .edges
        .par_iter()
        .map(|edge| {
            // Per-task scratch: one allocation pair per edge.
            let mut source_buf = vec![0.0f32; self.state_dim];
            let mut target_buf = vec![0.0f32; self.state_dim];
            let source_state = &self.nodes[&edge.source].state;
            let target_state = &self.nodes[&edge.target].state;
            edge.weighted_residual_energy_into(
                source_state,
                target_state,
                &mut source_buf,
                &mut target_buf,
            )
        })
        .collect();
    let total_energy: f32 = edge_energies.par_iter().sum();
    CoherenceEnergy {
        total_energy,
        edge_energies,
    }
}
/// Compute just the total energy, with no per-edge tracking or allocation
/// beyond the two reused scratch buffers.
pub fn compute_total_energy(&self) -> f32 {
    let mut buf_a = vec![0.0f32; self.state_dim];
    let mut buf_b = vec![0.0f32; self.state_dim];
    self.edges.iter().fold(0.0f32, |acc, edge| {
        let src = &self.nodes[&edge.source].state;
        let tgt = &self.nodes[&edge.target].state;
        acc + edge.weighted_residual_energy_into(src, tgt, &mut buf_a, &mut buf_b)
    })
}
}
// ============================================================================
// Benchmarks
// ============================================================================
/// Benchmark full graph energy at various sizes.
///
/// ADR-014 target: 10K nodes in <10ms. The 100K-node case lives in
/// `bench_large_graph_energy` with a reduced sample count.
fn bench_full_graph_energy(c: &mut Criterion) {
    let mut group = c.benchmark_group("energy_full_graph");
    for &num_nodes in &[100usize, 1_000, 10_000] {
        let graph = SheafGraph::random(num_nodes, 4, 64, 42);
        group.throughput(Throughput::Elements(graph.edges.len() as u64));
        // Full result, including the per-edge energy vector.
        group.bench_with_input(
            BenchmarkId::new("sequential", format!("{}nodes", num_nodes)),
            &num_nodes,
            |b, _| b.iter(|| black_box(graph.compute_energy_sequential())),
        );
        // Total energy only (no per-edge allocation)
        group.bench_with_input(
            BenchmarkId::new("total_only", format!("{}nodes", num_nodes)),
            &num_nodes,
            |b, _| b.iter(|| black_box(graph.compute_total_energy())),
        );
    }
    group.finish();
}
/// Benchmark with 100K nodes; sample size is reduced because each iteration
/// touches ~200K edges.
fn bench_large_graph_energy(c: &mut Criterion) {
    let mut group = c.benchmark_group("energy_large_graph");
    group.sample_size(10);
    let graph = SheafGraph::random(100_000, 4, 64, 42);
    group.throughput(Throughput::Elements(graph.edges.len() as u64));
    group.bench_function("100K_nodes_total_energy", |b| {
        b.iter(|| black_box(graph.compute_total_energy()))
    });
    group.finish();
}
/// Benchmark energy computation for different graph topologies.
fn bench_topology_impact(c: &mut Criterion) {
    let mut group = c.benchmark_group("energy_topology");
    let state_dim = 64;
    // Chain topology: sparse, exactly n-1 edges.
    let chain = SheafGraph::chain(1000, state_dim, 42);
    group.throughput(Throughput::Elements(chain.edges.len() as u64));
    group.bench_function("chain_1000", |b| {
        b.iter(|| black_box(chain.compute_total_energy()))
    });
    // Random topology: average degree 4.
    let random = SheafGraph::random(1000, 4, state_dim, 42);
    group.throughput(Throughput::Elements(random.edges.len() as u64));
    group.bench_function("random_1000_deg4", |b| {
        b.iter(|| black_box(random.compute_total_energy()))
    });
    // Dense topology (~30% of all pairs); smaller node count keeps runtime sane.
    let dense = SheafGraph::dense(100, state_dim, 42);
    group.throughput(Throughput::Elements(dense.edges.len() as u64));
    group.bench_function("dense_100", |b| {
        b.iter(|| black_box(dense.compute_total_energy()))
    });
    group.finish();
}
/// Benchmark impact of the per-node state dimension on energy computation.
fn bench_state_dimension(c: &mut Criterion) {
    let mut group = c.benchmark_group("energy_state_dim");
    for &state_dim in &[8usize, 32, 64, 128, 256] {
        let graph = SheafGraph::random(1000, 4, state_dim, 42);
        group.throughput(Throughput::Elements(graph.edges.len() as u64));
        group.bench_with_input(BenchmarkId::new("dim", state_dim), &state_dim, |b, _| {
            b.iter(|| black_box(graph.compute_total_energy()))
        });
    }
    group.finish();
}
/// Benchmark edge density scaling: node count and state dimension fixed,
/// average degree swept.
fn bench_edge_density(c: &mut Criterion) {
    let mut group = c.benchmark_group("energy_edge_density");
    for &avg_degree in &[2usize, 4, 8, 16, 32] {
        let graph = SheafGraph::random(1000, avg_degree, 64, 42);
        group.throughput(Throughput::Elements(graph.edges.len() as u64));
        group.bench_with_input(
            BenchmarkId::new("avg_degree", avg_degree),
            &avg_degree,
            |b, _| b.iter(|| black_box(graph.compute_total_energy())),
        );
    }
    group.finish();
}
/// Benchmark scope-based energy aggregation.
fn bench_scoped_energy(c: &mut Criterion) {
    let mut group = c.benchmark_group("energy_scoped");
    let state_dim = 64;
    let graph = SheafGraph::random(10_000, 4, state_dim, 42);
    // Simulated scope assignment (e.g. by namespace): edge i -> scope i % 10.
    let num_scopes = 10;
    let scope_assignments: Vec<usize> =
        (0..graph.edges.len()).map(|i| i % num_scopes).collect();
    group.bench_function("aggregate_by_scope", |b| {
        b.iter(|| {
            let mut buf_a = vec![0.0f32; state_dim];
            let mut buf_b = vec![0.0f32; state_dim];
            let mut scope_energies = vec![0.0f32; num_scopes];
            for (edge, &scope) in graph.edges.iter().zip(&scope_assignments) {
                let src = &graph.nodes[&edge.source].state;
                let tgt = &graph.nodes[&edge.target].state;
                scope_energies[scope] +=
                    edge.weighted_residual_energy_into(src, tgt, &mut buf_a, &mut buf_b);
            }
            black_box(scope_energies)
        })
    });
    group.finish();
}
/// Benchmark energy computation combined with a fingerprint over the
/// per-edge energies (rolling XOR + rotate).
fn bench_energy_fingerprint(c: &mut Criterion) {
    let mut group = c.benchmark_group("energy_fingerprint");
    let graph = SheafGraph::random(1000, 4, 64, 42);
    group.bench_function("compute_with_fingerprint", |b| {
        b.iter(|| {
            let energy = graph.compute_energy_sequential();
            // Fold each edge energy's bit pattern into the accumulator.
            let fingerprint = energy
                .edge_energies
                .iter()
                .fold(0u64, |fp, e| (fp ^ e.to_bits() as u64).rotate_left(7));
            black_box((energy.total_energy, fingerprint))
        })
    });
    group.finish();
}
/// Benchmark memory access patterns for energy computation.
///
/// Chain edges visit node states in index order; random edges hop around the
/// node map, stressing cache behavior.
fn bench_memory_patterns(c: &mut Criterion) {
    let mut group = c.benchmark_group("energy_memory");
    let state_dim = 64;
    let chain = SheafGraph::chain(10_000, state_dim, 42);
    group.bench_function("sequential_access", |b| {
        b.iter(|| black_box(chain.compute_total_energy()))
    });
    let random = SheafGraph::random(10_000, 4, state_dim, 42);
    group.bench_function("random_access", |b| {
        b.iter(|| black_box(random.compute_total_energy()))
    });
    group.finish();
}
// Criterion entry point: registers every energy benchmark group above and
// generates main().
criterion_group!(
    benches,
    bench_full_graph_energy,
    bench_large_graph_energy,
    bench_topology_impact,
    bench_state_dimension,
    bench_edge_density,
    bench_scoped_energy,
    bench_energy_fingerprint,
    bench_memory_patterns,
);
criterion_main!(benches);

View File

@@ -0,0 +1,629 @@
//! Benchmarks for coherence gate evaluation
//!
//! ADR-014 Performance Target: < 500us per gate evaluation
//!
//! The gate is a deterministic decision point that:
//! 1. Evaluates current energy against thresholds
//! 2. Checks persistence history
//! 3. Determines compute lane (Reflex/Retrieval/Heavy/Human)
//! 4. Creates witness record
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
use std::collections::VecDeque;
use std::time::Duration;
// ============================================================================
// Types (Simulated for benchmarking)
// ============================================================================
/// Compute lanes for escalating complexity
///
/// The `Ord` derive gives Reflex < Retrieval < Heavy < Human, which the gate
/// relies on for lane comparison and escalation.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum ComputeLane {
    /// Lane 0: Local residual updates (<1ms)
    Reflex = 0,
    /// Lane 1: Evidence fetching (~10ms)
    Retrieval = 1,
    /// Lane 2: Multi-step planning (~100ms)
    Heavy = 2,
    /// Lane 3: Human escalation
    Human = 3,
}
/// Coherence energy snapshot
///
/// Captures a total energy, a per-scope breakdown, and a cheap multiplicative
/// fingerprint that witness records bind into their content hash.
#[derive(Clone)]
pub struct CoherenceEnergy {
    pub total_energy: f32,
    pub scope_energies: Vec<(u64, f32)>, // (scope_id, energy)
    pub timestamp: u64,
    pub fingerprint: u64,
}
impl CoherenceEnergy {
    /// Build a snapshot that splits `total` evenly across `num_scopes` scopes.
    pub fn new(total: f32, num_scopes: usize) -> Self {
        let share = total / num_scopes as f32;
        let scope_energies = (0..num_scopes as u64).map(|id| (id, share)).collect();
        Self {
            total_energy: total,
            scope_energies,
            timestamp: 0,
            fingerprint: (total.to_bits() as u64).wrapping_mul(0x517cc1b727220a95),
        }
    }
    /// Energy for one scope, or 0.0 when the scope is unknown (linear scan).
    pub fn scope_energy(&self, scope_id: u64) -> f32 {
        match self.scope_energies.iter().find(|(id, _)| *id == scope_id) {
            Some(&(_, energy)) => energy,
            None => 0.0,
        }
    }
}
/// Action to be gated
#[derive(Clone)]
pub struct Action {
    /// Caller-assigned action id (not mixed into witness hashes).
    pub id: u64,
    /// Scope whose energy governs this action's lane selection.
    pub scope_id: u64,
    /// Category of the action.
    pub action_type: ActionType,
    /// Hash of the action payload; mixed into witness content hashes.
    pub payload_hash: u64,
}
/// Category of a gated action.
#[derive(Clone, Copy)]
pub enum ActionType {
    Read,
    Write,
    Execute,
    External,
}
/// Threshold configuration
///
/// Energies strictly below `reflex` / `retrieval` / `heavy` select the
/// corresponding lane; anything at or above `heavy` escalates to Human.
/// `persistence_window_ms` bounds the history window used for persistence
/// and trend checks.
#[derive(Clone)]
pub struct ThresholdConfig {
    pub reflex: f32,
    pub retrieval: f32,
    pub heavy: f32,
    pub persistence_window_ms: u64,
}
impl Default for ThresholdConfig {
    fn default() -> Self {
        ThresholdConfig {
            reflex: 0.1,
            retrieval: 0.5,
            heavy: 1.0,
            persistence_window_ms: 5_000,
        }
    }
}
/// Energy history for persistence detection
///
/// Keeps a bounded rolling window of `(timestamp_ms, energy)` samples per
/// scope. Scope ids at or beyond `max_scopes` are silently ignored, matching
/// the gate's best-effort semantics.
pub struct EnergyHistory {
    /// Rolling window of (timestamp_ms, energy) pairs per scope
    history: Vec<VecDeque<(u64, f32)>>,
    max_scopes: usize,
    window_size: usize,
}
impl EnergyHistory {
    /// Create a history covering `max_scopes` scopes, each retaining at most
    /// `window_size` samples.
    pub fn new(max_scopes: usize, window_size: usize) -> Self {
        Self {
            history: (0..max_scopes)
                .map(|_| VecDeque::with_capacity(window_size))
                .collect(),
            max_scopes,
            window_size,
        }
    }
    /// Record a sample, evicting the oldest entry once the window is full.
    /// Out-of-range scope ids are ignored.
    pub fn record(&mut self, scope_id: u64, timestamp_ms: u64, energy: f32) {
        if (scope_id as usize) < self.max_scopes {
            let queue = &mut self.history[scope_id as usize];
            if queue.len() >= self.window_size {
                queue.pop_front();
            }
            queue.push_back((timestamp_ms, energy));
        }
    }
    /// True when every sample inside the trailing `window_ms` window is at or
    /// above `threshold` (and at least one in-window sample exists).
    ///
    /// This sits on the gate hot path (ADR-014: <500us per evaluation), so it
    /// streams over the deque instead of collecting samples into a Vec.
    pub fn is_above_threshold(
        &self,
        scope_id: u64,
        threshold: f32,
        window_ms: u64,
        current_time_ms: u64,
    ) -> bool {
        if (scope_id as usize) >= self.max_scopes {
            return false;
        }
        let cutoff = current_time_ms.saturating_sub(window_ms);
        let mut saw_sample = false;
        for &(ts, energy) in &self.history[scope_id as usize] {
            if ts < cutoff {
                continue;
            }
            if energy < threshold {
                // One sub-threshold sample breaks persistence immediately.
                return false;
            }
            saw_sample = true;
        }
        saw_sample
    }
    /// Simple linear trend over the window: (last - first) / sample_count.
    /// Returns None with fewer than two in-window samples. Allocation-free
    /// for the same hot-path reason as `is_above_threshold`.
    pub fn trend(&self, scope_id: u64, window_ms: u64, current_time_ms: u64) -> Option<f32> {
        if (scope_id as usize) >= self.max_scopes {
            return None;
        }
        let cutoff = current_time_ms.saturating_sub(window_ms);
        let mut first: Option<f32> = None;
        let mut last = 0.0f32;
        let mut count = 0usize;
        for &(ts, energy) in &self.history[scope_id as usize] {
            if ts < cutoff {
                continue;
            }
            if first.is_none() {
                first = Some(energy);
            }
            last = energy;
            count += 1;
        }
        match first {
            Some(first) if count >= 2 => Some((last - first) / count as f32),
            _ => None,
        }
    }
}
/// Witness record for audit
///
/// Immutable evidence of one gate decision; `content_hash` binds the action,
/// the energy fingerprint, and the outcome together.
#[derive(Clone)]
pub struct WitnessRecord {
    pub id: u64,
    pub action_hash: u64,
    pub energy_fingerprint: u64,
    pub lane: ComputeLane,
    pub allowed: bool,
    pub timestamp: u64,
    pub content_hash: u64,
}
impl WitnessRecord {
    /// Assemble a record for one decision; `timestamp` doubles as the id.
    pub fn new(
        action: &Action,
        energy: &CoherenceEnergy,
        lane: ComputeLane,
        allowed: bool,
        timestamp: u64,
    ) -> Self {
        WitnessRecord {
            id: timestamp, // Simplified
            action_hash: action.payload_hash,
            energy_fingerprint: energy.fingerprint,
            lane,
            allowed,
            timestamp,
            content_hash: Self::compute_hash(action, energy, lane, allowed, timestamp),
        }
    }
    /// Mix all decision inputs into one u64 via multiply/XOR rounds
    /// (in production: use Blake3).
    fn compute_hash(
        action: &Action,
        energy: &CoherenceEnergy,
        lane: ComputeLane,
        allowed: bool,
        timestamp: u64,
    ) -> u64 {
        const MIX: u64 = 0x517cc1b727220a95;
        let mut acc = action.payload_hash.wrapping_mul(MIX);
        acc ^= energy.fingerprint;
        acc = acc.wrapping_mul(MIX);
        acc ^= (lane as u64) << 32 | (allowed as u64);
        acc = acc.wrapping_mul(MIX);
        acc ^ timestamp
    }
}
/// Gate decision result
pub struct GateDecision {
    /// Whether the action may proceed (false only for the Human lane).
    pub allow: bool,
    /// Final compute lane after persistence/trend escalation.
    pub lane: ComputeLane,
    /// Audit record created for this decision.
    pub witness: WitnessRecord,
    /// Present when denied, and also when allowed-but-escalated due to
    /// persistent incoherence (see `CoherenceGate::evaluate`).
    pub denial_reason: Option<&'static str>,
}
/// Coherence gate
///
/// Deterministic decision point combining thresholds, per-scope energy
/// history, and a logical clock (`current_time_ms` advances by 1 per
/// evaluation, or via `advance_time`).
pub struct CoherenceGate {
    pub config: ThresholdConfig,
    pub history: EnergyHistory,
    current_time_ms: u64,
}
impl CoherenceGate {
    /// Build a gate with the given thresholds; each scope keeps a
    /// 100-sample energy window.
    pub fn new(config: ThresholdConfig, max_scopes: usize) -> Self {
        Self {
            config,
            history: EnergyHistory::new(max_scopes, 100),
            current_time_ms: 0,
        }
    }
    /// Evaluate whether action should proceed
    ///
    /// Steps: record the scope energy, pick a lane from the thresholds,
    /// escalate to Heavy on persistent or growing incoherence, deny only at
    /// the Human lane, then emit a witness record and advance the logical
    /// clock by 1ms.
    pub fn evaluate(&mut self, action: &Action, energy: &CoherenceEnergy) -> GateDecision {
        let current_energy = energy.scope_energy(action.scope_id);
        // Record in history; the new sample participates in this call's own
        // persistence/trend checks.
        self.history
            .record(action.scope_id, self.current_time_ms, current_energy);
        // Determine lane based on energy
        let lane = if current_energy < self.config.reflex {
            ComputeLane::Reflex
        } else if current_energy < self.config.retrieval {
            ComputeLane::Retrieval
        } else if current_energy < self.config.heavy {
            ComputeLane::Heavy
        } else {
            ComputeLane::Human
        };
        // Check for persistent incoherence: the whole window sat at or above
        // the retrieval threshold.
        let persistent = self.history.is_above_threshold(
            action.scope_id,
            self.config.retrieval,
            self.config.persistence_window_ms,
            self.current_time_ms,
        );
        // Check for growing incoherence (trend)
        let growing = self
            .history
            .trend(
                action.scope_id,
                self.config.persistence_window_ms,
                self.current_time_ms,
            )
            .map(|t| t > 0.01)
            .unwrap_or(false);
        // Escalate if persistent and not already at high lane
        let final_lane = if (persistent || growing) && lane < ComputeLane::Heavy {
            ComputeLane::Heavy
        } else {
            lane
        };
        // Allow unless Human lane
        let allow = final_lane < ComputeLane::Human;
        // NOTE(review): denial_reason is also populated for allowed-but-
        // escalated decisions ("Persistent incoherence"); the field name
        // under-sells that second use — confirm callers treat it as advisory.
        let denial_reason = if !allow {
            Some("Energy exceeds all automatic thresholds")
        } else if persistent {
            Some("Persistent incoherence - escalated")
        } else {
            None
        };
        let witness = WitnessRecord::new(action, energy, final_lane, allow, self.current_time_ms);
        self.current_time_ms += 1;
        GateDecision {
            allow,
            lane: final_lane,
            witness,
            denial_reason,
        }
    }
    /// Fast path evaluation (no history update)
    ///
    /// Pure threshold comparison: no recording, no escalation, no witness.
    #[inline]
    pub fn evaluate_fast(&self, scope_energy: f32) -> ComputeLane {
        if scope_energy < self.config.reflex {
            ComputeLane::Reflex
        } else if scope_energy < self.config.retrieval {
            ComputeLane::Retrieval
        } else if scope_energy < self.config.heavy {
            ComputeLane::Heavy
        } else {
            ComputeLane::Human
        }
    }
    /// Advance time (for benchmarking)
    pub fn advance_time(&mut self, delta_ms: u64) {
        self.current_time_ms += delta_ms;
    }
}
// ============================================================================
// Benchmarks
// ============================================================================
/// Benchmark the full gate evaluation path, one case per compute lane.
fn bench_gate_evaluate(c: &mut Criterion) {
    let mut group = c.benchmark_group("gate_evaluate");
    group.throughput(Throughput::Elements(1));
    let mut gate = CoherenceGate::new(ThresholdConfig::default(), 100);
    let action = Action {
        id: 1,
        scope_id: 0,
        action_type: ActionType::Write,
        payload_hash: 0x12345678,
    };
    // (bench name, total energy driving the lane choice).
    let cases = [
        ("low_energy_reflex", 0.05f32),
        ("medium_energy_retrieval", 0.3),
        ("high_energy_heavy", 0.8),
        ("critical_energy_human", 2.0),
    ];
    for &(name, total) in cases.iter() {
        let energy = CoherenceEnergy::new(total, 10);
        group.bench_function(name, |b| {
            b.iter(|| {
                let decision = gate.evaluate(black_box(&action), black_box(&energy));
                black_box(decision.lane)
            })
        });
    }
    group.finish();
}
/// Benchmark the fast-path lane selection (no history recording).
fn bench_gate_fast_path(c: &mut Criterion) {
    let mut group = c.benchmark_group("gate_fast_path");
    group.throughput(Throughput::Elements(1));
    let gate = CoherenceGate::new(ThresholdConfig::default(), 100);
    // One input per lane region of the threshold space.
    for &energy in &[0.05f32, 0.3, 0.8, 2.0] {
        group.bench_with_input(
            BenchmarkId::new("evaluate_fast", format!("{:.2}", energy)),
            &energy,
            |b, &e| b.iter(|| black_box(gate.evaluate_fast(black_box(e)))),
        );
    }
    group.finish();
}
/// Benchmark witness record creation
///
/// Isolates `WitnessRecord::new` (hash mixing plus struct assembly) from the
/// rest of the gate path.
fn bench_witness_creation(c: &mut Criterion) {
    let mut group = c.benchmark_group("gate_witness");
    group.throughput(Throughput::Elements(1));
    let action = Action {
        id: 1,
        scope_id: 0,
        action_type: ActionType::Write,
        payload_hash: 0x12345678,
    };
    let energy = CoherenceEnergy::new(0.3, 10);
    group.bench_function("create_witness", |b| {
        b.iter(|| {
            WitnessRecord::new(
                black_box(&action),
                black_box(&energy),
                black_box(ComputeLane::Retrieval),
                black_box(true),
                black_box(12345),
            )
        })
    });
    group.finish();
}
/// Benchmark the three EnergyHistory operations against a pre-populated
/// history (10 scopes x 500 samples).
fn bench_history_operations(c: &mut Criterion) {
    let mut group = c.benchmark_group("gate_history");
    let mut history = EnergyHistory::new(100, 1000);
    for scope in 0..10u64 {
        for t in 0..500u64 {
            history.record(scope, t, 0.3 + (t % 10) as f32 * 0.01);
        }
    }
    // Single append (with possible eviction).
    group.bench_function("record_single", |b| {
        let mut t = 1000u64;
        b.iter(|| {
            history.record(black_box(5), black_box(t), black_box(0.35));
            t += 1;
        })
    });
    // Persistence check over a 100ms window.
    group.bench_function("check_threshold", |b| {
        b.iter(|| {
            history.is_above_threshold(black_box(5), black_box(0.3), black_box(100), black_box(500))
        })
    });
    // Linear trend over the same window.
    group.bench_function("compute_trend", |b| {
        b.iter(|| history.trend(black_box(5), black_box(100), black_box(500)))
    });
    group.finish();
}
/// Benchmark persistence detection as the window size grows.
fn bench_persistence_detection(c: &mut Criterion) {
    let mut group = c.benchmark_group("gate_persistence");
    for &window_size in &[10usize, 100, 1000] {
        // Fill one scope with samples above the 0.3 threshold so the check
        // must scan the entire window.
        let mut history = EnergyHistory::new(10, window_size);
        for t in 0..window_size as u64 {
            history.record(0, t, 0.4);
        }
        group.bench_with_input(
            BenchmarkId::new("check_persistent", window_size),
            &window_size,
            |b, &size| {
                b.iter(|| {
                    history.is_above_threshold(
                        black_box(0),
                        black_box(0.3),
                        black_box(size as u64),
                        black_box(size as u64),
                    )
                })
            },
        );
    }
    group.finish();
}
/// Benchmark batch evaluation (multiple actions)
///
/// Actions cycle through 10 scopes; energies sweep 0.1..1.05 so every lane
/// is exercised.
fn bench_batch_evaluation(c: &mut Criterion) {
    let mut group = c.benchmark_group("gate_batch");
    let config = ThresholdConfig::default();
    let mut gate = CoherenceGate::new(config, 100);
    for batch_size in [10, 100, 1000] {
        let actions: Vec<Action> = (0..batch_size)
            .map(|i| Action {
                id: i as u64,
                scope_id: (i % 10) as u64,
                // wrapping_mul: the plain `*` overflows u64 for i >= 4 and
                // panics in debug builds; release semantics are unchanged.
                payload_hash: (i as u64).wrapping_mul(0x517cc1b727220a95),
            action_type: ActionType::Write,
            })
            .collect();
        let energies: Vec<CoherenceEnergy> = (0..batch_size)
            .map(|i| CoherenceEnergy::new(0.1 + (i % 20) as f32 * 0.05, 10))
            .collect();
        group.throughput(Throughput::Elements(batch_size as u64));
        group.bench_with_input(
            BenchmarkId::new("evaluate_batch", batch_size),
            &batch_size,
            |b, _| {
                b.iter(|| {
                    let mut lanes = Vec::with_capacity(actions.len());
                    for (action, energy) in actions.iter().zip(energies.iter()) {
                        let decision = gate.evaluate(action, energy);
                        lanes.push(decision.lane);
                    }
                    black_box(lanes)
                })
            },
        );
    }
    group.finish();
}
/// Benchmark the linear-scan scope energy lookup at increasing scope counts.
fn bench_scope_lookup(c: &mut Criterion) {
    let mut group = c.benchmark_group("gate_scope_lookup");
    for &num_scopes in &[10usize, 100, 1000] {
        let energy = CoherenceEnergy::new(1.0, num_scopes);
        group.bench_with_input(
            BenchmarkId::new("lookup", num_scopes),
            &num_scopes,
            |b, &n| {
                // Probe the middle scope: average cost of the linear scan.
                let scope_id = (n / 2) as u64;
                b.iter(|| black_box(energy.scope_energy(black_box(scope_id))))
            },
        );
    }
    group.finish();
}
/// Benchmark threshold comparison patterns
///
/// Compares the gate's chained `if`/`else` lane selection against a
/// `partition_point` (binary search) formulation over the same 1000-point
/// energy sweep. The two agree on boundaries: an energy exactly equal to a
/// threshold lands in the next lane (`e < t` here, `t <= e` below).
fn bench_threshold_comparison(c: &mut Criterion) {
    let mut group = c.benchmark_group("gate_threshold_cmp");
    let config = ThresholdConfig::default();
    // Sequential if-else (current implementation)
    group.bench_function("sequential_if_else", |b| {
        let energies: Vec<f32> = (0..1000).map(|i| (i as f32) * 0.002).collect();
        b.iter(|| {
            let mut lanes = [0u32; 4];
            for &e in &energies {
                let lane = if e < config.reflex {
                    0
                } else if e < config.retrieval {
                    1
                } else if e < config.heavy {
                    2
                } else {
                    3
                };
                lanes[lane] += 1;
            }
            black_box(lanes)
        })
    });
    // Binary search pattern
    group.bench_function("binary_search", |b| {
        // f32::MAX sentinel keeps partition_point <= 3 for finite energies;
        // `.min(3)` below is a belt-and-braces clamp.
        let thresholds = [config.reflex, config.retrieval, config.heavy, f32::MAX];
        let energies: Vec<f32> = (0..1000).map(|i| (i as f32) * 0.002).collect();
        b.iter(|| {
            let mut lanes = [0u32; 4];
            for &e in &energies {
                let lane = thresholds.partition_point(|&t| t <= e);
                lanes[lane.min(3)] += 1;
            }
            black_box(lanes)
        })
    });
    group.finish();
}
// Criterion entry point: registers every gate benchmark group above and
// generates main().
criterion_group!(
    benches,
    bench_gate_evaluate,
    bench_gate_fast_path,
    bench_witness_creation,
    bench_history_operations,
    bench_persistence_detection,
    bench_batch_evaluation,
    bench_scope_lookup,
    bench_threshold_comparison,
);
criterion_main!(benches);

View File

@@ -0,0 +1,784 @@
//! GPU-Specific Benchmarks for Prime-Radiant Coherence Engine
//!
//! This benchmark suite compares CPU and GPU implementations of core
//! coherence operations. Requires the `gpu` feature to be enabled.
//!
//! ## Benchmark Categories
//! 1. Energy Computation - CPU vs GPU
//! 2. Attention Forward Pass - CPU vs GPU
//! 3. Batch Routing Decisions - CPU vs GPU
//! 4. Memory Transfer Overhead
//!
//! ## GPU Backend Notes
//! - Primary: wgpu (cross-platform WebGPU)
//! - Optional: CUDA (NVIDIA), Metal (Apple), Vulkan
//!
//! ## Running GPU Benchmarks
//! ```bash
//! cargo bench --features gpu --bench gpu_benchmarks
//! ```
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
use std::collections::hash_map::DefaultHasher;
use std::collections::HashMap;
use std::hash::{Hash, Hasher};
// ============================================================================
// TEST DATA GENERATION
// ============================================================================
/// Deterministic pseudo-random vector of `len` values in [-0.5, 0.5),
/// derived from `(seed, index)` via `DefaultHasher`.
fn generate_vec(len: usize, seed: u64) -> Vec<f32> {
    let mut out = Vec::with_capacity(len);
    for i in 0..len {
        let mut hasher = DefaultHasher::new();
        (seed, i).hash(&mut hasher);
        // Quantize the 64-bit hash onto [-0.5, 0.5) in steps of 1/1000.
        out.push((hasher.finish() % 1000) as f32 / 1000.0 - 0.5);
    }
    out
}
fn generate_matrix(rows: usize, cols: usize, seed: u64) -> Vec<f32> {
(0..rows * cols)
.map(|i| {
let mut hasher = DefaultHasher::new();
(seed, i).hash(&mut hasher);
(hasher.finish() % 1000) as f32 / 1000.0 - 0.5
})
.collect()
}
// ============================================================================
// CPU BASELINE IMPLEMENTATIONS
// ============================================================================
/// CPU coherence energy computation
#[derive(Clone)]
struct CpuSheafGraph {
    /// Node state vectors keyed by node id; each has length `state_dim`.
    nodes: HashMap<u64, Vec<f32>>,
    edges: Vec<(u64, u64, f32)>, // (source, target, weight)
    /// Length of every node state vector.
    state_dim: usize,
}
impl CpuSheafGraph {
    /// Build a deterministic random graph; edge candidates that would be
    /// self-loops are dropped, so the realized count can be below the target.
    fn random(num_nodes: usize, avg_degree: usize, state_dim: usize, seed: u64) -> Self {
        let nodes: HashMap<u64, Vec<f32>> = (0..num_nodes as u64)
            .map(|id| (id, generate_vec(state_dim, seed + id)))
            .collect();
        let num_edges = (num_nodes * avg_degree) / 2;
        let edges: Vec<(u64, u64, f32)> = (0..num_edges)
            .filter_map(|i| {
                let mut h = DefaultHasher::new();
                (seed, i, "src").hash(&mut h);
                let source = h.finish() % num_nodes as u64;
                let mut h = DefaultHasher::new();
                (seed, i, "tgt").hash(&mut h);
                let target = h.finish() % num_nodes as u64;
                if source != target {
                    Some((source, target, 1.0))
                } else {
                    None
                }
            })
            .collect();
        Self {
            nodes,
            edges,
            state_dim,
        }
    }
    /// Weighted squared-distance energy of one edge. Shared by both energy
    /// entry points below so the summation logic exists in exactly one place
    /// (it was previously duplicated); accumulation order is unchanged.
    #[inline]
    fn edge_energy(&self, src: u64, tgt: u64, weight: f32) -> f32 {
        let src_state = &self.nodes[&src];
        let tgt_state = &self.nodes[&tgt];
        let mut norm_sq = 0.0f32;
        for i in 0..self.state_dim {
            let diff = src_state[i] - tgt_state[i];
            norm_sq += diff * diff;
        }
        weight * norm_sq
    }
    /// Compute total energy on CPU
    fn compute_energy_cpu(&self) -> f32 {
        let mut total = 0.0f32;
        for &(src, tgt, weight) in &self.edges {
            total += self.edge_energy(src, tgt, weight);
        }
        total
    }
    /// Compute energy with per-edge results on CPU
    fn compute_energy_with_edges_cpu(&self) -> (f32, Vec<f32>) {
        let edge_energies: Vec<f32> = self
            .edges
            .iter()
            .map(|&(src, tgt, weight)| self.edge_energy(src, tgt, weight))
            .collect();
        let total: f32 = edge_energies.iter().sum();
        (total, edge_energies)
    }
}
/// CPU attention forward pass (simplified, single head).
///
/// `queries`, `keys`, and `values` are row-major [seq_len, head_dim]; the
/// result is written into `output` with the same layout. Uses the
/// numerically stable max-subtracted softmax.
fn attention_forward_cpu(
    queries: &[f32],
    keys: &[f32],
    values: &[f32],
    seq_len: usize,
    head_dim: usize,
    output: &mut [f32],
) {
    let scale = 1.0 / (head_dim as f32).sqrt();
    for i in 0..seq_len {
        let q = &queries[i * head_dim..(i + 1) * head_dim];
        // Scaled dot-product score against every key, tracking the max.
        let mut scores = Vec::with_capacity(seq_len);
        let mut max_score = f32::NEG_INFINITY;
        for j in 0..seq_len {
            let k_row = &keys[j * head_dim..(j + 1) * head_dim];
            let dot: f32 = q.iter().zip(k_row).map(|(a, b)| a * b).sum();
            let s = dot * scale;
            if s > max_score {
                max_score = s;
            }
            scores.push(s);
        }
        // Softmax, shifted by the max for numerical stability.
        let mut sum_exp = 0.0f32;
        for s in scores.iter_mut() {
            *s = (*s - max_score).exp();
            sum_exp += *s;
        }
        for s in scores.iter_mut() {
            *s /= sum_exp;
        }
        // Output row i = sum_j scores[j] * values[j].
        for k in 0..head_dim {
            let mut acc = 0.0f32;
            for (j, &w) in scores.iter().enumerate() {
                acc += w * values[j * head_dim + k];
            }
            output[i * head_dim + k] = acc;
        }
    }
}
/// CPU batch routing (expert selection for MoE).
///
/// `token_embeddings` is `[num_tokens, embed_dim]` row-major and
/// `expert_weights` is `[num_experts, embed_dim]`. Experts are ranked per
/// token by dot-product score (descending, stable), and the indices of the
/// `top_k` best are returned together with the token index.
fn batch_routing_cpu(
    token_embeddings: &[f32],
    expert_weights: &[f32],
    num_tokens: usize,
    embed_dim: usize,
    num_experts: usize,
    top_k: usize,
) -> Vec<(usize, Vec<usize>)> {
    (0..num_tokens)
        .map(|t| {
            let token = &token_embeddings[t * embed_dim..(t + 1) * embed_dim];
            // Dot-product affinity of this token with every expert.
            let mut ranked: Vec<(usize, f32)> = (0..num_experts)
                .map(|e| {
                    let expert = &expert_weights[e * embed_dim..(e + 1) * embed_dim];
                    let score: f32 = token.iter().zip(expert).map(|(a, b)| a * b).sum();
                    (e, score)
                })
                .collect();
            // Descending by score; NaN comparisons fall back to Equal to keep
            // the sort total.
            ranked.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
            let chosen: Vec<usize> = ranked.iter().take(top_k).map(|&(e, _)| e).collect();
            (t, chosen)
        })
        .collect()
}
// ============================================================================
// GPU IMPLEMENTATIONS (SIMULATED WITHOUT ACTUAL GPU)
// When gpu feature is enabled, these would use actual GPU code
// ============================================================================
#[cfg(feature = "gpu")]
mod gpu_impl {
    //! GPU implementations using wgpu or similar
    //!
    //! These would contain actual GPU shader code and buffer management.
    //! For now, we simulate the overhead.
    use super::*;
    /// Simulated GPU energy computation
    /// In reality, this would:
    /// 1. Upload node states to GPU buffer
    /// 2. Execute compute shader for parallel residual computation
    /// 3. Reduce edge energies
    /// 4. Read back result
    pub fn compute_energy_gpu(graph: &CpuSheafGraph) -> f32 {
        // Simulate uploading every node state (host -> device, f32 = 4 bytes).
        let _upload_time = simulate_memory_transfer(
            graph.nodes.len() * graph.state_dim * 4, // bytes
            true,
        );
        // Actual computation would happen on GPU; fall back to the CPU path.
        let result = graph.compute_energy_cpu();
        // Simulate reading back the single f32 total (device -> host).
        let _download_time = simulate_memory_transfer(4, false);
        result
    }
    /// Simulated GPU attention forward pass
    pub fn attention_forward_gpu(
        queries: &[f32],
        keys: &[f32],
        values: &[f32],
        seq_len: usize,
        head_dim: usize,
        output: &mut [f32],
    ) {
        // Simulate upload of the Q/K/V buffers.
        let input_bytes = (queries.len() + keys.len() + values.len()) * 4;
        let _upload_time = simulate_memory_transfer(input_bytes, true);
        // CPU fallback
        attention_forward_cpu(queries, keys, values, seq_len, head_dim, output);
        // Simulate download of the output buffer.
        let _download_time = simulate_memory_transfer(output.len() * 4, false);
    }
    /// Simulated GPU batch routing
    pub fn batch_routing_gpu(
        token_embeddings: &[f32],
        expert_weights: &[f32],
        num_tokens: usize,
        embed_dim: usize,
        num_experts: usize,
        top_k: usize,
    ) -> Vec<(usize, Vec<usize>)> {
        // Simulate upload of the token and expert matrices.
        let input_bytes = (token_embeddings.len() + expert_weights.len()) * 4;
        let _upload_time = simulate_memory_transfer(input_bytes, true);
        // CPU fallback
        let result = batch_routing_cpu(
            token_embeddings,
            expert_weights,
            num_tokens,
            embed_dim,
            num_experts,
            top_k,
        );
        // Simulate download of the expert-index table.
        let result_bytes = num_tokens * top_k * 4;
        let _download_time = simulate_memory_transfer(result_bytes, false);
        result
    }
    /// Simulate memory transfer time; returns simulated nanoseconds.
    ///
    /// Models a fixed ~1us launch/latency overhead plus a bandwidth-limited
    /// copy at ~10 GB/s (PCIe 3.0 x16 ballpark) in either direction.
    ///
    /// Fix: the previous formula `bytes * 100 / 1_000_000_000` worked out to
    /// bytes / 10^7 ns (~10 PB/s), a factor of 10^6 faster than the stated
    /// 10 GB/s. At 10 GB/s the cost is 10 bytes per nanosecond.
    fn simulate_memory_transfer(bytes: usize, _host_to_device: bool) -> u64 {
        let base_overhead_ns = 1000; // 1 microsecond base overhead
        let transfer_ns = bytes as u64 / 10; // 10 bytes/ns == 10 GB/s
        base_overhead_ns + transfer_ns
    }
}
// Fallback for non-GPU builds
//
// Mirrors the public API of the `gpu`-feature module above so call sites
// compile unchanged; every function simply delegates to the CPU
// implementation with no simulated transfer overhead.
#[cfg(not(feature = "gpu"))]
mod gpu_impl {
    use super::*;
    /// CPU stand-in for the GPU energy computation.
    pub fn compute_energy_gpu(graph: &CpuSheafGraph) -> f32 {
        graph.compute_energy_cpu()
    }
    /// CPU stand-in for the GPU attention forward pass.
    pub fn attention_forward_gpu(
        queries: &[f32],
        keys: &[f32],
        values: &[f32],
        seq_len: usize,
        head_dim: usize,
        output: &mut [f32],
    ) {
        attention_forward_cpu(queries, keys, values, seq_len, head_dim, output);
    }
    /// CPU stand-in for the GPU batch routing.
    pub fn batch_routing_gpu(
        token_embeddings: &[f32],
        expert_weights: &[f32],
        num_tokens: usize,
        embed_dim: usize,
        num_experts: usize,
        top_k: usize,
    ) -> Vec<(usize, Vec<usize>)> {
        batch_routing_cpu(
            token_embeddings,
            expert_weights,
            num_tokens,
            embed_dim,
            num_experts,
            top_k,
        )
    }
}
// ============================================================================
// ENERGY COMPUTATION BENCHMARKS
// ============================================================================
/// Compare full-graph energy computation on CPU vs the (feature-gated) GPU path.
fn bench_energy_cpu_vs_gpu(c: &mut Criterion) {
    let mut group = c.benchmark_group("gpu_energy");
    // (node count, criterion sample count) — fewer samples for large graphs.
    for &(num_nodes, sample_size) in &[(1_000, 50), (10_000, 30), (100_000, 10)] {
        let graph = CpuSheafGraph::random(num_nodes, 4, 64, 42);
        group.sample_size(sample_size);
        group.throughput(Throughput::Elements(graph.edges.len() as u64));
        group.bench_with_input(BenchmarkId::new("cpu", num_nodes), &num_nodes, |b, _| {
            b.iter(|| black_box(graph.compute_energy_cpu()))
        });
        #[cfg(feature = "gpu")]
        group.bench_with_input(BenchmarkId::new("gpu", num_nodes), &num_nodes, |b, _| {
            b.iter(|| black_box(gpu_impl::compute_energy_gpu(&graph)))
        });
    }
    group.finish();
}
/// Benchmark energy computation with per-edge tracking
///
/// Per-edge energies are what hotspot detection consumes; this measures the
/// extra cost of materializing them alongside the total.
fn bench_energy_with_edges(c: &mut Criterion) {
    let mut group = c.benchmark_group("gpu_energy_with_edges");
    for &num_nodes in &[1_000, 10_000] {
        let g = CpuSheafGraph::random(num_nodes, 4, 64, 42);
        group.throughput(Throughput::Elements(g.edges.len() as u64));
        group.bench_with_input(BenchmarkId::new("cpu", num_nodes), &num_nodes, |b, _| {
            b.iter(|| black_box(g.compute_energy_with_edges_cpu()))
        });
        // A GPU variant would additionally return the per-edge energies.
    }
    group.finish();
}
// ============================================================================
// ATTENTION BENCHMARKS
// ============================================================================
/// Single-head attention forward pass, CPU vs (feature-gated) GPU.
fn bench_attention_cpu_vs_gpu(c: &mut Criterion) {
    let mut group = c.benchmark_group("gpu_attention");
    // (seq_len, head_dim, benchmark label); cost scales as seq_len^2.
    let configs = [(128, 64, "small"), (512, 64, "medium"), (2048, 64, "large")];
    for &(seq_len, head_dim, label) in &configs {
        let n = seq_len * head_dim;
        let q = generate_vec(n, 42);
        let k = generate_vec(n, 123);
        let v = generate_vec(n, 456);
        let mut out = vec![0.0f32; n];
        // Long sequences are quadratic-cost; sample them less often.
        group.sample_size(if seq_len > 1024 { 10 } else { 50 });
        group.throughput(Throughput::Elements((seq_len * seq_len) as u64));
        group.bench_with_input(BenchmarkId::new("cpu", label), &seq_len, |b, _| {
            b.iter(|| {
                attention_forward_cpu(
                    black_box(&q),
                    black_box(&k),
                    black_box(&v),
                    seq_len,
                    head_dim,
                    &mut out,
                );
                black_box(out[0])
            })
        });
        #[cfg(feature = "gpu")]
        group.bench_with_input(BenchmarkId::new("gpu", label), &seq_len, |b, _| {
            b.iter(|| {
                gpu_impl::attention_forward_gpu(
                    black_box(&q),
                    black_box(&k),
                    black_box(&v),
                    seq_len,
                    head_dim,
                    &mut out,
                );
                black_box(out[0])
            })
        });
    }
    group.finish();
}
/// Benchmark multi-head attention
///
/// The CPU path runs the heads back to back; a real GPU backend would batch
/// all heads into one dispatch.
fn bench_multihead_attention(c: &mut Criterion) {
    let mut group = c.benchmark_group("gpu_multihead_attention");
    let (seq_len, head_dim, num_heads) = (512, 64, 8);
    let per_head = seq_len * head_dim;
    let total = per_head * num_heads;
    let q = generate_vec(total, 42);
    let k = generate_vec(total, 123);
    let v = generate_vec(total, 456);
    let mut out = vec![0.0f32; total];
    group.sample_size(20);
    group.throughput(Throughput::Elements((seq_len * seq_len * num_heads) as u64));
    // CPU: sequential over heads.
    group.bench_function("cpu_sequential_heads", |b| {
        b.iter(|| {
            for head in 0..num_heads {
                let lo = head * per_head;
                let hi = lo + per_head;
                attention_forward_cpu(
                    &q[lo..hi],
                    &k[lo..hi],
                    &v[lo..hi],
                    seq_len,
                    head_dim,
                    &mut out[lo..hi],
                );
            }
            black_box(out[0])
        })
    });
    #[cfg(feature = "gpu")]
    group.bench_function("gpu_parallel_heads", |b| {
        b.iter(|| {
            // In reality, the GPU would process all heads in parallel.
            for head in 0..num_heads {
                let lo = head * per_head;
                let hi = lo + per_head;
                gpu_impl::attention_forward_gpu(
                    &q[lo..hi],
                    &k[lo..hi],
                    &v[lo..hi],
                    seq_len,
                    head_dim,
                    &mut out[lo..hi],
                );
            }
            black_box(out[0])
        })
    });
    group.finish();
}
// ============================================================================
// BATCH ROUTING BENCHMARKS (MoE)
// ============================================================================
/// MoE top-k routing throughput at transformer-scale embedding width.
fn bench_batch_routing_cpu_vs_gpu(c: &mut Criterion) {
    let mut group = c.benchmark_group("gpu_routing");
    let embed_dim = 768; // Typical transformer embedding
    let num_experts = 8;
    let top_k = 2;
    for &num_tokens in &[256, 1024, 4096] {
        let tokens = generate_vec(num_tokens * embed_dim, 42);
        let experts = generate_vec(num_experts * embed_dim, 123);
        group.sample_size(if num_tokens > 2048 { 20 } else { 50 });
        group.throughput(Throughput::Elements(num_tokens as u64));
        group.bench_with_input(BenchmarkId::new("cpu", num_tokens), &num_tokens, |b, _| {
            b.iter(|| {
                black_box(batch_routing_cpu(
                    black_box(&tokens),
                    black_box(&experts),
                    num_tokens,
                    embed_dim,
                    num_experts,
                    top_k,
                ))
            })
        });
        #[cfg(feature = "gpu")]
        group.bench_with_input(BenchmarkId::new("gpu", num_tokens), &num_tokens, |b, _| {
            b.iter(|| {
                black_box(gpu_impl::batch_routing_gpu(
                    black_box(&tokens),
                    black_box(&experts),
                    num_tokens,
                    embed_dim,
                    num_experts,
                    top_k,
                ))
            })
        });
    }
    group.finish();
}
// ============================================================================
// MEMORY TRANSFER BENCHMARKS
// ============================================================================
/// CPU-side memory access baseline across the transfer sizes a GPU copy
/// would incur; shows where GPU transfer cost would amortize.
fn bench_memory_transfer_overhead(c: &mut Criterion) {
    let mut group = c.benchmark_group("gpu_memory_transfer");
    // Simulated transfer sizes from latency-bound to bandwidth-bound.
    for &size_kb in &[1, 4, 16, 64, 256, 1024, 4096] {
        let floats = size_kb * 1024 / 4; // f32 = 4 bytes
        let data = generate_vec(floats, 42);
        group.throughput(Throughput::Bytes((size_kb * 1024) as u64));
        // Baseline: touching the same bytes purely on the CPU.
        group.bench_with_input(
            BenchmarkId::new("cpu_access", format!("{}KB", size_kb)),
            &size_kb,
            |b, _| {
                b.iter(|| {
                    let total: f32 = data.iter().sum();
                    black_box(total)
                })
            },
        );
        // A GPU run would add H2D/D2H copy overhead on top of this.
    }
    group.finish();
}
// ============================================================================
// CROSSOVER POINT BENCHMARKS
// ============================================================================
/// Find the problem size where GPU becomes faster than CPU
///
/// Naive dense matmul (O(n^3)) is the classic GPU workload: small sizes are
/// dominated by launch/transfer overhead, large ones by raw FLOPs.
fn bench_gpu_crossover(c: &mut Criterion) {
    let mut group = c.benchmark_group("gpu_crossover");
    for &size in &[32, 64, 128, 256, 512, 1024] {
        let lhs = generate_matrix(size, size, 42);
        let rhs = generate_matrix(size, size, 123);
        let mut out = vec![0.0f32; size * size];
        group.throughput(Throughput::Elements((size * size * size) as u64)); // O(n^3)
        group.sample_size(if size > 512 { 10 } else { 50 });
        // CPU matrix multiply (naive triple loop).
        group.bench_with_input(BenchmarkId::new("cpu_matmul", size), &size, |b, _| {
            b.iter(|| {
                for row in 0..size {
                    for col in 0..size {
                        let mut acc = 0.0f32;
                        for inner in 0..size {
                            acc += lhs[row * size + inner] * rhs[inner * size + col];
                        }
                        out[row * size + col] = acc;
                    }
                }
                black_box(out[0])
            })
        });
        // A GPU would typically win from around size 256.
    }
    group.finish();
}
// ============================================================================
// COHERENCE-SPECIFIC GPU PATTERNS
// ============================================================================
/// Benchmark parallel residual computation pattern
///
/// One residual per edge is the unit of work a GPU kernel would assign to
/// each work item; this measures the sequential CPU baseline.
fn bench_parallel_residual(c: &mut Criterion) {
    let mut group = c.benchmark_group("gpu_parallel_residual");
    let state_dim = 64;
    for &num_edges in &[1_000, 10_000, 100_000] {
        // Edge endpoint states laid out as the GPU would consume them.
        let lhs: Vec<Vec<f32>> = (0..num_edges)
            .map(|i| generate_vec(state_dim, i as u64))
            .collect();
        let rhs: Vec<Vec<f32>> = (0..num_edges)
            .map(|i| generate_vec(state_dim, i as u64 + 1000000))
            .collect();
        group.sample_size(if num_edges > 50000 { 10 } else { 50 });
        group.throughput(Throughput::Elements(num_edges as u64));
        // CPU sequential baseline over all residuals.
        group.bench_with_input(
            BenchmarkId::new("cpu_sequential", num_edges),
            &num_edges,
            |b, _| {
                b.iter(|| {
                    let mut acc = 0.0f32;
                    for (a, z) in lhs.iter().zip(rhs.iter()) {
                        let mut norm_sq = 0.0f32;
                        for d in 0..state_dim {
                            let diff = a[d] - z[d];
                            norm_sq += diff * diff;
                        }
                        acc += norm_sq;
                    }
                    black_box(acc)
                })
            },
        );
        // A GPU dispatch would compute every residual concurrently.
    }
    group.finish();
}
/// Benchmark reduction patterns (sum of energies)
fn bench_gpu_reduction(c: &mut Criterion) {
    let mut group = c.benchmark_group("gpu_reduction");
    for &size in &[1_000, 10_000, 100_000, 1_000_000] {
        let data = generate_vec(size, 42);
        group.sample_size(if size > 100000 { 10 } else { 50 });
        group.throughput(Throughput::Elements(size as u64));
        // Plain sequential sum.
        group.bench_with_input(BenchmarkId::new("cpu_sum", size), &size, |b, _| {
            b.iter(|| {
                let total: f32 = data.iter().sum();
                black_box(total)
            })
        });
        // Chunked partial sums mimic the first stage of a parallel reduction.
        group.bench_with_input(BenchmarkId::new("cpu_parallel", size), &size, |b, _| {
            b.iter(|| {
                let partials: Vec<f32> = data.chunks(1024).map(|c| c.iter().sum()).collect();
                let total: f32 = partials.iter().sum();
                black_box(total)
            })
        });
        // A GPU would instead use a tree-based parallel reduction.
    }
    group.finish();
}
// ============================================================================
// CRITERION CONFIGURATION
// ============================================================================
// Harness registration: each `criterion_group!` bundles related benchmarks,
// and `criterion_main!` generates the binary entry point that runs them all.
criterion_group!(
    energy_benches,
    bench_energy_cpu_vs_gpu,
    bench_energy_with_edges,
);
criterion_group!(
    attention_benches,
    bench_attention_cpu_vs_gpu,
    bench_multihead_attention,
);
criterion_group!(routing_benches, bench_batch_routing_cpu_vs_gpu,);
criterion_group!(
    transfer_benches,
    bench_memory_transfer_overhead,
    bench_gpu_crossover,
);
criterion_group!(
    coherence_gpu_benches,
    bench_parallel_residual,
    bench_gpu_reduction,
);
criterion_main!(
    energy_benches,
    attention_benches,
    routing_benches,
    transfer_benches,
    coherence_gpu_benches
);

View File

@@ -0,0 +1,488 @@
//! Benchmarks for Poincare distance computation
//!
//! ADR-014 Performance Target: < 500ns per Poincare distance
//!
//! Hyperbolic geometry enables hierarchy-aware coherence where
//! deeper nodes (further from origin) have different energy weights.
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
// ============================================================================
// Hyperbolic Geometry Functions
// ============================================================================
/// Compute squared Euclidean norm
#[inline]
fn squared_norm(x: &[f32]) -> f32 {
    x.iter().fold(0.0, |acc, v| acc + v * v)
}
/// Compute Euclidean norm
#[inline]
fn norm(x: &[f32]) -> f32 {
    // Sum of squares inlined (identical accumulation order to squared_norm).
    let sq: f32 = x.iter().map(|v| v * v).sum();
    sq.sqrt()
}
/// Compute squared Euclidean distance
#[inline]
fn squared_distance(x: &[f32], y: &[f32]) -> f32 {
    let mut acc = 0.0f32;
    for (a, b) in x.iter().zip(y) {
        let d = a - b;
        acc += d * d;
    }
    acc
}
/// Poincare distance in the Poincare ball model
///
/// d(x, y) = arcosh(1 + 2 * ||x - y||^2 / ((1 - ||x||^2) * (1 - ||y||^2)))
/// scaled by 1/sqrt(-curvature); `curvature` is expected to be negative.
///
/// where arcosh(z) = ln(z + sqrt(z^2 - 1))
#[inline]
pub fn poincare_distance(x: &[f32], y: &[f32], curvature: f32) -> f32 {
    // Each denominator factor is clamped away from zero so points numerically
    // on the ball boundary do not produce inf/NaN.
    let fx = (1.0 - squared_norm(x)).max(1e-10);
    let fy = (1.0 - squared_norm(y)).max(1e-10);
    let arg = 1.0 + 2.0 * squared_distance(x, y) / (fx * fy);
    // arcosh(arg); max(0.0) guards rounding slightly below 1.
    let arcosh = (arg + (arg * arg - 1.0).max(0.0).sqrt()).ln();
    arcosh / (-curvature).sqrt()
}
/// Optimized Poincare distance with fused operations
///
/// A single pass over both inputs accumulates ||x||^2, ||y||^2 and
/// ||x - y||^2 together before applying the arcosh formula.
#[inline]
pub fn poincare_distance_optimized(x: &[f32], y: &[f32], curvature: f32) -> f32 {
    let (mut nx, mut ny, mut dd) = (0.0f32, 0.0f32, 0.0f32);
    for i in 0..x.len() {
        let (a, b) = (x[i], y[i]);
        nx += a * a;
        ny += b * b;
        let d = a - b;
        dd += d * d;
    }
    let denom = (1.0 - nx).max(1e-10) * (1.0 - ny).max(1e-10);
    let arg = 1.0 + 2.0 * dd / denom;
    let arcosh = (arg + (arg * arg - 1.0).max(0.0).sqrt()).ln();
    arcosh / (-curvature).sqrt()
}
/// SIMD-friendly Poincare distance (chunked)
///
/// Accumulates into four independent lanes so the compiler can
/// auto-vectorize; tail elements (fewer than 4) all fold into lane 0,
/// preserving the exact accumulation order of the reference layout.
#[inline]
pub fn poincare_distance_simd_friendly(x: &[f32], y: &[f32], curvature: f32) -> f32 {
    let mut nx = [0.0f32; 4];
    let mut ny = [0.0f32; 4];
    let mut dd = [0.0f32; 4];
    let full_chunks = x.len() / 4;
    for chunk in 0..full_chunks {
        let base = chunk * 4;
        for lane in 0..4 {
            let a = x[base + lane];
            let b = y[base + lane];
            nx[lane] += a * a;
            ny[lane] += b * b;
            let d = a - b;
            dd[lane] += d * d;
        }
    }
    // Remainder: everything past the last full chunk goes into lane 0.
    for i in full_chunks * 4..x.len() {
        let a = x[i];
        let b = y[i];
        nx[0] += a * a;
        ny[0] += b * b;
        let d = a - b;
        dd[0] += d * d;
    }
    // Reduce the lanes, then finish with the shared arcosh formula.
    let sum_nx: f32 = nx.iter().sum();
    let sum_ny: f32 = ny.iter().sum();
    let sum_dd: f32 = dd.iter().sum();
    let denom = (1.0 - sum_nx).max(1e-10) * (1.0 - sum_ny).max(1e-10);
    let arg = 1.0 + 2.0 * sum_dd / denom;
    let arcosh = (arg + (arg * arg - 1.0).max(0.0).sqrt()).ln();
    arcosh / (-curvature).sqrt()
}
/// Mobius addition in the Poincare ball
///
/// x + y = ((1 + 2c<x,y> + c||y||^2)x + (1 - c||x||^2)y)
///         / (1 + 2c<x,y> + c^2||x||^2||y||^2), with c = -curvature.
pub fn mobius_add(x: &[f32], y: &[f32], curvature: f32) -> Vec<f32> {
    let c = -curvature;
    // Inner products inlined for self-containment (same accumulation order).
    let nx: f32 = x.iter().map(|v| v * v).sum();
    let ny: f32 = y.iter().map(|v| v * v).sum();
    let dot: f32 = x.iter().zip(y.iter()).map(|(a, b)| a * b).sum();
    let coeff_x = 1.0 + 2.0 * c * dot + c * ny;
    let coeff_y = 1.0 - c * nx;
    let denom = 1.0 + 2.0 * c * dot + c * c * nx * ny;
    x.iter()
        .zip(y.iter())
        .map(|(xi, yi)| (coeff_x * xi + coeff_y * yi) / denom)
        .collect()
}
/// Exponential map at point p with tangent vector v
///
/// Maps the tangent vector `v` (in the tangent space at `p`) onto the
/// Poincare ball; `curvature` is expected to be negative. A (near-)zero
/// tangent returns `p` unchanged, avoiding division by ~0 below.
pub fn exp_map(v: &[f32], p: &[f32], curvature: f32) -> Vec<f32> {
    let c = -curvature;
    let v_norm = norm(v);
    if v_norm < 1e-10 {
        return p.to_vec();
    }
    // Conformal factor lambda_p = 2 / (1 - c||p||^2), clamped for stability
    // when p is numerically on the ball boundary.
    let lambda_p = 2.0 / (1.0 - c * squared_norm(p)).max(1e-10);
    let t = (c.sqrt() * lambda_p * v_norm / 2.0).tanh();
    let factor = t / (c.sqrt() * v_norm);
    // Scale v to the geodesic step length, then translate via Mobius addition.
    let v_scaled: Vec<f32> = v.iter().map(|vi| factor * vi).collect();
    mobius_add(p, &v_scaled, curvature)
}
/// Logarithmic map from point p to point q
///
/// Returns the tangent vector at `p` pointing toward `q` (the inverse
/// direction of `exp_map`); coincident points (within ~1e-10) yield the
/// zero vector. `curvature` is expected to be negative.
pub fn log_map(q: &[f32], p: &[f32], curvature: f32) -> Vec<f32> {
    let c = -curvature;
    // Compute -p + q: Mobius-translate q so that p sits at the origin.
    let neg_p: Vec<f32> = p.iter().map(|x| -x).collect();
    let diff = mobius_add(&neg_p, q, curvature);
    let diff_norm = norm(&diff);
    if diff_norm < 1e-10 {
        return vec![0.0; p.len()];
    }
    // Conformal factor at p, clamped to avoid division by ~0 near the boundary.
    let lambda_p = 2.0 / (1.0 - c * squared_norm(p)).max(1e-10);
    let factor = 2.0 / (c.sqrt() * lambda_p) * (c.sqrt() * diff_norm).atanh() / diff_norm;
    diff.iter().map(|d| factor * d).collect()
}
/// Project vector to Poincare ball (ensure ||x|| < 1/sqrt(c))
///
/// Points already inside the (slightly shrunken) ball are copied unchanged;
/// anything on or outside the limit is radially rescaled onto it.
pub fn project_to_ball(x: &[f32], curvature: f32) -> Vec<f32> {
    // Largest admissible norm, pulled a hair inside the open ball.
    let limit = 1.0 / (-curvature).sqrt() - 1e-5;
    let magnitude = norm(x);
    if magnitude < limit {
        x.to_vec()
    } else {
        let shrink = limit / magnitude;
        x.iter().map(|v| v * shrink).collect()
    }
}
/// Compute depth (distance from origin) in Poincare ball
///
/// Equivalent to `poincare_distance(x, &zeros, curvature)` but evaluated in
/// closed form: with y = 0 the distance reduces to
/// `arcosh(1 + 2||x||^2 / (1 - ||x||^2)) / sqrt(-curvature)`.
/// This avoids allocating a zero vector on every call — depth sits inside
/// per-edge energy loops with a < 500ns ADR-014 budget.
#[inline]
pub fn poincare_depth(x: &[f32], curvature: f32) -> f32 {
    let sq_norm: f32 = x.iter().map(|v| v * v).sum();
    // Same boundary clamping as poincare_distance (the y-factor is exactly 1).
    let denom = (1.0 - sq_norm).max(1e-10);
    let arg = 1.0 + 2.0 * sq_norm / denom;
    let arcosh = (arg + (arg * arg - 1.0).max(0.0).sqrt()).ln();
    arcosh / (-curvature).sqrt()
}
// ============================================================================
// Test Data Generation
// ============================================================================
/// Deterministic pseudo-random point inside the Poincare ball.
///
/// Components are hash-derived values in [-0.5, 0.5), then rescaled so the
/// point's Euclidean norm is 90% of `max_norm`; a degenerate all-zero draw
/// is returned unscaled.
fn generate_point(dim: usize, seed: u64, max_norm: f32) -> Vec<f32> {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};
    let raw: Vec<f32> = (0..dim)
        .map(|component| {
            let mut hasher = DefaultHasher::new();
            (seed, component).hash(&mut hasher);
            (hasher.finish() % 1000) as f32 / 1000.0 - 0.5
        })
        .collect();
    // Norm inlined so the generator is self-contained.
    let magnitude = raw.iter().map(|v| v * v).sum::<f32>().sqrt();
    if magnitude > 0.0 {
        let rescale = max_norm / magnitude * 0.9; // 90% of max
        raw.iter().map(|v| v * rescale).collect()
    } else {
        raw
    }
}
// ============================================================================
// Benchmarks
// ============================================================================
/// Benchmark Poincare distance at various dimensions
///
/// Compares the three implementations (naive, fused, lane-chunked) across
/// embedding sizes; ADR-014 targets < 500ns per distance.
fn bench_poincare_distance(c: &mut Criterion) {
    let mut group = c.benchmark_group("hyperbolic_poincare_distance");
    group.throughput(Throughput::Elements(1));
    let curvature = -1.0;
    for &dim in &[8, 32, 64, 128, 256, 512] {
        let p = generate_point(dim, 42, 0.9);
        let q = generate_point(dim, 123, 0.9);
        group.bench_with_input(BenchmarkId::new("standard", dim), &dim, |b, _| {
            b.iter(|| poincare_distance(black_box(&p), black_box(&q), black_box(curvature)))
        });
        group.bench_with_input(BenchmarkId::new("optimized", dim), &dim, |b, _| {
            b.iter(|| {
                poincare_distance_optimized(black_box(&p), black_box(&q), black_box(curvature))
            })
        });
        group.bench_with_input(BenchmarkId::new("simd_friendly", dim), &dim, |b, _| {
            b.iter(|| {
                poincare_distance_simd_friendly(black_box(&p), black_box(&q), black_box(curvature))
            })
        });
    }
    group.finish();
}
/// Benchmark Mobius addition
fn bench_mobius_add(c: &mut Criterion) {
    let mut group = c.benchmark_group("hyperbolic_mobius_add");
    group.throughput(Throughput::Elements(1));
    let curvature = -1.0;
    for &dim in &[8, 32, 64, 128] {
        let p = generate_point(dim, 42, 0.5);
        let q = generate_point(dim, 123, 0.5);
        group.bench_with_input(BenchmarkId::new("dim", dim), &dim, |b, _| {
            b.iter(|| mobius_add(black_box(&p), black_box(&q), black_box(curvature)))
        });
    }
    group.finish();
}
/// Benchmark exp/log maps
fn bench_exp_log_map(c: &mut Criterion) {
    let mut group = c.benchmark_group("hyperbolic_exp_log");
    let dim = 32;
    let curvature = -1.0;
    let base = generate_point(dim, 42, 0.3);
    // Small deterministic tangent vector at `base`.
    let tangent: Vec<f32> = (0..dim).map(|i| ((i as f32 * 0.1).sin() * 0.2)).collect();
    let destination = generate_point(dim, 123, 0.4);
    group.bench_function("exp_map", |b| {
        b.iter(|| exp_map(black_box(&tangent), black_box(&base), black_box(curvature)))
    });
    group.bench_function("log_map", |b| {
        b.iter(|| log_map(black_box(&destination), black_box(&base), black_box(curvature)))
    });
    group.finish();
}
/// Benchmark projection to ball
fn bench_projection(c: &mut Criterion) {
    let mut group = c.benchmark_group("hyperbolic_projection");
    group.throughput(Throughput::Elements(1));
    let curvature = -1.0;
    for &dim in &[8, 32, 64, 128, 256] {
        // Sinusoidal point intended to land outside the ball so the
        // rescaling branch does real work.
        let outside: Vec<f32> = (0..dim).map(|i| ((i as f32 * 0.1).sin())).collect();
        group.bench_with_input(BenchmarkId::new("project", dim), &dim, |b, _| {
            b.iter(|| project_to_ball(black_box(&outside), black_box(curvature)))
        });
    }
    group.finish();
}
/// Benchmark depth computation
fn bench_depth(c: &mut Criterion) {
    let mut group = c.benchmark_group("hyperbolic_depth");
    group.throughput(Throughput::Elements(1));
    let curvature = -1.0;
    for &dim in &[8, 32, 64, 128, 256] {
        let p = generate_point(dim, 42, 0.9);
        group.bench_with_input(BenchmarkId::new("depth", dim), &dim, |b, _| {
            b.iter(|| poincare_depth(black_box(&p), black_box(curvature)))
        });
    }
    group.finish();
}
/// Benchmark batch distance computation
fn bench_batch_distance(c: &mut Criterion) {
    let mut group = c.benchmark_group("hyperbolic_batch_distance");
    let dim = 64;
    let curvature = -1.0;
    for &batch_size in &[10, 100, 1000] {
        let corpus: Vec<Vec<f32>> = (0..batch_size)
            .map(|i| generate_point(dim, i as u64, 0.9))
            .collect();
        let query = generate_point(dim, 999, 0.9);
        group.throughput(Throughput::Elements(batch_size as u64));
        group.bench_with_input(
            BenchmarkId::new("batch", batch_size),
            &batch_size,
            |b, _| {
                b.iter(|| {
                    // Query-to-corpus distances, materialized per iteration.
                    let dists: Vec<f32> = corpus
                        .iter()
                        .map(|p| poincare_distance(&query, p, curvature))
                        .collect();
                    black_box(dists)
                })
            },
        );
    }
    group.finish();
}
/// Benchmark k-nearest in hyperbolic space
///
/// Scores the query against the full corpus, then selects the k smallest
/// distances. Fix: the previous code claimed "partial sort" but fully
/// sorted all 1000 candidates (O(n log n)); `select_nth_unstable_by` is
/// O(n) average and only the k survivors are sorted afterwards, so the
/// returned `(index, distance)` pairs are unchanged for distinct distances.
fn bench_knn_hyperbolic(c: &mut Criterion) {
    let mut group = c.benchmark_group("hyperbolic_knn");
    group.sample_size(50);
    let dim = 64;
    let curvature = -1.0;
    let points: Vec<Vec<f32>> = (0..1000)
        .map(|i| generate_point(dim, i as u64, 0.9))
        .collect();
    let query = generate_point(dim, 999, 0.9);
    for k in [1, 5, 10, 50] {
        group.bench_with_input(BenchmarkId::new("k", k), &k, |b, &k| {
            b.iter(|| {
                // Distance from the query to every corpus point.
                let mut distances: Vec<(usize, f32)> = points
                    .iter()
                    .enumerate()
                    .map(|(i, p)| (i, poincare_distance(&query, p, curvature)))
                    .collect();
                // O(n) selection of the k nearest, then sort just those k.
                distances.select_nth_unstable_by(k - 1, |a, b| a.1.partial_cmp(&b.1).unwrap());
                let nearest = &mut distances[..k];
                nearest.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
                let result: Vec<(usize, f32)> = nearest.to_vec();
                black_box(result)
            })
        });
    }
    group.finish();
}
/// Benchmark hierarchy-weighted energy computation
///
/// Edge energies along a node chain are weighted by 1 + max(0, ln(avg
/// depth)), so deeper (near-boundary) regions contribute more; this
/// compares the cost near the origin vs near the boundary.
fn bench_hierarchy_weighted_energy(c: &mut Criterion) {
    let mut group = c.benchmark_group("hyperbolic_hierarchy_energy");
    let dim = 64;
    let curvature = -1.0;
    // Shallow chain near the origin, deep chain near the boundary.
    let shallow_nodes: Vec<Vec<f32>> = (0..100)
        .map(|i| generate_point(dim, i as u64, 0.3))
        .collect();
    let deep_nodes: Vec<Vec<f32>> = (0..100)
        .map(|i| generate_point(dim, (i + 100) as u64, 0.9))
        .collect();
    // Depth-weighted energy of consecutive pairs along the chain.
    let chain_energy = |nodes: &[Vec<f32>]| -> f32 {
        let mut total = 0.0f32;
        for pair in nodes.windows(2) {
            let depth_a = poincare_depth(&pair[0], curvature);
            let depth_b = poincare_depth(&pair[1], curvature);
            let avg_depth = (depth_a + depth_b) / 2.0;
            let weight = 1.0 + avg_depth.ln().max(0.0);
            let dist = poincare_distance(&pair[0], &pair[1], curvature);
            total += weight * dist * dist;
        }
        total
    };
    group.bench_function("shallow_energy", |b| {
        b.iter(|| black_box(chain_energy(&shallow_nodes)))
    });
    group.bench_function("deep_energy", |b| {
        b.iter(|| black_box(chain_energy(&deep_nodes)))
    });
    group.finish();
}
/// Benchmark curvature impact
///
/// In `poincare_distance` the curvature only enters as a final scalar
/// divide, so timings should be essentially flat across values.
fn bench_curvature_impact(c: &mut Criterion) {
    let mut group = c.benchmark_group("hyperbolic_curvature");
    let dim = 64;
    let p = generate_point(dim, 42, 0.5);
    let q = generate_point(dim, 123, 0.5);
    for &curvature in &[-0.1, -0.5, -1.0, -2.0, -5.0] {
        group.bench_with_input(
            BenchmarkId::new("curvature", format!("{:.1}", curvature)),
            &curvature,
            |b, &c| b.iter(|| poincare_distance(black_box(&p), black_box(&q), black_box(c))),
        );
    }
    group.finish();
}
// Single group covering all hyperbolic-geometry benchmarks; criterion_main!
// generates the harness entry point.
criterion_group!(
    benches,
    bench_poincare_distance,
    bench_mobius_add,
    bench_exp_log_map,
    bench_projection,
    bench_depth,
    bench_batch_distance,
    bench_knn_hyperbolic,
    bench_hierarchy_weighted_energy,
    bench_curvature_impact,
);
criterion_main!(benches);

View File

@@ -0,0 +1,608 @@
//! Benchmarks for incremental coherence updates
//!
//! ADR-014 Performance Target: < 100us for single node update
//!
//! Incremental computation recomputes only affected edges when
//! a single node changes, avoiding full graph recomputation.
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
use std::collections::{HashMap, HashSet};
// ============================================================================
// Types (Simulated for benchmarking)
// ============================================================================
/// Dense affine restriction map: `output = matrix * input + bias`,
/// with `matrix` stored row-major as `[output_dim, input_dim]`.
#[derive(Clone)]
pub struct RestrictionMap {
    pub matrix: Vec<f32>,
    pub bias: Vec<f32>,
    pub input_dim: usize,
    pub output_dim: usize,
}
impl RestrictionMap {
    /// Identity map on a `dim`-dimensional space (zero bias).
    pub fn identity(dim: usize) -> Self {
        let mut matrix = vec![0.0f32; dim * dim];
        for d in 0..dim {
            matrix[d * dim + d] = 1.0;
        }
        Self {
            matrix,
            bias: vec![0.0; dim],
            input_dim: dim,
            output_dim: dim,
        }
    }
    /// Apply the map into a caller-provided buffer (no allocation).
    #[inline]
    pub fn apply_into(&self, input: &[f32], output: &mut [f32]) {
        // Start from the bias, then accumulate each matrix row.
        output.copy_from_slice(&self.bias);
        for row in 0..self.output_dim {
            let coeffs = &self.matrix[row * self.input_dim..(row + 1) * self.input_dim];
            for (w, v) in coeffs.iter().zip(&input[..self.input_dim]) {
                output[row] += w * v;
            }
        }
    }
}
/// A sheaf graph node: an identifier plus its local state vector.
#[derive(Clone)]
pub struct SheafNode {
    // Unique node identifier (used as the key in the graph's node map).
    pub id: u64,
    // Node state; expected to have the graph's `state_dim` length — the
    // energy routines size their scratch buffers to `state_dim` (not enforced here).
    pub state: Vec<f32>,
}
/// A weighted sheaf edge with a restriction map for each endpoint.
#[derive(Clone)]
pub struct SheafEdge {
    pub id: u64,
    pub source: u64,
    pub target: u64,
    pub weight: f32,
    pub rho_source: RestrictionMap,
    pub rho_target: RestrictionMap,
}
impl SheafEdge {
    /// Weighted residual energy w * ||rho_s(source) - rho_t(target)||^2.
    ///
    /// The restricted states are written into the caller's scratch buffers
    /// so repeated calls do not allocate.
    #[inline]
    pub fn weighted_residual_energy_into(
        &self,
        source: &[f32],
        target: &[f32],
        source_buf: &mut [f32],
        target_buf: &mut [f32],
    ) -> f32 {
        self.rho_source.apply_into(source, source_buf);
        self.rho_target.apply_into(target, target_buf);
        let mut residual_sq = 0.0f32;
        for i in 0..source_buf.len() {
            let delta = source_buf[i] - target_buf[i];
            residual_sq += delta * delta;
        }
        self.weight * residual_sq
    }
}
/// Incremental coherence tracker
///
/// Caches per-edge energies and their total so that a single-node update
/// only needs to recompute the node's incident edges (ADR-014 target:
/// < 100us per node update) instead of the whole graph.
pub struct IncrementalCoherence {
    pub nodes: HashMap<u64, SheafNode>,
    pub edges: Vec<SheafEdge>,
    pub state_dim: usize,
    /// Node -> incident edge indices
    pub node_to_edges: HashMap<u64, Vec<usize>>,
    /// Cached per-edge energies (index-aligned with `edges`)
    pub edge_energies: Vec<f32>,
    /// Cached total energy
    pub total_energy: f32,
    /// Fingerprint for staleness detection
    pub fingerprint: u64,
}
impl IncrementalCoherence {
pub fn new(nodes: HashMap<u64, SheafNode>, edges: Vec<SheafEdge>, state_dim: usize) -> Self {
// Build node-to-edge index
let mut node_to_edges: HashMap<u64, Vec<usize>> = HashMap::new();
for (idx, edge) in edges.iter().enumerate() {
node_to_edges.entry(edge.source).or_default().push(idx);
node_to_edges.entry(edge.target).or_default().push(idx);
}
let mut tracker = Self {
nodes,
edges,
state_dim,
node_to_edges,
edge_energies: Vec::new(),
total_energy: 0.0,
fingerprint: 0,
};
tracker.full_recompute();
tracker
}
/// Full recomputation (initial or when needed)
pub fn full_recompute(&mut self) {
let mut source_buf = vec![0.0f32; self.state_dim];
let mut target_buf = vec![0.0f32; self.state_dim];
self.edge_energies = self
.edges
.iter()
.map(|edge| {
let source_state = &self.nodes[&edge.source].state;
let target_state = &self.nodes[&edge.target].state;
edge.weighted_residual_energy_into(
source_state,
target_state,
&mut source_buf,
&mut target_buf,
)
})
.collect();
self.total_energy = self.edge_energies.iter().sum();
self.update_fingerprint();
}
/// Update single node and recompute affected edges only
pub fn update_node(&mut self, node_id: u64, new_state: Vec<f32>) {
// Update node state
if let Some(node) = self.nodes.get_mut(&node_id) {
node.state = new_state;
} else {
return;
}
// Get affected edges
let affected_edges = match self.node_to_edges.get(&node_id) {
Some(edges) => edges.clone(),
None => return,
};
// Recompute only affected edges
let mut source_buf = vec![0.0f32; self.state_dim];
let mut target_buf = vec![0.0f32; self.state_dim];
let mut energy_delta = 0.0f32;
for &edge_idx in &affected_edges {
let edge = &self.edges[edge_idx];
let source_state = &self.nodes[&edge.source].state;
let target_state = &self.nodes[&edge.target].state;
let old_energy = self.edge_energies[edge_idx];
let new_energy = edge.weighted_residual_energy_into(
source_state,
target_state,
&mut source_buf,
&mut target_buf,
);
energy_delta += new_energy - old_energy;
self.edge_energies[edge_idx] = new_energy;
}
self.total_energy += energy_delta;
self.update_fingerprint();
}
/// Update multiple nodes in batch
pub fn update_nodes_batch(&mut self, updates: Vec<(u64, Vec<f32>)>) {
// Collect all affected edges
let mut affected_edges: HashSet<usize> = HashSet::new();
for (node_id, new_state) in updates {
if let Some(node) = self.nodes.get_mut(&node_id) {
node.state = new_state;
}
if let Some(edges) = self.node_to_edges.get(&node_id) {
affected_edges.extend(edges.iter());
}
}
// Recompute affected edges
let mut source_buf = vec![0.0f32; self.state_dim];
let mut target_buf = vec![0.0f32; self.state_dim];
let mut energy_delta = 0.0f32;
for edge_idx in affected_edges {
let edge = &self.edges[edge_idx];
let source_state = &self.nodes[&edge.source].state;
let target_state = &self.nodes[&edge.target].state;
let old_energy = self.edge_energies[edge_idx];
let new_energy = edge.weighted_residual_energy_into(
source_state,
target_state,
&mut source_buf,
&mut target_buf,
);
energy_delta += new_energy - old_energy;
self.edge_energies[edge_idx] = new_energy;
}
self.total_energy += energy_delta;
self.update_fingerprint();
}
fn update_fingerprint(&mut self) {
self.fingerprint = self.fingerprint.wrapping_add(1);
}
/// Get current total energy
pub fn energy(&self) -> f32 {
self.total_energy
}
/// Get energy for specific edge
pub fn edge_energy(&self, edge_idx: usize) -> f32 {
self.edge_energies[edge_idx]
}
/// Check if cache is stale (fingerprint changed)
pub fn is_stale(&self, last_fingerprint: u64) -> bool {
self.fingerprint != last_fingerprint
}
}
// ============================================================================
// Test Data Generation
// ============================================================================
/// Deterministic pseudo-random state vector with entries in [-0.5, 0.5),
/// reproducible for a given `seed`.
fn generate_state(dim: usize, seed: u64) -> Vec<f32> {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};
    let mut state = Vec::with_capacity(dim);
    for idx in 0..dim {
        let mut hasher = DefaultHasher::new();
        (seed, idx).hash(&mut hasher);
        let bucket = hasher.finish() % 1000;
        state.push(bucket as f32 / 1000.0 - 0.5);
    }
    state
}
/// Build a deterministic pseudo-random sheaf graph with roughly `avg_degree`
/// incident edges per node (hash-derived endpoints; self-loops are dropped,
/// so the final edge count may be slightly lower).
fn create_random_graph(
    num_nodes: usize,
    avg_degree: usize,
    state_dim: usize,
) -> IncrementalCoherence {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};
    // One node per id, each with a seed-derived state vector.
    let mut nodes: HashMap<u64, SheafNode> = HashMap::with_capacity(num_nodes);
    for id in 0..num_nodes as u64 {
        nodes.insert(
            id,
            SheafNode {
                id,
                state: generate_state(state_dim, id),
            },
        );
    }
    // Hash-derived endpoint in [0, num_nodes), keyed by edge index and tag.
    let endpoint = |i: usize, tag: &str| -> u64 {
        let mut hasher = DefaultHasher::new();
        (42u64, i, tag).hash(&mut hasher);
        hasher.finish() % num_nodes as u64
    };
    let num_edges = (num_nodes * avg_degree) / 2;
    let mut edges = Vec::with_capacity(num_edges);
    for i in 0..num_edges {
        let source = endpoint(i, "src");
        let target = endpoint(i, "tgt");
        if source == target {
            continue;
        }
        edges.push(SheafEdge {
            id: i as u64,
            source,
            target,
            weight: 1.0,
            rho_source: RestrictionMap::identity(state_dim),
            rho_target: RestrictionMap::identity(state_dim),
        });
    }
    IncrementalCoherence::new(nodes, edges, state_dim)
}
// ============================================================================
// Benchmarks
// ============================================================================
/// Benchmark single node update at various graph sizes
///
/// Measures one incremental `update_node` (plus state generation) on graphs
/// of 100 / 1K / 10K nodes with average degree 4.
fn bench_single_node_update(c: &mut Criterion) {
    let mut group = c.benchmark_group("incremental_single_node");
    group.throughput(Throughput::Elements(1));
    // ADR-014 target: <100us for single node update
    for num_nodes in [100, 1_000, 10_000] {
        let state_dim = 64;
        let avg_degree = 4;
        let mut tracker = create_random_graph(num_nodes, avg_degree, state_dim);
        group.bench_with_input(
            BenchmarkId::new("update", format!("{}nodes", num_nodes)),
            &num_nodes,
            |b, _| {
                let node_id = (num_nodes / 2) as u64; // Update middle node
                b.iter(|| {
                    // State generation runs inside the timed closure, so its
                    // hashing cost is included in the measurement.
                    let new_state = generate_state(state_dim, black_box(rand::random()));
                    tracker.update_node(black_box(node_id), new_state);
                    black_box(tracker.energy())
                })
            },
        );
    }
    group.finish();
}
/// Benchmark incremental vs full recomputation
///
/// Contrasts a single-node incremental update against recomputing every edge
/// energy on the same 10K-node graph (ADR-014's <10ms full-graph target).
fn bench_incremental_vs_full(c: &mut Criterion) {
    let mut group = c.benchmark_group("incremental_vs_full");
    let num_nodes = 10_000;
    let state_dim = 64;
    let avg_degree = 4;
    let mut tracker = create_random_graph(num_nodes, avg_degree, state_dim);
    // Incremental update
    group.bench_function("incremental_single", |b| {
        let node_id = 5000u64;
        b.iter(|| {
            // State generation is part of the incremental measurement only.
            let new_state = generate_state(state_dim, rand::random());
            tracker.update_node(black_box(node_id), new_state);
            black_box(tracker.energy())
        })
    });
    // Full recomputation
    group.bench_function("full_recompute", |b| {
        b.iter(|| {
            tracker.full_recompute();
            black_box(tracker.energy())
        })
    });
    group.finish();
}
/// Benchmark node degree impact on update time
///
/// Incremental update cost scales with the number of incident edges, so a
/// hub node (degree 1000) is compared against a chain node (degree 2).
fn bench_node_degree_impact(c: &mut Criterion) {
    let mut group = c.benchmark_group("incremental_degree_impact");
    let num_nodes = 10_000;
    let state_dim = 64;
    // Create graph with hub node (high degree)
    let nodes: HashMap<u64, SheafNode> = (0..num_nodes as u64)
        .map(|id| {
            (
                id,
                SheafNode {
                    id,
                    state: generate_state(state_dim, id),
                },
            )
        })
        .collect();
    // Hub node 0 connects to many nodes
    let hub_degree = 1000;
    let mut edges: Vec<SheafEdge> = (1..=hub_degree)
        .map(|i| SheafEdge {
            id: i as u64,
            source: 0,
            target: i as u64,
            weight: 1.0,
            rho_source: RestrictionMap::identity(state_dim),
            rho_target: RestrictionMap::identity(state_dim),
        })
        .collect();
    // Regular edges for other nodes (degree ~4)
    // NOTE(review): this actually builds a simple chain, so these nodes have
    // degree 2 (endpoints degree 1), not ~4 as the comment above suggests.
    for i in hub_degree + 1..num_nodes - 1 {
        edges.push(SheafEdge {
            id: i as u64,
            source: i as u64,
            target: (i + 1) as u64,
            weight: 1.0,
            rho_source: RestrictionMap::identity(state_dim),
            rho_target: RestrictionMap::identity(state_dim),
        });
    }
    let mut tracker = IncrementalCoherence::new(nodes, edges, state_dim);
    // Update hub node (high degree)
    group.bench_function("update_hub_1000_edges", |b| {
        b.iter(|| {
            let new_state = generate_state(state_dim, rand::random());
            tracker.update_node(black_box(0), new_state);
            black_box(tracker.energy())
        })
    });
    // Update leaf node (degree 1-2)
    group.bench_function("update_leaf_2_edges", |b| {
        let leaf_id = (hub_degree + 100) as u64;
        b.iter(|| {
            let new_state = generate_state(state_dim, rand::random());
            tracker.update_node(black_box(leaf_id), new_state);
            black_box(tracker.energy())
        })
    });
    group.finish();
}
/// Benchmark batch updates
///
/// Measures amortization through `update_nodes_batch`: a batch recomputes
/// each affected edge once even when several updated nodes share it.
fn bench_batch_updates(c: &mut Criterion) {
    let mut group = c.benchmark_group("incremental_batch");
    let num_nodes = 10_000;
    let state_dim = 64;
    let avg_degree = 4;
    for batch_size in [1, 10, 100, 1000] {
        let mut tracker = create_random_graph(num_nodes, avg_degree, state_dim);
        group.throughput(Throughput::Elements(batch_size as u64));
        group.bench_with_input(
            BenchmarkId::new("batch_update", batch_size),
            &batch_size,
            |b, &size| {
                b.iter(|| {
                    // Building the update vector (allocation + hashing) is
                    // part of the timed region.
                    let updates: Vec<(u64, Vec<f32>)> = (0..size)
                        .map(|i| {
                            let node_id = (i * 10) as u64 % num_nodes as u64;
                            let state = generate_state(state_dim, rand::random());
                            (node_id, state)
                        })
                        .collect();
                    tracker.update_nodes_batch(black_box(updates));
                    black_box(tracker.energy())
                })
            },
        );
    }
    group.finish();
}
/// Benchmark state dimension impact
///
/// Per-edge work is O(state_dim^2) through the restriction maps' dense
/// matrix-vector products, so update time should grow quadratically in dim.
fn bench_state_dim_impact(c: &mut Criterion) {
    let mut group = c.benchmark_group("incremental_state_dim");
    let num_nodes = 10_000;
    let avg_degree = 4;
    for state_dim in [8, 32, 64, 128, 256] {
        let mut tracker = create_random_graph(num_nodes, avg_degree, state_dim);
        group.bench_with_input(
            BenchmarkId::new("update", state_dim),
            &state_dim,
            |b, &dim| {
                let node_id = 5000u64;
                b.iter(|| {
                    let new_state = generate_state(dim, rand::random());
                    tracker.update_node(black_box(node_id), new_state);
                    black_box(tracker.energy())
                })
            },
        );
    }
    group.finish();
}
/// Benchmark index lookup performance
///
/// Isolates the cost of the node -> incident-edge HashMap lookup used by
/// every incremental update, on a 100K-node graph.
fn bench_index_lookup(c: &mut Criterion) {
    let mut group = c.benchmark_group("incremental_index_lookup");
    let num_nodes = 100_000;
    let avg_degree = 4;
    let state_dim = 64;
    let tracker = create_random_graph(num_nodes, avg_degree, state_dim);
    // Lookup incident edges for a node
    group.bench_function("lookup_incident_edges", |b| {
        b.iter(|| {
            let node_id = black_box(50_000u64);
            black_box(tracker.node_to_edges.get(&node_id))
        })
    });
    // Iterate incident edges
    group.bench_function("iterate_incident_edges", |b| {
        let node_id = 50_000u64;
        b.iter(|| {
            // Lookup plus a gather-sum over the cached per-edge energies.
            let sum = if let Some(edges) = tracker.node_to_edges.get(&node_id) {
                edges.iter().map(|&idx| tracker.edge_energies[idx]).sum()
            } else {
                0.0f32
            };
            black_box(sum)
        })
    });
    group.finish();
}
/// Benchmark fingerprint operations
///
/// The fingerprint is a wrapping counter bumped on every mutation; this
/// measures the staleness check alone and an update combined with the check.
fn bench_fingerprint(c: &mut Criterion) {
    let mut group = c.benchmark_group("incremental_fingerprint");
    let num_nodes = 10_000;
    let avg_degree = 4;
    let state_dim = 64;
    let mut tracker = create_random_graph(num_nodes, avg_degree, state_dim);
    group.bench_function("check_staleness", |b| {
        let fp = tracker.fingerprint;
        b.iter(|| black_box(tracker.is_stale(black_box(fp))))
    });
    group.bench_function("update_with_fingerprint_check", |b| {
        let node_id = 5000u64;
        b.iter(|| {
            let old_fp = tracker.fingerprint;
            let new_state = generate_state(state_dim, rand::random());
            tracker.update_node(black_box(node_id), new_state);
            let is_changed = tracker.is_stale(old_fp);
            black_box((tracker.energy(), is_changed))
        })
    });
    group.finish();
}
/// Benchmark worst case: update all nodes sequentially
///
/// Each iteration performs a full sweep of 1000 single-node updates, so the
/// sample size is reduced to keep total runtime reasonable.
fn bench_sequential_all_updates(c: &mut Criterion) {
    let mut group = c.benchmark_group("incremental_sequential_all");
    group.sample_size(10);
    let num_nodes = 1000;
    let avg_degree = 4;
    let state_dim = 64;
    let mut tracker = create_random_graph(num_nodes, avg_degree, state_dim);
    group.bench_function("update_all_1000_sequential", |b| {
        b.iter(|| {
            for node_id in 0..num_nodes as u64 {
                let new_state = generate_state(state_dim, node_id);
                tracker.update_node(node_id, new_state);
            }
            black_box(tracker.energy())
        })
    });
    group.finish();
}
// Registers every benchmark function above with criterion's harness.
criterion_group!(
    benches,
    bench_single_node_update,
    bench_incremental_vs_full,
    bench_node_degree_impact,
    bench_batch_updates,
    bench_state_dim_impact,
    bench_index_lookup,
    bench_fingerprint,
    bench_sequential_all_updates,
);
// Expands to the `main` entry point that runs the `benches` group.
criterion_main!(benches);

View File

@@ -0,0 +1,630 @@
//! Benchmarks for dynamic mincut updates
//!
//! ADR-014 Performance Target: n^o(1) amortized time per update
//!
//! The mincut algorithm isolates incoherent subgraphs using
//! subpolynomial dynamic updates.
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
use std::collections::{HashMap, HashSet, VecDeque};
// ============================================================================
// Dynamic MinCut Types (Simulated for benchmarking)
// ============================================================================
/// Edge in dynamic graph (materialized form; storage is adjacency maps).
#[derive(Clone, Copy)]
pub struct Edge {
    pub source: u64,
    pub target: u64,
    pub weight: f64,
}
/// Dynamic graph with mincut tracking
///
/// Undirected weighted graph stored as symmetric adjacency maps, with a
/// cached connected-components result invalidated on structural change.
pub struct DynamicGraph {
    /// Adjacency lists
    adjacency: HashMap<u64, HashMap<u64, f64>>,
    /// Total edge count
    edge_count: usize,
    /// Vertex count
    vertex_count: usize,
    /// Cached connected components
    components: Option<Vec<HashSet<u64>>>,
    /// Modification counter for cache invalidation
    mod_count: u64,
}
impl DynamicGraph {
    /// Empty graph.
    pub fn new() -> Self {
        Self {
            adjacency: HashMap::new(),
            edge_count: 0,
            vertex_count: 0,
            components: None,
            mod_count: 0,
        }
    }
    /// Empty graph with preallocated vertex capacity (edge hint unused).
    pub fn with_capacity(vertices: usize, _edges: usize) -> Self {
        Self {
            adjacency: HashMap::with_capacity(vertices),
            edge_count: 0,
            vertex_count: 0,
            components: None,
            mod_count: 0,
        }
    }
    /// Insert an undirected edge; returns `false` if it already exists.
    ///
    /// Fix: the component cache was invalidated and `mod_count` bumped even
    /// when the insert was a duplicate no-op; both now change only on an
    /// actual structural modification.
    pub fn insert_edge(&mut self, source: u64, target: u64, weight: f64) -> bool {
        if self.has_edge(source, target) {
            return false;
        }
        self.components = None;
        self.mod_count += 1;
        self.adjacency.entry(source).or_default().insert(target, weight);
        self.adjacency.entry(target).or_default().insert(source, weight);
        self.edge_count += 1;
        self.vertex_count = self.adjacency.len();
        true
    }
    /// Delete an undirected edge; returns `false` if it was not present.
    ///
    /// Fix: caches/counters are now left untouched on a no-op delete.
    pub fn delete_edge(&mut self, source: u64, target: u64) -> bool {
        let removed = self
            .adjacency
            .get_mut(&source)
            .map_or(false, |adj| adj.remove(&target).is_some());
        if removed {
            // Remove the symmetric entry as well.
            if let Some(adj) = self.adjacency.get_mut(&target) {
                adj.remove(&source);
            }
            self.edge_count -= 1;
            self.components = None;
            self.mod_count += 1;
        }
        removed
    }
    /// Check if edge exists (symmetric storage, so one direction suffices).
    pub fn has_edge(&self, source: u64, target: u64) -> bool {
        self.adjacency
            .get(&source)
            .map(|adj| adj.contains_key(&target))
            .unwrap_or(false)
    }
    /// Get vertex degree (0 for unknown vertices).
    pub fn degree(&self, vertex: u64) -> usize {
        self.adjacency
            .get(&vertex)
            .map(|adj| adj.len())
            .unwrap_or(0)
    }
    /// Get neighbors (empty for unknown vertices; arbitrary order).
    pub fn neighbors(&self, vertex: u64) -> Vec<u64> {
        self.adjacency
            .get(&vertex)
            .map(|adj| adj.keys().copied().collect())
            .unwrap_or_default()
    }
    /// Compute (or return cached) connected components using BFS.
    pub fn connected_components(&mut self) -> &Vec<HashSet<u64>> {
        if self.components.is_none() {
            let mut visited = HashSet::new();
            let mut components = Vec::new();
            for &vertex in self.adjacency.keys() {
                if visited.contains(&vertex) {
                    continue;
                }
                let mut component = HashSet::new();
                let mut queue = VecDeque::new();
                queue.push_back(vertex);
                while let Some(v) = queue.pop_front() {
                    // `insert` returns false for already-visited vertices.
                    if visited.insert(v) {
                        component.insert(v);
                        if let Some(neighbors) = self.adjacency.get(&v) {
                            for &neighbor in neighbors.keys() {
                                if !visited.contains(&neighbor) {
                                    queue.push_back(neighbor);
                                }
                            }
                        }
                    }
                }
                components.push(component);
            }
            self.components = Some(components);
        }
        self.components.as_ref().unwrap()
    }
    /// Check if graph is connected (zero or one components).
    pub fn is_connected(&mut self) -> bool {
        self.connected_components().len() <= 1
    }
    /// Materialize the undirected edge list (each edge reported once).
    pub fn edges(&self) -> Vec<Edge> {
        let mut edges = Vec::with_capacity(self.edge_count);
        let mut seen = HashSet::new();
        for (&source, neighbors) in &self.adjacency {
            for (&target, &weight) in neighbors {
                // Canonical (min, max) key dedupes the symmetric storage.
                let key = if source < target {
                    (source, target)
                } else {
                    (target, source)
                };
                if seen.insert(key) {
                    edges.push(Edge {
                        source,
                        target,
                        weight,
                    });
                }
            }
        }
        edges
    }
    /// Summary statistics over the current graph.
    pub fn stats(&self) -> GraphStats {
        GraphStats {
            vertices: self.vertex_count,
            edges: self.edge_count,
            max_degree: self
                .adjacency
                .values()
                .map(|adj| adj.len())
                .max()
                .unwrap_or(0),
            avg_degree: if self.vertex_count > 0 {
                (self.edge_count * 2) as f64 / self.vertex_count as f64
            } else {
                0.0
            },
        }
    }
}
/// Summary counters returned by `DynamicGraph::stats`.
pub struct GraphStats {
    pub vertices: usize,
    pub edges: usize,
    pub max_degree: usize,
    pub avg_degree: f64,
}
/// Subpolynomial MinCut (simplified simulation)
/// Real implementation would use randomized contraction or tree packing
pub struct SubpolynomialMinCut {
    graph: DynamicGraph,
    /// Cached mincut value
    cached_mincut: Option<f64>,
    /// Update count since last computation
    updates_since_compute: usize,
    /// Threshold for recomputation
    recompute_threshold: usize,
}
impl SubpolynomialMinCut {
    /// Empty structure with a small fixed recompute threshold.
    pub fn new() -> Self {
        Self {
            graph: DynamicGraph::new(),
            cached_mincut: None,
            updates_since_compute: 0,
            recompute_threshold: 10,
        }
    }
    /// Preallocated structure; the recompute threshold scales as
    /// sqrt(vertices), floored at 10.
    pub fn with_capacity(vertices: usize, edges: usize) -> Self {
        Self {
            graph: DynamicGraph::with_capacity(vertices, edges),
            cached_mincut: None,
            updates_since_compute: 0,
            recompute_threshold: ((vertices as f64).sqrt() as usize).max(10),
        }
    }
    /// Insert an edge; the cached value is kept and refreshed lazily once
    /// enough updates have accumulated.
    pub fn insert_edge(&mut self, source: u64, target: u64, weight: f64) -> bool {
        let inserted = self.graph.insert_edge(source, target, weight);
        if inserted {
            self.updates_since_compute += 1;
        }
        inserted
    }
    /// Delete an edge; a successful deletion drops the cached value, since
    /// removing capacity may change the cut.
    pub fn delete_edge(&mut self, source: u64, target: u64) -> bool {
        let removed = self.graph.delete_edge(source, target);
        if removed {
            self.updates_since_compute += 1;
            self.cached_mincut = None;
        }
        removed
    }
    /// Lazy mincut: serve the cached value while it is fresh, otherwise
    /// recompute the approximation and reset the staleness counter.
    pub fn min_cut(&mut self) -> f64 {
        let fresh = self
            .cached_mincut
            .filter(|_| self.updates_since_compute < self.recompute_threshold);
        if let Some(value) = fresh {
            return value;
        }
        let value = self.compute_mincut_approximation();
        self.cached_mincut = Some(value);
        self.updates_since_compute = 0;
        value
    }
    /// Minimum weighted degree over all vertices; an empty graph yields 0.
    fn compute_mincut_approximation(&self) -> f64 {
        let min_weighted_degree = self
            .graph
            .adjacency
            .values()
            .map(|neighbors| neighbors.values().sum::<f64>())
            .fold(f64::MAX, f64::min);
        if min_weighted_degree == f64::MAX {
            0.0
        } else {
            min_weighted_degree
        }
    }
    /// Two-way partition: split a single component roughly in half, or
    /// separate the first connected component from all the others.
    pub fn partition(&mut self) -> (HashSet<u64>, HashSet<u64>) {
        let components = self.graph.connected_components();
        match components.len() {
            0 => (HashSet::new(), HashSet::new()),
            1 => {
                let vertices: Vec<u64> = components[0].iter().copied().collect();
                let (front, back) = vertices.split_at(vertices.len() / 2);
                (
                    front.iter().copied().collect(),
                    back.iter().copied().collect(),
                )
            }
            _ => {
                let left = components[0].clone();
                let right = components[1..]
                    .iter()
                    .flat_map(|component| component.iter())
                    .copied()
                    .collect();
                (left, right)
            }
        }
    }
}
// ============================================================================
// Test Data Generation
// ============================================================================
/// Deterministic random edge list: up to `m` unique undirected edges over
/// `n` vertices, endpoints derived by hashing (seed, attempt, tag).
fn generate_random_graph(n: usize, m: usize, seed: u64) -> Vec<(u64, u64, f64)> {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};
    let hash_endpoint = |i: usize, tag: &str| -> u64 {
        let mut hasher = DefaultHasher::new();
        (seed, i, tag).hash(&mut hasher);
        hasher.finish() % n as u64
    };
    let mut edges = Vec::with_capacity(m);
    let mut seen = HashSet::new();
    // At most 2m attempts; self-loops and duplicate pairs are skipped, so the
    // result may hold fewer than m edges.
    for attempt in 0..m * 2 {
        if edges.len() >= m {
            break;
        }
        let u = hash_endpoint(attempt, "source");
        let v = hash_endpoint(attempt, "target");
        if u == v {
            continue;
        }
        let key = if u < v { (u, v) } else { (v, u) };
        if seen.insert(key) {
            edges.push((u, v, 1.0));
        }
    }
    edges
}
// ============================================================================
// Benchmarks
// ============================================================================
/// Benchmark edge insertion
///
/// Pre-populates half the edge list, then times single inserts. Endpoints
/// are offset by `n` so timed inserts target fresh vertex ids.
fn bench_insert_edge(c: &mut Criterion) {
    let mut group = c.benchmark_group("mincut_insert");
    group.throughput(Throughput::Elements(1));
    for size in [100, 1000, 10000] {
        let edges = generate_random_graph(size, size * 2, 42);
        let mut mincut = SubpolynomialMinCut::with_capacity(size, size * 3);
        // Pre-populate
        for (u, v, w) in &edges[..edges.len() / 2] {
            mincut.insert_edge(*u, *v, *w);
        }
        group.bench_with_input(BenchmarkId::new("insert_single", size), &size, |b, &n| {
            let mut i = edges.len() / 2;
            b.iter(|| {
                // NOTE(review): the graph grows across iterations; once `i`
                // wraps past `edges.len()`, repeated (u+n, v+n) pairs become
                // duplicate no-op inserts.
                let (u, v, w) = edges[i % edges.len()];
                black_box(mincut.insert_edge(u + n as u64, v + n as u64, w));
                i += 1;
            })
        });
    }
    group.finish();
}
/// Benchmark edge deletion
///
/// `iter_batched` rebuilds a fully populated structure per batch (setup is
/// untimed); each timed call deletes one present edge from a fresh copy.
fn bench_delete_edge(c: &mut Criterion) {
    let mut group = c.benchmark_group("mincut_delete");
    group.throughput(Throughput::Elements(1));
    for size in [100, 1000, 10000] {
        let edges = generate_random_graph(size, size * 2, 42);
        group.bench_with_input(BenchmarkId::new("delete_single", size), &size, |b, _| {
            b.iter_batched(
                || {
                    let mut mincut = SubpolynomialMinCut::with_capacity(size, size * 3);
                    for (u, v, w) in &edges {
                        mincut.insert_edge(*u, *v, *w);
                    }
                    (mincut, edges.clone())
                },
                |(mut mincut, edges)| {
                    // Always the middle edge, so every sample removes an edge
                    // that is actually present.
                    let (u, v, _) = edges[edges.len() / 2];
                    black_box(mincut.delete_edge(u, v))
                },
                criterion::BatchSize::SmallInput,
            )
        });
    }
    group.finish();
}
/// Benchmark mincut query
///
/// Cold queries rebuild an uncached structure per sample (adjacency cloned
/// in the untimed setup); warm queries return the cached value.
fn bench_mincut_query(c: &mut Criterion) {
    let mut group = c.benchmark_group("mincut_query");
    group.throughput(Throughput::Elements(1));
    for size in [100, 1000, 10000] {
        let edges = generate_random_graph(size, size * 2, 42);
        let mut mincut = SubpolynomialMinCut::with_capacity(size, size * 3);
        for (u, v, w) in &edges {
            mincut.insert_edge(*u, *v, *w);
        }
        // Cold query (no cache)
        group.bench_with_input(BenchmarkId::new("cold_query", size), &size, |b, _| {
            b.iter_batched(
                || {
                    let mc = mincut.graph.adjacency.clone();
                    SubpolynomialMinCut {
                        graph: DynamicGraph {
                            adjacency: mc,
                            edge_count: mincut.graph.edge_count,
                            vertex_count: mincut.graph.vertex_count,
                            components: None,
                            mod_count: 0,
                        },
                        cached_mincut: None,
                        updates_since_compute: 0,
                        recompute_threshold: 10,
                    }
                },
                |mut mc| black_box(mc.min_cut()),
                criterion::BatchSize::SmallInput,
            )
        });
        // Warm query (cached)
        mincut.min_cut(); // Prime cache
        group.bench_with_input(BenchmarkId::new("warm_query", size), &size, |b, _| {
            b.iter(|| black_box(mincut.min_cut()))
        });
    }
    group.finish();
}
/// Benchmark scaling behavior (verify subpolynomial)
///
/// Sizes are spaced roughly half a decade apart (10^2 .. 10^4) so sub-linear
/// growth of the amortized insert+query cost is visible on a log scale.
fn bench_scaling(c: &mut Criterion) {
    let mut group = c.benchmark_group("mincut_scaling");
    group.sample_size(20);
    // Sizes chosen for subpolynomial verification
    // n^(2/3) scaling should show sub-linear growth
    let sizes = vec![100, 316, 1000, 3162, 10000];
    for size in sizes {
        let edges = generate_random_graph(size, size * 2, 42);
        // Measure insert amortized time
        group.throughput(Throughput::Elements(1));
        group.bench_with_input(
            BenchmarkId::new("insert_amortized", size),
            &size,
            |b, &n| {
                b.iter_batched(
                    || {
                        let mut mincut = SubpolynomialMinCut::with_capacity(n, n * 3);
                        for (u, v, w) in &edges[..edges.len() / 2] {
                            mincut.insert_edge(*u, *v, *w);
                        }
                        (mincut, n)
                    },
                    |(mut mincut, n)| {
                        // Ten fresh inserts plus one (possibly cached) mincut
                        // query per timed invocation.
                        for i in 0..10 {
                            let u = (i * 37) as u64 % n as u64;
                            let v = (i * 73 + 1) as u64 % n as u64;
                            if u != v {
                                mincut.insert_edge(u + n as u64, v + n as u64, 1.0);
                            }
                        }
                        black_box(mincut.min_cut())
                    },
                    criterion::BatchSize::SmallInput,
                )
            },
        );
    }
    group.finish();
}
/// Benchmark mixed workload
///
/// Intended mix: 50% inserts, 30% deletes, 20% mincut queries.
///
/// Fix: the operation counter used to be part of the per-iteration batched
/// input `(mincut, 0usize)`, so every timed invocation saw `op_idx == 0` and
/// only ever exercised the insert branch — the declared mix never ran. The
/// counter now lives outside `iter_batched` and is captured mutably by the
/// routine, so successive invocations cycle through all three operations.
fn bench_mixed_workload(c: &mut Criterion) {
    let mut group = c.benchmark_group("mincut_mixed");
    group.throughput(Throughput::Elements(1));
    for size in [100, 1000, 10000] {
        let edges = generate_random_graph(size, size * 2, 42);
        group.bench_with_input(BenchmarkId::new("mixed_ops", size), &size, |b, &n| {
            // Persists across timed iterations; selects the op for each call.
            let mut op_idx = 0usize;
            b.iter_batched(
                || {
                    // Fresh fully-populated structure per batch (untimed).
                    let mut mincut = SubpolynomialMinCut::with_capacity(n, n * 3);
                    for (u, v, w) in &edges {
                        mincut.insert_edge(*u, *v, *w);
                    }
                    mincut
                },
                |mut mincut| {
                    // 50% insert, 30% delete, 20% query
                    match op_idx % 10 {
                        0..=4 => {
                            let u = (op_idx * 37) as u64 % n as u64;
                            let v = (op_idx * 73 + 1) as u64 % n as u64;
                            if u != v {
                                mincut.insert_edge(u + n as u64, v + n as u64, 1.0);
                            }
                        }
                        5..=7 => {
                            if !edges.is_empty() {
                                let (u, v, _) = edges[op_idx % edges.len()];
                                mincut.delete_edge(u, v);
                            }
                        }
                        _ => {
                            let _ = mincut.min_cut();
                        }
                    }
                    op_idx += 1;
                    black_box(op_idx)
                },
                criterion::BatchSize::SmallInput,
            )
        });
    }
    group.finish();
}
/// Benchmark partition computation
///
/// `partition` goes through `connected_components`, which caches its result,
/// so iterations after the first mostly measure the split/copy work.
fn bench_partition(c: &mut Criterion) {
    let mut group = c.benchmark_group("mincut_partition");
    for size in [100, 1000, 10000] {
        let edges = generate_random_graph(size, size * 2, 42);
        let mut mincut = SubpolynomialMinCut::with_capacity(size, size * 3);
        for (u, v, w) in &edges {
            mincut.insert_edge(*u, *v, *w);
        }
        group.bench_with_input(BenchmarkId::new("partition", size), &size, |b, _| {
            b.iter(|| black_box(mincut.partition()))
        });
    }
    group.finish();
}
/// Benchmark connected components
///
/// Builds five disjoint chains, then times a full BFS sweep; the cache is
/// cleared inside the timed closure so every iteration recomputes.
fn bench_components(c: &mut Criterion) {
    let mut group = c.benchmark_group("mincut_components");
    for size in [100, 1000, 10000] {
        // Create graph with multiple components
        let mut mincut = SubpolynomialMinCut::with_capacity(size, size * 2);
        let component_size = size / 5;
        for comp in 0..5 {
            let offset = comp * component_size;
            for i in 0..component_size - 1 {
                let u = (offset + i) as u64;
                let v = (offset + i + 1) as u64;
                mincut.insert_edge(u, v, 1.0);
            }
        }
        group.bench_with_input(BenchmarkId::new("multi_component", size), &size, |b, _| {
            b.iter(|| {
                // Force recomputation
                mincut.graph.components = None;
                let components = mincut.graph.connected_components();
                black_box(components.len())
            })
        });
    }
    group.finish();
}
// Registers every benchmark function above with criterion's harness.
criterion_group!(
    benches,
    bench_insert_edge,
    bench_delete_edge,
    bench_mincut_query,
    bench_scaling,
    bench_mixed_workload,
    bench_partition,
    bench_components,
);
// Expands to the `main` entry point that runs the `benches` group.
criterion_main!(benches);

View File

@@ -0,0 +1,506 @@
//! Benchmarks for single residual calculation
//!
//! ADR-014 Performance Target: < 1us per residual calculation
//!
//! Residual is the core primitive: r_e = rho_u(x_u) - rho_v(x_v)
//! This measures the local constraint violation at each edge.
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
// ============================================================================
// Restriction Map Types (Simulated for benchmarking)
// ============================================================================
/// Linear restriction map: y = Ax + b
/// Maps node state to shared constraint space
#[derive(Clone)]
pub struct RestrictionMap {
    /// Linear transformation matrix (row-major, output_dim x input_dim)
    pub matrix: Vec<f32>,
    /// Bias vector
    pub bias: Vec<f32>,
    /// Input dimension
    pub input_dim: usize,
    /// Output dimension
    pub output_dim: usize,
}
impl RestrictionMap {
    /// Create identity restriction map (square, zero bias): y = x.
    pub fn identity(dim: usize) -> Self {
        let mut matrix = vec![0.0f32; dim * dim];
        for i in 0..dim {
            matrix[i * dim + i] = 1.0;
        }
        Self {
            matrix,
            bias: vec![0.0; dim],
            input_dim: dim,
            output_dim: dim,
        }
    }
    /// Create a deterministic pseudo-random map for testing; matrix and bias
    /// entries are hash-derived values in [-0.5, 0.5), reproducible per seed.
    pub fn random(input_dim: usize, output_dim: usize, seed: u64) -> Self {
        use std::collections::hash_map::DefaultHasher;
        use std::hash::{Hash, Hasher};
        let mut matrix = Vec::with_capacity(output_dim * input_dim);
        let mut bias = Vec::with_capacity(output_dim);
        for i in 0..(output_dim * input_dim) {
            let mut hasher = DefaultHasher::new();
            (seed, i).hash(&mut hasher);
            let val = (hasher.finish() % 1000) as f32 / 1000.0 - 0.5;
            matrix.push(val);
        }
        for i in 0..output_dim {
            let mut hasher = DefaultHasher::new();
            (seed, i, "bias").hash(&mut hasher);
            let val = (hasher.finish() % 1000) as f32 / 1000.0 - 0.5;
            bias.push(val);
        }
        Self {
            matrix,
            bias,
            input_dim,
            output_dim,
        }
    }
    /// Apply restriction map: y = Ax + b (allocating convenience wrapper).
    ///
    /// Fix: previously duplicated the matrix-vector loop of `apply_into`;
    /// it now delegates so the two code paths cannot drift apart.
    #[inline]
    pub fn apply(&self, input: &[f32]) -> Vec<f32> {
        let mut output = vec![0.0f32; self.output_dim];
        self.apply_into(input, &mut output);
        output
    }
    /// Apply restriction map into a caller-provided buffer (zero allocation).
    #[inline]
    pub fn apply_into(&self, input: &[f32], output: &mut [f32]) {
        debug_assert_eq!(input.len(), self.input_dim);
        debug_assert_eq!(output.len(), self.output_dim);
        // Start from the bias, then accumulate the matrix-vector product.
        output.copy_from_slice(&self.bias);
        for i in 0..self.output_dim {
            let row_start = i * self.input_dim;
            for j in 0..self.input_dim {
                output[i] += self.matrix[row_start + j] * input[j];
            }
        }
    }
}
/// Edge with restriction maps
///
/// Carries the two per-endpoint maps plus the weight; the four methods below
/// are the allocating vs. zero-allocation variants compared by the benches.
pub struct SheafEdge {
    pub source: u64,
    pub target: u64,
    /// Edge weight w_e used by the energy methods.
    pub weight: f32,
    /// Restriction map applied to the source state.
    pub rho_source: RestrictionMap,
    /// Restriction map applied to the target state.
    pub rho_target: RestrictionMap,
}
impl SheafEdge {
    /// Calculate the edge residual (local mismatch)
    /// r_e = rho_u(x_u) - rho_v(x_v)
    ///
    /// Allocates two projected vectors plus the result; `residual_into` is
    /// the allocation-free counterpart.
    #[inline]
    pub fn residual(&self, source_state: &[f32], target_state: &[f32]) -> Vec<f32> {
        let projected_source = self.rho_source.apply(source_state);
        let projected_target = self.rho_target.apply(target_state);
        projected_source
            .iter()
            .zip(projected_target.iter())
            .map(|(a, b)| a - b)
            .collect()
    }
    /// Calculate residual with pre-allocated buffers (zero allocation).
    ///
    /// Buffers must match the maps' output dimension; the subtraction loop is
    /// bounded by `residual.len()`, so shorter scratch buffers panic.
    #[inline]
    pub fn residual_into(
        &self,
        source_state: &[f32],
        target_state: &[f32],
        source_buf: &mut [f32],
        target_buf: &mut [f32],
        residual: &mut [f32],
    ) {
        self.rho_source.apply_into(source_state, source_buf);
        self.rho_target.apply_into(target_state, target_buf);
        for i in 0..residual.len() {
            residual[i] = source_buf[i] - target_buf[i];
        }
    }
    /// Calculate weighted residual norm squared: w_e * |r_e|^2
    /// (allocating baseline; materializes the residual vector first).
    #[inline]
    pub fn weighted_residual_energy(&self, source: &[f32], target: &[f32]) -> f32 {
        let r = self.residual(source, target);
        let norm_sq: f32 = r.iter().map(|x| x * x).sum();
        self.weight * norm_sq
    }
    /// Weighted residual energy with pre-allocated buffers
    /// (fused: no intermediate residual vector is materialized).
    #[inline]
    pub fn weighted_residual_energy_into(
        &self,
        source: &[f32],
        target: &[f32],
        source_buf: &mut [f32],
        target_buf: &mut [f32],
    ) -> f32 {
        self.rho_source.apply_into(source, source_buf);
        self.rho_target.apply_into(target, target_buf);
        let mut norm_sq = 0.0f32;
        for i in 0..source_buf.len() {
            let diff = source_buf[i] - target_buf[i];
            norm_sq += diff * diff;
        }
        self.weight * norm_sq
    }
}
// ============================================================================
// Benchmarks
// ============================================================================
/// Deterministic pseudo-random state vector with entries in [-0.5, 0.5),
/// reproducible for a given `seed`.
fn generate_state(dim: usize, seed: u64) -> Vec<f32> {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};
    let mut state = Vec::with_capacity(dim);
    for idx in 0..dim {
        let mut hasher = DefaultHasher::new();
        (seed, idx).hash(&mut hasher);
        let bucket = hasher.finish() % 1000;
        state.push(bucket as f32 / 1000.0 - 0.5);
    }
    state
}
/// Benchmark single residual calculation at various dimensions
///
/// ADR-014 target: < 1us per residual. Identity maps isolate the projection
/// + subtract cost; projection maps add a dense rectangular mat-vec.
fn bench_single_residual(c: &mut Criterion) {
    let mut group = c.benchmark_group("residual_single");
    group.throughput(Throughput::Elements(1));
    // Test dimensions relevant for coherence engine:
    // 8: Minimal state
    // 32: Compact embedding
    // 64: Standard embedding
    // 128: Rich state
    // 256: Large state
    for dim in [8, 32, 64, 128, 256] {
        let rho_source = RestrictionMap::identity(dim);
        let rho_target = RestrictionMap::identity(dim);
        let source_state = generate_state(dim, 42);
        let target_state = generate_state(dim, 123);
        let edge = SheafEdge {
            source: 0,
            target: 1,
            weight: 1.0,
            rho_source,
            rho_target,
        };
        group.bench_with_input(BenchmarkId::new("identity_map", dim), &dim, |b, _| {
            b.iter(|| edge.residual(black_box(&source_state), black_box(&target_state)))
        });
    }
    // Test with projection (non-identity maps)
    for (input_dim, output_dim) in [(64, 32), (128, 64), (256, 128)] {
        let rho_source = RestrictionMap::random(input_dim, output_dim, 42);
        let rho_target = RestrictionMap::random(input_dim, output_dim, 123);
        let source_state = generate_state(input_dim, 42);
        let target_state = generate_state(input_dim, 123);
        let edge = SheafEdge {
            source: 0,
            target: 1,
            weight: 1.0,
            rho_source,
            rho_target,
        };
        group.bench_with_input(
            BenchmarkId::new("projection_map", format!("{}to{}", input_dim, output_dim)),
            &(input_dim, output_dim),
            |b, _| b.iter(|| edge.residual(black_box(&source_state), black_box(&target_state))),
        );
    }
    group.finish();
}
/// Benchmark residual calculation with pre-allocated buffers (zero allocation)
///
/// Counterpart to `residual_single/identity_map`: same math, but the three
/// scratch buffers are allocated once and reused across iterations.
fn bench_residual_zero_alloc(c: &mut Criterion) {
    let mut group = c.benchmark_group("residual_zero_alloc");
    group.throughput(Throughput::Elements(1));
    for dim in [32, 64, 128, 256] {
        let rho_source = RestrictionMap::identity(dim);
        let rho_target = RestrictionMap::identity(dim);
        let source_state = generate_state(dim, 42);
        let target_state = generate_state(dim, 123);
        let edge = SheafEdge {
            source: 0,
            target: 1,
            weight: 1.0,
            rho_source,
            rho_target,
        };
        // Pre-allocate buffers
        let mut source_buf = vec![0.0f32; dim];
        let mut target_buf = vec![0.0f32; dim];
        let mut residual = vec![0.0f32; dim];
        group.bench_with_input(BenchmarkId::new("dim", dim), &dim, |b, _| {
            b.iter(|| {
                edge.residual_into(
                    black_box(&source_state),
                    black_box(&target_state),
                    black_box(&mut source_buf),
                    black_box(&mut target_buf),
                    black_box(&mut residual),
                )
            })
        });
    }
    group.finish();
}
/// Benchmark weighted residual energy computation
///
/// Measures `w_e * |r_e|^2` for one edge (the per-edge term of the global
/// energy E(S)), comparing the allocating and buffer-reusing variants.
fn bench_weighted_energy(c: &mut Criterion) {
    let mut group = c.benchmark_group("residual_weighted_energy");
    group.throughput(Throughput::Elements(1));
    for dim in [32, 64, 128, 256] {
        let rho_source = RestrictionMap::identity(dim);
        let rho_target = RestrictionMap::identity(dim);
        let source_state = generate_state(dim, 42);
        let target_state = generate_state(dim, 123);
        let edge = SheafEdge {
            source: 0,
            target: 1,
            // Non-unit weight so the weighting multiply is exercised.
            weight: 1.5,
            rho_source,
            rho_target,
        };
        group.bench_with_input(BenchmarkId::new("allocating", dim), &dim, |b, _| {
            b.iter(|| {
                edge.weighted_residual_energy(black_box(&source_state), black_box(&target_state))
            })
        });
        // Pre-allocate buffers for zero-alloc version
        let mut source_buf = vec![0.0f32; dim];
        let mut target_buf = vec![0.0f32; dim];
        group.bench_with_input(BenchmarkId::new("zero_alloc", dim), &dim, |b, _| {
            b.iter(|| {
                edge.weighted_residual_energy_into(
                    black_box(&source_state),
                    black_box(&target_state),
                    black_box(&mut source_buf),
                    black_box(&mut target_buf),
                )
            })
        });
    }
    group.finish();
}
/// Benchmark batch residual computation (for parallel evaluation)
///
/// Sums weighted residual energy over a chain of edges (node i -> i+1),
/// providing a sequential baseline for future parallel implementations.
fn bench_batch_residual(c: &mut Criterion) {
    let mut group = c.benchmark_group("residual_batch");
    for batch_size in [10, 100, 1000] {
        let dim = 64;
        // Create batch of edges forming a simple chain.
        let edges: Vec<SheafEdge> = (0..batch_size)
            .map(|i| SheafEdge {
                source: i as u64,
                target: (i + 1) as u64,
                weight: 1.0,
                rho_source: RestrictionMap::identity(dim),
                rho_target: RestrictionMap::identity(dim),
            })
            .collect();
        // batch_size + 1 node states: edge i connects states[i] and states[i+1].
        let states: Vec<Vec<f32>> = (0..batch_size + 1)
            .map(|i| generate_state(dim, i as u64))
            .collect();
        group.throughput(Throughput::Elements(batch_size as u64));
        // Sequential computation
        group.bench_with_input(
            BenchmarkId::new("sequential", batch_size),
            &batch_size,
            |b, _| {
                b.iter(|| {
                    let mut total_energy = 0.0f32;
                    for (i, edge) in edges.iter().enumerate() {
                        total_energy += edge.weighted_residual_energy(
                            black_box(&states[i]),
                            black_box(&states[i + 1]),
                        );
                    }
                    black_box(total_energy)
                })
            },
        );
    }
    group.finish();
}
/// Benchmark restriction map application alone
///
/// Isolates the `rho.apply` cost (identity copy vs dense matrix-vector
/// multiply for projections), in both allocating and `_into` forms.
fn bench_restriction_map(c: &mut Criterion) {
    let mut group = c.benchmark_group("restriction_map");
    group.throughput(Throughput::Elements(1));
    // Identity maps
    for dim in [32, 64, 128, 256] {
        let rho = RestrictionMap::identity(dim);
        let input = generate_state(dim, 42);
        // Scratch buffer used only by the `apply_into` variant.
        let mut output = vec![0.0f32; dim];
        group.bench_with_input(BenchmarkId::new("identity_apply", dim), &dim, |b, _| {
            b.iter(|| rho.apply(black_box(&input)))
        });
        group.bench_with_input(
            BenchmarkId::new("identity_apply_into", dim),
            &dim,
            |b, _| b.iter(|| rho.apply_into(black_box(&input), black_box(&mut output))),
        );
    }
    // Projection maps (dense matrix multiply)
    for (input_dim, output_dim) in [(64, 32), (128, 64), (256, 128), (512, 256)] {
        let rho = RestrictionMap::random(input_dim, output_dim, 42);
        let input = generate_state(input_dim, 42);
        let mut output = vec![0.0f32; output_dim];
        group.bench_with_input(
            BenchmarkId::new("projection_apply", format!("{}x{}", input_dim, output_dim)),
            &(input_dim, output_dim),
            |b, _| b.iter(|| rho.apply(black_box(&input))),
        );
        group.bench_with_input(
            BenchmarkId::new(
                "projection_apply_into",
                format!("{}x{}", input_dim, output_dim),
            ),
            &(input_dim, output_dim),
            |b, _| b.iter(|| rho.apply_into(black_box(&input), black_box(&mut output))),
        );
    }
    group.finish();
}
/// Benchmark SIMD-optimized residual patterns
///
/// Compares three source-level formulations of |a - b|^2 — indexed scalar
/// loop, iterator chain, and 8-wide per-lane accumulators — to see which
/// the compiler auto-vectorizes best at each dimension.
fn bench_simd_patterns(c: &mut Criterion) {
    let mut group = c.benchmark_group("residual_simd_patterns");
    group.throughput(Throughput::Elements(1));
    // Aligned dimensions for SIMD (multiples of 8 for AVX2, 16 for AVX-512)
    for dim in [32, 64, 128, 256, 512] {
        let a = generate_state(dim, 42);
        let b = generate_state(dim, 123);
        // Scalar subtraction and norm
        group.bench_with_input(
            BenchmarkId::new("scalar_diff_norm", dim),
            &dim,
            |b_iter, _| {
                b_iter.iter(|| {
                    let mut norm_sq = 0.0f32;
                    for i in 0..dim {
                        let diff = a[i] - b[i];
                        norm_sq += diff * diff;
                    }
                    black_box(norm_sq)
                })
            },
        );
        // Iterator-based (auto-vectorization friendly)
        group.bench_with_input(
            BenchmarkId::new("iter_diff_norm", dim),
            &dim,
            |b_iter, _| {
                b_iter.iter(|| {
                    let norm_sq: f32 = a
                        .iter()
                        .zip(b.iter())
                        .map(|(x, y)| {
                            let d = x - y;
                            d * d
                        })
                        .sum();
                    black_box(norm_sq)
                })
            },
        );
        // Chunked for explicit SIMD opportunity
        group.bench_with_input(
            BenchmarkId::new("chunked_diff_norm", dim),
            &dim,
            |b_iter, _| {
                b_iter.iter(|| {
                    // Per-lane accumulators; every dim above is a multiple of
                    // 8, so each chunk is full-width.
                    let mut accum = [0.0f32; 8];
                    for (chunk_a, chunk_b) in a.chunks(8).zip(b.chunks(8)) {
                        for i in 0..chunk_a.len() {
                            let d = chunk_a[i] - chunk_b[i];
                            accum[i] += d * d;
                        }
                    }
                    black_box(accum.iter().sum::<f32>())
                })
            },
        );
    }
    group.finish();
}
// Register all residual/energy benchmark functions under one harness entry.
criterion_group!(
    benches,
    bench_single_residual,
    bench_residual_zero_alloc,
    bench_weighted_energy,
    bench_batch_residual,
    bench_restriction_map,
    bench_simd_patterns,
);
criterion_main!(benches);

View File

@@ -0,0 +1,800 @@
//! SIMD-Specific Benchmarks for Prime-Radiant Coherence Engine
//!
//! This benchmark suite compares naive/scalar implementations against
//! SIMD-optimized versions for core coherence operations.
//!
//! ## Benchmark Categories
//! 1. Dense Matrix Multiply - naive vs SIMD
//! 2. Vector Norm Computation - naive vs SIMD
//! 3. Batch Residual Computation - naive vs SIMD
//! 4. Dot Products and Reductions
//!
//! ## Architecture Notes
//! - x86_64: AVX2 (256-bit, f32x8) or AVX-512 (512-bit, f32x16)
//! - aarch64: NEON (128-bit, f32x4)
//! - WASM: SIMD128 (128-bit)
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
// ============================================================================
// TEST DATA GENERATION
// ============================================================================
/// Deterministic pseudo-random test vector with entries in [-0.5, 0.5).
///
/// Each element is derived by hashing `(seed, index)`, so the same
/// `(len, seed)` pair always yields the same vector — reproducible
/// benchmark inputs without an RNG dependency.
fn generate_vec(len: usize, seed: u64) -> Vec<f32> {
    let mut values = Vec::with_capacity(len);
    for idx in 0..len {
        let mut hasher = DefaultHasher::new();
        (seed, idx).hash(&mut hasher);
        let bucket = hasher.finish() % 1000;
        values.push(bucket as f32 / 1000.0 - 0.5);
    }
    values
}
/// Deterministic pseudo-random row-major matrix with entries in [-0.5, 0.5).
///
/// Identical hashing scheme to `generate_vec`, flattened over
/// `rows * cols` indices.
fn generate_matrix(rows: usize, cols: usize, seed: u64) -> Vec<f32> {
    let total = rows * cols;
    let mut data = Vec::with_capacity(total);
    for flat_idx in 0..total {
        let mut hasher = DefaultHasher::new();
        (seed, flat_idx).hash(&mut hasher);
        data.push((hasher.finish() % 1000) as f32 / 1000.0 - 0.5);
    }
    data
}
// ============================================================================
// NAIVE IMPLEMENTATIONS (BASELINE)
// ============================================================================
/// Naive matrix-vector multiply: y = Ax
///
/// `matrix` is row-major with `rows * cols` entries. The functions in this
/// section are deliberately written as indexed scalar loops: they are the
/// unoptimized baselines the SIMD variants are compared against, so do not
/// "modernize" the loop shapes — the generated code is what is measured.
/// `#[inline(never)]` keeps each call from being folded into the bench loop.
#[inline(never)]
fn matmul_naive(matrix: &[f32], x: &[f32], y: &mut [f32], rows: usize, cols: usize) {
    for i in 0..rows {
        let mut sum = 0.0f32;
        let row_start = i * cols;
        for j in 0..cols {
            sum += matrix[row_start + j] * x[j];
        }
        y[i] = sum;
    }
}
/// Naive squared norm: |v|^2
#[inline(never)]
fn norm_sq_naive(v: &[f32]) -> f32 {
    let mut sum = 0.0f32;
    for &x in v {
        sum += x * x;
    }
    sum
}
/// Naive dot product: a . b
///
/// Indexes by `a`'s length; panics if `b` is shorter than `a`.
#[inline(never)]
fn dot_naive(a: &[f32], b: &[f32]) -> f32 {
    let mut sum = 0.0f32;
    for i in 0..a.len() {
        sum += a[i] * b[i];
    }
    sum
}
/// Naive residual norm: |a - b|^2
#[inline(never)]
fn residual_norm_naive(a: &[f32], b: &[f32]) -> f32 {
    let mut sum = 0.0f32;
    for i in 0..a.len() {
        let diff = a[i] - b[i];
        sum += diff * diff;
    }
    sum
}
/// Naive batch residual computation
///
/// Sums |src - tgt|^2 over paired vectors; `zip` truncates to the shorter
/// of the two batches.
#[inline(never)]
fn batch_residual_naive(sources: &[Vec<f32>], targets: &[Vec<f32>]) -> f32 {
    let mut total = 0.0f32;
    for (src, tgt) in sources.iter().zip(targets.iter()) {
        total += residual_norm_naive(src, tgt);
    }
    total
}
// ============================================================================
// SIMD-FRIENDLY IMPLEMENTATIONS
// ============================================================================
/// Unrolled matrix-vector multiply (auto-vectorization friendly)
///
/// The functions in this section compute the same quantities as the naive
/// versions but restructure the loops — independent accumulators and
/// `chunks_exact` — so LLVM can vectorize them. The exact loop shapes are
/// the subject of measurement; floating-point results may differ from the
/// naive versions in the last bits because the summation order differs.
#[inline(never)]
fn matmul_unrolled(matrix: &[f32], x: &[f32], y: &mut [f32], rows: usize, cols: usize) {
    for i in 0..rows {
        let row_start = i * cols;
        // Process in chunks of 8 with one accumulator per lane to break the
        // serial dependency chain of a single `sum +=`.
        let chunks = cols / 8;
        let mut acc0 = 0.0f32;
        let mut acc1 = 0.0f32;
        let mut acc2 = 0.0f32;
        let mut acc3 = 0.0f32;
        let mut acc4 = 0.0f32;
        let mut acc5 = 0.0f32;
        let mut acc6 = 0.0f32;
        let mut acc7 = 0.0f32;
        for c in 0..chunks {
            let base = row_start + c * 8;
            acc0 += matrix[base] * x[c * 8];
            acc1 += matrix[base + 1] * x[c * 8 + 1];
            acc2 += matrix[base + 2] * x[c * 8 + 2];
            acc3 += matrix[base + 3] * x[c * 8 + 3];
            acc4 += matrix[base + 4] * x[c * 8 + 4];
            acc5 += matrix[base + 5] * x[c * 8 + 5];
            acc6 += matrix[base + 6] * x[c * 8 + 6];
            acc7 += matrix[base + 7] * x[c * 8 + 7];
        }
        let mut sum = acc0 + acc1 + acc2 + acc3 + acc4 + acc5 + acc6 + acc7;
        // Handle remainder
        for j in (chunks * 8)..cols {
            sum += matrix[row_start + j] * x[j];
        }
        y[i] = sum;
    }
}
/// Unrolled squared norm with 4 accumulators
#[inline(never)]
fn norm_sq_unrolled(v: &[f32]) -> f32 {
    let chunks = v.chunks_exact(4);
    let remainder = chunks.remainder();
    let mut acc0 = 0.0f32;
    let mut acc1 = 0.0f32;
    let mut acc2 = 0.0f32;
    let mut acc3 = 0.0f32;
    for chunk in chunks {
        acc0 += chunk[0] * chunk[0];
        acc1 += chunk[1] * chunk[1];
        acc2 += chunk[2] * chunk[2];
        acc3 += chunk[3] * chunk[3];
    }
    let mut sum = acc0 + acc1 + acc2 + acc3;
    for &x in remainder {
        sum += x * x;
    }
    sum
}
/// Unrolled squared norm with 8 accumulators (better for wider SIMD)
#[inline(never)]
fn norm_sq_unrolled_8(v: &[f32]) -> f32 {
    let chunks = v.chunks_exact(8);
    let remainder = chunks.remainder();
    let mut acc = [0.0f32; 8];
    for chunk in chunks {
        acc[0] += chunk[0] * chunk[0];
        acc[1] += chunk[1] * chunk[1];
        acc[2] += chunk[2] * chunk[2];
        acc[3] += chunk[3] * chunk[3];
        acc[4] += chunk[4] * chunk[4];
        acc[5] += chunk[5] * chunk[5];
        acc[6] += chunk[6] * chunk[6];
        acc[7] += chunk[7] * chunk[7];
    }
    let mut sum: f32 = acc.iter().sum();
    for &x in remainder {
        sum += x * x;
    }
    sum
}
/// Iterator-based squared norm (relies on auto-vectorization)
#[inline(never)]
fn norm_sq_iter(v: &[f32]) -> f32 {
    v.iter().map(|x| x * x).sum()
}
/// Unrolled dot product
///
/// `zip` pairs chunks, so the result covers `min(a.len(), b.len())`
/// elements (unlike `dot_naive`, which panics on a shorter `b`).
#[inline(never)]
fn dot_unrolled(a: &[f32], b: &[f32]) -> f32 {
    let chunks_a = a.chunks_exact(4);
    let chunks_b = b.chunks_exact(4);
    let rem_a = chunks_a.remainder();
    let rem_b = chunks_b.remainder();
    let mut acc0 = 0.0f32;
    let mut acc1 = 0.0f32;
    let mut acc2 = 0.0f32;
    let mut acc3 = 0.0f32;
    for (ca, cb) in chunks_a.zip(chunks_b) {
        acc0 += ca[0] * cb[0];
        acc1 += ca[1] * cb[1];
        acc2 += ca[2] * cb[2];
        acc3 += ca[3] * cb[3];
    }
    let mut sum = acc0 + acc1 + acc2 + acc3;
    for (&a, &b) in rem_a.iter().zip(rem_b.iter()) {
        sum += a * b;
    }
    sum
}
/// Unrolled residual norm
#[inline(never)]
fn residual_norm_unrolled(a: &[f32], b: &[f32]) -> f32 {
    let chunks_a = a.chunks_exact(4);
    let chunks_b = b.chunks_exact(4);
    let rem_a = chunks_a.remainder();
    let rem_b = chunks_b.remainder();
    let mut acc0 = 0.0f32;
    let mut acc1 = 0.0f32;
    let mut acc2 = 0.0f32;
    let mut acc3 = 0.0f32;
    for (ca, cb) in chunks_a.zip(chunks_b) {
        let d0 = ca[0] - cb[0];
        let d1 = ca[1] - cb[1];
        let d2 = ca[2] - cb[2];
        let d3 = ca[3] - cb[3];
        acc0 += d0 * d0;
        acc1 += d1 * d1;
        acc2 += d2 * d2;
        acc3 += d3 * d3;
    }
    let mut sum = acc0 + acc1 + acc2 + acc3;
    for (&a, &b) in rem_a.iter().zip(rem_b.iter()) {
        let d = a - b;
        sum += d * d;
    }
    sum
}
/// Batch residual with unrolled inner loop
#[inline(never)]
fn batch_residual_unrolled(sources: &[Vec<f32>], targets: &[Vec<f32>]) -> f32 {
    let mut total = 0.0f32;
    for (src, tgt) in sources.iter().zip(targets.iter()) {
        total += residual_norm_unrolled(src, tgt);
    }
    total
}
// ============================================================================
// EXPLICIT SIMD (when wide crate is available)
// ============================================================================
#[cfg(feature = "simd")]
mod simd_impl {
    // Explicit 8-lane SIMD via the `wide` crate; only compiled when the
    // `simd` feature is enabled. In every function the `chunks_exact(8)`
    // iterator guarantees each chunk has exactly 8 elements, so the
    // `try_from` conversions cannot fail and the `unwrap` is safe.
    use wide::f32x8;
    /// SIMD squared norm using f32x8
    #[inline(never)]
    pub fn norm_sq_simd(v: &[f32]) -> f32 {
        let chunks = v.chunks_exact(8);
        let remainder = chunks.remainder();
        let mut acc = f32x8::ZERO;
        for chunk in chunks {
            let vals = f32x8::from(<[f32; 8]>::try_from(chunk).unwrap());
            acc += vals * vals;
        }
        // Horizontal sum of the 8 lanes, then scalar tail.
        let mut sum: f32 = acc.reduce_add();
        for &x in remainder {
            sum += x * x;
        }
        sum
    }
    /// SIMD dot product using f32x8
    #[inline(never)]
    pub fn dot_simd(a: &[f32], b: &[f32]) -> f32 {
        let chunks_a = a.chunks_exact(8);
        let chunks_b = b.chunks_exact(8);
        let rem_a = chunks_a.remainder();
        let rem_b = chunks_b.remainder();
        let mut acc = f32x8::ZERO;
        for (ca, cb) in chunks_a.zip(chunks_b) {
            let va = f32x8::from(<[f32; 8]>::try_from(ca).unwrap());
            let vb = f32x8::from(<[f32; 8]>::try_from(cb).unwrap());
            acc += va * vb;
        }
        let mut sum: f32 = acc.reduce_add();
        for (&a, &b) in rem_a.iter().zip(rem_b.iter()) {
            sum += a * b;
        }
        sum
    }
    /// SIMD residual norm using f32x8
    #[inline(never)]
    pub fn residual_norm_simd(a: &[f32], b: &[f32]) -> f32 {
        let chunks_a = a.chunks_exact(8);
        let chunks_b = b.chunks_exact(8);
        let rem_a = chunks_a.remainder();
        let rem_b = chunks_b.remainder();
        let mut acc = f32x8::ZERO;
        for (ca, cb) in chunks_a.zip(chunks_b) {
            let va = f32x8::from(<[f32; 8]>::try_from(ca).unwrap());
            let vb = f32x8::from(<[f32; 8]>::try_from(cb).unwrap());
            let diff = va - vb;
            acc += diff * diff;
        }
        let mut sum: f32 = acc.reduce_add();
        for (&a, &b) in rem_a.iter().zip(rem_b.iter()) {
            let d = a - b;
            sum += d * d;
        }
        sum
    }
    /// SIMD matrix-vector multiply
    ///
    /// `matrix` is row-major; each output element is one SIMD dot product
    /// of a matrix row with `x`.
    #[inline(never)]
    pub fn matmul_simd(matrix: &[f32], x: &[f32], y: &mut [f32], rows: usize, cols: usize) {
        for i in 0..rows {
            let row_start = i * cols;
            let row = &matrix[row_start..row_start + cols];
            let chunks_m = row.chunks_exact(8);
            let chunks_x = x.chunks_exact(8);
            let rem_m = chunks_m.remainder();
            let rem_x = chunks_x.remainder();
            let mut acc = f32x8::ZERO;
            for (cm, cx) in chunks_m.zip(chunks_x) {
                let vm = f32x8::from(<[f32; 8]>::try_from(cm).unwrap());
                let vx = f32x8::from(<[f32; 8]>::try_from(cx).unwrap());
                acc += vm * vx;
            }
            let mut sum: f32 = acc.reduce_add();
            for (&m, &xv) in rem_m.iter().zip(rem_x.iter()) {
                sum += m * xv;
            }
            y[i] = sum;
        }
    }
    /// SIMD batch residual
    #[inline(never)]
    pub fn batch_residual_simd(sources: &[Vec<f32>], targets: &[Vec<f32>]) -> f32 {
        let mut total = 0.0f32;
        for (src, tgt) in sources.iter().zip(targets.iter()) {
            total += residual_norm_simd(src, tgt);
        }
        total
    }
}
// ============================================================================
// DENSE MATRIX MULTIPLY BENCHMARKS
// ============================================================================
/// Benchmark square matrix-vector multiply: naive vs unrolled vs SIMD.
fn bench_dense_matmul(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_matmul");
    // Test matrix sizes: 64x64, 128x128, 256x256
    for size in [64, 128, 256] {
        let matrix = generate_matrix(size, size, 42);
        let x = generate_vec(size, 123);
        let mut y = vec![0.0f32; size];
        // Throughput counts multiply-accumulate elements (rows * cols).
        group.throughput(Throughput::Elements((size * size) as u64));
        group.bench_with_input(BenchmarkId::new("naive", size), &size, |b, _| {
            b.iter(|| {
                matmul_naive(black_box(&matrix), black_box(&x), &mut y, size, size);
                // Observing one output element keeps the whole multiply live.
                black_box(y[0])
            })
        });
        group.bench_with_input(BenchmarkId::new("unrolled", size), &size, |b, _| {
            b.iter(|| {
                matmul_unrolled(black_box(&matrix), black_box(&x), &mut y, size, size);
                black_box(y[0])
            })
        });
        #[cfg(feature = "simd")]
        group.bench_with_input(BenchmarkId::new("simd", size), &size, |b, _| {
            b.iter(|| {
                simd_impl::matmul_simd(black_box(&matrix), black_box(&x), &mut y, size, size);
                black_box(y[0])
            })
        });
    }
    group.finish();
}
/// Benchmark non-square matrix multiply (projection)
///
/// Mirrors `bench_dense_matmul` for the rectangular (dimension-reducing)
/// shapes that restriction maps use.
fn bench_projection_matmul(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_matmul_projection");
    // Common projection sizes in coherence: 64->32, 128->64, 256->128
    for (in_dim, out_dim) in [(64, 32), (128, 64), (256, 128)] {
        let matrix = generate_matrix(out_dim, in_dim, 42);
        let x = generate_vec(in_dim, 123);
        let mut y = vec![0.0f32; out_dim];
        group.throughput(Throughput::Elements((out_dim * in_dim) as u64));
        group.bench_with_input(
            BenchmarkId::new("naive", format!("{}x{}", in_dim, out_dim)),
            &(in_dim, out_dim),
            |b, _| {
                b.iter(|| {
                    matmul_naive(black_box(&matrix), black_box(&x), &mut y, out_dim, in_dim);
                    black_box(y[0])
                })
            },
        );
        group.bench_with_input(
            BenchmarkId::new("unrolled", format!("{}x{}", in_dim, out_dim)),
            &(in_dim, out_dim),
            |b, _| {
                b.iter(|| {
                    matmul_unrolled(black_box(&matrix), black_box(&x), &mut y, out_dim, in_dim);
                    black_box(y[0])
                })
            },
        );
        #[cfg(feature = "simd")]
        group.bench_with_input(
            BenchmarkId::new("simd", format!("{}x{}", in_dim, out_dim)),
            &(in_dim, out_dim),
            |b, _| {
                b.iter(|| {
                    simd_impl::matmul_simd(
                        black_box(&matrix),
                        black_box(&x),
                        &mut y,
                        out_dim,
                        in_dim,
                    );
                    black_box(y[0])
                })
            },
        );
    }
    group.finish();
}
// ============================================================================
// NORM COMPUTATION BENCHMARKS
// ============================================================================
/// Benchmark |v|^2 across all five formulations (naive, iterator,
/// 4-wide unrolled, 8-wide unrolled, explicit f32x8 SIMD).
fn bench_norm_computation(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_norm");
    // Test dimensions aligned for SIMD
    for dim in [64, 128, 256, 512, 1024] {
        let v = generate_vec(dim, 42);
        group.throughput(Throughput::Elements(dim as u64));
        group.bench_with_input(BenchmarkId::new("naive", dim), &dim, |b, _| {
            b.iter(|| black_box(norm_sq_naive(black_box(&v))))
        });
        group.bench_with_input(BenchmarkId::new("iter", dim), &dim, |b, _| {
            b.iter(|| black_box(norm_sq_iter(black_box(&v))))
        });
        group.bench_with_input(BenchmarkId::new("unrolled_4", dim), &dim, |b, _| {
            b.iter(|| black_box(norm_sq_unrolled(black_box(&v))))
        });
        group.bench_with_input(BenchmarkId::new("unrolled_8", dim), &dim, |b, _| {
            b.iter(|| black_box(norm_sq_unrolled_8(black_box(&v))))
        });
        #[cfg(feature = "simd")]
        group.bench_with_input(BenchmarkId::new("simd_f32x8", dim), &dim, |b, _| {
            b.iter(|| black_box(simd_impl::norm_sq_simd(black_box(&v))))
        });
    }
    group.finish();
}
// ============================================================================
// DOT PRODUCT BENCHMARKS
// ============================================================================
/// Benchmark dot product: naive vs unrolled vs explicit SIMD.
fn bench_dot_product(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_dot");
    for dim in [64, 256, 1024] {
        let a = generate_vec(dim, 42);
        let b = generate_vec(dim, 123);
        group.throughput(Throughput::Elements(dim as u64));
        // Closure parameter is named `b_iter` to avoid shadowing vector `b`.
        group.bench_with_input(BenchmarkId::new("naive", dim), &dim, |b_iter, _| {
            b_iter.iter(|| black_box(dot_naive(black_box(&a), black_box(&b))))
        });
        group.bench_with_input(BenchmarkId::new("unrolled", dim), &dim, |b_iter, _| {
            b_iter.iter(|| black_box(dot_unrolled(black_box(&a), black_box(&b))))
        });
        #[cfg(feature = "simd")]
        group.bench_with_input(BenchmarkId::new("simd", dim), &dim, |b_iter, _| {
            b_iter.iter(|| black_box(simd_impl::dot_simd(black_box(&a), black_box(&b))))
        });
    }
    group.finish();
}
// ============================================================================
// RESIDUAL NORM BENCHMARKS (CORE COHERENCE OPERATION)
// ============================================================================
/// Benchmark |a - b|^2, the core per-edge coherence operation.
fn bench_residual_norm(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_residual_norm");
    for dim in [64, 256, 1024] {
        let a = generate_vec(dim, 42);
        let b = generate_vec(dim, 123);
        group.throughput(Throughput::Elements(dim as u64));
        group.bench_with_input(BenchmarkId::new("naive", dim), &dim, |b_iter, _| {
            b_iter.iter(|| black_box(residual_norm_naive(black_box(&a), black_box(&b))))
        });
        group.bench_with_input(BenchmarkId::new("unrolled", dim), &dim, |b_iter, _| {
            b_iter.iter(|| black_box(residual_norm_unrolled(black_box(&a), black_box(&b))))
        });
        #[cfg(feature = "simd")]
        group.bench_with_input(BenchmarkId::new("simd", dim), &dim, |b_iter, _| {
            b_iter.iter(|| black_box(simd_impl::residual_norm_simd(black_box(&a), black_box(&b))))
        });
    }
    group.finish();
}
// ============================================================================
// BATCH RESIDUAL BENCHMARKS
// ============================================================================
/// Benchmark summed residual energy over whole batches of vector pairs,
/// the shape of a full-graph energy pass (ADR-014: < 10ms for 10K nodes).
fn bench_batch_residual(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_batch_residual");
    let dim = 64;
    for batch_size in [100, 1000, 10000] {
        // Seed offset keeps source/target pairs distinct.
        let sources: Vec<Vec<f32>> = (0..batch_size)
            .map(|i| generate_vec(dim, i as u64))
            .collect();
        let targets: Vec<Vec<f32>> = (0..batch_size)
            .map(|i| generate_vec(dim, i as u64 + 10000))
            .collect();
        group.throughput(Throughput::Elements(batch_size as u64));
        group.bench_with_input(
            BenchmarkId::new("naive", batch_size),
            &batch_size,
            |b, _| {
                b.iter(|| {
                    black_box(batch_residual_naive(
                        black_box(&sources),
                        black_box(&targets),
                    ))
                })
            },
        );
        group.bench_with_input(
            BenchmarkId::new("unrolled", batch_size),
            &batch_size,
            |b, _| {
                b.iter(|| {
                    black_box(batch_residual_unrolled(
                        black_box(&sources),
                        black_box(&targets),
                    ))
                })
            },
        );
        #[cfg(feature = "simd")]
        group.bench_with_input(BenchmarkId::new("simd", batch_size), &batch_size, |b, _| {
            b.iter(|| {
                black_box(simd_impl::batch_residual_simd(
                    black_box(&sources),
                    black_box(&targets),
                ))
            })
        });
    }
    group.finish();
}
// ============================================================================
// MEMORY ALIGNMENT BENCHMARKS
// ============================================================================
/// Benchmark how vector length (SIMD-friendly vs remainder-heavy vs tiny)
/// affects the 8-wide unrolled norm.
fn bench_alignment_impact(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_alignment");
    let dim = 256;
    // Aligned (multiple of 8): no scalar remainder loop runs.
    {
        let v = generate_vec(dim, 42);
        group.bench_function("aligned_256", |b| {
            b.iter(|| black_box(norm_sq_unrolled_8(black_box(&v))))
        });
    }
    // Misaligned (not multiple of 8): 3 elements go through the remainder.
    {
        let v = generate_vec(dim + 3, 42);
        group.bench_function("misaligned_259", |b| {
            b.iter(|| black_box(norm_sq_unrolled_8(black_box(&v))))
        });
    }
    // Small vector (below SIMD threshold): entirely remainder-handled.
    {
        let v = generate_vec(7, 42);
        group.bench_function("small_7", |b| {
            b.iter(|| black_box(norm_sq_unrolled_8(black_box(&v))))
        });
    }
    group.finish();
}
// ============================================================================
// THROUGHPUT SCALING BENCHMARKS
// ============================================================================
/// Benchmark how residual-norm throughput scales with vector size,
/// reported in bytes/s so cache-level transitions are visible.
fn bench_throughput_scaling(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_throughput_scaling");
    // Test how throughput scales with vector size
    let sizes = [16, 32, 64, 128, 256, 512, 1024, 2048, 4096];
    for &size in &sizes {
        let a = generate_vec(size, 42);
        let b = generate_vec(size, 123);
        // Bytes read per call: `size` f32 elements (4 bytes) from each of
        // the 2 input vectors.
        group.throughput(Throughput::Bytes((size * 4 * 2) as u64));
        group.bench_with_input(
            BenchmarkId::new("residual_unrolled", size),
            &size,
            |bench, _| {
                bench.iter(|| black_box(residual_norm_unrolled(black_box(&a), black_box(&b))))
            },
        );
        #[cfg(feature = "simd")]
        group.bench_with_input(
            BenchmarkId::new("residual_simd", size),
            &size,
            |bench, _| {
                bench
                    .iter(|| black_box(simd_impl::residual_norm_simd(black_box(&a), black_box(&b))))
            },
        );
    }
    group.finish();
}
// ============================================================================
// COHERENCE-SPECIFIC SIMD PATTERNS
// ============================================================================
/// Fused multiply-add pattern for coherence energy
///
/// Compares a plain multiply-then-add loop against an explicitly
/// `mul_add`-based, 4-accumulator formulation that the backend can lower
/// to hardware FMA instructions.
fn bench_fma_pattern(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_fma_pattern");
    let dim = 256;
    let a = generate_vec(dim, 42);
    let b = generate_vec(dim, 123);
    let weight = 1.5f32;
    // Without FMA (separate multiply and add)
    group.bench_function("separate_ops", |bench| {
        bench.iter(|| {
            let mut sum = 0.0f32;
            for i in 0..dim {
                let diff = a[i] - b[i];
                let sq = diff * diff;
                sum += sq;
            }
            black_box(weight * sum)
        })
    });
    // With potential FMA (compiler may optimize)
    group.bench_function("fma_friendly", |bench| {
        bench.iter(|| {
            let mut acc0 = 0.0f32;
            let mut acc1 = 0.0f32;
            let mut acc2 = 0.0f32;
            let mut acc3 = 0.0f32;
            // dim = 256 is a multiple of 4, so no remainder loop is needed.
            let chunks = dim / 4;
            for c in 0..chunks {
                let base = c * 4;
                let d0 = a[base] - b[base];
                let d1 = a[base + 1] - b[base + 1];
                let d2 = a[base + 2] - b[base + 2];
                let d3 = a[base + 3] - b[base + 3];
                // These can become FMA operations
                acc0 = d0.mul_add(d0, acc0);
                acc1 = d1.mul_add(d1, acc1);
                acc2 = d2.mul_add(d2, acc2);
                acc3 = d3.mul_add(d3, acc3);
            }
            black_box(weight * (acc0 + acc1 + acc2 + acc3))
        })
    });
    group.finish();
}
// ============================================================================
// CRITERION CONFIGURATION
// ============================================================================
// Benchmark registration, grouped by operation family so suites can be run
// selectively (e.g. `cargo bench simd_matmul`).
criterion_group!(matmul_benches, bench_dense_matmul, bench_projection_matmul,);
criterion_group!(
    vector_ops_benches,
    bench_norm_computation,
    bench_dot_product,
    bench_residual_norm,
);
criterion_group!(batch_benches, bench_batch_residual,);
criterion_group!(
    optimization_benches,
    bench_alignment_impact,
    bench_throughput_scaling,
    bench_fma_pattern,
);
criterion_main!(
    matmul_benches,
    vector_ops_benches,
    batch_benches,
    optimization_benches
);

View File

@@ -0,0 +1,549 @@
//! Benchmarks for SONA Micro-LoRA instant adaptation
//!
//! ADR-014 Performance Target: < 0.05ms (50us) for instant adaptation
//!
//! SONA provides self-optimizing threshold tuning with:
//! - Micro-LoRA: Ultra-low rank (1-2) for instant learning
//! - Base-LoRA: Standard LoRA for background learning
//! - EWC++: Elastic Weight Consolidation to prevent forgetting
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
// ============================================================================
// SONA Types (Simulated for benchmarking)
// ============================================================================
/// Micro-LoRA layer (rank 1-2 for instant adaptation)
///
/// Implements `y = x + scale * B @ A @ x`, where A is `dim x rank` and B is
/// `rank x dim`, both stored as flat row-major `Vec<f32>`s (A is indexed
/// `a[i * rank + r]`, B is indexed `b[r * dim + i]`).
pub struct MicroLoRA {
    /// Low-rank factor A (dim x rank)
    pub a: Vec<f32>,
    /// Low-rank factor B (rank x dim)
    pub b: Vec<f32>,
    /// Scaling factor
    pub scale: f32,
    /// Input dimension
    pub dim: usize,
    /// Rank (typically 1-2)
    pub rank: usize,
}
impl MicroLoRA {
    /// Build a rank-`rank` adapter for `dim`-dimensional inputs.
    ///
    /// Factors are filled with small deterministic sin/cos values so the
    /// benchmarks are reproducible without an RNG dependency.
    pub fn new(dim: usize, rank: usize) -> Self {
        // Initialize with small random values
        let a: Vec<f32> = (0..dim * rank)
            .map(|i| ((i as f32 * 0.1234).sin() * 0.01))
            .collect();
        let b: Vec<f32> = (0..rank * dim)
            .map(|i| ((i as f32 * 0.5678).cos() * 0.01))
            .collect();
        Self {
            a,
            b,
            scale: 0.1,
            dim,
            rank,
        }
    }
    /// Apply micro-LoRA transform: y = x + scale * B @ A @ x
    ///
    /// Allocates the rank-sized hidden buffer per call; see
    /// `apply_zero_alloc` for the allocation-free variant used in the
    /// zero-alloc benchmarks.
    #[inline]
    pub fn apply(&self, input: &[f32], output: &mut [f32]) {
        debug_assert_eq!(input.len(), self.dim);
        debug_assert_eq!(output.len(), self.dim);
        // Copy input to output first (identity component)
        output.copy_from_slice(input);
        // Compute A @ x -> hidden (rank-dimensional)
        let mut hidden = vec![0.0f32; self.rank];
        for r in 0..self.rank {
            for i in 0..self.dim {
                hidden[r] += self.a[i * self.rank + r] * input[i];
            }
        }
        // Compute B @ hidden and add to output
        for i in 0..self.dim {
            let mut delta = 0.0f32;
            for r in 0..self.rank {
                delta += self.b[r * self.dim + i] * hidden[r];
            }
            output[i] += self.scale * delta;
        }
    }
    /// Apply with pre-allocated hidden buffer (zero allocation)
    ///
    /// Same arithmetic as `apply`, but the caller owns the rank-sized
    /// scratch buffer, which is cleared before reuse.
    #[inline]
    pub fn apply_zero_alloc(&self, input: &[f32], hidden: &mut [f32], output: &mut [f32]) {
        debug_assert_eq!(hidden.len(), self.rank);
        // Copy input
        output.copy_from_slice(input);
        // A @ x
        hidden.fill(0.0);
        for r in 0..self.rank {
            for i in 0..self.dim {
                hidden[r] += self.a[i * self.rank + r] * input[i];
            }
        }
        // B @ hidden
        for i in 0..self.dim {
            let mut delta = 0.0f32;
            for r in 0..self.rank {
                delta += self.b[r * self.dim + i] * hidden[r];
            }
            output[i] += self.scale * delta;
        }
    }
    /// Update weights from gradient (instant learning)
    ///
    /// Plain SGD step `w -= learning_rate * grad` on both factors.
    /// Panics if either gradient slice is shorter than its factor.
    #[inline]
    pub fn update(&mut self, grad_a: &[f32], grad_b: &[f32], learning_rate: f32) {
        for i in 0..self.a.len() {
            self.a[i] -= learning_rate * grad_a[i];
        }
        for i in 0..self.b.len() {
            self.b[i] -= learning_rate * grad_b[i];
        }
    }
}
/// Base-LoRA layer (higher rank for background learning)
///
/// Same factor layout and transform as [`MicroLoRA`]; used with a higher
/// rank (8 in the benchmarks) and a smaller scale (0.05).
pub struct BaseLoRA {
    /// Low-rank factor A (dim x rank), flat row-major
    pub a: Vec<f32>,
    /// Low-rank factor B (rank x dim), flat row-major
    pub b: Vec<f32>,
    /// Scaling factor applied to the low-rank delta
    pub scale: f32,
    /// Input/output dimension
    pub dim: usize,
    /// LoRA rank
    pub rank: usize,
}
impl BaseLoRA {
    /// Build a rank-`rank` adapter with deterministic small init values
    /// (different sin/cos constants than `MicroLoRA::new`).
    pub fn new(dim: usize, rank: usize) -> Self {
        let a: Vec<f32> = (0..dim * rank)
            .map(|i| ((i as f32 * 0.3456).sin() * 0.01))
            .collect();
        let b: Vec<f32> = (0..rank * dim)
            .map(|i| ((i as f32 * 0.7890).cos() * 0.01))
            .collect();
        Self {
            a,
            b,
            scale: 0.05,
            dim,
            rank,
        }
    }
    /// Apply `y = x + scale * B @ A @ x`, allocating the rank-sized hidden
    /// buffer per call (this is the background path).
    #[inline]
    pub fn apply(&self, input: &[f32], output: &mut [f32]) {
        output.copy_from_slice(input);
        let mut hidden = vec![0.0f32; self.rank];
        for r in 0..self.rank {
            for i in 0..self.dim {
                hidden[r] += self.a[i * self.rank + r] * input[i];
            }
        }
        for i in 0..self.dim {
            let mut delta = 0.0f32;
            for r in 0..self.rank {
                delta += self.b[r * self.dim + i] * hidden[r];
            }
            output[i] += self.scale * delta;
        }
    }
}
/// EWC++ weight importance tracker.
///
/// Keeps a diagonal Fisher-information estimate plus the weights that were
/// optimal for previous tasks, and scores new weights by how far they
/// drift from those anchors.
pub struct EwcPlusPlus {
    /// Fisher information diagonal
    pub fisher: Vec<f32>,
    /// Optimal weights from previous tasks
    pub optimal_weights: Vec<f32>,
    /// Regularization strength
    pub lambda: f32,
}

impl EwcPlusPlus {
    /// Create a tracker for `param_count` parameters with uniform (1.0)
    /// initial Fisher values and zeroed optimal weights.
    pub fn new(param_count: usize, lambda: f32) -> Self {
        EwcPlusPlus {
            fisher: vec![1.0; param_count],
            optimal_weights: vec![0.0; param_count],
            lambda,
        }
    }
    /// Compute the EWC penalty `lambda/2 * sum_i F_i (w_i - w*_i)^2`
    /// over the first `min(|weights|, |fisher|)` parameters.
    #[inline]
    pub fn penalty(&self, weights: &[f32]) -> f32 {
        let n = weights.len().min(self.fisher.len());
        let mut quad = 0.0f32;
        for idx in 0..n {
            let drift = weights[idx] - self.optimal_weights[idx];
            quad += self.fisher[idx] * drift * drift;
        }
        self.lambda * 0.5 * quad
    }
    /// Fold a fresh Fisher estimate into the running average and anchor
    /// the current weights as the new optimum.
    pub fn consolidate(&mut self, weights: &[f32], new_fisher: &[f32]) {
        let n = self.fisher.len().min(new_fisher.len());
        for idx in 0..n {
            // Exponential moving average preserves old importance info.
            self.fisher[idx] = 0.9 * self.fisher[idx] + 0.1 * new_fisher[idx];
            self.optimal_weights[idx] = weights[idx];
        }
    }
}
/// One recorded step of a learning trajectory.
#[derive(Clone)]
pub struct TrajectoryStep {
    pub state: Vec<f32>,
    pub action_embedding: Vec<f32>,
    pub reward: f32,
}

/// Accumulates trajectory steps starting from an initial state.
pub struct TrajectoryBuilder {
    pub initial_state: Vec<f32>,
    pub steps: Vec<TrajectoryStep>,
}

impl TrajectoryBuilder {
    /// Start an empty trajectory anchored at `initial_state`.
    pub fn new(initial_state: Vec<f32>) -> Self {
        TrajectoryBuilder {
            initial_state,
            steps: Vec::new(),
        }
    }
    /// Append one `(state, action, reward)` observation.
    pub fn add_step(&mut self, state: Vec<f32>, action: Vec<f32>, reward: f32) {
        let step = TrajectoryStep {
            state,
            action_embedding: action,
            reward,
        };
        self.steps.push(step);
    }
}
/// SONA engine (simplified for benchmarking)
///
/// Bundles the instant (micro) and background (base) LoRA adapters with
/// the EWC++ importance tracker over a shared embedding dimension `dim`.
pub struct SonaEngine {
    pub micro_lora: MicroLoRA,
    pub base_lora: BaseLoRA,
    pub ewc: EwcPlusPlus,
    pub dim: usize,
}
impl SonaEngine {
    /// Build an engine with a rank-2 micro-LoRA (instant path), a rank-8
    /// base-LoRA (background path), and an EWC++ tracker sized to cover
    /// the A and B factors of both adapters.
    pub fn new(dim: usize) -> Self {
        let micro_rank = 2;
        let base_rank = 8;
        // Each adapter holds two dim*rank factors (A and B), hence the * 2.
        let param_count = dim * micro_rank * 2 + dim * base_rank * 2;
        Self {
            micro_lora: MicroLoRA::new(dim, micro_rank),
            base_lora: BaseLoRA::new(dim, base_rank),
            ewc: EwcPlusPlus::new(param_count, 0.4),
            dim,
        }
    }
    /// Begin trajectory
    pub fn begin_trajectory(&self, initial_state: Vec<f32>) -> TrajectoryBuilder {
        TrajectoryBuilder::new(initial_state)
    }
    /// End trajectory and trigger learning.
    ///
    /// In this simplified benchmark model only `final_reward` drives the
    /// pseudo-gradient; the trajectory contents are consumed but not yet
    /// used, so the parameter is underscore-prefixed to document that
    /// (this also silences the unused-variable warning).
    pub fn end_trajectory(&mut self, _builder: TrajectoryBuilder, final_reward: f32) {
        // Simplified learning: update micro-LoRA based on reward.
        // Negative rewards clamp to zero, i.e. produce no update.
        let lr = 0.001 * final_reward.max(0.0);
        // Pseudo-gradient (simplified): proportional to current weights.
        let grad_a: Vec<f32> = self.micro_lora.a.iter().map(|w| w * lr).collect();
        let grad_b: Vec<f32> = self.micro_lora.b.iter().map(|w| w * lr).collect();
        self.micro_lora.update(&grad_a, &grad_b, lr);
    }
    /// Apply micro-LoRA (instant path; ADR-014 target < 50us).
    #[inline]
    pub fn apply_micro(&self, input: &[f32], output: &mut [f32]) {
        self.micro_lora.apply(input, output);
    }
    /// Apply base-LoRA (background path).
    pub fn apply_base(&self, input: &[f32], output: &mut [f32]) {
        self.base_lora.apply(input, output);
    }
    /// Apply both LoRAs combined: micro first, then base on its output.
    pub fn apply_combined(&self, input: &[f32], output: &mut [f32]) {
        // Apply micro first
        let mut intermediate = vec![0.0f32; self.dim];
        self.micro_lora.apply(input, &mut intermediate);
        // Then base
        self.base_lora.apply(&intermediate, output);
    }
}
// ============================================================================
// Benchmarks
// ============================================================================
/// Deterministically synthesize a `dim`-length state vector from `seed`
/// (sinusoid over a seed/index mix; no RNG, so benchmarks are reproducible).
fn generate_state(dim: usize, seed: u64) -> Vec<f32> {
    let mut state = Vec::with_capacity(dim);
    for i in 0..dim {
        state.push((seed as f32 * 0.123 + i as f32 * 0.456).sin());
    }
    state
}
/// Benchmark Micro-LoRA application (target: <50us)
///
/// Sweeps embedding dimension at fixed rank 2, then rank at fixed dim 256.
/// Input/output buffers are preallocated outside `b.iter` so only `apply`
/// itself is measured.
fn bench_micro_lora_apply(c: &mut Criterion) {
    let mut group = c.benchmark_group("sona_micro_lora_apply");
    group.throughput(Throughput::Elements(1));
    for dim in [64, 128, 256, 512] {
        let lora = MicroLoRA::new(dim, 2); // Rank 2
        let input = generate_state(dim, 42);
        let mut output = vec![0.0f32; dim];
        group.bench_with_input(BenchmarkId::new("dim", dim), &dim, |b, _| {
            b.iter(|| lora.apply(black_box(&input), black_box(&mut output)))
        });
    }
    // Different ranks
    let dim = 256;
    for rank in [1, 2, 4] {
        let lora = MicroLoRA::new(dim, rank);
        let input = generate_state(dim, 42);
        let mut output = vec![0.0f32; dim];
        group.bench_with_input(BenchmarkId::new("rank", rank), &rank, |b, _| {
            b.iter(|| lora.apply(black_box(&input), black_box(&mut output)))
        });
    }
    group.finish();
}
/// Benchmark zero-allocation Micro-LoRA
///
/// Exercises `apply_zero_alloc`, which takes a caller-owned rank-sized
/// scratch buffer (`hidden`) so the hot path performs no heap allocation.
fn bench_micro_lora_zero_alloc(c: &mut Criterion) {
    let mut group = c.benchmark_group("sona_micro_lora_zero_alloc");
    group.throughput(Throughput::Elements(1));
    for dim in [64, 128, 256, 512] {
        let lora = MicroLoRA::new(dim, 2);
        let input = generate_state(dim, 42);
        // Scratch sized to the LoRA rank (2), reused across iterations.
        let mut hidden = vec![0.0f32; 2];
        let mut output = vec![0.0f32; dim];
        group.bench_with_input(BenchmarkId::new("dim", dim), &dim, |b, _| {
            b.iter(|| {
                lora.apply_zero_alloc(
                    black_box(&input),
                    black_box(&mut hidden),
                    black_box(&mut output),
                )
            })
        });
    }
    group.finish();
}
/// Benchmark Base-LoRA application
///
/// Sweeps dimension at fixed rank 8, then rank at fixed dim 256; buffers
/// are preallocated so only `apply` is timed.
fn bench_base_lora_apply(c: &mut Criterion) {
    let mut group = c.benchmark_group("sona_base_lora_apply");
    group.throughput(Throughput::Elements(1));
    for dim in [64, 128, 256, 512] {
        let lora = BaseLoRA::new(dim, 8); // Rank 8
        let input = generate_state(dim, 42);
        let mut output = vec![0.0f32; dim];
        group.bench_with_input(BenchmarkId::new("dim", dim), &dim, |b, _| {
            b.iter(|| lora.apply(black_box(&input), black_box(&mut output)))
        });
    }
    // Different ranks
    let dim = 256;
    for rank in [4, 8, 16, 32] {
        let lora = BaseLoRA::new(dim, rank);
        let input = generate_state(dim, 42);
        let mut output = vec![0.0f32; dim];
        group.bench_with_input(BenchmarkId::new("rank", rank), &rank, |b, _| {
            b.iter(|| lora.apply(black_box(&input), black_box(&mut output)))
        });
    }
    group.finish();
}
/// Benchmark EWC++ penalty computation
///
/// Measures the read-only penalty over synthetic weight vectors at three
/// parameter-count scales.
fn bench_ewc_penalty(c: &mut Criterion) {
    let mut group = c.benchmark_group("sona_ewc_penalty");
    group.throughput(Throughput::Elements(1));
    for param_count in [1000, 10000, 100000] {
        let ewc = EwcPlusPlus::new(param_count, 0.4);
        let weights: Vec<f32> = (0..param_count).map(|i| (i as f32 * 0.001).sin()).collect();
        group.bench_with_input(
            BenchmarkId::new("params", param_count),
            &param_count,
            |b, _| b.iter(|| black_box(ewc.penalty(black_box(&weights)))),
        );
    }
    group.finish();
}
/// Benchmark EWC++ consolidation
///
/// Measures the mutating consolidation step (folding a new Fisher estimate
/// into the stored importance weights) at three parameter-count scales.
/// Note the same `ewc` instance is consolidated repeatedly across iterations.
fn bench_ewc_consolidate(c: &mut Criterion) {
    let mut group = c.benchmark_group("sona_ewc_consolidate");
    for param_count in [1000, 10000, 100000] {
        let mut ewc = EwcPlusPlus::new(param_count, 0.4);
        let weights: Vec<f32> = (0..param_count).map(|i| (i as f32 * 0.001).sin()).collect();
        let new_fisher: Vec<f32> = (0..param_count)
            .map(|i| (i as f32 * 0.002).cos().abs())
            .collect();
        group.bench_with_input(
            BenchmarkId::new("params", param_count),
            &param_count,
            |b, _| b.iter(|| ewc.consolidate(black_box(&weights), black_box(&new_fisher))),
        );
    }
    group.finish();
}
/// Benchmark full trajectory learning cycle
///
/// Each iteration measures begin-trajectory + step recording + the
/// end-of-trajectory micro-LoRA update, including the `generate_state`
/// calls inside the closure.
fn bench_trajectory_learning(c: &mut Criterion) {
    let mut group = c.benchmark_group("sona_trajectory_learning");
    let dim = 256;
    let mut engine = SonaEngine::new(dim);
    // Single step trajectory
    group.bench_function("single_step_trajectory", |b| {
        b.iter(|| {
            let mut builder = engine.begin_trajectory(generate_state(dim, 42));
            builder.add_step(generate_state(dim, 43), vec![], 0.8);
            engine.end_trajectory(builder, black_box(0.85));
        })
    });
    // Multi-step trajectory
    group.bench_function("10_step_trajectory", |b| {
        b.iter(|| {
            let mut builder = engine.begin_trajectory(generate_state(dim, 42));
            for i in 0..10 {
                builder.add_step(generate_state(dim, 43 + i), vec![], 0.5 + (i as f32) * 0.05);
            }
            engine.end_trajectory(builder, black_box(0.9));
        })
    });
    group.finish();
}
/// Benchmark combined LoRA application
///
/// Compares micro-only, base-only, and the sequential micro-then-base
/// composition at several dimensions. `apply_combined` allocates an
/// intermediate buffer per call, so that allocation is part of the
/// measurement by design.
fn bench_combined_lora(c: &mut Criterion) {
    let mut group = c.benchmark_group("sona_combined_lora");
    for dim in [64, 128, 256, 512] {
        let engine = SonaEngine::new(dim);
        let input = generate_state(dim, 42);
        let mut output = vec![0.0f32; dim];
        // Micro only
        group.bench_with_input(BenchmarkId::new("micro_only", dim), &dim, |b, _| {
            b.iter(|| engine.apply_micro(black_box(&input), black_box(&mut output)))
        });
        // Base only
        group.bench_with_input(BenchmarkId::new("base_only", dim), &dim, |b, _| {
            b.iter(|| engine.apply_base(black_box(&input), black_box(&mut output)))
        });
        // Combined
        group.bench_with_input(BenchmarkId::new("combined", dim), &dim, |b, _| {
            b.iter(|| engine.apply_combined(black_box(&input), black_box(&mut output)))
        });
    }
    group.finish();
}
/// Benchmark batch inference
///
/// Applies the micro-LoRA across whole batches of preallocated inputs and
/// reused output buffers; throughput is reported per element so batch sizes
/// are directly comparable.
fn bench_batch_inference(c: &mut Criterion) {
    let mut group = c.benchmark_group("sona_batch_inference");
    let dim = 256;
    let engine = SonaEngine::new(dim);
    for batch_size in [1, 10, 100, 1000] {
        let inputs: Vec<Vec<f32>> = (0..batch_size)
            .map(|i| generate_state(dim, i as u64))
            .collect();
        let mut outputs: Vec<Vec<f32>> = (0..batch_size).map(|_| vec![0.0f32; dim]).collect();
        group.throughput(Throughput::Elements(batch_size as u64));
        group.bench_with_input(
            BenchmarkId::new("batch", batch_size),
            &batch_size,
            |b, _| {
                b.iter(|| {
                    for (input, output) in inputs.iter().zip(outputs.iter_mut()) {
                        engine.apply_micro(input, output);
                    }
                    black_box(outputs.len())
                })
            },
        );
    }
    group.finish();
}
/// Benchmark weight update (instant learning)
///
/// Times a single micro-LoRA gradient update with precomputed gradients
/// shaped for the rank-2 A (dim x 2) and B (2 x dim) factors. The same
/// adapter is updated repeatedly across iterations.
fn bench_weight_update(c: &mut Criterion) {
    let mut group = c.benchmark_group("sona_weight_update");
    for dim in [64, 128, 256, 512] {
        let mut lora = MicroLoRA::new(dim, 2);
        let grad_a: Vec<f32> = (0..dim * 2).map(|i| (i as f32 * 0.001).sin()).collect();
        let grad_b: Vec<f32> = (0..2 * dim).map(|i| (i as f32 * 0.002).cos()).collect();
        group.bench_with_input(BenchmarkId::new("dim", dim), &dim, |b, _| {
            b.iter(|| {
                lora.update(black_box(&grad_a), black_box(&grad_b), black_box(0.001));
            })
        });
    }
    group.finish();
}
// Register all SONA benchmarks with criterion and generate the harness `main`.
criterion_group!(
    benches,
    bench_micro_lora_apply,
    bench_micro_lora_zero_alloc,
    bench_base_lora_apply,
    bench_ewc_penalty,
    bench_ewc_consolidate,
    bench_trajectory_learning,
    bench_combined_lora,
    bench_batch_inference,
    bench_weight_update,
);
criterion_main!(benches);

View File

@@ -0,0 +1,663 @@
//! Benchmarks for 256-tile parallel tick
//!
//! ADR-014 Performance Target: < 1ms for 256-tile parallel tick
//!
//! The cognitum-gate-kernel provides 256 WASM tiles, each maintaining
//! a local graph shard with E-value accumulation and witness fragments.
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
// ============================================================================
// Tile Types (Simulated, matching cognitum-gate-kernel structure)
// ============================================================================
/// Maximum delta buffer per tile; `ingest_delta` rejects deltas beyond this
pub const MAX_DELTA_BUFFER: usize = 64;
/// Number of tiles in fabric
pub const NUM_TILES: usize = 256;
/// Maximum vertices per shard (vertex ids must stay below this bound)
pub const MAX_SHARD_VERTICES: usize = 256;
/// Maximum edges per shard; `add_edge` fails once this is reached
pub const MAX_SHARD_EDGES: usize = 1024;
/// Delta operation type
#[derive(Clone, Copy)]
pub enum DeltaType {
    /// Insert an edge (source, target, weight).
    EdgeAdd,
    /// Remove an edge (currently ignored by `TileState::tick`).
    EdgeRemove,
    /// Evidence observation for a vertex; payload carries positive/negative.
    Observation,
    /// Edge weight change (currently ignored by `TileState::tick`).
    WeightUpdate,
}
/// Delta (change event) for tile
#[derive(Clone, Copy)]
pub struct Delta {
    // Which operation this delta encodes.
    pub delta_type: DeltaType,
    // Source vertex (or observed vertex for `Observation`).
    pub source: u16,
    // Target vertex; unused (0) for observations.
    pub target: u16,
    // Edge weight; unused (0) for observations.
    pub weight: u16,
    // Operation-specific payload; observations pack a bool as 0/1 here.
    pub payload: u32,
}
impl Delta {
    /// Build an edge-insertion delta from `src` to `tgt` with `weight`.
    pub fn edge_add(src: u16, tgt: u16, weight: u16) -> Self {
        Self {
            delta_type: DeltaType::EdgeAdd,
            source: src,
            target: tgt,
            weight,
            payload: 0,
        }
    }

    /// Build an observation delta for `vertex`; `positive` is packed into
    /// `payload` as 0/1.
    pub fn observation(vertex: u16, positive: bool) -> Self {
        Self {
            delta_type: DeltaType::Observation,
            source: vertex,
            target: 0,
            weight: 0,
            payload: u32::from(positive),
        }
    }
}
/// Compact vertex state
#[derive(Clone, Copy, Default)]
pub struct VertexState {
    // Number of incident edge endpoints recorded for this vertex.
    pub degree: u8,
    // Connected-component label assigned by `recompute_components`.
    pub component_id: u8,
    // True once the vertex participates in at least one edge.
    pub active: bool,
    // Per-vertex energy term (not updated by the simulated graph here).
    pub energy_contrib: f32,
}
impl VertexState {
    /// Whether this vertex has been touched by any edge.
    pub fn is_active(&self) -> bool {
        self.active
    }
}
/// Compact edge
#[derive(Clone, Copy, Default)]
pub struct CompactEdge {
    // Endpoint vertex ids within the shard.
    pub source: u16,
    pub target: u16,
    // Edge weight; energy treats this as weight / 100.
    pub weight: u16,
    // False for default-initialized / logically removed slots.
    pub active: bool,
}
impl CompactEdge {
    /// Whether this edge slot holds a live edge.
    pub fn is_active(&self) -> bool {
        self.active
    }
}
/// Compact graph for single tile
pub struct CompactGraph {
    // Fixed vertex table; index == vertex id.
    pub vertices: [VertexState; MAX_SHARD_VERTICES],
    // Fixed edge table; only the first `edge_count` slots are meaningful.
    pub edges: [CompactEdge; MAX_SHARD_EDGES],
    // Number of edge slots in use.
    pub edge_count: usize,
    // Number of active (edge-touched) vertices.
    pub vertex_count: usize,
    // Connected-component count from the last `recompute_components`.
    pub component_count: u8,
}
impl CompactGraph {
    /// Create an empty shard graph with every vertex inactive and no edges.
    pub fn new() -> Self {
        Self {
            vertices: [VertexState::default(); MAX_SHARD_VERTICES],
            edges: [CompactEdge::default(); MAX_SHARD_EDGES],
            edge_count: 0,
            vertex_count: 0,
            component_count: 0,
        }
    }

    /// Insert an edge, activating its endpoints. Returns `false` (graph
    /// untouched) when the edge table is full or either endpoint id is
    /// outside the shard's vertex range — the previous version indexed
    /// unchecked and could panic for ids >= MAX_SHARD_VERTICES. Degrees
    /// saturate instead of overflowing their `u8`.
    pub fn add_edge(&mut self, src: u16, tgt: u16, weight: u16) -> bool {
        if self.edge_count >= MAX_SHARD_EDGES
            || src as usize >= MAX_SHARD_VERTICES
            || tgt as usize >= MAX_SHARD_VERTICES
        {
            return false;
        }
        // Activate endpoints, maintaining `vertex_count` (previously declared
        // but never updated). A self-loop still adds 2 to the degree, matching
        // the old per-endpoint increments.
        for &v in &[src, tgt] {
            let vertex = &mut self.vertices[v as usize];
            if !vertex.active {
                vertex.active = true;
                self.vertex_count += 1;
            }
            vertex.degree = vertex.degree.saturating_add(1);
        }
        // Append the edge record.
        self.edges[self.edge_count] = CompactEdge {
            source: src,
            target: tgt,
            weight,
            active: true,
        };
        self.edge_count += 1;
        true
    }

    /// Recompute `component_count` over the active vertices.
    ///
    /// Uses a correct union-find with path compression. The previous
    /// single-pass `parent[s] = parent[t]` assignment never merged
    /// transitive chains (edges 0-1, 1-2 reported two components), which
    /// also corrupted the `connected` flag tiles derive from this count.
    pub fn recompute_components(&mut self) {
        // Iterative find with path halving; parent fits in u16 since
        // MAX_SHARD_VERTICES <= u16::MAX.
        fn find(parent: &mut [u16; MAX_SHARD_VERTICES], mut x: usize) -> usize {
            while parent[x] as usize != x {
                let grandparent = parent[parent[x] as usize];
                parent[x] = grandparent;
                x = grandparent as usize;
            }
            x
        }
        let mut parent: [u16; MAX_SHARD_VERTICES] = [0; MAX_SHARD_VERTICES];
        for (i, p) in parent.iter_mut().enumerate() {
            *p = i as u16;
        }
        // Union the endpoints of every active edge.
        for edge in &self.edges[..self.edge_count] {
            if edge.active {
                let root_s = find(&mut parent, edge.source as usize);
                let root_t = find(&mut parent, edge.target as usize);
                if root_s != root_t {
                    parent[root_s] = root_t as u16;
                }
            }
        }
        // Count distinct roots among active vertices; saturate rather than
        // overflow the u8 if all 256 vertices were isolated components.
        let mut seen = [false; MAX_SHARD_VERTICES];
        let mut count = 0u8;
        for i in 0..MAX_SHARD_VERTICES {
            if self.vertices[i].active {
                let root = find(&mut parent, i);
                if !seen[root] {
                    seen[root] = true;
                    count = count.saturating_add(1);
                }
            }
        }
        self.component_count = count;
    }

    /// Sum active-edge weights as a coarse energy proxy (weight / 100).
    pub fn compute_total_energy(&self) -> f32 {
        self.edges[..self.edge_count]
            .iter()
            .filter(|e| e.active)
            .map(|e| e.weight as f32 / 100.0)
            .sum()
    }
}
/// E-value accumulator (log-space evidence)
pub struct EvidenceAccumulator {
    /// Log e-value (fixed-point: value / 65536 = log2(e-value))
    pub log_e_values: Vec<i32>,
    // Number of hypothesis slots currently in use (prefix of log_e_values).
    pub hypothesis_count: usize,
}
impl EvidenceAccumulator {
    /// Create an accumulator with room for `capacity` hypotheses, all
    /// starting at log e-value 0 (e-value 1, i.e. no evidence).
    pub fn new(capacity: usize) -> Self {
        Self {
            log_e_values: vec![0; capacity],
            hypothesis_count: 0,
        }
    }
    /// Reserve the next hypothesis slot and return its index.
    ///
    /// NOTE: when the accumulator is already at capacity the count is not
    /// incremented and the returned index equals `capacity`; `update` will
    /// silently ignore that index, so the caller cannot distinguish failure.
    pub fn add_hypothesis(&mut self) -> usize {
        let idx = self.hypothesis_count;
        if idx < self.log_e_values.len() {
            self.hypothesis_count += 1;
        }
        idx
    }
    /// Add a fixed-point log-likelihood ratio to hypothesis `idx`
    /// (saturating, so repeated strong evidence cannot wrap around).
    /// Out-of-range indices are ignored.
    #[inline]
    pub fn update(&mut self, idx: usize, log_lr: i32) {
        if idx < self.hypothesis_count {
            self.log_e_values[idx] = self.log_e_values[idx].saturating_add(log_lr);
        }
    }
    /// Sum of all active hypotheses' log e-values, widened to i64 so the
    /// total cannot overflow the per-hypothesis i32 range.
    pub fn global_log_e(&self) -> i64 {
        self.log_e_values[..self.hypothesis_count]
            .iter()
            .map(|&v| v as i64)
            .sum()
    }
}
/// Tile report (output of tick)
#[derive(Clone, Copy)]
pub struct TileReport {
    // Which tile produced this report.
    pub tile_id: u8,
    // Tick number the report corresponds to.
    pub tick: u32,
    // True when the tile's shard has at most one connected component.
    pub connected: bool,
    // Connected-component count from the shard graph.
    pub component_count: u8,
    // Aggregate log e-value across the tile's hypotheses.
    pub log_e_value: i64,
    // Shard energy at this tick.
    pub energy: f32,
    // Deterministic hash witnessing the tile's state (id, tick, edge count).
    pub witness_hash: u64,
}
impl TileReport {
    /// Fresh report for `tile_id`: tick 0, a single connected component,
    /// and zeroed evidence/energy/witness fields.
    pub fn new(tile_id: u8) -> Self {
        Self {
            tick: 0,
            connected: true,
            component_count: 1,
            log_e_value: 0,
            energy: 0.0,
            witness_hash: 0,
            tile_id,
        }
    }
}
/// Single tile state
pub struct TileState {
    // Identity of this tile within the 256-tile fabric.
    pub tile_id: u8,
    // Local graph shard.
    pub graph: CompactGraph,
    // Local evidence accumulator (64 hypothesis slots).
    pub evidence: EvidenceAccumulator,
    // Pending deltas, drained on each tick (capped at MAX_DELTA_BUFFER).
    pub delta_buffer: Vec<Delta>,
    // Last tick number processed.
    pub tick_count: u32,
}
impl TileState {
    /// Create an empty tile with id `tile_id` and a preallocated delta buffer.
    pub fn new(tile_id: u8) -> Self {
        Self {
            tile_id,
            graph: CompactGraph::new(),
            evidence: EvidenceAccumulator::new(64),
            delta_buffer: Vec::with_capacity(MAX_DELTA_BUFFER),
            tick_count: 0,
        }
    }
    /// Queue a delta for the next tick. Returns `false` (delta dropped)
    /// when the buffer already holds MAX_DELTA_BUFFER entries.
    pub fn ingest_delta(&mut self, delta: &Delta) -> bool {
        if self.delta_buffer.len() >= MAX_DELTA_BUFFER {
            return false;
        }
        self.delta_buffer.push(*delta);
        true
    }
    /// Run one tick: drain and apply pending deltas, refresh connectivity
    /// and energy, and emit a report for `tick_number`.
    pub fn tick(&mut self, tick_number: u32) -> TileReport {
        // Process pending deltas. EdgeRemove and WeightUpdate are currently
        // no-ops (the `_` arm below).
        for delta in self.delta_buffer.drain(..) {
            match delta.delta_type {
                DeltaType::EdgeAdd => {
                    self.graph
                        .add_edge(delta.source, delta.target, delta.weight);
                }
                DeltaType::Observation => {
                    // Fixed-point log-likelihood ratio: +/-1.0 in Q16.16.
                    // All observations are credited to hypothesis 0 here.
                    let log_lr = if delta.payload != 0 { 65536 } else { -65536 };
                    if self.evidence.hypothesis_count > 0 {
                        self.evidence.update(0, log_lr);
                    }
                }
                _ => {}
            }
        }
        // Recompute components if needed
        self.graph.recompute_components();
        // Compute energy
        let energy = self.graph.compute_total_energy();
        // Build report
        self.tick_count = tick_number;
        TileReport {
            tile_id: self.tile_id,
            tick: tick_number,
            connected: self.graph.component_count <= 1,
            component_count: self.graph.component_count,
            log_e_value: self.evidence.global_log_e(),
            energy,
            witness_hash: self.compute_witness_hash(),
        }
    }
    // Deterministic state fingerprint: multiply-xor mix of tile id, tick,
    // and edge count. The constant is an arbitrary odd 64-bit multiplier;
    // wrapping ops keep the hash well-defined on overflow.
    fn compute_witness_hash(&self) -> u64 {
        let mut hash = self.tile_id as u64;
        hash = hash.wrapping_mul(0x517cc1b727220a95);
        hash ^= self.tick_count as u64;
        hash = hash.wrapping_mul(0x517cc1b727220a95);
        hash ^= self.graph.edge_count as u64;
        hash
    }
    /// Clear the graph, pending deltas, and tick counter. Note the evidence
    /// accumulator is intentionally left as-is (hypotheses persist).
    pub fn reset(&mut self) {
        self.graph = CompactGraph::new();
        self.delta_buffer.clear();
        self.tick_count = 0;
    }
}
/// 256-tile coherence fabric
pub struct CoherenceFabric {
    // One TileState per tile, indexed by tile id (0..NUM_TILES).
    pub tiles: Vec<TileState>,
}
impl CoherenceFabric {
    /// Build a fabric of NUM_TILES empty tiles with ids 0..NUM_TILES.
    pub fn new() -> Self {
        Self {
            tiles: (0..NUM_TILES).map(|i| TileState::new(i as u8)).collect(),
        }
    }
    /// Execute tick on all tiles sequentially (single-threaded baseline for
    /// the parallel-tick performance target).
    pub fn tick_sequential(&mut self, tick_number: u32) -> Vec<TileReport> {
        self.tiles.iter_mut().map(|t| t.tick(tick_number)).collect()
    }
    /// Aggregate per-tile reports into a global coherence summary: summed
    /// energy and log e-value, AND of connectivity, and an order-sensitive
    /// multiply-xor fold of the per-tile witness hashes.
    pub fn aggregate_reports(reports: &[TileReport]) -> FabricReport {
        let total_energy: f32 = reports.iter().map(|r| r.energy).sum();
        let total_log_e: i64 = reports.iter().map(|r| r.log_e_value).sum();
        let all_connected = reports.iter().all(|r| r.connected);
        // Compute global witness hash
        let mut global_hash = 0u64;
        for r in reports {
            global_hash = global_hash.wrapping_mul(0x517cc1b727220a95);
            global_hash ^= r.witness_hash;
        }
        FabricReport {
            // Tick taken from the first report; 0 for an empty slice.
            tick: reports.first().map(|r| r.tick).unwrap_or(0),
            total_energy,
            total_log_e,
            all_connected,
            global_witness_hash: global_hash,
        }
    }
    /// Route a delta to the tile owning `node_id` (simple modulo sharding).
    /// NOTE: the ingest result is discarded, so deltas are silently dropped
    /// when the target tile's buffer is full.
    pub fn distribute_delta(&mut self, node_id: u64, delta: &Delta) {
        let tile_id = (node_id % NUM_TILES as u64) as usize;
        self.tiles[tile_id].ingest_delta(delta);
    }
}
/// Aggregated fabric report
pub struct FabricReport {
    // Tick number the aggregation corresponds to.
    pub tick: u32,
    // Sum of per-tile energies.
    pub total_energy: f32,
    // Sum of per-tile log e-values.
    pub total_log_e: i64,
    // True only if every tile reported a connected shard.
    pub all_connected: bool,
    // Order-sensitive fold of the per-tile witness hashes.
    pub global_witness_hash: u64,
}
// ============================================================================
// Benchmarks
// ============================================================================
/// Benchmark single tile tick
///
/// Covers an empty tick, a steady-state tick over a small graph, and
/// delta-heavy ticks. The delta cases use `iter_batched` so each iteration
/// gets a freshly populated tile (a drained buffer would otherwise make
/// later iterations measure an empty tick).
fn bench_single_tile_tick(c: &mut Criterion) {
    let mut group = c.benchmark_group("tile_single_tick");
    group.throughput(Throughput::Elements(1));
    // Empty tick
    let mut tile = TileState::new(0);
    group.bench_function("empty", |b| b.iter(|| black_box(tile.tick(black_box(1)))));
    // Tick with small graph
    let mut tile = TileState::new(0);
    for i in 0..20u16 {
        tile.ingest_delta(&Delta::edge_add(i, i + 1, 100));
    }
    tile.tick(0);
    group.bench_function("small_graph_20_edges", |b| {
        b.iter(|| black_box(tile.tick(black_box(1))))
    });
    // Tick with pending deltas
    group.bench_function("with_10_deltas", |b| {
        b.iter_batched(
            || {
                let mut t = TileState::new(0);
                for i in 0..10u16 {
                    t.ingest_delta(&Delta::edge_add(i, i + 1, 100));
                }
                t
            },
            |mut t| black_box(t.tick(1)),
            criterion::BatchSize::SmallInput,
        )
    });
    // Tick with full delta buffer
    group.bench_function("with_64_deltas", |b| {
        b.iter_batched(
            || {
                let mut t = TileState::new(0);
                for i in 0..MAX_DELTA_BUFFER as u16 {
                    t.ingest_delta(&Delta::edge_add(i % 200, (i + 1) % 200, 100));
                }
                t
            },
            |mut t| black_box(t.tick(1)),
            criterion::BatchSize::SmallInput,
        )
    });
    group.finish();
}
/// Benchmark 256-tile parallel tick (sequential baseline)
///
/// Measures one full sequential pass over all NUM_TILES tiles, both empty
/// and pre-populated — the single-threaded baseline for the ADR-014 <1ms
/// parallel-tick target.
fn bench_256_tile_tick_sequential(c: &mut Criterion) {
    let mut group = c.benchmark_group("tile_256_sequential");
    group.throughput(Throughput::Elements(NUM_TILES as u64));
    // Empty fabric
    let mut fabric = CoherenceFabric::new();
    group.bench_function("empty_fabric", |b| {
        b.iter(|| black_box(fabric.tick_sequential(black_box(1))))
    });
    // Fabric with some data per tile
    let mut fabric = CoherenceFabric::new();
    for i in 0..NUM_TILES {
        for j in 0..10u16 {
            fabric.tiles[i].ingest_delta(&Delta::edge_add(j, j + 1, 100));
        }
        fabric.tiles[i].tick(0);
    }
    group.bench_function("populated_10_edges_per_tile", |b| {
        b.iter(|| black_box(fabric.tick_sequential(black_box(1))))
    });
    group.finish();
}
/// Benchmark report aggregation
///
/// Times the fold of 256 synthetic per-tile reports into a FabricReport;
/// report construction happens once outside the measured closure.
fn bench_report_aggregation(c: &mut Criterion) {
    let mut group = c.benchmark_group("tile_report_aggregation");
    group.throughput(Throughput::Elements(NUM_TILES as u64));
    // Generate 256 reports
    let reports: Vec<TileReport> = (0..NUM_TILES)
        .map(|i| TileReport {
            tile_id: i as u8,
            tick: 1,
            connected: i % 10 != 0, // every 10th tile reports disconnected
            component_count: (i % 5) as u8 + 1,
            log_e_value: (i as i64) * 1000 - 128000,
            energy: (i as f32) * 0.1,
            witness_hash: i as u64 * 0x517cc1b727220a95,
        })
        .collect();
    group.bench_function("aggregate_256_reports", |b| {
        b.iter(|| black_box(CoherenceFabric::aggregate_reports(black_box(&reports))))
    });
    group.finish();
}
/// Benchmark delta distribution
///
/// Measures modulo-sharded routing of deltas into tiles, singly and in
/// batches. Tile buffers are never drained here, so after the first
/// iterations they saturate and later ingests are cheap rejected pushes —
/// this benchmark times routing, not sustained ingestion.
fn bench_delta_distribution(c: &mut Criterion) {
    let mut group = c.benchmark_group("tile_delta_distribution");
    let mut fabric = CoherenceFabric::new();
    // Single delta
    let delta = Delta::edge_add(0, 1, 100);
    group.bench_function("distribute_single", |b| {
        b.iter(|| fabric.distribute_delta(black_box(12345), black_box(&delta)))
    });
    // Batch distribution
    for batch_size in [100, 1000, 10000] {
        let deltas: Vec<(u64, Delta)> = (0..batch_size)
            .map(|i| {
                (
                    i as u64,
                    Delta::edge_add((i % 200) as u16, ((i + 1) % 200) as u16, 100),
                )
            })
            .collect();
        group.throughput(Throughput::Elements(batch_size as u64));
        group.bench_with_input(
            BenchmarkId::new("distribute_batch", batch_size),
            &deltas,
            |b, deltas| {
                b.iter(|| {
                    for (node_id, delta) in deltas {
                        fabric.distribute_delta(*node_id, delta);
                    }
                })
            },
        );
    }
    group.finish();
}
/// Benchmark evidence accumulator
///
/// Times a single saturating e-value update and the global log-e sum at
/// 16 and 64 active hypotheses.
fn bench_evidence_accumulator(c: &mut Criterion) {
    let mut group = c.benchmark_group("tile_evidence");
    let mut acc = EvidenceAccumulator::new(64);
    for _ in 0..16 {
        acc.add_hypothesis();
    }
    // Single update
    group.bench_function("update_single", |b| {
        b.iter(|| acc.update(black_box(5), black_box(65536)))
    });
    // Global e-value computation
    group.bench_function("global_log_e_16_hyp", |b| {
        b.iter(|| black_box(acc.global_log_e()))
    });
    // 64 hypotheses
    let mut acc64 = EvidenceAccumulator::new(64);
    for _ in 0..64 {
        acc64.add_hypothesis();
    }
    // Spread values across positive and negative fixed-point log e-values.
    for i in 0..64 {
        acc64.log_e_values[i] = (i as i32 - 32) * 1000;
    }
    group.bench_function("global_log_e_64_hyp", |b| {
        b.iter(|| black_box(acc64.global_log_e()))
    });
    group.finish();
}
/// Benchmark component recomputation
fn bench_component_recompute(c: &mut Criterion) {
let mut group = c.benchmark_group("tile_component_recompute");
for edge_count in [50, 200, 500, 1000] {
let mut graph = CompactGraph::new();
for i in 0..edge_count.min(MAX_SHARD_EDGES) {
let src = (i % 200) as u16;
let tgt = ((i + 1) % 200) as u16;
if src != tgt {
graph.add_edge(src, tgt, 100);
}
}
group.bench_with_input(
BenchmarkId::new("recompute", edge_count),
&edge_count,
|b, _| {
b.iter(|| {
graph.recompute_components();
black_box(graph.component_count)
})
},
);
}
group.finish();
}
/// Benchmark full tick + aggregate cycle
///
/// End-to-end cost of one fabric step: tick all 256 populated tiles and
/// aggregate their reports. The tick counter advances each iteration so
/// the witness hashes keep changing.
fn bench_full_cycle(c: &mut Criterion) {
    let mut group = c.benchmark_group("tile_full_cycle");
    group.sample_size(50);
    // Populate fabric
    let mut fabric = CoherenceFabric::new();
    for i in 0..NUM_TILES {
        for j in 0..50u16 {
            fabric.tiles[i].ingest_delta(&Delta::edge_add(j, (j + 1) % 200, 100));
        }
        fabric.tiles[i].tick(0);
    }
    group.bench_function("tick_and_aggregate_256_tiles", |b| {
        let mut tick = 1u32;
        b.iter(|| {
            let reports = fabric.tick_sequential(tick);
            let fabric_report = CoherenceFabric::aggregate_reports(&reports);
            tick += 1;
            black_box(fabric_report)
        })
    });
    group.finish();
}
/// Benchmark memory access patterns
///
/// Compares a sequential scan of all tiles' edge counts against a strided
/// scan (stride 7, coprime with 256 so all tiles are still visited once)
/// to expose cache-locality effects of the tile layout.
fn bench_memory_patterns(c: &mut Criterion) {
    let mut group = c.benchmark_group("tile_memory");
    // Sequential tile access
    let fabric = CoherenceFabric::new();
    group.bench_function("sequential_tile_scan", |b| {
        b.iter(|| {
            let mut total = 0usize;
            for tile in &fabric.tiles {
                total += tile.graph.edge_count;
            }
            black_box(total)
        })
    });
    // Strided tile access
    group.bench_function("strided_tile_scan", |b| {
        let stride = 7;
        b.iter(|| {
            let mut total = 0usize;
            let mut idx = 0;
            for _ in 0..NUM_TILES {
                total += fabric.tiles[idx % NUM_TILES].graph.edge_count;
                idx += stride;
            }
            black_box(total)
        })
    });
    group.finish();
}
// Register all tile-fabric benchmarks and generate the harness `main`.
criterion_group!(
    benches,
    bench_single_tile_tick,
    bench_256_tile_tick_sequential,
    bench_report_aggregation,
    bench_delta_distribution,
    bench_evidence_accumulator,
    bench_component_recompute,
    bench_full_cycle,
    bench_memory_patterns,
);
criterion_main!(benches);