//! GPU-Specific Benchmarks for Prime-Radiant Coherence Engine //! //! This benchmark suite compares CPU and GPU implementations of core //! coherence operations. Requires the `gpu` feature to be enabled. //! //! ## Benchmark Categories //! 1. Energy Computation - CPU vs GPU //! 2. Attention Forward Pass - CPU vs GPU //! 3. Batch Routing Decisions - CPU vs GPU //! 4. Memory Transfer Overhead //! //! ## GPU Backend Notes //! - Primary: wgpu (cross-platform WebGPU) //! - Optional: CUDA (NVIDIA), Metal (Apple), Vulkan //! //! ## Running GPU Benchmarks //! ```bash //! cargo bench --features gpu --bench gpu_benchmarks //! ``` use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; use std::collections::hash_map::DefaultHasher; use std::collections::HashMap; use std::hash::{Hash, Hasher}; // ============================================================================ // TEST DATA GENERATION // ============================================================================ fn generate_vec(len: usize, seed: u64) -> Vec { (0..len) .map(|i| { let mut hasher = DefaultHasher::new(); (seed, i).hash(&mut hasher); (hasher.finish() % 1000) as f32 / 1000.0 - 0.5 }) .collect() } fn generate_matrix(rows: usize, cols: usize, seed: u64) -> Vec { (0..rows * cols) .map(|i| { let mut hasher = DefaultHasher::new(); (seed, i).hash(&mut hasher); (hasher.finish() % 1000) as f32 / 1000.0 - 0.5 }) .collect() } // ============================================================================ // CPU BASELINE IMPLEMENTATIONS // ============================================================================ /// CPU coherence energy computation #[derive(Clone)] struct CpuSheafGraph { nodes: HashMap>, edges: Vec<(u64, u64, f32)>, // (source, target, weight) state_dim: usize, } impl CpuSheafGraph { fn random(num_nodes: usize, avg_degree: usize, state_dim: usize, seed: u64) -> Self { let nodes: HashMap> = (0..num_nodes as u64) .map(|id| (id, generate_vec(state_dim, seed + id))) .collect(); let num_edges = (num_nodes * avg_degree) / 2; let edges: Vec<(u64, u64, f32)> = (0..num_edges) .filter_map(|i| { let mut h = DefaultHasher::new(); (seed, i, "src").hash(&mut h); let source = h.finish() % num_nodes as u64; let mut h = DefaultHasher::new(); (seed, i, "tgt").hash(&mut h); let target = h.finish() % num_nodes as u64; if source != target { Some((source, target, 1.0)) } else { None } }) .collect(); Self { nodes, edges, state_dim, } } /// Compute total energy on CPU fn compute_energy_cpu(&self) -> f32 { let mut total = 0.0f32; for &(src, tgt, weight) in &self.edges { let src_state = &self.nodes[&src]; let tgt_state = &self.nodes[&tgt]; let mut norm_sq = 0.0f32; for i in 0..self.state_dim { let diff = src_state[i] - tgt_state[i]; norm_sq += diff * diff; } total += weight * norm_sq; } total } /// Compute energy with per-edge results on CPU fn compute_energy_with_edges_cpu(&self) -> (f32, Vec) { let edge_energies: Vec = self .edges .iter() .map(|&(src, tgt, weight)| { let src_state = &self.nodes[&src]; let tgt_state = &self.nodes[&tgt]; let mut norm_sq = 0.0f32; for i in 0..self.state_dim { let diff = src_state[i] - tgt_state[i]; norm_sq += diff * diff; } weight * norm_sq }) .collect(); let total: f32 = edge_energies.iter().sum(); (total, edge_energies) } } /// CPU attention forward pass (simplified) fn attention_forward_cpu( queries: &[f32], keys: &[f32], values: &[f32], seq_len: usize, head_dim: usize, output: &mut [f32], ) { let scale = 1.0 / (head_dim as f32).sqrt(); // For each query position for i in 0..seq_len { let q_offset = i * head_dim; // Compute attention scores let mut scores = vec![0.0f32; seq_len]; let mut max_score = f32::NEG_INFINITY; for j in 0..seq_len { let k_offset = j * head_dim; let mut dot = 0.0f32; for k in 0..head_dim { dot += queries[q_offset + k] * keys[k_offset + k]; } scores[j] = dot * scale; if scores[j] > max_score { max_score = scores[j]; } } // Softmax let mut sum_exp = 0.0f32; for s in &mut scores { *s = (*s - max_score).exp(); sum_exp += *s; } for s in &mut scores { *s /= sum_exp; } // Weighted sum of values let out_offset = i * head_dim; for k in 0..head_dim { let mut weighted_sum = 0.0f32; for j in 0..seq_len { let v_offset = j * head_dim; weighted_sum += scores[j] * values[v_offset + k]; } output[out_offset + k] = weighted_sum; } } } /// CPU batch routing (expert selection for MoE) fn batch_routing_cpu( token_embeddings: &[f32], expert_weights: &[f32], num_tokens: usize, embed_dim: usize, num_experts: usize, top_k: usize, ) -> Vec<(usize, Vec)> { // token_embeddings: [num_tokens, embed_dim] // expert_weights: [num_experts, embed_dim] // Returns: for each token, the indices of top-k experts let mut results = Vec::with_capacity(num_tokens); for t in 0..num_tokens { let token_offset = t * embed_dim; let token = &token_embeddings[token_offset..token_offset + embed_dim]; // Compute scores for each expert let mut expert_scores: Vec<(usize, f32)> = (0..num_experts) .map(|e| { let expert_offset = e * embed_dim; let expert = &expert_weights[expert_offset..expert_offset + embed_dim]; let mut dot = 0.0f32; for i in 0..embed_dim { dot += token[i] * expert[i]; } (e, dot) }) .collect(); // Sort by score (descending) and take top-k expert_scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal)); let top_experts: Vec = expert_scores .iter() .take(top_k) .map(|(idx, _)| *idx) .collect(); results.push((t, top_experts)); } results } // ============================================================================ // GPU IMPLEMENTATIONS (SIMULATED WITHOUT ACTUAL GPU) // When gpu feature is enabled, these would use actual GPU code // ============================================================================ #[cfg(feature = "gpu")] mod gpu_impl { //! GPU implementations using wgpu or similar //! //! These would contain actual GPU shader code and buffer management. //! For now, we simulate the overhead. use super::*; /// Simulated GPU energy computation /// In reality, this would: /// 1. Upload node states to GPU buffer /// 2. Execute compute shader for parallel residual computation /// 3. Reduce edge energies /// 4. Read back result pub fn compute_energy_gpu(graph: &CpuSheafGraph) -> f32 { // Simulate GPU overhead let _upload_time = simulate_memory_transfer( graph.nodes.len() * graph.state_dim * 4, // bytes true, // host to device ); // Actual computation would happen on GPU // Here we just call CPU version let result = graph.compute_energy_cpu(); let _download_time = simulate_memory_transfer( 4, // single f32 result false, ); result } /// Simulated GPU attention forward pass pub fn attention_forward_gpu( queries: &[f32], keys: &[f32], values: &[f32], seq_len: usize, head_dim: usize, output: &mut [f32], ) { // Simulate upload let input_bytes = (queries.len() + keys.len() + values.len()) * 4; let _upload_time = simulate_memory_transfer(input_bytes, true); // CPU fallback attention_forward_cpu(queries, keys, values, seq_len, head_dim, output); // Simulate download let _download_time = simulate_memory_transfer(output.len() * 4, false); } /// Simulated GPU batch routing pub fn batch_routing_gpu( token_embeddings: &[f32], expert_weights: &[f32], num_tokens: usize, embed_dim: usize, num_experts: usize, top_k: usize, ) -> Vec<(usize, Vec)> { // Simulate upload let input_bytes = (token_embeddings.len() + expert_weights.len()) * 4; let _upload_time = simulate_memory_transfer(input_bytes, true); // CPU fallback let result = batch_routing_cpu( token_embeddings, expert_weights, num_tokens, embed_dim, num_experts, top_k, ); // Simulate download let result_bytes = num_tokens * top_k * 4; let _download_time = simulate_memory_transfer(result_bytes, false); result } /// Simulate memory transfer time /// Returns simulated nanoseconds fn simulate_memory_transfer(bytes: usize, _host_to_device: bool) -> u64 { // Assume ~10 GB/s transfer rate (PCIe 3.0 x16 theoretical) // In practice, smaller transfers have higher overhead let base_overhead_ns = 1000; // 1 microsecond base overhead let transfer_ns = (bytes as u64 * 100) / 1_000_000_000; // ~10 GB/s base_overhead_ns + transfer_ns } } // Fallback for non-GPU builds #[cfg(not(feature = "gpu"))] mod gpu_impl { use super::*; pub fn compute_energy_gpu(graph: &CpuSheafGraph) -> f32 { graph.compute_energy_cpu() } pub fn attention_forward_gpu( queries: &[f32], keys: &[f32], values: &[f32], seq_len: usize, head_dim: usize, output: &mut [f32], ) { attention_forward_cpu(queries, keys, values, seq_len, head_dim, output); } pub fn batch_routing_gpu( token_embeddings: &[f32], expert_weights: &[f32], num_tokens: usize, embed_dim: usize, num_experts: usize, top_k: usize, ) -> Vec<(usize, Vec)> { batch_routing_cpu( token_embeddings, expert_weights, num_tokens, embed_dim, num_experts, top_k, ) } } // ============================================================================ // ENERGY COMPUTATION BENCHMARKS // ============================================================================ fn bench_energy_cpu_vs_gpu(c: &mut Criterion) { let mut group = c.benchmark_group("gpu_energy"); // Test at various graph sizes let sizes = [(1_000, 50), (10_000, 30), (100_000, 10)]; for (num_nodes, sample_size) in sizes { let graph = CpuSheafGraph::random(num_nodes, 4, 64, 42); group.sample_size(sample_size); group.throughput(Throughput::Elements(graph.edges.len() as u64)); group.bench_with_input(BenchmarkId::new("cpu", num_nodes), &num_nodes, |b, _| { b.iter(|| black_box(graph.compute_energy_cpu())) }); #[cfg(feature = "gpu")] group.bench_with_input(BenchmarkId::new("gpu", num_nodes), &num_nodes, |b, _| { b.iter(|| black_box(gpu_impl::compute_energy_gpu(&graph))) }); } group.finish(); } /// Benchmark energy computation with per-edge tracking fn bench_energy_with_edges(c: &mut Criterion) { let mut group = c.benchmark_group("gpu_energy_with_edges"); for num_nodes in [1_000, 10_000] { let graph = CpuSheafGraph::random(num_nodes, 4, 64, 42); group.throughput(Throughput::Elements(graph.edges.len() as u64)); group.bench_with_input(BenchmarkId::new("cpu", num_nodes), &num_nodes, |b, _| { b.iter(|| black_box(graph.compute_energy_with_edges_cpu())) }); // GPU version would return per-edge results // Useful for hotspot detection } group.finish(); } // ============================================================================ // ATTENTION BENCHMARKS // ============================================================================ fn bench_attention_cpu_vs_gpu(c: &mut Criterion) { let mut group = c.benchmark_group("gpu_attention"); // Typical attention configurations let configs = [ (128, 64, "small"), // seq_len=128, head_dim=64 (512, 64, "medium"), // seq_len=512, head_dim=64 (2048, 64, "large"), // seq_len=2048, head_dim=64 ]; for (seq_len, head_dim, label) in configs { let queries = generate_vec(seq_len * head_dim, 42); let keys = generate_vec(seq_len * head_dim, 123); let values = generate_vec(seq_len * head_dim, 456); let mut output = vec![0.0f32; seq_len * head_dim]; // Attention is O(n^2) in sequence length let sample_size = if seq_len > 1024 { 10 } else { 50 }; group.sample_size(sample_size); group.throughput(Throughput::Elements((seq_len * seq_len) as u64)); group.bench_with_input(BenchmarkId::new("cpu", label), &seq_len, |b, _| { b.iter(|| { attention_forward_cpu( black_box(&queries), black_box(&keys), black_box(&values), seq_len, head_dim, &mut output, ); black_box(output[0]) }) }); #[cfg(feature = "gpu")] group.bench_with_input(BenchmarkId::new("gpu", label), &seq_len, |b, _| { b.iter(|| { gpu_impl::attention_forward_gpu( black_box(&queries), black_box(&keys), black_box(&values), seq_len, head_dim, &mut output, ); black_box(output[0]) }) }); } group.finish(); } /// Benchmark multi-head attention fn bench_multihead_attention(c: &mut Criterion) { let mut group = c.benchmark_group("gpu_multihead_attention"); let seq_len = 512; let head_dim = 64; let num_heads = 8; let queries = generate_vec(seq_len * head_dim * num_heads, 42); let keys = generate_vec(seq_len * head_dim * num_heads, 123); let values = generate_vec(seq_len * head_dim * num_heads, 456); let mut output = vec![0.0f32; seq_len * head_dim * num_heads]; group.sample_size(20); group.throughput(Throughput::Elements((seq_len * seq_len * num_heads) as u64)); // CPU: sequential over heads group.bench_function("cpu_sequential_heads", |b| { b.iter(|| { for h in 0..num_heads { let offset = h * seq_len * head_dim; let q = &queries[offset..offset + seq_len * head_dim]; let k = &keys[offset..offset + seq_len * head_dim]; let v = &values[offset..offset + seq_len * head_dim]; let out = &mut output[offset..offset + seq_len * head_dim]; attention_forward_cpu(q, k, v, seq_len, head_dim, out); } black_box(output[0]) }) }); // GPU would parallelize across heads #[cfg(feature = "gpu")] group.bench_function("gpu_parallel_heads", |b| { b.iter(|| { // In reality, GPU would process all heads in parallel for h in 0..num_heads { let offset = h * seq_len * head_dim; let q = &queries[offset..offset + seq_len * head_dim]; let k = &keys[offset..offset + seq_len * head_dim]; let v = &values[offset..offset + seq_len * head_dim]; let out = &mut output[offset..offset + seq_len * head_dim]; gpu_impl::attention_forward_gpu(q, k, v, seq_len, head_dim, out); } black_box(output[0]) }) }); group.finish(); } // ============================================================================ // BATCH ROUTING BENCHMARKS (MoE) // ============================================================================ fn bench_batch_routing_cpu_vs_gpu(c: &mut Criterion) { let mut group = c.benchmark_group("gpu_routing"); let embed_dim = 768; // Typical transformer embedding let num_experts = 8; let top_k = 2; for num_tokens in [256, 1024, 4096] { let token_embeddings = generate_vec(num_tokens * embed_dim, 42); let expert_weights = generate_vec(num_experts * embed_dim, 123); let sample_size = if num_tokens > 2048 { 20 } else { 50 }; group.sample_size(sample_size); group.throughput(Throughput::Elements(num_tokens as u64)); group.bench_with_input(BenchmarkId::new("cpu", num_tokens), &num_tokens, |b, _| { b.iter(|| { black_box(batch_routing_cpu( black_box(&token_embeddings), black_box(&expert_weights), num_tokens, embed_dim, num_experts, top_k, )) }) }); #[cfg(feature = "gpu")] group.bench_with_input(BenchmarkId::new("gpu", num_tokens), &num_tokens, |b, _| { b.iter(|| { black_box(gpu_impl::batch_routing_gpu( black_box(&token_embeddings), black_box(&expert_weights), num_tokens, embed_dim, num_experts, top_k, )) }) }); } group.finish(); } // ============================================================================ // MEMORY TRANSFER BENCHMARKS // ============================================================================ fn bench_memory_transfer_overhead(c: &mut Criterion) { let mut group = c.benchmark_group("gpu_memory_transfer"); // Simulate different transfer sizes let sizes_kb = [1, 4, 16, 64, 256, 1024, 4096]; for &size_kb in &sizes_kb { let data = generate_vec(size_kb * 1024 / 4, 42); // f32 = 4 bytes group.throughput(Throughput::Bytes((size_kb * 1024) as u64)); // Baseline: just accessing memory on CPU group.bench_with_input( BenchmarkId::new("cpu_access", format!("{}KB", size_kb)), &size_kb, |b, _| { b.iter(|| { let sum: f32 = data.iter().sum(); black_box(sum) }) }, ); // GPU would have additional transfer overhead // This benchmark shows the amortization point } group.finish(); } // ============================================================================ // CROSSOVER POINT BENCHMARKS // ============================================================================ /// Find the problem size where GPU becomes faster than CPU fn bench_gpu_crossover(c: &mut Criterion) { let mut group = c.benchmark_group("gpu_crossover"); // Matrix multiply is a classic GPU workload // Test different sizes to find crossover let sizes = [32, 64, 128, 256, 512, 1024]; for &size in &sizes { let a = generate_matrix(size, size, 42); let b = generate_matrix(size, size, 123); let mut c = vec![0.0f32; size * size]; group.throughput(Throughput::Elements((size * size * size) as u64)); // O(n^3) let sample_size = if size > 512 { 10 } else { 50 }; group.sample_size(sample_size); // CPU matrix multiply (naive) group.bench_with_input(BenchmarkId::new("cpu_matmul", size), &size, |b_iter, _| { b_iter.iter(|| { for i in 0..size { for j in 0..size { let mut sum = 0.0f32; for k in 0..size { sum += a[i * size + k] * b[k * size + j]; } c[i * size + j] = sum; } } black_box(c[0]) }) }); // GPU would win for size >= 256 typically } group.finish(); } // ============================================================================ // COHERENCE-SPECIFIC GPU PATTERNS // ============================================================================ /// Benchmark parallel residual computation pattern fn bench_parallel_residual(c: &mut Criterion) { let mut group = c.benchmark_group("gpu_parallel_residual"); let state_dim = 64; for num_edges in [1_000, 10_000, 100_000] { // Prepare edge data in GPU-friendly format let sources: Vec> = (0..num_edges) .map(|i| generate_vec(state_dim, i as u64)) .collect(); let targets: Vec> = (0..num_edges) .map(|i| generate_vec(state_dim, i as u64 + 1000000)) .collect(); let sample_size = if num_edges > 50000 { 10 } else { 50 }; group.sample_size(sample_size); group.throughput(Throughput::Elements(num_edges as u64)); // CPU sequential group.bench_with_input( BenchmarkId::new("cpu_sequential", num_edges), &num_edges, |b, _| { b.iter(|| { let mut total = 0.0f32; for (src, tgt) in sources.iter().zip(targets.iter()) { let mut norm_sq = 0.0f32; for i in 0..state_dim { let diff = src[i] - tgt[i]; norm_sq += diff * diff; } total += norm_sq; } black_box(total) }) }, ); // GPU would parallelize all edges // Each work item computes one residual } group.finish(); } /// Benchmark reduction patterns (sum of energies) fn bench_gpu_reduction(c: &mut Criterion) { let mut group = c.benchmark_group("gpu_reduction"); for size in [1_000, 10_000, 100_000, 1_000_000] { let data = generate_vec(size, 42); let sample_size = if size > 100000 { 10 } else { 50 }; group.sample_size(sample_size); group.throughput(Throughput::Elements(size as u64)); // CPU sequential sum group.bench_with_input(BenchmarkId::new("cpu_sum", size), &size, |b, _| { b.iter(|| { let sum: f32 = data.iter().sum(); black_box(sum) }) }); // CPU parallel reduction would use multiple accumulators group.bench_with_input(BenchmarkId::new("cpu_parallel", size), &size, |b, _| { b.iter(|| { let chunks = data.chunks(1024); let partial_sums: Vec = chunks.map(|c| c.iter().sum()).collect(); let sum: f32 = partial_sums.iter().sum(); black_box(sum) }) }); // GPU reduction uses tree-based parallel reduction } group.finish(); } // ============================================================================ // CRITERION CONFIGURATION // ============================================================================ criterion_group!( energy_benches, bench_energy_cpu_vs_gpu, bench_energy_with_edges, ); criterion_group!( attention_benches, bench_attention_cpu_vs_gpu, bench_multihead_attention, ); criterion_group!(routing_benches, bench_batch_routing_cpu_vs_gpu,); criterion_group!( transfer_benches, bench_memory_transfer_overhead, bench_gpu_crossover, ); criterion_group!( coherence_gpu_benches, bench_parallel_residual, bench_gpu_reduction, ); criterion_main!( energy_benches, attention_benches, routing_benches, transfer_benches, coherence_gpu_benches );