// Source: wifi-densepose/vendor/ruvector/crates/prime-radiant/benches/gpu_benchmarks.rs
// (785 lines, 24 KiB, Rust)

//! GPU-Specific Benchmarks for Prime-Radiant Coherence Engine
//!
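//! This benchmark suite compares CPU and GPU implementations of core
//! coherence operations. The GPU paths are currently simulated on top of the
//! CPU implementations; the CPU benchmarks always run, while the GPU-labelled
//! benchmarks are only registered when the `gpu` feature is enabled.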
//!
//! ## Benchmark Categories
//! 1. Energy Computation - CPU vs GPU
//! 2. Attention Forward Pass - CPU vs GPU
//! 3. Batch Routing Decisions - CPU vs GPU
//! 4. Memory Transfer Overhead
//!
//! ## GPU Backend Notes
//! - Primary: wgpu (cross-platform WebGPU)
//! - Optional: CUDA (NVIDIA), Metal (Apple), Vulkan
//!
//! ## Running GPU Benchmarks
//! ```bash
//! cargo bench --features gpu --bench gpu_benchmarks
//! ```
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
use std::collections::hash_map::DefaultHasher;
use std::collections::HashMap;
use std::hash::{Hash, Hasher};
// ============================================================================
// TEST DATA GENERATION
// ============================================================================
/// Builds a deterministic pseudo-random vector with `len` elements.
///
/// Element `i` is obtained by hashing `(seed, i)` with `DefaultHasher`,
/// reducing the hash modulo 1000 and mapping the result into `[-0.5, 0.5)`.
fn generate_vec(len: usize, seed: u64) -> Vec<f32> {
    let mut values = Vec::with_capacity(len);
    for index in 0..len {
        let mut state = DefaultHasher::new();
        (seed, index).hash(&mut state);
        let bucket = state.finish() % 1000;
        values.push(bucket as f32 / 1000.0 - 0.5);
    }
    values
}
/// Builds a deterministic pseudo-random matrix stored as a flat vector of
/// `rows * cols` elements.
///
/// The element at flat index `i` is derived from the `DefaultHasher` hash of
/// `(seed, i)`, reduced modulo 1000 and mapped into `[-0.5, 0.5)`.
fn generate_matrix(rows: usize, cols: usize, seed: u64) -> Vec<f32> {
    let element_count = rows * cols;
    let element_at = |flat_index: usize| -> f32 {
        let mut state = DefaultHasher::new();
        (seed, flat_index).hash(&mut state);
        (state.finish() % 1000) as f32 / 1000.0 - 0.5
    };
    (0..element_count).map(element_at).collect()
}
// ============================================================================
// CPU BASELINE IMPLEMENTATIONS
// ============================================================================
/// CPU-side graph used as the baseline for coherence energy computation.
///
/// Holds one state vector per node plus a weighted edge list; the energy
/// methods on this type compare the states of the two endpoints of each edge.
#[derive(Clone)]
struct CpuSheafGraph {
/// State vector of every node, keyed by node id.
nodes: HashMap<u64, Vec<f32>>,
/// Edge list; each entry names two node ids and the edge weight.
edges: Vec<(u64, u64, f32)>, // (source, target, weight)
/// Number of leading state components read per node when computing energy.
state_dim: usize,
}
impl CpuSheafGraph {
    /// Builds a deterministic pseudo-random graph.
    ///
    /// Node `id` (for `id` in `0..num_nodes`) gets the state vector
    /// `generate_vec(state_dim, seed + id)`. `(num_nodes * avg_degree) / 2`
    /// candidate edges are drawn by hashing `(seed, index, "src")` and
    /// `(seed, index, "tgt")` modulo `num_nodes`; candidates whose endpoints
    /// coincide are discarded, so the resulting edge count can be smaller.
    /// Every kept edge has weight `1.0`.
    fn random(num_nodes: usize, avg_degree: usize, state_dim: usize, seed: u64) -> Self {
        let mut nodes: HashMap<u64, Vec<f32>> = HashMap::with_capacity(num_nodes);
        for id in 0..num_nodes as u64 {
            nodes.insert(id, generate_vec(state_dim, seed + id));
        }

        let node_count = num_nodes as u64;
        // Hashes (seed, edge index, role tag) down to a node id.
        let endpoint = |edge_index: usize, role: &str| -> u64 {
            let mut state = DefaultHasher::new();
            (seed, edge_index, role).hash(&mut state);
            state.finish() % node_count
        };

        let candidate_count = (num_nodes * avg_degree) / 2;
        let mut edges: Vec<(u64, u64, f32)> = Vec::new();
        for edge_index in 0..candidate_count {
            let source = endpoint(edge_index, "src");
            let target = endpoint(edge_index, "tgt");
            // Self-loops are skipped.
            if source != target {
                edges.push((source, target, 1.0));
            }
        }

        Self {
            nodes,
            edges,
            state_dim,
        }
    }

    /// Squared Euclidean distance between the first `state_dim` components of
    /// the states of nodes `src` and `tgt`, accumulated in index order.
    fn residual_norm_sq(&self, src: u64, tgt: u64) -> f32 {
        let src_state = &self.nodes[&src];
        let tgt_state = &self.nodes[&tgt];
        (0..self.state_dim).fold(0.0f32, |acc, i| {
            let diff = src_state[i] - tgt_state[i];
            acc + diff * diff
        })
    }

    /// Compute total energy on CPU: the sum over all edges of
    /// `weight * ||state(source) - state(target)||^2`, accumulated in edge order.
    fn compute_energy_cpu(&self) -> f32 {
        self.edges
            .iter()
            .fold(0.0f32, |energy, &(src, tgt, weight)| {
                energy + weight * self.residual_norm_sq(src, tgt)
            })
    }

    /// Compute energy with per-edge results on CPU.
    ///
    /// Returns `(total, per_edge)` where `per_edge[i]` is the weighted squared
    /// residual of edge `i` and `total` is the sum of `per_edge`.
    fn compute_energy_with_edges_cpu(&self) -> (f32, Vec<f32>) {
        let mut edge_energies: Vec<f32> = Vec::with_capacity(self.edges.len());
        for &(src, tgt, weight) in &self.edges {
            edge_energies.push(weight * self.residual_norm_sq(src, tgt));
        }
        let total: f32 = edge_energies.iter().sum();
        (total, edge_energies)
    }
}
/// CPU attention forward pass (simplified, single head).
///
/// `queries`, `keys`, `values` and `output` are indexed as flat buffers in
/// which row `r` occupies `r * head_dim .. (r + 1) * head_dim`. For each query
/// row, the dot products against every key row are scaled by
/// `1 / sqrt(head_dim)`, normalised with a max-subtracted softmax, and used to
/// blend the value rows into the matching `output` row.
///
/// # Panics
/// Panics on out-of-bounds indexing when `seq_len` and `head_dim` are both
/// non-zero and a buffer holds fewer than `seq_len * head_dim` elements.
fn attention_forward_cpu(
    queries: &[f32],
    keys: &[f32],
    values: &[f32],
    seq_len: usize,
    head_dim: usize,
    output: &mut [f32],
) {
    let inv_sqrt_dim = 1.0 / (head_dim as f32).sqrt();

    for row in 0..seq_len {
        let row_base = row * head_dim;

        // Scaled dot product of this query row against every key row.
        let mut weights: Vec<f32> = (0..seq_len)
            .map(|col| {
                let key_base = col * head_dim;
                let dot = (0..head_dim).fold(0.0f32, |acc, d| {
                    acc + queries[row_base + d] * keys[key_base + d]
                });
                dot * inv_sqrt_dim
            })
            .collect();

        // The largest score is subtracted before exponentiation.
        let peak = weights
            .iter()
            .fold(f32::NEG_INFINITY, |best, &s| if s > best { s } else { best });

        // Softmax: exponentiate, then normalise by the running sum.
        let mut normaliser = 0.0f32;
        for w in weights.iter_mut() {
            *w = (*w - peak).exp();
            normaliser += *w;
        }
        weights.iter_mut().for_each(|w| *w /= normaliser);

        // Blend the value rows with the softmax weights, one output column at a time.
        for d in 0..head_dim {
            output[row_base + d] = (0..seq_len).fold(0.0f32, |acc, col| {
                acc + weights[col] * values[col * head_dim + d]
            });
        }
    }
}
/// CPU batch routing (expert selection for MoE).
///
/// `token_embeddings` is read as `[num_tokens, embed_dim]` and
/// `expert_weights` as `[num_experts, embed_dim]`, both flat. Each token is
/// scored against each expert by dot product; experts are ordered by
/// descending score with a stable sort (ties, and incomparable scores, keep
/// ascending expert index order) and the first `top_k` indices are kept.
///
/// Returns one `(token_index, expert_indices)` entry per token, in token order.
///
/// # Panics
/// Panics if either slice is too short for the requested dimensions.
fn batch_routing_cpu(
    token_embeddings: &[f32],
    expert_weights: &[f32],
    num_tokens: usize,
    embed_dim: usize,
    num_experts: usize,
    top_k: usize,
) -> Vec<(usize, Vec<usize>)> {
    let mut routed = Vec::with_capacity(num_tokens);

    for token_idx in 0..num_tokens {
        let token_start = token_idx * embed_dim;
        let token = &token_embeddings[token_start..token_start + embed_dim];

        // Dot-product affinity of this token with every expert.
        let mut ranked: Vec<(usize, f32)> = Vec::with_capacity(num_experts);
        for expert_idx in 0..num_experts {
            let expert_start = expert_idx * embed_dim;
            let expert = &expert_weights[expert_start..expert_start + embed_dim];
            let affinity = token
                .iter()
                .zip(expert.iter())
                .fold(0.0f32, |acc, (t, e)| acc + t * e);
            ranked.push((expert_idx, affinity));
        }

        // Highest affinity first; the stable sort keeps index order on ties.
        ranked.sort_by(|lhs, rhs| {
            rhs.1
                .partial_cmp(&lhs.1)
                .unwrap_or(std::cmp::Ordering::Equal)
        });

        let chosen: Vec<usize> = ranked.iter().take(top_k).map(|&(idx, _)| idx).collect();
        routed.push((token_idx, chosen));
    }

    routed
}
// ============================================================================
// GPU IMPLEMENTATIONS (SIMULATED WITHOUT ACTUAL GPU)
// When gpu feature is enabled, these would use actual GPU code
// ============================================================================
#[cfg(feature = "gpu")]
mod gpu_impl {
    //! Simulated GPU implementations.
    //!
    //! A real backend would contain GPU shader code and buffer management.
    //! For now every entry point runs the CPU implementation and additionally
    //! computes an estimated host<->device transfer time with
    //! `simulate_memory_transfer`. The estimate is only computed, never applied
    //! as a delay, so it does not lengthen the measured time.
    use super::*;

    /// Fixed per-transfer overhead in nanoseconds (1 microsecond).
    const TRANSFER_BASE_OVERHEAD_NS: u64 = 1_000;
    /// Modelled link throughput: ~10 GB/s equals 10 bytes per nanosecond.
    const TRANSFER_BYTES_PER_NS: u64 = 10;
    /// Size in bytes of one `f32` element.
    const F32_BYTES: usize = std::mem::size_of::<f32>();

    /// Simulated GPU energy computation.
    ///
    /// In reality, this would:
    /// 1. Upload node states to GPU buffer
    /// 2. Execute compute shader for parallel residual computation
    /// 3. Reduce edge energies
    /// 4. Read back result
    ///
    /// Here the result comes from `CpuSheafGraph::compute_energy_cpu`.
    pub fn compute_energy_gpu(graph: &CpuSheafGraph) -> f32 {
        // Estimated upload of all node states (host to device).
        let _upload_time =
            simulate_memory_transfer(graph.nodes.len() * graph.state_dim * F32_BYTES, true);

        // Actual computation would happen on GPU; here we call the CPU version.
        let result = graph.compute_energy_cpu();

        // Estimated download of the single f32 result (device to host).
        let _download_time = simulate_memory_transfer(F32_BYTES, false);
        result
    }

    /// Simulated GPU attention forward pass; delegates to
    /// `attention_forward_cpu` and writes its result into `output`.
    pub fn attention_forward_gpu(
        queries: &[f32],
        keys: &[f32],
        values: &[f32],
        seq_len: usize,
        head_dim: usize,
        output: &mut [f32],
    ) {
        // Estimated upload of the three input buffers.
        let input_bytes = (queries.len() + keys.len() + values.len()) * F32_BYTES;
        let _upload_time = simulate_memory_transfer(input_bytes, true);

        // CPU fallback
        attention_forward_cpu(queries, keys, values, seq_len, head_dim, output);

        // Estimated download of the output buffer.
        let _download_time = simulate_memory_transfer(output.len() * F32_BYTES, false);
    }

    /// Simulated GPU batch routing; delegates to `batch_routing_cpu` and
    /// returns its result unchanged.
    pub fn batch_routing_gpu(
        token_embeddings: &[f32],
        expert_weights: &[f32],
        num_tokens: usize,
        embed_dim: usize,
        num_experts: usize,
        top_k: usize,
    ) -> Vec<(usize, Vec<usize>)> {
        // Estimated upload of token embeddings and expert weights.
        let input_bytes = (token_embeddings.len() + expert_weights.len()) * F32_BYTES;
        let _upload_time = simulate_memory_transfer(input_bytes, true);

        // CPU fallback
        let result = batch_routing_cpu(
            token_embeddings,
            expert_weights,
            num_tokens,
            embed_dim,
            num_experts,
            top_k,
        );

        // Estimated download: the model charges 4 bytes per selected expert index.
        let result_bytes = num_tokens * top_k * 4;
        let _download_time = simulate_memory_transfer(result_bytes, false);
        result
    }

    /// Estimates the time to move `bytes` between host and device.
    ///
    /// Returns simulated nanoseconds: a fixed base overhead plus `bytes`
    /// divided by the modelled throughput of ~10 GB/s (10 bytes per
    /// nanosecond). The transfer direction is accepted but does not affect
    /// the estimate.
    fn simulate_memory_transfer(bytes: usize, _host_to_device: bool) -> u64 {
        // 10 GB/s = 1e10 bytes/s = 10 bytes/ns, hence ns = bytes / 10.
        let transfer_ns = bytes as u64 / TRANSFER_BYTES_PER_NS;
        TRANSFER_BASE_OVERHEAD_NS + transfer_ns
    }
}
// Fallback for non-GPU builds: every entry point forwards straight to the CPU
// implementation.
//
// All benchmark call sites of `gpu_impl::*` in this file are gated on
// `feature = "gpu"`, so in the builds where this module is compiled its
// functions are not called; `dead_code` is allowed to keep those builds free
// of warnings.
#[cfg(not(feature = "gpu"))]
#[allow(dead_code)]
mod gpu_impl {
    use super::*;

    /// Returns the CPU energy of `graph`.
    pub fn compute_energy_gpu(graph: &CpuSheafGraph) -> f32 {
        graph.compute_energy_cpu()
    }

    /// Runs the CPU attention forward pass, writing into `output`.
    pub fn attention_forward_gpu(
        queries: &[f32],
        keys: &[f32],
        values: &[f32],
        seq_len: usize,
        head_dim: usize,
        output: &mut [f32],
    ) {
        attention_forward_cpu(queries, keys, values, seq_len, head_dim, output);
    }

    /// Runs the CPU batch routing and returns its result unchanged.
    pub fn batch_routing_gpu(
        token_embeddings: &[f32],
        expert_weights: &[f32],
        num_tokens: usize,
        embed_dim: usize,
        num_experts: usize,
        top_k: usize,
    ) -> Vec<(usize, Vec<usize>)> {
        batch_routing_cpu(
            token_embeddings,
            expert_weights,
            num_tokens,
            embed_dim,
            num_experts,
            top_k,
        )
    }
}
// ============================================================================
// ENERGY COMPUTATION BENCHMARKS
// ============================================================================
/// Benchmarks total coherence energy on random graphs of increasing size.
///
/// Registers a `cpu` benchmark per graph size in the `gpu_energy` group and,
/// when the `gpu` feature is enabled, a matching `gpu` benchmark. Throughput
/// is reported in edges.
fn bench_energy_cpu_vs_gpu(c: &mut Criterion) {
    let mut group = c.benchmark_group("gpu_energy");

    // (node count, criterion sample size): fewer samples for larger graphs.
    let cases = [(1_000, 50), (10_000, 30), (100_000, 10)];
    for &(node_count, samples) in cases.iter() {
        // Average degree 4, 64-dimensional node states, seed 42.
        let graph = CpuSheafGraph::random(node_count, 4, 64, 42);
        let edge_count = graph.edges.len() as u64;

        group.sample_size(samples);
        group.throughput(Throughput::Elements(edge_count));

        group.bench_with_input(
            BenchmarkId::new("cpu", node_count),
            &node_count,
            |bencher, _| bencher.iter(|| black_box(graph.compute_energy_cpu())),
        );

        #[cfg(feature = "gpu")]
        group.bench_with_input(
            BenchmarkId::new("gpu", node_count),
            &node_count,
            |bencher, _| bencher.iter(|| black_box(gpu_impl::compute_energy_gpu(&graph))),
        );
    }

    group.finish();
}
/// Benchmark energy computation with per-edge tracking.
///
/// Only the CPU variant is registered (group `gpu_energy_with_edges`);
/// throughput is reported in edges.
fn bench_energy_with_edges(c: &mut Criterion) {
    let mut group = c.benchmark_group("gpu_energy_with_edges");

    for node_count in [1_000, 10_000] {
        let graph = CpuSheafGraph::random(node_count, 4, 64, 42);
        let edge_count = graph.edges.len() as u64;
        group.throughput(Throughput::Elements(edge_count));

        group.bench_with_input(
            BenchmarkId::new("cpu", node_count),
            &node_count,
            |bencher, _| bencher.iter(|| black_box(graph.compute_energy_with_edges_cpu())),
        );
        // A GPU version would return per-edge results as well,
        // which is useful for hotspot detection.
    }

    group.finish();
}
// ============================================================================
// ATTENTION BENCHMARKS
// ============================================================================
/// Benchmarks the single-head attention forward pass at three sequence lengths.
///
/// Registers `cpu/<label>` in the `gpu_attention` group and, with the `gpu`
/// feature enabled, `gpu/<label>`. Throughput is reported as `seq_len^2`
/// elements because attention is O(n^2) in sequence length.
fn bench_attention_cpu_vs_gpu(c: &mut Criterion) {
    let mut group = c.benchmark_group("gpu_attention");

    // Typical attention configurations: (seq_len, head_dim, label).
    let cases = [(128, 64, "small"), (512, 64, "medium"), (2048, 64, "large")];

    for &(seq_len, head_dim, label) in cases.iter() {
        let element_count = seq_len * head_dim;
        let queries = generate_vec(element_count, 42);
        let keys = generate_vec(element_count, 123);
        let values = generate_vec(element_count, 456);
        let mut output = vec![0.0f32; element_count];

        // Long sequences are expensive, so take fewer samples for them.
        let samples = if seq_len > 1024 { 10 } else { 50 };
        group.sample_size(samples);
        group.throughput(Throughput::Elements((seq_len * seq_len) as u64));

        group.bench_with_input(BenchmarkId::new("cpu", label), &seq_len, |bencher, _| {
            bencher.iter(|| {
                attention_forward_cpu(
                    black_box(&queries),
                    black_box(&keys),
                    black_box(&values),
                    seq_len,
                    head_dim,
                    &mut output,
                );
                black_box(output[0])
            })
        });

        #[cfg(feature = "gpu")]
        group.bench_with_input(BenchmarkId::new("gpu", label), &seq_len, |bencher, _| {
            bencher.iter(|| {
                gpu_impl::attention_forward_gpu(
                    black_box(&queries),
                    black_box(&keys),
                    black_box(&values),
                    seq_len,
                    head_dim,
                    &mut output,
                );
                black_box(output[0])
            })
        });
    }

    group.finish();
}
/// Benchmark multi-head attention (8 heads, `seq_len` 512, `head_dim` 64).
///
/// Heads are stored back to back in each buffer and processed one after
/// another. The `gpu_parallel_heads` variant (only with the `gpu` feature)
/// also loops over heads sequentially, calling the simulated GPU entry point.
fn bench_multihead_attention(c: &mut Criterion) {
    let mut group = c.benchmark_group("gpu_multihead_attention");

    let seq_len = 512;
    let head_dim = 64;
    let num_heads = 8;
    // Elements occupied by a single head in each buffer.
    let head_len = seq_len * head_dim;

    let queries = generate_vec(seq_len * head_dim * num_heads, 42);
    let keys = generate_vec(seq_len * head_dim * num_heads, 123);
    let values = generate_vec(seq_len * head_dim * num_heads, 456);
    let mut output = vec![0.0f32; seq_len * head_dim * num_heads];

    group.sample_size(20);
    group.throughput(Throughput::Elements((seq_len * seq_len * num_heads) as u64));

    // CPU: sequential over heads
    group.bench_function("cpu_sequential_heads", |bencher| {
        bencher.iter(|| {
            for head in 0..num_heads {
                let start = head * head_len;
                let end = start + head_len;
                attention_forward_cpu(
                    &queries[start..end],
                    &keys[start..end],
                    &values[start..end],
                    seq_len,
                    head_dim,
                    &mut output[start..end],
                );
            }
            black_box(output[0])
        })
    });

    // GPU would parallelize across heads
    #[cfg(feature = "gpu")]
    group.bench_function("gpu_parallel_heads", |bencher| {
        bencher.iter(|| {
            // In reality, GPU would process all heads in parallel
            for head in 0..num_heads {
                let start = head * head_len;
                let end = start + head_len;
                gpu_impl::attention_forward_gpu(
                    &queries[start..end],
                    &keys[start..end],
                    &values[start..end],
                    seq_len,
                    head_dim,
                    &mut output[start..end],
                );
            }
            black_box(output[0])
        })
    });

    group.finish();
}
// ============================================================================
// BATCH ROUTING BENCHMARKS (MoE)
// ============================================================================
/// Benchmarks MoE batch routing (top-2 of 8 experts, 768-dim embeddings) for
/// increasing token counts.
///
/// Registers `cpu/<num_tokens>` in the `gpu_routing` group and, with the `gpu`
/// feature enabled, `gpu/<num_tokens>`. Throughput is reported in tokens.
fn bench_batch_routing_cpu_vs_gpu(c: &mut Criterion) {
    let mut group = c.benchmark_group("gpu_routing");

    let embed_dim = 768; // Typical transformer embedding
    let num_experts = 8;
    let top_k = 2;

    for token_count in [256, 1024, 4096] {
        let token_embeddings = generate_vec(token_count * embed_dim, 42);
        let expert_weights = generate_vec(num_experts * embed_dim, 123);

        // Fewer samples for the largest batch.
        let samples = if token_count > 2048 { 20 } else { 50 };
        group.sample_size(samples);
        group.throughput(Throughput::Elements(token_count as u64));

        group.bench_with_input(
            BenchmarkId::new("cpu", token_count),
            &token_count,
            |bencher, _| {
                bencher.iter(|| {
                    black_box(batch_routing_cpu(
                        black_box(&token_embeddings),
                        black_box(&expert_weights),
                        token_count,
                        embed_dim,
                        num_experts,
                        top_k,
                    ))
                })
            },
        );

        #[cfg(feature = "gpu")]
        group.bench_with_input(
            BenchmarkId::new("gpu", token_count),
            &token_count,
            |bencher, _| {
                bencher.iter(|| {
                    black_box(gpu_impl::batch_routing_gpu(
                        black_box(&token_embeddings),
                        black_box(&expert_weights),
                        token_count,
                        embed_dim,
                        num_experts,
                        top_k,
                    ))
                })
            },
        );
    }

    group.finish();
}
// ============================================================================
// MEMORY TRANSFER BENCHMARKS
// ============================================================================
/// Measures a CPU read pass over buffers from 1 KB to 4096 KB.
///
/// Only the `cpu_access` baseline (summing the buffer) is registered in the
/// `gpu_memory_transfer` group; no transfer is performed or simulated here.
/// Throughput is reported in bytes.
fn bench_memory_transfer_overhead(c: &mut Criterion) {
    let mut group = c.benchmark_group("gpu_memory_transfer");

    // Buffer sizes in KB.
    for size_kb in [1, 4, 16, 64, 256, 1024, 4096] {
        // f32 = 4 bytes, so the element count is a quarter of the byte size.
        let buffer = generate_vec(size_kb * 1024 / 4, 42);
        group.throughput(Throughput::Bytes((size_kb * 1024) as u64));

        // Baseline: just accessing memory on CPU
        let id = BenchmarkId::new("cpu_access", format!("{}KB", size_kb));
        group.bench_with_input(id, &size_kb, |bencher, _| {
            bencher.iter(|| {
                let total: f32 = buffer.iter().sum();
                black_box(total)
            })
        });
        // A GPU path would add transfer overhead on top of this;
        // comparing the two would show the amortization point.
    }

    group.finish();
}
// ============================================================================
// CROSSOVER POINT BENCHMARKS
// ============================================================================
/// Intended to find the problem size where GPU becomes faster than CPU.
///
/// Currently only the CPU side is measured: a naive triple-loop square matrix
/// multiply (`cpu_matmul/<size>` in the `gpu_crossover` group) for sizes from
/// 32 to 1024. Throughput is reported as `size^3` elements.
fn bench_gpu_crossover(c: &mut Criterion) {
    let mut group = c.benchmark_group("gpu_crossover");

    // Matrix multiply is a classic GPU workload; sweep sizes to find the crossover.
    for dim in [32, 64, 128, 256, 512, 1024] {
        let lhs = generate_matrix(dim, dim, 42);
        let rhs = generate_matrix(dim, dim, 123);
        let mut product = vec![0.0f32; dim * dim];

        group.throughput(Throughput::Elements((dim * dim * dim) as u64)); // O(n^3)
        let samples = if dim > 512 { 10 } else { 50 };
        group.sample_size(samples);

        // CPU matrix multiply (naive)
        group.bench_with_input(BenchmarkId::new("cpu_matmul", dim), &dim, |bencher, _| {
            bencher.iter(|| {
                for row in 0..dim {
                    for col in 0..dim {
                        let mut acc = 0.0f32;
                        for inner in 0..dim {
                            acc += lhs[row * dim + inner] * rhs[inner * dim + col];
                        }
                        product[row * dim + col] = acc;
                    }
                }
                black_box(product[0])
            })
        });
        // Author's expectation (not measured here): GPU would typically win for size >= 256.
    }

    group.finish();
}
// ============================================================================
// COHERENCE-SPECIFIC GPU PATTERNS
// ============================================================================
/// Benchmark parallel residual computation pattern.
///
/// Only the sequential CPU form is measured (`cpu_sequential/<num_edges>` in
/// the `gpu_parallel_residual` group): for each edge, the squared distance
/// between a 64-dimensional source and target vector is accumulated into one
/// total. Throughput is reported in edges.
fn bench_parallel_residual(c: &mut Criterion) {
    let mut group = c.benchmark_group("gpu_parallel_residual");
    let state_dim = 64;

    for edge_count in [1_000, 10_000, 100_000] {
        // One source and one target vector per edge; targets use a shifted seed.
        let mut sources: Vec<Vec<f32>> = Vec::new();
        let mut targets: Vec<Vec<f32>> = Vec::new();
        for edge in 0..edge_count {
            sources.push(generate_vec(state_dim, edge as u64));
        }
        for edge in 0..edge_count {
            targets.push(generate_vec(state_dim, edge as u64 + 1000000));
        }

        let samples = if edge_count > 50000 { 10 } else { 50 };
        group.sample_size(samples);
        group.throughput(Throughput::Elements(edge_count as u64));

        // CPU sequential
        group.bench_with_input(
            BenchmarkId::new("cpu_sequential", edge_count),
            &edge_count,
            |bencher, _| {
                bencher.iter(|| {
                    let mut energy = 0.0f32;
                    for (source, target) in sources.iter().zip(targets.iter()) {
                        let mut residual_sq = 0.0f32;
                        for d in 0..state_dim {
                            let delta = source[d] - target[d];
                            residual_sq += delta * delta;
                        }
                        energy += residual_sq;
                    }
                    black_box(energy)
                })
            },
        );
        // A GPU would parallelize over all edges,
        // with each work item computing one residual.
    }

    group.finish();
}
/// Benchmark reduction patterns (sum of energies).
///
/// Registers two CPU variants per input length in the `gpu_reduction` group:
/// `cpu_sum` (a single pass over the data) and `cpu_parallel` (sums of
/// 1024-element chunks collected into a vector, then summed). Both run on the
/// calling thread. Throughput is reported in elements.
fn bench_gpu_reduction(c: &mut Criterion) {
    let mut group = c.benchmark_group("gpu_reduction");

    for len in [1_000, 10_000, 100_000, 1_000_000] {
        let values = generate_vec(len, 42);

        let samples = if len > 100000 { 10 } else { 50 };
        group.sample_size(samples);
        group.throughput(Throughput::Elements(len as u64));

        // CPU sequential sum
        group.bench_with_input(BenchmarkId::new("cpu_sum", len), &len, |bencher, _| {
            bencher.iter(|| {
                let total: f32 = values.iter().sum();
                black_box(total)
            })
        });

        // Chunked two-stage sum: per-chunk partial sums, then a sum of the partials.
        group.bench_with_input(BenchmarkId::new("cpu_parallel", len), &len, |bencher, _| {
            bencher.iter(|| {
                let partials: Vec<f32> = values
                    .chunks(1024)
                    .map(|chunk| chunk.iter().sum())
                    .collect();
                let total: f32 = partials.iter().sum();
                black_box(total)
            })
        });
        // GPU reduction uses tree-based parallel reduction
    }

    group.finish();
}
// ============================================================================
// CRITERION CONFIGURATION
// ============================================================================
// Energy computation benchmarks (total energy and per-edge energy).
criterion_group!(
energy_benches,
bench_energy_cpu_vs_gpu,
bench_energy_with_edges,
);
// Attention benchmarks (single-head sweep and multi-head).
criterion_group!(
attention_benches,
bench_attention_cpu_vs_gpu,
bench_multihead_attention,
);
// Batch routing (MoE expert selection) benchmarks.
criterion_group!(routing_benches, bench_batch_routing_cpu_vs_gpu,);
// Memory access baseline and matrix-multiply crossover sweep.
criterion_group!(
transfer_benches,
bench_memory_transfer_overhead,
bench_gpu_crossover,
);
// Coherence-specific patterns: per-edge residuals and reductions.
criterion_group!(
coherence_gpu_benches,
bench_parallel_residual,
bench_gpu_reduction,
);
// Benchmark binary entry point: runs every group registered above.
criterion_main!(
energy_benches,
attention_benches,
routing_benches,
transfer_benches,
coherence_gpu_benches
);