//! GPU Acceleration Benchmarks //! //! Benchmarks comparing CPU vs GPU performance for: //! - Similarity computations //! - Pooling operations //! - Vector operations use criterion::{black_box, criterion_group, criterion_main, Criterion, BenchmarkId, Throughput}; #[cfg(feature = "gpu")] use ruvector_onnx_embeddings::gpu::{ GpuAccelerator, GpuConfig, GpuPooler, GpuSimilarity, GpuVectorOps, batch_cosine_similarity_gpu, batch_dot_product_gpu, batch_euclidean_gpu, }; /// CPU baseline implementations for comparison mod cpu_baseline { use rayon::prelude::*; pub fn batch_cosine_similarity(query: &[f32], candidates: &[Vec]) -> Vec { candidates .par_iter() .map(|c| cosine_similarity(query, c)) .collect() } pub fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 { let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum(); let norm_a: f32 = a.iter().map(|x| x * x).sum::().sqrt(); let norm_b: f32 = b.iter().map(|x| x * x).sum::().sqrt(); if norm_a > 1e-12 && norm_b > 1e-12 { dot / (norm_a * norm_b) } else { 0.0 } } pub fn mean_pool( tokens: &[f32], mask: &[i64], batch_size: usize, seq_length: usize, hidden_size: usize, ) -> Vec { let mut output = vec![0.0f32; batch_size * hidden_size]; for batch_idx in 0..batch_size { let tokens_base = batch_idx * seq_length * hidden_size; let mask_base = batch_idx * seq_length; let out_base = batch_idx * hidden_size; let mut count = 0.0f32; for seq_idx in 0..seq_length { if mask[mask_base + seq_idx] == 1 { let start = tokens_base + seq_idx * hidden_size; for j in 0..hidden_size { output[out_base + j] += tokens[start + j]; } count += 1.0; } } if count > 0.0 { for j in 0..hidden_size { output[out_base + j] /= count; } } } output } pub fn normalize_batch(vectors: &mut [f32], dimension: usize) { for chunk in vectors.chunks_mut(dimension) { let norm: f32 = chunk.iter().map(|x| x * x).sum::().sqrt(); if norm > 1e-12 { for val in chunk.iter_mut() { *val /= norm; } } } } } // ==================== Similarity Benchmarks ==================== fn similarity_benchmarks(c: &mut Criterion) { let mut group = c.benchmark_group("similarity"); // Test different dimensions for dimension in [128, 384, 768, 1536].iter() { let query: Vec = (0..*dimension).map(|i| (i as f32) * 0.001).collect(); // Test different candidate counts for num_candidates in [100, 1000, 10000].iter() { let candidates: Vec> = (0..*num_candidates) .map(|i| { (0..*dimension) .map(|j| ((i + j) as f32) * 0.0001) .collect() }) .collect(); let id = format!("dim{}_n{}", dimension, num_candidates); group.throughput(Throughput::Elements(*num_candidates as u64)); // CPU baseline group.bench_with_input( BenchmarkId::new("cpu_cosine", &id), &(&query, &candidates), |b, (q, c)| { b.iter(|| cpu_baseline::batch_cosine_similarity(black_box(q), black_box(c))) }, ); // GPU implementation (uses rayon parallel CPU when GPU unavailable) #[cfg(feature = "gpu")] { let refs: Vec<&[f32]> = candidates.iter().map(|v| v.as_slice()).collect(); group.bench_with_input( BenchmarkId::new("gpu_cosine", &id), &(&query, &refs), |b, (q, c)| { b.iter(|| batch_cosine_similarity_gpu(black_box(q), black_box(c))) }, ); } } } group.finish(); } // ==================== Pooling Benchmarks ==================== fn pooling_benchmarks(c: &mut Criterion) { let mut group = c.benchmark_group("gpu_pooling"); // Test different batch sizes and sequence lengths for (batch_size, seq_length, hidden_size) in [ (1, 128, 384), (8, 128, 384), (32, 128, 384), (64, 256, 768), (128, 512, 384), ] { let tokens: Vec = (0..batch_size * seq_length * hidden_size) .map(|i| (i as f32) * 0.0001) .collect(); let mask: Vec = (0..batch_size * seq_length) .map(|i| if i % seq_length < seq_length - 10 { 1 } else { 0 }) .collect(); let id = format!("b{}_s{}_h{}", batch_size, seq_length, hidden_size); group.throughput(Throughput::Elements(batch_size as u64)); // CPU baseline group.bench_with_input( BenchmarkId::new("cpu_mean_pool", &id), &(&tokens, &mask, batch_size, seq_length, hidden_size), |b, (t, m, bs, sl, hs)| { b.iter(|| { cpu_baseline::mean_pool(black_box(t), black_box(m), *bs, *sl, *hs) }) }, ); // Note: GPU pooling would be benchmarked here when full GPU backend is implemented } group.finish(); } // ==================== Vector Operations Benchmarks ==================== fn vector_ops_benchmarks(c: &mut Criterion) { let mut group = c.benchmark_group("vector_ops"); // Test normalization at different scales for (num_vectors, dimension) in [ (100, 384), (1000, 384), (10000, 384), (1000, 768), (1000, 1536), ] { let mut vectors: Vec = (0..num_vectors * dimension) .map(|i| (i as f32) * 0.001) .collect(); let id = format!("n{}_d{}", num_vectors, dimension); group.throughput(Throughput::Elements(num_vectors as u64)); // CPU baseline group.bench_with_input( BenchmarkId::new("cpu_normalize", &id), &(dimension,), |b, (dim,)| { let mut v = vectors.clone(); b.iter(|| { cpu_baseline::normalize_batch(black_box(&mut v), *dim) }) }, ); } group.finish(); } // ==================== End-to-End Benchmarks ==================== fn e2e_similarity_search(c: &mut Criterion) { let mut group = c.benchmark_group("e2e_search"); // Realistic similarity search scenario let dimension = 384; let num_candidates = 10000; let top_k = 10; let query: Vec = (0..dimension).map(|i| (i as f32) * 0.001).collect(); let candidates: Vec> = (0..num_candidates) .map(|i| { (0..dimension) .map(|j| ((i * j) as f32).sin() * 0.1) .collect() }) .collect(); group.throughput(Throughput::Elements(num_candidates as u64)); // CPU: compute similarities and find top-k group.bench_function("cpu_top_k", |b| { b.iter(|| { let sims = cpu_baseline::batch_cosine_similarity(black_box(&query), black_box(&candidates)); let mut indexed: Vec<(usize, f32)> = sims.into_iter().enumerate().collect(); indexed.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap()); indexed.truncate(top_k); indexed }) }); // GPU path #[cfg(feature = "gpu")] { let refs: Vec<&[f32]> = candidates.iter().map(|v| v.as_slice()).collect(); group.bench_function("gpu_top_k", |b| { b.iter(|| { let sims = batch_cosine_similarity_gpu(black_box(&query), black_box(&refs)); let mut indexed: Vec<(usize, f32)> = sims.into_iter().enumerate().collect(); indexed.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap()); indexed.truncate(top_k); indexed }) }); } group.finish(); } // ==================== Memory Throughput Benchmarks ==================== fn memory_throughput(c: &mut Criterion) { let mut group = c.benchmark_group("memory_throughput"); // Measure memory bandwidth with different sizes for size_mb in [1, 10, 100].iter() { let size = size_mb * 1024 * 1024 / 4; // Convert MB to f32 count let data: Vec = (0..size).map(|i| i as f32).collect(); group.throughput(Throughput::Bytes((*size_mb * 1024 * 1024) as u64)); // Simple copy benchmark group.bench_with_input( BenchmarkId::new("copy", format!("{}MB", size_mb)), &data, |b, d| { b.iter(|| { let _copy: Vec = black_box(d).iter().copied().collect(); }) }, ); // Sum reduction benchmark group.bench_with_input( BenchmarkId::new("sum", format!("{}MB", size_mb)), &data, |b, d| { b.iter(|| { let sum: f32 = black_box(d).iter().sum(); sum }) }, ); } group.finish(); } criterion_group!( benches, similarity_benchmarks, pooling_benchmarks, vector_ops_benchmarks, e2e_similarity_search, memory_throughput, ); criterion_main!(benches);