314 lines
9.5 KiB
Rust
314 lines
9.5 KiB
Rust
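//! GPU Acceleration Benchmarks
//!
//! Benchmarks comparing CPU vs GPU performance for:
//! - Similarity computations (CPU baseline, plus the GPU path when the `gpu` feature is enabled)
//! - Pooling operations (only the CPU baseline is benchmarked in this file)
//! - Vector operations (only the CPU baseline is benchmarked in this file)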
|
|
|
|
use criterion::{black_box, criterion_group, criterion_main, Criterion, BenchmarkId, Throughput};
|
|
|
|
#[cfg(feature = "gpu")]
|
|
use ruvector_onnx_embeddings::gpu::{
|
|
GpuAccelerator, GpuConfig, GpuPooler, GpuSimilarity, GpuVectorOps,
|
|
batch_cosine_similarity_gpu, batch_dot_product_gpu, batch_euclidean_gpu,
|
|
};
|
|
|
|
/// CPU baseline implementations for comparison
|
|
mod cpu_baseline {
|
|
use rayon::prelude::*;
|
|
|
|
pub fn batch_cosine_similarity(query: &[f32], candidates: &[Vec<f32>]) -> Vec<f32> {
|
|
candidates
|
|
.par_iter()
|
|
.map(|c| cosine_similarity(query, c))
|
|
.collect()
|
|
}
|
|
|
|
/// Cosine similarity between `a` and `b`.
///
/// The dot product pairs elements with `zip`, so it only covers the shorter
/// of the two slices, while each L2 norm is computed over its whole slice.
/// Returns `0.0` unless both norms are greater than `1e-12`, which avoids
/// dividing by a (near-)zero magnitude.
pub fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    // Norms at or below this threshold are treated as belonging to a zero vector.
    const MIN_NORM: f32 = 1e-12;

    let l2_norm = |v: &[f32]| v.iter().map(|x| x * x).sum::<f32>().sqrt();
    let (norm_a, norm_b) = (l2_norm(a), l2_norm(b));

    // Keep the positive `>` form: a NaN norm must fall through to 0.0.
    let both_usable = norm_a > MIN_NORM && norm_b > MIN_NORM;
    if both_usable {
        let dot = a.iter().zip(b).map(|(x, y)| x * y).sum::<f32>();
        dot / (norm_a * norm_b)
    } else {
        0.0
    }
}
|
|
|
|
/// Mean-pools token embeddings per batch item, honouring an attention mask.
///
/// `tokens` is read as a flat `[batch][seq][hidden]` buffer and `mask` as a
/// flat `[batch][seq]` buffer. For every batch item, the token vectors whose
/// mask entry equals `1` are summed and divided by the number of such tokens.
/// Items with no selected token keep an all-zero output row.
///
/// Returns a flat `[batch][hidden]` buffer of length `batch_size * hidden_size`.
///
/// # Panics
///
/// Panics if `mask` is shorter than `batch_size * seq_length`, or if a
/// selected token lies outside `tokens`.
pub fn mean_pool(
    tokens: &[f32],
    mask: &[i64],
    batch_size: usize,
    seq_length: usize,
    hidden_size: usize,
) -> Vec<f32> {
    let mut pooled = vec![0.0f32; batch_size * hidden_size];

    for batch_idx in 0..batch_size {
        let first_token = batch_idx * seq_length;
        let mask_row = &mask[first_token..first_token + seq_length];
        let out_row = &mut pooled[batch_idx * hidden_size..(batch_idx + 1) * hidden_size];

        // Kept as f32 so the divisor matches an incrementally accumulated count.
        let mut selected = 0.0f32;

        for (seq_idx, &flag) in mask_row.iter().enumerate() {
            if flag != 1 {
                continue;
            }
            let start = (first_token + seq_idx) * hidden_size;
            let token = &tokens[start..start + hidden_size];
            for (acc, &value) in out_row.iter_mut().zip(token) {
                *acc += value;
            }
            selected += 1.0;
        }

        if selected > 0.0 {
            for acc in out_row.iter_mut() {
                *acc /= selected;
            }
        }
    }

    pooled
}
|
|
|
|
/// Rescales each `dimension`-sized chunk of `vectors` to unit L2 norm, in place.
///
/// Chunks whose norm is not greater than `1e-12` are left unchanged so that
/// zero vectors are not divided by (near-)zero. If `vectors.len()` is not a
/// multiple of `dimension`, the shorter trailing chunk is normalized on its own.
///
/// A `dimension` of `0` is treated as "nothing to normalize" and leaves
/// `vectors` untouched.
pub fn normalize_batch(vectors: &mut [f32], dimension: usize) {
    // `chunks_mut(0)` panics, so bail out before reaching it.
    if dimension == 0 {
        return;
    }

    for chunk in vectors.chunks_mut(dimension) {
        let norm: f32 = chunk.iter().map(|x| x * x).sum::<f32>().sqrt();
        if norm > 1e-12 {
            for val in chunk.iter_mut() {
                *val /= norm;
            }
        }
    }
}
|
|
}
|
|
|
|
// ==================== Similarity Benchmarks ====================
|
|
|
|
fn similarity_benchmarks(c: &mut Criterion) {
|
|
let mut group = c.benchmark_group("similarity");
|
|
|
|
// Test different dimensions
|
|
for dimension in [128, 384, 768, 1536].iter() {
|
|
let query: Vec<f32> = (0..*dimension).map(|i| (i as f32) * 0.001).collect();
|
|
|
|
// Test different candidate counts
|
|
for num_candidates in [100, 1000, 10000].iter() {
|
|
let candidates: Vec<Vec<f32>> = (0..*num_candidates)
|
|
.map(|i| {
|
|
(0..*dimension)
|
|
.map(|j| ((i + j) as f32) * 0.0001)
|
|
.collect()
|
|
})
|
|
.collect();
|
|
|
|
let id = format!("dim{}_n{}", dimension, num_candidates);
|
|
|
|
group.throughput(Throughput::Elements(*num_candidates as u64));
|
|
|
|
// CPU baseline
|
|
group.bench_with_input(
|
|
BenchmarkId::new("cpu_cosine", &id),
|
|
&(&query, &candidates),
|
|
|b, (q, c)| {
|
|
b.iter(|| cpu_baseline::batch_cosine_similarity(black_box(q), black_box(c)))
|
|
},
|
|
);
|
|
|
|
// GPU implementation (uses rayon parallel CPU when GPU unavailable)
|
|
#[cfg(feature = "gpu")]
|
|
{
|
|
let refs: Vec<&[f32]> = candidates.iter().map(|v| v.as_slice()).collect();
|
|
group.bench_with_input(
|
|
BenchmarkId::new("gpu_cosine", &id),
|
|
&(&query, &refs),
|
|
|b, (q, c)| {
|
|
b.iter(|| batch_cosine_similarity_gpu(black_box(q), black_box(c)))
|
|
},
|
|
);
|
|
}
|
|
}
|
|
}
|
|
|
|
group.finish();
|
|
}
|
|
|
|
// ==================== Pooling Benchmarks ====================
|
|
|
|
fn pooling_benchmarks(c: &mut Criterion) {
|
|
let mut group = c.benchmark_group("gpu_pooling");
|
|
|
|
// Test different batch sizes and sequence lengths
|
|
for (batch_size, seq_length, hidden_size) in [
|
|
(1, 128, 384),
|
|
(8, 128, 384),
|
|
(32, 128, 384),
|
|
(64, 256, 768),
|
|
(128, 512, 384),
|
|
] {
|
|
let tokens: Vec<f32> = (0..batch_size * seq_length * hidden_size)
|
|
.map(|i| (i as f32) * 0.0001)
|
|
.collect();
|
|
|
|
let mask: Vec<i64> = (0..batch_size * seq_length)
|
|
.map(|i| if i % seq_length < seq_length - 10 { 1 } else { 0 })
|
|
.collect();
|
|
|
|
let id = format!("b{}_s{}_h{}", batch_size, seq_length, hidden_size);
|
|
|
|
group.throughput(Throughput::Elements(batch_size as u64));
|
|
|
|
// CPU baseline
|
|
group.bench_with_input(
|
|
BenchmarkId::new("cpu_mean_pool", &id),
|
|
&(&tokens, &mask, batch_size, seq_length, hidden_size),
|
|
|b, (t, m, bs, sl, hs)| {
|
|
b.iter(|| {
|
|
cpu_baseline::mean_pool(black_box(t), black_box(m), *bs, *sl, *hs)
|
|
})
|
|
},
|
|
);
|
|
|
|
// Note: GPU pooling would be benchmarked here when full GPU backend is implemented
|
|
}
|
|
|
|
group.finish();
|
|
}
|
|
|
|
// ==================== Vector Operations Benchmarks ====================
|
|
|
|
fn vector_ops_benchmarks(c: &mut Criterion) {
|
|
let mut group = c.benchmark_group("vector_ops");
|
|
|
|
// Test normalization at different scales
|
|
for (num_vectors, dimension) in [
|
|
(100, 384),
|
|
(1000, 384),
|
|
(10000, 384),
|
|
(1000, 768),
|
|
(1000, 1536),
|
|
] {
|
|
let mut vectors: Vec<f32> = (0..num_vectors * dimension)
|
|
.map(|i| (i as f32) * 0.001)
|
|
.collect();
|
|
|
|
let id = format!("n{}_d{}", num_vectors, dimension);
|
|
|
|
group.throughput(Throughput::Elements(num_vectors as u64));
|
|
|
|
// CPU baseline
|
|
group.bench_with_input(
|
|
BenchmarkId::new("cpu_normalize", &id),
|
|
&(dimension,),
|
|
|b, (dim,)| {
|
|
let mut v = vectors.clone();
|
|
b.iter(|| {
|
|
cpu_baseline::normalize_batch(black_box(&mut v), *dim)
|
|
})
|
|
},
|
|
);
|
|
}
|
|
|
|
group.finish();
|
|
}
|
|
|
|
// ==================== End-to-End Benchmarks ====================
|
|
|
|
fn e2e_similarity_search(c: &mut Criterion) {
|
|
let mut group = c.benchmark_group("e2e_search");
|
|
|
|
// Realistic similarity search scenario
|
|
let dimension = 384;
|
|
let num_candidates = 10000;
|
|
let top_k = 10;
|
|
|
|
let query: Vec<f32> = (0..dimension).map(|i| (i as f32) * 0.001).collect();
|
|
let candidates: Vec<Vec<f32>> = (0..num_candidates)
|
|
.map(|i| {
|
|
(0..dimension)
|
|
.map(|j| ((i * j) as f32).sin() * 0.1)
|
|
.collect()
|
|
})
|
|
.collect();
|
|
|
|
group.throughput(Throughput::Elements(num_candidates as u64));
|
|
|
|
// CPU: compute similarities and find top-k
|
|
group.bench_function("cpu_top_k", |b| {
|
|
b.iter(|| {
|
|
let sims = cpu_baseline::batch_cosine_similarity(black_box(&query), black_box(&candidates));
|
|
let mut indexed: Vec<(usize, f32)> = sims.into_iter().enumerate().collect();
|
|
indexed.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
|
|
indexed.truncate(top_k);
|
|
indexed
|
|
})
|
|
});
|
|
|
|
// GPU path
|
|
#[cfg(feature = "gpu")]
|
|
{
|
|
let refs: Vec<&[f32]> = candidates.iter().map(|v| v.as_slice()).collect();
|
|
group.bench_function("gpu_top_k", |b| {
|
|
b.iter(|| {
|
|
let sims = batch_cosine_similarity_gpu(black_box(&query), black_box(&refs));
|
|
let mut indexed: Vec<(usize, f32)> = sims.into_iter().enumerate().collect();
|
|
indexed.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
|
|
indexed.truncate(top_k);
|
|
indexed
|
|
})
|
|
});
|
|
}
|
|
|
|
group.finish();
|
|
}
|
|
|
|
// ==================== Memory Throughput Benchmarks ====================
|
|
|
|
fn memory_throughput(c: &mut Criterion) {
|
|
let mut group = c.benchmark_group("memory_throughput");
|
|
|
|
// Measure memory bandwidth with different sizes
|
|
for size_mb in [1, 10, 100].iter() {
|
|
let size = size_mb * 1024 * 1024 / 4; // Convert MB to f32 count
|
|
let data: Vec<f32> = (0..size).map(|i| i as f32).collect();
|
|
|
|
group.throughput(Throughput::Bytes((*size_mb * 1024 * 1024) as u64));
|
|
|
|
// Simple copy benchmark
|
|
group.bench_with_input(
|
|
BenchmarkId::new("copy", format!("{}MB", size_mb)),
|
|
&data,
|
|
|b, d| {
|
|
b.iter(|| {
|
|
let _copy: Vec<f32> = black_box(d).iter().copied().collect();
|
|
})
|
|
},
|
|
);
|
|
|
|
// Sum reduction benchmark
|
|
group.bench_with_input(
|
|
BenchmarkId::new("sum", format!("{}MB", size_mb)),
|
|
&data,
|
|
|b, d| {
|
|
b.iter(|| {
|
|
let sum: f32 = black_box(d).iter().sum();
|
|
sum
|
|
})
|
|
},
|
|
);
|
|
}
|
|
|
|
group.finish();
|
|
}
|
|
|
|
criterion_group!(
|
|
benches,
|
|
similarity_benchmarks,
|
|
pooling_benchmarks,
|
|
vector_ops_benchmarks,
|
|
e2e_similarity_search,
|
|
memory_throughput,
|
|
);
|
|
|
|
criterion_main!(benches);
|