Squashed 'vendor/ruvector/' content from commit b64c2172

git-subtree-dir: vendor/ruvector
git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
This commit is contained in:
ruv
2026-02-28 14:39:40 -05:00
commit d803bfe2b1
7854 changed files with 3522914 additions and 0 deletions

View File

@@ -0,0 +1,155 @@
//! Benchmarks for ONNX embedding generation
use criterion::{black_box, criterion_group, criterion_main, Criterion, BenchmarkId};
use std::cell::RefCell;
/// Benchmarks end-to-end embedding generation: a single sentence,
/// parameterized batch sizes, and a large 100-sentence batch.
fn embedding_benchmarks(c: &mut Criterion) {
    // These benchmarks need a tokio runtime for the async model load.
    // Run with: cargo bench --features benchmark
    let runtime = tokio::runtime::Runtime::new().unwrap();

    // Load the model exactly once; RefCell supplies the interior mutability
    // required because criterion closures capture by shared reference.
    let embedder = RefCell::new(runtime.block_on(async {
        ruvector_onnx_embeddings::Embedder::default_model()
            .await
            .expect("Failed to load model")
    }));

    let mut group = c.benchmark_group("embedding_generation");

    // One sentence at a time.
    group.bench_function("single_text", |b| {
        b.iter(|| {
            let _ = embedder
                .borrow_mut()
                .embed_one(black_box("This is a test sentence for benchmarking."));
        })
    });

    // Batches of increasing size.
    for &size in &[1, 8, 16, 32, 64] {
        let texts: Vec<String> = (0..size)
            .map(|i| format!("Benchmark sentence number {} for testing.", i))
            .collect();
        group.bench_with_input(BenchmarkId::new("batch", size), &texts, |b, texts| {
            b.iter(|| {
                let _ = embedder.borrow_mut().embed(black_box(texts));
            })
        });
    }

    // 100-sentence batch to exercise parallelism in the embedder.
    let large_batch: Vec<String> = (0..100)
        .map(|i| format!("Large batch sentence {} for parallel benchmark.", i))
        .collect();
    group.bench_function("batch_100", |b| {
        b.iter(|| {
            let _ = embedder.borrow_mut().embed(black_box(&large_batch));
        })
    });

    group.finish();
}
/// Benchmarks every pooling strategy over a synthetic batch of
/// token embeddings (32 sequences x 128 tokens x 384 dims).
fn pooling_benchmarks(c: &mut Criterion) {
    use ruvector_onnx_embeddings::{Pooler, PoolingStrategy};

    let mut group = c.benchmark_group("pooling");

    // Deterministic synthetic activations and an all-ones attention mask.
    let hidden_size = 384;
    let seq_length = 128;
    let batch_size = 32;

    let token_embeddings: Vec<Vec<f32>> = (0..batch_size)
        .map(|_| {
            (0..seq_length * hidden_size)
                .map(|i| (i as f32) * 0.001)
                .collect()
        })
        .collect();
    let attention_masks: Vec<Vec<i64>> = (0..batch_size)
        .map(|_| vec![1i64; seq_length])
        .collect();

    // One benchmark per strategy, labelled by its Debug name.
    let strategies = [
        PoolingStrategy::Mean,
        PoolingStrategy::Cls,
        PoolingStrategy::Max,
        PoolingStrategy::MeanSqrtLen,
    ];
    for strategy in strategies {
        let pooler = Pooler::new(strategy, true);
        group.bench_with_input(
            BenchmarkId::new("strategy", format!("{:?}", strategy)),
            &(&token_embeddings, &attention_masks),
            |b, (tokens, masks)| {
                b.iter(|| pooler.pool(black_box(tokens), black_box(masks), seq_length, hidden_size))
            },
        );
    }

    group.finish();
}
/// Benchmarks the pairwise similarity kernels (cosine, dot, euclidean)
/// on 384-d vectors, plus batch cosine against 1000 candidates.
fn similarity_benchmarks(c: &mut Criterion) {
    use ruvector_onnx_embeddings::Pooler;

    let mut group = c.benchmark_group("similarity");

    // Deterministic test vectors: an ascending and a descending ramp.
    let dim = 384;
    let vec_a: Vec<f32> = (0..dim).map(|i| (i as f32) * 0.01).collect();
    let vec_b: Vec<f32> = (0..dim).map(|i| ((dim - i) as f32) * 0.01).collect();

    group.bench_function("cosine_similarity_384d", |b| {
        b.iter(|| Pooler::cosine_similarity(black_box(&vec_a), black_box(&vec_b)))
    });

    group.bench_function("dot_product_384d", |b| {
        b.iter(|| Pooler::dot_product(black_box(&vec_a), black_box(&vec_b)))
    });

    group.bench_function("euclidean_distance_384d", |b| {
        b.iter(|| Pooler::euclidean_distance(black_box(&vec_a), black_box(&vec_b)))
    });

    // One query scored against 1000 candidates.
    let candidates: Vec<Vec<f32>> = (0..1000)
        .map(|i| (0..dim).map(|j| ((i + j) as f32) * 0.001).collect())
        .collect();

    group.bench_function("batch_cosine_1000", |b| {
        b.iter(|| {
            ruvector_onnx_embeddings::pooling::batch_cosine_similarity(
                black_box(&vec_a),
                black_box(&candidates),
            )
        })
    });

    group.finish();
}
// Register the embedding, pooling, and similarity benchmark groups and
// generate the criterion `main` entry point for this bench binary.
criterion_group!(
benches,
embedding_benchmarks,
pooling_benchmarks,
similarity_benchmarks
);
criterion_main!(benches);

View File

@@ -0,0 +1,313 @@
//! GPU Acceleration Benchmarks
//!
//! Benchmarks comparing CPU vs GPU performance for:
//! - Similarity computations
//! - Pooling operations
//! - Vector operations
use criterion::{black_box, criterion_group, criterion_main, Criterion, BenchmarkId, Throughput};
#[cfg(feature = "gpu")]
use ruvector_onnx_embeddings::gpu::{
GpuAccelerator, GpuConfig, GpuPooler, GpuSimilarity, GpuVectorOps,
batch_cosine_similarity_gpu, batch_dot_product_gpu, batch_euclidean_gpu,
};
/// CPU reference implementations the GPU benchmarks are compared against.
/// Kept deliberately plain — the straightforward loop shapes ARE the baseline
/// being measured, so they are not restyled or "optimized".
mod cpu_baseline {
use rayon::prelude::*;
/// Cosine similarity of `query` against every candidate, parallelized
/// across candidates with rayon.
pub fn batch_cosine_similarity(query: &[f32], candidates: &[Vec<f32>]) -> Vec<f32> {
candidates
.par_iter()
.map(|c| cosine_similarity(query, c))
.collect()
}
/// Scalar cosine similarity. Returns 0.0 when either vector has
/// (near-)zero norm, avoiding a division by zero.
/// NOTE(review): assumes `a` and `b` have equal length — `zip` silently
/// truncates to the shorter slice.
pub fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
if norm_a > 1e-12 && norm_b > 1e-12 {
dot / (norm_a * norm_b)
} else {
0.0
}
}
/// Masked mean pooling over flattened `[batch, seq, hidden]` token
/// embeddings: averages the hidden vectors at positions whose attention
/// mask is exactly 1. Batch rows whose mask is all zeros stay all-zero.
pub fn mean_pool(
tokens: &[f32],
mask: &[i64],
batch_size: usize,
seq_length: usize,
hidden_size: usize,
) -> Vec<f32> {
let mut output = vec![0.0f32; batch_size * hidden_size];
for batch_idx in 0..batch_size {
// Flat offsets of this batch row within each buffer.
let tokens_base = batch_idx * seq_length * hidden_size;
let mask_base = batch_idx * seq_length;
let out_base = batch_idx * hidden_size;
let mut count = 0.0f32;
for seq_idx in 0..seq_length {
if mask[mask_base + seq_idx] == 1 {
let start = tokens_base + seq_idx * hidden_size;
for j in 0..hidden_size {
output[out_base + j] += tokens[start + j];
}
count += 1.0;
}
}
// Divide only when at least one position was unmasked.
if count > 0.0 {
for j in 0..hidden_size {
output[out_base + j] /= count;
}
}
}
output
}
/// In-place L2 normalization of `vectors` interpreted as consecutive
/// `dimension`-length chunks; chunks with (near-)zero norm are left as-is.
pub fn normalize_batch(vectors: &mut [f32], dimension: usize) {
for chunk in vectors.chunks_mut(dimension) {
let norm: f32 = chunk.iter().map(|x| x * x).sum::<f32>().sqrt();
if norm > 1e-12 {
for val in chunk.iter_mut() {
*val /= norm;
}
}
}
}
}
// ==================== Similarity Benchmarks ====================
/// CPU vs GPU batch cosine similarity across several dimensions and
/// candidate-set sizes.
fn similarity_benchmarks(c: &mut Criterion) {
    let mut group = c.benchmark_group("similarity");

    for &dimension in &[128, 384, 768, 1536] {
        let query: Vec<f32> = (0..dimension).map(|i| (i as f32) * 0.001).collect();

        for &num_candidates in &[100, 1000, 10000] {
            // Deterministic candidate matrix for this (dim, n) combination.
            let candidates: Vec<Vec<f32>> = (0..num_candidates)
                .map(|i| {
                    (0..dimension)
                        .map(|j| ((i + j) as f32) * 0.0001)
                        .collect()
                })
                .collect();

            let id = format!("dim{}_n{}", dimension, num_candidates);
            group.throughput(Throughput::Elements(num_candidates as u64));

            // CPU baseline (rayon-parallel).
            group.bench_with_input(
                BenchmarkId::new("cpu_cosine", &id),
                &(&query, &candidates),
                |b, (q, c)| {
                    b.iter(|| cpu_baseline::batch_cosine_similarity(black_box(q), black_box(c)))
                },
            );

            // GPU implementation (falls back to rayon-parallel CPU when no
            // GPU is available).
            #[cfg(feature = "gpu")]
            {
                let refs: Vec<&[f32]> = candidates.iter().map(|v| v.as_slice()).collect();
                group.bench_with_input(
                    BenchmarkId::new("gpu_cosine", &id),
                    &(&query, &refs),
                    |b, (q, c)| {
                        b.iter(|| batch_cosine_similarity_gpu(black_box(q), black_box(c)))
                    },
                );
            }
        }
    }
    group.finish();
}
// ==================== Pooling Benchmarks ====================
/// Mean-pooling throughput across batch/sequence/hidden-size configurations
/// (CPU baseline only; GPU pooling pending backend support).
fn pooling_benchmarks(c: &mut Criterion) {
    let mut group = c.benchmark_group("gpu_pooling");

    let configs = [
        (1, 128, 384),
        (8, 128, 384),
        (32, 128, 384),
        (64, 256, 768),
        (128, 512, 384),
    ];

    for (batch_size, seq_length, hidden_size) in configs {
        // Synthetic activations plus a mask that zeroes the final 10
        // positions of each sequence.
        let tokens: Vec<f32> = (0..batch_size * seq_length * hidden_size)
            .map(|i| (i as f32) * 0.0001)
            .collect();
        let mask: Vec<i64> = (0..batch_size * seq_length)
            .map(|i| i64::from(i % seq_length < seq_length - 10))
            .collect();

        let id = format!("b{}_s{}_h{}", batch_size, seq_length, hidden_size);
        group.throughput(Throughput::Elements(batch_size as u64));

        // CPU baseline.
        group.bench_with_input(
            BenchmarkId::new("cpu_mean_pool", &id),
            &(&tokens, &mask, batch_size, seq_length, hidden_size),
            |b, (t, m, bs, sl, hs)| {
                b.iter(|| cpu_baseline::mean_pool(black_box(t), black_box(m), *bs, *sl, *hs))
            },
        );

        // GPU pooling will be benchmarked here once the full GPU backend lands.
    }
    group.finish();
}
// ==================== Vector Operations Benchmarks ====================
/// Benchmarks batch L2 normalization at several (vector count, dimension)
/// scales against the CPU baseline.
fn vector_ops_benchmarks(c: &mut Criterion) {
    let mut group = c.benchmark_group("vector_ops");

    for (num_vectors, dimension) in [
        (100, 384),
        (1000, 384),
        (10000, 384),
        (1000, 768),
        (1000, 1536),
    ] {
        // Template data; the template itself is never mutated (the original
        // `let mut vectors` triggered an unused-mut warning), each benchmark
        // normalizes its own clone.
        let vectors: Vec<f32> = (0..num_vectors * dimension)
            .map(|i| (i as f32) * 0.001)
            .collect();

        let id = format!("n{}_d{}", num_vectors, dimension);
        group.throughput(Throughput::Elements(num_vectors as u64));

        // CPU baseline. `v` is normalized in place, so iterations after the
        // first re-normalize already-unit vectors; the per-element work
        // (norm computation + division) is the same either way.
        group.bench_with_input(
            BenchmarkId::new("cpu_normalize", &id),
            &dimension,
            |b, &dim| {
                let mut v = vectors.clone();
                b.iter(|| cpu_baseline::normalize_batch(black_box(&mut v), dim))
            },
        );
    }
    group.finish();
}
// ==================== End-to-End Benchmarks ====================
/// End-to-end top-k similarity search over 10k 384-d candidates, comparing
/// the CPU baseline against the GPU path (when the `gpu` feature is enabled).
fn e2e_similarity_search(c: &mut Criterion) {
    let mut group = c.benchmark_group("e2e_search");

    // Realistic search scenario: one query, 10k candidates, keep the top 10.
    let dimension = 384;
    let num_candidates = 10000;
    let top_k = 10;

    let query: Vec<f32> = (0..dimension).map(|i| (i as f32) * 0.001).collect();
    let candidates: Vec<Vec<f32>> = (0..num_candidates)
        .map(|i| {
            (0..dimension)
                .map(|j| ((i * j) as f32).sin() * 0.1)
                .collect()
        })
        .collect();

    group.throughput(Throughput::Elements(num_candidates as u64));

    // CPU: score all candidates, sort descending, truncate to top-k.
    group.bench_function("cpu_top_k", |b| {
        b.iter(|| {
            let sims = cpu_baseline::batch_cosine_similarity(black_box(&query), black_box(&candidates));
            let mut indexed: Vec<(usize, f32)> = sims.into_iter().enumerate().collect();
            // total_cmp instead of partial_cmp().unwrap(): a NaN similarity
            // would otherwise panic mid-benchmark; ordering is identical for
            // non-NaN values.
            indexed.sort_by(|a, b| b.1.total_cmp(&a.1));
            indexed.truncate(top_k);
            indexed
        })
    });

    // GPU path (parallel-CPU fallback when no device is present).
    #[cfg(feature = "gpu")]
    {
        let refs: Vec<&[f32]> = candidates.iter().map(|v| v.as_slice()).collect();
        group.bench_function("gpu_top_k", |b| {
            b.iter(|| {
                let sims = batch_cosine_similarity_gpu(black_box(&query), black_box(&refs));
                let mut indexed: Vec<(usize, f32)> = sims.into_iter().enumerate().collect();
                indexed.sort_by(|a, b| b.1.total_cmp(&a.1));
                indexed.truncate(top_k);
                indexed
            })
        });
    }
    group.finish();
}
// ==================== Memory Throughput Benchmarks ====================
/// Rough memory-bandwidth probes: element-wise copy and sum reduction over
/// buffers of 1, 10, and 100 MB.
fn memory_throughput(c: &mut Criterion) {
    let mut group = c.benchmark_group("memory_throughput");

    for &size_mb in &[1, 10, 100] {
        // Number of f32 elements that fit in `size_mb` megabytes.
        let elem_count = size_mb * 1024 * 1024 / 4;
        let data: Vec<f32> = (0..elem_count).map(|i| i as f32).collect();

        group.throughput(Throughput::Bytes((size_mb * 1024 * 1024) as u64));

        // Straight element-wise copy into a fresh Vec.
        group.bench_with_input(
            BenchmarkId::new("copy", format!("{}MB", size_mb)),
            &data,
            |b, d| {
                b.iter(|| {
                    let _copy: Vec<f32> = black_box(d).iter().copied().collect();
                })
            },
        );

        // Full sum reduction over the buffer.
        group.bench_with_input(
            BenchmarkId::new("sum", format!("{}MB", size_mb)),
            &data,
            |b, d| b.iter(|| black_box(d).iter().sum::<f32>()),
        );
    }
    group.finish();
}
// Register all CPU-vs-GPU benchmark groups and generate the criterion
// `main` entry point for this bench binary.
criterion_group!(
benches,
similarity_benchmarks,
pooling_benchmarks,
vector_ops_benchmarks,
e2e_similarity_search,
memory_throughput,
);
criterion_main!(benches);