Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'
This commit is contained in:
155
vendor/ruvector/examples/onnx-embeddings/benches/embedding_benchmark.rs
vendored
Normal file
155
vendor/ruvector/examples/onnx-embeddings/benches/embedding_benchmark.rs
vendored
Normal file
@@ -0,0 +1,155 @@
|
||||
//! Benchmarks for ONNX embedding generation
|
||||
|
||||
use criterion::{black_box, criterion_group, criterion_main, Criterion, BenchmarkId};
|
||||
use std::cell::RefCell;
|
||||
|
||||
fn embedding_benchmarks(c: &mut Criterion) {
|
||||
// Note: These benchmarks require the tokio runtime
|
||||
// Run with: cargo bench --features benchmark
|
||||
|
||||
let rt = tokio::runtime::Runtime::new().unwrap();
|
||||
|
||||
// Initialize embedder once (wrapped in RefCell for interior mutability)
|
||||
let embedder = RefCell::new(rt.block_on(async {
|
||||
ruvector_onnx_embeddings::Embedder::default_model()
|
||||
.await
|
||||
.expect("Failed to load model")
|
||||
}));
|
||||
|
||||
let mut group = c.benchmark_group("embedding_generation");
|
||||
|
||||
// Single text embedding
|
||||
group.bench_function("single_text", |b| {
|
||||
b.iter(|| {
|
||||
let _ = embedder.borrow_mut().embed_one(black_box("This is a test sentence for benchmarking."));
|
||||
});
|
||||
});
|
||||
|
||||
// Batch embedding at different sizes
|
||||
for size in [1, 8, 16, 32, 64].iter() {
|
||||
let texts: Vec<String> = (0..*size)
|
||||
.map(|i| format!("Benchmark sentence number {} for testing.", i))
|
||||
.collect();
|
||||
|
||||
group.bench_with_input(
|
||||
BenchmarkId::new("batch", size),
|
||||
&texts,
|
||||
|b, texts| {
|
||||
b.iter(|| {
|
||||
let _ = embedder.borrow_mut().embed(black_box(texts));
|
||||
});
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
// Large batch embedding
|
||||
let large_batch: Vec<String> = (0..100)
|
||||
.map(|i| format!("Large batch sentence {} for parallel benchmark.", i))
|
||||
.collect();
|
||||
|
||||
group.bench_function("batch_100", |b| {
|
||||
b.iter(|| {
|
||||
let _ = embedder.borrow_mut().embed(black_box(&large_batch));
|
||||
});
|
||||
});
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
fn pooling_benchmarks(c: &mut Criterion) {
|
||||
use ruvector_onnx_embeddings::{Pooler, PoolingStrategy};
|
||||
|
||||
let mut group = c.benchmark_group("pooling");
|
||||
|
||||
// Create test data
|
||||
let hidden_size = 384;
|
||||
let seq_length = 128;
|
||||
let batch_size = 32;
|
||||
|
||||
let token_embeddings: Vec<Vec<f32>> = (0..batch_size)
|
||||
.map(|_| {
|
||||
(0..seq_length * hidden_size)
|
||||
.map(|i| (i as f32) * 0.001)
|
||||
.collect()
|
||||
})
|
||||
.collect();
|
||||
|
||||
let attention_masks: Vec<Vec<i64>> = (0..batch_size)
|
||||
.map(|_| vec![1i64; seq_length])
|
||||
.collect();
|
||||
|
||||
for strategy in [
|
||||
PoolingStrategy::Mean,
|
||||
PoolingStrategy::Cls,
|
||||
PoolingStrategy::Max,
|
||||
PoolingStrategy::MeanSqrtLen,
|
||||
] {
|
||||
let pooler = Pooler::new(strategy, true);
|
||||
|
||||
group.bench_with_input(
|
||||
BenchmarkId::new("strategy", format!("{:?}", strategy)),
|
||||
&(&token_embeddings, &attention_masks),
|
||||
|b, (tokens, masks)| {
|
||||
b.iter(|| {
|
||||
pooler.pool(black_box(tokens), black_box(masks), seq_length, hidden_size)
|
||||
});
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
fn similarity_benchmarks(c: &mut Criterion) {
|
||||
use ruvector_onnx_embeddings::Pooler;
|
||||
|
||||
let mut group = c.benchmark_group("similarity");
|
||||
|
||||
// Create test vectors
|
||||
let dim = 384;
|
||||
let vec_a: Vec<f32> = (0..dim).map(|i| (i as f32) * 0.01).collect();
|
||||
let vec_b: Vec<f32> = (0..dim).map(|i| ((dim - i) as f32) * 0.01).collect();
|
||||
|
||||
group.bench_function("cosine_similarity_384d", |b| {
|
||||
b.iter(|| {
|
||||
Pooler::cosine_similarity(black_box(&vec_a), black_box(&vec_b))
|
||||
});
|
||||
});
|
||||
|
||||
group.bench_function("dot_product_384d", |b| {
|
||||
b.iter(|| {
|
||||
Pooler::dot_product(black_box(&vec_a), black_box(&vec_b))
|
||||
});
|
||||
});
|
||||
|
||||
group.bench_function("euclidean_distance_384d", |b| {
|
||||
b.iter(|| {
|
||||
Pooler::euclidean_distance(black_box(&vec_a), black_box(&vec_b))
|
||||
});
|
||||
});
|
||||
|
||||
// Batch similarity
|
||||
let candidates: Vec<Vec<f32>> = (0..1000)
|
||||
.map(|i| (0..dim).map(|j| ((i + j) as f32) * 0.001).collect())
|
||||
.collect();
|
||||
|
||||
group.bench_function("batch_cosine_1000", |b| {
|
||||
b.iter(|| {
|
||||
ruvector_onnx_embeddings::pooling::batch_cosine_similarity(
|
||||
black_box(&vec_a),
|
||||
black_box(&candidates),
|
||||
)
|
||||
});
|
||||
});
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
// Register all benchmark groups and generate the bench harness entry point.
criterion_group!(
    benches,
    embedding_benchmarks,
    pooling_benchmarks,
    similarity_benchmarks
);

criterion_main!(benches);
|
||||
313
vendor/ruvector/examples/onnx-embeddings/benches/gpu_benchmark.rs
vendored
Normal file
313
vendor/ruvector/examples/onnx-embeddings/benches/gpu_benchmark.rs
vendored
Normal file
@@ -0,0 +1,313 @@
|
||||
//! GPU Acceleration Benchmarks
|
||||
//!
|
||||
//! Benchmarks comparing CPU vs GPU performance for:
|
||||
//! - Similarity computations
|
||||
//! - Pooling operations
|
||||
//! - Vector operations
|
||||
|
||||
use criterion::{black_box, criterion_group, criterion_main, Criterion, BenchmarkId, Throughput};
|
||||
|
||||
#[cfg(feature = "gpu")]
|
||||
use ruvector_onnx_embeddings::gpu::{
|
||||
GpuAccelerator, GpuConfig, GpuPooler, GpuSimilarity, GpuVectorOps,
|
||||
batch_cosine_similarity_gpu, batch_dot_product_gpu, batch_euclidean_gpu,
|
||||
};
|
||||
|
||||
/// CPU baseline implementations for comparison
|
||||
mod cpu_baseline {
|
||||
use rayon::prelude::*;
|
||||
|
||||
pub fn batch_cosine_similarity(query: &[f32], candidates: &[Vec<f32>]) -> Vec<f32> {
|
||||
candidates
|
||||
.par_iter()
|
||||
.map(|c| cosine_similarity(query, c))
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
|
||||
let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
|
||||
let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
|
||||
let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
|
||||
if norm_a > 1e-12 && norm_b > 1e-12 {
|
||||
dot / (norm_a * norm_b)
|
||||
} else {
|
||||
0.0
|
||||
}
|
||||
}
|
||||
|
||||
pub fn mean_pool(
|
||||
tokens: &[f32],
|
||||
mask: &[i64],
|
||||
batch_size: usize,
|
||||
seq_length: usize,
|
||||
hidden_size: usize,
|
||||
) -> Vec<f32> {
|
||||
let mut output = vec![0.0f32; batch_size * hidden_size];
|
||||
|
||||
for batch_idx in 0..batch_size {
|
||||
let tokens_base = batch_idx * seq_length * hidden_size;
|
||||
let mask_base = batch_idx * seq_length;
|
||||
let out_base = batch_idx * hidden_size;
|
||||
|
||||
let mut count = 0.0f32;
|
||||
|
||||
for seq_idx in 0..seq_length {
|
||||
if mask[mask_base + seq_idx] == 1 {
|
||||
let start = tokens_base + seq_idx * hidden_size;
|
||||
for j in 0..hidden_size {
|
||||
output[out_base + j] += tokens[start + j];
|
||||
}
|
||||
count += 1.0;
|
||||
}
|
||||
}
|
||||
|
||||
if count > 0.0 {
|
||||
for j in 0..hidden_size {
|
||||
output[out_base + j] /= count;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
output
|
||||
}
|
||||
|
||||
pub fn normalize_batch(vectors: &mut [f32], dimension: usize) {
|
||||
for chunk in vectors.chunks_mut(dimension) {
|
||||
let norm: f32 = chunk.iter().map(|x| x * x).sum::<f32>().sqrt();
|
||||
if norm > 1e-12 {
|
||||
for val in chunk.iter_mut() {
|
||||
*val /= norm;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ==================== Similarity Benchmarks ====================
|
||||
|
||||
fn similarity_benchmarks(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("similarity");
|
||||
|
||||
// Test different dimensions
|
||||
for dimension in [128, 384, 768, 1536].iter() {
|
||||
let query: Vec<f32> = (0..*dimension).map(|i| (i as f32) * 0.001).collect();
|
||||
|
||||
// Test different candidate counts
|
||||
for num_candidates in [100, 1000, 10000].iter() {
|
||||
let candidates: Vec<Vec<f32>> = (0..*num_candidates)
|
||||
.map(|i| {
|
||||
(0..*dimension)
|
||||
.map(|j| ((i + j) as f32) * 0.0001)
|
||||
.collect()
|
||||
})
|
||||
.collect();
|
||||
|
||||
let id = format!("dim{}_n{}", dimension, num_candidates);
|
||||
|
||||
group.throughput(Throughput::Elements(*num_candidates as u64));
|
||||
|
||||
// CPU baseline
|
||||
group.bench_with_input(
|
||||
BenchmarkId::new("cpu_cosine", &id),
|
||||
&(&query, &candidates),
|
||||
|b, (q, c)| {
|
||||
b.iter(|| cpu_baseline::batch_cosine_similarity(black_box(q), black_box(c)))
|
||||
},
|
||||
);
|
||||
|
||||
// GPU implementation (uses rayon parallel CPU when GPU unavailable)
|
||||
#[cfg(feature = "gpu")]
|
||||
{
|
||||
let refs: Vec<&[f32]> = candidates.iter().map(|v| v.as_slice()).collect();
|
||||
group.bench_with_input(
|
||||
BenchmarkId::new("gpu_cosine", &id),
|
||||
&(&query, &refs),
|
||||
|b, (q, c)| {
|
||||
b.iter(|| batch_cosine_similarity_gpu(black_box(q), black_box(c)))
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
// ==================== Pooling Benchmarks ====================
|
||||
|
||||
fn pooling_benchmarks(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("gpu_pooling");
|
||||
|
||||
// Test different batch sizes and sequence lengths
|
||||
for (batch_size, seq_length, hidden_size) in [
|
||||
(1, 128, 384),
|
||||
(8, 128, 384),
|
||||
(32, 128, 384),
|
||||
(64, 256, 768),
|
||||
(128, 512, 384),
|
||||
] {
|
||||
let tokens: Vec<f32> = (0..batch_size * seq_length * hidden_size)
|
||||
.map(|i| (i as f32) * 0.0001)
|
||||
.collect();
|
||||
|
||||
let mask: Vec<i64> = (0..batch_size * seq_length)
|
||||
.map(|i| if i % seq_length < seq_length - 10 { 1 } else { 0 })
|
||||
.collect();
|
||||
|
||||
let id = format!("b{}_s{}_h{}", batch_size, seq_length, hidden_size);
|
||||
|
||||
group.throughput(Throughput::Elements(batch_size as u64));
|
||||
|
||||
// CPU baseline
|
||||
group.bench_with_input(
|
||||
BenchmarkId::new("cpu_mean_pool", &id),
|
||||
&(&tokens, &mask, batch_size, seq_length, hidden_size),
|
||||
|b, (t, m, bs, sl, hs)| {
|
||||
b.iter(|| {
|
||||
cpu_baseline::mean_pool(black_box(t), black_box(m), *bs, *sl, *hs)
|
||||
})
|
||||
},
|
||||
);
|
||||
|
||||
// Note: GPU pooling would be benchmarked here when full GPU backend is implemented
|
||||
}
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
// ==================== Vector Operations Benchmarks ====================
|
||||
|
||||
fn vector_ops_benchmarks(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("vector_ops");
|
||||
|
||||
// Test normalization at different scales
|
||||
for (num_vectors, dimension) in [
|
||||
(100, 384),
|
||||
(1000, 384),
|
||||
(10000, 384),
|
||||
(1000, 768),
|
||||
(1000, 1536),
|
||||
] {
|
||||
let mut vectors: Vec<f32> = (0..num_vectors * dimension)
|
||||
.map(|i| (i as f32) * 0.001)
|
||||
.collect();
|
||||
|
||||
let id = format!("n{}_d{}", num_vectors, dimension);
|
||||
|
||||
group.throughput(Throughput::Elements(num_vectors as u64));
|
||||
|
||||
// CPU baseline
|
||||
group.bench_with_input(
|
||||
BenchmarkId::new("cpu_normalize", &id),
|
||||
&(dimension,),
|
||||
|b, (dim,)| {
|
||||
let mut v = vectors.clone();
|
||||
b.iter(|| {
|
||||
cpu_baseline::normalize_batch(black_box(&mut v), *dim)
|
||||
})
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
// ==================== End-to-End Benchmarks ====================
|
||||
|
||||
fn e2e_similarity_search(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("e2e_search");
|
||||
|
||||
// Realistic similarity search scenario
|
||||
let dimension = 384;
|
||||
let num_candidates = 10000;
|
||||
let top_k = 10;
|
||||
|
||||
let query: Vec<f32> = (0..dimension).map(|i| (i as f32) * 0.001).collect();
|
||||
let candidates: Vec<Vec<f32>> = (0..num_candidates)
|
||||
.map(|i| {
|
||||
(0..dimension)
|
||||
.map(|j| ((i * j) as f32).sin() * 0.1)
|
||||
.collect()
|
||||
})
|
||||
.collect();
|
||||
|
||||
group.throughput(Throughput::Elements(num_candidates as u64));
|
||||
|
||||
// CPU: compute similarities and find top-k
|
||||
group.bench_function("cpu_top_k", |b| {
|
||||
b.iter(|| {
|
||||
let sims = cpu_baseline::batch_cosine_similarity(black_box(&query), black_box(&candidates));
|
||||
let mut indexed: Vec<(usize, f32)> = sims.into_iter().enumerate().collect();
|
||||
indexed.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
|
||||
indexed.truncate(top_k);
|
||||
indexed
|
||||
})
|
||||
});
|
||||
|
||||
// GPU path
|
||||
#[cfg(feature = "gpu")]
|
||||
{
|
||||
let refs: Vec<&[f32]> = candidates.iter().map(|v| v.as_slice()).collect();
|
||||
group.bench_function("gpu_top_k", |b| {
|
||||
b.iter(|| {
|
||||
let sims = batch_cosine_similarity_gpu(black_box(&query), black_box(&refs));
|
||||
let mut indexed: Vec<(usize, f32)> = sims.into_iter().enumerate().collect();
|
||||
indexed.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
|
||||
indexed.truncate(top_k);
|
||||
indexed
|
||||
})
|
||||
});
|
||||
}
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
// ==================== Memory Throughput Benchmarks ====================
|
||||
|
||||
fn memory_throughput(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("memory_throughput");
|
||||
|
||||
// Measure memory bandwidth with different sizes
|
||||
for size_mb in [1, 10, 100].iter() {
|
||||
let size = size_mb * 1024 * 1024 / 4; // Convert MB to f32 count
|
||||
let data: Vec<f32> = (0..size).map(|i| i as f32).collect();
|
||||
|
||||
group.throughput(Throughput::Bytes((*size_mb * 1024 * 1024) as u64));
|
||||
|
||||
// Simple copy benchmark
|
||||
group.bench_with_input(
|
||||
BenchmarkId::new("copy", format!("{}MB", size_mb)),
|
||||
&data,
|
||||
|b, d| {
|
||||
b.iter(|| {
|
||||
let _copy: Vec<f32> = black_box(d).iter().copied().collect();
|
||||
})
|
||||
},
|
||||
);
|
||||
|
||||
// Sum reduction benchmark
|
||||
group.bench_with_input(
|
||||
BenchmarkId::new("sum", format!("{}MB", size_mb)),
|
||||
&data,
|
||||
|b, d| {
|
||||
b.iter(|| {
|
||||
let sum: f32 = black_box(d).iter().sum();
|
||||
sum
|
||||
})
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
// Register all GPU-comparison benchmark groups and generate the bench
// harness entry point.
criterion_group!(
    benches,
    similarity_benchmarks,
    pooling_benchmarks,
    vector_ops_benchmarks,
    e2e_similarity_search,
    memory_throughput,
);

criterion_main!(benches);
|
||||
Reference in New Issue
Block a user