Squashed 'vendor/ruvector/' content from commit b64c2172

git-subtree-dir: vendor/ruvector
git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
This commit is contained in:
ruv
2026-02-28 14:39:40 -05:00
commit d803bfe2b1
7854 changed files with 3522914 additions and 0 deletions

View File

@@ -0,0 +1,155 @@
//! Benchmarks for ONNX embedding generation
use criterion::{black_box, criterion_group, criterion_main, Criterion, BenchmarkId};
use std::cell::RefCell;
/// Benchmarks end-to-end embedding generation: a single sentence,
/// parameterized batch sizes, and a large 100-sentence batch.
fn embedding_benchmarks(c: &mut Criterion) {
    // These benchmarks need a tokio runtime for the async model load.
    // Run with: cargo bench --features benchmark
    let runtime = tokio::runtime::Runtime::new().unwrap();

    // Load the model exactly once; RefCell supplies the interior mutability
    // required because criterion closures capture by shared reference.
    let embedder = RefCell::new(runtime.block_on(async {
        ruvector_onnx_embeddings::Embedder::default_model()
            .await
            .expect("Failed to load model")
    }));

    let mut group = c.benchmark_group("embedding_generation");

    // One sentence at a time.
    group.bench_function("single_text", |b| {
        b.iter(|| {
            let _ = embedder
                .borrow_mut()
                .embed_one(black_box("This is a test sentence for benchmarking."));
        })
    });

    // Batches of increasing size.
    for &size in &[1, 8, 16, 32, 64] {
        let texts: Vec<String> = (0..size)
            .map(|i| format!("Benchmark sentence number {} for testing.", i))
            .collect();
        group.bench_with_input(BenchmarkId::new("batch", size), &texts, |b, texts| {
            b.iter(|| {
                let _ = embedder.borrow_mut().embed(black_box(texts));
            })
        });
    }

    // 100-sentence batch to exercise parallelism in the embedder.
    let large_batch: Vec<String> = (0..100)
        .map(|i| format!("Large batch sentence {} for parallel benchmark.", i))
        .collect();
    group.bench_function("batch_100", |b| {
        b.iter(|| {
            let _ = embedder.borrow_mut().embed(black_box(&large_batch));
        })
    });

    group.finish();
}
/// Benchmarks every pooling strategy over a synthetic batch of
/// token embeddings (32 sequences x 128 tokens x 384 dims).
fn pooling_benchmarks(c: &mut Criterion) {
    use ruvector_onnx_embeddings::{Pooler, PoolingStrategy};

    let mut group = c.benchmark_group("pooling");

    // Deterministic synthetic activations and an all-ones attention mask.
    let hidden_size = 384;
    let seq_length = 128;
    let batch_size = 32;

    let token_embeddings: Vec<Vec<f32>> = (0..batch_size)
        .map(|_| {
            (0..seq_length * hidden_size)
                .map(|i| (i as f32) * 0.001)
                .collect()
        })
        .collect();
    let attention_masks: Vec<Vec<i64>> = (0..batch_size)
        .map(|_| vec![1i64; seq_length])
        .collect();

    // One benchmark per strategy, labelled by its Debug name.
    let strategies = [
        PoolingStrategy::Mean,
        PoolingStrategy::Cls,
        PoolingStrategy::Max,
        PoolingStrategy::MeanSqrtLen,
    ];
    for strategy in strategies {
        let pooler = Pooler::new(strategy, true);
        group.bench_with_input(
            BenchmarkId::new("strategy", format!("{:?}", strategy)),
            &(&token_embeddings, &attention_masks),
            |b, (tokens, masks)| {
                b.iter(|| pooler.pool(black_box(tokens), black_box(masks), seq_length, hidden_size))
            },
        );
    }

    group.finish();
}
/// Benchmarks the pairwise similarity kernels (cosine, dot, euclidean)
/// on 384-d vectors, plus batch cosine against 1000 candidates.
fn similarity_benchmarks(c: &mut Criterion) {
    use ruvector_onnx_embeddings::Pooler;

    let mut group = c.benchmark_group("similarity");

    // Deterministic test vectors: an ascending and a descending ramp.
    let dim = 384;
    let vec_a: Vec<f32> = (0..dim).map(|i| (i as f32) * 0.01).collect();
    let vec_b: Vec<f32> = (0..dim).map(|i| ((dim - i) as f32) * 0.01).collect();

    group.bench_function("cosine_similarity_384d", |b| {
        b.iter(|| Pooler::cosine_similarity(black_box(&vec_a), black_box(&vec_b)))
    });

    group.bench_function("dot_product_384d", |b| {
        b.iter(|| Pooler::dot_product(black_box(&vec_a), black_box(&vec_b)))
    });

    group.bench_function("euclidean_distance_384d", |b| {
        b.iter(|| Pooler::euclidean_distance(black_box(&vec_a), black_box(&vec_b)))
    });

    // One query scored against 1000 candidates.
    let candidates: Vec<Vec<f32>> = (0..1000)
        .map(|i| (0..dim).map(|j| ((i + j) as f32) * 0.001).collect())
        .collect();

    group.bench_function("batch_cosine_1000", |b| {
        b.iter(|| {
            ruvector_onnx_embeddings::pooling::batch_cosine_similarity(
                black_box(&vec_a),
                black_box(&candidates),
            )
        })
    });

    group.finish();
}
// Register the embedding, pooling, and similarity benchmark groups and
// generate the criterion `main` entry point for this bench binary.
criterion_group!(
benches,
embedding_benchmarks,
pooling_benchmarks,
similarity_benchmarks
);
criterion_main!(benches);

View File

@@ -0,0 +1,313 @@
//! GPU Acceleration Benchmarks
//!
//! Benchmarks comparing CPU vs GPU performance for:
//! - Similarity computations
//! - Pooling operations
//! - Vector operations
use criterion::{black_box, criterion_group, criterion_main, Criterion, BenchmarkId, Throughput};
#[cfg(feature = "gpu")]
use ruvector_onnx_embeddings::gpu::{
GpuAccelerator, GpuConfig, GpuPooler, GpuSimilarity, GpuVectorOps,
batch_cosine_similarity_gpu, batch_dot_product_gpu, batch_euclidean_gpu,
};
/// CPU reference implementations the GPU benchmarks are compared against.
/// Kept deliberately plain — the straightforward loop shapes ARE the baseline
/// being measured, so they are not restyled or "optimized".
mod cpu_baseline {
use rayon::prelude::*;
/// Cosine similarity of `query` against every candidate, parallelized
/// across candidates with rayon.
pub fn batch_cosine_similarity(query: &[f32], candidates: &[Vec<f32>]) -> Vec<f32> {
candidates
.par_iter()
.map(|c| cosine_similarity(query, c))
.collect()
}
/// Scalar cosine similarity. Returns 0.0 when either vector has
/// (near-)zero norm, avoiding a division by zero.
/// NOTE(review): assumes `a` and `b` have equal length — `zip` silently
/// truncates to the shorter slice.
pub fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
if norm_a > 1e-12 && norm_b > 1e-12 {
dot / (norm_a * norm_b)
} else {
0.0
}
}
/// Masked mean pooling over flattened `[batch, seq, hidden]` token
/// embeddings: averages the hidden vectors at positions whose attention
/// mask is exactly 1. Batch rows whose mask is all zeros stay all-zero.
pub fn mean_pool(
tokens: &[f32],
mask: &[i64],
batch_size: usize,
seq_length: usize,
hidden_size: usize,
) -> Vec<f32> {
let mut output = vec![0.0f32; batch_size * hidden_size];
for batch_idx in 0..batch_size {
// Flat offsets of this batch row within each buffer.
let tokens_base = batch_idx * seq_length * hidden_size;
let mask_base = batch_idx * seq_length;
let out_base = batch_idx * hidden_size;
let mut count = 0.0f32;
for seq_idx in 0..seq_length {
if mask[mask_base + seq_idx] == 1 {
let start = tokens_base + seq_idx * hidden_size;
for j in 0..hidden_size {
output[out_base + j] += tokens[start + j];
}
count += 1.0;
}
}
// Divide only when at least one position was unmasked.
if count > 0.0 {
for j in 0..hidden_size {
output[out_base + j] /= count;
}
}
}
output
}
/// In-place L2 normalization of `vectors` interpreted as consecutive
/// `dimension`-length chunks; chunks with (near-)zero norm are left as-is.
pub fn normalize_batch(vectors: &mut [f32], dimension: usize) {
for chunk in vectors.chunks_mut(dimension) {
let norm: f32 = chunk.iter().map(|x| x * x).sum::<f32>().sqrt();
if norm > 1e-12 {
for val in chunk.iter_mut() {
*val /= norm;
}
}
}
}
}
// ==================== Similarity Benchmarks ====================
/// CPU vs GPU batch cosine similarity across several dimensions and
/// candidate-set sizes.
fn similarity_benchmarks(c: &mut Criterion) {
    let mut group = c.benchmark_group("similarity");

    for &dimension in &[128, 384, 768, 1536] {
        let query: Vec<f32> = (0..dimension).map(|i| (i as f32) * 0.001).collect();

        for &num_candidates in &[100, 1000, 10000] {
            // Deterministic candidate matrix for this (dim, n) combination.
            let candidates: Vec<Vec<f32>> = (0..num_candidates)
                .map(|i| {
                    (0..dimension)
                        .map(|j| ((i + j) as f32) * 0.0001)
                        .collect()
                })
                .collect();

            let id = format!("dim{}_n{}", dimension, num_candidates);
            group.throughput(Throughput::Elements(num_candidates as u64));

            // CPU baseline (rayon-parallel).
            group.bench_with_input(
                BenchmarkId::new("cpu_cosine", &id),
                &(&query, &candidates),
                |b, (q, c)| {
                    b.iter(|| cpu_baseline::batch_cosine_similarity(black_box(q), black_box(c)))
                },
            );

            // GPU implementation (falls back to rayon-parallel CPU when no
            // GPU is available).
            #[cfg(feature = "gpu")]
            {
                let refs: Vec<&[f32]> = candidates.iter().map(|v| v.as_slice()).collect();
                group.bench_with_input(
                    BenchmarkId::new("gpu_cosine", &id),
                    &(&query, &refs),
                    |b, (q, c)| {
                        b.iter(|| batch_cosine_similarity_gpu(black_box(q), black_box(c)))
                    },
                );
            }
        }
    }
    group.finish();
}
// ==================== Pooling Benchmarks ====================
/// Mean-pooling throughput across batch/sequence/hidden-size configurations
/// (CPU baseline only; GPU pooling pending backend support).
fn pooling_benchmarks(c: &mut Criterion) {
    let mut group = c.benchmark_group("gpu_pooling");

    let configs = [
        (1, 128, 384),
        (8, 128, 384),
        (32, 128, 384),
        (64, 256, 768),
        (128, 512, 384),
    ];

    for (batch_size, seq_length, hidden_size) in configs {
        // Synthetic activations plus a mask that zeroes the final 10
        // positions of each sequence.
        let tokens: Vec<f32> = (0..batch_size * seq_length * hidden_size)
            .map(|i| (i as f32) * 0.0001)
            .collect();
        let mask: Vec<i64> = (0..batch_size * seq_length)
            .map(|i| i64::from(i % seq_length < seq_length - 10))
            .collect();

        let id = format!("b{}_s{}_h{}", batch_size, seq_length, hidden_size);
        group.throughput(Throughput::Elements(batch_size as u64));

        // CPU baseline.
        group.bench_with_input(
            BenchmarkId::new("cpu_mean_pool", &id),
            &(&tokens, &mask, batch_size, seq_length, hidden_size),
            |b, (t, m, bs, sl, hs)| {
                b.iter(|| cpu_baseline::mean_pool(black_box(t), black_box(m), *bs, *sl, *hs))
            },
        );

        // GPU pooling will be benchmarked here once the full GPU backend lands.
    }
    group.finish();
}
// ==================== Vector Operations Benchmarks ====================
/// Benchmarks batch L2 normalization at several (vector count, dimension)
/// scales against the CPU baseline.
fn vector_ops_benchmarks(c: &mut Criterion) {
    let mut group = c.benchmark_group("vector_ops");

    for (num_vectors, dimension) in [
        (100, 384),
        (1000, 384),
        (10000, 384),
        (1000, 768),
        (1000, 1536),
    ] {
        // Template data; the template itself is never mutated (the original
        // `let mut vectors` triggered an unused-mut warning), each benchmark
        // normalizes its own clone.
        let vectors: Vec<f32> = (0..num_vectors * dimension)
            .map(|i| (i as f32) * 0.001)
            .collect();

        let id = format!("n{}_d{}", num_vectors, dimension);
        group.throughput(Throughput::Elements(num_vectors as u64));

        // CPU baseline. `v` is normalized in place, so iterations after the
        // first re-normalize already-unit vectors; the per-element work
        // (norm computation + division) is the same either way.
        group.bench_with_input(
            BenchmarkId::new("cpu_normalize", &id),
            &dimension,
            |b, &dim| {
                let mut v = vectors.clone();
                b.iter(|| cpu_baseline::normalize_batch(black_box(&mut v), dim))
            },
        );
    }
    group.finish();
}
// ==================== End-to-End Benchmarks ====================
/// End-to-end top-k similarity search over 10k 384-d candidates, comparing
/// the CPU baseline against the GPU path (when the `gpu` feature is enabled).
fn e2e_similarity_search(c: &mut Criterion) {
    let mut group = c.benchmark_group("e2e_search");

    // Realistic search scenario: one query, 10k candidates, keep the top 10.
    let dimension = 384;
    let num_candidates = 10000;
    let top_k = 10;

    let query: Vec<f32> = (0..dimension).map(|i| (i as f32) * 0.001).collect();
    let candidates: Vec<Vec<f32>> = (0..num_candidates)
        .map(|i| {
            (0..dimension)
                .map(|j| ((i * j) as f32).sin() * 0.1)
                .collect()
        })
        .collect();

    group.throughput(Throughput::Elements(num_candidates as u64));

    // CPU: score all candidates, sort descending, truncate to top-k.
    group.bench_function("cpu_top_k", |b| {
        b.iter(|| {
            let sims = cpu_baseline::batch_cosine_similarity(black_box(&query), black_box(&candidates));
            let mut indexed: Vec<(usize, f32)> = sims.into_iter().enumerate().collect();
            // total_cmp instead of partial_cmp().unwrap(): a NaN similarity
            // would otherwise panic mid-benchmark; ordering is identical for
            // non-NaN values.
            indexed.sort_by(|a, b| b.1.total_cmp(&a.1));
            indexed.truncate(top_k);
            indexed
        })
    });

    // GPU path (parallel-CPU fallback when no device is present).
    #[cfg(feature = "gpu")]
    {
        let refs: Vec<&[f32]> = candidates.iter().map(|v| v.as_slice()).collect();
        group.bench_function("gpu_top_k", |b| {
            b.iter(|| {
                let sims = batch_cosine_similarity_gpu(black_box(&query), black_box(&refs));
                let mut indexed: Vec<(usize, f32)> = sims.into_iter().enumerate().collect();
                indexed.sort_by(|a, b| b.1.total_cmp(&a.1));
                indexed.truncate(top_k);
                indexed
            })
        });
    }
    group.finish();
}
// ==================== Memory Throughput Benchmarks ====================
/// Rough memory-bandwidth probes: element-wise copy and sum reduction over
/// buffers of 1, 10, and 100 MB.
fn memory_throughput(c: &mut Criterion) {
    let mut group = c.benchmark_group("memory_throughput");

    for &size_mb in &[1, 10, 100] {
        // Number of f32 elements that fit in `size_mb` megabytes.
        let elem_count = size_mb * 1024 * 1024 / 4;
        let data: Vec<f32> = (0..elem_count).map(|i| i as f32).collect();

        group.throughput(Throughput::Bytes((size_mb * 1024 * 1024) as u64));

        // Straight element-wise copy into a fresh Vec.
        group.bench_with_input(
            BenchmarkId::new("copy", format!("{}MB", size_mb)),
            &data,
            |b, d| {
                b.iter(|| {
                    let _copy: Vec<f32> = black_box(d).iter().copied().collect();
                })
            },
        );

        // Full sum reduction over the buffer.
        group.bench_with_input(
            BenchmarkId::new("sum", format!("{}MB", size_mb)),
            &data,
            |b, d| b.iter(|| black_box(d).iter().sum::<f32>()),
        );
    }
    group.finish();
}
// Register all CPU-vs-GPU benchmark groups and generate the criterion
// `main` entry point for this bench binary.
criterion_group!(
benches,
similarity_benchmarks,
pooling_benchmarks,
vector_ops_benchmarks,
e2e_similarity_search,
memory_throughput,
);
criterion_main!(benches);