# Agent 13: Performance Benchmark Suite
## Overview
A comprehensive, Criterion-based benchmark suite for measuring and tracking performance across all latent space operations, attention mechanisms, and search algorithms.
## 1. Criterion Benchmarks
### 1.1 Latency Benchmarks
Complete benchmark code for measuring operation latency across various dimensions and neighbor counts.
```rust
// benches/latency_benchmarks.rs
use criterion::{
black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput,
};
use ruvector::latent_space::{LatentSpace, LatentConfig, AttentionType};
use ruvector::metrics::DistanceMetric;
use rand::Rng;
/// Generate random embedding of specified dimension
fn random_embedding(dim: usize) -> Vec<f32> {
let mut rng = rand::thread_rng();
(0..dim).map(|_| rng.gen_range(-1.0..1.0)).collect()
}
/// Generate dataset of random embeddings
fn generate_dataset(num_vectors: usize, dim: usize) -> Vec<Vec<f32>> {
(0..num_vectors)
.map(|_| random_embedding(dim))
.collect()
}
/// Benchmark latent space creation
fn bench_latent_space_creation(c: &mut Criterion) {
let mut group = c.benchmark_group("latent_space_creation");
for dim in [64, 128, 256, 512, 1024].iter() {
group.bench_with_input(
BenchmarkId::from_parameter(dim),
dim,
|b, &dim| {
b.iter(|| {
let config = LatentConfig {
dimension: dim,
attention_type: AttentionType::Standard,
num_heads: 8,
distance_metric: DistanceMetric::Euclidean,
..Default::default()
};
black_box(LatentSpace::new(config))
});
},
);
}
group.finish();
}
/// Benchmark embedding addition
fn bench_add_embedding(c: &mut Criterion) {
let mut group = c.benchmark_group("add_embedding");
for dim in [64, 128, 256, 512, 1024].iter() {
let config = LatentConfig {
dimension: *dim,
attention_type: AttentionType::Standard,
num_heads: 8,
distance_metric: DistanceMetric::Euclidean,
..Default::default()
};
let mut space = LatentSpace::new(config).unwrap();
let embedding = random_embedding(*dim);
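        // Note: `space` is reused across iterations, so this measures inserts
        // into a progressively growing index rather than a fixed-size one.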
group.bench_with_input(
BenchmarkId::from_parameter(dim),
&embedding,
|b, emb| {
b.iter(|| {
black_box(space.add_embedding(emb.clone(), None))
});
},
);
}
group.finish();
}
/// Benchmark KNN search with varying neighbor counts
fn bench_knn_search(c: &mut Criterion) {
let mut group = c.benchmark_group("knn_search");
let dimensions = [128, 256, 512];
let neighbor_counts = [10, 50, 100, 500, 1000];
let dataset_size = 10000;
for &dim in &dimensions {
for &k in &neighbor_counts {
let config = LatentConfig {
dimension: dim,
attention_type: AttentionType::Standard,
num_heads: 8,
distance_metric: DistanceMetric::Euclidean,
..Default::default()
};
let mut space = LatentSpace::new(config).unwrap();
let dataset = generate_dataset(dataset_size, dim);
// Populate space
for emb in dataset.iter() {
space.add_embedding(emb.clone(), None).unwrap();
}
let query = random_embedding(dim);
group.throughput(Throughput::Elements(k as u64));
group.bench_with_input(
BenchmarkId::new(format!("dim_{}", dim), k),
&k,
|b, &neighbors| {
b.iter(|| {
black_box(space.knn_search(&query, neighbors))
});
},
);
}
}
group.finish();
}
/// Benchmark attention computation
fn bench_attention_computation(c: &mut Criterion) {
let mut group = c.benchmark_group("attention_computation");
let dimensions = [64, 128, 256, 512];
let attention_types = [
AttentionType::Standard,
AttentionType::Flash,
AttentionType::MultiHead { num_heads: 8 },
AttentionType::MoE { num_experts: 4 },
];
for &dim in &dimensions {
for attention_type in &attention_types {
let config = LatentConfig {
dimension: dim,
attention_type: attention_type.clone(),
num_heads: 8,
distance_metric: DistanceMetric::Euclidean,
..Default::default()
};
let mut space = LatentSpace::new(config).unwrap();
let embeddings = generate_dataset(100, dim);
for emb in embeddings.iter() {
space.add_embedding(emb.clone(), None).unwrap();
}
let query = random_embedding(dim);
group.bench_with_input(
BenchmarkId::new(format!("dim_{}", dim), format!("{:?}", attention_type)),
&query,
|b, q| {
b.iter(|| {
black_box(space.compute_attention(q))
});
},
);
}
}
group.finish();
}
criterion_group!(
latency_benches,
bench_latent_space_creation,
bench_add_embedding,
bench_knn_search,
bench_attention_computation
);
criterion_main!(latency_benches);
```
### 1.2 Throughput Benchmarks
Benchmark batch processing and parallel operations.
```rust
// benches/throughput_benchmarks.rs
use criterion::{
black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput,
};
use ruvector::latent_space::{LatentSpace, LatentConfig, AttentionType};
use ruvector::metrics::DistanceMetric;
use rand::Rng;

/// Generate a random embedding of the given dimension
fn random_embedding(dim: usize) -> Vec<f32> {
    let mut rng = rand::thread_rng();
    (0..dim).map(|_| rng.gen_range(-1.0..1.0)).collect()
}

/// Generate a dataset of random embeddings
fn generate_dataset(num_vectors: usize, dim: usize) -> Vec<Vec<f32>> {
    (0..num_vectors).map(|_| random_embedding(dim)).collect()
}
/// Benchmark batch embedding addition
fn bench_batch_add_embeddings(c: &mut Criterion) {
let mut group = c.benchmark_group("batch_add_embeddings");
let batch_sizes = [1, 8, 32, 128, 512];
let dim = 256;
for &batch_size in &batch_sizes {
let config = LatentConfig {
dimension: dim,
attention_type: AttentionType::Standard,
num_heads: 8,
distance_metric: DistanceMetric::Euclidean,
..Default::default()
};
let mut space = LatentSpace::new(config).unwrap();
let embeddings = generate_dataset(batch_size, dim);
group.throughput(Throughput::Elements(batch_size as u64));
group.bench_with_input(
BenchmarkId::from_parameter(batch_size),
&embeddings,
|b, embs| {
b.iter(|| {
for emb in embs {
black_box(space.add_embedding(emb.clone(), None));
}
});
},
);
}
group.finish();
}
/// Benchmark parallel KNN search
fn bench_parallel_knn_search(c: &mut Criterion) {
use rayon::prelude::*;
let mut group = c.benchmark_group("parallel_knn_search");
let query_counts = [1, 8, 32, 128];
let dim = 256;
let k = 100;
let dataset_size = 10000;
for &num_queries in &query_counts {
let config = LatentConfig {
dimension: dim,
attention_type: AttentionType::Standard,
num_heads: 8,
distance_metric: DistanceMetric::Euclidean,
..Default::default()
};
let mut space = LatentSpace::new(config).unwrap();
let dataset = generate_dataset(dataset_size, dim);
for emb in dataset.iter() {
space.add_embedding(emb.clone(), None).unwrap();
}
let queries: Vec<Vec<f32>> = (0..num_queries)
.map(|_| random_embedding(dim))
.collect();
group.throughput(Throughput::Elements(num_queries as u64));
group.bench_with_input(
BenchmarkId::from_parameter(num_queries),
&queries,
|b, qs| {
b.iter(|| {
let results: Vec<_> = qs.par_iter()
.map(|q| space.knn_search(q, k))
.collect();
black_box(results)
});
},
);
}
group.finish();
}
/// Benchmark batch attention computation
fn bench_batch_attention(c: &mut Criterion) {
let mut group = c.benchmark_group("batch_attention");
let batch_sizes = [8, 32, 128];
let dim = 256;
for &batch_size in &batch_sizes {
let config = LatentConfig {
dimension: dim,
attention_type: AttentionType::Flash,
num_heads: 8,
distance_metric: DistanceMetric::Euclidean,
..Default::default()
};
let mut space = LatentSpace::new(config).unwrap();
let embeddings = generate_dataset(1000, dim);
for emb in embeddings.iter() {
space.add_embedding(emb.clone(), None).unwrap();
}
let queries = generate_dataset(batch_size, dim);
group.throughput(Throughput::Elements(batch_size as u64));
group.bench_with_input(
BenchmarkId::from_parameter(batch_size),
&queries,
|b, qs| {
b.iter(|| {
for q in qs {
black_box(space.compute_attention(q));
}
});
},
);
}
group.finish();
}
criterion_group!(
throughput_benches,
bench_batch_add_embeddings,
bench_parallel_knn_search,
bench_batch_attention
);
criterion_main!(throughput_benches);
```
### 1.3 Memory Benchmarks
Track peak memory usage and allocation patterns.
```rust
// benches/memory_benchmarks.rs
use criterion::{
    black_box, criterion_group, criterion_main, BenchmarkId, Criterion,
};
use ruvector::latent_space::{LatentSpace, LatentConfig, AttentionType};
use ruvector::metrics::DistanceMetric;
use rand::Rng;
use std::alloc::{GlobalAlloc, Layout, System};
use std::sync::atomic::{AtomicUsize, Ordering};

/// Generate a random embedding of the given dimension
fn random_embedding(dim: usize) -> Vec<f32> {
    let mut rng = rand::thread_rng();
    (0..dim).map(|_| rng.gen_range(-1.0..1.0)).collect()
}
/// Custom allocator to track memory usage
struct TrackingAllocator;
static ALLOCATED: AtomicUsize = AtomicUsize::new(0);
static PEAK_ALLOCATED: AtomicUsize = AtomicUsize::new(0);
unsafe impl GlobalAlloc for TrackingAllocator {
unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
let size = layout.size();
let current = ALLOCATED.fetch_add(size, Ordering::SeqCst) + size;
// Update peak if necessary
let mut peak = PEAK_ALLOCATED.load(Ordering::SeqCst);
while current > peak {
match PEAK_ALLOCATED.compare_exchange(
peak,
current,
Ordering::SeqCst,
Ordering::SeqCst,
) {
Ok(_) => break,
Err(p) => peak = p,
}
}
System.alloc(layout)
}
unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
ALLOCATED.fetch_sub(layout.size(), Ordering::SeqCst);
System.dealloc(ptr, layout)
}
}
#[global_allocator]
static GLOBAL: TrackingAllocator = TrackingAllocator;
fn reset_memory_tracking() {
ALLOCATED.store(0, Ordering::SeqCst);
PEAK_ALLOCATED.store(0, Ordering::SeqCst);
}
fn get_peak_memory() -> usize {
PEAK_ALLOCATED.load(Ordering::SeqCst)
}
/// Benchmark memory usage for different dataset sizes
fn bench_memory_scaling(c: &mut Criterion) {
let mut group = c.benchmark_group("memory_scaling");
let dataset_sizes = [1000, 5000, 10000, 50000, 100000];
let dim = 256;
for &size in &dataset_sizes {
group.bench_with_input(
BenchmarkId::from_parameter(size),
&size,
|b, &num_vectors| {
b.iter_custom(|iters| {
let mut total_duration = std::time::Duration::ZERO;
for _ in 0..iters {
reset_memory_tracking();
let start = std::time::Instant::now();
let config = LatentConfig {
dimension: dim,
attention_type: AttentionType::Standard,
num_heads: 8,
distance_metric: DistanceMetric::Euclidean,
..Default::default()
};
let mut space = LatentSpace::new(config).unwrap();
for i in 0..num_vectors {
let emb = random_embedding(dim);
space.add_embedding(emb, Some(i as u64)).unwrap();
}
let elapsed = start.elapsed();
total_duration += elapsed;
let peak = get_peak_memory();
println!(
"Dataset size: {}, Peak memory: {} MB",
num_vectors,
peak / 1_000_000
);
black_box(space);
}
total_duration
});
},
);
}
group.finish();
}
/// Benchmark memory usage by dimension
fn bench_memory_by_dimension(c: &mut Criterion) {
let mut group = c.benchmark_group("memory_by_dimension");
let dimensions = [64, 128, 256, 512, 1024];
let num_vectors = 10000;
for &dim in &dimensions {
group.bench_with_input(
BenchmarkId::from_parameter(dim),
&dim,
|b, &dimension| {
b.iter_custom(|iters| {
let mut total_duration = std::time::Duration::ZERO;
for _ in 0..iters {
reset_memory_tracking();
let start = std::time::Instant::now();
let config = LatentConfig {
dimension,
attention_type: AttentionType::Standard,
num_heads: 8,
distance_metric: DistanceMetric::Euclidean,
..Default::default()
};
let mut space = LatentSpace::new(config).unwrap();
for i in 0..num_vectors {
let emb = random_embedding(dimension);
space.add_embedding(emb, Some(i as u64)).unwrap();
}
let elapsed = start.elapsed();
total_duration += elapsed;
let peak = get_peak_memory();
println!(
"Dimension: {}, Peak memory: {} MB",
dimension,
peak / 1_000_000
);
black_box(space);
}
total_duration
});
},
);
}
group.finish();
}
criterion_group!(
memory_benches,
bench_memory_scaling,
bench_memory_by_dimension
);
criterion_main!(memory_benches);
```
## 2. Benchmark Matrix
### 2.1 Complete Test Matrix Configuration
```rust
// benches/benchmark_matrix.rs
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use ruvector::latent_space::{LatentSpace, LatentConfig, AttentionType};
use ruvector::metrics::DistanceMetric;
use rand::Rng;

/// Generate a random embedding of the given dimension
fn random_embedding(dim: usize) -> Vec<f32> {
    let mut rng = rand::thread_rng();
    (0..dim).map(|_| rng.gen_range(-1.0..1.0)).collect()
}

/// Generate a dataset of random embeddings
fn generate_dataset(num_vectors: usize, dim: usize) -> Vec<Vec<f32>> {
    (0..num_vectors).map(|_| random_embedding(dim)).collect()
}
/// Comprehensive benchmark matrix
struct BenchmarkMatrix {
dimensions: Vec<usize>,
neighbors: Vec<usize>,
batch_sizes: Vec<usize>,
dataset_sizes: Vec<usize>,
}
impl Default for BenchmarkMatrix {
fn default() -> Self {
Self {
dimensions: vec![64, 128, 256, 512, 1024],
neighbors: vec![10, 50, 100, 500, 1000, 10000],
batch_sizes: vec![1, 8, 32, 128],
dataset_sizes: vec![1000, 5000, 10000, 50000],
}
}
}
impl BenchmarkMatrix {
/// Run complete benchmark matrix
fn run_complete_matrix(&self, c: &mut Criterion) {
for &dim in &self.dimensions {
for &k in &self.neighbors {
for &batch_size in &self.batch_sizes {
                    for &dataset_size in &self.dataset_sizes {
                        // Skip configurations where k exceeds the dataset size
                        if k > dataset_size {
                            continue;
                        }
self.bench_configuration(
c,
dim,
k,
batch_size,
dataset_size,
);
}
}
}
}
}
/// Benchmark specific configuration
fn bench_configuration(
&self,
c: &mut Criterion,
dim: usize,
k: usize,
batch_size: usize,
dataset_size: usize,
) {
let group_name = format!(
"matrix/dim_{}/k_{}/batch_{}/dataset_{}",
dim, k, batch_size, dataset_size
);
let mut group = c.benchmark_group(&group_name);
let config = LatentConfig {
dimension: dim,
attention_type: AttentionType::Standard,
num_heads: 8,
distance_metric: DistanceMetric::Euclidean,
..Default::default()
};
let mut space = LatentSpace::new(config).unwrap();
let dataset = generate_dataset(dataset_size, dim);
for emb in dataset.iter() {
space.add_embedding(emb.clone(), None).unwrap();
}
let queries = generate_dataset(batch_size, dim);
group.bench_function("knn_search", |b| {
b.iter(|| {
for query in &queries {
black_box(space.knn_search(query, k));
}
});
});
group.finish();
}
}
/// Run critical path benchmarks only (reduced matrix)
fn bench_critical_path(c: &mut Criterion) {
let matrix = BenchmarkMatrix {
dimensions: vec![128, 256, 512],
neighbors: vec![10, 100, 1000],
batch_sizes: vec![1, 32],
dataset_sizes: vec![10000],
};
matrix.run_complete_matrix(c);
}
criterion_group!(matrix_benches, bench_critical_path);
criterion_main!(matrix_benches);
```
### 2.2 Benchmark Matrix Results Format
Expected output format for tracking:
```toml
# benchmark_results.toml
[latency.dim_128.k_10]
mean = "45.2 µs"
std_dev = "2.1 µs"
median = "44.8 µs"
mad = "1.4 µs"
[latency.dim_256.k_100]
mean = "124.5 µs"
std_dev = "5.3 µs"
median = "123.1 µs"
mad = "3.2 µs"
[throughput.batch_32.dim_256]
throughput = "2.34 GiB/s"
ops_per_sec = "8542"
[memory.dataset_10000.dim_256]
peak_mb = "245"
average_mb = "198"
allocations = "12543"
```
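Producing this file can be plain string formatting; below is a minimal sketch for emitting one `[latency.*]` table (key names mirror the example above):
```rust
// Sketch: emit one latency entry in the results format shown above.
use std::fmt::Write;

fn write_latency_entry(
    out: &mut String,
    dim: usize,
    k: usize,
    mean_us: f64,
    std_dev_us: f64,
    median_us: f64,
    mad_us: f64,
) {
    writeln!(out, "[latency.dim_{}.k_{}]", dim, k).unwrap();
    writeln!(out, "mean = \"{:.1} µs\"", mean_us).unwrap();
    writeln!(out, "std_dev = \"{:.1} µs\"", std_dev_us).unwrap();
    writeln!(out, "median = \"{:.1} µs\"", median_us).unwrap();
    writeln!(out, "mad = \"{:.1} µs\"", mad_us).unwrap();
}
```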
## 3. Comparative Benchmarks
### 3.1 Attention Mechanism Comparison
```rust
// benches/attention_comparison.rs
use criterion::{
black_box, criterion_group, criterion_main, BenchmarkId, Criterion, PlotConfiguration,
};
use ruvector::latent_space::{LatentSpace, LatentConfig, AttentionType};
use ruvector::metrics::DistanceMetric;
use rand::Rng;

/// Generate a random embedding of the given dimension
fn random_embedding(dim: usize) -> Vec<f32> {
    let mut rng = rand::thread_rng();
    (0..dim).map(|_| rng.gen_range(-1.0..1.0)).collect()
}

/// Generate a dataset of random embeddings
fn generate_dataset(num_vectors: usize, dim: usize) -> Vec<Vec<f32>> {
    (0..num_vectors).map(|_| random_embedding(dim)).collect()
}
fn compare_attention_mechanisms(c: &mut Criterion) {
let mut group = c.benchmark_group("attention_comparison");
group.plot_config(PlotConfiguration::default().summary_scale(criterion::AxisScale::Logarithmic));
let dimensions = [128, 256, 512];
let sequence_lengths = [100, 500, 1000];
let attention_types = vec![
("Standard", AttentionType::Standard),
("Flash", AttentionType::Flash),
("MultiHead_4", AttentionType::MultiHead { num_heads: 4 }),
("MultiHead_8", AttentionType::MultiHead { num_heads: 8 }),
("MoE_2", AttentionType::MoE { num_experts: 2 }),
("MoE_4", AttentionType::MoE { num_experts: 4 }),
];
for &dim in &dimensions {
for &seq_len in &sequence_lengths {
for (name, attn_type) in &attention_types {
let config = LatentConfig {
dimension: dim,
attention_type: attn_type.clone(),
num_heads: 8,
distance_metric: DistanceMetric::Euclidean,
..Default::default()
};
let mut space = LatentSpace::new(config).unwrap();
let embeddings = generate_dataset(seq_len, dim);
for emb in embeddings.iter() {
space.add_embedding(emb.clone(), None).unwrap();
}
let query = random_embedding(dim);
group.bench_with_input(
BenchmarkId::new(
format!("{}@dim_{}", name, dim),
seq_len,
),
&query,
|b, q| {
b.iter(|| {
black_box(space.compute_attention(q))
});
},
);
}
}
}
group.finish();
}
/// Generate comparison report (the timings below are illustrative placeholders;
/// real numbers come from parsed Criterion output)
#[allow(dead_code)]
fn generate_comparison_report() {
use std::collections::HashMap;
let results = HashMap::from([
("Standard", vec![45.2, 124.5, 456.7]),
("Flash", vec![32.1, 89.3, 301.2]),
("MultiHead_8", vec![52.3, 142.1, 512.4]),
("MoE_4", vec![48.9, 128.7, 445.3]),
]);
println!("\n=== Attention Mechanism Comparison ===\n");
println!("{:<15} {:>12} {:>12} {:>12}", "Type", "128-dim", "256-dim", "512-dim");
println!("{:-<55}", "");
for (name, times) in results.iter() {
println!(
"{:<15} {:>10.1} µs {:>10.1} µs {:>10.1} µs",
name, times[0], times[1], times[2]
);
}
// Calculate speedup vs standard
if let Some(baseline) = results.get("Standard") {
println!("\n=== Speedup vs Standard ===\n");
for (name, times) in results.iter() {
if name != &"Standard" {
let speedups: Vec<f64> = times
.iter()
.zip(baseline.iter())
.map(|(t, b)| b / t)
.collect();
println!(
"{:<15} {:>10.2}x {:>10.2}x {:>10.2}x",
name, speedups[0], speedups[1], speedups[2]
);
}
}
}
}
criterion_group!(attention_benches, compare_attention_mechanisms);
criterion_main!(attention_benches);
```
### 3.2 Distance Metric Comparison
```rust
// benches/distance_comparison.rs
use criterion::{
black_box, criterion_group, criterion_main, BenchmarkId, Criterion,
};
use ruvector::latent_space::{LatentSpace, LatentConfig, AttentionType};
use ruvector::metrics::DistanceMetric;
use rand::Rng;

/// Generate a random embedding of the given dimension
fn random_embedding(dim: usize) -> Vec<f32> {
    let mut rng = rand::thread_rng();
    (0..dim).map(|_| rng.gen_range(-1.0..1.0)).collect()
}

/// Generate a dataset of random embeddings
fn generate_dataset(num_vectors: usize, dim: usize) -> Vec<Vec<f32>> {
    (0..num_vectors).map(|_| random_embedding(dim)).collect()
}
fn compare_distance_metrics(c: &mut Criterion) {
let mut group = c.benchmark_group("distance_comparison");
let dimensions = [128, 256, 512];
let metrics = vec![
("Euclidean", DistanceMetric::Euclidean),
("Cosine", DistanceMetric::Cosine),
("Manhattan", DistanceMetric::Manhattan),
("Hyperbolic", DistanceMetric::Hyperbolic { curvature: -1.0 }),
];
for &dim in &dimensions {
for (name, metric) in &metrics {
let config = LatentConfig {
dimension: dim,
attention_type: AttentionType::Standard,
num_heads: 8,
distance_metric: metric.clone(),
..Default::default()
};
let mut space = LatentSpace::new(config).unwrap();
let dataset = generate_dataset(10000, dim);
for emb in dataset.iter() {
space.add_embedding(emb.clone(), None).unwrap();
}
let query = random_embedding(dim);
group.bench_with_input(
BenchmarkId::new(format!("{}@dim_{}", name, dim), "knn_100"),
&query,
|b, q| {
b.iter(|| {
black_box(space.knn_search(q, 100))
});
},
);
}
}
group.finish();
}
criterion_group!(distance_benches, compare_distance_metrics);
criterion_main!(distance_benches);
```
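For reference, the compared metrics reduce to the standalone functions sketched below. This is illustrative only: the hyperbolic variant uses the standard Poincaré-ball distance at curvature −1, which may differ from ruvector's internal formulation, and `random_embedding` as defined can produce vectors outside the unit ball, so a real hyperbolic benchmark would first project its inputs into it.
```rust
// Reference implementations of the compared metrics (illustrative only).
fn euclidean(a: &[f32], b: &[f32]) -> f32 {
    a.iter().zip(b).map(|(x, y)| (x - y).powi(2)).sum::<f32>().sqrt()
}

fn cosine_distance(a: &[f32], b: &[f32]) -> f32 {
    let dot: f32 = a.iter().zip(b).map(|(x, y)| x * y).sum();
    let norm = |v: &[f32]| v.iter().map(|x| x * x).sum::<f32>().sqrt();
    1.0 - dot / (norm(a) * norm(b))
}

fn manhattan(a: &[f32], b: &[f32]) -> f32 {
    a.iter().zip(b).map(|(x, y)| (x - y).abs()).sum()
}

/// Poincaré-ball distance at curvature -1; inputs must have norm < 1.
fn hyperbolic(a: &[f32], b: &[f32]) -> f32 {
    let sq_norm = |v: &[f32]| v.iter().map(|x| x * x).sum::<f32>();
    let diff_sq: f32 = a.iter().zip(b).map(|(x, y)| (x - y).powi(2)).sum();
    let denom = (1.0 - sq_norm(a)) * (1.0 - sq_norm(b));
    (1.0 + 2.0 * diff_sq / denom).acosh()
}
```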
### 3.3 Standard vs Flash Attention
Detailed comparison focusing on Flash attention optimizations:
```rust
// benches/flash_attention_benchmark.rs
use criterion::{
black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput,
};
use ruvector::latent_space::{LatentSpace, LatentConfig, AttentionType};
use ruvector::metrics::DistanceMetric;
use rand::Rng;

/// Generate a random embedding of the given dimension
fn random_embedding(dim: usize) -> Vec<f32> {
    let mut rng = rand::thread_rng();
    (0..dim).map(|_| rng.gen_range(-1.0..1.0)).collect()
}

/// Generate a dataset of random embeddings
fn generate_dataset(num_vectors: usize, dim: usize) -> Vec<Vec<f32>> {
    (0..num_vectors).map(|_| random_embedding(dim)).collect()
}

// `reset_memory_tracking()` and `get_peak_memory()` refer to the tracking
// allocator in memory_benchmarks.rs; since each bench file is its own crate,
// in practice both targets would share them via a common support module.
fn bench_flash_vs_standard(c: &mut Criterion) {
let mut group = c.benchmark_group("flash_vs_standard");
// Test at various sequence lengths where Flash shines
let sequence_lengths = [256, 512, 1024, 2048, 4096];
let dim = 256;
let num_heads = 8;
for &seq_len in &sequence_lengths {
// Standard attention
{
let config = LatentConfig {
dimension: dim,
attention_type: AttentionType::Standard,
num_heads,
distance_metric: DistanceMetric::Euclidean,
..Default::default()
};
let mut space = LatentSpace::new(config).unwrap();
let embeddings = generate_dataset(seq_len, dim);
for emb in embeddings.iter() {
space.add_embedding(emb.clone(), None).unwrap();
}
let query = random_embedding(dim);
group.throughput(Throughput::Elements(seq_len as u64));
group.bench_with_input(
BenchmarkId::new("Standard", seq_len),
&query,
|b, q| {
b.iter(|| {
black_box(space.compute_attention(q))
});
},
);
}
// Flash attention
{
let config = LatentConfig {
dimension: dim,
attention_type: AttentionType::Flash,
num_heads,
distance_metric: DistanceMetric::Euclidean,
..Default::default()
};
let mut space = LatentSpace::new(config).unwrap();
let embeddings = generate_dataset(seq_len, dim);
for emb in embeddings.iter() {
space.add_embedding(emb.clone(), None).unwrap();
}
let query = random_embedding(dim);
group.throughput(Throughput::Elements(seq_len as u64));
group.bench_with_input(
BenchmarkId::new("Flash", seq_len),
&query,
|b, q| {
b.iter(|| {
black_box(space.compute_attention(q))
});
},
);
}
}
group.finish();
}
/// Memory comparison between Flash and Standard
fn bench_memory_flash_vs_standard(c: &mut Criterion) {
let mut group = c.benchmark_group("memory_flash_vs_standard");
let sequence_lengths = [512, 1024, 2048];
let dim = 256;
for &seq_len in &sequence_lengths {
group.bench_with_input(
BenchmarkId::new("Standard", seq_len),
&seq_len,
|b, &len| {
b.iter_custom(|iters| {
let mut total_duration = std::time::Duration::ZERO;
for _ in 0..iters {
reset_memory_tracking();
let start = std::time::Instant::now();
let config = LatentConfig {
dimension: dim,
attention_type: AttentionType::Standard,
num_heads: 8,
distance_metric: DistanceMetric::Euclidean,
..Default::default()
};
let mut space = LatentSpace::new(config).unwrap();
let embeddings = generate_dataset(len, dim);
for emb in embeddings.iter() {
space.add_embedding(emb.clone(), None).unwrap();
}
let query = random_embedding(dim);
black_box(space.compute_attention(&query));
total_duration += start.elapsed();
println!(
"Standard@{}: Peak {} MB",
len,
get_peak_memory() / 1_000_000
);
}
total_duration
});
},
);
group.bench_with_input(
BenchmarkId::new("Flash", seq_len),
&seq_len,
|b, &len| {
b.iter_custom(|iters| {
let mut total_duration = std::time::Duration::ZERO;
for _ in 0..iters {
reset_memory_tracking();
let start = std::time::Instant::now();
let config = LatentConfig {
dimension: dim,
attention_type: AttentionType::Flash,
num_heads: 8,
distance_metric: DistanceMetric::Euclidean,
..Default::default()
};
let mut space = LatentSpace::new(config).unwrap();
let embeddings = generate_dataset(len, dim);
for emb in embeddings.iter() {
space.add_embedding(emb.clone(), None).unwrap();
}
let query = random_embedding(dim);
black_box(space.compute_attention(&query));
total_duration += start.elapsed();
println!(
"Flash@{}: Peak {} MB",
len,
get_peak_memory() / 1_000_000
);
}
total_duration
});
},
);
}
group.finish();
}
criterion_group!(
flash_benches,
bench_flash_vs_standard,
bench_memory_flash_vs_standard
);
criterion_main!(flash_benches);
```
## 4. Regression Detection
### 4.1 Baseline Storage System
```rust
// benches/regression_detection.rs
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::fs;
use std::path::Path;
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchmarkBaseline {
pub name: String,
pub dimension: usize,
pub mean: f64,
pub std_dev: f64,
pub median: f64,
pub throughput: Option<f64>,
pub memory_peak_mb: Option<usize>,
pub timestamp: String,
pub git_commit: String,
}
#[derive(Debug, Serialize, Deserialize)]
pub struct BaselineCollection {
pub baselines: HashMap<String, BenchmarkBaseline>,
pub created_at: String,
pub updated_at: String,
}
impl BaselineCollection {
/// Load baselines from file
pub fn load<P: AsRef<Path>>(path: P) -> Result<Self, Box<dyn std::error::Error>> {
let content = fs::read_to_string(path)?;
let collection: BaselineCollection = serde_json::from_str(&content)?;
Ok(collection)
}
/// Save baselines to file
pub fn save<P: AsRef<Path>>(&self, path: P) -> Result<(), Box<dyn std::error::Error>> {
let content = serde_json::to_string_pretty(self)?;
fs::write(path, content)?;
Ok(())
}
/// Add or update baseline
pub fn update_baseline(&mut self, key: String, baseline: BenchmarkBaseline) {
self.baselines.insert(key, baseline);
self.updated_at = chrono::Utc::now().to_rfc3339();
}
/// Check for regression
pub fn check_regression(
&self,
key: &str,
current: &BenchmarkResult,
threshold_percent: f64,
) -> RegressionStatus {
if let Some(baseline) = self.baselines.get(key) {
let change_percent = ((current.mean - baseline.mean) / baseline.mean) * 100.0;
if change_percent > threshold_percent {
RegressionStatus::Regression {
baseline: baseline.mean,
current: current.mean,
change_percent,
}
} else if change_percent < -threshold_percent / 2.0 {
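                // Improvements are flagged at half the regression threshold so
                // genuine speedups surface while small fluctuations stay quiet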
RegressionStatus::Improvement {
baseline: baseline.mean,
current: current.mean,
change_percent: -change_percent,
}
} else {
RegressionStatus::NoChange
}
} else {
RegressionStatus::NewBenchmark
}
}
}
#[derive(Debug)]
pub enum RegressionStatus {
Regression {
baseline: f64,
current: f64,
change_percent: f64,
},
Improvement {
baseline: f64,
current: f64,
change_percent: f64,
},
NoChange,
NewBenchmark,
}
#[derive(Debug, Clone)]
pub struct BenchmarkResult {
pub name: String,
pub mean: f64,
pub std_dev: f64,
pub median: f64,
}
/// Get current git commit
fn get_git_commit() -> String {
use std::process::Command;
let output = Command::new("git")
.args(&["rev-parse", "HEAD"])
.output()
.expect("Failed to get git commit");
String::from_utf8_lossy(&output.stdout).trim().to_string()
}
/// Create baseline from current run
pub fn create_baseline(results: Vec<BenchmarkResult>) -> BaselineCollection {
let mut baselines = HashMap::new();
let commit = get_git_commit();
let timestamp = chrono::Utc::now().to_rfc3339();
for result in results {
let baseline = BenchmarkBaseline {
name: result.name.clone(),
dimension: 256, // Extract from name or pass explicitly
mean: result.mean,
std_dev: result.std_dev,
median: result.median,
throughput: None,
memory_peak_mb: None,
timestamp: timestamp.clone(),
git_commit: commit.clone(),
};
baselines.insert(result.name.clone(), baseline);
}
BaselineCollection {
baselines,
created_at: timestamp.clone(),
updated_at: timestamp,
}
}
```
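Tying these pieces together, a typical check might look like the sketch below. It uses only the types defined above; the `results` vector would come from the Criterion parser in Section 4.4.
```rust
// Sketch: create a baseline on first run, otherwise compare against it.
fn run_regression_check(
    results: Vec<BenchmarkResult>,
) -> Result<(), Box<dyn std::error::Error>> {
    let path = "benches/baselines.json";
    let collection = match BaselineCollection::load(path) {
        Ok(existing) => existing,
        Err(_) => {
            // No baseline yet: persist the current run and stop.
            create_baseline(results).save(path)?;
            return Ok(());
        }
    };
    for result in &results {
        match collection.check_regression(&result.name, result, 10.0) {
            RegressionStatus::Regression { change_percent, .. } => {
                eprintln!("{}: regressed by {:.1}%", result.name, change_percent);
            }
            RegressionStatus::Improvement { change_percent, .. } => {
                println!("{}: improved by {:.1}%", result.name, change_percent);
            }
            _ => {}
        }
    }
    Ok(())
}
```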
### 4.2 CI Integration Script
```bash
#!/bin/bash
# scripts/benchmark_ci.sh
set -e
BASELINE_FILE="benches/baselines.json"
RESULTS_FILE="target/criterion/results.json"
THRESHOLD=10.0 # 10% regression threshold
echo "Running benchmarks..."
cargo bench --bench latency_benchmarks -- --save-baseline current
echo "Comparing with baseline..."
if [ -f "$BASELINE_FILE" ]; then
    # `|| ...` keeps `set -e` from aborting before we can read the exit code
    REGRESSION_STATUS=0
    cargo run --bin compare_benchmarks -- \
        --baseline "$BASELINE_FILE" \
        --current "$RESULTS_FILE" \
        --threshold "$THRESHOLD" || REGRESSION_STATUS=$?
if [ $REGRESSION_STATUS -eq 1 ]; then
echo "❌ Performance regression detected!"
exit 1
elif [ $REGRESSION_STATUS -eq 2 ]; then
echo "✅ Performance improvement detected!"
else
echo "✅ No significant performance change"
fi
else
echo "No baseline found, creating new baseline..."
cp "$RESULTS_FILE" "$BASELINE_FILE"
fi
echo "Generating benchmark report..."
cargo run --bin benchmark_report -- \
--baseline "$BASELINE_FILE" \
--output "target/benchmark_report.md"
echo "Done!"
```
### 4.3 Threshold Configuration
```toml
# benches/regression_config.toml
[thresholds]
# Global default threshold (%)
default = 10.0
# Per-benchmark thresholds
[thresholds.latency]
knn_search = 5.0
add_embedding = 8.0
attention = 12.0
[thresholds.throughput]
batch_operations = 15.0
parallel_search = 10.0
[thresholds.memory]
peak_usage = 20.0
allocation_count = 25.0
[regression_actions]
# What to do on regression
fail_ci = true
create_issue = true
notify_team = true
[regression_actions.notifications]
slack_webhook = "${SLACK_WEBHOOK_URL}"
email = "dev-team@example.com"
[baseline_management]
# Auto-update baseline on main branch
auto_update_main = true
# Require manual approval for baseline updates
require_approval = false
# Keep history of baselines
keep_history = true
max_history_count = 50
```
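A loader for this file might look like the following sketch. It assumes the `toml` crate; struct names are illustrative, and only the `[thresholds]` tables are modeled since serde ignores the remaining sections by default.
```rust
// Sketch: deserialize regression_config.toml into typed thresholds.
use serde::Deserialize;
use std::collections::HashMap;

#[derive(Debug, Deserialize)]
struct RegressionConfig {
    thresholds: Thresholds,
}

#[derive(Debug, Deserialize)]
struct Thresholds {
    default: f64,
    latency: HashMap<String, f64>,
    throughput: HashMap<String, f64>,
    memory: HashMap<String, f64>,
}

impl Thresholds {
    /// Per-benchmark threshold, falling back to the global default.
    fn for_benchmark(&self, category: &str, name: &str) -> f64 {
        let table = match category {
            "latency" => &self.latency,
            "throughput" => &self.throughput,
            "memory" => &self.memory,
            _ => return self.default,
        };
        table.get(name).copied().unwrap_or(self.default)
    }
}

fn load_config(path: &str) -> Result<RegressionConfig, Box<dyn std::error::Error>> {
    Ok(toml::from_str(&std::fs::read_to_string(path)?)?)
}
```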
### 4.4 Benchmark Comparison Tool
```rust
// src/bin/compare_benchmarks.rs
use clap::Parser;
use ruvector_benches::regression::{BaselineCollection, BenchmarkResult, RegressionStatus};
use std::path::PathBuf;
#[derive(Parser)]
struct Args {
#[arg(long)]
baseline: PathBuf,
#[arg(long)]
current: PathBuf,
#[arg(long, default_value = "10.0")]
threshold: f64,
}
fn main() -> Result<(), Box<dyn std::error::Error>> {
let args = Args::parse();
let baselines = BaselineCollection::load(&args.baseline)?;
let current_results = load_current_results(&args.current)?;
let mut has_regression = false;
let mut has_improvement = false;
println!("\n=== Benchmark Comparison Report ===\n");
for result in current_results {
let status = baselines.check_regression(
&result.name,
&result,
args.threshold,
);
match status {
RegressionStatus::Regression {
baseline,
current,
change_percent,
} => {
has_regression = true;
println!(
"❌ REGRESSION: {}\n Baseline: {:.2} µs\n Current: {:.2} µs\n Change: +{:.1}%\n",
result.name, baseline, current, change_percent
);
}
RegressionStatus::Improvement {
baseline,
current,
change_percent,
} => {
has_improvement = true;
println!(
"✅ IMPROVEMENT: {}\n Baseline: {:.2} µs\n Current: {:.2} µs\n Change: -{:.1}%\n",
result.name, baseline, current, change_percent
);
}
RegressionStatus::NoChange => {
println!("➡️ NO CHANGE: {}\n", result.name);
}
RegressionStatus::NewBenchmark => {
println!("🆕 NEW: {}\n", result.name);
}
}
}
if has_regression {
std::process::exit(1);
} else if has_improvement {
std::process::exit(2);
} else {
std::process::exit(0);
}
}
fn load_current_results(path: &PathBuf) -> Result<Vec<BenchmarkResult>, Box<dyn std::error::Error>> {
// Parse Criterion JSON output
// Implementation depends on Criterion output format
Ok(vec![])
}
```
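The stub above can be filled in against Criterion's on-disk output. A minimal sketch follows, assuming Criterion 0.5's layout (`target/criterion/<benchmark>/new/estimates.json`, point estimates in nanoseconds); grouped benchmarks nest one directory deeper and would need a recursive walk.
```rust
// Sketch of load_current_results() against Criterion's estimates.json files.
// Walks one directory level only; see the caveats above.
use serde_json::Value;

fn load_current_results(
    path: &PathBuf,
) -> Result<Vec<BenchmarkResult>, Box<dyn std::error::Error>> {
    let mut results = Vec::new();
    for entry in std::fs::read_dir(path)? {
        let dir = entry?.path();
        let estimates = dir.join("new").join("estimates.json");
        if !estimates.is_file() {
            continue; // skips report/ and anything without fresh estimates
        }
        let json: Value = serde_json::from_str(&std::fs::read_to_string(&estimates)?)?;
        let point = |key: &str| json[key]["point_estimate"].as_f64().unwrap_or(f64::NAN);
        results.push(BenchmarkResult {
            name: dir.file_name().unwrap().to_string_lossy().into_owned(),
            // Criterion stores nanoseconds; convert to µs to match the report
            mean: point("mean") / 1_000.0,
            std_dev: point("std_dev") / 1_000.0,
            median: point("median") / 1_000.0,
        });
    }
    Ok(results)
}
```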
### 4.5 GitHub Actions Workflow
```yaml
# .github/workflows/benchmarks.yml
name: Performance Benchmarks
on:
pull_request:
branches: [ main ]
push:
branches: [ main ]
jobs:
benchmark:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
with:
fetch-depth: 0 # Fetch all history for baseline comparison
- name: Setup Rust
uses: actions-rs/toolchain@v1
with:
toolchain: stable
override: true
- name: Cache cargo registry
uses: actions/cache@v3
with:
path: ~/.cargo/registry
key: ${{ runner.os }}-cargo-registry-${{ hashFiles('**/Cargo.lock') }}
- name: Cache criterion results
uses: actions/cache@v3
with:
path: target/criterion
key: ${{ runner.os }}-criterion-${{ github.sha }}
restore-keys: |
${{ runner.os }}-criterion-
- name: Restore baseline
run: |
if [ -f benches/baselines.json ]; then
echo "Baseline found"
else
echo "No baseline, will create new one"
fi
- name: Run benchmarks
run: |
cargo bench --bench latency_benchmarks
cargo bench --bench throughput_benchmarks
cargo bench --bench memory_benchmarks
cargo bench --bench attention_comparison
- name: Compare with baseline
id: compare
run: |
chmod +x scripts/benchmark_ci.sh
./scripts/benchmark_ci.sh
continue-on-error: true
- name: Generate report
run: |
cargo run --bin benchmark_report -- \
--baseline benches/baselines.json \
--output target/benchmark_report.md
- name: Comment PR
if: github.event_name == 'pull_request'
uses: actions/github-script@v6
with:
script: |
const fs = require('fs');
const report = fs.readFileSync('target/benchmark_report.md', 'utf8');
github.rest.issues.createComment({
issue_number: context.issue.number,
owner: context.repo.owner,
repo: context.repo.repo,
body: report
});
- name: Update baseline on main
if: github.ref == 'refs/heads/main' && steps.compare.outcome == 'success'
run: |
cp target/criterion/results.json benches/baselines.json
git config user.name "github-actions[bot]"
git config user.email "github-actions[bot]@users.noreply.github.com"
git add benches/baselines.json
git commit -m "chore: update performance baselines [skip ci]"
git push
- name: Fail on regression
if: steps.compare.outcome == 'failure'
run: exit 1
- name: Upload benchmark results
uses: actions/upload-artifact@v3
with:
name: benchmark-results
path: |
target/criterion/
target/benchmark_report.md
```
## 5. Benchmark Organization
### 5.1 Directory Structure
```
benches/
├── latency_benchmarks.rs # Latency measurements
├── throughput_benchmarks.rs # Throughput measurements
├── memory_benchmarks.rs # Memory usage tracking
├── attention_comparison.rs # Attention mechanism comparison
├── distance_comparison.rs # Distance metric comparison
├── flash_attention_benchmark.rs # Flash vs Standard detailed
├── benchmark_matrix.rs # Complete test matrix
├── regression_detection.rs # Baseline & regression tools
├── baselines.json # Stored baselines
└── regression_config.toml # Threshold configuration
scripts/
├── benchmark_ci.sh # CI integration script
├── generate_report.sh # Report generation
└── update_baseline.sh # Baseline management
src/bin/
├── compare_benchmarks.rs # Comparison tool
└── benchmark_report.rs # Report generator
```
### 5.2 Cargo.toml Configuration
```toml
# The comparison/report binaries in src/bin/ need these as regular dependencies
[dependencies]
clap = { version = "4", features = ["derive"] }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
chrono = "0.4"

[dev-dependencies]
criterion = { version = "0.5", features = ["html_reports"] }
rand = "0.8"
rayon = "1.7"
[[bench]]
name = "latency_benchmarks"
harness = false
[[bench]]
name = "throughput_benchmarks"
harness = false
[[bench]]
name = "memory_benchmarks"
harness = false
[[bench]]
name = "attention_comparison"
harness = false
[[bench]]
name = "distance_comparison"
harness = false
[[bench]]
name = "flash_attention_benchmark"
harness = false
[[bench]]
name = "benchmark_matrix"
harness = false
[profile.bench]
opt-level = 3
lto = true
codegen-units = 1
```
## Summary
This benchmark suite provides:
1. **Comprehensive Coverage**:
- Latency benchmarks across all dimensions
- Throughput measurements for batch operations
- Memory usage tracking and profiling
- Full test matrix with 4D parameter space
2. **Comparative Analysis**:
- Attention mechanism comparison (Standard, Flash, Multi-Head, MoE)
- Distance metric comparison (Euclidean, Cosine, Manhattan, Hyperbolic)
- Detailed Flash vs Standard analysis
3. **Regression Detection**:
- Baseline storage and versioning
- Automated comparison with configurable thresholds
- CI/CD integration with GitHub Actions
- Automatic PR comments with benchmark results
4. **Production Ready**:
- Complete Criterion integration
- Structured result storage
- Automated reporting
- Performance tracking over time
Run benchmarks with:
```bash
# Run all benchmarks
cargo bench
# Run specific benchmark suite
cargo bench --bench latency_benchmarks
# Compare with baseline
./scripts/benchmark_ci.sh
# Generate report
cargo run --bin benchmark_report
```