git-subtree-dir: vendor/ruvector git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
17 KiB
17 KiB
Benchmarking Plan
Overview
Comprehensive benchmarking strategy for ruvector-postgres covering micro-benchmarks, integration tests, comparison with competitors, and production workload simulation.
Benchmark Categories
1. Micro-Benchmarks
Test individual operations in isolation.
// benches/distance_bench.rs
use criterion::{criterion_group, criterion_main, Criterion, BenchmarkId};
/// Micro-benchmark: Euclidean distance across common embedding sizes,
/// comparing the scalar path against the SIMD implementations.
fn bench_euclidean_distance(c: &mut Criterion) {
    let mut group = c.benchmark_group("euclidean_distance");
    // Dimensions matching popular embedding models (e.g. 768, 1536).
    for &dim in &[128, 256, 512, 768, 1024, 1536] {
        // Fresh random operands per dimension; generated at runtime so the
        // compiler cannot constant-fold the distance computation.
        let lhs: Vec<f32> = (0..dim).map(|_| rand::random()).collect();
        let rhs: Vec<f32> = (0..dim).map(|_| rand::random()).collect();

        group.bench_with_input(BenchmarkId::new("scalar", dim), &dim, |b, _| {
            b.iter(|| euclidean_scalar(&lhs, &rhs))
        });
        group.bench_with_input(BenchmarkId::new("simd_auto", dim), &dim, |b, _| {
            b.iter(|| euclidean_simd(&lhs, &rhs))
        });

        #[cfg(target_arch = "x86_64")]
        {
            group.bench_with_input(BenchmarkId::new("avx2", dim), &dim, |b, _| {
                b.iter(|| unsafe { euclidean_avx2(&lhs, &rhs) })
            });
            // AVX-512 is only benchmarked when the host CPU reports support.
            if is_x86_feature_detected!("avx512f") {
                group.bench_with_input(BenchmarkId::new("avx512", dim), &dim, |b, _| {
                    b.iter(|| unsafe { euclidean_avx512(&lhs, &rhs) })
                });
            }
        }
    }
    group.finish();
}
/// Placeholder: same dimension sweep as `bench_euclidean_distance`, for cosine distance.
fn bench_cosine_distance(c: &mut Criterion) {
// Similar structure for cosine
}
/// Placeholder: same dimension sweep as `bench_euclidean_distance`, for dot product.
fn bench_dot_product(c: &mut Criterion) {
// Similar structure for dot product
}
// Register all distance micro-benchmarks with criterion's harness.
criterion_group!(
distance_benches,
bench_euclidean_distance,
bench_cosine_distance,
bench_dot_product
);
criterion_main!(distance_benches);
Expected Results: Distance Functions
| Operation | Dimension | Scalar (ns) | AVX2 (ns) | AVX-512 (ns) | Speedup |
|---|---|---|---|---|---|
| Euclidean | 128 | 180 | 45 | 28 | 6.4x |
| Euclidean | 768 | 980 | 210 | 125 | 7.8x |
| Euclidean | 1536 | 1950 | 420 | 245 | 8.0x |
| Cosine | 128 | 240 | 62 | 38 | 6.3x |
| Cosine | 768 | 1280 | 285 | 168 | 7.6x |
| Dot Product | 768 | 450 | 95 | 58 | 7.8x |
2. Index Benchmarks
// benches/index_bench.rs
// Benchmark: end-to-end HNSW index construction (insert-by-insert) for
// several corpus sizes and dimensionalities.
fn bench_hnsw_build(c: &mut Criterion) {
let sizes = [10_000, 100_000, 1_000_000];
let dims = [128, 768];
let mut group = c.benchmark_group("hnsw_build");
// Index builds are slow: take few samples over a long measurement window.
group.sample_size(10);
group.measurement_time(Duration::from_secs(30));
for size in sizes {
for dim in dims {
// Corpus is generated once per (size, dim) pair, outside the timed region.
let vectors = generate_random_vectors(size, dim);
group.bench_with_input(
BenchmarkId::new(format!("{}d", dim), size),
&(&vectors, dim),
|bench, (vecs, _)| {
// Each criterion iteration builds a fresh index from scratch, so the
// timing includes allocation plus every graph insertion.
bench.iter(|| {
let mut index = HnswIndex::new(HnswConfig {
m: 16,
ef_construction: 200,
..Default::default()
});
for (i, v) in vecs.iter().enumerate() {
index.insert(i as u64, v);
}
})
}
);
}
}
group.finish();
}
/// Benchmark: HNSW query latency on a 1M x 768d index, sweeping ef (search
/// beam width) and k (result count).
///
/// Note: each criterion iteration runs the whole 1000-query batch, so the
/// reported time is per-batch, not per-query.
fn bench_hnsw_search(c: &mut Criterion) {
    use std::hint::black_box;

    // Build once up front; only the searches are timed.
    let index = build_hnsw_index(1_000_000, 768);
    let queries = generate_random_vectors(1000, 768);
    let ef_values = [10, 50, 100, 200, 500];
    let k_values = [1, 10, 100];
    let mut group = c.benchmark_group("hnsw_search");
    for ef in ef_values {
        for k in k_values {
            group.bench_with_input(
                BenchmarkId::new(format!("ef{}_k{}", ef, k), "1M"),
                &(&index, &queries, ef, k),
                |bench, (idx, qs, ef, k)| {
                    bench.iter(|| {
                        for q in qs.iter() {
                            // black_box keeps the otherwise-discarded result
                            // observable so the search cannot be optimized away
                            // (the closure returns (), so criterion cannot do
                            // this for us).
                            black_box(idx.search(black_box(q), *k, *ef));
                        }
                    })
                },
            );
        }
    }
    group.finish();
}
/// Benchmark: IVFFlat query latency on a 1M x 768d index (1000 lists),
/// sweeping the number of probed lists — the recall/latency tradeoff knob.
///
/// Note: each criterion iteration runs the whole 1000-query batch.
fn bench_ivfflat_search(c: &mut Criterion) {
    use std::hint::black_box;

    let index = build_ivfflat_index(1_000_000, 768, 1000); // 1000 lists
    let queries = generate_random_vectors(1000, 768);
    let probe_values = [1, 5, 10, 20, 50];
    let mut group = c.benchmark_group("ivfflat_search");
    for probes in probe_values {
        group.bench_with_input(
            BenchmarkId::new(format!("probes{}", probes), "1M"),
            &probes,
            |bench, probes| {
                bench.iter(|| {
                    for q in queries.iter() {
                        // Keep results observable: they were previously dropped
                        // inside a ()-returning closure, letting the compiler
                        // dead-code-eliminate a pure search.
                        black_box(index.search(black_box(q), 10, *probes));
                    }
                })
            },
        );
    }
    group.finish();
}
Expected Results: Index Operations
| Index | Size | Build Time | Memory | Search (p50) | Search (p99) | Recall@10 |
|---|---|---|---|---|---|---|
| HNSW | 100K | 45s | 450MB | 0.8ms | 2.1ms | 0.98 |
| HNSW | 1M | 8min | 4.5GB | 1.2ms | 4.5ms | 0.97 |
| HNSW | 10M | 95min | 45GB | 2.1ms | 8.2ms | 0.96 |
| IVFFlat | 100K | 12s | 320MB | 1.5ms | 4.2ms | 0.92 |
| IVFFlat | 1M | 2min | 3.2GB | 3.2ms | 9.5ms | 0.91 |
| IVFFlat | 10M | 25min | 32GB | 8.5ms | 25ms | 0.89 |
3. Quantization Benchmarks
// benches/quantization_bench.rs
/// Times how long each supported quantization scheme takes to encode a
/// 100K x 768d corpus. criterion black-boxes the value returned from the
/// iter closure, so the construction cannot be optimized away.
fn bench_quantization_build(c: &mut Criterion) {
    let vectors = generate_random_vectors(100_000, 768);
    let mut group = c.benchmark_group("quantization_build");

    // 8-bit scalar quantization.
    group.bench_function("scalar_q8", |b| b.iter(|| ScalarQuantized::from_f32(&vectors)));
    // 1-bit (sign) quantization.
    group.bench_function("binary", |b| b.iter(|| BinaryQuantized::from_f32(&vectors)));
    // Product quantization: 96 subspaces x 256 centroids.
    group.bench_function("product_q", |b| b.iter(|| ProductQuantized::from_f32(&vectors, 96, 256)));

    group.finish();
}
// Benchmark: brute-force 1-NN scan over 1M x 768d vectors, comparing full
// f32 precision against each quantized representation. The min result is the
// closure's return value, so criterion black-boxes it for us.
fn bench_quantized_search(c: &mut Criterion) {
let vectors = generate_random_vectors(1_000_000, 768);
let query = generate_random_vectors(1, 768).pop().unwrap();
// Quantized copies are built once, outside the timed region.
let sq8 = ScalarQuantized::from_f32(&vectors);
let binary = BinaryQuantized::from_f32(&vectors);
// NOTE(review): `pq` is built but never searched below — presumably a
// product-quantization case is still to be added.
let pq = ProductQuantized::from_f32(&vectors, 96, 256);
let mut group = c.benchmark_group("quantized_search_1M");
// Baseline: exact f32 distances. partial_cmp().unwrap() is safe here only
// because random inputs cannot produce NaN distances.
group.bench_function("full_precision", |bench| {
bench.iter(|| {
vectors.iter()
.enumerate()
.map(|(i, v)| (i, euclidean_distance(&query, v)))
.min_by(|a, b| a.1.partial_cmp(&b.1).unwrap())
})
});
// 8-bit scalar quantization: distances computed against stored codes.
group.bench_function("scalar_q8", |bench| {
bench.iter(|| {
(0..vectors.len())
.map(|i| (i, sq8.distance(&query, i)))
.min_by(|a, b| a.1.partial_cmp(&b.1).unwrap())
})
});
// Binary codes: query quantized once (untimed), then compared via Hamming
// distance; integer distances give a total order, so plain cmp suffices.
group.bench_function("binary_hamming", |bench| {
let query_bits = binary.quantize_query(&query);
bench.iter(|| {
(0..vectors.len())
.map(|i| (i, binary.hamming_distance(&query_bits, i)))
.min_by(|a, b| a.1.cmp(&b.1))
})
});
group.finish();
}
Expected Results: Quantization
| Method | Memory (1M 768d) | Search Time | Recall Loss |
|---|---|---|---|
| Full Precision | 3GB | 850ms | 0% |
| Scalar Q8 | 750MB | 420ms | 1-2% |
| Binary | 94MB | 95ms | 5-10% |
| Product Q | 200MB | 180ms | 2-4% |
4. PostgreSQL Integration Benchmarks
-- Test setup script
CREATE EXTENSION ruvector;

-- Create test table
CREATE TABLE bench_vectors (
    id SERIAL PRIMARY KEY,
    embedding vector(768),
    category TEXT,
    created_at TIMESTAMP DEFAULT NOW()
);

-- Insert 1M random 768-dimensional vectors.
-- Fix: the previous `array_agg(random()) ... GROUP BY i` aggregates a single
-- value per group, producing a 1-element array that fails the vector(768)
-- cast. Build each row's vector from a 768-row series instead; the `WHERE
-- i = i` correlation forces the subquery (and its volatile random() calls)
-- to be re-evaluated per outer row rather than once as an InitPlan.
INSERT INTO bench_vectors (embedding, category)
SELECT
    (SELECT array_agg(random())
     FROM generate_series(1, 768) d
     WHERE i = i)::vector(768),
    'category_' || (i % 100)::text
FROM generate_series(1, 1000000) i;

-- Create indexes
CREATE INDEX ON bench_vectors USING hnsw (embedding vector_cosine_ops)
WITH (m = 16, ef_construction = 200);
CREATE INDEX ON bench_vectors USING ivfflat (embedding vector_cosine_ops)
WITH (lists = 1000);
-- Benchmark queries (replace each '[...]' with a real 768-d vector literal).
\timing on
-- Simple k-NN: top-10 by cosine distance; intended to hit the HNSW index.
EXPLAIN ANALYZE
SELECT id, embedding <=> '[...]'::vector AS distance
FROM bench_vectors
ORDER BY distance
LIMIT 10;
-- k-NN with filter: exercises post-filtering vs filtered index scans.
EXPLAIN ANALYZE
SELECT id, embedding <=> '[...]'::vector AS distance
FROM bench_vectors
WHERE category = 'category_42'
ORDER BY distance
LIMIT 10;
-- Batch search: several query vectors in one statement via CROSS JOIN
-- against an inline VALUES-style subquery of queries.
EXPLAIN ANALYZE
SELECT b.id, q.query_id,
b.embedding <=> q.embedding AS distance
FROM bench_vectors b
CROSS JOIN (
SELECT 1 AS query_id, '[...]'::vector AS embedding
UNION ALL
SELECT 2, '[...]'::vector
-- ... more queries
) q
ORDER BY q.query_id, distance
LIMIT 100;
5. Competitor Comparison
# benchmark_comparison.py
import time
import numpy as np
from typing import List, Tuple
# Test data
SIZES = [10_000, 100_000, 1_000_000]
DIMS = [128, 768, 1536]
K = 10
QUERIES = 1000
def run_pgvector_benchmark(conn, size, dim):
    """Benchmark pgvector: index build time and mean k-NN query latency.

    Args:
        conn: an open DB-API-style handle supporting .execute — TODO confirm
            whether this is a connection or cursor in the final harness.
        size: number of vectors to load (consumed by the bulk-insert step).
        dim: vector dimensionality.

    Returns:
        dict with 'build_time' (seconds) and 'search_time_ms' (mean ms/query).
    """
    # Setup: table plus an HNSW index with pgvector defaults.
    conn.execute(f"""
        CREATE TABLE pgvector_test (
            id SERIAL PRIMARY KEY,
            embedding vector({dim})
        );
        CREATE INDEX ON pgvector_test USING hnsw (embedding vector_cosine_ops);
    """)

    # Insert: time the bulk load, including index maintenance.
    start = time.time()
    # ... bulk insert
    build_time = time.time() - start

    # Search: average latency over QUERIES repeated k-NN probes.
    query = np.random.randn(dim).astype(np.float32)
    # Fix: a bare Python list binds as a PostgreSQL ARRAY, which the <=>
    # operator does not accept. Pass the textual vector literal and cast.
    query_literal = str(query.tolist())
    start = time.time()
    for _ in range(QUERIES):
        conn.execute(
            f"""
            SELECT id FROM pgvector_test
            ORDER BY embedding <=> %s::vector
            LIMIT {K}
            """,
            (query_literal,),
        )
    search_time = (time.time() - start) / QUERIES * 1000
    return {
        'build_time': build_time,
        'search_time_ms': search_time,
    }
def run_ruvector_benchmark(conn, size, dim):
    """Benchmark ruvector-postgres (same protocol as run_pgvector_benchmark)."""
    # Similar setup with ruvector
    pass

def run_pinecone_benchmark(index, size, dim):
    """Benchmark Pinecone (cloud); latencies include network round-trips."""
    pass

def run_qdrant_benchmark(client, size, dim):
    """Benchmark Qdrant through its client API."""
    pass

def run_milvus_benchmark(collection, size, dim):
    """Benchmark Milvus against a pre-created collection."""
    pass
# Run all benchmarks over the full size x dimension grid.
# NOTE(review): Pinecone is defined above but absent from this grid —
# presumably reported separately because of cloud/network latency; confirm.
results = {}
for size in SIZES:
    for dim in DIMS:
        results[(size, dim)] = {
            'pgvector': run_pgvector_benchmark(...),  # '...' = placeholder args
            'ruvector': run_ruvector_benchmark(...),
            'qdrant': run_qdrant_benchmark(...),
            'milvus': run_milvus_benchmark(...),
        }
# Generate comparison report
# Generate comparison report
Expected Comparison Results
| System | 1M Build | 1M Search (p50) | 1M Search (p99) | Memory | Recall@10 |
|---|---|---|---|---|---|
| ruvector-postgres | 5min | 0.9ms | 3.2ms | 4.2GB | 0.97 |
| pgvector | 12min | 2.1ms | 8.5ms | 4.8GB | 0.95 |
| Qdrant | 7min | 1.2ms | 4.1ms | 4.5GB | 0.96 |
| Milvus | 8min | 1.5ms | 5.2ms | 5.1GB | 0.96 |
| Pinecone (P1) | 3min* | 5ms* | 15ms* | N/A | 0.98 |
*Cloud latency includes network overhead
6. Stress Testing
#!/bin/bash
# stress_test.sh — sustained mixed-workload soak test.
#
# Fix: the monitoring loop used to sit *after* the foreground pgbench run, so
# it only started once the test had finished — and then spun forever. It now
# runs in the background for the duration of the test and is killed on exit.

# Configuration
DURATION=3600 # 1 hour
CONCURRENCY=100
QPS_TARGET=10000

# Start PostgreSQL with ruvector
pg_ctl start -D "$PGDATA"

# Collect engine + activity stats every 10s while the workload runs.
(
    while true; do
        psql -c "SELECT * FROM ruvector_stats();" >> stats.log
        psql -c "SELECT * FROM pg_stat_activity WHERE state = 'active';" >> activity.log
        sleep 10
    done
) &
MONITOR_PID=$!
trap 'kill "$MONITOR_PID" 2>/dev/null' EXIT

# Run pgbench-style workload, rate-limited to the target QPS.
pgbench -c $CONCURRENCY -j 10 -T $DURATION \
    -f stress_queries.sql \
    -P 10 \
    --rate=$QPS_TARGET \
    testdb
stress_queries.sql
-- Mixed workload for pgbench (\if/\elif require pgbench 11+).
-- Draw a number in [1,100] and pick a query class by cumulative weight.
\set query_type random(1, 100)
\if :query_type <= 60
-- 60% simple k-NN
SELECT id FROM vectors
ORDER BY embedding <=> :'random_vector'::vector
LIMIT 10;
\elif :query_type <= 80
-- 20% filtered k-NN
SELECT id FROM vectors
WHERE category = :'random_category'
ORDER BY embedding <=> :'random_vector'::vector
LIMIT 10;
\elif :query_type <= 90
-- 10% batch search
SELECT v.id, q.id as query_id
FROM vectors v, query_batch q
ORDER BY v.embedding <=> q.embedding
LIMIT 100;
\else
-- 10% insert
INSERT INTO vectors (embedding, category)
VALUES (:'random_vector'::vector, :'random_category');
\endif
7. Memory Benchmarks
// benches/memory_bench.rs
/// Memory-footprint report: raw f32 vectors vs HNSW overhead vs quantized
/// encodings, printed for several corpus sizes.
///
/// This is a printed report rather than a timed benchmark; the `&mut
/// Criterion` parameter is kept so it can be registered like the other benches.
fn bench_memory_footprint(c: &mut Criterion) {
    let _ = c; // intentionally unused: this "bench" only prints a report
    let sizes = [100_000, 1_000_000, 10_000_000];
    println!("\n=== Memory Footprint Analysis ===\n");
    for size in sizes {
        println!("Size: {} vectors", size);

        // Full-precision corpus: size * 768 dims * 4 bytes per f32.
        let vectors: Vec<Vec<f32>> = generate_random_vectors(size, 768);
        let raw_size = size * 768 * 4;
        println!("  Raw vectors: {} MB", raw_size / 1_000_000);

        // HNSW index. Fix: `insert` mutates the graph (see the build bench,
        // which binds its index `mut`), so the binding here must be `mut` —
        // the original immutable `let hnsw` would not compile.
        let mut hnsw = HnswIndex::new(HnswConfig::default());
        for (i, v) in vectors.iter().enumerate() {
            hnsw.insert(i as u64, v);
        }
        println!("  HNSW overhead: {} MB", hnsw.memory_usage() / 1_000_000);

        // Quantized encodings of the same corpus.
        let sq8 = ScalarQuantized::from_f32(&vectors);
        println!("  SQ8 size: {} MB", sq8.memory_usage() / 1_000_000);
        let binary = BinaryQuantized::from_f32(&vectors);
        println!("  Binary size: {} MB", binary.memory_usage() / 1_000_000);
        println!();
    }
}
8. Recall vs Latency Analysis
# recall_latency_analysis.py
import matplotlib.pyplot as plt
import numpy as np
def measure_recall_latency_tradeoff(index, queries, ground_truth, ef_values):
    """Measure recall vs latency for different ef values.

    Bug fix: the per-query search output was previously assigned to a variable
    named `results`, shadowing the accumulator list — so the summary dicts were
    appended to the *last search's result* instead of the returned list.

    Args:
        index: ANN index exposing search(query, k=..., ef=...) -> iterable of ids.
        queries: iterable of query vectors.
        ground_truth: ground_truth[i] is the true top-10 id list for queries[i].
        ef_values: ef (beam width) settings to sweep.

    Returns:
        One dict per ef value with 'ef', 'avg_latency' / 'p99_latency' (ms),
        and 'avg_recall' (recall@10).
    """
    import time  # stdlib; not imported at module level in this snippet

    results = []
    for ef in ef_values:
        latencies = []
        recalls = []
        for i, query in enumerate(queries):
            start = time.time()
            neighbors = index.search(query, k=10, ef=ef)
            latency = (time.time() - start) * 1000
            recall = len(set(neighbors) & set(ground_truth[i])) / 10
            latencies.append(latency)
            recalls.append(recall)
        results.append({
            'ef': ef,
            'avg_latency': np.mean(latencies),
            'p99_latency': np.percentile(latencies, 99),
            'avg_recall': np.mean(recalls),
        })
    return results
# Plot the recall/latency sweep.
# NOTE(review): assumes `results` holds the list returned by
# measure_recall_latency_tradeoff above, and that `time` is imported where
# the measurement actually runs — confirm when assembling the full script.
plt.figure(figsize=(10, 6))
plt.plot([r['avg_latency'] for r in results],
[r['avg_recall'] for r in results], 'b-o')
plt.xlabel('Latency (ms)')
plt.ylabel('Recall@10')
plt.title('Recall vs Latency Tradeoff')
plt.savefig('recall_latency.png')
Benchmark Automation
CI/CD Integration
# .github/workflows/benchmark.yml
# Runs the micro-benchmarks and PostgreSQL benchmarks on every push/PR to
# main, compares against a stored baseline, and archives the raw results.
name: Benchmarks
on:
  push:
    branches: [main]
  pull_request:
    branches: [main]
jobs:
  benchmark:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Install dependencies
        run: |
          # Refresh package lists and install non-interactively: a bare
          # `apt-get install` can fail against stale CI package indexes and
          # will wait on a confirmation prompt without -y.
          sudo apt-get update
          sudo apt-get install -y postgresql-16
          cargo install cargo-criterion
      - name: Run micro-benchmarks
        run: |
          cargo criterion --output-format json > bench_results.json
      - name: Run PostgreSQL benchmarks
        run: |
          ./scripts/run_pg_benchmarks.sh
      - name: Compare with baseline
        run: |
          # Fail the job if any metric regresses >10% versus the baseline.
          python scripts/compare_benchmarks.py \
            --baseline baseline.json \
            --current bench_results.json \
            --threshold 10
      - name: Upload results
        # upload-artifact v3 is deprecated and disabled on GitHub-hosted
        # runners; v4 is the supported release.
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-results
          path: bench_results.json
Benchmark Dashboard
-- Create benchmark results table: one row per (run, benchmark, metric).
CREATE TABLE benchmark_results (
id SERIAL PRIMARY KEY,
run_date TIMESTAMP DEFAULT NOW(),
git_commit TEXT,
benchmark_name TEXT,
metric_name TEXT,
value FLOAT,
unit TEXT,
metadata JSONB -- free-form run context (machine, config, flags)
);
-- Query for trend analysis: daily min/avg/max of the p50 search latency
-- over the last 30 days, grouped per benchmark.
SELECT
date_trunc('day', run_date) AS day,
benchmark_name,
AVG(value) AS avg_value,
MIN(value) AS min_value,
MAX(value) AS max_value
FROM benchmark_results
WHERE metric_name = 'search_latency_p50'
AND run_date > NOW() - INTERVAL '30 days'
GROUP BY 1, 2
ORDER BY 1, 2;
Reporting Format
Performance Report Template
# RuVector-Postgres Performance Report
**Date:** 2024-XX-XX
**Version:** 0.X.0
**Commit:** abc123
## Summary
- Overall performance: **X% faster** than pgvector
- Memory efficiency: **X% less** than competitors
- Recall@10: **0.97** (target: 0.95)
## Detailed Results
### Index Build Performance
| Size | HNSW Time | IVFFlat Time | Memory |
|------|-----------|--------------|--------|
| 100K | Xs | Xs | XMB |
| 1M | Xm | Xm | XGB |
### Search Latency (1M vectors, 768d)
| Metric | HNSW | IVFFlat | Target |
|--------|------|---------|--------|
| p50 | Xms | Xms | <2ms |
| p99 | Xms | Xms | <10ms |
| QPS | X | X | >5000 |
### Comparison with Competitors
[Charts and tables]
## Recommendations
1. For latency-sensitive workloads: Use HNSW with ef_search=64
2. For memory-constrained: Use IVFFlat with SQ8 quantization
3. For maximum throughput: Enable parallel search with 4 workers
Running Benchmarks
# Entry points for the benchmark suite (run from the repository root).

# Run all micro-benchmarks
cargo bench --features bench
# Run specific benchmark
cargo bench -- distance
# Run PostgreSQL benchmarks
./scripts/run_pg_benchmarks.sh
# Generate comparison report
python scripts/generate_report.py
# Quick smoke test
cargo bench -- --quick