Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'

This commit is contained in:
ruv
2026-02-28 14:39:40 -05:00
7854 changed files with 3522914 additions and 0 deletions

View File

@@ -0,0 +1,99 @@
[package]
name = "ruvector-bench"
version.workspace = true
edition.workspace = true
rust-version.workspace = true
license.workspace = true
authors.workspace = true
repository.workspace = true
readme = "README.md"
description = "Comprehensive benchmarking suite for Ruvector"
# Internal benchmarking crate — never published to crates.io.
publish = false

# One binary per benchmark scenario; see README.md for usage of each.
[[bin]]
name = "ann-benchmark"
path = "src/bin/ann_benchmark.rs"

[[bin]]
name = "agenticdb-benchmark"
path = "src/bin/agenticdb_benchmark.rs"

[[bin]]
name = "latency-benchmark"
path = "src/bin/latency_benchmark.rs"

[[bin]]
name = "memory-benchmark"
path = "src/bin/memory_benchmark.rs"

[[bin]]
name = "comparison-benchmark"
path = "src/bin/comparison_benchmark.rs"

[[bin]]
name = "profiling-benchmark"
path = "src/bin/profiling_benchmark.rs"

[dependencies]
# Workspace crates under test
ruvector-core = { path = "../ruvector-core" }
ruvector-mincut = { path = "../ruvector-mincut", features = ["canonical"] }
ruvector-coherence = { path = "../ruvector-coherence", features = ["spectral"] }
ruvector-cognitive-container = { path = "../ruvector-cognitive-container" }
cognitum-gate-kernel = { path = "../cognitum-gate-kernel", default-features = true, features = ["canonical-witness"] }
# Benchmarking
criterion = { workspace = true }
# CLI
clap = { workspace = true }
indicatif = { workspace = true }
console = { workspace = true }
# Data
rand = { workspace = true }
rand_distr = { workspace = true }
# Performance
rayon = { workspace = true }
# Serialization
serde = { workspace = true }
serde_json = { workspace = true }
# Error handling
anyhow = { workspace = true }
thiserror = { workspace = true }
# Statistics and analysis
hdrhistogram = "7.5"
statistical = "1.0"
# Visualization
plotters = "0.3"
tabled = "0.16"
# Dataset loading
hdf5 = { version = "0.8", optional = true }
byteorder = "1.5"
# Memory profiling
sysinfo = "0.31"
jemalloc-ctl = { version = "0.5", optional = true }
# Profiling
pprof = { version = "0.13", features = ["flamegraph", "criterion"], optional = true }
# Async
tokio = { workspace = true }
# Timing
instant = "0.1"
chrono = "0.4"
# Testing utilities
tempfile = "3.13"

[features]
default = []
# Load real ANN-Benchmarks datasets from HDF5 files (requires libhdf5).
hdf5-datasets = ["hdf5"]
# CPU flamegraphs (pprof) and allocator introspection (jemalloc-ctl).
profiling = ["pprof", "jemalloc-ctl"]

View File

@@ -0,0 +1,684 @@
# Ruvector-Bench
[![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)
[![Rust](https://img.shields.io/badge/rust-1.77%2B-orange.svg)](https://www.rust-lang.org)
**Comprehensive benchmarking suite for measuring Ruvector performance across different operations and configurations.**
> Professional-grade performance testing tools for validating sub-millisecond vector search, HNSW optimization, quantization efficiency, and cross-system comparisons. Built for developers who demand data-driven insights.
## 🎯 Overview
The `ruvector-bench` crate provides a complete benchmarking infrastructure to measure and analyze Ruvector's performance characteristics. It includes standardized test suites compatible with [ann-benchmarks.com](http://ann-benchmarks.com), comprehensive latency profiling, memory usage analysis, and cross-system performance comparison tools.
### Key Features
- ✅ **ANN-Benchmarks Compatible**: Standard datasets (SIFT1M, GIST1M, Deep1M) and metrics
- 📊 **Latency Profiling**: High-precision measurement of p50, p95, p99, p99.9 percentiles
- 💾 **Memory Analysis**: Track memory usage with quantization and optimization techniques
- 🔬 **AgenticDB Workloads**: Simulate real-world AI agent memory patterns
- 🏆 **Cross-System Comparison**: Compare against Python baselines and other vector databases
- 📈 **Comprehensive Reporting**: JSON, CSV, and Markdown output formats
- 🔥 **Performance Profiling**: CPU flamegraphs and memory profiling support
## 📦 Installation
Add to your `Cargo.toml`:
```toml
[dev-dependencies]
# Pick exactly ONE of the variants below — TOML rejects duplicate keys in a table.
ruvector-bench = { path = "../ruvector-bench" }
# Optional: Enable profiling features
# ruvector-bench = { path = "../ruvector-bench", features = ["profiling"] }
# Optional: Enable HDF5 dataset loading
# ruvector-bench = { path = "../ruvector-bench", features = ["hdf5-datasets"] }
```
## 🚀 Available Benchmarks
The suite includes 6 specialized benchmark binaries:
| Benchmark | Purpose | Metrics |
|-----------|---------|---------|
| **ann-benchmark** | ANN-Benchmarks compatibility | QPS, latency, recall@k, memory |
| **agenticdb-benchmark** | AI agent memory workloads | Insert/search/update latency, memory |
| **latency-benchmark** | Detailed latency profiling | p50/p95/p99/p99.9 latencies |
| **memory-benchmark** | Memory usage analysis | Memory per vector, quantization savings |
| **comparison-benchmark** | Cross-system performance | Ruvector vs baselines (10-100x faster) |
| **profiling-benchmark** | CPU/memory profiling | Flamegraphs, allocation tracking |
## ⚡ Quick Start
### Running Basic Benchmarks
```bash
# Run ANN-Benchmarks suite with default settings
cargo run --bin ann-benchmark --release
# Run with custom parameters
cargo run --bin ann-benchmark --release -- \
--num-vectors 100000 \
--dimensions 384 \
--ef-search-values 50,100,200 \
--output bench_results
# Run latency profiling
cargo run --bin latency-benchmark --release
# Run AgenticDB workload simulation
cargo run --bin agenticdb-benchmark --release
# Run cross-system comparison
cargo run --bin comparison-benchmark --release
```
### Running with Profiling
```bash
# Build with profiling enabled
cargo build --bin profiling-benchmark --release --features profiling
# Run and generate flamegraph
cargo run --bin profiling-benchmark --release --features profiling -- \
--enable-flamegraph \
--output profiling_results
```
## 📊 Benchmark Categories
### 1. ANN-Benchmarks Suite (`ann-benchmark`)
Standard benchmarking compatible with [ann-benchmarks.com](http://ann-benchmarks.com) methodology.
**Supported Datasets:**
- **SIFT1M**: 1M vectors, 128 dimensions (image descriptors)
- **GIST1M**: 1M vectors, 960 dimensions (scene recognition)
- **Deep1M**: 1M vectors, 96 dimensions (deep learning embeddings)
- **Synthetic**: Configurable size and distribution
**Usage:**
```bash
# Test with synthetic data (default)
cargo run --bin ann-benchmark --release -- \
--dataset synthetic \
--num-vectors 100000 \
--dimensions 384 \
--k 10
# Test with SIFT1M (requires dataset download)
cargo run --bin ann-benchmark --release -- \
--dataset sift1m \
--ef-search-values 50,100,200,400
```
**Measured Metrics:**
- Queries per second (QPS)
- Latency percentiles (p50, p95, p99, p99.9)
- Recall@1, Recall@10, Recall@100
- Memory usage (MB)
- Build/index time
**Example Output:**
```
╔════════════════════════════════════════╗
║ Ruvector ANN-Benchmarks Suite ║
╚════════════════════════════════════════╝
✓ Dataset loaded: 100000 vectors, 1000 queries
============================================================
Testing with ef_search = 100
============================================================
┌───────────┬──────┬──────────┬──────────┬───────────┬─────────────┐
│ ef_search │ QPS │ p50 (ms) │ p99 (ms) │ Recall@10 │ Memory (MB) │
├───────────┼──────┼──────────┼──────────┼───────────┼─────────────┤
│ 100 │ 5243 │ 0.19 │ 0.45 │ 95.23% │ 246.8 │
└───────────┴──────┴──────────┴──────────┴───────────┴─────────────┘
```
### 2. AgenticDB Workload Simulation (`agenticdb-benchmark`)
Simulates real-world AI agent memory patterns with mixed read/write workloads.
**Workload Types:**
- **Conversational AI**: High read ratio (70/30 read/write)
- **Learning Agents**: Balanced read/write (50/50)
- **Batch Processing**: Write-heavy (30/70 read/write)
**Usage:**
```bash
cargo run --bin agenticdb-benchmark --release -- \
--workload conversational \
--num-vectors 50000 \
--num-operations 10000
```
**Measured Operations:**
- Insert latency
- Search latency
- Update latency
- Batch operation throughput
- Memory efficiency
### 3. Latency Profiling (`latency-benchmark`)
Detailed latency analysis across different configurations and concurrency levels.
**Test Scenarios:**
- Single-threaded vs multi-threaded search
- Effect of `ef_search` parameter on latency
- Effect of quantization on latency/recall tradeoff
- Concurrent query handling
**Usage:**
```bash
# Test with different thread counts
cargo run --bin latency-benchmark --release -- \
--threads 1,4,8,16 \
--num-vectors 50000 \
--queries 1000
```
**Example Output:**
```
Test 1: Single-threaded Latency
- p50: 0.42ms
- p95: 1.23ms
- p99: 2.15ms
- p99.9: 4.87ms
Test 2: Multi-threaded Latency (8 threads)
- p50: 0.38ms
- p95: 1.05ms
- p99: 1.89ms
- p99.9: 3.92ms
```
### 4. Memory Benchmarks (`memory-benchmark`)
Analyzes memory usage with different quantization strategies.
**Quantization Tests:**
- **None**: Full precision (baseline)
- **Scalar**: 4x compression
- **Binary**: 32x compression
**Usage:**
```bash
cargo run --bin memory-benchmark --release -- \
--num-vectors 100000 \
--dimensions 384
```
**Measured Metrics:**
- Memory per vector (bytes)
- Compression ratio
- Memory overhead
- Quantization impact on recall
**Example Results:**
```
┌──────────────┬─────────────┬───────────────┬────────────┐
│ Quantization │ Memory (MB) │ Bytes/Vector │ Recall@10 │
├──────────────┼─────────────┼───────────────┼────────────┤
│ None │ 147.5 │ 1536 │ 100.00% │
│ Scalar │ 38.2 │ 398 │ 95.80% │
│ Binary │ 4.7 │ 49 │ 87.20% │
└──────────────┴─────────────┴───────────────┴────────────┘
✓ Scalar quantization: 4.0x memory reduction, 4.2% recall loss
✓ Binary quantization: 31.4x memory reduction, 12.8% recall loss
```
### 5. Cross-System Comparison (`comparison-benchmark`)
Compare Ruvector against other implementations and baselines.
**Comparison Targets:**
- Ruvector (optimized: SIMD + Quantization + HNSW)
- Ruvector (no quantization)
- Simulated Python baseline (numpy)
- Simulated brute-force search
**Usage:**
```bash
cargo run --bin comparison-benchmark --release -- \
--num-vectors 50000 \
--dimensions 384
```
**Example Results:**
```
┌──────────────────────────┬──────┬──────────┬─────────────┬────────────┐
│ System │ QPS │ p50 (ms) │ Memory (MB) │ Speedup │
├──────────────────────────┼──────┼──────────┼─────────────┼────────────┤
│ Ruvector (optimized) │ 5243 │ 0.19 │ 38.2 │ 1.0x │
│ Ruvector (no quant) │ 4891 │ 0.20 │ 147.5 │ 0.93x │
│ Python baseline │ 89 │ 11.2 │ 153.6 │ 58.9x │
│ Brute-force │ 12 │ 83.3 │ 147.5 │ 437x │
└──────────────────────────┴──────┴──────────┴─────────────┴────────────┘
✓ Ruvector is 58.9x faster than Python baseline
✓ Ruvector uses 74.1% less memory with quantization
```
### 6. Performance Profiling (`profiling-benchmark`)
CPU and memory profiling with flamegraph generation (requires `profiling` feature).
**Usage:**
```bash
# Build with profiling support
cargo build --bin profiling-benchmark --release --features profiling
# Run with flamegraph generation
cargo run --bin profiling-benchmark --release --features profiling -- \
--enable-flamegraph \
--num-vectors 50000 \
--output profiling_results
# View flamegraph
open profiling_results/flamegraph.svg
```
**Generated Artifacts:**
- CPU flamegraph (SVG)
- Memory allocation profile
- Hotspot analysis
- Function-level timing breakdown
## 📈 Interpreting Results
### Latency Metrics
| Percentile | Meaning | Target |
|------------|---------|--------|
| **p50** | Median latency - typical query performance | <0.5ms |
| **p95** | 95% of queries complete within this time | <1.5ms |
| **p99** | 99% of queries complete within this time | <3.0ms |
| **p99.9** | 99.9% of queries (tail latency) | <5.0ms |
### Recall Metrics
- **Recall@k**: Fraction of true nearest neighbors found in top-k results
- **Target Recall@10**: ≥95% for most applications
- **Trade-off**: Higher `ef_search` → better recall, higher latency
### Memory Efficiency
```
Memory per vector = Total Memory / Number of Vectors
Typical values:
- No quantization: ~1536 bytes (384D float32)
- Scalar quantization: ~400 bytes (4x compression)
- Binary quantization: ~50 bytes (32x compression)
```
## 🔧 Benchmark Configuration Options
### Common Options (All Benchmarks)
```bash
--num-vectors <N> # Number of vectors to index (default: 50000)
--dimensions <D> # Vector dimensions (default: 384)
--output <PATH> # Output directory for results (default: bench_results)
```
### ANN-Benchmark Specific
```bash
--dataset <NAME> # Dataset: sift1m, gist1m, deep1m, synthetic
--num-queries <N> # Number of search queries (default: 1000)
--k <K> # Number of nearest neighbors to retrieve (default: 10)
--m <M> # HNSW M parameter (default: 32)
--ef-construction <EF> # HNSW build parameter (default: 200)
--ef-search-values <EF> # Comma-separated ef_search values to test (default: 50,100,200,400)
--metric <METRIC> # Distance metric: cosine, euclidean, dot (default: cosine)
--quantization <TYPE> # Quantization: none, scalar, binary (default: scalar)
```
### Latency-Benchmark Specific
```bash
--threads <THREADS> # Comma-separated thread counts (default: 1,4,8,16)
```
### AgenticDB-Benchmark Specific
```bash
--workload <TYPE> # Workload type: conversational, learning, batch
--num-operations <N> # Number of operations to perform (default: 10000)
```
### Profiling-Benchmark Specific
```bash
--enable-flamegraph # Generate CPU flamegraph (requires profiling feature)
--enable-memory-profile # Enable detailed memory profiling
```
## 🎨 Custom Benchmark Creation
Create your own benchmarks using the `ruvector-bench` library:
```rust
use ruvector_bench::{
BenchmarkResult, DatasetGenerator, LatencyStats,
MemoryProfiler, ResultWriter, VectorDistribution,
};
use ruvector_core::{VectorDB, DbOptions, SearchQuery, VectorEntry};
use std::time::Instant;
fn my_custom_benchmark() -> anyhow::Result<()> {
// Generate test data
let gen = DatasetGenerator::new(384, VectorDistribution::Normal {
mean: 0.0,
std_dev: 1.0,
});
let vectors = gen.generate(10000);
let queries = gen.generate(100);
// Create database
let db = VectorDB::new(DbOptions::default())?;
// Measure indexing
let mem_profiler = MemoryProfiler::new();
let build_start = Instant::now();
for (idx, vector) in vectors.iter().enumerate() {
db.insert(VectorEntry {
id: Some(idx.to_string()),
vector: vector.clone(),
metadata: None,
})?;
}
let build_time = build_start.elapsed();
// Measure search performance
let mut latency_stats = LatencyStats::new()?;
for query in &queries {
let start = Instant::now();
db.search(SearchQuery {
vector: query.clone(),
k: 10,
filter: None,
ef_search: None,
})?;
latency_stats.record(start.elapsed())?;
}
// Print results
println!("Build time: {:.2}s", build_time.as_secs_f64());
println!("p50 latency: {:.2}ms", latency_stats.percentile(0.50).as_secs_f64() * 1000.0);
println!("p99 latency: {:.2}ms", latency_stats.percentile(0.99).as_secs_f64() * 1000.0);
println!("Memory usage: {:.2}MB", mem_profiler.current_usage_mb());
Ok(())
}
```
## 🔄 CI/CD Integration
### GitHub Actions Example
```yaml
name: Benchmarks
on:
push:
branches: [main]
pull_request:
branches: [main]
jobs:
benchmark:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Install Rust
uses: actions-rs/toolchain@v1
with:
toolchain: stable
profile: minimal
- name: Run benchmarks
run: |
cd crates/ruvector-bench
cargo run --bin ann-benchmark --release -- --output ci_results
cargo run --bin latency-benchmark --release -- --output ci_results
- name: Upload results
uses: actions/upload-artifact@v3
with:
name: benchmark-results
path: crates/ruvector-bench/ci_results/
- name: Check performance regression
run: |
python scripts/check_regression.py ci_results/ann_benchmark.json
```
## 📉 Performance Regression Testing
Track performance over time using historical benchmark data:
```bash
# Run baseline benchmarks (on main branch)
git checkout main
cargo run --bin ann-benchmark --release -- --output baseline_results
# Run comparison benchmarks (on feature branch)
git checkout feature-branch
cargo run --bin ann-benchmark --release -- --output feature_results
# Compare results
python scripts/compare_benchmarks.py \
baseline_results/ann_benchmark.json \
feature_results/ann_benchmark.json
```
**Regression Thresholds:**
- ✅ **Pass**: <5% latency regression, <10% memory regression
- ⚠️ **Warning**: 5-10% latency regression, 10-20% memory regression
- ❌ **Fail**: >10% latency regression, >20% memory regression
## 📊 Results Visualization
Benchmark results are automatically saved in multiple formats:
### JSON Format
```json
{
"name": "ruvector-ef100",
"dataset": "synthetic",
"dimensions": 384,
"num_vectors": 100000,
"qps": 5243.2,
"latency_p50": 0.19,
"latency_p99": 2.15,
"recall_at_10": 0.9523,
"memory_mb": 38.2
}
```
### CSV Format
```csv
name,dataset,dimensions,num_vectors,qps,p50,p99,recall@10,memory_mb
ruvector-ef100,synthetic,384,100000,5243.2,0.19,2.15,0.9523,38.2
```
### Markdown Report
Results include automatically generated markdown reports with detailed performance analysis.
### Custom Visualization
Generate performance charts using the provided data:
```python
import pandas as pd
import matplotlib.pyplot as plt
# Load benchmark results
df = pd.read_csv('bench_results/ann_benchmark.csv')
# Plot QPS vs Recall tradeoff
plt.figure(figsize=(10, 6))
plt.scatter(df['recall@10'] * 100, df['qps'])
plt.xlabel('Recall@10 (%)')
plt.ylabel('Queries per Second')
plt.title('Ruvector Performance: QPS vs Recall')
plt.grid(True)
plt.savefig('qps_vs_recall.png')
```
## 🔗 Links to Benchmark Reports
- [Latest Benchmark Results](../../benchmarks/LOAD_TEST_SCENARIOS.md)
- [Performance Optimization Guide](../../docs/cloud-architecture/PERFORMANCE_OPTIMIZATION_GUIDE.md)
- [Implementation Summary](../../docs/IMPLEMENTATION_SUMMARY.md)
- [ANN-Benchmarks.com](http://ann-benchmarks.com) - Standard vector search benchmarks
## 🎯 Optimization Based on Benchmarks
### Use Benchmark Results to Tune Performance
1. **Optimize for Latency** (sub-millisecond queries):
```rust
HnswConfig {
m: 16, // Lower M = faster search, less recall
ef_construction: 100,
ef_search: 50, // Lower ef_search = faster, less recall
max_elements: 100000,
}
```
2. **Optimize for Recall** (95%+ accuracy):
```rust
HnswConfig {
m: 64, // Higher M = better recall
ef_construction: 400,
ef_search: 200, // Higher ef_search = better recall
max_elements: 100000,
}
```
3. **Optimize for Memory** (minimal footprint):
```rust
DbOptions {
quantization: Some(QuantizationConfig::Binary), // 32x compression
..Default::default()
}
```
### Recommended Configurations by Use Case
| Use Case | M | ef_construction | ef_search | Quantization | Expected Performance |
|----------|---|----------------|-----------|--------------|----------------------|
| **Low-Latency Search** | 16 | 100 | 50 | Scalar | <0.5ms p50, 90%+ recall |
| **Balanced** | 32 | 200 | 100 | Scalar | <1ms p50, 95%+ recall |
| **High Accuracy** | 64 | 400 | 200 | None | <2ms p50, 98%+ recall |
| **Memory Constrained** | 16 | 100 | 50 | Binary | <1ms p50, 85%+ recall, 32x compression |
## 🛠️ Development
### Running Tests
```bash
# Run unit tests
cargo test -p ruvector-bench
# Run specific benchmark
cargo test -p ruvector-bench --test latency_stats_test
```
### Building Documentation
```bash
# Generate API documentation
cargo doc -p ruvector-bench --open
```
### Adding New Benchmarks
1. Create a new binary in `src/bin/`:
```bash
touch src/bin/my_benchmark.rs
```
2. Add to `Cargo.toml`:
```toml
[[bin]]
name = "my-benchmark"
path = "src/bin/my_benchmark.rs"
```
3. Implement using `ruvector-bench` utilities:
```rust
use ruvector_bench::{LatencyStats, ResultWriter};
```
## 📚 API Reference
### Core Types
- **`BenchmarkResult`**: Comprehensive benchmark result structure
- **`LatencyStats`**: HDR histogram-based latency measurement
- **`DatasetGenerator`**: Synthetic vector data generation
- **`MemoryProfiler`**: Memory usage tracking
- **`ResultWriter`**: Multi-format result output (JSON, CSV, Markdown)
### Utilities
- **`calculate_recall()`**: Compute recall@k metric
- **`create_progress_bar()`**: Terminal progress indication
- **`VectorDistribution`**: Uniform, Normal, or Clustered vector generation
See [full API documentation](https://docs.rs/ruvector-bench) for details.
## 🤝 Contributing
We welcome contributions to improve the benchmarking suite!
### Areas for Contribution
- 📊 Additional benchmark scenarios (concurrent writes, updates, deletes)
- 🔌 Integration with other vector databases (Pinecone, Qdrant, Milvus)
- 📈 Enhanced visualization and reporting
- 🎯 Real-world dataset support (SIFT, GIST, Deep1M loaders)
- 🚀 Performance optimization insights
See [Contributing Guidelines](../../docs/development/CONTRIBUTING.md) for details.
## 📜 License
This crate is part of the Ruvector project and is licensed under the MIT License.
---
<div align="center">
**Part of [Ruvector](../../README.md) - Next-generation vector database built in Rust**
Built by [rUv](https://ruv.io) • [GitHub](https://github.com/ruvnet/ruvector) • [Documentation](../../docs/README.md)
</div>

View File

@@ -0,0 +1,467 @@
# Ruvector Benchmark Suite Documentation
Comprehensive benchmarking tools for measuring and analyzing Ruvector's performance across various workloads and configurations.
## Table of Contents
1. [Overview](#overview)
2. [Installation](#installation)
3. [Benchmark Tools](#benchmark-tools)
4. [Quick Start](#quick-start)
5. [Detailed Usage](#detailed-usage)
6. [Understanding Results](#understanding-results)
7. [Performance Targets](#performance-targets)
8. [Troubleshooting](#troubleshooting)
## Overview
The Ruvector benchmark suite provides:
- **ANN-Benchmarks Compatibility**: Standard SIFT1M, GIST1M, Deep1M testing
- **AgenticDB Workloads**: Reflexion episodes, skill libraries, causal graphs
- **Latency Analysis**: p50, p95, p99, p99.9 percentile measurements
- **Memory Profiling**: Usage at various scales with quantization effects
- **System Comparison**: Ruvector vs other implementations
- **Performance Profiling**: CPU flamegraphs and hotspot analysis
## Installation
### Prerequisites
```bash
# Install Rust (if not already installed)
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
# Optional: HDF5 for loading real ANN benchmark datasets
# Ubuntu/Debian
sudo apt-get install libhdf5-dev
# macOS
brew install hdf5
# Optional: Profiling tools
sudo apt-get install linux-perf # Linux only
```
### Build Benchmarks
```bash
cd crates/ruvector-bench
# Standard build
cargo build --release
# With profiling support
cargo build --release --features profiling
# With HDF5 dataset support
cargo build --release --features hdf5-datasets
```
## Benchmark Tools
### 1. ANN Benchmark (`ann-benchmark`)
Tests standard ANN benchmark datasets with configurable HNSW parameters.
**Features:**
- SIFT1M (128D, 1M vectors)
- GIST1M (960D, 1M vectors)
- Deep1M (96D, 1M vectors)
- Synthetic dataset generation
- Recall-QPS curves at 90%, 95%, 99%
- Multiple ef_search values
### 2. AgenticDB Benchmark (`agenticdb-benchmark`)
Simulates agentic AI workloads.
**Workloads:**
- Reflexion episode storage/retrieval
- Skill library search
- Causal graph queries
- Learning session throughput (mixed read/write)
### 3. Latency Benchmark (`latency-benchmark`)
Measures detailed latency characteristics.
**Tests:**
- Single-threaded latency
- Multi-threaded latency (configurable thread counts)
- Effect of ef_search on latency
- Effect of quantization on latency/recall tradeoff
### 4. Memory Benchmark (`memory-benchmark`)
Profiles memory usage at scale.
**Tests:**
- Memory at 10K, 100K, 1M vectors
- Effect of quantization (none, scalar, binary)
- Index overhead analysis
- Memory per vector calculation
### 5. Comparison Benchmark (`comparison-benchmark`)
Compares Ruvector against other systems.
**Comparisons:**
- Ruvector (optimized)
- Ruvector (no quantization)
- Simulated Python baseline
- Simulated brute-force search
### 6. Profiling Benchmark (`profiling-benchmark`)
Generates performance profiles.
**Outputs:**
- CPU flamegraphs (SVG)
- Profiling reports
- Hotspot identification
- SIMD utilization analysis
## Quick Start
### Run All Benchmarks
```bash
# Full benchmark suite
./scripts/run_all_benchmarks.sh
# Quick mode (smaller datasets)
./scripts/run_all_benchmarks.sh --quick
# With profiling
./scripts/run_all_benchmarks.sh --profile
```
### Run Individual Benchmarks
```bash
# ANN benchmarks
cargo run --release --bin ann-benchmark -- \
--dataset synthetic \
--num-vectors 100000 \
--queries 1000
# AgenticDB workloads
cargo run --release --bin agenticdb-benchmark -- \
--episodes 10000 \
--queries 500
# Latency profiling
cargo run --release --bin latency-benchmark -- \
--num-vectors 50000 \
--threads "1,4,8,16"
# Memory profiling
cargo run --release --bin memory-benchmark -- \
--scales "1000,10000,100000"
# System comparison
cargo run --release --bin comparison-benchmark -- \
--num-vectors 50000
# Performance profiling
cargo run --release --features profiling --bin profiling-benchmark -- \
--flamegraph
```
## Detailed Usage
### ANN Benchmark Options
```bash
cargo run --release --bin ann-benchmark -- --help
Options:
-d, --dataset <DATASET> Dataset: sift1m, gist1m, deep1m, synthetic [default: synthetic]
-n, --num-vectors <NUM_VECTORS> Number of vectors [default: 100000]
-q, --queries <NUM_QUERIES> Number of queries [default: 1000]
--dimensions <DIMENSIONS> Vector dimensions [default: 128]
-k, --k <K> K nearest neighbors [default: 10]
-m, --m <M> HNSW M parameter [default: 32]
--ef-construction <VALUE> HNSW ef_construction [default: 200]
--ef-search-values <VALUES> HNSW ef_search values (comma-separated) [default: 50,100,200,400]
-o, --output <OUTPUT> Output directory [default: bench_results]
--metric <METRIC> Distance metric [default: cosine]
--quantization <QUANT> Quantization: none, scalar, binary [default: scalar]
```
### AgenticDB Benchmark Options
```bash
cargo run --release --bin agenticdb-benchmark -- --help
Options:
--episodes <EPISODES> Number of episodes [default: 10000]
--skills <SKILLS> Number of skills [default: 1000]
-q, --queries <QUERIES> Number of queries [default: 500]
-o, --output <OUTPUT> Output directory [default: bench_results]
```
### Latency Benchmark Options
```bash
cargo run --release --bin latency-benchmark -- --help
Options:
-n, --num-vectors <NUM_VECTORS> Number of vectors [default: 50000]
-q, --queries <QUERIES> Number of queries [default: 1000]
-d, --dimensions <DIMENSIONS> Vector dimensions [default: 384]
-t, --threads <THREADS> Thread counts to test [default: 1,4,8,16]
-o, --output <OUTPUT> Output directory [default: bench_results]
```
## Understanding Results
### Output Files
Each benchmark generates three output files:
1. **JSON** (`{benchmark}_benchmark.json`): Raw data for programmatic analysis
2. **CSV** (`{benchmark}_benchmark.csv`): Tabular data for spreadsheet analysis
3. **Markdown** (`{benchmark}_benchmark.md`): Human-readable report
### Key Metrics
#### QPS (Queries Per Second)
- Higher is better
- Measures throughput
- Target: >10,000 QPS for 100K vectors
#### Latency Percentiles
- **p50**: Median latency (typical user experience)
- **p95**: 95th percentile (captures most outliers)
- **p99**: 99th percentile (worst-case for most users)
- **p99.9**: 99.9th percentile (extreme outliers)
- Lower is better
- Target: <5ms p99 for 100K vectors
#### Recall
- **Recall@1**: Percentage of times the true nearest neighbor is found
- **Recall@10**: Percentage of true top-10 neighbors found
- **Recall@100**: Percentage of true top-100 neighbors found
- Higher is better
- Target: >95% recall@10
#### Memory
- Total memory usage in MB
- Memory per vector in KB
- Compression ratio with quantization
- Target: <2KB per vector with quantization
### Reading Benchmark Reports
Example output interpretation:
```
ef_search QPS p50 (ms) p99 (ms) Recall@10 Memory (MB)
50 15234 0.05 0.12 92.5% 156.2
100 12456 0.06 0.15 96.8% 156.2
200 8932 0.08 0.20 98.9% 156.2
```
**Analysis:**
- Increasing ef_search improves recall but reduces QPS
- ef_search=100 offers good balance (96.8% recall, 12K QPS)
- Memory usage constant across ef_search values
## Performance Targets
### AgenticDB Replacement Goals
Ruvector targets **10-100x performance improvement** over AgenticDB:
| Metric | AgenticDB (Python) | Ruvector (Target) | Speedup |
|--------|-------------------|-------------------|---------|
| Reflexion Retrieval | ~100 QPS | >5,000 QPS | 50x |
| Skill Search | ~50 QPS | >2,000 QPS | 40x |
| Index Build Time | ~60s/10K | <5s/10K | 12x |
| Memory Usage | ~500MB/100K | <100MB/100K | 5x |
### ANN-Benchmarks Targets
Competitive with state-of-the-art implementations:
| Dataset | Recall@10 | QPS Target | Latency p99 |
|---------|-----------|------------|-------------|
| SIFT1M | >95% | >10,000 | <1ms |
| GIST1M | >95% | >5,000 | <2ms |
| Deep1M | >95% | >15,000 | <0.5ms |
## Advanced Topics
### Profiling with Flamegraphs
Generate CPU flamegraphs to identify performance bottlenecks:
```bash
cargo run --release --features profiling --bin profiling-benchmark -- \
--flamegraph \
--output bench_results/profiling
# View flamegraph
firefox bench_results/profiling/flamegraph.svg
```
**Interpreting Flamegraphs:**
- Width = CPU time spent
- Height = call stack depth
- Look for wide plateaus (hotspots)
- Focus optimization on top 20% of time
### Custom Benchmark Scenarios
Create custom benchmarks by modifying the tools:
```rust
// Example: Custom dimension test
let dimensions = vec![64, 128, 256, 512, 768, 1024];
for dim in dimensions {
let result = bench_custom(dim)?;
results.push(result);
}
```
### Continuous Benchmarking
Integrate with CI/CD:
```yaml
# .github/workflows/benchmark.yml
name: Benchmarks
on: [push]
jobs:
benchmark:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Run benchmarks
run: |
cd crates/ruvector-bench
./scripts/run_all_benchmarks.sh --quick
- name: Upload results
uses: actions/upload-artifact@v2
with:
name: benchmark-results
path: crates/ruvector-bench/bench_results/
```
## Troubleshooting
### Common Issues
#### "HDF5 not found"
```bash
# Install HDF5 development libraries
sudo apt-get install libhdf5-dev # Ubuntu/Debian
brew install hdf5 # macOS
# Or build without HDF5 support
cargo build --release --no-default-features
```
#### "Out of memory"
```bash
# Reduce dataset size
cargo run --release --bin ann-benchmark -- --num-vectors 10000
# Or use quick mode
./scripts/run_all_benchmarks.sh --quick
```
#### "Profiling not working"
```bash
# Ensure profiling feature is enabled
cargo build --release --features profiling
# Linux: May need perf permissions
echo -1 | sudo tee /proc/sys/kernel/perf_event_paranoid
```
#### "Benchmarks taking too long"
```bash
# Use quick mode
./scripts/run_all_benchmarks.sh --quick
# Or run individual benchmarks
cargo run --release --bin latency-benchmark -- --queries 100
```
### Performance Debugging
If benchmarks show unexpectedly slow results:
1. **Check CPU governor:**
```bash
# Linux: Use performance mode
sudo cpupower frequency-set -g performance
```
2. **Verify release build:**
```bash
cargo build --release # Not --debug!
```
3. **Check system load:**
```bash
htop # Ensure no other heavy processes
```
4. **Review HNSW parameters:**
- Reduce ef_construction for faster indexing
- Reduce ef_search for faster queries (at cost of recall)
## Results Analysis
### Comparing Runs
```bash
# Compare two benchmark runs
diff -u bench_results_old/ann_benchmark.csv bench_results_new/ann_benchmark.csv
# Plot results with Python
python3 scripts/plot_results.py bench_results/
```
### Statistical Significance
For reliable benchmarks:
- Run multiple iterations (3-5 times)
- Use appropriate dataset sizes (>10K vectors)
- Ensure consistent system load
- Record system specs in metadata
## Contributing
To add new benchmarks:
1. Create new binary in `src/bin/`
2. Use `ruvector_bench` utilities
3. Output results in standard format
4. Update this documentation
5. Add to `run_all_benchmarks.sh`
## References
- [ANN-Benchmarks](http://ann-benchmarks.com)
- [HNSW Paper](https://arxiv.org/abs/1603.09320)
- [AgenticDB Documentation](https://github.com/agenticdb/agenticdb)
- [Ruvector Repository](https://github.com/ruvnet/ruvector)
## Support
For issues or questions:
- GitHub Issues: https://github.com/ruvnet/ruvector/issues
- Documentation: https://github.com/ruvnet/ruvector/docs
---
Last updated: 2025-11-19

View File

@@ -0,0 +1,102 @@
#!/bin/bash
# Download ANN benchmark datasets (SIFT1M, GIST1M, Deep1M)
# Abort on the first failing command so a broken setup is immediately visible.
set -e
# All archives and extracted datasets land here, relative to the
# directory the script is invoked from.
DATASETS_DIR="datasets"
mkdir -p "$DATASETS_DIR"
echo "╔════════════════════════════════════════╗"
echo "║ ANN Benchmark Dataset Downloader ║"
echo "╚════════════════════════════════════════╝"
echo ""
# Download one dataset archive into $DATASETS_DIR and extract it.
#   $1 - human-readable dataset name (for log messages)
#   $2 - download URL; the archive keeps its basename on disk
# The transfer goes to a .part file first and is renamed only on success,
# so an interrupted download (which would otherwise abort the script via
# `set -e` and leave a truncated file) is never mistaken for a complete
# archive on the next run.
download_dataset() {
    local name=$1
    local url=$2
    local file=$(basename "$url")
    echo "Downloading $name..."
    if [ -f "$DATASETS_DIR/$file" ]; then
        echo " ✓ Already downloaded: $file"
    else
        wget -q --show-progress -O "$DATASETS_DIR/$file.part" "$url"
        mv "$DATASETS_DIR/$file.part" "$DATASETS_DIR/$file"
        echo " ✓ Downloaded: $file"
    fi
    echo "Extracting $name..."
    if [[ $file == *.tar.gz ]]; then
        tar -xzf "$DATASETS_DIR/$file" -C "$DATASETS_DIR"
    elif [[ $file == *.gz ]]; then
        # -k keeps the archive; -f overwrites a previous extraction instead
        # of failing (gunzip without -f errors if the output already exists,
        # which would kill the script under `set -e` on a re-run).
        gunzip -kf "$DATASETS_DIR/$file"
    fi
    echo " ✓ Extracted successfully"
    echo ""
}
# SIFT1M Dataset (128D, 1M vectors)
# http://corpus-texmex.irisa.fr/
echo "1. SIFT1M Dataset (128 dimensions, 1M vectors)"
echo " Download from: http://corpus-texmex.irisa.fr/"
echo " Note: Direct download requires manual intervention due to terms of service"
echo " Please visit the website and download sift.tar.gz manually to datasets/"
echo ""
# GIST1M Dataset (960D, 1M vectors)
echo "2. GIST1M Dataset (960 dimensions, 1M vectors)"
echo " Download from: http://corpus-texmex.irisa.fr/"
echo " Note: Direct download requires manual intervention due to terms of service"
echo " Please visit the website and download gist.tar.gz manually to datasets/"
echo ""
# Deep1M Dataset (96D, 1M vectors)
echo "3. Deep1M Dataset (96 dimensions, 1M vectors)"
echo " Download from: http://sites.skoltech.ru/compvision/noimi/"
echo " Note: This dataset may require registration"
echo ""
# Alternative: Generate synthetic datasets
echo "═══════════════════════════════════════════════════════════════"
echo "ALTERNATIVE: Generate Synthetic Datasets"
echo "═══════════════════════════════════════════════════════════════"
echo ""
echo "If you prefer to use synthetic data for benchmarking, the"
echo "benchmark tools will automatically generate appropriate datasets."
echo ""
echo "To run with synthetic data:"
echo " cargo run --release --bin ann-benchmark -- --dataset synthetic"
echo ""
# Check for HDF5 support
# Presence of h5dump is used as a proxy for an HDF5 installation;
# HDF5 is only needed for loading real datasets, not synthetic runs.
echo "Checking dependencies..."
if command -v h5dump &> /dev/null; then
    echo " ✓ HDF5 tools installed"
else
    echo " ⚠ HDF5 tools not found. Install with:"
    echo " Ubuntu/Debian: sudo apt-get install hdf5-tools"
    echo " macOS: brew install hdf5"
    echo " Note: HDF5 is optional for synthetic benchmarks"
fi
echo ""
# Closing instructions: the real datasets require a manual download step,
# so the script prints a setup guide rather than fetching them itself.
echo "════════════════════════════════════════"
echo "Setup Instructions:"
echo "════════════════════════════════════════"
echo ""
echo "1. Manual Download (for real datasets):"
echo " - Visit http://corpus-texmex.irisa.fr/"
echo " - Download sift.tar.gz, gist.tar.gz"
echo " - Place in: $DATASETS_DIR/"
echo " - Extract: tar -xzf $DATASETS_DIR/sift.tar.gz -C $DATASETS_DIR/"
echo ""
echo "2. Synthetic Datasets (recommended for testing):"
echo " - No download required"
echo " - Generated automatically by benchmark tools"
echo " - Suitable for performance testing and profiling"
echo ""
echo "3. Run Benchmarks:"
echo " cd crates/ruvector-bench"
echo " cargo run --release --bin ann-benchmark"
echo ""
echo "✓ Setup guide complete!"

View File

@@ -0,0 +1,246 @@
#!/bin/bash
# Run complete Ruvector benchmark suite
# Abort as soon as any benchmark step fails.
set -e
# Resolve paths relative to this script so it can be invoked from anywhere.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BENCH_DIR="$(dirname "$SCRIPT_DIR")"
OUTPUT_DIR="${BENCH_DIR}/bench_results"
echo "╔════════════════════════════════════════╗"
echo "║ Ruvector Benchmark Suite Runner ║"
echo "╚════════════════════════════════════════╝"
echo ""
# Create output directory
mkdir -p "$OUTPUT_DIR"
# Parse command-line flags.
#   --quick    reduced dataset sizes for a fast smoke run
#   --profile  additionally run the flamegraph profiling benchmark
#   -h|--help  print usage and exit successfully
QUICK_MODE=false
PROFILE=false
while [[ $# -gt 0 ]]; do
    case $1 in
        --quick)
            QUICK_MODE=true
            shift
            ;;
        --profile)
            PROFILE=true
            shift
            ;;
        -h|--help)
            echo "Usage: $0 [--quick] [--profile]"
            exit 0
            ;;
        *)
            echo "Unknown option: $1"
            echo "Usage: $0 [--quick] [--profile]"
            exit 1
            ;;
    esac
done
# Set benchmark parameters based on mode
# Quick mode trades statistical quality for turnaround time.
if [ "$QUICK_MODE" = true ]; then
    echo "Running in QUICK mode (reduced dataset sizes)..."
    VECTORS=10000
    QUERIES=500
else
    echo "Running in FULL mode (standard dataset sizes)..."
    VECTORS=100000
    QUERIES=1000
fi
echo "Output directory: $OUTPUT_DIR"
echo ""
# Build benchmarks
echo "═══════════════════════════════════════════════════════════════"
echo "Building benchmark suite..."
echo "═══════════════════════════════════════════════════════════════"
cd "$BENCH_DIR"
# Build everything once up front so the later `cargo run` invocations
# start executing immediately instead of compiling mid-suite.
cargo build --release
echo "✓ Build complete"
echo ""
# Run ANN Benchmarks
# 1) Standard ANN-style recall/throughput run on 128-D synthetic data.
echo "═══════════════════════════════════════════════════════════════"
echo "1. ANN Benchmarks (SIFT/GIST/Deep1M compatibility)"
echo "═══════════════════════════════════════════════════════════════"
cargo run --release --bin ann-benchmark -- \
    --dataset synthetic \
    --num-vectors $VECTORS \
    --queries $QUERIES \
    --dimensions 128 \
    --output "$OUTPUT_DIR"
echo ""
# Run AgenticDB Benchmarks
# 2) Agent workloads: skills are sized at 1/10 of the episode count.
echo "═══════════════════════════════════════════════════════════════"
echo "2. AgenticDB Workload Benchmarks"
echo "═══════════════════════════════════════════════════════════════"
cargo run --release --bin agenticdb-benchmark -- \
    --episodes $VECTORS \
    --skills $(($VECTORS / 10)) \
    --queries $QUERIES \
    --output "$OUTPUT_DIR"
echo ""
# Run Latency Benchmarks
# 3) Latency percentiles at several thread counts, on 384-D vectors
#    (half the corpus size to keep runtime reasonable).
echo "═══════════════════════════════════════════════════════════════"
echo "3. Latency Profiling"
echo "═══════════════════════════════════════════════════════════════"
cargo run --release --bin latency-benchmark -- \
    --num-vectors $(($VECTORS / 2)) \
    --queries $QUERIES \
    --dimensions 384 \
    --threads "1,4,8" \
    --output "$OUTPUT_DIR"
echo ""
# Run Memory Benchmarks
# 4) Memory footprint across increasing corpus scales.
echo "═══════════════════════════════════════════════════════════════"
echo "4. Memory Profiling"
echo "═══════════════════════════════════════════════════════════════"
if [ "$QUICK_MODE" = true ]; then
    SCALES="1000,10000"
else
    SCALES="1000,10000,100000"
fi
cargo run --release --bin memory-benchmark -- \
    --dimensions 384 \
    --scales "$SCALES" \
    --output "$OUTPUT_DIR"
echo ""
# Run Comparison Benchmarks
# 5) Head-to-head comparison against other configurations/systems.
echo "═══════════════════════════════════════════════════════════════"
echo "5. Cross-System Comparison"
echo "═══════════════════════════════════════════════════════════════"
cargo run --release --bin comparison-benchmark -- \
    --num-vectors $(($VECTORS / 2)) \
    --queries $QUERIES \
    --dimensions 384 \
    --output "$OUTPUT_DIR"
echo ""
# Run Profiling (optional)
# 6) Flamegraph profiling only when --profile was passed; it needs the
#    `profiling` cargo feature and perf permissions on Linux.
if [ "$PROFILE" = true ]; then
    echo "═══════════════════════════════════════════════════════════════"
    echo "6. Performance Profiling with Flamegraph"
    echo "═══════════════════════════════════════════════════════════════"
    cargo run --release --features profiling --bin profiling-benchmark -- \
        --num-vectors $(($VECTORS / 2)) \
        --queries $QUERIES \
        --dimensions 384 \
        --flamegraph \
        --output "$OUTPUT_DIR/profiling"
    echo ""
fi
# Generate summary report
# Write a markdown index of all result files into $OUTPUT_DIR/SUMMARY.md.
# The heredocs are unquoted so $VECTORS etc. expand; backslash-escaped
# backticks survive into the markdown output.
echo "═══════════════════════════════════════════════════════════════"
echo "Generating Summary Report"
echo "═══════════════════════════════════════════════════════════════"
SUMMARY_FILE="$OUTPUT_DIR/SUMMARY.md"
cat > "$SUMMARY_FILE" << EOF
# Ruvector Benchmark Results Summary
**Generated:** $(date)
**Mode:** $([ "$QUICK_MODE" = true ] && echo "Quick" || echo "Full")
## Configuration
- Vectors: $VECTORS
- Queries: $QUERIES
- Profiling: $([ "$PROFILE" = true ] && echo "Enabled" || echo "Disabled")
## Results Location
All benchmark results are saved in: \`$OUTPUT_DIR\`
## Available Reports
### 1. ANN Benchmarks
- JSON: \`ann_benchmark.json\`
- CSV: \`ann_benchmark.csv\`
- Report: \`ann_benchmark.md\`
### 2. AgenticDB Workloads
- JSON: \`agenticdb_benchmark.json\`
- CSV: \`agenticdb_benchmark.csv\`
- Report: \`agenticdb_benchmark.md\`
### 3. Latency Profiling
- JSON: \`latency_benchmark.json\`
- CSV: \`latency_benchmark.csv\`
- Report: \`latency_benchmark.md\`
### 4. Memory Profiling
- JSON: \`memory_benchmark.json\`
- CSV: \`memory_benchmark.csv\`
- Report: \`memory_benchmark.md\`
### 5. System Comparison
- JSON: \`comparison_benchmark.json\`
- CSV: \`comparison_benchmark.csv\`
- Report: \`comparison_benchmark.md\`
EOF
# Only mention profiling artifacts if the profiling pass actually ran.
if [ "$PROFILE" = true ]; then
    cat >> "$SUMMARY_FILE" << EOF
### 6. Performance Profiling
- Flamegraph: \`profiling/flamegraph.svg\`
- Profile: \`profiling/profile.txt\`
EOF
fi
cat >> "$SUMMARY_FILE" << EOF
## Quick Analysis
To view individual benchmark reports, use:
\`\`\`bash
cat $OUTPUT_DIR/ann_benchmark.md
cat $OUTPUT_DIR/agenticdb_benchmark.md
cat $OUTPUT_DIR/latency_benchmark.md
cat $OUTPUT_DIR/memory_benchmark.md
cat $OUTPUT_DIR/comparison_benchmark.md
\`\`\`
To view CSV data for analysis:
\`\`\`bash
column -t -s, $OUTPUT_DIR/ann_benchmark.csv | less -S
\`\`\`
EOF
echo "✓ Summary report generated: $SUMMARY_FILE"
echo ""
echo "════════════════════════════════════════════════════════════════"
echo "✓ All benchmarks complete!"
echo "════════════════════════════════════════════════════════════════"
echo ""
echo "Results saved to: $OUTPUT_DIR"
echo "Summary report: $SUMMARY_FILE"
echo ""
echo "View results:"
echo " cat $SUMMARY_FILE"
echo " cat $OUTPUT_DIR/*.md"
echo ""
# Display quick stats if available
# NOTE(review): the awk field numbers assume the comparison CSV column
# order (qps=$7, p50=$8, p99=$10) — verify against ResultWriter's CSV
# layout if columns are ever added or reordered.
if [ -f "$OUTPUT_DIR/comparison_benchmark.csv" ]; then
    echo "Quick Performance Summary:"
    echo "─────────────────────────────────────────"
    grep "ruvector_optimized" "$OUTPUT_DIR/comparison_benchmark.csv" | \
        awk -F',' '{printf " Optimized QPS: %s\n Latency p50: %sms\n Latency p99: %sms\n", $7, $8, $10}'
    echo ""
fi
echo "To run again:"
echo " ./scripts/run_all_benchmarks.sh # Full benchmarks"
echo " ./scripts/run_all_benchmarks.sh --quick # Quick mode"
echo " ./scripts/run_all_benchmarks.sh --profile # With profiling"

View File

@@ -0,0 +1,538 @@
//! AgenticDB compatibility benchmark
//!
//! Tests AgenticDB-specific workloads:
//! - Reflexion episode storage and retrieval
//! - Skill library search
//! - Causal graph queries
//! - Learning session throughput
use anyhow::Result;
use clap::Parser;
use rand::Rng;
use ruvector_bench::{
create_progress_bar, BenchmarkResult, DatasetGenerator, LatencyStats, MemoryProfiler,
ResultWriter, VectorDistribution,
};
use ruvector_core::{
types::{DbOptions, HnswConfig, QuantizationConfig},
DistanceMetric, SearchQuery, VectorDB, VectorEntry,
};
use serde_json::json;
use std::collections::HashMap;
use std::path::PathBuf;
use std::time::Instant;
// CLI arguments for the AgenticDB workload benchmark.
// NOTE: the `///` doc comments below double as clap's --help text, so they
// are user-visible behavior and are left untouched.
#[derive(Parser)]
#[command(name = "agenticdb-benchmark")]
#[command(about = "AgenticDB workload testing")]
struct Args {
    /// Number of episodes
    // Also drives the causal-graph node count (episodes / 10) and the
    // learning-session item count.
    #[arg(long, default_value = "10000")]
    episodes: usize,
    /// Number of skills
    #[arg(long, default_value = "1000")]
    skills: usize,
    /// Number of queries
    #[arg(short, long, default_value = "500")]
    queries: usize,
    /// Output directory
    // JSON/CSV/markdown reports are written here.
    #[arg(short, long, default_value = "bench_results")]
    output: PathBuf,
}
fn main() -> Result<()> {
let args = Args::parse();
println!("╔════════════════════════════════════════╗");
println!("║ Ruvector AgenticDB Benchmark ║");
println!("╚════════════════════════════════════════╝\n");
let mut all_results = Vec::new();
// Test 1: Reflexion episode storage/retrieval
println!("\n{}", "=".repeat(60));
println!("Test 1: Reflexion Episode Storage & Retrieval");
println!("{}\n", "=".repeat(60));
let result = bench_reflexion_episodes(&args)?;
all_results.push(result);
// Test 2: Skill library search
println!("\n{}", "=".repeat(60));
println!("Test 2: Skill Library Search");
println!("{}\n", "=".repeat(60));
let result = bench_skill_library(&args)?;
all_results.push(result);
// Test 3: Causal graph queries
println!("\n{}", "=".repeat(60));
println!("Test 3: Causal Graph Queries");
println!("{}\n", "=".repeat(60));
let result = bench_causal_graph(&args)?;
all_results.push(result);
// Test 4: Learning session throughput
println!("\n{}", "=".repeat(60));
println!("Test 4: Learning Session Throughput");
println!("{}\n", "=".repeat(60));
let result = bench_learning_session(&args)?;
all_results.push(result);
// Write results
let writer = ResultWriter::new(&args.output)?;
writer.write_json("agenticdb_benchmark", &all_results)?;
writer.write_csv("agenticdb_benchmark", &all_results)?;
writer.write_markdown_report("agenticdb_benchmark", &all_results)?;
print_summary(&all_results);
println!(
"\n✓ AgenticDB benchmark complete! Results saved to: {}",
args.output.display()
);
Ok(())
}
/// Workload 1: store `args.episodes` Reflexion episodes (384-D embeddings,
/// cosine metric, scalar quantization) and then run k=10 similarity queries.
///
/// Recall fields are hard-coded to 1.0 because synthetic data has no ground
/// truth; the meaningful outputs are throughput, latency, and memory.
fn bench_reflexion_episodes(args: &Args) -> Result<BenchmarkResult> {
    println!("Simulating {} Reflexion episodes...", args.episodes);
    // Reflexion episodes use 384D embeddings (typical for sentence-transformers)
    let dimensions = 384;
    // Fresh temp-dir database per run; dropped (and deleted) on return.
    let temp_dir = tempfile::tempdir()?;
    let db_path = temp_dir.path().join("episodes.db");
    let options = DbOptions {
        dimensions,
        distance_metric: DistanceMetric::Cosine,
        storage_path: db_path.to_str().unwrap().to_string(),
        hnsw_config: Some(HnswConfig::default()),
        quantization: Some(QuantizationConfig::Scalar),
    };
    let mem_profiler = MemoryProfiler::new();
    let build_start = Instant::now();
    let db = VectorDB::new(options)?;
    // Generate episode data
    let gen = DatasetGenerator::new(
        dimensions,
        VectorDistribution::Normal {
            mean: 0.0,
            std_dev: 1.0,
        },
    );
    println!("Storing episodes...");
    let pb = create_progress_bar(args.episodes as u64, "Storing episodes");
    for i in 0..args.episodes {
        let entry = VectorEntry {
            id: Some(format!("episode_{}", i)),
            vector: gen.generate(1).into_iter().next().unwrap(),
            // Metadata mimics a Reflexion trace: trajectory id plus
            // randomized reward, success flag, and step count.
            metadata: Some(
                vec![
                    ("trajectory".to_string(), json!(format!("traj_{}", i))),
                    ("reward".to_string(), json!(rand::thread_rng().gen::<f32>())),
                    (
                        "success".to_string(),
                        json!(rand::thread_rng().gen_bool(0.7)),
                    ),
                    (
                        "step_count".to_string(),
                        json!(rand::thread_rng().gen_range(10..100)),
                    ),
                ]
                .into_iter()
                .collect(),
            ),
        };
        db.insert(entry)?;
        pb.inc(1);
    }
    pb.finish_with_message("✓ Episodes stored");
    // build_time includes vector generation as well as index inserts.
    let build_time = build_start.elapsed();
    let memory_mb = mem_profiler.current_usage_mb();
    // Query similar episodes
    println!("Querying similar episodes...");
    let mut latency_stats = LatencyStats::new()?;
    let query_vectors = gen.generate(args.queries);
    let search_start = Instant::now();
    let pb = create_progress_bar(args.queries as u64, "Searching");
    for query in query_vectors {
        // Each query is timed individually for percentile statistics.
        let query_start = Instant::now();
        db.search(SearchQuery {
            vector: query,
            k: 10,
            filter: None,
            ef_search: None,
        })?;
        latency_stats.record(query_start.elapsed())?;
        pb.inc(1);
    }
    pb.finish_with_message("✓ Search complete");
    let total_search_time = search_start.elapsed();
    // Aggregate queries-per-second over the whole batch.
    let qps = args.queries as f64 / total_search_time.as_secs_f64();
    Ok(BenchmarkResult {
        name: "reflexion_episodes".to_string(),
        dataset: "reflexion".to_string(),
        dimensions,
        num_vectors: args.episodes,
        num_queries: args.queries,
        k: 10,
        qps,
        // Percentiles converted from seconds to milliseconds.
        latency_p50: latency_stats.percentile(0.50).as_secs_f64() * 1000.0,
        latency_p95: latency_stats.percentile(0.95).as_secs_f64() * 1000.0,
        latency_p99: latency_stats.percentile(0.99).as_secs_f64() * 1000.0,
        latency_p999: latency_stats.percentile(0.999).as_secs_f64() * 1000.0,
        recall_at_1: 1.0, // No ground truth for synthetic
        recall_at_10: 1.0,
        recall_at_100: 1.0,
        memory_mb,
        build_time_secs: build_time.as_secs_f64(),
        metadata: HashMap::new(),
    })
}
/// Workload 2: store `args.skills` skill embeddings (768-D, clustered into
/// 20 categories) and search for the top-5 most relevant skills per query.
///
/// Recall fields are hard-coded to 1.0 (no ground truth for synthetic data).
fn bench_skill_library(args: &Args) -> Result<BenchmarkResult> {
    println!("Simulating {} skills in library...", args.skills);
    let dimensions = 768; // Larger embeddings for code/skill descriptions
    // Fresh temp-dir database per run; dropped (and deleted) on return.
    let temp_dir = tempfile::tempdir()?;
    let db_path = temp_dir.path().join("skills.db");
    let options = DbOptions {
        dimensions,
        distance_metric: DistanceMetric::Cosine,
        storage_path: db_path.to_str().unwrap().to_string(),
        hnsw_config: Some(HnswConfig::default()),
        quantization: Some(QuantizationConfig::Scalar),
    };
    let mem_profiler = MemoryProfiler::new();
    let build_start = Instant::now();
    let db = VectorDB::new(options)?;
    let gen = DatasetGenerator::new(
        dimensions,
        VectorDistribution::Clustered {
            num_clusters: 20, // Skills grouped by categories
        },
    );
    println!("Storing skills...");
    let pb = create_progress_bar(args.skills as u64, "Storing skills");
    for i in 0..args.skills {
        let entry = VectorEntry {
            id: Some(format!("skill_{}", i)),
            vector: gen.generate(1).into_iter().next().unwrap(),
            // Category cycles through the same 20 buckets the generator
            // clusters around; success_rate/usage_count are randomized.
            metadata: Some(
                vec![
                    ("name".to_string(), json!(format!("skill_{}", i))),
                    ("category".to_string(), json!(format!("cat_{}", i % 20))),
                    (
                        "success_rate".to_string(),
                        json!(rand::thread_rng().gen::<f32>()),
                    ),
                    (
                        "usage_count".to_string(),
                        json!(rand::thread_rng().gen_range(0..1000)),
                    ),
                ]
                .into_iter()
                .collect(),
            ),
        };
        db.insert(entry)?;
        pb.inc(1);
    }
    pb.finish_with_message("✓ Skills stored");
    // build_time includes vector generation as well as index inserts.
    let build_time = build_start.elapsed();
    let memory_mb = mem_profiler.current_usage_mb();
    // Search for relevant skills
    println!("Searching for relevant skills...");
    let mut latency_stats = LatencyStats::new()?;
    let query_vectors = gen.generate(args.queries);
    let search_start = Instant::now();
    let pb = create_progress_bar(args.queries as u64, "Searching");
    for query in query_vectors {
        let query_start = Instant::now();
        db.search(SearchQuery {
            vector: query,
            k: 5,
            filter: None,
            ef_search: None,
        })?;
        latency_stats.record(query_start.elapsed())?;
        pb.inc(1);
    }
    pb.finish_with_message("✓ Search complete");
    let total_search_time = search_start.elapsed();
    let qps = args.queries as f64 / total_search_time.as_secs_f64();
    Ok(BenchmarkResult {
        name: "skill_library".to_string(),
        dataset: "skills".to_string(),
        dimensions,
        num_vectors: args.skills,
        num_queries: args.queries,
        k: 5,
        qps,
        // Percentiles converted from seconds to milliseconds.
        latency_p50: latency_stats.percentile(0.50).as_secs_f64() * 1000.0,
        latency_p95: latency_stats.percentile(0.95).as_secs_f64() * 1000.0,
        latency_p99: latency_stats.percentile(0.99).as_secs_f64() * 1000.0,
        latency_p999: latency_stats.percentile(0.999).as_secs_f64() * 1000.0,
        recall_at_1: 1.0,
        recall_at_10: 1.0,
        recall_at_100: 1.0,
        memory_mb,
        build_time_secs: build_time.as_secs_f64(),
        metadata: HashMap::new(),
    })
}
/// Workload 3: store a "causal graph" of `episodes / 10` node embeddings
/// (256-D) and run `queries / 2` wide (k=20) neighborhood lookups.
///
/// Recall fields are hard-coded to 1.0 (no ground truth for synthetic data).
fn bench_causal_graph(args: &Args) -> Result<BenchmarkResult> {
    println!(
        "Simulating causal graph with {} nodes...",
        args.episodes / 10
    );
    let dimensions = 256;
    // Graph is an order of magnitude smaller than the episode store.
    let num_nodes = args.episodes / 10;
    // Fresh temp-dir database per run; dropped (and deleted) on return.
    let temp_dir = tempfile::tempdir()?;
    let db_path = temp_dir.path().join("causal.db");
    let options = DbOptions {
        dimensions,
        distance_metric: DistanceMetric::Cosine,
        storage_path: db_path.to_str().unwrap().to_string(),
        hnsw_config: Some(HnswConfig::default()),
        quantization: Some(QuantizationConfig::Scalar),
    };
    let mem_profiler = MemoryProfiler::new();
    let build_start = Instant::now();
    let db = VectorDB::new(options)?;
    let gen = DatasetGenerator::new(
        dimensions,
        VectorDistribution::Normal {
            mean: 0.0,
            std_dev: 1.0,
        },
    );
    println!("Building causal graph...");
    let pb = create_progress_bar(num_nodes as u64, "Storing nodes");
    for i in 0..num_nodes {
        let entry = VectorEntry {
            id: Some(format!("node_{}", i)),
            vector: gen.generate(1).into_iter().next().unwrap(),
            // Nodes carry a state id, one of 50 cycling actions, and a
            // randomized causal strength.
            metadata: Some(
                vec![
                    ("state".to_string(), json!(format!("state_{}", i))),
                    ("action".to_string(), json!(format!("action_{}", i % 50))),
                    (
                        "causal_strength".to_string(),
                        json!(rand::thread_rng().gen::<f32>()),
                    ),
                ]
                .into_iter()
                .collect(),
            ),
        };
        db.insert(entry)?;
        pb.inc(1);
    }
    pb.finish_with_message("✓ Graph built");
    // build_time includes vector generation as well as index inserts.
    let build_time = build_start.elapsed();
    let memory_mb = mem_profiler.current_usage_mb();
    // Query causal relationships
    // Half the configured query budget, but a wider k=20 per query.
    println!("Querying causal relationships...");
    let mut latency_stats = LatencyStats::new()?;
    let query_vectors = gen.generate(args.queries / 2);
    let search_start = Instant::now();
    let pb = create_progress_bar((args.queries / 2) as u64, "Searching");
    for query in query_vectors {
        let query_start = Instant::now();
        db.search(SearchQuery {
            vector: query,
            k: 20,
            filter: None,
            ef_search: None,
        })?;
        latency_stats.record(query_start.elapsed())?;
        pb.inc(1);
    }
    pb.finish_with_message("✓ Search complete");
    let total_search_time = search_start.elapsed();
    let qps = (args.queries / 2) as f64 / total_search_time.as_secs_f64();
    Ok(BenchmarkResult {
        name: "causal_graph".to_string(),
        dataset: "causal".to_string(),
        dimensions,
        num_vectors: num_nodes,
        num_queries: args.queries / 2,
        k: 20,
        qps,
        // Percentiles converted from seconds to milliseconds.
        latency_p50: latency_stats.percentile(0.50).as_secs_f64() * 1000.0,
        latency_p95: latency_stats.percentile(0.95).as_secs_f64() * 1000.0,
        latency_p99: latency_stats.percentile(0.99).as_secs_f64() * 1000.0,
        latency_p999: latency_stats.percentile(0.999).as_secs_f64() * 1000.0,
        recall_at_1: 1.0,
        recall_at_10: 1.0,
        recall_at_100: 1.0,
        memory_mb,
        build_time_secs: build_time.as_secs_f64(),
        metadata: HashMap::new(),
    })
}
/// Workload 4: interleaved read/write "learning session" over 512-D vectors —
/// roughly 70% inserts and 30% k=10 searches, chosen at random per step.
///
/// Unlike the other workloads, `qps` here is the combined ops/sec over the
/// whole session (reads + writes), and `build_time_secs` spans the entire
/// session, not just index construction. Latency percentiles cover reads only.
fn bench_learning_session(args: &Args) -> Result<BenchmarkResult> {
    println!("Simulating mixed-workload learning session...");
    let dimensions = 512;
    let num_items = args.episodes;
    // Fresh temp-dir database per run; dropped (and deleted) on return.
    let temp_dir = tempfile::tempdir()?;
    let db_path = temp_dir.path().join("learning.db");
    let options = DbOptions {
        dimensions,
        distance_metric: DistanceMetric::Cosine,
        storage_path: db_path.to_str().unwrap().to_string(),
        hnsw_config: Some(HnswConfig::default()),
        quantization: Some(QuantizationConfig::Scalar),
    };
    let mem_profiler = MemoryProfiler::new();
    let build_start = Instant::now();
    let db = VectorDB::new(options)?;
    let gen = DatasetGenerator::new(
        dimensions,
        VectorDistribution::Normal {
            mean: 0.0,
            std_dev: 1.0,
        },
    );
    println!("Running learning session with mixed read/write...");
    let mut latency_stats = LatencyStats::new()?;
    let pb = create_progress_bar(num_items as u64, "Processing");
    let mut write_count = 0;
    let mut read_count = 0;
    for i in 0..num_items {
        // 70% writes, 30% reads (typical learning scenario)
        if rand::thread_rng().gen_bool(0.7) {
            let entry = VectorEntry {
                id: Some(format!("item_{}", i)),
                vector: gen.generate(1).into_iter().next().unwrap(),
                metadata: Some(
                    vec![("timestamp".to_string(), json!(i))]
                        .into_iter()
                        .collect(),
                ),
            };
            db.insert(entry)?;
            write_count += 1;
        } else {
            // Reads query against whatever has been inserted so far.
            let query = gen.generate(1).into_iter().next().unwrap();
            let query_start = Instant::now();
            db.search(SearchQuery {
                vector: query,
                k: 10,
                filter: None,
                ef_search: None,
            })?;
            latency_stats.record(query_start.elapsed())?;
            read_count += 1;
        }
        pb.inc(1);
    }
    pb.finish_with_message("✓ Learning session complete");
    let build_time = build_start.elapsed();
    let memory_mb = mem_profiler.current_usage_mb();
    // Combined reads+writes per second over the full session.
    let throughput = num_items as f64 / build_time.as_secs_f64();
    Ok(BenchmarkResult {
        name: "learning_session".to_string(),
        dataset: "mixed_workload".to_string(),
        dimensions,
        num_vectors: write_count,
        num_queries: read_count,
        k: 10,
        qps: throughput,
        // Percentiles (reads only) converted from seconds to milliseconds.
        latency_p50: latency_stats.percentile(0.50).as_secs_f64() * 1000.0,
        latency_p95: latency_stats.percentile(0.95).as_secs_f64() * 1000.0,
        latency_p99: latency_stats.percentile(0.99).as_secs_f64() * 1000.0,
        latency_p999: latency_stats.percentile(0.999).as_secs_f64() * 1000.0,
        recall_at_1: 1.0,
        recall_at_10: 1.0,
        recall_at_100: 1.0,
        memory_mb,
        build_time_secs: build_time.as_secs_f64(),
        // Record the actual read/write split so the 70/30 target can be
        // checked against what the RNG produced.
        metadata: vec![
            ("writes".to_string(), write_count.to_string()),
            ("reads".to_string(), read_count.to_string()),
        ]
        .into_iter()
        .collect(),
    })
}
/// Render the collected workload results as an aligned console table.
fn print_summary(results: &[BenchmarkResult]) {
    use tabled::{Table, Tabled};
    // One display row per workload; field order defines column order and the
    // rename attributes define the printed headers.
    #[derive(Tabled)]
    struct ResultRow {
        #[tabled(rename = "Workload")]
        name: String,
        #[tabled(rename = "Vectors")]
        vectors: String,
        #[tabled(rename = "Throughput")]
        qps: String,
        #[tabled(rename = "p50 (ms)")]
        p50: String,
        #[tabled(rename = "p99 (ms)")]
        p99: String,
        #[tabled(rename = "Memory (MB)")]
        memory: String,
    }
    let mut table_rows = Vec::with_capacity(results.len());
    for result in results {
        table_rows.push(ResultRow {
            name: result.name.clone(),
            vectors: result.num_vectors.to_string(),
            qps: format!("{:.0} ops/s", result.qps),
            p50: format!("{:.2}", result.latency_p50),
            p99: format!("{:.2}", result.latency_p99),
            memory: format!("{:.1}", result.memory_mb),
        });
    }
    println!("\n\n{}", Table::new(table_rows));
}

View File

@@ -0,0 +1,400 @@
//! ANN-Benchmarks compatible benchmark suite
//!
//! Runs standard benchmarks on SIFT1M, GIST1M, and Deep1M datasets
//! compatible with http://ann-benchmarks.com format
use anyhow::{Context, Result};
use clap::Parser;
use ruvector_bench::{
calculate_recall, create_progress_bar, BenchmarkResult, DatasetGenerator, LatencyStats,
MemoryProfiler, ResultWriter, VectorDistribution,
};
use ruvector_core::{
types::{DbOptions, HnswConfig, QuantizationConfig},
DistanceMetric, SearchQuery, VectorDB, VectorEntry,
};
use std::collections::HashMap;
use std::path::PathBuf;
use std::time::Instant;
// CLI arguments for the ANN-Benchmarks-compatible suite.
// NOTE: the `///` doc comments below double as clap's --help text, so they
// are user-visible behavior and are left untouched.
#[derive(Parser)]
#[command(name = "ann-benchmark")]
#[command(about = "ANN-Benchmarks compatible testing")]
struct Args {
    /// Dataset to use: sift1m, gist1m, deep1m, or synthetic
    #[arg(short, long, default_value = "synthetic")]
    dataset: String,
    /// Number of vectors for synthetic dataset
    #[arg(short, long, default_value = "100000")]
    num_vectors: usize,
    /// Number of queries
    #[arg(short = 'q', long, default_value = "1000")]
    num_queries: usize,
    /// Vector dimensions (for synthetic)
    #[arg(short = 'd', long, default_value = "128")]
    dimensions: usize,
    /// K nearest neighbors to retrieve
    #[arg(short, long, default_value = "10")]
    k: usize,
    /// HNSW M parameter
    // Max connections per node in the HNSW graph.
    #[arg(short, long, default_value = "32")]
    m: usize,
    /// HNSW ef_construction
    #[arg(long, default_value = "200")]
    ef_construction: usize,
    /// HNSW ef_search values to test (comma-separated)
    // The benchmark is repeated once per value to trace the recall/QPS curve.
    #[arg(long, default_value = "50,100,200,400")]
    ef_search_values: String,
    /// Output directory for results
    #[arg(short, long, default_value = "bench_results")]
    output: PathBuf,
    /// Distance metric
    // One of: cosine, euclidean, dot (anything else falls back to cosine).
    #[arg(long, default_value = "cosine")]
    metric: String,
    /// Quantization: none, scalar, binary
    #[arg(long, default_value = "scalar")]
    quantization: String,
}
/// Entry point: parses CLI args, loads or generates the dataset, then runs
/// the HNSW benchmark once per requested `ef_search` value and writes the
/// JSON/CSV/markdown reports plus a console summary table.
fn main() -> Result<()> {
    let args = Args::parse();
    println!("╔════════════════════════════════════════╗");
    println!("║ Ruvector ANN-Benchmarks Suite ║");
    println!("╚════════════════════════════════════════╝\n");
    // Parse ef_search values. A malformed entry is a user error, so surface
    // it with context instead of panicking via unwrap().
    let ef_search_values = args
        .ef_search_values
        .split(',')
        .map(|s| {
            let s = s.trim();
            s.parse::<usize>()
                .with_context(|| format!("invalid ef_search value: {:?}", s))
        })
        .collect::<Result<Vec<_>>>()?;
    // Load or generate dataset
    let (vectors, queries, ground_truth) = load_dataset(&args)?;
    println!(
        "✓ Dataset loaded: {} vectors, {} queries",
        vectors.len(),
        queries.len()
    );
    let mut all_results = Vec::new();
    // Run benchmarks for each ef_search value (one result row per value).
    for &ef_search in &ef_search_values {
        println!("\n{}", "=".repeat(60));
        println!("Testing with ef_search = {}", ef_search);
        println!("{}\n", "=".repeat(60));
        let result = run_benchmark(&args, &vectors, &queries, &ground_truth, ef_search)?;
        all_results.push(result);
    }
    // Write results
    let writer = ResultWriter::new(&args.output)?;
    writer.write_json("ann_benchmark", &all_results)?;
    writer.write_csv("ann_benchmark", &all_results)?;
    writer.write_markdown_report("ann_benchmark", &all_results)?;
    // Print summary table
    print_summary_table(&all_results);
    println!(
        "\n✓ Benchmark complete! Results saved to: {}",
        args.output.display()
    );
    Ok(())
}
/// Load the requested dataset, returning `(base vectors, query vectors,
/// per-query ground-truth neighbor ids)`.
///
/// Named datasets dispatch to their loaders (which currently fall back to
/// synthetic data); anything else generates a synthetic dataset sized by
/// the CLI arguments. An unrecognized name is no longer silently treated as
/// "synthetic" — a warning is printed first.
fn load_dataset(args: &Args) -> Result<(Vec<Vec<f32>>, Vec<Vec<f32>>, Vec<Vec<String>>)> {
    match args.dataset.as_str() {
        "sift1m" => load_sift1m(),
        "gist1m" => load_gist1m(),
        "deep1m" => load_deep1m(),
        other => {
            if other != "synthetic" {
                println!("⚠ Unknown dataset '{}', generating synthetic data", other);
            }
            println!("Generating synthetic {} dataset...", args.dataset);
            let gen = DatasetGenerator::new(
                args.dimensions,
                VectorDistribution::Normal {
                    mean: 0.0,
                    std_dev: 1.0,
                },
            );
            let pb = create_progress_bar(args.num_vectors as u64, "Generating vectors");
            let vectors: Vec<Vec<f32>> = (0..args.num_vectors)
                .map(|_| {
                    pb.inc(1);
                    gen.generate(1).into_iter().next().unwrap()
                })
                .collect();
            pb.finish_with_message("✓ Vectors generated");
            let queries = gen.generate(args.num_queries);
            // Generate ground truth using brute force
            let ground_truth = compute_ground_truth(&vectors, &queries, args.k)?;
            Ok((vectors, queries, ground_truth))
        }
    }
}
/// Shared fallback used by all three real-dataset loaders while HDF5 loading
/// is unimplemented: emit a small synthetic dataset (10k base vectors, 100
/// queries, brute-force k=10 ground truth) at the dataset's native
/// dimensionality.
fn synthetic_fallback(dimensions: usize) -> Result<(Vec<Vec<f32>>, Vec<Vec<f32>>, Vec<Vec<String>>)> {
    let gen = DatasetGenerator::new(
        dimensions,
        VectorDistribution::Normal {
            mean: 0.0,
            std_dev: 1.0,
        },
    );
    let vectors = gen.generate(10000);
    let queries = gen.generate(100);
    let ground_truth = compute_ground_truth(&vectors, &queries, 10)?;
    Ok((vectors, queries, ground_truth))
}

/// SIFT1M loader (128-D). Real loading is not implemented yet.
fn load_sift1m() -> Result<(Vec<Vec<f32>>, Vec<Vec<f32>>, Vec<Vec<String>>)> {
    // TODO: Implement HDF5 loading when dataset is available
    println!("⚠ SIFT1M dataset not found, using synthetic data");
    println!("  Download SIFT1M with: scripts/download_datasets.sh");
    synthetic_fallback(128)
}

/// GIST1M loader (960-D). Real loading is not implemented yet.
fn load_gist1m() -> Result<(Vec<Vec<f32>>, Vec<Vec<f32>>, Vec<Vec<String>>)> {
    println!("⚠ GIST1M dataset not found, using synthetic data");
    synthetic_fallback(960)
}

/// Deep1M loader (96-D). Real loading is not implemented yet.
fn load_deep1m() -> Result<(Vec<Vec<f32>>, Vec<Vec<f32>>, Vec<Vec<String>>)> {
    println!("⚠ Deep1M dataset not found, using synthetic data");
    synthetic_fallback(96)
}
/// Exact brute-force k-NN under cosine distance, used as the recall baseline.
///
/// Returns, for every query, the ids (dataset indices rendered as strings,
/// matching the ids used when indexing) of its `k` true nearest neighbors in
/// ascending distance order.
///
/// Uses `f32::total_cmp` for a total order so a degenerate vector that yields
/// a NaN distance cannot panic the sort (the previous
/// `partial_cmp().unwrap()` would).
fn compute_ground_truth(
    vectors: &[Vec<f32>],
    queries: &[Vec<f32>],
    k: usize,
) -> Result<Vec<Vec<String>>> {
    println!("Computing ground truth with brute force...");
    let pb = create_progress_bar(queries.len() as u64, "Computing ground truth");
    let ground_truth: Vec<Vec<String>> = queries
        .iter()
        .map(|query| {
            pb.inc(1);
            // Distance from this query to every base vector.
            let mut distances: Vec<(usize, f32)> = vectors
                .iter()
                .enumerate()
                .map(|(idx, vec)| (idx, cosine_distance(query, vec)))
                .collect();
            // Unstable sort: ties between equal distances may reorder, which
            // is fine for recall computation, and it avoids an allocation.
            distances.sort_unstable_by(|a, b| a.1.total_cmp(&b.1));
            distances
                .iter()
                .take(k)
                .map(|(idx, _)| idx.to_string())
                .collect()
        })
        .collect();
    pb.finish_with_message("✓ Ground truth computed");
    Ok(ground_truth)
}
/// Cosine distance between two equal-length vectors: `1 - cos(a, b)`.
///
/// Returns 1.0 when either vector has zero magnitude, instead of the NaN
/// that `0.0 / 0.0` would produce — a NaN here would poison the ground-truth
/// neighbor ordering downstream.
fn cosine_distance(a: &[f32], b: &[f32]) -> f32 {
    let dot: f32 = a.iter().zip(b).map(|(x, y)| x * y).sum();
    let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
    let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
    let denom = norm_a * norm_b;
    if denom == 0.0 {
        // Similarity is undefined for a zero vector; treat it as maximally
        // distant so it never ranks as a nearest neighbor.
        return 1.0;
    }
    1.0 - (dot / denom)
}
/// Build a fresh HNSW index over `vectors` with the given `ef_search`, run
/// every query, and report throughput, latency percentiles, recall against
/// `ground_truth`, memory, and build time as one `BenchmarkResult`.
fn run_benchmark(
    args: &Args,
    vectors: &[Vec<f32>],
    queries: &[Vec<f32>],
    ground_truth: &[Vec<String>],
    ef_search: usize,
) -> Result<BenchmarkResult> {
    // Index lives in a temp dir so every ef_search run starts cold.
    let temp_dir = tempfile::tempdir()?;
    let db_path = temp_dir.path().join("bench.db");
    // Parse distance metric
    // Unrecognized names silently fall back to cosine.
    let distance_metric = match args.metric.as_str() {
        "cosine" => DistanceMetric::Cosine,
        "euclidean" => DistanceMetric::Euclidean,
        "dot" => DistanceMetric::DotProduct,
        _ => DistanceMetric::Cosine,
    };
    // Parse quantization
    // Unrecognized names silently fall back to scalar quantization.
    let quantization = match args.quantization.as_str() {
        "none" => QuantizationConfig::None,
        "scalar" => QuantizationConfig::Scalar,
        "binary" => QuantizationConfig::Binary,
        _ => QuantizationConfig::Scalar,
    };
    let dimensions = vectors[0].len();
    let options = DbOptions {
        dimensions,
        distance_metric,
        storage_path: db_path.to_str().unwrap().to_string(),
        hnsw_config: Some(HnswConfig {
            m: args.m,
            ef_construction: args.ef_construction,
            ef_search,
            // 2x headroom so the index never hits its element cap mid-run.
            max_elements: vectors.len() * 2,
        }),
        quantization: Some(quantization),
    };
    // Measure build time and memory
    let mem_profiler = MemoryProfiler::new();
    let build_start = Instant::now();
    let db = VectorDB::new(options)?;
    println!("Indexing {} vectors...", vectors.len());
    let pb = create_progress_bar(vectors.len() as u64, "Indexing");
    for (idx, vector) in vectors.iter().enumerate() {
        // The dataset index doubles as the id so results can be compared
        // against the brute-force ground truth (also keyed by index).
        let entry = VectorEntry {
            id: Some(idx.to_string()),
            vector: vector.clone(),
            metadata: None,
        };
        db.insert(entry)?;
        pb.inc(1);
    }
    pb.finish_with_message("✓ Indexing complete");
    let build_time = build_start.elapsed();
    let memory_mb = mem_profiler.current_usage_mb();
    // Run search benchmark
    println!("Running {} queries...", queries.len());
    let mut latency_stats = LatencyStats::new()?;
    let mut search_results = Vec::new();
    let search_start = Instant::now();
    let pb = create_progress_bar(queries.len() as u64, "Searching");
    for query in queries {
        // Each query is timed individually for percentile statistics.
        let query_start = Instant::now();
        let results = db.search(SearchQuery {
            vector: query.clone(),
            k: args.k,
            filter: None,
            // Per-query override of the index-level ef_search setting.
            ef_search: Some(ef_search),
        })?;
        latency_stats.record(query_start.elapsed())?;
        let result_ids: Vec<String> = results.into_iter().map(|r| r.id).collect();
        search_results.push(result_ids);
        pb.inc(1);
    }
    pb.finish_with_message("✓ Search complete");
    let total_search_time = search_start.elapsed();
    let qps = queries.len() as f64 / total_search_time.as_secs_f64();
    // Calculate recall
    // NOTE(review): only k results are retrieved per query, so recall@10 and
    // recall@100 are effectively recall@k whenever k is smaller than 10/100.
    let recall_1 = calculate_recall(&search_results, ground_truth, 1);
    let recall_10 = calculate_recall(&search_results, ground_truth, 10.min(args.k));
    let recall_100 = calculate_recall(&search_results, ground_truth, 100.min(args.k));
    // Record the HNSW/quantization configuration alongside the numbers so
    // each result row is self-describing.
    let mut metadata = HashMap::new();
    metadata.insert("m".to_string(), args.m.to_string());
    metadata.insert(
        "ef_construction".to_string(),
        args.ef_construction.to_string(),
    );
    metadata.insert("ef_search".to_string(), ef_search.to_string());
    metadata.insert("metric".to_string(), args.metric.clone());
    metadata.insert("quantization".to_string(), args.quantization.clone());
    Ok(BenchmarkResult {
        name: format!("ruvector-ef{}", ef_search),
        dataset: args.dataset.clone(),
        dimensions,
        num_vectors: vectors.len(),
        num_queries: queries.len(),
        k: args.k,
        qps,
        // Percentiles converted from seconds to milliseconds.
        latency_p50: latency_stats.percentile(0.50).as_secs_f64() * 1000.0,
        latency_p95: latency_stats.percentile(0.95).as_secs_f64() * 1000.0,
        latency_p99: latency_stats.percentile(0.99).as_secs_f64() * 1000.0,
        latency_p999: latency_stats.percentile(0.999).as_secs_f64() * 1000.0,
        recall_at_1: recall_1,
        recall_at_10: recall_10,
        recall_at_100: recall_100,
        memory_mb,
        build_time_secs: build_time.as_secs_f64(),
        metadata,
    })
}
/// Print an aligned summary table of per-`ef_search` benchmark results.
///
/// Reads the `ef_search` value back out of each result's metadata map.
fn print_summary_table(results: &[BenchmarkResult]) {
    use tabled::{Table, Tabled};
    /// One pre-formatted display row of the summary table.
    #[derive(Tabled)]
    struct ResultRow {
        #[tabled(rename = "ef_search")]
        ef_search: String,
        #[tabled(rename = "QPS")]
        qps: String,
        #[tabled(rename = "p50 (ms)")]
        p50: String,
        #[tabled(rename = "p99 (ms)")]
        p99: String,
        #[tabled(rename = "Recall@10")]
        recall: String,
        #[tabled(rename = "Memory (MB)")]
        memory: String,
    }
    let rows: Vec<ResultRow> = results
        .iter()
        .map(|r| ResultRow {
            // A result missing the `ef_search` key must not abort reporting
            // after an expensive benchmark run; show a placeholder instead
            // of panicking (the original used `.unwrap()` here).
            ef_search: r
                .metadata
                .get("ef_search")
                .cloned()
                .unwrap_or_else(|| "-".to_string()),
            qps: format!("{:.0}", r.qps),
            p50: format!("{:.2}", r.latency_p50),
            p99: format!("{:.2}", r.latency_p99),
            recall: format!("{:.2}%", r.recall_at_10 * 100.0),
            memory: format!("{:.1}", r.memory_mb),
        })
        .collect();
    println!("\n\n{}", Table::new(rows));
}

View File

@@ -0,0 +1,386 @@
//! Cross-system performance comparison benchmark
//!
//! Compares Ruvector against:
//! - Pure Python implementations (simulated)
//! - Other vector databases (placeholder for future integration)
//!
//! Documents performance improvements (target: 10-100x)
use anyhow::Result;
use clap::Parser;
use ruvector_bench::{
create_progress_bar, BenchmarkResult, DatasetGenerator, LatencyStats, MemoryProfiler,
ResultWriter, VectorDistribution,
};
use ruvector_core::types::{DbOptions, HnswConfig, QuantizationConfig};
use ruvector_core::{DistanceMetric, SearchQuery, VectorDB, VectorEntry};
use std::collections::HashMap;
use std::path::PathBuf;
use std::time::Instant;
// Command-line options for the comparison benchmark.
// NOTE: the `///` doc comments on the fields below are emitted by clap as
// `--help` text, so they are part of runtime behavior and left untouched.
#[derive(Parser)]
#[command(name = "comparison-benchmark")]
#[command(about = "Cross-system performance comparison")]
struct Args {
    /// Number of vectors
    #[arg(short, long, default_value = "50000")]
    num_vectors: usize,
    /// Number of queries
    #[arg(short, long, default_value = "1000")]
    queries: usize,
    /// Vector dimensions
    #[arg(short, long, default_value = "384")]
    dimensions: usize,
    /// Output directory
    #[arg(short, long, default_value = "bench_results")]
    output: PathBuf,
}
/// Entry point: runs four comparison scenarios (optimized Ruvector, Ruvector
/// without quantization, a simulated Python baseline, and a simulated
/// brute-force search), then writes JSON/CSV/Markdown results and prints a
/// summary table.
///
/// # Errors
/// Propagates database, benchmark, and result-writer failures.
fn main() -> Result<()> {
    let args = Args::parse();
    println!("╔════════════════════════════════════════╗");
    println!("║    Ruvector Comparison Benchmark       ║");
    println!("╚════════════════════════════════════════╝\n");
    let mut all_results = Vec::new();
    // Test 1: Ruvector (optimized)
    println!("\n{}", "=".repeat(60));
    println!("Test 1: Ruvector (SIMD + Quantization + HNSW)");
    println!("{}\n", "=".repeat(60));
    let result = bench_ruvector_optimized(&args)?;
    all_results.push(result);
    // Test 2: Ruvector (no quantization)
    println!("\n{}", "=".repeat(60));
    println!("Test 2: Ruvector (No Quantization)");
    println!("{}\n", "=".repeat(60));
    let result = bench_ruvector_no_quant(&args)?;
    all_results.push(result);
    // Test 3: Simulated Python baseline
    println!("\n{}", "=".repeat(60));
    println!("Test 3: Simulated Python Baseline");
    println!("{}\n", "=".repeat(60));
    let result = simulate_python_baseline(&args)?;
    all_results.push(result);
    // Test 4: Simulated naive brute-force
    println!("\n{}", "=".repeat(60));
    println!("Test 4: Simulated Brute-Force Search");
    println!("{}\n", "=".repeat(60));
    let result = simulate_brute_force(&args)?;
    all_results.push(result);
    // Write results in all three report formats to the output directory.
    let writer = ResultWriter::new(&args.output)?;
    writer.write_json("comparison_benchmark", &all_results)?;
    writer.write_csv("comparison_benchmark", &all_results)?;
    writer.write_markdown_report("comparison_benchmark", &all_results)?;
    print_comparison_table(&all_results);
    println!(
        "\n✓ Comparison benchmark complete! Results saved to: {}",
        args.output.display()
    );
    Ok(())
}
/// Benchmark Ruvector in its optimized configuration (scalar quantization),
/// running `args.queries` k=10 searches and measuring QPS plus latency
/// percentiles.
///
/// NOTE(review): the recall fields are hard-coded to 1.0 and memory/build
/// time to 0.0 — placeholders, not measurements; confirm before citing them.
fn bench_ruvector_optimized(args: &Args) -> Result<BenchmarkResult> {
    let (db, queries) = setup_ruvector(args, QuantizationConfig::Scalar)?;
    println!("Running {} queries...", queries.len());
    let mut latency_stats = LatencyStats::new()?;
    let pb = create_progress_bar(queries.len() as u64, "Searching");
    let search_start = Instant::now();
    for query in &queries {
        // Per-query timing feeds the percentile histogram.
        let query_start = Instant::now();
        db.search(SearchQuery {
            vector: query.clone(),
            k: 10,
            filter: None,
            ef_search: None,
        })?;
        latency_stats.record(query_start.elapsed())?;
        pb.inc(1);
    }
    pb.finish_with_message("✓ Search complete");
    let total_time = search_start.elapsed();
    // Throughput over the whole run (includes progress-bar overhead).
    let qps = queries.len() as f64 / total_time.as_secs_f64();
    Ok(BenchmarkResult {
        name: "ruvector_optimized".to_string(),
        dataset: "synthetic".to_string(),
        dimensions: args.dimensions,
        num_vectors: args.num_vectors,
        num_queries: queries.len(),
        k: 10,
        qps,
        latency_p50: latency_stats.percentile(0.50).as_secs_f64() * 1000.0,
        latency_p95: latency_stats.percentile(0.95).as_secs_f64() * 1000.0,
        latency_p99: latency_stats.percentile(0.99).as_secs_f64() * 1000.0,
        latency_p999: latency_stats.percentile(0.999).as_secs_f64() * 1000.0,
        recall_at_1: 1.0,
        recall_at_10: 1.0,
        recall_at_100: 1.0,
        memory_mb: 0.0,
        build_time_secs: 0.0,
        metadata: vec![("system".to_string(), "ruvector".to_string())]
            .into_iter()
            .collect(),
    })
}
/// Benchmark Ruvector with quantization disabled — same search loop as
/// `bench_ruvector_optimized`, differing only in `QuantizationConfig::None`.
///
/// NOTE(review): recall (1.0), memory (0.0) and build time (0.0) are
/// placeholder values, not measurements.
fn bench_ruvector_no_quant(args: &Args) -> Result<BenchmarkResult> {
    let (db, queries) = setup_ruvector(args, QuantizationConfig::None)?;
    println!("Running {} queries...", queries.len());
    let mut latency_stats = LatencyStats::new()?;
    let pb = create_progress_bar(queries.len() as u64, "Searching");
    let search_start = Instant::now();
    for query in &queries {
        let query_start = Instant::now();
        db.search(SearchQuery {
            vector: query.clone(),
            k: 10,
            filter: None,
            ef_search: None,
        })?;
        latency_stats.record(query_start.elapsed())?;
        pb.inc(1);
    }
    pb.finish_with_message("✓ Search complete");
    let total_time = search_start.elapsed();
    let qps = queries.len() as f64 / total_time.as_secs_f64();
    Ok(BenchmarkResult {
        name: "ruvector_no_quant".to_string(),
        dataset: "synthetic".to_string(),
        dimensions: args.dimensions,
        num_vectors: args.num_vectors,
        num_queries: queries.len(),
        k: 10,
        qps,
        latency_p50: latency_stats.percentile(0.50).as_secs_f64() * 1000.0,
        latency_p95: latency_stats.percentile(0.95).as_secs_f64() * 1000.0,
        latency_p99: latency_stats.percentile(0.99).as_secs_f64() * 1000.0,
        latency_p999: latency_stats.percentile(0.999).as_secs_f64() * 1000.0,
        recall_at_1: 1.0,
        recall_at_10: 1.0,
        recall_at_100: 1.0,
        memory_mb: 0.0,
        build_time_secs: 0.0,
        metadata: vec![("system".to_string(), "ruvector_no_quant".to_string())]
            .into_iter()
            .collect(),
    })
}
/// Estimate a Python/numpy baseline by running the real Ruvector searches and
/// scaling each measured latency (and the total wall time) by a fixed 15x
/// slowdown factor.
///
/// NOTE(review): the 15x factor is an assumption, not a measurement — the
/// result is an estimate, as flagged in the output and metadata.
fn simulate_python_baseline(args: &Args) -> Result<BenchmarkResult> {
    // Simulate Python numpy-based implementation
    // Estimated to be 10-20x slower based on typical Rust vs Python performance
    let (db, queries) = setup_ruvector(args, QuantizationConfig::Scalar)?;
    println!("Simulating Python baseline (estimated)...");
    let mut latency_stats = LatencyStats::new()?;
    let search_start = Instant::now();
    for query in &queries {
        let query_start = Instant::now();
        db.search(SearchQuery {
            vector: query.clone(),
            k: 10,
            filter: None,
            ef_search: None,
        })?;
        let rust_latency = query_start.elapsed();
        // Simulate Python being 15x slower
        let simulated_latency = rust_latency * 15;
        latency_stats.record(simulated_latency)?;
    }
    let total_time = search_start.elapsed() * 15; // Simulate slower execution
    let qps = queries.len() as f64 / total_time.as_secs_f64();
    println!("  (Estimated based on 15x slowdown factor)");
    Ok(BenchmarkResult {
        name: "python_baseline".to_string(),
        dataset: "synthetic".to_string(),
        dimensions: args.dimensions,
        num_vectors: args.num_vectors,
        num_queries: queries.len(),
        k: 10,
        qps,
        latency_p50: latency_stats.percentile(0.50).as_secs_f64() * 1000.0,
        latency_p95: latency_stats.percentile(0.95).as_secs_f64() * 1000.0,
        latency_p99: latency_stats.percentile(0.99).as_secs_f64() * 1000.0,
        latency_p999: latency_stats.percentile(0.999).as_secs_f64() * 1000.0,
        recall_at_1: 1.0,
        recall_at_10: 1.0,
        recall_at_100: 1.0,
        memory_mb: 0.0,
        build_time_secs: 0.0,
        metadata: vec![
            ("system".to_string(), "python_numpy".to_string()),
            ("simulated".to_string(), "true".to_string()),
        ]
        .into_iter()
        .collect(),
    })
}
/// Estimate brute-force O(n) search by scaling measured HNSW latencies by
/// `sqrt(num_vectors)`.
///
/// NOTE(review): the comment below claims "~500x" for 50K vectors, but
/// `sqrt(50_000) ≈ 224`, and sqrt(n) is not the O(log n)-vs-O(n) ratio the
/// inline comment suggests. The factor is a rough heuristic — reconcile the
/// comment and the formula before citing these numbers.
fn simulate_brute_force(args: &Args) -> Result<BenchmarkResult> {
    // Simulate naive brute-force O(n) search
    // For HNSW with 50K vectors, brute force would be ~500x slower
    let (db, queries) = setup_ruvector(args, QuantizationConfig::Scalar)?;
    println!("Simulating brute-force search (estimated)...");
    let mut latency_stats = LatencyStats::new()?;
    let slowdown_factor = (args.num_vectors as f64).sqrt() as u32; // Rough O(log n) vs O(n) ratio
    let search_start = Instant::now();
    for query in &queries {
        let query_start = Instant::now();
        db.search(SearchQuery {
            vector: query.clone(),
            k: 10,
            filter: None,
            ef_search: None,
        })?;
        let hnsw_latency = query_start.elapsed();
        // Simulate brute force being much slower
        let simulated_latency = hnsw_latency * slowdown_factor;
        latency_stats.record(simulated_latency)?;
    }
    let total_time = search_start.elapsed() * slowdown_factor;
    let qps = queries.len() as f64 / total_time.as_secs_f64();
    println!("  (Estimated with {}x slowdown factor)", slowdown_factor);
    Ok(BenchmarkResult {
        name: "brute_force".to_string(),
        dataset: "synthetic".to_string(),
        dimensions: args.dimensions,
        num_vectors: args.num_vectors,
        num_queries: queries.len(),
        k: 10,
        qps,
        latency_p50: latency_stats.percentile(0.50).as_secs_f64() * 1000.0,
        latency_p95: latency_stats.percentile(0.95).as_secs_f64() * 1000.0,
        latency_p99: latency_stats.percentile(0.99).as_secs_f64() * 1000.0,
        latency_p999: latency_stats.percentile(0.999).as_secs_f64() * 1000.0,
        recall_at_1: 1.0,
        recall_at_10: 1.0,
        recall_at_100: 1.0,
        memory_mb: 0.0,
        build_time_secs: 0.0,
        metadata: vec![
            ("system".to_string(), "brute_force".to_string()),
            ("simulated".to_string(), "true".to_string()),
            ("slowdown_factor".to_string(), slowdown_factor.to_string()),
        ]
        .into_iter()
        .collect(),
    })
}
/// Create a Ruvector database in a fresh temporary directory, index
/// `args.num_vectors` normally-distributed vectors, and generate
/// `args.queries` query vectors.
///
/// Returns the opened database and the query set.
///
/// # Errors
/// Propagates temp-dir creation, database-creation, and insert failures.
fn setup_ruvector(
    args: &Args,
    quantization: QuantizationConfig,
) -> Result<(VectorDB, Vec<Vec<f32>>)> {
    // BUG FIX: the original bound `tempfile::tempdir()` to a local `TempDir`,
    // whose Drop removes the directory when this function returns — deleting
    // the storage path out from under the `VectorDB` we hand back to the
    // caller. `into_path()` releases ownership so the directory outlives the
    // database (at the cost of not being auto-removed on exit).
    let db_dir = tempfile::tempdir()?.into_path();
    let db_path = db_dir.join("comparison.db");
    let options = DbOptions {
        dimensions: args.dimensions,
        distance_metric: DistanceMetric::Cosine,
        storage_path: db_path.to_str().unwrap().to_string(),
        hnsw_config: Some(HnswConfig::default()),
        quantization: Some(quantization),
    };
    let db = VectorDB::new(options)?;
    let gen = DatasetGenerator::new(
        args.dimensions,
        VectorDistribution::Normal {
            mean: 0.0,
            std_dev: 1.0,
        },
    );
    println!("Indexing {} vectors...", args.num_vectors);
    let pb = create_progress_bar(args.num_vectors as u64, "Indexing");
    // Generate the whole dataset in one batch instead of calling
    // `generate(1)` once per insert, avoiding per-vector generator overhead.
    for (i, vector) in gen.generate(args.num_vectors).into_iter().enumerate() {
        let entry = VectorEntry {
            id: Some(i.to_string()),
            vector,
            metadata: None,
        };
        db.insert(entry)?;
        pb.inc(1);
    }
    pb.finish_with_message("✓ Indexing complete");
    let queries = gen.generate(args.queries);
    Ok((db, queries))
}
/// Print the cross-system comparison table, with per-system speedup relative
/// to the simulated Python baseline.
fn print_comparison_table(results: &[BenchmarkResult]) {
    use tabled::{Table, Tabled};
    /// Pre-formatted display row for one benchmarked system.
    #[derive(Tabled)]
    struct ResultRow {
        #[tabled(rename = "System")]
        name: String,
        #[tabled(rename = "QPS")]
        qps: String,
        #[tabled(rename = "p50 (ms)")]
        p50: String,
        #[tabled(rename = "p99 (ms)")]
        p99: String,
        #[tabled(rename = "Speedup")]
        speedup: String,
    }
    // Speedups are expressed relative to the Python baseline's QPS; fall
    // back to 1.0 so the table still renders if that entry is absent.
    let baseline_qps = results
        .iter()
        .find(|r| r.name == "python_baseline")
        .map(|r| r.qps)
        .unwrap_or(1.0);
    let mut rows = Vec::with_capacity(results.len());
    for result in results {
        rows.push(ResultRow {
            name: result.name.clone(),
            qps: format!("{:.0}", result.qps),
            p50: format!("{:.2}", result.latency_p50),
            p99: format!("{:.2}", result.latency_p99),
            speedup: format!("{:.1}x", result.qps / baseline_qps),
        });
    }
    println!("\n\n{}", Table::new(rows));
    println!("\nNote: Python and brute-force results are simulated estimates.");
}

View File

@@ -0,0 +1,411 @@
//! Latency profiling benchmark
//!
//! Measures p50, p95, p99, p99.9 latencies under various conditions:
//! - Single-threaded vs multi-threaded
//! - Effect of efSearch on latency
//! - Effect of quantization on latency/recall tradeoff
use anyhow::Result;
use clap::Parser;
use rayon::prelude::*;
use ruvector_bench::{
create_progress_bar, BenchmarkResult, DatasetGenerator, LatencyStats, MemoryProfiler,
ResultWriter, VectorDistribution,
};
use ruvector_core::{
types::{DbOptions, HnswConfig, QuantizationConfig},
DistanceMetric, SearchQuery, VectorDB, VectorEntry,
};
use std::collections::HashMap;
use std::path::PathBuf;
use std::sync::Arc;
use std::time::Instant;
// Command-line options for the latency benchmark.
// NOTE: the `///` doc comments on the fields below are emitted by clap as
// `--help` text, so they are part of runtime behavior and left untouched.
#[derive(Parser)]
#[command(name = "latency-benchmark")]
#[command(about = "Latency profiling across different conditions")]
struct Args {
    /// Number of vectors
    #[arg(short, long, default_value = "50000")]
    num_vectors: usize,
    /// Number of queries
    #[arg(short, long, default_value = "1000")]
    queries: usize,
    /// Vector dimensions
    #[arg(short, long, default_value = "384")]
    dimensions: usize,
    /// Number of parallel threads to test
    #[arg(short, long, default_value = "1,4,8,16")]
    threads: String,
    /// Output directory
    #[arg(short, long, default_value = "bench_results")]
    output: PathBuf,
}
fn main() -> Result<()> {
let args = Args::parse();
println!("╔════════════════════════════════════════╗");
println!("║ Ruvector Latency Profiling ║");
println!("╚════════════════════════════════════════╝\n");
let mut all_results = Vec::new();
// Test 1: Single-threaded latency
println!("\n{}", "=".repeat(60));
println!("Test 1: Single-threaded Latency");
println!("{}\n", "=".repeat(60));
let result = bench_single_threaded(&args)?;
all_results.push(result);
// Test 2: Multi-threaded latency
let thread_counts: Vec<usize> = args
.threads
.split(',')
.map(|s| s.trim().parse().unwrap())
.collect();
for &num_threads in &thread_counts {
println!("\n{}", "=".repeat(60));
println!("Test 2: Multi-threaded Latency ({} threads)", num_threads);
println!("{}\n", "=".repeat(60));
let result = bench_multi_threaded(&args, num_threads)?;
all_results.push(result);
}
// Test 3: Effect of efSearch
println!("\n{}", "=".repeat(60));
println!("Test 3: Effect of efSearch on Latency");
println!("{}\n", "=".repeat(60));
let result = bench_ef_search_latency(&args)?;
all_results.extend(result);
// Test 4: Effect of quantization
println!("\n{}", "=".repeat(60));
println!("Test 4: Effect of Quantization on Latency");
println!("{}\n", "=".repeat(60));
let result = bench_quantization_latency(&args)?;
all_results.extend(result);
// Write results
let writer = ResultWriter::new(&args.output)?;
writer.write_json("latency_benchmark", &all_results)?;
writer.write_csv("latency_benchmark", &all_results)?;
writer.write_markdown_report("latency_benchmark", &all_results)?;
print_summary(&all_results);
println!(
"\n✓ Latency benchmark complete! Results saved to: {}",
args.output.display()
);
Ok(())
}
/// Measure search latency with a single client thread against a fresh
/// scalar-quantized database.
///
/// NOTE(review): recall (1.0), memory (0.0) and build time (0.0) are
/// placeholder values, not measurements.
fn bench_single_threaded(args: &Args) -> Result<BenchmarkResult> {
    let (db, queries) = setup_database(args, QuantizationConfig::Scalar)?;
    println!("Running {} queries (single-threaded)...", queries.len());
    let mut latency_stats = LatencyStats::new()?;
    let pb = create_progress_bar(queries.len() as u64, "Searching");
    let search_start = Instant::now();
    for query in &queries {
        // Per-query timing feeds the percentile histogram.
        let query_start = Instant::now();
        db.search(SearchQuery {
            vector: query.clone(),
            k: 10,
            filter: None,
            ef_search: None,
        })?;
        latency_stats.record(query_start.elapsed())?;
        pb.inc(1);
    }
    pb.finish_with_message("✓ Search complete");
    let total_time = search_start.elapsed();
    // Throughput over the whole run (includes progress-bar overhead).
    let qps = queries.len() as f64 / total_time.as_secs_f64();
    Ok(BenchmarkResult {
        name: "single_threaded".to_string(),
        dataset: "synthetic".to_string(),
        dimensions: args.dimensions,
        num_vectors: args.num_vectors,
        num_queries: queries.len(),
        k: 10,
        qps,
        latency_p50: latency_stats.percentile(0.50).as_secs_f64() * 1000.0,
        latency_p95: latency_stats.percentile(0.95).as_secs_f64() * 1000.0,
        latency_p99: latency_stats.percentile(0.99).as_secs_f64() * 1000.0,
        latency_p999: latency_stats.percentile(0.999).as_secs_f64() * 1000.0,
        recall_at_1: 1.0,
        recall_at_10: 1.0,
        recall_at_100: 1.0,
        memory_mb: 0.0,
        build_time_secs: 0.0,
        metadata: HashMap::new(),
    })
}
/// Measure search latency with `num_threads` parallel clients.
///
/// # Errors
/// Propagates database setup failures and thread-pool construction errors.
fn bench_multi_threaded(args: &Args, num_threads: usize) -> Result<BenchmarkResult> {
    let (db, queries) = setup_database(args, QuantizationConfig::Scalar)?;
    let db = Arc::new(db);
    println!(
        "Running {} queries ({} threads)...",
        queries.len(),
        num_threads
    );
    // BUG FIX: the original called `ThreadPoolBuilder::build_global()`, which
    // can only succeed once per process. Every subsequent thread count
    // silently reused the first global pool, so all "multi-threaded" runs
    // after the first measured the same parallelism. A dedicated local pool
    // guarantees each run really uses `num_threads` workers.
    let pool = rayon::ThreadPoolBuilder::new()
        .num_threads(num_threads)
        .build()?;
    let search_start = Instant::now();
    let latencies: Vec<f64> = pool.install(|| {
        queries
            .par_iter()
            .map(|query| {
                let query_start = Instant::now();
                // Search errors are deliberately ignored here; the latency of
                // the attempt is still recorded (same as the original).
                db.search(SearchQuery {
                    vector: query.clone(),
                    k: 10,
                    filter: None,
                    ef_search: None,
                })
                .ok();
                query_start.elapsed().as_secs_f64() * 1000.0
            })
            .collect()
    });
    let total_time = search_start.elapsed();
    let qps = queries.len() as f64 / total_time.as_secs_f64();
    // Compute percentiles from the sorted latencies. The index is clamped so
    // the p99.9 lookup cannot fall out of bounds for small query counts, and
    // an empty run yields 0.0 rather than panicking.
    let mut sorted_latencies = latencies;
    sorted_latencies.sort_by(|a, b| a.partial_cmp(b).unwrap());
    let percentile = |p: f64| -> f64 {
        if sorted_latencies.is_empty() {
            return 0.0;
        }
        let idx = ((sorted_latencies.len() as f64 * p) as usize).min(sorted_latencies.len() - 1);
        sorted_latencies[idx]
    };
    let p50 = percentile(0.50);
    let p95 = percentile(0.95);
    let p99 = percentile(0.99);
    let p999 = percentile(0.999);
    Ok(BenchmarkResult {
        name: format!("multi_threaded_{}", num_threads),
        dataset: "synthetic".to_string(),
        dimensions: args.dimensions,
        num_vectors: args.num_vectors,
        num_queries: queries.len(),
        k: 10,
        qps,
        latency_p50: p50,
        latency_p95: p95,
        latency_p99: p99,
        latency_p999: p999,
        recall_at_1: 1.0,
        recall_at_10: 1.0,
        recall_at_100: 1.0,
        memory_mb: 0.0,
        build_time_secs: 0.0,
        metadata: vec![("threads".to_string(), num_threads.to_string())]
            .into_iter()
            .collect(),
    })
}
/// Sweep `ef_search` values and measure the latency/throughput at each.
///
/// # Errors
/// Propagates database setup and search failures.
fn bench_ef_search_latency(args: &Args) -> Result<Vec<BenchmarkResult>> {
    let ef_values = vec![50, 100, 200, 400, 800];
    let mut results = Vec::new();
    // PERF FIX: the original rebuilt and re-indexed the entire database for
    // every efSearch value even though efSearch is a query-time parameter.
    // Building the index (and query set) once is much faster and makes the
    // ef values directly comparable on identical data.
    let (db, queries) = setup_database(args, QuantizationConfig::Scalar)?;
    for ef_search in ef_values {
        println!("Testing efSearch = {}...", ef_search);
        let mut latency_stats = LatencyStats::new()?;
        let pb = create_progress_bar(queries.len() as u64, &format!("ef={}", ef_search));
        let search_start = Instant::now();
        for query in &queries {
            let query_start = Instant::now();
            db.search(SearchQuery {
                vector: query.clone(),
                k: 10,
                filter: None,
                ef_search: Some(ef_search),
            })?;
            latency_stats.record(query_start.elapsed())?;
            pb.inc(1);
        }
        pb.finish_with_message(format!("✓ ef={} complete", ef_search));
        let total_time = search_start.elapsed();
        let qps = queries.len() as f64 / total_time.as_secs_f64();
        results.push(BenchmarkResult {
            name: format!("ef_search_{}", ef_search),
            dataset: "synthetic".to_string(),
            dimensions: args.dimensions,
            num_vectors: args.num_vectors,
            num_queries: queries.len(),
            k: 10,
            qps,
            latency_p50: latency_stats.percentile(0.50).as_secs_f64() * 1000.0,
            latency_p95: latency_stats.percentile(0.95).as_secs_f64() * 1000.0,
            latency_p99: latency_stats.percentile(0.99).as_secs_f64() * 1000.0,
            latency_p999: latency_stats.percentile(0.999).as_secs_f64() * 1000.0,
            recall_at_1: 1.0,
            recall_at_10: 1.0,
            recall_at_100: 1.0,
            memory_mb: 0.0,
            build_time_secs: 0.0,
            metadata: vec![("ef_search".to_string(), ef_search.to_string())]
                .into_iter()
                .collect(),
        });
    }
    Ok(results)
}
/// Measure search latency under each quantization mode (none/scalar/binary).
///
/// Unlike the efSearch sweep, each mode requires its own freshly built
/// database because quantization is fixed at index time.
/// NOTE(review): recall fields are hard-coded to 1.0 — placeholders, not
/// measurements of the latency/recall tradeoff the module docs mention.
fn bench_quantization_latency(args: &Args) -> Result<Vec<BenchmarkResult>> {
    let quantizations = vec![
        ("none", QuantizationConfig::None),
        ("scalar", QuantizationConfig::Scalar),
        ("binary", QuantizationConfig::Binary),
    ];
    let mut results = Vec::new();
    for (name, quant_config) in quantizations {
        println!("Testing quantization: {}...", name);
        let (db, queries) = setup_database(args, quant_config)?;
        let mut latency_stats = LatencyStats::new()?;
        let pb = create_progress_bar(queries.len() as u64, &format!("quant={}", name));
        let search_start = Instant::now();
        for query in &queries {
            let query_start = Instant::now();
            db.search(SearchQuery {
                vector: query.clone(),
                k: 10,
                filter: None,
                ef_search: None,
            })?;
            latency_stats.record(query_start.elapsed())?;
            pb.inc(1);
        }
        pb.finish_with_message(format!("{} complete", name));
        let total_time = search_start.elapsed();
        let qps = queries.len() as f64 / total_time.as_secs_f64();
        results.push(BenchmarkResult {
            name: format!("quantization_{}", name),
            dataset: "synthetic".to_string(),
            dimensions: args.dimensions,
            num_vectors: args.num_vectors,
            num_queries: queries.len(),
            k: 10,
            qps,
            latency_p50: latency_stats.percentile(0.50).as_secs_f64() * 1000.0,
            latency_p95: latency_stats.percentile(0.95).as_secs_f64() * 1000.0,
            latency_p99: latency_stats.percentile(0.99).as_secs_f64() * 1000.0,
            latency_p999: latency_stats.percentile(0.999).as_secs_f64() * 1000.0,
            recall_at_1: 1.0,
            recall_at_10: 1.0,
            recall_at_100: 1.0,
            memory_mb: 0.0,
            build_time_secs: 0.0,
            metadata: vec![("quantization".to_string(), name.to_string())]
                .into_iter()
                .collect(),
        });
    }
    Ok(results)
}
/// Create a Ruvector database in a fresh temporary directory, index
/// `args.num_vectors` normally-distributed vectors, and generate
/// `args.queries` query vectors.
///
/// Returns the opened database and the query set.
///
/// # Errors
/// Propagates temp-dir creation, database-creation, and insert failures.
fn setup_database(
    args: &Args,
    quantization: QuantizationConfig,
) -> Result<(VectorDB, Vec<Vec<f32>>)> {
    // BUG FIX: the original bound `tempfile::tempdir()` to a local `TempDir`,
    // whose Drop removes the directory when this function returns — deleting
    // the storage path out from under the `VectorDB` we hand back to the
    // caller. `into_path()` releases ownership so the directory outlives the
    // database (at the cost of not being auto-removed on exit).
    let db_dir = tempfile::tempdir()?.into_path();
    let db_path = db_dir.join("latency.db");
    let options = DbOptions {
        dimensions: args.dimensions,
        distance_metric: DistanceMetric::Cosine,
        storage_path: db_path.to_str().unwrap().to_string(),
        hnsw_config: Some(HnswConfig::default()),
        quantization: Some(quantization),
    };
    let db = VectorDB::new(options)?;
    // Generate and index data
    let gen = DatasetGenerator::new(
        args.dimensions,
        VectorDistribution::Normal {
            mean: 0.0,
            std_dev: 1.0,
        },
    );
    println!("Indexing {} vectors...", args.num_vectors);
    let pb = create_progress_bar(args.num_vectors as u64, "Indexing");
    // Generate the whole dataset in one batch instead of calling
    // `generate(1)` once per insert, avoiding per-vector generator overhead.
    for (i, vector) in gen.generate(args.num_vectors).into_iter().enumerate() {
        let entry = VectorEntry {
            id: Some(i.to_string()),
            vector,
            metadata: None,
        };
        db.insert(entry)?;
        pb.inc(1);
    }
    pb.finish_with_message("✓ Indexing complete");
    // Generate query vectors
    let queries = gen.generate(args.queries);
    Ok((db, queries))
}
/// Print the latency-benchmark summary table (one row per configuration).
fn print_summary(results: &[BenchmarkResult]) {
    use tabled::{Table, Tabled};
    /// Pre-formatted display row for one latency configuration.
    #[derive(Tabled)]
    struct ResultRow {
        #[tabled(rename = "Configuration")]
        name: String,
        #[tabled(rename = "QPS")]
        qps: String,
        #[tabled(rename = "p50 (ms)")]
        p50: String,
        #[tabled(rename = "p95 (ms)")]
        p95: String,
        #[tabled(rename = "p99 (ms)")]
        p99: String,
        #[tabled(rename = "p99.9 (ms)")]
        p999: String,
    }
    // All numeric columns use fixed precision for alignment.
    let mut rows = Vec::with_capacity(results.len());
    for result in results {
        rows.push(ResultRow {
            name: result.name.clone(),
            qps: format!("{:.0}", result.qps),
            p50: format!("{:.2}", result.latency_p50),
            p95: format!("{:.2}", result.latency_p95),
            p99: format!("{:.2}", result.latency_p99),
            p999: format!("{:.2}", result.latency_p999),
        });
    }
    println!("\n\n{}", Table::new(rows));
}

View File

@@ -0,0 +1,432 @@
//! Memory usage profiling benchmark
//!
//! Measures memory consumption at various scales and configurations:
//! - Memory usage at 10K, 100K, 1M vectors
//! - Effect of quantization on memory
//! - Index overhead measurement
use anyhow::Result;
use clap::Parser;
use ruvector_bench::{
create_progress_bar, BenchmarkResult, DatasetGenerator, MemoryProfiler, ResultWriter,
VectorDistribution,
};
use ruvector_core::{
types::{DbOptions, HnswConfig, QuantizationConfig},
DistanceMetric, VectorDB, VectorEntry,
};
use std::collections::HashMap;
use std::path::PathBuf;
use std::time::Instant;
// Command-line options for the memory benchmark.
// NOTE: the `///` doc comments on the fields below are emitted by clap as
// `--help` text, so they are part of runtime behavior and left untouched.
#[derive(Parser)]
#[command(name = "memory-benchmark")]
#[command(about = "Memory usage profiling")]
struct Args {
    /// Vector dimensions
    #[arg(short, long, default_value = "384")]
    dimensions: usize,
    /// Scales to test (comma-separated)
    #[arg(short, long, default_value = "1000,10000,100000")]
    scales: String,
    /// Output directory
    #[arg(short, long, default_value = "bench_results")]
    output: PathBuf,
}
fn main() -> Result<()> {
let args = Args::parse();
println!("╔════════════════════════════════════════╗");
println!("║ Ruvector Memory Profiling ║");
println!("╚════════════════════════════════════════╝\n");
let mut all_results = Vec::new();
// Parse scales
let scales: Vec<usize> = args
.scales
.split(',')
.map(|s| s.trim().parse().unwrap())
.collect();
// Test 1: Memory usage at different scales
for &scale in &scales {
println!("\n{}", "=".repeat(60));
println!("Test: Memory at {} vectors", scale);
println!("{}\n", "=".repeat(60));
let result = bench_memory_scale(&args, scale)?;
all_results.push(result);
}
// Test 2: Effect of quantization on memory
println!("\n{}", "=".repeat(60));
println!("Test: Effect of Quantization on Memory");
println!("{}\n", "=".repeat(60));
let results = bench_quantization_memory(&args)?;
all_results.extend(results);
// Test 3: Index overhead analysis
println!("\n{}", "=".repeat(60));
println!("Test: Index Overhead Analysis");
println!("{}\n", "=".repeat(60));
let result = bench_index_overhead(&args)?;
all_results.push(result);
// Write results
let writer = ResultWriter::new(&args.output)?;
writer.write_json("memory_benchmark", &all_results)?;
writer.write_csv("memory_benchmark", &all_results)?;
writer.write_markdown_report("memory_benchmark", &all_results)?;
print_summary(&all_results);
println!(
"\n✓ Memory benchmark complete! Results saved to: {}",
args.output.display()
);
Ok(())
}
/// Measure process memory while indexing `num_vectors` vectors, sampling the
/// profiler roughly every 10% of inserts.
///
/// NOTE(review): `overhead_ratio` divides TOTAL process memory (`final_mb`,
/// which includes the process baseline) by the raw vector payload, while
/// `memory_per_vector_kb` uses the delta (`final_mb - initial_mb`) — the two
/// metrics are on different bases; confirm this is intended.
fn bench_memory_scale(args: &Args, num_vectors: usize) -> Result<BenchmarkResult> {
    let temp_dir = tempfile::tempdir()?;
    let db_path = temp_dir.path().join("memory_scale.db");
    let options = DbOptions {
        dimensions: args.dimensions,
        distance_metric: DistanceMetric::Cosine,
        storage_path: db_path.to_str().unwrap().to_string(),
        hnsw_config: Some(HnswConfig::default()),
        quantization: Some(QuantizationConfig::Scalar),
    };
    let mem_profiler = MemoryProfiler::new();
    let initial_mb = mem_profiler.current_usage_mb();
    println!("Initial memory: {:.2} MB", initial_mb);
    println!("Indexing {} vectors...", num_vectors);
    let build_start = Instant::now();
    let db = VectorDB::new(options)?;
    let gen = DatasetGenerator::new(
        args.dimensions,
        VectorDistribution::Normal {
            mean: 0.0,
            std_dev: 1.0,
        },
    );
    let pb = create_progress_bar(num_vectors as u64, "Indexing");
    for i in 0..num_vectors {
        let entry = VectorEntry {
            id: Some(i.to_string()),
            vector: gen.generate(1).into_iter().next().unwrap(),
            metadata: None,
        };
        db.insert(entry)?;
        // Sample memory every 10%; `.max(1)` guards the modulus against
        // integer division yielding 0 when num_vectors < 10.
        if i % (num_vectors / 10).max(1) == 0 {
            let current_mb = mem_profiler.current_usage_mb();
            println!(
                "  Progress: {}%, Memory: {:.2} MB",
                (i * 100) / num_vectors,
                current_mb
            );
        }
        pb.inc(1);
    }
    pb.finish_with_message("✓ Indexing complete");
    let build_time = build_start.elapsed();
    let final_mb = mem_profiler.current_usage_mb();
    // Per-vector cost uses the memory DELTA over the run.
    let memory_per_vector_kb = (final_mb - initial_mb) * 1024.0 / num_vectors as f64;
    println!("Final memory: {:.2} MB", final_mb);
    println!("Memory per vector: {:.2} KB", memory_per_vector_kb);
    // Calculate theoretical minimum
    let vector_size_bytes = args.dimensions * 4; // 4 bytes per f32
    let theoretical_mb = (num_vectors * vector_size_bytes) as f64 / 1_048_576.0;
    let overhead_ratio = final_mb / theoretical_mb;
    println!("Theoretical minimum: {:.2} MB", theoretical_mb);
    println!("Overhead ratio: {:.2}x", overhead_ratio);
    Ok(BenchmarkResult {
        name: format!("memory_scale_{}", num_vectors),
        dataset: "synthetic".to_string(),
        dimensions: args.dimensions,
        num_vectors,
        num_queries: 0,
        k: 0,
        qps: 0.0,
        latency_p50: 0.0,
        latency_p95: 0.0,
        latency_p99: 0.0,
        latency_p999: 0.0,
        recall_at_1: 0.0,
        recall_at_10: 0.0,
        recall_at_100: 0.0,
        memory_mb: final_mb,
        build_time_secs: build_time.as_secs_f64(),
        metadata: vec![
            (
                "memory_per_vector_kb".to_string(),
                format!("{:.2}", memory_per_vector_kb),
            ),
            (
                "theoretical_mb".to_string(),
                format!("{:.2}", theoretical_mb),
            ),
            (
                "overhead_ratio".to_string(),
                format!("{:.2}", overhead_ratio),
            ),
        ]
        .into_iter()
        .collect(),
    })
}
/// Measure memory for 50K vectors under each quantization mode.
///
/// NOTE(review): `compression_ratio` divides the raw-vector size by TOTAL
/// process memory, which includes the process baseline and index overhead —
/// it understates the codec's real compression; confirm before citing.
fn bench_quantization_memory(args: &Args) -> Result<Vec<BenchmarkResult>> {
    let quantizations = vec![
        ("none", QuantizationConfig::None),
        ("scalar", QuantizationConfig::Scalar),
        ("binary", QuantizationConfig::Binary),
    ];
    // Fixed scale for this comparison; keeps runs across modes comparable.
    let num_vectors = 50_000;
    let mut results = Vec::new();
    for (name, quant_config) in quantizations {
        println!("Testing quantization: {}...", name);
        let temp_dir = tempfile::tempdir()?;
        let db_path = temp_dir.path().join("quant_memory.db");
        let options = DbOptions {
            dimensions: args.dimensions,
            distance_metric: DistanceMetric::Cosine,
            storage_path: db_path.to_str().unwrap().to_string(),
            hnsw_config: Some(HnswConfig::default()),
            quantization: Some(quant_config),
        };
        let mem_profiler = MemoryProfiler::new();
        let build_start = Instant::now();
        let db = VectorDB::new(options)?;
        let gen = DatasetGenerator::new(
            args.dimensions,
            VectorDistribution::Normal {
                mean: 0.0,
                std_dev: 1.0,
            },
        );
        let pb = create_progress_bar(num_vectors as u64, &format!("quant={}", name));
        for i in 0..num_vectors {
            let entry = VectorEntry {
                id: Some(i.to_string()),
                vector: gen.generate(1).into_iter().next().unwrap(),
                metadata: None,
            };
            db.insert(entry)?;
            pb.inc(1);
        }
        pb.finish_with_message(format!("{} complete", name));
        let build_time = build_start.elapsed();
        let memory_mb = mem_profiler.current_usage_mb();
        // Theoretical un-quantized payload: dimensions * 4 bytes per f32.
        let vector_size_bytes = args.dimensions * 4;
        let theoretical_mb = (num_vectors * vector_size_bytes) as f64 / 1_048_576.0;
        let compression_ratio = theoretical_mb / memory_mb;
        println!(
            "  Memory: {:.2} MB, Compression: {:.2}x",
            memory_mb, compression_ratio
        );
        results.push(BenchmarkResult {
            name: format!("quantization_{}", name),
            dataset: "synthetic".to_string(),
            dimensions: args.dimensions,
            num_vectors,
            num_queries: 0,
            k: 0,
            qps: 0.0,
            latency_p50: 0.0,
            latency_p95: 0.0,
            latency_p99: 0.0,
            latency_p999: 0.0,
            recall_at_1: 0.0,
            recall_at_10: 0.0,
            recall_at_100: 0.0,
            memory_mb,
            build_time_secs: build_time.as_secs_f64(),
            metadata: vec![
                ("quantization".to_string(), name.to_string()),
                (
                    "compression_ratio".to_string(),
                    format!("{:.2}", compression_ratio),
                ),
                (
                    "theoretical_mb".to_string(),
                    format!("{:.2}", theoretical_mb),
                ),
            ]
            .into_iter()
            .collect(),
        });
    }
    Ok(results)
}
/// Estimate HNSW index overhead for 100K un-quantized vectors by subtracting
/// the raw vector payload from total process memory.
///
/// NOTE(review): `total_memory_mb` is the absolute process RSS, which also
/// includes the process baseline — `index_overhead_mb` therefore overstates
/// pure index overhead; confirm whether a delta against pre-build memory was
/// intended.
fn bench_index_overhead(args: &Args) -> Result<BenchmarkResult> {
    let num_vectors = 100_000;
    println!("Analyzing index overhead for {} vectors...", num_vectors);
    let temp_dir = tempfile::tempdir()?;
    let db_path = temp_dir.path().join("overhead.db");
    let options = DbOptions {
        dimensions: args.dimensions,
        distance_metric: DistanceMetric::Cosine,
        storage_path: db_path.to_str().unwrap().to_string(),
        // Denser graph (m=32) than the defaults used elsewhere, to show the
        // overhead of a heavier index configuration.
        hnsw_config: Some(HnswConfig {
            m: 32,
            ef_construction: 200,
            ef_search: 100,
            max_elements: num_vectors * 2,
        }),
        quantization: Some(QuantizationConfig::None), // No quantization for overhead analysis
    };
    let mem_profiler = MemoryProfiler::new();
    let build_start = Instant::now();
    let db = VectorDB::new(options)?;
    let gen = DatasetGenerator::new(
        args.dimensions,
        VectorDistribution::Normal {
            mean: 0.0,
            std_dev: 1.0,
        },
    );
    let pb = create_progress_bar(num_vectors as u64, "Building index");
    for i in 0..num_vectors {
        let entry = VectorEntry {
            id: Some(i.to_string()),
            vector: gen.generate(1).into_iter().next().unwrap(),
            metadata: None,
        };
        db.insert(entry)?;
        pb.inc(1);
    }
    pb.finish_with_message("✓ Index built");
    let build_time = build_start.elapsed();
    let total_memory_mb = mem_profiler.current_usage_mb();
    // Calculate components: raw payload is dimensions * 4 bytes per f32.
    let vector_data_mb = (num_vectors * args.dimensions * 4) as f64 / 1_048_576.0;
    let index_overhead_mb = total_memory_mb - vector_data_mb;
    let overhead_percentage = (index_overhead_mb / vector_data_mb) * 100.0;
    println!("\nMemory Breakdown:");
    println!("  Vector data: {:.2} MB", vector_data_mb);
    println!(
        "  Index overhead: {:.2} MB ({:.1}%)",
        index_overhead_mb, overhead_percentage
    );
    println!("  Total: {:.2} MB", total_memory_mb);
    Ok(BenchmarkResult {
        name: "index_overhead".to_string(),
        dataset: "synthetic".to_string(),
        dimensions: args.dimensions,
        num_vectors,
        num_queries: 0,
        k: 0,
        qps: 0.0,
        latency_p50: 0.0,
        latency_p95: 0.0,
        latency_p99: 0.0,
        latency_p999: 0.0,
        recall_at_1: 0.0,
        recall_at_10: 0.0,
        recall_at_100: 0.0,
        memory_mb: total_memory_mb,
        build_time_secs: build_time.as_secs_f64(),
        metadata: vec![
            (
                "vector_data_mb".to_string(),
                format!("{:.2}", vector_data_mb),
            ),
            (
                "index_overhead_mb".to_string(),
                format!("{:.2}", index_overhead_mb),
            ),
            (
                "overhead_percentage".to_string(),
                format!("{:.1}", overhead_percentage),
            ),
        ]
        .into_iter()
        .collect(),
    })
}
/// Prints a formatted summary table of benchmark results to stdout.
///
/// Rows whose `num_vectors` is 0 show "N/A" for the vector count and the
/// per-vector memory figure.
fn print_summary(results: &[BenchmarkResult]) {
    use tabled::{Table, Tabled};
    /// One row of the printed summary table.
    #[derive(Tabled)]
    struct SummaryRow {
        #[tabled(rename = "Configuration")]
        name: String,
        #[tabled(rename = "Vectors")]
        vectors: String,
        #[tabled(rename = "Memory (MB)")]
        memory: String,
        #[tabled(rename = "Per Vector")]
        per_vector: String,
        #[tabled(rename = "Build Time (s)")]
        build_time: String,
    }
    let mut rows = Vec::with_capacity(results.len());
    for r in results {
        // Vector count and per-vector memory only make sense when the
        // benchmark actually inserted vectors.
        let (vectors, per_vector) = if r.num_vectors > 0 {
            (
                r.num_vectors.to_string(),
                format!("{:.2} KB", (r.memory_mb * 1024.0) / r.num_vectors as f64),
            )
        } else {
            ("N/A".to_string(), "N/A".to_string())
        };
        rows.push(SummaryRow {
            name: r.name.clone(),
            vectors,
            memory: format!("{:.2}", r.memory_mb),
            per_vector,
            build_time: format!("{:.2}", r.build_time_secs),
        });
    }
    println!("\n\n{}", Table::new(rows));
}

View File

@@ -0,0 +1,334 @@
//! Performance profiling benchmark with flamegraph support
//!
//! Generates:
//! - CPU flamegraphs
//! - Memory allocation profiles
//! - Lock contention analysis
//! - SIMD utilization measurement
use anyhow::Result;
use clap::Parser;
use ruvector_bench::{create_progress_bar, DatasetGenerator, MemoryProfiler, VectorDistribution};
use ruvector_core::{
types::{DbOptions, HnswConfig, QuantizationConfig},
DistanceMetric, SearchQuery, VectorDB, VectorEntry,
};
use std::path::PathBuf;
use std::time::Instant;
/// CLI arguments for the profiling benchmark.
///
/// `about` is set explicitly via the `command` attribute, so this doc
/// comment does not leak into the generated help text.
#[derive(Parser)]
#[command(name = "profiling-benchmark")]
#[command(about = "Performance profiling with flamegraph support")]
struct Args {
    /// Number of vectors
    #[arg(short, long, default_value = "100000")]
    num_vectors: usize,
    /// Number of queries
    #[arg(short, long, default_value = "10000")]
    queries: usize,
    /// Vector dimensions
    #[arg(short, long, default_value = "384")]
    dimensions: usize,
    /// Enable flamegraph generation
    #[arg(long)]
    flamegraph: bool,
    /// Output directory
    #[arg(short, long, default_value = "bench_results/profiling")]
    output: PathBuf,
}
/// Entry point: runs the indexing, search, and mixed-workload profiles and,
/// when built with the `profiling` feature and `--flamegraph` is passed,
/// wraps them in a CPU profiler session and writes a flamegraph.
fn main() -> Result<()> {
    let args = Args::parse();
    println!("╔════════════════════════════════════════╗");
    println!("║   Ruvector Performance Profiling      ║");
    println!("╚════════════════════════════════════════╝\n");
    std::fs::create_dir_all(&args.output)?;
    // Start profiling if enabled. The guard must live across all three
    // profile_* calls so the flamegraph covers the full run.
    #[cfg(feature = "profiling")]
    let guard = if args.flamegraph {
        println!("Starting CPU profiling...");
        Some(start_profiling())
    } else {
        None
    };
    // Profile 1: Indexing performance
    println!("\n{}", "=".repeat(60));
    println!("Profiling: Index Construction");
    println!("{}\n", "=".repeat(60));
    profile_indexing(&args)?;
    // Profile 2: Search performance
    println!("\n{}", "=".repeat(60));
    println!("Profiling: Search Operations");
    println!("{}\n", "=".repeat(60));
    profile_search(&args)?;
    // Profile 3: Mixed workload
    println!("\n{}", "=".repeat(60));
    println!("Profiling: Mixed Read/Write Workload");
    println!("{}\n", "=".repeat(60));
    profile_mixed_workload(&args)?;
    // Stop profiling and generate flamegraph
    #[cfg(feature = "profiling")]
    if let Some(guard) = guard {
        println!("\nGenerating flamegraph...");
        stop_profiling(guard, &args.output)?;
    }
    // When the feature is compiled out, tell the user how to enable it
    // instead of silently ignoring the flag.
    #[cfg(not(feature = "profiling"))]
    if args.flamegraph {
        println!("\n⚠ Profiling feature not enabled. Rebuild with:");
        println!("  cargo build --release --features profiling");
    }
    println!(
        "\n✓ Profiling complete! Results saved to: {}",
        args.output.display()
    );
    Ok(())
}
/// Starts a pprof CPU profiler sampling at 1 kHz.
///
/// Frames from libc/libgcc/pthread/vdso are blocklisted to keep the
/// flamegraph focused on application code.
#[cfg(feature = "profiling")]
fn start_profiling() -> pprof::ProfilerGuard<'static> {
    pprof::ProfilerGuardBuilder::default()
        .frequency(1000)
        .blocklist(&["libc", "libgcc", "pthread", "vdso"])
        .build()
        .unwrap()
}
/// Finalizes the profiler session: writes `flamegraph.svg` and a debug-text
/// `profile.txt` into `output_dir`.
///
/// Report generation is best-effort: if pprof cannot build the report we
/// print a warning and still return `Ok(())` (the original silently dropped
/// the error), while actual file I/O failures propagate via `?`.
#[cfg(feature = "profiling")]
fn stop_profiling(guard: pprof::ProfilerGuard<'static>, output_dir: &PathBuf) -> Result<()> {
    use std::fs::File;
    use std::io::Write;
    match guard.report().build() {
        Ok(report) => {
            let flamegraph_path = output_dir.join("flamegraph.svg");
            let mut file = File::create(&flamegraph_path)?;
            report.flamegraph(&mut file)?;
            println!("✓ Flamegraph saved to: {}", flamegraph_path.display());
            // Also generate a text report
            let profile_path = output_dir.join("profile.txt");
            let mut profile_file = File::create(&profile_path)?;
            writeln!(profile_file, "CPU Profile Report\n==================\n")?;
            writeln!(profile_file, "{:?}", report)?;
            println!("✓ Profile report saved to: {}", profile_path.display());
        }
        // Don't swallow the failure: the run itself succeeded, but the user
        // should know why no flamegraph appeared.
        Err(e) => eprintln!("⚠ Failed to build CPU profile report: {}", e),
    }
    Ok(())
}
/// Profiles index construction: inserts `args.num_vectors` normally
/// distributed vectors one at a time and reports total time, throughput,
/// and memory growth.
fn profile_indexing(args: &Args) -> Result<()> {
    let temp_dir = tempfile::tempdir()?;
    let db_path = temp_dir.path().join("profiling.db");
    let options = DbOptions {
        dimensions: args.dimensions,
        distance_metric: DistanceMetric::Cosine,
        storage_path: db_path.to_str().unwrap().to_string(),
        hnsw_config: Some(HnswConfig::default()),
        quantization: Some(QuantizationConfig::Scalar),
    };
    // Baseline memory snapshot and timer start BEFORE the DB is created, so
    // both measurements include index construction itself.
    let mem_profiler = MemoryProfiler::new();
    let start = Instant::now();
    let db = VectorDB::new(options)?;
    let gen = DatasetGenerator::new(
        args.dimensions,
        VectorDistribution::Normal {
            mean: 0.0,
            std_dev: 1.0,
        },
    );
    println!("Indexing {} vectors for profiling...", args.num_vectors);
    let pb = create_progress_bar(args.num_vectors as u64, "Indexing");
    for i in 0..args.num_vectors {
        // Vectors are generated per-insert (not pre-batched) so the memory
        // reading reflects the index, not a held dataset.
        let entry = VectorEntry {
            id: Some(i.to_string()),
            vector: gen.generate(1).into_iter().next().unwrap(),
            metadata: None,
        };
        db.insert(entry)?;
        pb.inc(1);
    }
    pb.finish_with_message("✓ Indexing complete");
    let elapsed = start.elapsed();
    let memory_mb = mem_profiler.current_usage_mb();
    println!("\nIndexing Performance:");
    println!("  Total time: {:.2}s", elapsed.as_secs_f64());
    println!(
        "  Throughput: {:.0} vectors/sec",
        args.num_vectors as f64 / elapsed.as_secs_f64()
    );
    println!("  Memory: {:.2} MB", memory_mb);
    Ok(())
}
/// Profiles pure search throughput: runs `args.queries` top-10 queries
/// against a pre-built database and reports total time, QPS, and mean
/// latency.
fn profile_search(args: &Args) -> Result<()> {
    let (db, query_vectors) = setup_database(args)?;
    println!("Running {} search queries for profiling...", args.queries);
    let progress = create_progress_bar(args.queries as u64, "Searching");
    let timer = Instant::now();
    for vector in query_vectors.iter() {
        let request = SearchQuery {
            vector: vector.clone(),
            k: 10,
            filter: None,
            ef_search: None,
        };
        db.search(request)?;
        progress.inc(1);
    }
    progress.finish_with_message("✓ Search complete");
    let secs = timer.elapsed().as_secs_f64();
    println!("\nSearch Performance:");
    println!("  Total time: {:.2}s", secs);
    println!("  QPS: {:.0}", args.queries as f64 / secs);
    println!(
        "  Avg latency: {:.2}ms",
        secs * 1000.0 / args.queries as f64
    );
    Ok(())
}
/// Profiles an interleaved read/write workload: `num_vectors / 10`
/// operations, deterministically split 70% inserts / 30% top-10 searches
/// (positions 0-6 of every 10 write, 7-9 read).
fn profile_mixed_workload(args: &Args) -> Result<()> {
    let temp_dir = tempfile::tempdir()?;
    let db_path = temp_dir.path().join("mixed.db");
    let options = DbOptions {
        dimensions: args.dimensions,
        distance_metric: DistanceMetric::Cosine,
        storage_path: db_path.to_str().unwrap().to_string(),
        hnsw_config: Some(HnswConfig::default()),
        quantization: Some(QuantizationConfig::Scalar),
    };
    let db = VectorDB::new(options)?;
    let gen = DatasetGenerator::new(
        args.dimensions,
        VectorDistribution::Normal {
            mean: 0.0,
            std_dev: 1.0,
        },
    );
    let num_ops = args.num_vectors / 10;
    println!(
        "Running {} mixed operations (70% writes, 30% reads)...",
        num_ops
    );
    let pb = create_progress_bar(num_ops as u64, "Processing");
    let start = Instant::now();
    let mut write_count = 0;
    let mut read_count = 0;
    for i in 0..num_ops {
        // Deterministic 7:3 write/read split keeps runs comparable.
        if i % 10 < 7 {
            // Write operation
            let entry = VectorEntry {
                id: Some(i.to_string()),
                vector: gen.generate(1).into_iter().next().unwrap(),
                metadata: None,
            };
            db.insert(entry)?;
            write_count += 1;
        } else {
            // Read operation
            let query = gen.generate(1).into_iter().next().unwrap();
            db.search(SearchQuery {
                vector: query,
                k: 10,
                filter: None,
                ef_search: None,
            })?;
            read_count += 1;
        }
        pb.inc(1);
    }
    pb.finish_with_message("✓ Mixed workload complete");
    let elapsed = start.elapsed();
    println!("\nMixed Workload Performance:");
    println!("  Total time: {:.2}s", elapsed.as_secs_f64());
    println!(
        "  Writes: {} ({:.0} writes/sec)",
        write_count,
        write_count as f64 / elapsed.as_secs_f64()
    );
    println!(
        "  Reads: {} ({:.0} reads/sec)",
        read_count,
        read_count as f64 / elapsed.as_secs_f64()
    );
    println!(
        "  Total throughput: {:.0} ops/sec",
        num_ops as f64 / elapsed.as_secs_f64()
    );
    Ok(())
}
/// Builds a scalar-quantized Cosine HNSW database populated with
/// `args.num_vectors` normally-distributed vectors and returns it together
/// with `args.queries` query vectors drawn from the same distribution.
///
/// Fix: the original bound the `TempDir` guard to a local, so the storage
/// directory was deleted when this function returned — while the returned
/// `VectorDB` still pointed at it. The directory is now persisted with
/// `into_path()` for the lifetime of the process (at the cost of not being
/// auto-cleaned, which is acceptable for a benchmark run).
fn setup_database(args: &Args) -> Result<(VectorDB, Vec<Vec<f32>>)> {
    // Persist the directory: dropping the TempDir guard here would unlink
    // the DB's storage path out from under the caller.
    let db_dir = tempfile::tempdir()?.into_path();
    let db_path = db_dir.join("search.db");
    let options = DbOptions {
        dimensions: args.dimensions,
        distance_metric: DistanceMetric::Cosine,
        storage_path: db_path.to_str().unwrap().to_string(),
        hnsw_config: Some(HnswConfig::default()),
        quantization: Some(QuantizationConfig::Scalar),
    };
    let db = VectorDB::new(options)?;
    let gen = DatasetGenerator::new(
        args.dimensions,
        VectorDistribution::Normal {
            mean: 0.0,
            std_dev: 1.0,
        },
    );
    println!("Preparing database with {} vectors...", args.num_vectors);
    let pb = create_progress_bar(args.num_vectors as u64, "Preparing");
    for i in 0..args.num_vectors {
        let entry = VectorEntry {
            id: Some(i.to_string()),
            vector: gen.generate(1).into_iter().next().unwrap(),
            metadata: None,
        };
        db.insert(entry)?;
        pb.inc(1);
    }
    pb.finish_with_message("✓ Database ready");
    let queries = gen.generate(args.queries);
    Ok((db, queries))
}

View File

@@ -0,0 +1,356 @@
//! Benchmarking utilities for Ruvector
//!
//! This module provides comprehensive benchmarking tools including:
//! - ANN-Benchmarks compatibility for standardized testing
//! - AgenticDB workload simulation
//! - Latency profiling (p50, p95, p99, p99.9)
//! - Memory usage analysis
//! - Cross-system performance comparison
//! - CPU and memory profiling with flamegraphs
use anyhow::{Context, Result};
use rand::Rng;
use rand_distr::{Distribution, Normal, Uniform};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::fs::{self, File};
use std::io::{BufReader, BufWriter, Write};
use std::path::{Path, PathBuf};
use std::time::{Duration, Instant};
/// Benchmark result for a single test
///
/// Serialized to JSON/CSV/Markdown by [`ResultWriter`]; fields that do not
/// apply to a given benchmark (e.g. recall for a memory-only run) are 0.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchmarkResult {
    /// Configuration name, used as the heading in reports.
    pub name: String,
    /// Dataset identifier (e.g. "synthetic").
    pub dataset: String,
    pub dimensions: usize,
    pub num_vectors: usize,
    /// Number of queries executed; 0 for build/memory-only benchmarks.
    pub num_queries: usize,
    /// Top-k used for search; 0 when not applicable.
    pub k: usize,
    /// Queries per second.
    pub qps: f64,
    // Latency percentiles, in milliseconds (p999 = p99.9) — see the
    // markdown writer, which labels them as ms.
    pub latency_p50: f64,
    pub latency_p95: f64,
    pub latency_p99: f64,
    pub latency_p999: f64,
    // Recall values in [0, 1]; reports multiply by 100 for display.
    pub recall_at_1: f64,
    pub recall_at_10: f64,
    pub recall_at_100: f64,
    /// Observed memory in MB (as reported by `MemoryProfiler`).
    pub memory_mb: f64,
    pub build_time_secs: f64,
    /// Free-form extra measurements, stringified key/value pairs.
    pub metadata: HashMap<String, String>,
}
/// Statistics collector using HDR histogram
///
/// Records latency samples at microsecond resolution; the histogram is
/// bounded to 1 us .. 60 s with 3 significant figures (see `new`).
pub struct LatencyStats {
    // Values are stored as whole microseconds.
    histogram: hdrhistogram::Histogram<u64>,
}
impl LatencyStats {
    /// Creates a histogram covering 1 us .. 60 s at 3 significant figures.
    pub fn new() -> Result<Self> {
        Ok(Self {
            histogram: hdrhistogram::Histogram::new_with_bounds(1, 60_000_000, 3)?,
        })
    }
    /// Records one latency sample, truncated to whole microseconds.
    pub fn record(&mut self, duration: Duration) -> Result<()> {
        self.histogram.record(duration.as_micros() as u64)?;
        Ok(())
    }
    /// Returns the latency at the given percentile (0.0..=100.0).
    pub fn percentile(&self, percentile: f64) -> Duration {
        Duration::from_micros(self.histogram.value_at_percentile(percentile))
    }
    /// Returns the arithmetic mean of all recorded latencies.
    pub fn mean(&self) -> Duration {
        Duration::from_micros(self.histogram.mean() as u64)
    }
    /// Returns the number of samples recorded so far.
    pub fn count(&self) -> u64 {
        self.histogram.len()
    }
}
impl Default for LatencyStats {
    /// Constructs with the standard bounds from [`LatencyStats::new`].
    ///
    /// The `expect` documents the invariant: the hard-coded histogram
    /// bounds are valid, so failure here is a programming error, not a
    /// runtime condition.
    fn default() -> Self {
        Self::new().expect("LatencyStats histogram bounds (1us..60s, 3 sig figs) are valid")
    }
}
/// Dataset generator for synthetic benchmarks
pub struct DatasetGenerator {
    /// Length of each generated vector.
    dimensions: usize,
    /// Distribution the component values are drawn from.
    distribution: VectorDistribution,
}
/// Shape of the synthetic data produced by [`DatasetGenerator`].
#[derive(Debug, Clone, Copy)]
pub enum VectorDistribution {
    /// Components drawn uniformly from [-1, 1).
    Uniform,
    /// Components i.i.d. normal with the given mean and standard deviation.
    Normal { mean: f32, std_dev: f32 },
    /// Each vector samples around one of `num_clusters` centers spaced
    /// 10.0 apart (see `generate_vector`).
    Clustered { num_clusters: usize },
}
impl DatasetGenerator {
    /// Creates a generator producing `dimensions`-length vectors drawn from
    /// `distribution`.
    pub fn new(dimensions: usize, distribution: VectorDistribution) -> Self {
        Self {
            dimensions,
            distribution,
        }
    }
    /// Generates `count` random vectors using the thread-local RNG.
    pub fn generate(&self, count: usize) -> Vec<Vec<f32>> {
        let mut rng = rand::thread_rng();
        let mut vectors = Vec::with_capacity(count);
        for _ in 0..count {
            vectors.push(self.generate_vector(&mut rng));
        }
        vectors
    }
    /// Draws a single vector from the configured distribution.
    fn generate_vector<R: Rng>(&self, rng: &mut R) -> Vec<f32> {
        let dims = self.dimensions;
        match self.distribution {
            VectorDistribution::Uniform => {
                let dist = Uniform::new(-1.0, 1.0);
                (0..dims).map(|_| dist.sample(rng)).collect()
            }
            VectorDistribution::Normal { mean, std_dev } => {
                let dist = Normal::new(mean, std_dev).unwrap();
                (0..dims).map(|_| dist.sample(rng)).collect()
            }
            VectorDistribution::Clustered { num_clusters } => {
                // Pick a cluster, then sample around its center; centers
                // are spaced 10.0 apart along every axis.
                let cluster = rng.gen_range(0..num_clusters);
                let dist = Normal::new(cluster as f32 * 10.0, 1.0).unwrap();
                (0..dims).map(|_| dist.sample(rng)).collect()
            }
        }
    }
    /// Normalizes `vec` to unit L2 length in place; zero vectors are left
    /// untouched to avoid dividing by zero.
    pub fn normalize_vector(vec: &mut [f32]) {
        let norm = vec.iter().map(|x| x * x).sum::<f32>().sqrt();
        if norm > 0.0 {
            vec.iter_mut().for_each(|x| *x /= norm);
        }
    }
}
/// Result writer for benchmark outputs
///
/// Writes JSON, CSV, and Markdown renderings of `BenchmarkResult`s into a
/// single output directory (created on construction).
pub struct ResultWriter {
    // All files are written as `<output_dir>/<name>.<ext>`.
    output_dir: PathBuf,
}
impl ResultWriter {
    /// Creates the writer, creating `output_dir` (and parents) if missing.
    pub fn new<P: AsRef<Path>>(output_dir: P) -> Result<Self> {
        let output_dir = output_dir.as_ref().to_path_buf();
        fs::create_dir_all(&output_dir)?;
        Ok(Self { output_dir })
    }
    /// Serializes `data` as pretty-printed JSON to `<output_dir>/<name>.json`.
    pub fn write_json<T: Serialize>(&self, name: &str, data: &T) -> Result<()> {
        let path = self.output_dir.join(format!("{}.json", name));
        let file = File::create(&path)?;
        let writer = BufWriter::new(file);
        serde_json::to_writer_pretty(writer, data)?;
        println!("✓ Written results to: {}", path.display());
        Ok(())
    }
    /// Writes results as CSV to `<output_dir>/<name>.csv` (header + one row
    /// per result; column order matches the header line below).
    pub fn write_csv(&self, name: &str, results: &[BenchmarkResult]) -> Result<()> {
        let path = self.output_dir.join(format!("{}.csv", name));
        let mut file = File::create(&path)?;
        // Write header
        writeln!(
            file,
            "name,dataset,dimensions,num_vectors,num_queries,k,qps,p50,p95,p99,p999,recall@1,recall@10,recall@100,memory_mb,build_time"
        )?;
        // Write data
        for result in results {
            writeln!(
                file,
                "{},{},{},{},{},{},{:.2},{:.2},{:.2},{:.2},{:.2},{:.4},{:.4},{:.4},{:.2},{:.2}",
                result.name,
                result.dataset,
                result.dimensions,
                result.num_vectors,
                result.num_queries,
                result.k,
                result.qps,
                result.latency_p50,
                result.latency_p95,
                result.latency_p99,
                result.latency_p999,
                result.recall_at_1,
                result.recall_at_10,
                result.recall_at_100,
                result.memory_mb,
                result.build_time_secs,
            )?;
        }
        println!("✓ Written CSV to: {}", path.display());
        Ok(())
    }
    /// Writes a human-readable Markdown report to `<output_dir>/<name>.md`,
    /// with one section per result (performance, recall, resources).
    pub fn write_markdown_report(&self, name: &str, results: &[BenchmarkResult]) -> Result<()> {
        let path = self.output_dir.join(format!("{}.md", name));
        let mut file = File::create(&path)?;
        writeln!(file, "# Ruvector Benchmark Results\n")?;
        writeln!(
            file,
            "Generated: {}\n",
            chrono::Utc::now().format("%Y-%m-%d %H:%M:%S UTC")
        )?;
        for result in results {
            writeln!(file, "## {}\n", result.name)?;
            writeln!(
                file,
                "**Dataset:** {} ({}D, {} vectors)\n",
                result.dataset, result.dimensions, result.num_vectors
            )?;
            writeln!(file, "### Performance")?;
            writeln!(file, "- **QPS:** {:.2}", result.qps)?;
            writeln!(file, "- **Latency (p50):** {:.2}ms", result.latency_p50)?;
            writeln!(file, "- **Latency (p95):** {:.2}ms", result.latency_p95)?;
            writeln!(file, "- **Latency (p99):** {:.2}ms", result.latency_p99)?;
            writeln!(file, "- **Latency (p99.9):** {:.2}ms", result.latency_p999)?;
            writeln!(file, "")?;
            writeln!(file, "### Recall")?;
            writeln!(file, "- **Recall@1:** {:.2}%", result.recall_at_1 * 100.0)?;
            writeln!(file, "- **Recall@10:** {:.2}%", result.recall_at_10 * 100.0)?;
            writeln!(
                file,
                "- **Recall@100:** {:.2}%",
                result.recall_at_100 * 100.0
            )?;
            writeln!(file, "")?;
            writeln!(file, "### Resources")?;
            writeln!(file, "- **Memory:** {:.2} MB", result.memory_mb)?;
            writeln!(file, "- **Build Time:** {:.2}s", result.build_time_secs)?;
            writeln!(file, "")?;
        }
        println!("✓ Written markdown report to: {}", path.display());
        Ok(())
    }
}
/// Memory profiler
///
/// With the `profiling` feature enabled, reports allocation growth since
/// construction using jemalloc statistics; without it, readings are 0.0.
pub struct MemoryProfiler {
    // jemalloc "allocated" bytes at construction time; later readings are
    // reported relative to this baseline.
    #[cfg(feature = "profiling")]
    initial_allocated: usize,
    // Keeps the struct non-empty (and the API identical) when the
    // profiling feature is compiled out.
    #[cfg(not(feature = "profiling"))]
    _phantom: (),
}
impl MemoryProfiler {
    /// Snapshots the current jemalloc allocation level so later readings
    /// can be reported as a delta. Without the `profiling` feature this is
    /// a no-op and [`Self::current_usage_mb`] always returns 0.0.
    pub fn new() -> Self {
        #[cfg(feature = "profiling")]
        {
            use jemalloc_ctl::{epoch, stats};
            // Advance the epoch so the stats snapshot is fresh.
            epoch::mib().unwrap().advance().unwrap();
            let allocated = stats::allocated::mib().unwrap().read().unwrap();
            Self {
                initial_allocated: allocated,
            }
        }
        #[cfg(not(feature = "profiling"))]
        {
            Self { _phantom: () }
        }
    }
    /// Returns allocation growth (in MiB) since this profiler was created.
    ///
    /// Clamped at zero via `saturating_sub`: if memory was freed since the
    /// baseline, the original unchecked `usize` subtraction would overflow
    /// (panicking in debug builds, wrapping to a huge value in release).
    pub fn current_usage_mb(&self) -> f64 {
        #[cfg(feature = "profiling")]
        {
            use jemalloc_ctl::{epoch, stats};
            epoch::mib().unwrap().advance().unwrap();
            let allocated = stats::allocated::mib().unwrap().read().unwrap();
            allocated.saturating_sub(self.initial_allocated) as f64 / 1_048_576.0
        }
        #[cfg(not(feature = "profiling"))]
        {
            0.0
        }
    }
    /// Returns `(total, used)` system memory in bytes via sysinfo.
    pub fn system_memory_info() -> Result<(u64, u64)> {
        use sysinfo::System;
        let mut sys = System::new_all();
        sys.refresh_all();
        let total = sys.total_memory();
        let used = sys.used_memory();
        Ok((total, used))
    }
}
impl Default for MemoryProfiler {
    /// Equivalent to [`MemoryProfiler::new`]: takes the baseline snapshot
    /// at the moment of construction.
    fn default() -> Self {
        Self::new()
    }
}
/// Calculate recall between search results and ground truth.
///
/// For each query, recall is |top-k results ∩ top-k truth| divided by
/// `min(k, truth.len())`; the returned value is the mean over all queries.
///
/// Edge cases (previously division by zero / NaN): returns 0.0 when
/// `results` is empty or `k == 0`, and a query whose ground-truth list is
/// empty contributes 0 to the mean.
///
/// # Panics
/// Panics if `results` and `ground_truth` have different lengths.
pub fn calculate_recall(results: &[Vec<String>], ground_truth: &[Vec<String>], k: usize) -> f64 {
    assert_eq!(results.len(), ground_truth.len());
    if results.is_empty() || k == 0 {
        return 0.0;
    }
    let mut total_recall = 0.0;
    for (result, truth) in results.iter().zip(ground_truth.iter()) {
        let denom = k.min(truth.len());
        if denom == 0 {
            // No ground truth for this query: contributes 0 recall.
            continue;
        }
        let result_set: std::collections::HashSet<_> = result.iter().take(k).collect();
        let truth_set: std::collections::HashSet<_> = truth.iter().take(k).collect();
        let intersection = result_set.intersection(&truth_set).count();
        total_recall += intersection as f64 / denom as f64;
    }
    total_recall / results.len() as f64
}
/// Progress bar helper: builds a styled `indicatif` bar of length `len`
/// labelled with `msg`.
pub fn create_progress_bar(len: u64, msg: &str) -> indicatif::ProgressBar {
    let style = indicatif::ProgressStyle::default_bar()
        .template("{msg} [{bar:40.cyan/blue}] {pos}/{len} ({eta})")
        .unwrap()
        .progress_chars("#>-");
    let bar = indicatif::ProgressBar::new(len);
    bar.set_style(style);
    bar.set_message(msg.to_string());
    bar
}
#[cfg(test)]
mod tests {
    use super::*;
    /// Generator produces the requested count and dimensionality.
    #[test]
    fn test_dataset_generator() {
        let gen = DatasetGenerator::new(128, VectorDistribution::Uniform);
        let vectors = gen.generate(100);
        assert_eq!(vectors.len(), 100);
        assert_eq!(vectors[0].len(), 128);
    }
    /// Histogram accepts sub-millisecond samples and reports a positive
    /// median for a 0..1000us ramp.
    #[test]
    fn test_latency_stats() {
        let mut stats = LatencyStats::new().unwrap();
        for i in 0..1000 {
            stats.record(Duration::from_micros(i)).unwrap();
        }
        assert!(stats.percentile(0.5).as_micros() > 0);
    }
    /// Two queries, each with 2 of 3 hits -> mean recall 2/3.
    #[test]
    fn test_recall_calculation() {
        let results = vec![
            vec!["1".to_string(), "2".to_string(), "3".to_string()],
            vec!["4".to_string(), "5".to_string(), "6".to_string()],
        ];
        let ground_truth = vec![
            vec!["1".to_string(), "2".to_string(), "7".to_string()],
            vec!["4".to_string(), "8".to_string(), "6".to_string()],
        ];
        let recall = calculate_recall(&results, &ground_truth, 3);
        assert!((recall - 0.666).abs() < 0.01);
    }
}

View File

@@ -0,0 +1,416 @@
//! Performance benchmarks for the WASM cognitive stack.
//!
//! Measures key operations against target latencies from the research:
//! - Container tick: < 200 us native
//! - SCS full recompute: < 5 ms (500 vertices)
//! - Canonical min-cut: < 1 ms (100 vertices)
//! - Witness fragment: < 50 us (64 vertices)
//!
//! Run with:
//! cargo test --test wasm_stack_bench --release -- --nocapture
use std::time::Instant;
// =========================================================================
// (a) Canonical min-cut benchmark (ruvector-mincut, feature = "canonical")
// =========================================================================
/// (a) Canonical min-cut: builds a fixed 100-vertex / ~300-edge graph,
/// times CactusGraph construction and canonical-cut extraction (100 iters
/// each, target < 1 ms combined), and verifies 100 rebuilds all produce the
/// same canonical key.
#[test]
fn bench_canonical_mincut_100v() {
    use ruvector_mincut::canonical::CactusGraph;
    use ruvector_mincut::graph::DynamicGraph;
    let graph = DynamicGraph::new();
    // Build a graph with 100 vertices and ~300 edges
    for i in 0..100u64 {
        graph.add_vertex(i);
    }
    // Ring edges (100)
    for i in 0..100u64 {
        let _ = graph.insert_edge(i, (i + 1) % 100, 1.0);
    }
    // Cross edges for richer structure (~200 more)
    for i in 0..100u64 {
        let _ = graph.insert_edge(i, (i + 37) % 100, 0.5);
        let _ = graph.insert_edge(i, (i + 73) % 100, 0.3);
    }
    // Warm up
    let _ = CactusGraph::build_from_graph(&graph);
    // --- CactusGraph construction (100 iterations) ---
    let n_iter = 100;
    let start = Instant::now();
    for _ in 0..n_iter {
        let mut cactus = CactusGraph::build_from_graph(&graph);
        cactus.root_at_lex_smallest();
        // black_box keeps the optimizer from discarding the build.
        std::hint::black_box(&cactus);
    }
    let cactus_time = start.elapsed();
    let avg_cactus_us = cactus_time.as_micros() as f64 / n_iter as f64;
    // --- Canonical cut extraction (100 iterations) ---
    let mut cactus = CactusGraph::build_from_graph(&graph);
    cactus.root_at_lex_smallest();
    println!(
        "  Cactus: {} vertices, {} edges, {} cycles",
        cactus.n_vertices,
        cactus.n_edges,
        cactus.cycles.len()
    );
    let start = Instant::now();
    for _ in 0..n_iter {
        let result = cactus.canonical_cut();
        std::hint::black_box(&result);
    }
    let cut_time = start.elapsed();
    let avg_cut_us = cut_time.as_micros() as f64 / n_iter as f64;
    // --- Determinism verification: 100 iterations produce the same result ---
    let reference = cactus.canonical_cut();
    let start = Instant::now();
    for _ in 0..100 {
        let mut c = CactusGraph::build_from_graph(&graph);
        c.root_at_lex_smallest();
        let result = c.canonical_cut();
        assert_eq!(
            result.canonical_key, reference.canonical_key,
            "Determinism violation in canonical min-cut!"
        );
    }
    let determinism_us = start.elapsed().as_micros();
    let total_us = avg_cactus_us + avg_cut_us;
    // PASS/FAIL against the < 1 ms target from the module docs.
    let status = if total_us < 1000.0 { "PASS" } else { "FAIL" };
    println!("\n=== (a) Canonical Min-Cut (100 vertices, ~300 edges) ===");
    println!(
        "  CactusGraph construction:  {:.1} us (avg of {} iters)",
        avg_cactus_us, n_iter
    );
    println!(
        "  Canonical cut extraction:  {:.1} us (avg of {} iters)",
        avg_cut_us, n_iter
    );
    println!(
        "  Total (construct + cut):   {:.1} us [target < 1000 us] [{}]",
        total_us, status
    );
    println!("  Determinism (100x verify): {} us total", determinism_us);
    println!("  Min-cut value: {:.4}", reference.value);
    println!("  Cut edges: {}", reference.cut_edges.len());
    println!(
        "  Partition sizes: {} / {}",
        reference.partition.0.len(),
        reference.partition.1.len()
    );
}
// =========================================================================
// (b) Spectral Coherence Score benchmark (ruvector-coherence)
// =========================================================================
/// (b) Spectral Coherence Score: builds a deterministic 500-node Laplacian
/// (ring + cross edges, ~1500 edges), times the full SCS recompute (20
/// iters, target < 5 ms) and single-edge incremental updates (100 iters).
#[test]
fn bench_spectral_coherence_500v() {
    use ruvector_coherence::spectral::{CsrMatrixView, SpectralConfig, SpectralTracker};
    let n = 500;
    // Build a 500-node graph: ring + deterministic cross-edges (~1500 edges)
    let mut edges: Vec<(usize, usize, f64)> = Vec::new();
    for i in 0..n {
        edges.push((i, (i + 1) % n, 1.0));
    }
    for i in 0..n {
        edges.push((i, (i + 37) % n, 0.5));
        edges.push((i, (i + 127) % n, 0.3));
    }
    let lap = CsrMatrixView::build_laplacian(n, &edges);
    let config = SpectralConfig::default();
    // Warm up
    let mut tracker = SpectralTracker::new(config.clone());
    let _ = tracker.compute(&lap);
    // --- Full SCS recompute ---
    let n_iter = 20;
    let start = Instant::now();
    for _ in 0..n_iter {
        // Fresh tracker each iteration so no cached state carries over.
        let mut t = SpectralTracker::new(config.clone());
        let score = t.compute(&lap);
        std::hint::black_box(&score);
    }
    let full_time = start.elapsed();
    let avg_full_us = full_time.as_micros() as f64 / n_iter as f64;
    let avg_full_ms = avg_full_us / 1000.0;
    // Capture one result for reporting
    let mut report_tracker = SpectralTracker::new(config.clone());
    let initial_score = report_tracker.compute(&lap);
    // --- Incremental update (single edge change) ---
    let n_incr = 100;
    let start = Instant::now();
    for i in 0..n_incr {
        report_tracker.update_edge(&lap, i % n, (i + 1) % n, 0.01);
    }
    let incr_time = start.elapsed();
    let avg_incr_us = incr_time.as_micros() as f64 / n_incr as f64;
    // PASS/FAIL against the < 5 ms target from the module docs.
    let status = if avg_full_ms < 5.0 { "PASS" } else { "FAIL" };
    println!("\n=== (b) Spectral Coherence Score (500 vertices, ~1500 edges) ===");
    println!(
        "  Full SCS recompute:   {:.2} ms (avg of {} iters) [target < 5 ms] [{}]",
        avg_full_ms, n_iter, status
    );
    println!(
        "  Incremental update:   {:.1} us (avg of {} iters)",
        avg_incr_us, n_incr
    );
    println!(
        "  Initial composite SCS: {:.6}",
        initial_score.composite
    );
    println!("  Fiedler: {:.6}", initial_score.fiedler);
    println!(
        "  Spectral gap: {:.6}",
        initial_score.spectral_gap
    );
    println!(
        "  Effective resistance: {:.6}",
        initial_score.effective_resistance
    );
    println!(
        "  Degree regularity: {:.6}",
        initial_score.degree_regularity
    );
}
// =========================================================================
// (c) Cognitive Container benchmark
// =========================================================================
/// (c) Cognitive Container: seeds a 50-edge graph, runs 100 ticks of two
/// deltas each (edge-add + observation), and reports per-tick latency
/// distribution (target avg < 200 us) plus witness-chain verification time.
///
/// Fix: the tick result was bound to an unused variable (compiler warning,
/// and the optimizer could in principle elide the result's use); it is now
/// passed through `std::hint::black_box` *after* the timed region, matching
/// the other benchmarks in this file.
#[test]
fn bench_cognitive_container_100_ticks() {
    use ruvector_cognitive_container::{
        CognitiveContainer, ContainerConfig, Delta, VerificationResult,
    };
    let config = ContainerConfig::default();
    let mut container = CognitiveContainer::new(config).expect("Failed to create container");
    // Build a base graph of 50 edges
    let init_deltas: Vec<Delta> = (0..50)
        .map(|i| Delta::EdgeAdd {
            u: i,
            v: (i + 1) % 50,
            weight: 1.0,
        })
        .collect();
    let _ = container.tick(&init_deltas);
    // --- 100 ticks with incremental updates ---
    let n_ticks = 100;
    let mut tick_times = Vec::with_capacity(n_ticks);
    let outer_start = Instant::now();
    for i in 0..n_ticks {
        let deltas = vec![
            Delta::EdgeAdd {
                u: i % 50,
                v: (i + 17) % 50,
                weight: 0.5 + (i as f64 * 0.01),
            },
            Delta::Observation {
                node: i % 50,
                value: 0.7 + (i as f64 * 0.001),
            },
        ];
        let t0 = Instant::now();
        let result = container.tick(&deltas).expect("Tick failed");
        let elapsed = t0.elapsed().as_micros() as u64;
        // Keep the result observable without including black_box in the
        // timed region.
        std::hint::black_box(&result);
        tick_times.push(elapsed);
    }
    let outer_elapsed = outer_start.elapsed();
    let avg_tick_us = tick_times.iter().sum::<u64>() as f64 / tick_times.len() as f64;
    let max_tick_us = *tick_times.iter().max().unwrap();
    let min_tick_us = *tick_times.iter().min().unwrap();
    let mut sorted_ticks = tick_times.clone();
    sorted_ticks.sort();
    let p50 = sorted_ticks[sorted_ticks.len() / 2];
    let p99 = sorted_ticks[(sorted_ticks.len() as f64 * 0.99) as usize];
    // --- Witness chain verification ---
    let verify_start = Instant::now();
    let verification = container.verify_chain();
    let verify_us = verify_start.elapsed().as_micros();
    // PASS/FAIL against the < 200 us target from the module docs.
    let status = if avg_tick_us < 200.0 { "PASS" } else { "FAIL" };
    println!("\n=== (c) Cognitive Container (100 ticks, 2 deltas each) ===");
    println!(
        "  Average tick:       {:.1} us [target < 200 us] [{}]",
        avg_tick_us, status
    );
    println!("  Median tick (p50):  {} us", p50);
    println!("  p99 tick:           {} us", p99);
    println!(
        "  Min / Max tick:     {} / {} us",
        min_tick_us, max_tick_us
    );
    println!(
        "  Total (100 ticks):  {:.2} ms",
        outer_elapsed.as_micros() as f64 / 1000.0
    );
    println!(
        "  Chain verification: {} us (chain len = {})",
        verify_us,
        container.current_epoch()
    );
    println!(
        "  Chain valid:        {}",
        matches!(verification, VerificationResult::Valid { .. })
    );
}
// =========================================================================
// (d) Canonical witness / gate-kernel benchmark
// =========================================================================
/// (d) Canonical witness fragment: times ArenaCactus construction,
/// canonical-partition extraction, and the full TileState witness path
/// (1000 iters each, target < 50 us for the witness), then checks that 100
/// independently rebuilt graphs all yield the same canonical hash.
///
/// Fix: the determinism loop previously rebuilt the reference cactus from
/// `graph` inside *every* iteration, doing redundant work and comparing two
/// fresh computations against each other; the reference hash is now
/// computed once and each rebuild is checked against it.
#[test]
fn bench_canonical_witness_64v() {
    use cognitum_gate_kernel::canonical_witness::{ArenaCactus, CanonicalWitnessFragment};
    use cognitum_gate_kernel::shard::CompactGraph;
    use cognitum_gate_kernel::TileState;
    // Build a CompactGraph with 64 vertices and ~128 edges
    let build_graph = || {
        let mut g = CompactGraph::new();
        // Ring
        for i in 0..64u16 {
            g.add_edge(i, (i + 1) % 64, 100);
        }
        // Cross edges
        for i in 0..64u16 {
            g.add_edge(i, (i + 13) % 64, 50);
        }
        g.recompute_components();
        g
    };
    let graph = build_graph();
    // Warm up
    let _ = ArenaCactus::build_from_compact_graph(&graph);
    // --- ArenaCactus construction (1000 iterations) ---
    let n_iter = 1000;
    let start = Instant::now();
    for _ in 0..n_iter {
        let cactus = ArenaCactus::build_from_compact_graph(&graph);
        std::hint::black_box(&cactus);
    }
    let cactus_time = start.elapsed();
    let avg_cactus_us = cactus_time.as_micros() as f64 / n_iter as f64;
    // --- Canonical partition extraction (1000 iterations) ---
    let cactus = ArenaCactus::build_from_compact_graph(&graph);
    let start = Instant::now();
    for _ in 0..n_iter {
        let partition = cactus.canonical_partition();
        std::hint::black_box(&partition);
    }
    let partition_time = start.elapsed();
    let avg_partition_us = partition_time.as_micros() as f64 / n_iter as f64;
    // --- Full witness fragment via TileState (1000 iterations) ---
    let mut tile = TileState::new(42);
    for i in 0..64u16 {
        tile.graph.add_edge(i, (i + 1) % 64, 100);
        tile.graph.add_edge(i, (i + 13) % 64, 50);
    }
    tile.graph.recompute_components();
    let start = Instant::now();
    for _ in 0..n_iter {
        let fragment = tile.canonical_witness();
        std::hint::black_box(&fragment);
    }
    let witness_time = start.elapsed();
    let avg_witness_us = witness_time.as_micros() as f64 / n_iter as f64;
    // --- Determinism verification ---
    let ref_fragment = tile.canonical_witness();
    // Compute the reference hash once; every independent rebuild below must
    // reproduce it exactly.
    let reference_hash = ArenaCactus::build_from_compact_graph(&graph)
        .canonical_partition()
        .canonical_hash;
    let det_start = Instant::now();
    for _ in 0..100 {
        let g = build_graph();
        let c = ArenaCactus::build_from_compact_graph(&g);
        let p = c.canonical_partition();
        assert_eq!(
            p.canonical_hash, reference_hash,
            "Gate-kernel determinism violation!"
        );
    }
    let det_us = det_start.elapsed().as_micros();
    let total_us = avg_cactus_us + avg_partition_us;
    // PASS/FAIL against the < 50 us target from the module docs.
    let status = if avg_witness_us < 50.0 {
        "PASS"
    } else {
        "FAIL"
    };
    println!("\n=== (d) Canonical Witness Fragment (64 vertices, ~128 edges) ===");
    println!(
        "  ArenaCactus construction: {:.2} us (avg of {} iters)",
        avg_cactus_us, n_iter
    );
    println!(
        "  Partition extraction:     {:.2} us (avg of {} iters)",
        avg_partition_us, n_iter
    );
    println!(
        "  Full witness fragment:    {:.2} us [target < 50 us] [{}]",
        avg_witness_us, status
    );
    println!(
        "  Fragment size:            {} bytes",
        std::mem::size_of::<CanonicalWitnessFragment>()
    );
    println!("  Cactus nodes: {}", cactus.n_nodes);
    println!("  Cut value: {}", ref_fragment.cut_value);
    println!(
        "  Cardinality A/B: {} / {}",
        ref_fragment.cardinality_a, ref_fragment.cardinality_b
    );
    println!("  Determinism (100x): {} us", det_us);
    // Keep total_us observable; it mirrors the combined-cost figure the
    // other benches print.
    std::hint::black_box(total_us);
}
// =========================================================================
// Summary report
// =========================================================================
/// Prints the target table for all four benchmarks. Named with a `z_`
/// prefix so it sorts after the measurement tests in default (alphabetical)
/// test ordering.
#[test]
fn bench_z_summary() {
    println!("\n");
    println!("================================================================");
    println!("  WASM Cognitive Stack -- Benchmark Targets                     ");
    println!("================================================================");
    println!("  Component                      Target");
    println!("  ----------------------------   ----------");
    println!("  (a) Canonical min-cut (100v)   < 1 ms");
    println!("  (b) SCS full recompute (500v)  < 5 ms");
    println!("  (c) Container tick             < 200 us");
    println!("  (d) Witness fragment (64v)     < 50 us");
    println!("================================================================");
    println!("  Run: cargo test --test wasm_stack_bench --release -- --nocapture");
    println!("================================================================");
}