Files

ruv d803bfe2b1 Squashed 'vendor/ruvector/' content from commit b64c2172

git-subtree-dir: vendor/ruvector
git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900

2026-02-28 14:39:40 -05:00

23 KiB

Raw Blame History

SONA Performance Benchmarks

Overview

This document defines performance targets, benchmark methodology, and expected results for SONA components. All benchmarks are designed to be reproducible and measurable.

Performance Targets Summary

┌─────────────────────────────────────────────────────────────────────────┐
│                      SONA Performance Targets                           │
├─────────────────────────────────────────────────────────────────────────┤
│  Component              │ Target         │ Stretch Goal  │ Unit        │
├─────────────────────────┼────────────────┼───────────────┼─────────────┤
│  Micro-LoRA forward     │ <50μs          │ <20μs         │ per request │
│  Micro-LoRA update      │ <100μs         │ <50μs         │ per signal  │
│  Base LoRA forward      │ <200μs         │ <100μs        │ per layer   │
│  Pattern extraction     │ <1s            │ <500ms        │ per 1000    │
│  Trajectory recording   │ <10μs          │ <5μs          │ per step    │
│  Background cycle       │ <30s           │ <15s          │ per cycle   │
│  Deep cycle             │ <10min         │ <5min         │ per cycle   │
│  Memory overhead        │ <100MB         │ <50MB         │ total       │
│  Pattern search         │ <1ms           │ <100μs        │ per query   │
│  Dream generation       │ <100ms         │ <50ms         │ per dream   │
└─────────────────────────────────────────────────────────────────────────┘

Micro-LoRA Benchmarks

Forward Pass Latency

Target: <50μs average, <100μs p99

// benches/micro_lora.rs
use criterion::{criterion_group, criterion_main, Criterion, BenchmarkId};

fn bench_micro_lora_forward(c: &mut Criterion) {
    let mut group = c.benchmark_group("micro_lora_forward");

    for rank in [1, 2] {
        for hidden_dim in [256, 512, 1024, 2048] {
            let lora = MicroLoRA::new(hidden_dim, rank);
            let input = vec![0.1f32; hidden_dim];
            let mut output = vec![0.0f32; hidden_dim];

            group.bench_with_input(
                BenchmarkId::new(format!("rank{}", rank), hidden_dim),
                &hidden_dim,
                |b, _| {
                    b.iter(|| {
                        output.fill(0.0);
                        unsafe { lora.forward_simd(&input, &mut output) };
                    });
                },
            );
        }
    }

    group.finish();
}

Expected Results:

Rank	Hidden Dim	AVX2 (μs)	Scalar (μs)	Speedup
1	256	3.2	12.5	3.9x
1	512	5.8	24.1	4.2x
1	1024	10.4	47.3	4.5x
1	2048	19.7	93.8	4.8x
2	256	5.1	23.4	4.6x
2	512	9.3	46.2	5.0x
2	1024	17.2	91.5	5.3x
2	2048	33.1	182.4	5.5x

Gradient Accumulation

Target: <100μs per signal

fn bench_gradient_accumulation(c: &mut Criterion) {
    let mut group = c.benchmark_group("gradient_accumulation");

    for hidden_dim in [256, 512, 1024] {
        let mut lora = MicroLoRA::new(hidden_dim, 1);
        let signal = LearningSignal {
            query_embedding: vec![0.1; hidden_dim],
            gradient_estimate: vec![0.01; hidden_dim],
            quality_score: 0.8,
            timestamp: Instant::now(),
            metadata: SignalMetadata::default(),
        };

        group.bench_with_input(
            BenchmarkId::from_parameter(hidden_dim),
            &hidden_dim,
            |b, _| {
                b.iter(|| {
                    lora.accumulate_gradient(&signal);
                });
            },
        );
    }

    group.finish();
}

Expected Results:

Hidden Dim	Time (μs)	Throughput (signals/s)
256	8.3	120,481
512	15.7	63,694
1024	30.2	33,112

Base LoRA Benchmarks

Forward Pass (Per Layer)

Target: <200μs per layer

fn bench_base_lora_forward(c: &mut Criterion) {
    let mut group = c.benchmark_group("base_lora_forward");

    for rank in [4, 8, 16] {
        for hidden_dim in [512, 1024, 2048] {
            let lora = BaseLoRA::new(hidden_dim, rank, 1);
            let input = vec![0.1f32; hidden_dim];
            let mut output = vec![0.0f32; hidden_dim];

            group.bench_with_input(
                BenchmarkId::new(format!("rank{}", rank), hidden_dim),
                &hidden_dim,
                |b, _| {
                    b.iter(|| {
                        lora.forward_layer(0, &input, &mut output);
                    });
                },
            );
        }
    }

    group.finish();
}

Expected Results:

Rank	Hidden Dim	Time (μs)	FLOPs	GFLOPS
4	512	45	4.2M	93
4	1024	85	8.4M	99
4	2048	162	16.8M	104
8	512	82	8.4M	102
8	1024	158	16.8M	106
8	2048	305	33.5M	110
16	512	155	16.8M	108
16	1024	298	33.5M	112
16	2048	582	67.1M	115

Trajectory Recording Benchmarks

Step Recording Latency

Target: <10μs per step

fn bench_trajectory_recording(c: &mut Criterion) {
    let mut group = c.benchmark_group("trajectory_recording");

    for hidden_dim in [256, 512] {
        for num_heads in [4, 8] {
            let mut builder = TrajectoryBuilder::new(1, vec![0.1; hidden_dim]);

            group.bench_with_input(
                BenchmarkId::new(format!("h{}_heads{}", hidden_dim, num_heads), hidden_dim),
                &(hidden_dim, num_heads),
                |b, &(hd, nh)| {
                    b.iter(|| {
                        builder.add_step(
                            vec![0.5; hd],
                            vec![0.1; hd * nh],
                            0.8,
                        );
                    });
                },
            );
        }
    }

    group.finish();
}

Expected Results:

Hidden Dim	Heads	Time (μs)	Memory (bytes)
256	4	2.1	5,120
256	8	3.8	9,216
512	4	3.7	10,240
512	8	6.9	18,432

Buffer Operations

Target: Lock-free with <1% contention

fn bench_trajectory_buffer(c: &mut Criterion) {
    let buffer = Arc::new(TrajectoryBuffer::new(10000));

    c.bench_function("trajectory_buffer_record", |b| {
        let trajectory = QueryTrajectory {
            id: 1,
            query_embedding: vec![0.1; 256],
            steps: vec![],
            final_quality: 0.8,
            latency_us: 1000,
        };

        b.iter(|| {
            buffer.record(trajectory.clone());
        });
    });

    c.bench_function("trajectory_buffer_drain", |b| {
        // Pre-fill buffer
        for i in 0..1000 {
            buffer.record(QueryTrajectory {
                id: i,
                query_embedding: vec![0.1; 256],
                steps: vec![],
                final_quality: 0.8,
                latency_us: 1000,
            });
        }

        b.iter(|| {
            buffer.drain()
        });
    });
}

Pattern Learning Benchmarks

K-means++ Extraction

Target: <1s for 1000 trajectories

fn bench_pattern_extraction(c: &mut Criterion) {
    let mut group = c.benchmark_group("pattern_extraction");

    for n_trajectories in [100, 500, 1000, 5000] {
        let mut bank = ReasoningBank::new(PatternConfig {
            k_clusters: 50,
            embedding_dim: 256,
            ..Default::default()
        });

        // Pre-populate
        for i in 0..n_trajectories {
            bank.add_trajectory(&generate_random_trajectory(i, 256));
        }

        group.bench_with_input(
            BenchmarkId::from_parameter(n_trajectories),
            &n_trajectories,
            |b, _| {
                b.iter(|| {
                    bank.extract_patterns()
                });
            },
        );
    }

    group.finish();
}

Expected Results:

Trajectories	Clusters	Time (ms)	Iterations
100	10	12	8
500	25	95	12
1000	50	380	15
5000	100	2,450	20

Pattern Search

Target: <1ms per query

fn bench_pattern_search(c: &mut Criterion) {
    let mut group = c.benchmark_group("pattern_search");

    for n_patterns in [1000, 10000, 100000] {
        let mut index = PatternIndex::new(256, n_patterns);

        // Pre-populate
        for i in 0..n_patterns {
            let embedding: Vec<f32> = (0..256).map(|_| rand::random()).collect();
            index.add_pattern(i as u64, &embedding).unwrap();
        }

        let query: Vec<f32> = (0..256).map(|_| rand::random()).collect();

        group.bench_with_input(
            BenchmarkId::from_parameter(n_patterns),
            &n_patterns,
            |b, _| {
                b.iter(|| {
                    index.find_similar(&query, 10)
                });
            },
        );
    }

    group.finish();
}

Expected Results (HNSW with ef=50):

Patterns	Search Time (μs)	Recall@10
1,000	45	0.98
10,000	120	0.96
100,000	350	0.94
1,000,000	850	0.92

EWC++ Benchmarks

Fisher Information Update

Target: <1ms per update

fn bench_fisher_update(c: &mut Criterion) {
    let mut group = c.benchmark_group("fisher_update");

    for param_count in [1000, 10000, 100000] {
        let mut ewc = EwcPlusPlus::new(EwcConfig {
            param_count,
            ..Default::default()
        });

        let gradients: Vec<f32> = (0..param_count).map(|_| rand::random::<f32>() * 0.01).collect();

        group.bench_with_input(
            BenchmarkId::from_parameter(param_count),
            &param_count,
            |b, _| {
                b.iter(|| {
                    ewc.update_fisher(&gradients);
                });
            },
        );
    }

    group.finish();
}

Expected Results:

Parameters	Update Time (μs)	Memory (KB)
1,000	15	8
10,000	120	80
100,000	1,150	800

Constraint Application

Target: <500μs per gradient vector

fn bench_constraint_application(c: &mut Criterion) {
    let mut group = c.benchmark_group("ewc_constraints");

    for param_count in [1000, 10000, 100000] {
        let ewc = EwcPlusPlus::new(EwcConfig {
            param_count,
            num_tasks: 5,
            ..Default::default()
        });

        // Pre-train Fisher
        for _ in 0..100 {
            let grads: Vec<f32> = (0..param_count).map(|_| rand::random::<f32>() * 0.01).collect();
            ewc.update_fisher(&grads);
        }

        let gradients: Vec<f32> = (0..param_count).map(|_| rand::random::<f32>() * 0.01).collect();

        group.bench_with_input(
            BenchmarkId::from_parameter(param_count),
            &param_count,
            |b, _| {
                b.iter(|| {
                    ewc.apply_constraints(&gradients)
                });
            },
        );
    }

    group.finish();
}

Dream Engine Benchmarks

Dream Generation

Target: <100ms per dream

fn bench_dream_generation(c: &mut Criterion) {
    let mut group = c.benchmark_group("dream_generation");

    for memory_size in [1000, 10000, 50000] {
        let mut engine = DreamEngine::new(DreamConfig::default());

        // Pre-populate memory
        for i in 0..memory_size {
            engine.add_memory_node(MemoryNode {
                id: i as u64,
                embedding: (0..256).map(|_| rand::random()).collect(),
                timestamp: Instant::now(),
                access_count: rand::random::<u32>() % 100,
                importance: rand::random(),
            });
        }

        group.bench_with_input(
            BenchmarkId::from_parameter(memory_size),
            &memory_size,
            |b, _| {
                b.iter(|| {
                    engine.generate_dream()
                });
            },
        );
    }

    group.finish();
}

Expected Results:

Memory Nodes	Dream Time (ms)	Avg Path Length
1,000	12	8
10,000	45	12
50,000	85	15

Dream Quality Evaluation

Target: <50ms per evaluation

fn bench_dream_evaluation(c: &mut Criterion) {
    let evaluator = DreamEvaluator::new(EvaluatorConfig::default());

    let dream = Dream {
        id: 1,
        path: (0..15).map(|i| MemoryNode {
            id: i,
            embedding: (0..256).map(|_| rand::random()).collect(),
            timestamp: Instant::now(),
            access_count: 10,
            importance: 0.5,
        }).collect(),
        creative_jumps: 3,
        total_novelty: 0.0,
    };

    c.bench_function("dream_evaluation", |b| {
        b.iter(|| {
            evaluator.evaluate(&dream)
        });
    });
}

Learning Loop Benchmarks

Loop A (Instant) - Per Request

Target: <1ms total overhead

fn bench_loop_a(c: &mut Criterion) {
    let loop_a = InstantLoop::new(256, InstantLoopConfig::default());

    let trajectory = QueryTrajectory {
        id: 1,
        query_embedding: vec![0.1; 256],
        steps: (0..10).map(|_| TrajectoryStep {
            activations: vec![0.5; 256],
            attention_weights: vec![0.1; 2048],
            reward: 0.8,
            timestamp: Instant::now(),
        }).collect(),
        final_quality: 0.8,
        latency_us: 50000,
    };

    c.bench_function("loop_a_on_inference", |b| {
        b.iter(|| {
            loop_a.on_inference(trajectory.clone());
        });
    });

    c.bench_function("loop_a_flush", |b| {
        // Pre-fill with signals
        for _ in 0..100 {
            loop_a.on_inference(trajectory.clone());
        }

        b.iter(|| {
            loop_a.flush_updates();
        });
    });
}

Expected Results:

Operation	Time (μs)	Notes
on_inference	650	Recording + accumulation
flush_updates	120	LoRA + edge commit
Total	770	Per request overhead

Loop B (Background) - Hourly

Target: <30s per cycle

fn bench_loop_b(c: &mut Criterion) {
    let runtime = tokio::runtime::Runtime::new().unwrap();

    let loop_b = BackgroundLoop::new(BackgroundLoopConfig::default(), 256);

    // Generate trajectories
    let trajectories: Vec<_> = (0..1000)
        .map(|i| generate_random_trajectory(i, 256))
        .collect();

    c.bench_function("loop_b_cycle", |b| {
        b.to_async(&runtime).iter(|| async {
            loop_b.run_cycle(trajectories.clone()).await
        });
    });
}

Breakdown:

Phase	Time (s)	% of Total
Trajectory ingestion	0.5	2%
Pattern extraction	8.0	32%
Gradient computation	5.0	20%
EWC++ constraints	3.0	12%
LoRA update	2.0	8%
Fisher update	4.0	16%
Metrics/logging	2.5	10%
Total	25.0	100%

Loop C (Deep) - Weekly

Target: <10min per cycle

fn bench_loop_c(c: &mut Criterion) {
    let runtime = tokio::runtime::Runtime::new().unwrap();

    let loop_c = DeepLoop::new(DeepLoopConfig::default());

    // This is a longer benchmark, run fewer iterations
    c.bench_function("loop_c_cycle", |b| {
        b.to_async(&runtime).iter(|| async {
            loop_c.run_cycle().await
        });
    });
}

Breakdown:

Phase	Time (min)	% of Total
Dream generation (50)	1.5	15%
Φ evaluation	2.0	20%
Dream integration	1.0	10%
Memory consolidation	3.0	30%
EWC++ consolidation	2.0	20%
Metrics/persistence	0.5	5%
Total	10.0	100%

Memory Benchmarks

Memory Usage by Component

fn measure_memory_usage() -> MemoryReport {
    let mut report = MemoryReport::default();

    // Micro-LoRA (rank=1, hidden=256)
    let micro_lora = MicroLoRA::new(256, 1);
    report.micro_lora = std::mem::size_of_val(&micro_lora)
        + micro_lora.down_proj.len() * 4
        + micro_lora.up_proj.len() * 4
        + micro_lora.gradient_buffer.len() * 4;

    // Base LoRA (rank=8, hidden=256, layers=12)
    let base_lora = BaseLoRA::new(256, 8, 12);
    report.base_lora = std::mem::size_of_val(&base_lora)
        + base_lora.layers.iter().map(|l|
            l.down_proj.len() * 4 + l.up_proj.len() * 4
        ).sum::<usize>();

    // Trajectory buffer (capacity=10000)
    report.trajectory_buffer = 10000 * (
        256 * 4  // query embedding
        + 10 * (256 * 4 + 2048 * 4 + 4 + 8)  // 10 steps
    );

    // Pattern index (100k patterns)
    report.pattern_index = 100000 * (256 * 4 + 64);  // embedding + metadata

    // EWC++ (100k params, 5 tasks)
    report.ewc = 100000 * 4 * 5;  // Fisher per task

    report
}

Expected Memory Usage:

Component	Size (MB)	Notes
Micro-LoRA	0.004	Minimal overhead
Base LoRA	0.6	12 layers
Trajectory Buffer	82.0	10k capacity
Pattern Index	102.4	100k patterns
EWC++ Fisher	2.0	100k params × 5 tasks
Dream Engine	12.8	50k memory nodes
Total	199.8	Peak usage

Throughput Benchmarks

End-to-End Query Throughput

fn bench_query_throughput(c: &mut Criterion) {
    let runtime = tokio::runtime::Runtime::new().unwrap();

    let sona = runtime.block_on(async {
        SonaEngine::new(SonaConfig::default()).await.unwrap()
    });

    c.bench_function("query_throughput", |b| {
        b.to_async(&runtime).iter(|| async {
            sona.process("test query", &Context::default()).await
        });
    });
}

Expected Throughput:

Scenario	QPS	Latency p50	Latency p99
Baseline (no SONA)	850	1.1ms	2.5ms
With Micro-LoRA	780	1.2ms	2.8ms
Full SONA	720	1.3ms	3.2ms

Overhead: ~15% throughput reduction for full self-learning capability.

Hardware-Specific Benchmarks

CPU Feature Detection

fn check_cpu_features() -> CpuFeatures {
    CpuFeatures {
        avx2: is_x86_feature_detected!("avx2"),
        avx512f: is_x86_feature_detected!("avx512f"),
        fma: is_x86_feature_detected!("fma"),
        sse4_1: is_x86_feature_detected!("sse4.1"),
        sse4_2: is_x86_feature_detected!("sse4.2"),
    }
}

Performance by CPU

CPU	Micro-LoRA (μs)	Pattern Search (μs)	Overall Speedup
Intel i9-13900K (AVX2)	3.2	45	4.8x
AMD Ryzen 9 7950X	3.5	48	4.5x
Apple M2 Pro (NEON)	4.1	52	3.9x
Intel Xeon Platinum	2.8	38	5.2x

Benchmark Commands

# Run all benchmarks
cargo bench --package ruvllm --features sona

# Run specific benchmark group
cargo bench --package ruvllm --bench micro_lora

# Run with specific features
cargo bench --package ruvllm --features "sona,avx2"

# Profile memory
cargo bench --package ruvllm --bench memory -- --profile-time 60

# Generate flamegraph
cargo flamegraph --bench micro_lora -- --bench

Continuous Benchmarking

CI Integration

# .github/workflows/bench.yml
name: Benchmarks

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

jobs:
  benchmark:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Run benchmarks
        run: cargo bench --package ruvllm --features sona -- --save-baseline main

      - name: Compare with baseline
        run: cargo bench --package ruvllm --features sona -- --baseline main

      - name: Upload results
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-results
          path: target/criterion

Regression Detection

// Fail CI if performance regresses by more than 10%
const MAX_REGRESSION_PERCENT: f64 = 10.0;

fn check_regression(baseline: Duration, current: Duration) -> Result<(), String> {
    let regression = (current.as_nanos() as f64 / baseline.as_nanos() as f64 - 1.0) * 100.0;

    if regression > MAX_REGRESSION_PERCENT {
        Err(format!(
            "Performance regression of {:.1}% exceeds threshold of {}%",
            regression, MAX_REGRESSION_PERCENT
        ))
    } else {
        Ok(())
    }
}

Next Steps

09-API-REFERENCE.md - Complete API documentation

23 KiB Raw Blame History Unescape Escape

SONA Performance Benchmarks

Overview

Performance Targets Summary

Micro-LoRA Benchmarks

Forward Pass Latency

Gradient Accumulation

Base LoRA Benchmarks

Forward Pass (Per Layer)

Trajectory Recording Benchmarks

Step Recording Latency

Buffer Operations

Pattern Learning Benchmarks

K-means++ Extraction

Pattern Search

EWC++ Benchmarks

Fisher Information Update

Constraint Application

Dream Engine Benchmarks

Dream Generation

Dream Quality Evaluation

Learning Loop Benchmarks

Loop A (Instant) - Per Request

Loop B (Background) - Hourly

Loop C (Deep) - Weekly

Memory Benchmarks

Memory Usage by Component

Throughput Benchmarks

End-to-End Query Throughput

Hardware-Specific Benchmarks

CPU Feature Detection

Performance by CPU

Benchmark Commands

Continuous Benchmarking

CI Integration

Regression Detection

Next Steps

23 KiB

Raw Blame History