23 KiB
SONA Performance Benchmarks
Overview
This document defines performance targets, benchmark methodology, and expected results for SONA components. All benchmarks are designed to be reproducible and measurable.
Performance Targets Summary
┌─────────────────────────────────────────────────────────────────────────┐
│ SONA Performance Targets │
├─────────────────────────────────────────────────────────────────────────┤
│ Component │ Target │ Stretch Goal │ Unit │
├─────────────────────────┼────────────────┼───────────────┼─────────────┤
│ Micro-LoRA forward │ <50μs │ <20μs │ per request │
│ Micro-LoRA update │ <100μs │ <50μs │ per signal │
│ Base LoRA forward │ <200μs │ <100μs │ per layer │
│ Pattern extraction │ <1s │ <500ms │ per 1000 │
│ Trajectory recording │ <10μs │ <5μs │ per step │
│ Background cycle │ <30s │ <15s │ per cycle │
│ Deep cycle │ <10min │ <5min │ per cycle │
│ Memory overhead │ <100MB │ <50MB │ total │
│ Pattern search │ <1ms │ <100μs │ per query │
│ Dream generation │ <100ms │ <50ms │ per dream │
└─────────────────────────────────────────────────────────────────────────┘
Micro-LoRA Benchmarks
Forward Pass Latency
Target: <50μs average, <100μs p99
// benches/micro_lora.rs
use criterion::{criterion_group, criterion_main, Criterion, BenchmarkId};
fn bench_micro_lora_forward(c: &mut Criterion) {
let mut group = c.benchmark_group("micro_lora_forward");
for rank in [1, 2] {
for hidden_dim in [256, 512, 1024, 2048] {
let lora = MicroLoRA::new(hidden_dim, rank);
let input = vec![0.1f32; hidden_dim];
let mut output = vec![0.0f32; hidden_dim];
group.bench_with_input(
BenchmarkId::new(format!("rank{}", rank), hidden_dim),
&hidden_dim,
|b, _| {
b.iter(|| {
output.fill(0.0);
unsafe { lora.forward_simd(&input, &mut output) };
});
},
);
}
}
group.finish();
}
Expected Results:
| Rank | Hidden Dim | AVX2 (μs) | Scalar (μs) | Speedup |
|---|---|---|---|---|
| 1 | 256 | 3.2 | 12.5 | 3.9x |
| 1 | 512 | 5.8 | 24.1 | 4.2x |
| 1 | 1024 | 10.4 | 47.3 | 4.5x |
| 1 | 2048 | 19.7 | 93.8 | 4.8x |
| 2 | 256 | 5.1 | 23.4 | 4.6x |
| 2 | 512 | 9.3 | 46.2 | 5.0x |
| 2 | 1024 | 17.2 | 91.5 | 5.3x |
| 2 | 2048 | 33.1 | 182.4 | 5.5x |
Gradient Accumulation
Target: <100μs per signal
fn bench_gradient_accumulation(c: &mut Criterion) {
let mut group = c.benchmark_group("gradient_accumulation");
for hidden_dim in [256, 512, 1024] {
let mut lora = MicroLoRA::new(hidden_dim, 1);
let signal = LearningSignal {
query_embedding: vec![0.1; hidden_dim],
gradient_estimate: vec![0.01; hidden_dim],
quality_score: 0.8,
timestamp: Instant::now(),
metadata: SignalMetadata::default(),
};
group.bench_with_input(
BenchmarkId::from_parameter(hidden_dim),
&hidden_dim,
|b, _| {
b.iter(|| {
lora.accumulate_gradient(&signal);
});
},
);
}
group.finish();
}
Expected Results:
| Hidden Dim | Time (μs) | Throughput (signals/s) |
|---|---|---|
| 256 | 8.3 | 120,481 |
| 512 | 15.7 | 63,694 |
| 1024 | 30.2 | 33,112 |
Base LoRA Benchmarks
Forward Pass (Per Layer)
Target: <200μs per layer
fn bench_base_lora_forward(c: &mut Criterion) {
let mut group = c.benchmark_group("base_lora_forward");
for rank in [4, 8, 16] {
for hidden_dim in [512, 1024, 2048] {
let lora = BaseLoRA::new(hidden_dim, rank, 1);
let input = vec![0.1f32; hidden_dim];
let mut output = vec![0.0f32; hidden_dim];
group.bench_with_input(
BenchmarkId::new(format!("rank{}", rank), hidden_dim),
&hidden_dim,
|b, _| {
b.iter(|| {
lora.forward_layer(0, &input, &mut output);
});
},
);
}
}
group.finish();
}
Expected Results:
| Rank | Hidden Dim | Time (μs) | FLOPs | GFLOPS |
|---|---|---|---|---|
| 4 | 512 | 45 | 4.2M | 93 |
| 4 | 1024 | 85 | 8.4M | 99 |
| 4 | 2048 | 162 | 16.8M | 104 |
| 8 | 512 | 82 | 8.4M | 102 |
| 8 | 1024 | 158 | 16.8M | 106 |
| 8 | 2048 | 305 | 33.5M | 110 |
| 16 | 512 | 155 | 16.8M | 108 |
| 16 | 1024 | 298 | 33.5M | 112 |
| 16 | 2048 | 582 | 67.1M | 115 |
Trajectory Recording Benchmarks
Step Recording Latency
Target: <10μs per step
fn bench_trajectory_recording(c: &mut Criterion) {
let mut group = c.benchmark_group("trajectory_recording");
for hidden_dim in [256, 512] {
for num_heads in [4, 8] {
let mut builder = TrajectoryBuilder::new(1, vec![0.1; hidden_dim]);
group.bench_with_input(
BenchmarkId::new(format!("h{}_heads{}", hidden_dim, num_heads), hidden_dim),
&(hidden_dim, num_heads),
|b, &(hd, nh)| {
b.iter(|| {
builder.add_step(
vec![0.5; hd],
vec![0.1; hd * nh],
0.8,
);
});
},
);
}
}
group.finish();
}
Expected Results:
| Hidden Dim | Heads | Time (μs) | Memory (bytes) |
|---|---|---|---|
| 256 | 4 | 2.1 | 5,120 |
| 256 | 8 | 3.8 | 9,216 |
| 512 | 4 | 3.7 | 10,240 |
| 512 | 8 | 6.9 | 18,432 |
Buffer Operations
Target: Lock-free with <1% contention
fn bench_trajectory_buffer(c: &mut Criterion) {
let buffer = Arc::new(TrajectoryBuffer::new(10000));
c.bench_function("trajectory_buffer_record", |b| {
let trajectory = QueryTrajectory {
id: 1,
query_embedding: vec![0.1; 256],
steps: vec![],
final_quality: 0.8,
latency_us: 1000,
};
b.iter(|| {
buffer.record(trajectory.clone());
});
});
c.bench_function("trajectory_buffer_drain", |b| {
// Pre-fill buffer
for i in 0..1000 {
buffer.record(QueryTrajectory {
id: i,
query_embedding: vec![0.1; 256],
steps: vec![],
final_quality: 0.8,
latency_us: 1000,
});
}
b.iter(|| {
buffer.drain()
});
});
}
Pattern Learning Benchmarks
K-means++ Extraction
Target: <1s for 1000 trajectories
fn bench_pattern_extraction(c: &mut Criterion) {
let mut group = c.benchmark_group("pattern_extraction");
for n_trajectories in [100, 500, 1000, 5000] {
let mut bank = ReasoningBank::new(PatternConfig {
k_clusters: 50,
embedding_dim: 256,
..Default::default()
});
// Pre-populate
for i in 0..n_trajectories {
bank.add_trajectory(&generate_random_trajectory(i, 256));
}
group.bench_with_input(
BenchmarkId::from_parameter(n_trajectories),
&n_trajectories,
|b, _| {
b.iter(|| {
bank.extract_patterns()
});
},
);
}
group.finish();
}
Expected Results:
| Trajectories | Clusters | Time (ms) | Iterations |
|---|---|---|---|
| 100 | 10 | 12 | 8 |
| 500 | 25 | 95 | 12 |
| 1000 | 50 | 380 | 15 |
| 5000 | 100 | 2,450 | 20 |
Pattern Search
Target: <1ms per query
fn bench_pattern_search(c: &mut Criterion) {
let mut group = c.benchmark_group("pattern_search");
for n_patterns in [1000, 10000, 100000] {
let mut index = PatternIndex::new(256, n_patterns);
// Pre-populate
for i in 0..n_patterns {
let embedding: Vec<f32> = (0..256).map(|_| rand::random()).collect();
index.add_pattern(i as u64, &embedding).unwrap();
}
let query: Vec<f32> = (0..256).map(|_| rand::random()).collect();
group.bench_with_input(
BenchmarkId::from_parameter(n_patterns),
&n_patterns,
|b, _| {
b.iter(|| {
index.find_similar(&query, 10)
});
},
);
}
group.finish();
}
Expected Results (HNSW with ef=50):
| Patterns | Search Time (μs) | Recall@10 |
|---|---|---|
| 1,000 | 45 | 0.98 |
| 10,000 | 120 | 0.96 |
| 100,000 | 350 | 0.94 |
| 1,000,000 | 850 | 0.92 |
EWC++ Benchmarks
Fisher Information Update
Target: <1ms per update
fn bench_fisher_update(c: &mut Criterion) {
let mut group = c.benchmark_group("fisher_update");
for param_count in [1000, 10000, 100000] {
let mut ewc = EwcPlusPlus::new(EwcConfig {
param_count,
..Default::default()
});
let gradients: Vec<f32> = (0..param_count).map(|_| rand::random::<f32>() * 0.01).collect();
group.bench_with_input(
BenchmarkId::from_parameter(param_count),
¶m_count,
|b, _| {
b.iter(|| {
ewc.update_fisher(&gradients);
});
},
);
}
group.finish();
}
Expected Results:
| Parameters | Update Time (μs) | Memory (KB) |
|---|---|---|
| 1,000 | 15 | 8 |
| 10,000 | 120 | 80 |
| 100,000 | 1,150 | 800 |
Constraint Application
Target: <500μs per gradient vector
fn bench_constraint_application(c: &mut Criterion) {
let mut group = c.benchmark_group("ewc_constraints");
for param_count in [1000, 10000, 100000] {
let ewc = EwcPlusPlus::new(EwcConfig {
param_count,
num_tasks: 5,
..Default::default()
});
// Pre-train Fisher
for _ in 0..100 {
let grads: Vec<f32> = (0..param_count).map(|_| rand::random::<f32>() * 0.01).collect();
ewc.update_fisher(&grads);
}
let gradients: Vec<f32> = (0..param_count).map(|_| rand::random::<f32>() * 0.01).collect();
group.bench_with_input(
BenchmarkId::from_parameter(param_count),
¶m_count,
|b, _| {
b.iter(|| {
ewc.apply_constraints(&gradients)
});
},
);
}
group.finish();
}
Dream Engine Benchmarks
Dream Generation
Target: <100ms per dream
fn bench_dream_generation(c: &mut Criterion) {
let mut group = c.benchmark_group("dream_generation");
for memory_size in [1000, 10000, 50000] {
let mut engine = DreamEngine::new(DreamConfig::default());
// Pre-populate memory
for i in 0..memory_size {
engine.add_memory_node(MemoryNode {
id: i as u64,
embedding: (0..256).map(|_| rand::random()).collect(),
timestamp: Instant::now(),
access_count: rand::random::<u32>() % 100,
importance: rand::random(),
});
}
group.bench_with_input(
BenchmarkId::from_parameter(memory_size),
&memory_size,
|b, _| {
b.iter(|| {
engine.generate_dream()
});
},
);
}
group.finish();
}
Expected Results:
| Memory Nodes | Dream Time (ms) | Avg Path Length |
|---|---|---|
| 1,000 | 12 | 8 |
| 10,000 | 45 | 12 |
| 50,000 | 85 | 15 |
Dream Quality Evaluation
Target: <50ms per evaluation
fn bench_dream_evaluation(c: &mut Criterion) {
let evaluator = DreamEvaluator::new(EvaluatorConfig::default());
let dream = Dream {
id: 1,
path: (0..15).map(|i| MemoryNode {
id: i,
embedding: (0..256).map(|_| rand::random()).collect(),
timestamp: Instant::now(),
access_count: 10,
importance: 0.5,
}).collect(),
creative_jumps: 3,
total_novelty: 0.0,
};
c.bench_function("dream_evaluation", |b| {
b.iter(|| {
evaluator.evaluate(&dream)
});
});
}
Learning Loop Benchmarks
Loop A (Instant) - Per Request
Target: <1ms total overhead
fn bench_loop_a(c: &mut Criterion) {
let loop_a = InstantLoop::new(256, InstantLoopConfig::default());
let trajectory = QueryTrajectory {
id: 1,
query_embedding: vec![0.1; 256],
steps: (0..10).map(|_| TrajectoryStep {
activations: vec![0.5; 256],
attention_weights: vec![0.1; 2048],
reward: 0.8,
timestamp: Instant::now(),
}).collect(),
final_quality: 0.8,
latency_us: 50000,
};
c.bench_function("loop_a_on_inference", |b| {
b.iter(|| {
loop_a.on_inference(trajectory.clone());
});
});
c.bench_function("loop_a_flush", |b| {
// Pre-fill with signals
for _ in 0..100 {
loop_a.on_inference(trajectory.clone());
}
b.iter(|| {
loop_a.flush_updates();
});
});
}
Expected Results:
| Operation | Time (μs) | Notes |
|---|---|---|
| on_inference | 650 | Recording + accumulation |
| flush_updates | 120 | LoRA + edge commit |
| Total | 770 | Per request overhead |
Loop B (Background) - Hourly
Target: <30s per cycle
fn bench_loop_b(c: &mut Criterion) {
let runtime = tokio::runtime::Runtime::new().unwrap();
let loop_b = BackgroundLoop::new(BackgroundLoopConfig::default(), 256);
// Generate trajectories
let trajectories: Vec<_> = (0..1000)
.map(|i| generate_random_trajectory(i, 256))
.collect();
c.bench_function("loop_b_cycle", |b| {
b.to_async(&runtime).iter(|| async {
loop_b.run_cycle(trajectories.clone()).await
});
});
}
Breakdown:
| Phase | Time (s) | % of Total |
|---|---|---|
| Trajectory ingestion | 0.5 | 2% |
| Pattern extraction | 8.0 | 32% |
| Gradient computation | 5.0 | 20% |
| EWC++ constraints | 3.0 | 12% |
| LoRA update | 2.0 | 8% |
| Fisher update | 4.0 | 16% |
| Metrics/logging | 2.5 | 10% |
| Total | 25.0 | 100% |
Loop C (Deep) - Weekly
Target: <10min per cycle
fn bench_loop_c(c: &mut Criterion) {
let runtime = tokio::runtime::Runtime::new().unwrap();
let loop_c = DeepLoop::new(DeepLoopConfig::default());
// This is a longer benchmark, run fewer iterations
c.bench_function("loop_c_cycle", |b| {
b.to_async(&runtime).iter(|| async {
loop_c.run_cycle().await
});
});
}
Breakdown:
| Phase | Time (min) | % of Total |
|---|---|---|
| Dream generation (50) | 1.5 | 15% |
| Φ evaluation | 2.0 | 20% |
| Dream integration | 1.0 | 10% |
| Memory consolidation | 3.0 | 30% |
| EWC++ consolidation | 2.0 | 20% |
| Metrics/persistence | 0.5 | 5% |
| Total | 10.0 | 100% |
Memory Benchmarks
Memory Usage by Component
fn measure_memory_usage() -> MemoryReport {
let mut report = MemoryReport::default();
// Micro-LoRA (rank=1, hidden=256)
let micro_lora = MicroLoRA::new(256, 1);
report.micro_lora = std::mem::size_of_val(µ_lora)
+ micro_lora.down_proj.len() * 4
+ micro_lora.up_proj.len() * 4
+ micro_lora.gradient_buffer.len() * 4;
// Base LoRA (rank=8, hidden=256, layers=12)
let base_lora = BaseLoRA::new(256, 8, 12);
report.base_lora = std::mem::size_of_val(&base_lora)
+ base_lora.layers.iter().map(|l|
l.down_proj.len() * 4 + l.up_proj.len() * 4
).sum::<usize>();
// Trajectory buffer (capacity=10000)
report.trajectory_buffer = 10000 * (
256 * 4 // query embedding
+ 10 * (256 * 4 + 2048 * 4 + 4 + 8) // 10 steps
);
// Pattern index (100k patterns)
report.pattern_index = 100000 * (256 * 4 + 64); // embedding + metadata
// EWC++ (100k params, 5 tasks)
report.ewc = 100000 * 4 * 5; // Fisher per task
report
}
Expected Memory Usage:
| Component | Size (MB) | Notes |
|---|---|---|
| Micro-LoRA | 0.004 | Minimal overhead |
| Base LoRA | 0.6 | 12 layers |
| Trajectory Buffer | 82.0 | 10k capacity |
| Pattern Index | 102.4 | 100k patterns |
| EWC++ Fisher | 2.0 | 100k params × 5 tasks |
| Dream Engine | 12.8 | 50k memory nodes |
| Total | 199.8 | Peak usage |
Throughput Benchmarks
End-to-End Query Throughput
fn bench_query_throughput(c: &mut Criterion) {
let runtime = tokio::runtime::Runtime::new().unwrap();
let sona = runtime.block_on(async {
SonaEngine::new(SonaConfig::default()).await.unwrap()
});
c.bench_function("query_throughput", |b| {
b.to_async(&runtime).iter(|| async {
sona.process("test query", &Context::default()).await
});
});
}
Expected Throughput:
| Scenario | QPS | Latency p50 | Latency p99 |
|---|---|---|---|
| Baseline (no SONA) | 850 | 1.1ms | 2.5ms |
| With Micro-LoRA | 780 | 1.2ms | 2.8ms |
| Full SONA | 720 | 1.3ms | 3.2ms |
Overhead: ~15% throughput reduction for full self-learning capability.
Hardware-Specific Benchmarks
CPU Feature Detection
fn check_cpu_features() -> CpuFeatures {
CpuFeatures {
avx2: is_x86_feature_detected!("avx2"),
avx512f: is_x86_feature_detected!("avx512f"),
fma: is_x86_feature_detected!("fma"),
sse4_1: is_x86_feature_detected!("sse4.1"),
sse4_2: is_x86_feature_detected!("sse4.2"),
}
}
Performance by CPU
| CPU | Micro-LoRA (μs) | Pattern Search (μs) | Overall Speedup |
|---|---|---|---|
| Intel i9-13900K (AVX2) | 3.2 | 45 | 4.8x |
| AMD Ryzen 9 7950X | 3.5 | 48 | 4.5x |
| Apple M2 Pro (NEON) | 4.1 | 52 | 3.9x |
| Intel Xeon Platinum | 2.8 | 38 | 5.2x |
Benchmark Commands
# Run all benchmarks
cargo bench --package ruvllm --features sona
# Run specific benchmark group
cargo bench --package ruvllm --bench micro_lora
# Run with specific features
cargo bench --package ruvllm --features "sona,avx2"
# Profile memory
cargo bench --package ruvllm --bench memory -- --profile-time 60
# Generate flamegraph
cargo flamegraph --bench micro_lora -- --bench
Continuous Benchmarking
CI Integration
# .github/workflows/bench.yml
name: Benchmarks
on:
push:
branches: [main]
pull_request:
branches: [main]
jobs:
benchmark:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Run benchmarks
run: cargo bench --package ruvllm --features sona -- --save-baseline main
- name: Compare with baseline
run: cargo bench --package ruvllm --features sona -- --baseline main
- name: Upload results
uses: actions/upload-artifact@v4
with:
name: benchmark-results
path: target/criterion
Regression Detection
// Fail CI if performance regresses by more than 10%
const MAX_REGRESSION_PERCENT: f64 = 10.0;
fn check_regression(baseline: Duration, current: Duration) -> Result<(), String> {
let regression = (current.as_nanos() as f64 / baseline.as_nanos() as f64 - 1.0) * 100.0;
if regression > MAX_REGRESSION_PERCENT {
Err(format!(
"Performance regression of {:.1}% exceeds threshold of {}%",
regression, MAX_REGRESSION_PERCENT
))
} else {
Ok(())
}
}
Next Steps
- 09-API-REFERENCE.md - Complete API documentation