#![allow( clippy::all, unused_imports, unused_variables, dead_code, unused_mut, unused_assignments, non_camel_case_types, clippy::approx_constant, unexpected_cfgs, unused_must_use, unused_parens )] //! MicroLoRA Benchmarks for M4 Pro //! //! Benchmarks for LoRA adapter operations: //! - Forward pass latency //! - SIMD-optimized forward //! - Gradient accumulation //! - EWC++ overhead //! - Adaptation speed //! //! Performance targets for M4 Pro: //! - MicroLoRA forward (rank=2, dim=768): <500us //! - MicroLoRA forward (rank=2, dim=4096): <1ms //! - Gradient accumulation: <100us //! - EWC++ update: <200us use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; use rand::Rng; /// Target modules for LoRA adaptation #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] enum TargetModule { QProj, VProj, } /// Single LoRA adapter for benchmarking #[derive(Clone)] struct LoraAdapter { lora_a: Vec, lora_b: Vec, in_features: usize, out_features: usize, rank: usize, scaling: f32, // Gradients grad_a: Vec, grad_b: Vec, grad_count: usize, } impl LoraAdapter { fn new(in_features: usize, out_features: usize, rank: usize, alpha: f32) -> Self { let scaling = alpha / rank as f32; // Kaiming initialization for A let std_a = (2.0 / in_features as f32).sqrt() * 0.01; let lora_a: Vec = (0..in_features * rank) .map(|idx| { let seed = idx as f32; ((seed * 0.618033988749895) % 1.0 - 0.5) * 2.0 * std_a }) .collect(); // Zero initialization for B let lora_b = vec![0.0; rank * out_features]; Self { lora_a, lora_b, in_features, out_features, rank, scaling, grad_a: vec![0.0; in_features * rank], grad_b: vec![0.0; rank * out_features], grad_count: 0, } } /// Forward pass: output = x @ A @ B * scaling fn forward(&self, x: &[f32]) -> Vec { debug_assert_eq!(x.len(), self.in_features); // Down projection: x @ A -> intermediate (rank,) let mut intermediate = vec![0.0f32; self.rank]; for r in 0..self.rank { let mut sum = 0.0f32; for i in 0..self.in_features { sum += x[i] * self.lora_a[i * self.rank + r]; } intermediate[r] = sum; } // Up projection: intermediate @ B -> output (out_features,) let mut output = vec![0.0f32; self.out_features]; for o in 0..self.out_features { let mut sum = 0.0f32; for r in 0..self.rank { sum += intermediate[r] * self.lora_b[r * self.out_features + o]; } output[o] = sum * self.scaling; } output } /// SIMD-optimized forward for flat f32 slices (adds to output) fn forward_simd(&self, input: &[f32], output: &mut [f32]) { debug_assert_eq!(input.len(), self.in_features); debug_assert_eq!(output.len(), self.out_features); #[cfg(target_arch = "aarch64")] unsafe { self.forward_simd_neon(input, output); } #[cfg(not(target_arch = "aarch64"))] { self.forward_simd_scalar(input, output); } } #[cfg(target_arch = "aarch64")] #[inline(always)] unsafe fn forward_simd_neon(&self, input: &[f32], output: &mut [f32]) { use std::arch::aarch64::*; // Down projection with NEON let mut intermediate = vec![0.0f32; self.rank]; for r in 0..self.rank { let mut sum = vdupq_n_f32(0.0); let chunks = self.in_features / 4; let mut i = 0; for _ in 0..chunks { let x_v = vld1q_f32(input.as_ptr().add(i)); // Load A column (strided access - not ideal but works for small rank) let a_vals = [ self.lora_a[i * self.rank + r], self.lora_a[(i + 1) * self.rank + r], self.lora_a[(i + 2) * self.rank + r], self.lora_a[(i + 3) * self.rank + r], ]; let a_v = vld1q_f32(a_vals.as_ptr()); sum = vfmaq_f32(sum, x_v, a_v); i += 4; } let mut sum_val = vaddvq_f32(sum); for ii in i..self.in_features { sum_val += input[ii] * self.lora_a[ii * self.rank + r]; } intermediate[r] = sum_val; } // Up projection with NEON let scaling_vec = vdupq_n_f32(self.scaling); let chunks = self.out_features / 4; let mut o = 0; for _ in 0..chunks { let mut out_v = vld1q_f32(output.as_ptr().add(o)); for r in 0..self.rank { let inter_val = vdupq_n_f32(intermediate[r]); let b_v = vld1q_f32(self.lora_b.as_ptr().add(r * self.out_features + o)); out_v = vfmaq_f32(out_v, vmulq_f32(inter_val, b_v), scaling_vec); } vst1q_f32(output.as_mut_ptr().add(o), out_v); o += 4; } // Remaining elements for oo in o..self.out_features { let mut sum = output[oo]; for r in 0..self.rank { sum += intermediate[r] * self.lora_b[r * self.out_features + oo] * self.scaling; } output[oo] = sum; } } #[allow(dead_code)] fn forward_simd_scalar(&self, input: &[f32], output: &mut [f32]) { let mut intermediate = vec![0.0f32; self.rank]; for r in 0..self.rank { let mut sum = 0.0f32; for i in 0..self.in_features { sum += input[i] * self.lora_a[i * self.rank + r]; } intermediate[r] = sum; } for o in 0..self.out_features { let mut sum = output[o]; for r in 0..self.rank { sum += intermediate[r] * self.lora_b[r * self.out_features + o] * self.scaling; } output[o] = sum; } } /// Batched forward pass for efficiency fn forward_batch(&self, x: &[f32], batch_size: usize) -> Vec { debug_assert_eq!(x.len(), batch_size * self.in_features); let mut outputs = vec![0.0f32; batch_size * self.out_features]; for b in 0..batch_size { let input_offset = b * self.in_features; let output_offset = b * self.out_features; let input = &x[input_offset..input_offset + self.in_features]; let output = &mut outputs[output_offset..output_offset + self.out_features]; self.forward_simd(input, output); } outputs } /// Compute gradients for REINFORCE-style update fn accumulate_gradient(&mut self, input: &[f32], grad_output: &[f32], reward: f32) { debug_assert_eq!(input.len(), self.in_features); debug_assert_eq!(grad_output.len(), self.out_features); // Compute intermediate activation let mut intermediate = vec![0.0f32; self.rank]; for r in 0..self.rank { let mut sum = 0.0f32; for i in 0..self.in_features { sum += input[i] * self.lora_a[i * self.rank + r]; } intermediate[r] = sum; } // Gradient for B: outer(intermediate, grad_output) * reward * scaling for r in 0..self.rank { for o in 0..self.out_features { self.grad_b[r * self.out_features + o] += intermediate[r] * grad_output[o] * reward * self.scaling; } } // Gradient for A: input outer grad_intermediate // grad_intermediate = grad_output @ B.T * reward * scaling let mut grad_intermediate = vec![0.0f32; self.rank]; for r in 0..self.rank { let mut sum = 0.0f32; for o in 0..self.out_features { sum += grad_output[o] * self.lora_b[r * self.out_features + o]; } grad_intermediate[r] = sum * reward * self.scaling; } for i in 0..self.in_features { for r in 0..self.rank { self.grad_a[i * self.rank + r] += input[i] * grad_intermediate[r]; } } self.grad_count += 1; } /// Apply accumulated gradients with learning rate fn apply_gradients(&mut self, learning_rate: f32) { if self.grad_count == 0 { return; } let scale = learning_rate / self.grad_count as f32; for i in 0..self.lora_a.len() { self.lora_a[i] -= self.grad_a[i] * scale; self.grad_a[i] = 0.0; } for i in 0..self.lora_b.len() { self.lora_b[i] -= self.grad_b[i] * scale; self.grad_b[i] = 0.0; } self.grad_count = 0; } /// Apply gradients with EWC++ regularization fn apply_gradients_with_ewc( &mut self, learning_rate: f32, fisher_a: &[f32], fisher_b: &[f32], optimal_a: &[f32], optimal_b: &[f32], ewc_lambda: f32, ) { if self.grad_count == 0 { return; } let scale = learning_rate / self.grad_count as f32; // Update A with EWC regularization for i in 0..self.lora_a.len() { let grad = self.grad_a[i] * scale; let ewc_penalty = ewc_lambda * fisher_a[i] * (self.lora_a[i] - optimal_a[i]); self.lora_a[i] -= grad + ewc_penalty * learning_rate; self.grad_a[i] = 0.0; } // Update B with EWC regularization for i in 0..self.lora_b.len() { let grad = self.grad_b[i] * scale; let ewc_penalty = ewc_lambda * fisher_b[i] * (self.lora_b[i] - optimal_b[i]); self.lora_b[i] -= grad + ewc_penalty * learning_rate; self.grad_b[i] = 0.0; } self.grad_count = 0; } fn param_count(&self) -> usize { self.lora_a.len() + self.lora_b.len() } fn memory_bytes(&self) -> usize { self.param_count() * std::mem::size_of::() } } /// EWC state for benchmarking struct EwcState { fisher_a: Vec, fisher_b: Vec, optimal_a: Vec, optimal_b: Vec, } impl EwcState { fn from_adapter(adapter: &LoraAdapter) -> Self { Self { fisher_a: vec![0.01; adapter.lora_a.len()], fisher_b: vec![0.01; adapter.lora_b.len()], optimal_a: adapter.lora_a.clone(), optimal_b: adapter.lora_b.clone(), } } fn update_fisher(&mut self, grad_a: &[f32], grad_b: &[f32], decay: f32) { for i in 0..self.fisher_a.len() { self.fisher_a[i] = decay * self.fisher_a[i] + (1.0 - decay) * grad_a[i] * grad_a[i]; } for i in 0..self.fisher_b.len() { self.fisher_b[i] = decay * self.fisher_b[i] + (1.0 - decay) * grad_b[i] * grad_b[i]; } } } // Helper function to generate random tensor data fn random_tensor(size: usize) -> Vec { let mut rng = rand::thread_rng(); (0..size).map(|_| rng.gen_range(-1.0..1.0)).collect() } // === Benchmark Functions === fn bench_lora_forward(c: &mut Criterion) { let mut group = c.benchmark_group("lora_forward"); group.sample_size(100); for (in_features, out_features) in [(768, 768), (2048, 2048), (4096, 4096)] { for rank in [1, 2] { let adapter = LoraAdapter::new(in_features, out_features, rank, 4.0); let input = random_tensor(in_features); let id = BenchmarkId::new( format!("dim_{}_rank_{}", in_features, rank), adapter.param_count(), ); group.throughput(Throughput::Elements(adapter.param_count() as u64)); group.bench_function(id, |b| b.iter(|| adapter.forward(black_box(&input)))); } } group.finish(); } fn bench_lora_forward_simd(c: &mut Criterion) { let mut group = c.benchmark_group("lora_forward_simd"); group.sample_size(100); for (in_features, out_features) in [(768, 768), (2048, 2048), (4096, 4096)] { for rank in [1, 2] { let adapter = LoraAdapter::new(in_features, out_features, rank, 4.0); let input = random_tensor(in_features); let mut output = vec![0.0f32; out_features]; let id = BenchmarkId::new( format!("dim_{}_rank_{}", in_features, rank), adapter.param_count(), ); group.throughput(Throughput::Elements(adapter.param_count() as u64)); group.bench_function(id, |b| { b.iter(|| { output.fill(0.0); adapter.forward_simd(black_box(&input), black_box(&mut output)); }) }); } } group.finish(); } fn bench_lora_forward_batch(c: &mut Criterion) { let mut group = c.benchmark_group("lora_forward_batch"); group.sample_size(50); let in_features = 4096; let out_features = 4096; let rank = 2; let adapter = LoraAdapter::new(in_features, out_features, rank, 4.0); for batch_size in [1, 8, 16, 32, 64] { let input = random_tensor(batch_size * in_features); let id = BenchmarkId::new(format!("batch_{}", batch_size), batch_size); group.throughput(Throughput::Elements( (batch_size * adapter.param_count()) as u64, )); group.bench_function(id, |b| { b.iter(|| adapter.forward_batch(black_box(&input), batch_size)) }); } group.finish(); } fn bench_lora_gradient_accumulation(c: &mut Criterion) { let mut group = c.benchmark_group("lora_gradient_accumulation"); group.sample_size(100); for (in_features, out_features) in [(768, 768), (2048, 2048), (4096, 4096)] { let rank = 2; let mut adapter = LoraAdapter::new(in_features, out_features, rank, 4.0); let input = random_tensor(in_features); let grad_output = random_tensor(out_features); let id = BenchmarkId::new(format!("dim_{}", in_features), in_features); group.throughput(Throughput::Elements(adapter.param_count() as u64)); group.bench_function(id, |b| { b.iter(|| { adapter.accumulate_gradient(black_box(&input), black_box(&grad_output), 0.8); }) }); } group.finish(); } fn bench_lora_apply_gradients(c: &mut Criterion) { let mut group = c.benchmark_group("lora_apply_gradients"); group.sample_size(100); for (in_features, out_features) in [(768, 768), (2048, 2048), (4096, 4096)] { let rank = 2; let mut adapter = LoraAdapter::new(in_features, out_features, rank, 4.0); let input = random_tensor(in_features); let grad_output = random_tensor(out_features); // Accumulate some gradients first for _ in 0..10 { adapter.accumulate_gradient(&input, &grad_output, 0.8); } let id = BenchmarkId::new(format!("dim_{}", in_features), in_features); group.throughput(Throughput::Elements(adapter.param_count() as u64)); group.bench_function(id, |b| { b.iter_batched( || { let mut a = adapter.clone(); for _ in 0..10 { a.accumulate_gradient(&input, &grad_output, 0.8); } a }, |mut a| { a.apply_gradients(black_box(0.01)); }, criterion::BatchSize::SmallInput, ) }); } group.finish(); } fn bench_lora_ewc_update(c: &mut Criterion) { let mut group = c.benchmark_group("lora_ewc_update"); group.sample_size(100); for (in_features, out_features) in [(768, 768), (2048, 2048), (4096, 4096)] { let rank = 2; let adapter = LoraAdapter::new(in_features, out_features, rank, 4.0); let ewc = EwcState::from_adapter(&adapter); let input = random_tensor(in_features); let grad_output = random_tensor(out_features); let id = BenchmarkId::new(format!("dim_{}", in_features), in_features); group.throughput(Throughput::Elements(adapter.param_count() as u64)); group.bench_function(id, |b| { b.iter_batched( || { let mut a = adapter.clone(); for _ in 0..10 { a.accumulate_gradient(&input, &grad_output, 0.8); } a }, |mut a| { a.apply_gradients_with_ewc( black_box(0.01), black_box(&ewc.fisher_a), black_box(&ewc.fisher_b), black_box(&ewc.optimal_a), black_box(&ewc.optimal_b), black_box(0.1), ); }, criterion::BatchSize::SmallInput, ) }); } group.finish(); } fn bench_lora_adaptation_cycle(c: &mut Criterion) { let mut group = c.benchmark_group("lora_adaptation_cycle"); group.sample_size(50); // Full adaptation cycle: forward + gradient + apply for (in_features, out_features) in [(768, 768), (2048, 2048), (4096, 4096)] { let rank = 2; let input = random_tensor(in_features); let grad_output = random_tensor(out_features); let id = BenchmarkId::new(format!("dim_{}", in_features), in_features); group.bench_function(id, |b| { b.iter_batched( || LoraAdapter::new(in_features, out_features, rank, 4.0), |mut adapter| { // Forward let _output = adapter.forward(black_box(&input)); // Gradient adapter.accumulate_gradient(black_box(&input), black_box(&grad_output), 0.8); // Apply adapter.apply_gradients(black_box(0.01)); }, criterion::BatchSize::SmallInput, ) }); } group.finish(); } fn bench_lora_memory_footprint(c: &mut Criterion) { let mut group = c.benchmark_group("lora_memory"); group.sample_size(100); // Test memory efficiency at different scales let configs = [ ("rank1_768", 768, 768, 1), ("rank2_768", 768, 768, 2), ("rank1_4096", 4096, 4096, 1), ("rank2_4096", 4096, 4096, 2), ("rank2_4096x11008", 4096, 11008, 2), // MLP-like ]; for (name, in_features, out_features, rank) in configs { let adapter = LoraAdapter::new(in_features, out_features, rank, 4.0); let input = random_tensor(in_features); let memory_bytes = adapter.memory_bytes(); let id = BenchmarkId::new(format!("{}_{}KB", name, memory_bytes / 1024), memory_bytes); group.throughput(Throughput::Bytes(memory_bytes as u64)); group.bench_function(id, |b| b.iter(|| adapter.forward(black_box(&input)))); } group.finish(); } fn bench_ewc_fisher_update(c: &mut Criterion) { let mut group = c.benchmark_group("ewc_fisher_update"); group.sample_size(100); for (in_features, out_features) in [(768, 768), (2048, 2048), (4096, 4096)] { let rank = 2; let adapter = LoraAdapter::new(in_features, out_features, rank, 4.0); let mut ewc = EwcState::from_adapter(&adapter); let grad_a = random_tensor(in_features * rank); let grad_b = random_tensor(rank * out_features); let id = BenchmarkId::new(format!("dim_{}", in_features), in_features); group.throughput(Throughput::Elements(adapter.param_count() as u64)); group.bench_function(id, |b| { b.iter(|| { ewc.update_fisher(black_box(&grad_a), black_box(&grad_b), 0.9); }) }); } group.finish(); } fn bench_lora_vs_dense(c: &mut Criterion) { let mut group = c.benchmark_group("lora_vs_dense_overhead"); group.sample_size(50); // Compare LoRA overhead vs dense matmul let dim = 4096; let rank = 2; let adapter = LoraAdapter::new(dim, dim, rank, 4.0); let input = random_tensor(dim); // LoRA forward group.bench_function(BenchmarkId::new("lora_rank2", dim), |b| { b.iter(|| adapter.forward(black_box(&input))) }); // Equivalent dense GEMV (what LoRA replaces) let dense_weight = random_tensor(dim * dim); group.bench_function(BenchmarkId::new("dense_equivalent", dim), |b| { b.iter(|| { let mut dense_output = vec![0.0f32; dim]; for i in 0..dim { let mut sum = 0.0f32; for j in 0..dim { sum += input[j] * dense_weight[j * dim + i]; } dense_output[i] = sum; } black_box(dense_output) }) }); group.finish(); } fn bench_multiple_adapters(c: &mut Criterion) { let mut group = c.benchmark_group("multiple_adapters"); group.sample_size(50); // Test applying multiple LoRA adapters (Q, K, V, O projections) let dim = 4096; let rank = 2; let adapters: Vec = (0..4) .map(|_| LoraAdapter::new(dim, dim, rank, 4.0)) .collect(); let input = random_tensor(dim); group.bench_function(BenchmarkId::new("4_adapters_sequential", 4), |b| { b.iter(|| { let mut outputs: Vec> = Vec::with_capacity(4); for adapter in &adapters { outputs.push(adapter.forward(black_box(&input))); } outputs }) }); group.finish(); } criterion_group!( benches, bench_lora_forward, bench_lora_forward_simd, bench_lora_forward_batch, bench_lora_gradient_accumulation, bench_lora_apply_gradients, bench_lora_ewc_update, bench_lora_adaptation_cycle, bench_lora_memory_footprint, bench_ewc_fisher_update, bench_lora_vs_dense, bench_multiple_adapters, ); criterion_main!(benches);