#![allow( clippy::all, unused_imports, unused_variables, dead_code, unused_mut, unused_assignments, non_camel_case_types, clippy::approx_constant, unexpected_cfgs, unused_must_use, unused_parens )] //! ANE vs NEON Benchmark Suite //! //! Compares Apple Neural Engine (via BNNS) operations against //! hand-optimized NEON implementations. //! //! ## Running Benchmarks //! //! ANE benchmarks (requires macOS with coreml feature): //! ```bash //! cargo bench -p ruvllm --features coreml --bench ane_bench //! ``` //! //! Compare ANE vs Accelerate: //! ```bash //! cargo bench -p ruvllm --features coreml,accelerate --bench ane_bench //! ``` //! //! ## Performance Targets (M4 Pro) //! //! | Operation | Size | ANE Target | NEON Baseline | Expected Speedup | //! |-----------|------|------------|---------------|------------------| //! | GEMM | 1x4096x4096 | <500us | <800us | 1.5-2x | //! | GELU | 64x4096 | <100us | <150us | 1.3-1.5x | //! | SiLU | 64x4096 | <100us | <150us | 1.3-1.5x | //! | Softmax | 64x4096 | <150us | <200us | 1.2-1.4x | //! | LayerNorm | 64x4096 | <200us | <250us | 1.2-1.3x | //! //! ## Power Efficiency //! //! ANE typically provides 3-4x better performance per watt compared to //! GPU or CPU for supported operations. This benchmark suite measures //! wall-clock time, not power consumption. //! //! To measure power consumption on macOS, use: //! ```bash //! sudo powermetrics --samplers tasks -i 100 | grep ruvllm //! ``` use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; use rand::Rng; // ============================================================================ // Helper Functions // ============================================================================ /// Generate random tensor data fn random_tensor(size: usize) -> Vec { let mut rng = rand::thread_rng(); (0..size).map(|_| rng.gen_range(-1.0..1.0)).collect() } /// Generate random positive tensor (for softmax stability testing) fn random_positive_tensor(size: usize) -> Vec { let mut rng = rand::thread_rng(); (0..size).map(|_| rng.gen_range(0.0..10.0)).collect() } // ============================================================================ // Matrix Multiplication Benchmarks // ============================================================================ /// Compare GEMM implementations: ANE vs Accelerate vs NEON fn bench_gemm_comparison(c: &mut Criterion) { let mut group = c.benchmark_group("gemm_ane_vs_neon"); group.sample_size(30); // Test various matrix sizes relevant to LLM inference // Format: (m, k, n) - m=batch, k=input_dim, n=output_dim // // Size categories: // - Small (128x128, 256x256): ANE should dominate (~30-50% faster) // - Medium (512x512, 1024x1024): Transition zone, ANE slight edge // - Large (2048x2048, 4096x4096): GPU crossover zone // - Very Large (8192x8192): GPU clear winner let sizes = [ // Small matrices - ANE advantage zone (1, 128, 128), // Tiny matmul - ANE wins (1, 256, 256), // Small matmul - ANE wins (1, 512, 512), // Medium-small - ANE edge // Medium matrices - Transition zone (1, 1024, 1024), // ANE/GPU crossover starts (1, 2048, 2048), // Crossover zone // Large matrices - GPU advantage (1, 4096, 4096), // Single token, typical projection - GPU starts winning (1, 4096, 11008), // Llama MLP up-projection (1, 11008, 4096), // Llama MLP down-projection // Batch inference - ANE optimal for small batches (8, 4096, 4096), // Small batch (32, 4096, 4096), // Medium batch (64, 4096, 4096), // Optimal ANE batch size (128, 4096, 4096), // Beyond ANE optimal - GPU wins ]; for (m, k, n) in sizes { let a = random_tensor(m * k); let b = random_tensor(k * n); let mut c_out = vec![0.0f32; m * n]; let flops = 2 * m * k * n; let id_suffix = format!("{}x{}x{}", m, k, n); group.throughput(Throughput::Elements(flops as u64)); // NEON baseline (always available on aarch64) #[cfg(target_arch = "aarch64")] { let id = BenchmarkId::new("neon", &id_suffix); group.bench_function(id, |bencher| { bencher.iter(|| { // Use local GEMM implementation to avoid module dependency issues gemm_neon_local(black_box(&a), black_box(&b), black_box(&mut c_out), m, k, n); }) }); } // Accelerate (uses AMX coprocessor) #[cfg(all(target_os = "macos", feature = "accelerate"))] { let id = BenchmarkId::new("accelerate", &id_suffix); group.bench_function(id, |bencher| { bencher.iter(|| { ruvllm::kernels::accelerate::gemm_accelerate( black_box(&a), black_box(&b), black_box(&mut c_out), m, k, n, ); }) }); } // ANE via BNNS/Accelerate #[cfg(all(target_os = "macos", feature = "coreml"))] { let id = BenchmarkId::new("ane", &id_suffix); group.bench_function(id, |bencher| { bencher.iter(|| { ruvllm::kernels::ane_ops::matmul_ane( black_box(&a), black_box(&b), black_box(&mut c_out), m, k, n, ); }) }); } } group.finish(); } /// Benchmark batched matrix multiplication fn bench_batched_gemm_comparison(c: &mut Criterion) { let mut group = c.benchmark_group("batched_gemm_ane_vs_neon"); group.sample_size(30); // Typical attention shapes: batch of Q*K^T or attention*V let configs = [ (8, 128, 128, 128), // 8 heads, seq=128 (32, 128, 128, 128), // 32 heads, seq=128 (32, 256, 128, 256), // 32 heads, seq=256, head_dim=128 (8, 512, 128, 512), // 8 heads, seq=512 ]; for (batch_size, m, k, n) in configs { let a = random_tensor(batch_size * m * k); let b = random_tensor(batch_size * k * n); let mut c_out = vec![0.0f32; batch_size * m * n]; let flops = 2 * batch_size * m * k * n; let id_suffix = format!("batch{}_{}x{}x{}", batch_size, m, k, n); group.throughput(Throughput::Elements(flops as u64)); // NEON batched #[cfg(target_arch = "aarch64")] { let id = BenchmarkId::new("neon", &id_suffix); group.bench_function(id, |bencher| { bencher.iter(|| { for batch in 0..batch_size { let a_off = batch * m * k; let b_off = batch * k * n; let c_off = batch * m * n; gemm_neon_local( black_box(&a[a_off..a_off + m * k]), black_box(&b[b_off..b_off + k * n]), black_box(&mut c_out[c_off..c_off + m * n]), m, k, n, ); } }) }); } // ANE batched #[cfg(all(target_os = "macos", feature = "coreml"))] { let id = BenchmarkId::new("ane", &id_suffix); group.bench_function(id, |bencher| { bencher.iter(|| { ruvllm::kernels::ane_ops::batched_matmul_ane( black_box(&a), black_box(&b), black_box(&mut c_out), batch_size, m, k, n, ); }) }); } } group.finish(); } // ============================================================================ // Activation Function Benchmarks // ============================================================================ /// Compare GELU implementations fn bench_gelu_comparison(c: &mut Criterion) { let mut group = c.benchmark_group("gelu_ane_vs_neon"); group.sample_size(50); // Various batch and dimension sizes let configs = [ (1, 4096), (8, 4096), (32, 4096), (64, 4096), (1, 11008), // Llama MLP intermediate (32, 11008), ]; for (batch_size, dim) in configs { let size = batch_size * dim; let x_orig = random_tensor(size); let ops = size; // One GELU per element let id_suffix = format!("{}x{}", batch_size, dim); group.throughput(Throughput::Elements(ops as u64)); // NEON #[cfg(target_arch = "aarch64")] { let mut x = x_orig.clone(); let id = BenchmarkId::new("neon", &id_suffix); group.bench_function(id, |bencher| { bencher.iter(|| { x.copy_from_slice(&x_orig); ruvllm::kernels::activations::batch_gelu(black_box(&mut x), dim); }) }); } // ANE #[cfg(all(target_os = "macos", feature = "coreml"))] { let mut x = x_orig.clone(); let id = BenchmarkId::new("ane", &id_suffix); group.bench_function(id, |bencher| { bencher.iter(|| { x.copy_from_slice(&x_orig); ruvllm::kernels::ane_ops::gelu_ane(black_box(&mut x), batch_size, dim); }) }); } } group.finish(); } /// Compare SiLU implementations fn bench_silu_comparison(c: &mut Criterion) { let mut group = c.benchmark_group("silu_ane_vs_neon"); group.sample_size(50); let configs = [ (1, 4096), (8, 4096), (32, 4096), (64, 4096), (1, 11008), (32, 11008), ]; for (batch_size, dim) in configs { let size = batch_size * dim; let x_orig = random_tensor(size); let ops = size; let id_suffix = format!("{}x{}", batch_size, dim); group.throughput(Throughput::Elements(ops as u64)); // NEON #[cfg(target_arch = "aarch64")] { let mut x = x_orig.clone(); let id = BenchmarkId::new("neon", &id_suffix); group.bench_function(id, |bencher| { bencher.iter(|| { x.copy_from_slice(&x_orig); ruvllm::kernels::activations::batch_silu(black_box(&mut x), dim); }) }); } // ANE #[cfg(all(target_os = "macos", feature = "coreml"))] { let mut x = x_orig.clone(); let id = BenchmarkId::new("ane", &id_suffix); group.bench_function(id, |bencher| { bencher.iter(|| { x.copy_from_slice(&x_orig); ruvllm::kernels::ane_ops::silu_ane(black_box(&mut x), batch_size, dim); }) }); } } group.finish(); } /// Compare Softmax implementations fn bench_softmax_comparison(c: &mut Criterion) { let mut group = c.benchmark_group("softmax_ane_vs_neon"); group.sample_size(50); // Softmax is typically applied to attention scores let configs = [ (1, 128), // Single head, short seq (32, 128), // 32 heads, short seq (32, 512), // 32 heads, medium seq (32, 2048), // 32 heads, long seq (1, 4096), // Single head, very long ]; for (batch_size, dim) in configs { let size = batch_size * dim; let x_orig = random_positive_tensor(size); let ops = size; let id_suffix = format!("{}x{}", batch_size, dim); group.throughput(Throughput::Elements(ops as u64)); // NEON #[cfg(target_arch = "aarch64")] { let mut x = x_orig.clone(); let id = BenchmarkId::new("neon", &id_suffix); group.bench_function(id, |bencher| { bencher.iter(|| { x.copy_from_slice(&x_orig); ruvllm::kernels::activations::batch_softmax(black_box(&mut x), dim); }) }); } // ANE #[cfg(all(target_os = "macos", feature = "coreml"))] { let mut x = x_orig.clone(); let id = BenchmarkId::new("ane", &id_suffix); group.bench_function(id, |bencher| { bencher.iter(|| { x.copy_from_slice(&x_orig); ruvllm::kernels::ane_ops::softmax_ane(black_box(&mut x), batch_size, dim); }) }); } } group.finish(); } // ============================================================================ // Normalization Benchmarks // ============================================================================ /// Compare LayerNorm implementations fn bench_layer_norm_comparison(c: &mut Criterion) { let mut group = c.benchmark_group("layernorm_ane_vs_neon"); group.sample_size(50); let configs = [(1, 4096), (8, 4096), (32, 4096), (64, 4096), (128, 4096)]; for (batch_size, dim) in configs { let size = batch_size * dim; let x_orig = random_tensor(size); let weight = vec![1.0f32; dim]; let bias = vec![0.0f32; dim]; let ops = size * 4; // Approximate: mean, var, normalize, scale let id_suffix = format!("{}x{}", batch_size, dim); group.throughput(Throughput::Elements(ops as u64)); // NEON #[cfg(target_arch = "aarch64")] { let mut x = x_orig.clone(); let id = BenchmarkId::new("neon", &id_suffix); group.bench_function(id, |bencher| { bencher.iter(|| { x.copy_from_slice(&x_orig); ruvllm::kernels::norm::batched_layer_norm_neon( black_box(&mut x), black_box(&weight), black_box(&bias), batch_size, dim, 1e-6, ); }) }); } // ANE #[cfg(all(target_os = "macos", feature = "coreml"))] { let mut x = x_orig.clone(); let id = BenchmarkId::new("ane", &id_suffix); group.bench_function(id, |bencher| { bencher.iter(|| { x.copy_from_slice(&x_orig); ruvllm::kernels::ane_ops::layer_norm_ane( black_box(&mut x), black_box(&weight), black_box(&bias), batch_size, dim, 1e-6, ); }) }); } } group.finish(); } /// Compare RMSNorm implementations fn bench_rms_norm_comparison(c: &mut Criterion) { let mut group = c.benchmark_group("rmsnorm_ane_vs_neon"); group.sample_size(50); let configs = [(1, 4096), (8, 4096), (32, 4096), (64, 4096), (128, 4096)]; for (batch_size, dim) in configs { let size = batch_size * dim; let x_orig = random_tensor(size); let weight = vec![1.0f32; dim]; let ops = size * 3; // Approximate: sum_sq, normalize, scale let id_suffix = format!("{}x{}", batch_size, dim); group.throughput(Throughput::Elements(ops as u64)); // NEON #[cfg(target_arch = "aarch64")] { let mut x = x_orig.clone(); let id = BenchmarkId::new("neon", &id_suffix); group.bench_function(id, |bencher| { bencher.iter(|| { x.copy_from_slice(&x_orig); ruvllm::kernels::norm::batched_rms_norm_neon( black_box(&mut x), black_box(&weight), batch_size, dim, 1e-6, ); }) }); } // ANE #[cfg(all(target_os = "macos", feature = "coreml"))] { let mut x = x_orig.clone(); let id = BenchmarkId::new("ane", &id_suffix); group.bench_function(id, |bencher| { bencher.iter(|| { x.copy_from_slice(&x_orig); ruvllm::kernels::ane_ops::rms_norm_ane( black_box(&mut x), black_box(&weight), batch_size, dim, 1e-6, ); }) }); } } group.finish(); } // ============================================================================ // Auto-Dispatch Benchmarks // ============================================================================ /// Test the auto-dispatch functions that select best backend fn bench_auto_dispatch(c: &mut Criterion) { let mut group = c.benchmark_group("auto_dispatch"); group.sample_size(50); let batch_size = 32; let dim = 4096; let size = batch_size * dim; let x_orig = random_tensor(size); let weight = vec![1.0f32; dim]; let bias = vec![0.0f32; dim]; // Auto-dispatch GELU { let mut x = x_orig.clone(); group.bench_function("gelu_auto", |bencher| { bencher.iter(|| { x.copy_from_slice(&x_orig); #[cfg(all(target_os = "macos", feature = "coreml"))] ruvllm::kernels::ane_ops::gelu_auto(black_box(&mut x), batch_size, dim); #[cfg(not(all(target_os = "macos", feature = "coreml")))] ruvllm::kernels::activations::batch_gelu(black_box(&mut x), dim); }) }); } // Auto-dispatch SiLU { let mut x = x_orig.clone(); group.bench_function("silu_auto", |bencher| { bencher.iter(|| { x.copy_from_slice(&x_orig); #[cfg(all(target_os = "macos", feature = "coreml"))] ruvllm::kernels::ane_ops::silu_auto(black_box(&mut x), batch_size, dim); #[cfg(not(all(target_os = "macos", feature = "coreml")))] ruvllm::kernels::activations::batch_silu(black_box(&mut x), dim); }) }); } // Auto-dispatch LayerNorm { let mut x = x_orig.clone(); group.bench_function("layernorm_auto", |bencher| { bencher.iter(|| { x.copy_from_slice(&x_orig); #[cfg(all(target_os = "macos", feature = "coreml"))] ruvllm::kernels::ane_ops::layer_norm_auto( black_box(&mut x), black_box(&weight), black_box(&bias), batch_size, dim, 1e-6, ); #[cfg(not(all(target_os = "macos", feature = "coreml")))] ruvllm::kernels::norm::batched_layer_norm_neon( black_box(&mut x), black_box(&weight), black_box(&bias), batch_size, dim, 1e-6, ); }) }); } group.finish(); } // ============================================================================ // LLM Workload Benchmarks (Realistic Scenarios) // ============================================================================ /// Benchmark typical MLP block operations fn bench_mlp_block(c: &mut Criterion) { let mut group = c.benchmark_group("mlp_block"); group.sample_size(20); // Llama2-7B MLP: hidden_dim=4096, intermediate=11008 let batch_size = 1; let hidden_dim = 4096; let intermediate_dim = 11008; // Up projection weights let w_up = random_tensor(hidden_dim * intermediate_dim); // Down projection weights let w_down = random_tensor(intermediate_dim * hidden_dim); let input = random_tensor(batch_size * hidden_dim); let mut intermediate = vec![0.0f32; batch_size * intermediate_dim]; let mut output = vec![0.0f32; batch_size * hidden_dim]; let total_flops = 2 * batch_size * hidden_dim * intermediate_dim // Up + batch_size * intermediate_dim // Activation + 2 * batch_size * intermediate_dim * hidden_dim; // Down group.throughput(Throughput::Elements(total_flops as u64)); // NEON path #[cfg(target_arch = "aarch64")] { group.bench_function("neon", |bencher| { bencher.iter(|| { // Up projection gemm_neon_local( black_box(&input), black_box(&w_up), black_box(&mut intermediate), batch_size, hidden_dim, intermediate_dim, ); // SiLU activation ruvllm::kernels::activations::batch_silu( black_box(&mut intermediate), intermediate_dim, ); // Down projection gemm_neon_local( black_box(&intermediate), black_box(&w_down), black_box(&mut output), batch_size, intermediate_dim, hidden_dim, ); }) }); } // ANE path #[cfg(all(target_os = "macos", feature = "coreml"))] { group.bench_function("ane", |bencher| { bencher.iter(|| { // Up projection ruvllm::kernels::ane_ops::matmul_ane( black_box(&input), black_box(&w_up), black_box(&mut intermediate), batch_size, hidden_dim, intermediate_dim, ); // SiLU activation ruvllm::kernels::ane_ops::silu_ane( black_box(&mut intermediate), batch_size, intermediate_dim, ); // Down projection ruvllm::kernels::ane_ops::matmul_ane( black_box(&intermediate), black_box(&w_down), black_box(&mut output), batch_size, intermediate_dim, hidden_dim, ); }) }); } group.finish(); } // ============================================================================ // Local NEON GEMM Implementation (to avoid module dependency issues) // ============================================================================ #[cfg(target_arch = "aarch64")] fn gemm_neon_local(a: &[f32], b: &[f32], c: &mut [f32], m: usize, k: usize, n: usize) { c.fill(0.0); unsafe { use std::arch::aarch64::*; let a_ptr = a.as_ptr(); let b_ptr = b.as_ptr(); let c_ptr = c.as_mut_ptr(); for i in 0..m { let mut j = 0usize; while j + 4 <= n { let mut acc = vdupq_n_f32(0.0); for kk in 0..k { let a_val = vdupq_n_f32(*a_ptr.add(i * k + kk)); let b_v = vld1q_f32(b_ptr.add(kk * n + j)); acc = vfmaq_f32(acc, a_val, b_v); } vst1q_f32(c_ptr.add(i * n + j), acc); j += 4; } // Handle remaining columns while j < n { let mut sum = 0.0f32; for kk in 0..k { sum += *a_ptr.add(i * k + kk) * *b_ptr.add(kk * n + j); } *c_ptr.add(i * n + j) = sum; j += 1; } } } } #[cfg(not(target_arch = "aarch64"))] fn gemm_neon_local(a: &[f32], b: &[f32], c: &mut [f32], m: usize, k: usize, n: usize) { c.fill(0.0); for i in 0..m { for j in 0..n { let mut sum = 0.0f32; for kk in 0..k { sum += a[i * k + kk] * b[kk * n + j]; } c[i * n + j] = sum; } } } // ============================================================================ // Crossover Point Detection Benchmark // ============================================================================ /// Benchmark to identify the exact crossover point where GPU beats ANE /// /// This benchmark tests matrix sizes in increments to find where: /// 1. ANE is clearly faster (small matrices) /// 2. Performance is similar (crossover zone) /// 3. GPU is clearly faster (large matrices) /// /// Expected M4 Pro results: /// - ANE wins: dim < 1024 /// - Crossover: 1024 <= dim <= 2048 /// - GPU wins: dim > 2048 fn bench_crossover_detection(c: &mut Criterion) { let mut group = c.benchmark_group("crossover_detection"); group.sample_size(20); // Test dimensions in powers of 2 to find crossover let dimensions = [64, 128, 256, 512, 768, 1024, 1536, 2048, 3072, 4096]; for dim in dimensions { let a = random_tensor(dim * dim); let b = random_tensor(dim * dim); let mut c_out = vec![0.0f32; dim * dim]; let flops = 2 * dim * dim * dim; let id_suffix = format!("{}x{}", dim, dim); group.throughput(Throughput::Elements(flops as u64)); // NEON baseline #[cfg(target_arch = "aarch64")] { let id = BenchmarkId::new("neon", &id_suffix); group.bench_function(id, |bencher| { bencher.iter(|| { gemm_neon_local( black_box(&a), black_box(&b), black_box(&mut c_out), dim, dim, dim, ); }) }); } // ANE via BNNS #[cfg(all(target_os = "macos", feature = "coreml"))] { let id = BenchmarkId::new("ane", &id_suffix); group.bench_function(id, |bencher| { bencher.iter(|| { ruvllm::kernels::ane_ops::matmul_ane( black_box(&a), black_box(&b), black_box(&mut c_out), dim, dim, dim, ); }) }); } // Accelerate (AMX) #[cfg(all(target_os = "macos", feature = "accelerate"))] { let id = BenchmarkId::new("accelerate", &id_suffix); group.bench_function(id, |bencher| { bencher.iter(|| { ruvllm::kernels::accelerate::gemm_accelerate( black_box(&a), black_box(&b), black_box(&mut c_out), dim, dim, dim, ); }) }); } } group.finish(); } // ============================================================================ // Hybrid Pipeline Benchmarks (ANE for MLP, GPU for Attention) // ============================================================================ /// Benchmark hybrid ANE+GPU pipeline for transformer inference /// /// Real transformer layers have different compute patterns: /// - Attention: memory-bound, GPU-friendly (high parallelism) /// - MLP: compute-bound, ANE-friendly (batch operations) /// /// This benchmark simulates a hybrid pipeline where: /// 1. ANE handles MLP layers (activations, small projections) /// 2. GPU/NEON handles attention (Q*K^T, softmax*V) #[cfg(all(target_os = "macos", feature = "coreml"))] fn bench_hybrid_pipeline(c: &mut Criterion) { let mut group = c.benchmark_group("hybrid_pipeline"); group.sample_size(15); // Transformer configuration (Llama-7B like) let configs = [ // (batch, seq_len, hidden, heads, head_dim, intermediate) (1, 128, 4096, 32, 128, 11008), // Short context (1, 512, 4096, 32, 128, 11008), // Medium context (1, 2048, 4096, 32, 128, 11008), // Long context ]; for (batch, seq_len, hidden_dim, num_heads, head_dim, intermediate_dim) in configs { let id_suffix = format!("batch{}_seq{}", batch, seq_len); // Pre-allocate tensors let hidden = random_tensor(batch * seq_len * hidden_dim); let w_q = random_tensor(hidden_dim * hidden_dim); let w_k = random_tensor(hidden_dim * hidden_dim); let w_v = random_tensor(hidden_dim * hidden_dim); let w_o = random_tensor(hidden_dim * hidden_dim); let w_up = random_tensor(hidden_dim * intermediate_dim); let w_down = random_tensor(intermediate_dim * hidden_dim); let mut q = vec![0.0f32; batch * seq_len * hidden_dim]; let mut k = vec![0.0f32; batch * seq_len * hidden_dim]; let mut v = vec![0.0f32; batch * seq_len * hidden_dim]; let mut attn_output = vec![0.0f32; batch * seq_len * hidden_dim]; let mut intermediate = vec![0.0f32; batch * seq_len * intermediate_dim]; let mut mlp_output = vec![0.0f32; batch * seq_len * hidden_dim]; let total_ops = // Q, K, V projections 3 * 2 * batch * seq_len * hidden_dim * hidden_dim + // Attention (Q*K^T + softmax + attn*V) 2 * batch * num_heads * seq_len * seq_len * head_dim * 2 + // O projection 2 * batch * seq_len * hidden_dim * hidden_dim + // MLP up + down 2 * batch * seq_len * hidden_dim * intermediate_dim * 2 + // Activations batch * seq_len * intermediate_dim; group.throughput(Throughput::Elements(total_ops as u64)); // Pure NEON path #[cfg(target_arch = "aarch64")] { let id = BenchmarkId::new("pure_neon", &id_suffix); group.bench_function(id, |bencher| { bencher.iter(|| { // Q, K, V projections gemm_neon_local( &hidden, &w_q, &mut q, batch * seq_len, hidden_dim, hidden_dim, ); gemm_neon_local( &hidden, &w_k, &mut k, batch * seq_len, hidden_dim, hidden_dim, ); gemm_neon_local( &hidden, &w_v, &mut v, batch * seq_len, hidden_dim, hidden_dim, ); // O projection gemm_neon_local( &v, &w_o, &mut attn_output, batch * seq_len, hidden_dim, hidden_dim, ); // MLP: up projection gemm_neon_local( &attn_output, &w_up, &mut intermediate, batch * seq_len, hidden_dim, intermediate_dim, ); // MLP: SiLU activation (in-place) ruvllm::kernels::activations::batch_silu( black_box(&mut intermediate), intermediate_dim, ); // MLP: down projection gemm_neon_local( &intermediate, &w_down, &mut mlp_output, batch * seq_len, intermediate_dim, hidden_dim, ); }) }); } // Pure ANE path let id = BenchmarkId::new("pure_ane", &id_suffix); group.bench_function(id, |bencher| { bencher.iter(|| { // Q, K, V projections ruvllm::kernels::ane_ops::matmul_ane( &hidden, &w_q, &mut q, batch * seq_len, hidden_dim, hidden_dim, ); ruvllm::kernels::ane_ops::matmul_ane( &hidden, &w_k, &mut k, batch * seq_len, hidden_dim, hidden_dim, ); ruvllm::kernels::ane_ops::matmul_ane( &hidden, &w_v, &mut v, batch * seq_len, hidden_dim, hidden_dim, ); // O projection ruvllm::kernels::ane_ops::matmul_ane( &v, &w_o, &mut attn_output, batch * seq_len, hidden_dim, hidden_dim, ); // MLP: up projection ruvllm::kernels::ane_ops::matmul_ane( &attn_output, &w_up, &mut intermediate, batch * seq_len, hidden_dim, intermediate_dim, ); // MLP: SiLU activation (ANE) ruvllm::kernels::ane_ops::silu_ane( black_box(&mut intermediate), batch * seq_len, intermediate_dim, ); // MLP: down projection ruvllm::kernels::ane_ops::matmul_ane( &intermediate, &w_down, &mut mlp_output, batch * seq_len, intermediate_dim, hidden_dim, ); }) }); // Hybrid path: ANE for MLP activations, auto-dispatch for matmul let id = BenchmarkId::new("hybrid", &id_suffix); group.bench_function(id, |bencher| { bencher.iter(|| { // Q, K, V projections (auto-dispatch based on size) ruvllm::kernels::ane_ops::matmul_auto( &hidden, &w_q, &mut q, batch * seq_len, hidden_dim, hidden_dim, ); ruvllm::kernels::ane_ops::matmul_auto( &hidden, &w_k, &mut k, batch * seq_len, hidden_dim, hidden_dim, ); ruvllm::kernels::ane_ops::matmul_auto( &hidden, &w_v, &mut v, batch * seq_len, hidden_dim, hidden_dim, ); // O projection (auto-dispatch) ruvllm::kernels::ane_ops::matmul_auto( &v, &w_o, &mut attn_output, batch * seq_len, hidden_dim, hidden_dim, ); // MLP: up projection (auto-dispatch) ruvllm::kernels::ane_ops::matmul_auto( &attn_output, &w_up, &mut intermediate, batch * seq_len, hidden_dim, intermediate_dim, ); // MLP: SiLU activation (auto-dispatch - typically ANE) ruvllm::kernels::ane_ops::silu_auto( black_box(&mut intermediate), batch * seq_len, intermediate_dim, ); // MLP: down projection (auto-dispatch) ruvllm::kernels::ane_ops::matmul_auto( &intermediate, &w_down, &mut mlp_output, batch * seq_len, intermediate_dim, hidden_dim, ); }) }); } group.finish(); } // ============================================================================ // Activation Crossover Benchmark // ============================================================================ /// Benchmark activation functions to find ANE vs NEON crossover fn bench_activation_crossover(c: &mut Criterion) { let mut group = c.benchmark_group("activation_crossover"); group.sample_size(50); // Test various sizes to find where ANE beats NEON let sizes = [ (1, 128), // Tiny (1, 512), // Small (1, 2048), // Medium (1, 4096), // Llama hidden (1, 11008), // Llama intermediate (32, 4096), // Batch (64, 4096), // Larger batch (128, 4096), // Big batch ]; for (batch_size, dim) in sizes { let size = batch_size * dim; let x_orig = random_tensor(size); let id_suffix = format!("{}x{}", batch_size, dim); group.throughput(Throughput::Elements(size as u64)); // NEON SiLU #[cfg(target_arch = "aarch64")] { let mut x = x_orig.clone(); let id = BenchmarkId::new("silu_neon", &id_suffix); group.bench_function(id, |bencher| { bencher.iter(|| { x.copy_from_slice(&x_orig); ruvllm::kernels::activations::batch_silu(black_box(&mut x), dim); }) }); } // ANE SiLU #[cfg(all(target_os = "macos", feature = "coreml"))] { let mut x = x_orig.clone(); let id = BenchmarkId::new("silu_ane", &id_suffix); group.bench_function(id, |bencher| { bencher.iter(|| { x.copy_from_slice(&x_orig); ruvllm::kernels::ane_ops::silu_ane(black_box(&mut x), batch_size, dim); }) }); } // Auto-dispatch SiLU #[cfg(all(target_os = "macos", feature = "coreml"))] { let mut x = x_orig.clone(); let id = BenchmarkId::new("silu_auto", &id_suffix); group.bench_function(id, |bencher| { bencher.iter(|| { x.copy_from_slice(&x_orig); ruvllm::kernels::ane_ops::silu_auto(black_box(&mut x), batch_size, dim); }) }); } } group.finish(); } // ============================================================================ // Criterion Groups // ============================================================================ // Full benchmark group for macOS with both features #[cfg(all(target_os = "macos", feature = "coreml"))] criterion_group!( benches, bench_gemm_comparison, bench_batched_gemm_comparison, bench_gelu_comparison, bench_silu_comparison, bench_softmax_comparison, bench_layer_norm_comparison, bench_rms_norm_comparison, bench_auto_dispatch, bench_mlp_block, bench_crossover_detection, bench_hybrid_pipeline, bench_activation_crossover, ); // Reduced benchmark group for non-coreml builds #[cfg(not(all(target_os = "macos", feature = "coreml")))] criterion_group!( benches, bench_gemm_comparison, bench_batched_gemm_comparison, bench_gelu_comparison, bench_silu_comparison, bench_softmax_comparison, bench_layer_norm_comparison, bench_rms_norm_comparison, bench_mlp_block, bench_crossover_detection, bench_activation_crossover, ); criterion_main!(benches);