// Extraction artifact (file-listing header), kept as a comment so the file compiles:
// Files
// wifi-densepose/vendor/ruvector/crates/ruvllm/benches/ane_bench.rs
// 1229 lines, 40 KiB, Rust

#![allow(
clippy::all,
unused_imports,
unused_variables,
dead_code,
unused_mut,
unused_assignments,
non_camel_case_types,
clippy::approx_constant,
unexpected_cfgs,
unused_must_use,
unused_parens
)]
//! ANE vs NEON Benchmark Suite
//!
//! Compares Apple Neural Engine (via BNNS) operations against
//! hand-optimized NEON implementations.
//!
//! ## Running Benchmarks
//!
//! ANE benchmarks (requires macOS with coreml feature):
//! ```bash
//! cargo bench -p ruvllm --features coreml --bench ane_bench
//! ```
//!
//! Compare ANE vs Accelerate:
//! ```bash
//! cargo bench -p ruvllm --features coreml,accelerate --bench ane_bench
//! ```
//!
//! ## Performance Targets (M4 Pro)
//!
//! | Operation | Size | ANE Target | NEON Baseline | Expected Speedup |
//! |-----------|------|------------|---------------|------------------|
//! | GEMM | 1x4096x4096 | <500us | <800us | 1.5-2x |
//! | GELU | 64x4096 | <100us | <150us | 1.3-1.5x |
//! | SiLU | 64x4096 | <100us | <150us | 1.3-1.5x |
//! | Softmax | 64x4096 | <150us | <200us | 1.2-1.4x |
//! | LayerNorm | 64x4096 | <200us | <250us | 1.2-1.3x |
//!
//! ## Power Efficiency
//!
//! ANE typically provides 3-4x better performance per watt compared to
//! GPU or CPU for supported operations. This benchmark suite measures
//! wall-clock time, not power consumption.
//!
//! To measure power consumption on macOS, use:
//! ```bash
//! sudo powermetrics --samplers tasks -i 100 | grep ruvllm
//! ```
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
use rand::Rng;
// ============================================================================
// Helper Functions
// ============================================================================
/// Generate random tensor data
/// Generate a tensor of `size` values drawn uniformly from [-1.0, 1.0).
fn random_tensor(size: usize) -> Vec<f32> {
    let mut rng = rand::thread_rng();
    let mut data = Vec::with_capacity(size);
    for _ in 0..size {
        data.push(rng.gen_range(-1.0..1.0));
    }
    data
}
/// Generate random positive tensor (for softmax stability testing)
/// Generate a tensor of `size` values drawn uniformly from [0.0, 10.0).
/// Positive inputs keep softmax numerically interesting but stable.
fn random_positive_tensor(size: usize) -> Vec<f32> {
    let mut rng = rand::thread_rng();
    let mut data = Vec::with_capacity(size);
    for _ in 0..size {
        data.push(rng.gen_range(0.0..10.0));
    }
    data
}
// ============================================================================
// Matrix Multiplication Benchmarks
// ============================================================================
/// Compare GEMM implementations: ANE vs Accelerate vs NEON
/// Compare GEMM implementations: ANE vs Accelerate vs NEON
///
/// Registers up to three criterion benchmarks per matrix shape; each backend
/// is gated on compile target / cargo features, so a given build may only run
/// a subset. Throughput is reported via `Throughput::Elements`, but the
/// element count actually encodes FLOPs (2*m*k*n), so criterion's
/// "elements/sec" reads as FLOPS.
fn bench_gemm_comparison(c: &mut Criterion) {
    let mut group = c.benchmark_group("gemm_ane_vs_neon");
    group.sample_size(30);
    // Test various matrix sizes relevant to LLM inference
    // Format: (m, k, n) - m=batch, k=input_dim, n=output_dim
    //
    // Size categories:
    // - Small (128x128, 256x256): ANE should dominate (~30-50% faster)
    // - Medium (512x512, 1024x1024): Transition zone, ANE slight edge
    // - Large (2048x2048, 4096x4096): GPU crossover zone
    // - Very Large (8192x8192): GPU clear winner
    let sizes = [
        // Small matrices - ANE advantage zone
        (1, 128, 128),   // Tiny matmul - ANE wins
        (1, 256, 256),   // Small matmul - ANE wins
        (1, 512, 512),   // Medium-small - ANE edge
        // Medium matrices - Transition zone
        (1, 1024, 1024), // ANE/GPU crossover starts
        (1, 2048, 2048), // Crossover zone
        // Large matrices - GPU advantage
        (1, 4096, 4096),  // Single token, typical projection - GPU starts winning
        (1, 4096, 11008), // Llama MLP up-projection
        (1, 11008, 4096), // Llama MLP down-projection
        // Batch inference - ANE optimal for small batches
        (8, 4096, 4096),   // Small batch
        (32, 4096, 4096),  // Medium batch
        (64, 4096, 4096),  // Optimal ANE batch size
        (128, 4096, 4096), // Beyond ANE optimal - GPU wins
    ];
    for (m, k, n) in sizes {
        // Fresh inputs per size; the output buffer is reused across backends
        // within this size (contents are overwritten each call, so that's fine).
        let a = random_tensor(m * k);
        let b = random_tensor(k * n);
        let mut c_out = vec![0.0f32; m * n];
        let flops = 2 * m * k * n;
        let id_suffix = format!("{}x{}x{}", m, k, n);
        group.throughput(Throughput::Elements(flops as u64));
        // NEON baseline (always available on aarch64)
        #[cfg(target_arch = "aarch64")]
        {
            let id = BenchmarkId::new("neon", &id_suffix);
            group.bench_function(id, |bencher| {
                bencher.iter(|| {
                    // Use local GEMM implementation to avoid module dependency issues
                    gemm_neon_local(black_box(&a), black_box(&b), black_box(&mut c_out), m, k, n);
                })
            });
        }
        // Accelerate (uses AMX coprocessor)
        #[cfg(all(target_os = "macos", feature = "accelerate"))]
        {
            let id = BenchmarkId::new("accelerate", &id_suffix);
            group.bench_function(id, |bencher| {
                bencher.iter(|| {
                    ruvllm::kernels::accelerate::gemm_accelerate(
                        black_box(&a),
                        black_box(&b),
                        black_box(&mut c_out),
                        m,
                        k,
                        n,
                    );
                })
            });
        }
        // ANE via BNNS/Accelerate
        #[cfg(all(target_os = "macos", feature = "coreml"))]
        {
            let id = BenchmarkId::new("ane", &id_suffix);
            group.bench_function(id, |bencher| {
                bencher.iter(|| {
                    ruvllm::kernels::ane_ops::matmul_ane(
                        black_box(&a),
                        black_box(&b),
                        black_box(&mut c_out),
                        m,
                        k,
                        n,
                    );
                })
            });
        }
    }
    group.finish();
}
/// Benchmark batched matrix multiplication
/// Benchmark batched matrix multiplication
///
/// The NEON path loops over batches calling the single-matrix GEMM on
/// subslices, while the ANE path calls a dedicated batched kernel once —
/// so this also measures per-call dispatch overhead differences.
/// Throughput "elements" encode FLOPs (2*batch*m*k*n).
fn bench_batched_gemm_comparison(c: &mut Criterion) {
    let mut group = c.benchmark_group("batched_gemm_ane_vs_neon");
    group.sample_size(30);
    // Typical attention shapes: batch of Q*K^T or attention*V
    let configs = [
        (8, 128, 128, 128),  // 8 heads, seq=128
        (32, 128, 128, 128), // 32 heads, seq=128
        (32, 256, 128, 256), // 32 heads, seq=256, head_dim=128
        (8, 512, 128, 512),  // 8 heads, seq=512
    ];
    for (batch_size, m, k, n) in configs {
        let a = random_tensor(batch_size * m * k);
        let b = random_tensor(batch_size * k * n);
        let mut c_out = vec![0.0f32; batch_size * m * n];
        let flops = 2 * batch_size * m * k * n;
        let id_suffix = format!("batch{}_{}x{}x{}", batch_size, m, k, n);
        group.throughput(Throughput::Elements(flops as u64));
        // NEON batched
        #[cfg(target_arch = "aarch64")]
        {
            let id = BenchmarkId::new("neon", &id_suffix);
            group.bench_function(id, |bencher| {
                bencher.iter(|| {
                    // One GEMM per batch entry over contiguous sub-slices.
                    for batch in 0..batch_size {
                        let a_off = batch * m * k;
                        let b_off = batch * k * n;
                        let c_off = batch * m * n;
                        gemm_neon_local(
                            black_box(&a[a_off..a_off + m * k]),
                            black_box(&b[b_off..b_off + k * n]),
                            black_box(&mut c_out[c_off..c_off + m * n]),
                            m,
                            k,
                            n,
                        );
                    }
                })
            });
        }
        // ANE batched
        #[cfg(all(target_os = "macos", feature = "coreml"))]
        {
            let id = BenchmarkId::new("ane", &id_suffix);
            group.bench_function(id, |bencher| {
                bencher.iter(|| {
                    ruvllm::kernels::ane_ops::batched_matmul_ane(
                        black_box(&a),
                        black_box(&b),
                        black_box(&mut c_out),
                        batch_size,
                        m,
                        k,
                        n,
                    );
                })
            });
        }
    }
    group.finish();
}
// ============================================================================
// Activation Function Benchmarks
// ============================================================================
/// Compare GELU implementations
/// Compare GELU implementations
///
/// The activation kernels work in place, so each timed iteration first
/// restores the buffer from `x_orig`; that `copy_from_slice` cost is
/// included in the measurement for both backends, keeping the comparison
/// fair (but the absolute numbers include the copy).
fn bench_gelu_comparison(c: &mut Criterion) {
    let mut group = c.benchmark_group("gelu_ane_vs_neon");
    group.sample_size(50);
    // Various batch and dimension sizes
    let configs = [
        (1, 4096),
        (8, 4096),
        (32, 4096),
        (64, 4096),
        (1, 11008), // Llama MLP intermediate
        (32, 11008),
    ];
    for (batch_size, dim) in configs {
        let size = batch_size * dim;
        let x_orig = random_tensor(size);
        let ops = size; // One GELU per element
        let id_suffix = format!("{}x{}", batch_size, dim);
        group.throughput(Throughput::Elements(ops as u64));
        // NEON
        #[cfg(target_arch = "aarch64")]
        {
            let mut x = x_orig.clone();
            let id = BenchmarkId::new("neon", &id_suffix);
            group.bench_function(id, |bencher| {
                bencher.iter(|| {
                    x.copy_from_slice(&x_orig);
                    // batch_gelu takes only the row length; presumably it
                    // derives the row count from x.len() / dim.
                    ruvllm::kernels::activations::batch_gelu(black_box(&mut x), dim);
                })
            });
        }
        // ANE
        #[cfg(all(target_os = "macos", feature = "coreml"))]
        {
            let mut x = x_orig.clone();
            let id = BenchmarkId::new("ane", &id_suffix);
            group.bench_function(id, |bencher| {
                bencher.iter(|| {
                    x.copy_from_slice(&x_orig);
                    ruvllm::kernels::ane_ops::gelu_ane(black_box(&mut x), batch_size, dim);
                })
            });
        }
    }
    group.finish();
}
/// Compare SiLU implementations
/// Compare SiLU implementations
///
/// Same structure as the GELU benchmark: in-place kernels, buffer restored
/// from `x_orig` inside the timed closure (copy cost included for both
/// backends equally).
fn bench_silu_comparison(c: &mut Criterion) {
    let mut group = c.benchmark_group("silu_ane_vs_neon");
    group.sample_size(50);
    let configs = [
        (1, 4096),
        (8, 4096),
        (32, 4096),
        (64, 4096),
        (1, 11008),
        (32, 11008),
    ];
    for (batch_size, dim) in configs {
        let size = batch_size * dim;
        let x_orig = random_tensor(size);
        let ops = size;
        let id_suffix = format!("{}x{}", batch_size, dim);
        group.throughput(Throughput::Elements(ops as u64));
        // NEON
        #[cfg(target_arch = "aarch64")]
        {
            let mut x = x_orig.clone();
            let id = BenchmarkId::new("neon", &id_suffix);
            group.bench_function(id, |bencher| {
                bencher.iter(|| {
                    x.copy_from_slice(&x_orig);
                    ruvllm::kernels::activations::batch_silu(black_box(&mut x), dim);
                })
            });
        }
        // ANE
        #[cfg(all(target_os = "macos", feature = "coreml"))]
        {
            let mut x = x_orig.clone();
            let id = BenchmarkId::new("ane", &id_suffix);
            group.bench_function(id, |bencher| {
                bencher.iter(|| {
                    x.copy_from_slice(&x_orig);
                    ruvllm::kernels::ane_ops::silu_ane(black_box(&mut x), batch_size, dim);
                })
            });
        }
    }
    group.finish();
}
/// Compare Softmax implementations
/// Compare Softmax implementations
///
/// Uses positive inputs (see `random_positive_tensor`) so the exp/normalize
/// path exercises realistic attention-score magnitudes. In-place kernels;
/// buffer restored inside the timed closure for both backends.
fn bench_softmax_comparison(c: &mut Criterion) {
    let mut group = c.benchmark_group("softmax_ane_vs_neon");
    group.sample_size(50);
    // Softmax is typically applied to attention scores
    let configs = [
        (1, 128),   // Single head, short seq
        (32, 128),  // 32 heads, short seq
        (32, 512),  // 32 heads, medium seq
        (32, 2048), // 32 heads, long seq
        (1, 4096),  // Single head, very long
    ];
    for (batch_size, dim) in configs {
        let size = batch_size * dim;
        let x_orig = random_positive_tensor(size);
        let ops = size;
        let id_suffix = format!("{}x{}", batch_size, dim);
        group.throughput(Throughput::Elements(ops as u64));
        // NEON
        #[cfg(target_arch = "aarch64")]
        {
            let mut x = x_orig.clone();
            let id = BenchmarkId::new("neon", &id_suffix);
            group.bench_function(id, |bencher| {
                bencher.iter(|| {
                    x.copy_from_slice(&x_orig);
                    ruvllm::kernels::activations::batch_softmax(black_box(&mut x), dim);
                })
            });
        }
        // ANE
        #[cfg(all(target_os = "macos", feature = "coreml"))]
        {
            let mut x = x_orig.clone();
            let id = BenchmarkId::new("ane", &id_suffix);
            group.bench_function(id, |bencher| {
                bencher.iter(|| {
                    x.copy_from_slice(&x_orig);
                    ruvllm::kernels::ane_ops::softmax_ane(black_box(&mut x), batch_size, dim);
                })
            });
        }
    }
    group.finish();
}
// ============================================================================
// Normalization Benchmarks
// ============================================================================
/// Compare LayerNorm implementations
/// Compare LayerNorm implementations
///
/// Identity weight (all 1.0) and zero bias keep the kernels on the same
/// arithmetic path as real usage without needing trained parameters.
/// Throughput "elements" are a rough op count (size * 4).
fn bench_layer_norm_comparison(c: &mut Criterion) {
    let mut group = c.benchmark_group("layernorm_ane_vs_neon");
    group.sample_size(50);
    let configs = [(1, 4096), (8, 4096), (32, 4096), (64, 4096), (128, 4096)];
    for (batch_size, dim) in configs {
        let size = batch_size * dim;
        let x_orig = random_tensor(size);
        let weight = vec![1.0f32; dim];
        let bias = vec![0.0f32; dim];
        let ops = size * 4; // Approximate: mean, var, normalize, scale
        let id_suffix = format!("{}x{}", batch_size, dim);
        group.throughput(Throughput::Elements(ops as u64));
        // NEON
        #[cfg(target_arch = "aarch64")]
        {
            let mut x = x_orig.clone();
            let id = BenchmarkId::new("neon", &id_suffix);
            group.bench_function(id, |bencher| {
                bencher.iter(|| {
                    x.copy_from_slice(&x_orig);
                    ruvllm::kernels::norm::batched_layer_norm_neon(
                        black_box(&mut x),
                        black_box(&weight),
                        black_box(&bias),
                        batch_size,
                        dim,
                        1e-6, // epsilon for numerical stability
                    );
                })
            });
        }
        // ANE
        #[cfg(all(target_os = "macos", feature = "coreml"))]
        {
            let mut x = x_orig.clone();
            let id = BenchmarkId::new("ane", &id_suffix);
            group.bench_function(id, |bencher| {
                bencher.iter(|| {
                    x.copy_from_slice(&x_orig);
                    ruvllm::kernels::ane_ops::layer_norm_ane(
                        black_box(&mut x),
                        black_box(&weight),
                        black_box(&bias),
                        batch_size,
                        dim,
                        1e-6,
                    );
                })
            });
        }
    }
    group.finish();
}
/// Compare RMSNorm implementations
/// Compare RMSNorm implementations
///
/// Same setup as LayerNorm benchmark but without a bias vector (RMSNorm
/// has none). Identity weight; epsilon 1e-6 matches the LayerNorm runs.
fn bench_rms_norm_comparison(c: &mut Criterion) {
    let mut group = c.benchmark_group("rmsnorm_ane_vs_neon");
    group.sample_size(50);
    let configs = [(1, 4096), (8, 4096), (32, 4096), (64, 4096), (128, 4096)];
    for (batch_size, dim) in configs {
        let size = batch_size * dim;
        let x_orig = random_tensor(size);
        let weight = vec![1.0f32; dim];
        let ops = size * 3; // Approximate: sum_sq, normalize, scale
        let id_suffix = format!("{}x{}", batch_size, dim);
        group.throughput(Throughput::Elements(ops as u64));
        // NEON
        #[cfg(target_arch = "aarch64")]
        {
            let mut x = x_orig.clone();
            let id = BenchmarkId::new("neon", &id_suffix);
            group.bench_function(id, |bencher| {
                bencher.iter(|| {
                    x.copy_from_slice(&x_orig);
                    ruvllm::kernels::norm::batched_rms_norm_neon(
                        black_box(&mut x),
                        black_box(&weight),
                        batch_size,
                        dim,
                        1e-6,
                    );
                })
            });
        }
        // ANE
        #[cfg(all(target_os = "macos", feature = "coreml"))]
        {
            let mut x = x_orig.clone();
            let id = BenchmarkId::new("ane", &id_suffix);
            group.bench_function(id, |bencher| {
                bencher.iter(|| {
                    x.copy_from_slice(&x_orig);
                    ruvllm::kernels::ane_ops::rms_norm_ane(
                        black_box(&mut x),
                        black_box(&weight),
                        batch_size,
                        dim,
                        1e-6,
                    );
                })
            });
        }
    }
    group.finish();
}
// ============================================================================
// Auto-Dispatch Benchmarks
// ============================================================================
/// Test the auto-dispatch functions that select best backend
/// Test the auto-dispatch functions that select best backend
///
/// On coreml builds this times the `*_auto` wrappers (which presumably pick
/// ANE vs CPU internally — including their dispatch overhead); on other
/// builds it falls back to the NEON kernels so the benchmark IDs still
/// exist for cross-build comparison. Fixed shape: 32 x 4096.
fn bench_auto_dispatch(c: &mut Criterion) {
    let mut group = c.benchmark_group("auto_dispatch");
    group.sample_size(50);
    let batch_size = 32;
    let dim = 4096;
    let size = batch_size * dim;
    let x_orig = random_tensor(size);
    let weight = vec![1.0f32; dim];
    let bias = vec![0.0f32; dim];
    // Auto-dispatch GELU
    {
        let mut x = x_orig.clone();
        group.bench_function("gelu_auto", |bencher| {
            bencher.iter(|| {
                // Restore input each iteration (kernels are in-place).
                x.copy_from_slice(&x_orig);
                #[cfg(all(target_os = "macos", feature = "coreml"))]
                ruvllm::kernels::ane_ops::gelu_auto(black_box(&mut x), batch_size, dim);
                #[cfg(not(all(target_os = "macos", feature = "coreml")))]
                ruvllm::kernels::activations::batch_gelu(black_box(&mut x), dim);
            })
        });
    }
    // Auto-dispatch SiLU
    {
        let mut x = x_orig.clone();
        group.bench_function("silu_auto", |bencher| {
            bencher.iter(|| {
                x.copy_from_slice(&x_orig);
                #[cfg(all(target_os = "macos", feature = "coreml"))]
                ruvllm::kernels::ane_ops::silu_auto(black_box(&mut x), batch_size, dim);
                #[cfg(not(all(target_os = "macos", feature = "coreml")))]
                ruvllm::kernels::activations::batch_silu(black_box(&mut x), dim);
            })
        });
    }
    // Auto-dispatch LayerNorm
    {
        let mut x = x_orig.clone();
        group.bench_function("layernorm_auto", |bencher| {
            bencher.iter(|| {
                x.copy_from_slice(&x_orig);
                #[cfg(all(target_os = "macos", feature = "coreml"))]
                ruvllm::kernels::ane_ops::layer_norm_auto(
                    black_box(&mut x),
                    black_box(&weight),
                    black_box(&bias),
                    batch_size,
                    dim,
                    1e-6,
                );
                #[cfg(not(all(target_os = "macos", feature = "coreml")))]
                ruvllm::kernels::norm::batched_layer_norm_neon(
                    black_box(&mut x),
                    black_box(&weight),
                    black_box(&bias),
                    batch_size,
                    dim,
                    1e-6,
                );
            })
        });
    }
    group.finish();
}
// ============================================================================
// LLM Workload Benchmarks (Realistic Scenarios)
// ============================================================================
/// Benchmark typical MLP block operations
/// Benchmark typical MLP block operations
///
/// Times a whole Llama-style MLP sub-block (up-projection -> SiLU ->
/// down-projection) end-to-end on each backend, rather than the kernels in
/// isolation, so inter-op overhead is included. Note this models an
/// up/down MLP without the gate projection of Llama's SwiGLU.
fn bench_mlp_block(c: &mut Criterion) {
    let mut group = c.benchmark_group("mlp_block");
    group.sample_size(20);
    // Llama2-7B MLP: hidden_dim=4096, intermediate=11008
    let batch_size = 1;
    let hidden_dim = 4096;
    let intermediate_dim = 11008;
    // Up projection weights
    let w_up = random_tensor(hidden_dim * intermediate_dim);
    // Down projection weights
    let w_down = random_tensor(intermediate_dim * hidden_dim);
    let input = random_tensor(batch_size * hidden_dim);
    let mut intermediate = vec![0.0f32; batch_size * intermediate_dim];
    let mut output = vec![0.0f32; batch_size * hidden_dim];
    // FLOP estimate for the whole block (reported as criterion "elements").
    let total_flops = 2 * batch_size * hidden_dim * intermediate_dim // Up
        + batch_size * intermediate_dim // Activation
        + 2 * batch_size * intermediate_dim * hidden_dim; // Down
    group.throughput(Throughput::Elements(total_flops as u64));
    // NEON path
    #[cfg(target_arch = "aarch64")]
    {
        group.bench_function("neon", |bencher| {
            bencher.iter(|| {
                // Up projection
                gemm_neon_local(
                    black_box(&input),
                    black_box(&w_up),
                    black_box(&mut intermediate),
                    batch_size,
                    hidden_dim,
                    intermediate_dim,
                );
                // SiLU activation (in-place on the intermediate buffer)
                ruvllm::kernels::activations::batch_silu(
                    black_box(&mut intermediate),
                    intermediate_dim,
                );
                // Down projection
                gemm_neon_local(
                    black_box(&intermediate),
                    black_box(&w_down),
                    black_box(&mut output),
                    batch_size,
                    intermediate_dim,
                    hidden_dim,
                );
            })
        });
    }
    // ANE path
    #[cfg(all(target_os = "macos", feature = "coreml"))]
    {
        group.bench_function("ane", |bencher| {
            bencher.iter(|| {
                // Up projection
                ruvllm::kernels::ane_ops::matmul_ane(
                    black_box(&input),
                    black_box(&w_up),
                    black_box(&mut intermediate),
                    batch_size,
                    hidden_dim,
                    intermediate_dim,
                );
                // SiLU activation
                ruvllm::kernels::ane_ops::silu_ane(
                    black_box(&mut intermediate),
                    batch_size,
                    intermediate_dim,
                );
                // Down projection
                ruvllm::kernels::ane_ops::matmul_ane(
                    black_box(&intermediate),
                    black_box(&w_down),
                    black_box(&mut output),
                    batch_size,
                    intermediate_dim,
                    hidden_dim,
                );
            })
        });
    }
    group.finish();
}
// ============================================================================
// Local NEON GEMM Implementation (to avoid module dependency issues)
// ============================================================================
/// Local NEON GEMM: C = A * B for row-major A (m x k), B (k x n), C (m x n).
///
/// Kept local so the benchmark does not depend on ruvllm's internal module
/// layout. Strategy: broadcast one A element across a 4-lane f32 vector and
/// fused-multiply-accumulate against 4 consecutive B columns; a scalar tail
/// loop handles n % 4 leftover columns.
#[cfg(target_arch = "aarch64")]
fn gemm_neon_local(a: &[f32], b: &[f32], c: &mut [f32], m: usize, k: usize, n: usize) {
    // Defensive zero: every c[i*n + j] for i<m, j<n is overwritten below,
    // so this only matters if c is longer than m*n.
    c.fill(0.0);
    // SAFETY: all pointer offsets are < m*k (a), < k*n (b), < m*n (c); the
    // caller must pass slices at least that long (true at every call site in
    // this file). NEON intrinsics are unconditionally available on aarch64.
    unsafe {
        use std::arch::aarch64::*;
        let a_ptr = a.as_ptr();
        let b_ptr = b.as_ptr();
        let c_ptr = c.as_mut_ptr();
        for i in 0..m {
            let mut j = 0usize;
            // Vectorized main loop: 4 output columns at a time.
            while j + 4 <= n {
                let mut acc = vdupq_n_f32(0.0);
                for kk in 0..k {
                    let a_val = vdupq_n_f32(*a_ptr.add(i * k + kk));
                    let b_v = vld1q_f32(b_ptr.add(kk * n + j));
                    acc = vfmaq_f32(acc, a_val, b_v);
                }
                vst1q_f32(c_ptr.add(i * n + j), acc);
                j += 4;
            }
            // Handle remaining columns
            while j < n {
                let mut sum = 0.0f32;
                for kk in 0..k {
                    sum += *a_ptr.add(i * k + kk) * *b_ptr.add(kk * n + j);
                }
                *c_ptr.add(i * n + j) = sum;
                j += 1;
            }
        }
    }
}
/// Portable scalar GEMM fallback: C = A * B for row-major A (m x k),
/// B (k x n), C (m x n). Used on non-aarch64 targets.
///
/// Uses the cache-friendly i-k-j loop order so B is traversed row by row
/// (contiguously) instead of striding down a column per output element.
/// The per-element accumulation order over `kk` is identical to the naive
/// i-j-k version, so results are bitwise identical; only memory access
/// order (and speed) changes. Slice iterators also let the compiler drop
/// per-access bounds checks.
#[cfg(not(target_arch = "aarch64"))]
fn gemm_neon_local(a: &[f32], b: &[f32], c: &mut [f32], m: usize, k: usize, n: usize) {
    // Zero the output first; we accumulate into it below.
    c.fill(0.0);
    for i in 0..m {
        let a_row = &a[i * k..(i + 1) * k];
        let c_row = &mut c[i * n..(i + 1) * n];
        for (kk, &a_val) in a_row.iter().enumerate() {
            let b_row = &b[kk * n..(kk + 1) * n];
            for (c_el, &b_el) in c_row.iter_mut().zip(b_row.iter()) {
                *c_el += a_val * b_el;
            }
        }
    }
}
// ============================================================================
// Crossover Point Detection Benchmark
// ============================================================================
/// Benchmark to identify the exact crossover point where GPU beats ANE
///
/// This benchmark tests matrix sizes in increments to find where:
/// 1. ANE is clearly faster (small matrices)
/// 2. Performance is similar (crossover zone)
/// 3. GPU is clearly faster (large matrices)
///
/// Expected M4 Pro results:
/// - ANE wins: dim < 1024
/// - Crossover: 1024 <= dim <= 2048
/// - GPU wins: dim > 2048
/// Benchmark to identify the exact crossover point where GPU beats ANE
///
/// This benchmark tests matrix sizes in increments to find where:
/// 1. ANE is clearly faster (small matrices)
/// 2. Performance is similar (crossover zone)
/// 3. GPU is clearly faster (large matrices)
///
/// Expected M4 Pro results:
/// - ANE wins: dim < 1024
/// - Crossover: 1024 <= dim <= 2048
/// - GPU wins: dim > 2048
///
/// Square dim x dim GEMMs only; throughput "elements" encode FLOPs (2*dim^3).
fn bench_crossover_detection(c: &mut Criterion) {
    let mut group = c.benchmark_group("crossover_detection");
    group.sample_size(20);
    // Test dimensions in powers of 2 to find crossover
    // (plus 768/1536/3072 to refine the transition region)
    let dimensions = [64, 128, 256, 512, 768, 1024, 1536, 2048, 3072, 4096];
    for dim in dimensions {
        let a = random_tensor(dim * dim);
        let b = random_tensor(dim * dim);
        let mut c_out = vec![0.0f32; dim * dim];
        let flops = 2 * dim * dim * dim;
        let id_suffix = format!("{}x{}", dim, dim);
        group.throughput(Throughput::Elements(flops as u64));
        // NEON baseline
        #[cfg(target_arch = "aarch64")]
        {
            let id = BenchmarkId::new("neon", &id_suffix);
            group.bench_function(id, |bencher| {
                bencher.iter(|| {
                    gemm_neon_local(
                        black_box(&a),
                        black_box(&b),
                        black_box(&mut c_out),
                        dim,
                        dim,
                        dim,
                    );
                })
            });
        }
        // ANE via BNNS
        #[cfg(all(target_os = "macos", feature = "coreml"))]
        {
            let id = BenchmarkId::new("ane", &id_suffix);
            group.bench_function(id, |bencher| {
                bencher.iter(|| {
                    ruvllm::kernels::ane_ops::matmul_ane(
                        black_box(&a),
                        black_box(&b),
                        black_box(&mut c_out),
                        dim,
                        dim,
                        dim,
                    );
                })
            });
        }
        // Accelerate (AMX)
        #[cfg(all(target_os = "macos", feature = "accelerate"))]
        {
            let id = BenchmarkId::new("accelerate", &id_suffix);
            group.bench_function(id, |bencher| {
                bencher.iter(|| {
                    ruvllm::kernels::accelerate::gemm_accelerate(
                        black_box(&a),
                        black_box(&b),
                        black_box(&mut c_out),
                        dim,
                        dim,
                        dim,
                    );
                })
            });
        }
    }
    group.finish();
}
// ============================================================================
// Hybrid Pipeline Benchmarks (ANE for MLP, GPU for Attention)
// ============================================================================
/// Benchmark hybrid ANE+GPU pipeline for transformer inference
///
/// Real transformer layers have different compute patterns:
/// - Attention: memory-bound, GPU-friendly (high parallelism)
/// - MLP: compute-bound, ANE-friendly (batch operations)
///
/// This benchmark simulates a hybrid pipeline where:
/// 1. ANE handles MLP layers (activations, small projections)
/// 2. GPU/NEON handles attention (Q*K^T, softmax*V)
/// Benchmark hybrid ANE+GPU pipeline for transformer inference
///
/// Real transformer layers have different compute patterns:
/// - Attention: memory-bound, GPU-friendly (high parallelism)
/// - MLP: compute-bound, ANE-friendly (batch operations)
///
/// This benchmark simulates a hybrid pipeline where:
/// 1. ANE handles MLP layers (activations, small projections)
/// 2. GPU/NEON handles attention (Q*K^T, softmax*V)
///
/// NOTE(review): no path below actually computes attention scores
/// (Q*K^T, softmax, scores*V) — the O projection is applied directly to
/// `v` — yet `total_ops` includes an attention-FLOP term, so the reported
/// throughput overstates work actually done. All three paths do the same
/// thing, so the *relative* comparison is still fair.
#[cfg(all(target_os = "macos", feature = "coreml"))]
fn bench_hybrid_pipeline(c: &mut Criterion) {
    let mut group = c.benchmark_group("hybrid_pipeline");
    group.sample_size(15);
    // Transformer configuration (Llama-7B like)
    let configs = [
        // (batch, seq_len, hidden, heads, head_dim, intermediate)
        (1, 128, 4096, 32, 128, 11008),  // Short context
        (1, 512, 4096, 32, 128, 11008),  // Medium context
        (1, 2048, 4096, 32, 128, 11008), // Long context
    ];
    for (batch, seq_len, hidden_dim, num_heads, head_dim, intermediate_dim) in configs {
        let id_suffix = format!("batch{}_seq{}", batch, seq_len);
        // Pre-allocate tensors (weights + activation buffers reused across
        // paths; each stage fully overwrites its output buffer).
        let hidden = random_tensor(batch * seq_len * hidden_dim);
        let w_q = random_tensor(hidden_dim * hidden_dim);
        let w_k = random_tensor(hidden_dim * hidden_dim);
        let w_v = random_tensor(hidden_dim * hidden_dim);
        let w_o = random_tensor(hidden_dim * hidden_dim);
        let w_up = random_tensor(hidden_dim * intermediate_dim);
        let w_down = random_tensor(intermediate_dim * hidden_dim);
        let mut q = vec![0.0f32; batch * seq_len * hidden_dim];
        let mut k = vec![0.0f32; batch * seq_len * hidden_dim];
        let mut v = vec![0.0f32; batch * seq_len * hidden_dim];
        let mut attn_output = vec![0.0f32; batch * seq_len * hidden_dim];
        let mut intermediate = vec![0.0f32; batch * seq_len * intermediate_dim];
        let mut mlp_output = vec![0.0f32; batch * seq_len * hidden_dim];
        // Nominal FLOP count for a full layer (see NOTE above: the attention
        // term is counted but not executed).
        let total_ops =
            // Q, K, V projections
            3 * 2 * batch * seq_len * hidden_dim * hidden_dim +
            // Attention (Q*K^T + softmax + attn*V)
            2 * batch * num_heads * seq_len * seq_len * head_dim * 2 +
            // O projection
            2 * batch * seq_len * hidden_dim * hidden_dim +
            // MLP up + down
            2 * batch * seq_len * hidden_dim * intermediate_dim * 2 +
            // Activations
            batch * seq_len * intermediate_dim;
        group.throughput(Throughput::Elements(total_ops as u64));
        // Pure NEON path
        #[cfg(target_arch = "aarch64")]
        {
            let id = BenchmarkId::new("pure_neon", &id_suffix);
            group.bench_function(id, |bencher| {
                bencher.iter(|| {
                    // Q, K, V projections
                    gemm_neon_local(
                        &hidden,
                        &w_q,
                        &mut q,
                        batch * seq_len,
                        hidden_dim,
                        hidden_dim,
                    );
                    gemm_neon_local(
                        &hidden,
                        &w_k,
                        &mut k,
                        batch * seq_len,
                        hidden_dim,
                        hidden_dim,
                    );
                    gemm_neon_local(
                        &hidden,
                        &w_v,
                        &mut v,
                        batch * seq_len,
                        hidden_dim,
                        hidden_dim,
                    );
                    // O projection (applied to V directly; attention scores
                    // are skipped — see function NOTE)
                    gemm_neon_local(
                        &v,
                        &w_o,
                        &mut attn_output,
                        batch * seq_len,
                        hidden_dim,
                        hidden_dim,
                    );
                    // MLP: up projection
                    gemm_neon_local(
                        &attn_output,
                        &w_up,
                        &mut intermediate,
                        batch * seq_len,
                        hidden_dim,
                        intermediate_dim,
                    );
                    // MLP: SiLU activation (in-place)
                    ruvllm::kernels::activations::batch_silu(
                        black_box(&mut intermediate),
                        intermediate_dim,
                    );
                    // MLP: down projection
                    gemm_neon_local(
                        &intermediate,
                        &w_down,
                        &mut mlp_output,
                        batch * seq_len,
                        intermediate_dim,
                        hidden_dim,
                    );
                })
            });
        }
        // Pure ANE path
        let id = BenchmarkId::new("pure_ane", &id_suffix);
        group.bench_function(id, |bencher| {
            bencher.iter(|| {
                // Q, K, V projections
                ruvllm::kernels::ane_ops::matmul_ane(
                    &hidden,
                    &w_q,
                    &mut q,
                    batch * seq_len,
                    hidden_dim,
                    hidden_dim,
                );
                ruvllm::kernels::ane_ops::matmul_ane(
                    &hidden,
                    &w_k,
                    &mut k,
                    batch * seq_len,
                    hidden_dim,
                    hidden_dim,
                );
                ruvllm::kernels::ane_ops::matmul_ane(
                    &hidden,
                    &w_v,
                    &mut v,
                    batch * seq_len,
                    hidden_dim,
                    hidden_dim,
                );
                // O projection
                ruvllm::kernels::ane_ops::matmul_ane(
                    &v,
                    &w_o,
                    &mut attn_output,
                    batch * seq_len,
                    hidden_dim,
                    hidden_dim,
                );
                // MLP: up projection
                ruvllm::kernels::ane_ops::matmul_ane(
                    &attn_output,
                    &w_up,
                    &mut intermediate,
                    batch * seq_len,
                    hidden_dim,
                    intermediate_dim,
                );
                // MLP: SiLU activation (ANE)
                ruvllm::kernels::ane_ops::silu_ane(
                    black_box(&mut intermediate),
                    batch * seq_len,
                    intermediate_dim,
                );
                // MLP: down projection
                ruvllm::kernels::ane_ops::matmul_ane(
                    &intermediate,
                    &w_down,
                    &mut mlp_output,
                    batch * seq_len,
                    intermediate_dim,
                    hidden_dim,
                );
            })
        });
        // Hybrid path: ANE for MLP activations, auto-dispatch for matmul
        let id = BenchmarkId::new("hybrid", &id_suffix);
        group.bench_function(id, |bencher| {
            bencher.iter(|| {
                // Q, K, V projections (auto-dispatch based on size)
                ruvllm::kernels::ane_ops::matmul_auto(
                    &hidden,
                    &w_q,
                    &mut q,
                    batch * seq_len,
                    hidden_dim,
                    hidden_dim,
                );
                ruvllm::kernels::ane_ops::matmul_auto(
                    &hidden,
                    &w_k,
                    &mut k,
                    batch * seq_len,
                    hidden_dim,
                    hidden_dim,
                );
                ruvllm::kernels::ane_ops::matmul_auto(
                    &hidden,
                    &w_v,
                    &mut v,
                    batch * seq_len,
                    hidden_dim,
                    hidden_dim,
                );
                // O projection (auto-dispatch)
                ruvllm::kernels::ane_ops::matmul_auto(
                    &v,
                    &w_o,
                    &mut attn_output,
                    batch * seq_len,
                    hidden_dim,
                    hidden_dim,
                );
                // MLP: up projection (auto-dispatch)
                ruvllm::kernels::ane_ops::matmul_auto(
                    &attn_output,
                    &w_up,
                    &mut intermediate,
                    batch * seq_len,
                    hidden_dim,
                    intermediate_dim,
                );
                // MLP: SiLU activation (auto-dispatch - typically ANE)
                ruvllm::kernels::ane_ops::silu_auto(
                    black_box(&mut intermediate),
                    batch * seq_len,
                    intermediate_dim,
                );
                // MLP: down projection (auto-dispatch)
                ruvllm::kernels::ane_ops::matmul_auto(
                    &intermediate,
                    &w_down,
                    &mut mlp_output,
                    batch * seq_len,
                    intermediate_dim,
                    hidden_dim,
                );
            })
        });
    }
    group.finish();
}
// ============================================================================
// Activation Crossover Benchmark
// ============================================================================
/// Benchmark activation functions to find ANE vs NEON crossover
/// Benchmark activation functions to find ANE vs NEON crossover
///
/// Runs SiLU via three dispatchers (raw NEON, raw ANE, auto) over a sweep
/// of sizes; comparing the three per size shows where the auto-dispatch
/// threshold should sit. In-place kernels; buffer restored from `x_orig`
/// inside each timed closure.
fn bench_activation_crossover(c: &mut Criterion) {
    let mut group = c.benchmark_group("activation_crossover");
    group.sample_size(50);
    // Test various sizes to find where ANE beats NEON
    let sizes = [
        (1, 128),    // Tiny
        (1, 512),    // Small
        (1, 2048),   // Medium
        (1, 4096),   // Llama hidden
        (1, 11008),  // Llama intermediate
        (32, 4096),  // Batch
        (64, 4096),  // Larger batch
        (128, 4096), // Big batch
    ];
    for (batch_size, dim) in sizes {
        let size = batch_size * dim;
        let x_orig = random_tensor(size);
        let id_suffix = format!("{}x{}", batch_size, dim);
        group.throughput(Throughput::Elements(size as u64));
        // NEON SiLU
        #[cfg(target_arch = "aarch64")]
        {
            let mut x = x_orig.clone();
            let id = BenchmarkId::new("silu_neon", &id_suffix);
            group.bench_function(id, |bencher| {
                bencher.iter(|| {
                    x.copy_from_slice(&x_orig);
                    ruvllm::kernels::activations::batch_silu(black_box(&mut x), dim);
                })
            });
        }
        // ANE SiLU
        #[cfg(all(target_os = "macos", feature = "coreml"))]
        {
            let mut x = x_orig.clone();
            let id = BenchmarkId::new("silu_ane", &id_suffix);
            group.bench_function(id, |bencher| {
                bencher.iter(|| {
                    x.copy_from_slice(&x_orig);
                    ruvllm::kernels::ane_ops::silu_ane(black_box(&mut x), batch_size, dim);
                })
            });
        }
        // Auto-dispatch SiLU
        #[cfg(all(target_os = "macos", feature = "coreml"))]
        {
            let mut x = x_orig.clone();
            let id = BenchmarkId::new("silu_auto", &id_suffix);
            group.bench_function(id, |bencher| {
                bencher.iter(|| {
                    x.copy_from_slice(&x_orig);
                    ruvllm::kernels::ane_ops::silu_auto(black_box(&mut x), batch_size, dim);
                })
            });
        }
    }
    group.finish();
}
// ============================================================================
// Criterion Groups
// ============================================================================
// Full benchmark group for macOS with both features
// Full benchmark group for macOS with both features.
// `bench_auto_dispatch` and `bench_hybrid_pipeline` only appear here because
// they require the coreml-gated `ane_ops` module.
#[cfg(all(target_os = "macos", feature = "coreml"))]
criterion_group!(
    benches,
    bench_gemm_comparison,
    bench_batched_gemm_comparison,
    bench_gelu_comparison,
    bench_silu_comparison,
    bench_softmax_comparison,
    bench_layer_norm_comparison,
    bench_rms_norm_comparison,
    bench_auto_dispatch,
    bench_mlp_block,
    bench_crossover_detection,
    bench_hybrid_pipeline,
    bench_activation_crossover,
);
// Reduced benchmark group for non-coreml builds (NEON/Accelerate paths only;
// inside each bench the coreml-gated registrations compile away).
#[cfg(not(all(target_os = "macos", feature = "coreml")))]
criterion_group!(
    benches,
    bench_gemm_comparison,
    bench_batched_gemm_comparison,
    bench_gelu_comparison,
    bench_silu_comparison,
    bench_softmax_comparison,
    bench_layer_norm_comparison,
    bench_rms_norm_comparison,
    bench_mlp_block,
    bench_crossover_detection,
    bench_activation_crossover,
);
criterion_main!(benches);