#![allow(
    clippy::all,
    unused_imports,
    unused_variables,
    dead_code,
    unused_mut,
    unused_assignments,
    non_camel_case_types,
    clippy::approx_constant,
    unexpected_cfgs,
    unused_must_use,
    unused_parens
)]
//! ANE vs NEON Benchmark Suite
//!
//! Compares Apple Neural Engine (via BNNS) operations against
//! hand-optimized NEON implementations.
//!
//! ## Running Benchmarks
//!
//! ANE benchmarks (requires macOS with the `coreml` feature):
//! ```bash
//! cargo bench -p ruvllm --features coreml --bench ane_bench
//! ```
//!
//! Compare ANE vs Accelerate:
//! ```bash
//! cargo bench -p ruvllm --features coreml,accelerate --bench ane_bench
//! ```
//!
//! ## Performance Targets (M4 Pro)
//!
//! | Operation | Size        | ANE Target | NEON Baseline | Expected Speedup |
//! |-----------|-------------|------------|---------------|------------------|
//! | GEMM      | 1x4096x4096 | <500us     | <800us        | 1.5-2x           |
//! | GELU      | 64x4096     | <100us     | <150us        | 1.3-1.5x         |
//! | SiLU      | 64x4096     | <100us     | <150us        | 1.3-1.5x         |
//! | Softmax   | 64x4096     | <150us     | <200us        | 1.2-1.4x         |
//! | LayerNorm | 64x4096     | <200us     | <250us        | 1.2-1.3x         |
//!
//! ## Power Efficiency
//!
//! The ANE typically delivers 3-4x better performance per watt than the
//! GPU or CPU for supported operations. This benchmark suite measures
//! wall-clock time, not power consumption.
//!
//! To measure power consumption on macOS, use:
//! ```bash
//! sudo powermetrics --samplers tasks -i 100 | grep ruvllm
//! ```
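//!
//! To run a single comparison group, pass a Criterion name filter after `--`;
//! the group names (e.g. `gemm_ane_vs_neon`) are the ones defined in this file:
//! ```bash
//! cargo bench -p ruvllm --features coreml --bench ane_bench -- gemm_ane_vs_neon
//! ```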

use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
use rand::Rng;

// ============================================================================
// Helper Functions
// ============================================================================

/// Generate random tensor data
fn random_tensor(size: usize) -> Vec<f32> {
    let mut rng = rand::thread_rng();
    (0..size).map(|_| rng.gen_range(-1.0..1.0)).collect()
}

/// Generate random positive tensor (for softmax stability testing)
fn random_positive_tensor(size: usize) -> Vec<f32> {
    let mut rng = rand::thread_rng();
    (0..size).map(|_| rng.gen_range(0.0..10.0)).collect()
}
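
/// Seeded variant (an addition, not part of the original suite): useful when
/// bit-identical inputs are wanted across benchmark runs. Uses rand's `StdRng`.
#[allow(dead_code)]
fn random_tensor_seeded(size: usize, seed: u64) -> Vec<f32> {
    use rand::{rngs::StdRng, SeedableRng};
    let mut rng = StdRng::seed_from_u64(seed);
    (0..size).map(|_| rng.gen_range(-1.0..1.0)).collect()
}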

// ============================================================================
// Matrix Multiplication Benchmarks
// ============================================================================

/// Compare GEMM implementations: ANE vs Accelerate vs NEON
fn bench_gemm_comparison(c: &mut Criterion) {
    let mut group = c.benchmark_group("gemm_ane_vs_neon");
    group.sample_size(30);

    // Test various matrix sizes relevant to LLM inference.
    // Format: (m, k, n) - m=batch, k=input_dim, n=output_dim
    //
    // Size categories:
    // - Small (128x128, 256x256): ANE should dominate (~30-50% faster)
    // - Medium (512x512, 1024x1024): transition zone, slight ANE edge
    // - Large (2048x2048, 4096x4096): GPU crossover zone
    // - Very large (8192x8192): GPU clear winner
    let sizes = [
        // Small matrices - ANE advantage zone
        (1, 128, 128), // Tiny matmul - ANE wins
        (1, 256, 256), // Small matmul - ANE wins
        (1, 512, 512), // Medium-small - ANE edge
        // Medium matrices - transition zone
        (1, 1024, 1024), // ANE/GPU crossover starts
        (1, 2048, 2048), // Crossover zone
        // Large matrices - GPU advantage
        (1, 4096, 4096),  // Single token, typical projection - GPU starts winning
        (1, 4096, 11008), // Llama MLP up-projection
        (1, 11008, 4096), // Llama MLP down-projection
        // Batch inference - ANE optimal for small batches
        (8, 4096, 4096),   // Small batch
        (32, 4096, 4096),  // Medium batch
        (64, 4096, 4096),  // Optimal ANE batch size
        (128, 4096, 4096), // Beyond ANE optimal - GPU wins
    ];

    for (m, k, n) in sizes {
        let a = random_tensor(m * k);
        let b = random_tensor(k * n);
        let mut c_out = vec![0.0f32; m * n];

        let flops = 2 * m * k * n;
        let id_suffix = format!("{}x{}x{}", m, k, n);

        group.throughput(Throughput::Elements(flops as u64));

        // NEON baseline (always available on aarch64)
        #[cfg(target_arch = "aarch64")]
        {
            let id = BenchmarkId::new("neon", &id_suffix);
            group.bench_function(id, |bencher| {
                bencher.iter(|| {
                    // Use the local GEMM implementation to avoid module dependency issues
                    gemm_neon_local(black_box(&a), black_box(&b), black_box(&mut c_out), m, k, n);
                })
            });
        }

        // Accelerate (uses the AMX coprocessor)
        #[cfg(all(target_os = "macos", feature = "accelerate"))]
        {
            let id = BenchmarkId::new("accelerate", &id_suffix);
            group.bench_function(id, |bencher| {
                bencher.iter(|| {
                    ruvllm::kernels::accelerate::gemm_accelerate(
                        black_box(&a),
                        black_box(&b),
                        black_box(&mut c_out),
                        m,
                        k,
                        n,
                    );
                })
            });
        }

        // ANE via BNNS/Accelerate
        #[cfg(all(target_os = "macos", feature = "coreml"))]
        {
            let id = BenchmarkId::new("ane", &id_suffix);
            group.bench_function(id, |bencher| {
                bencher.iter(|| {
                    ruvllm::kernels::ane_ops::matmul_ane(
                        black_box(&a),
                        black_box(&b),
                        black_box(&mut c_out),
                        m,
                        k,
                        n,
                    );
                })
            });
        }
    }

    group.finish();
}

/// Benchmark batched matrix multiplication
fn bench_batched_gemm_comparison(c: &mut Criterion) {
    let mut group = c.benchmark_group("batched_gemm_ane_vs_neon");
    group.sample_size(30);

    // Typical attention shapes: batches of Q*K^T or attention*V
    let configs = [
        (8, 128, 128, 128),  // 8 heads, seq=128
        (32, 128, 128, 128), // 32 heads, seq=128
        (32, 256, 128, 256), // 32 heads, seq=256, head_dim=128
        (8, 512, 128, 512),  // 8 heads, seq=512
    ];

    for (batch_size, m, k, n) in configs {
        let a = random_tensor(batch_size * m * k);
        let b = random_tensor(batch_size * k * n);
        let mut c_out = vec![0.0f32; batch_size * m * n];

        let flops = 2 * batch_size * m * k * n;
        let id_suffix = format!("batch{}_{}x{}x{}", batch_size, m, k, n);

        group.throughput(Throughput::Elements(flops as u64));

        // NEON batched
        #[cfg(target_arch = "aarch64")]
        {
            let id = BenchmarkId::new("neon", &id_suffix);
            group.bench_function(id, |bencher| {
                bencher.iter(|| {
                    for batch in 0..batch_size {
                        let a_off = batch * m * k;
                        let b_off = batch * k * n;
                        let c_off = batch * m * n;
                        gemm_neon_local(
                            black_box(&a[a_off..a_off + m * k]),
                            black_box(&b[b_off..b_off + k * n]),
                            black_box(&mut c_out[c_off..c_off + m * n]),
                            m,
                            k,
                            n,
                        );
                    }
                })
            });
        }

        // ANE batched
        #[cfg(all(target_os = "macos", feature = "coreml"))]
        {
            let id = BenchmarkId::new("ane", &id_suffix);
            group.bench_function(id, |bencher| {
                bencher.iter(|| {
                    ruvllm::kernels::ane_ops::batched_matmul_ane(
                        black_box(&a),
                        black_box(&b),
                        black_box(&mut c_out),
                        batch_size,
                        m,
                        k,
                        n,
                    );
                })
            });
        }
    }

    group.finish();
}

// ============================================================================
// Activation Function Benchmarks
// ============================================================================

/// Compare GELU implementations
fn bench_gelu_comparison(c: &mut Criterion) {
    let mut group = c.benchmark_group("gelu_ane_vs_neon");
    group.sample_size(50);

    // Various batch and dimension sizes
    let configs = [
        (1, 4096),
        (8, 4096),
        (32, 4096),
        (64, 4096),
        (1, 11008), // Llama MLP intermediate
        (32, 11008),
    ];

    for (batch_size, dim) in configs {
        let size = batch_size * dim;
        let x_orig = random_tensor(size);

        let ops = size; // One GELU per element
        let id_suffix = format!("{}x{}", batch_size, dim);

        group.throughput(Throughput::Elements(ops as u64));

        // NEON
        #[cfg(target_arch = "aarch64")]
        {
            let mut x = x_orig.clone();
            let id = BenchmarkId::new("neon", &id_suffix);
            group.bench_function(id, |bencher| {
                bencher.iter(|| {
                    x.copy_from_slice(&x_orig);
                    ruvllm::kernels::activations::batch_gelu(black_box(&mut x), dim);
                })
            });
        }

        // ANE
        #[cfg(all(target_os = "macos", feature = "coreml"))]
        {
            let mut x = x_orig.clone();
            let id = BenchmarkId::new("ane", &id_suffix);
            group.bench_function(id, |bencher| {
                bencher.iter(|| {
                    x.copy_from_slice(&x_orig);
                    ruvllm::kernels::ane_ops::gelu_ane(black_box(&mut x), batch_size, dim);
                })
            });
        }
    }

    group.finish();
}

/// Compare SiLU implementations
fn bench_silu_comparison(c: &mut Criterion) {
    let mut group = c.benchmark_group("silu_ane_vs_neon");
    group.sample_size(50);

    let configs = [
        (1, 4096),
        (8, 4096),
        (32, 4096),
        (64, 4096),
        (1, 11008),
        (32, 11008),
    ];

    for (batch_size, dim) in configs {
        let size = batch_size * dim;
        let x_orig = random_tensor(size);

        let ops = size;
        let id_suffix = format!("{}x{}", batch_size, dim);

        group.throughput(Throughput::Elements(ops as u64));

        // NEON
        #[cfg(target_arch = "aarch64")]
        {
            let mut x = x_orig.clone();
            let id = BenchmarkId::new("neon", &id_suffix);
            group.bench_function(id, |bencher| {
                bencher.iter(|| {
                    x.copy_from_slice(&x_orig);
                    ruvllm::kernels::activations::batch_silu(black_box(&mut x), dim);
                })
            });
        }

        // ANE
        #[cfg(all(target_os = "macos", feature = "coreml"))]
        {
            let mut x = x_orig.clone();
            let id = BenchmarkId::new("ane", &id_suffix);
            group.bench_function(id, |bencher| {
                bencher.iter(|| {
                    x.copy_from_slice(&x_orig);
                    ruvllm::kernels::ane_ops::silu_ane(black_box(&mut x), batch_size, dim);
                })
            });
        }
    }

    group.finish();
}

/// Compare Softmax implementations
fn bench_softmax_comparison(c: &mut Criterion) {
    let mut group = c.benchmark_group("softmax_ane_vs_neon");
    group.sample_size(50);

    // Softmax is typically applied to attention scores
    let configs = [
        (1, 128),   // Single head, short seq
        (32, 128),  // 32 heads, short seq
        (32, 512),  // 32 heads, medium seq
        (32, 2048), // 32 heads, long seq
        (1, 4096),  // Single head, very long seq
    ];

    for (batch_size, dim) in configs {
        let size = batch_size * dim;
        let x_orig = random_positive_tensor(size);

        let ops = size;
        let id_suffix = format!("{}x{}", batch_size, dim);

        group.throughput(Throughput::Elements(ops as u64));

        // NEON
        #[cfg(target_arch = "aarch64")]
        {
            let mut x = x_orig.clone();
            let id = BenchmarkId::new("neon", &id_suffix);
            group.bench_function(id, |bencher| {
                bencher.iter(|| {
                    x.copy_from_slice(&x_orig);
                    ruvllm::kernels::activations::batch_softmax(black_box(&mut x), dim);
                })
            });
        }

        // ANE
        #[cfg(all(target_os = "macos", feature = "coreml"))]
        {
            let mut x = x_orig.clone();
            let id = BenchmarkId::new("ane", &id_suffix);
            group.bench_function(id, |bencher| {
                bencher.iter(|| {
                    x.copy_from_slice(&x_orig);
                    ruvllm::kernels::ane_ops::softmax_ane(black_box(&mut x), batch_size, dim);
                })
            });
        }
    }

    group.finish();
}

// ============================================================================
// Normalization Benchmarks
// ============================================================================

/// Compare LayerNorm implementations
fn bench_layer_norm_comparison(c: &mut Criterion) {
    let mut group = c.benchmark_group("layernorm_ane_vs_neon");
    group.sample_size(50);

    let configs = [(1, 4096), (8, 4096), (32, 4096), (64, 4096), (128, 4096)];

    for (batch_size, dim) in configs {
        let size = batch_size * dim;
        let x_orig = random_tensor(size);
        let weight = vec![1.0f32; dim];
        let bias = vec![0.0f32; dim];

        let ops = size * 4; // Approximate: mean, var, normalize, scale
        let id_suffix = format!("{}x{}", batch_size, dim);

        group.throughput(Throughput::Elements(ops as u64));

        // NEON
        #[cfg(target_arch = "aarch64")]
        {
            let mut x = x_orig.clone();
            let id = BenchmarkId::new("neon", &id_suffix);
            group.bench_function(id, |bencher| {
                bencher.iter(|| {
                    x.copy_from_slice(&x_orig);
                    ruvllm::kernels::norm::batched_layer_norm_neon(
                        black_box(&mut x),
                        black_box(&weight),
                        black_box(&bias),
                        batch_size,
                        dim,
                        1e-6,
                    );
                })
            });
        }

        // ANE
        #[cfg(all(target_os = "macos", feature = "coreml"))]
        {
            let mut x = x_orig.clone();
            let id = BenchmarkId::new("ane", &id_suffix);
            group.bench_function(id, |bencher| {
                bencher.iter(|| {
                    x.copy_from_slice(&x_orig);
                    ruvllm::kernels::ane_ops::layer_norm_ane(
                        black_box(&mut x),
                        black_box(&weight),
                        black_box(&bias),
                        batch_size,
                        dim,
                        1e-6,
                    );
                })
            });
        }
    }

    group.finish();
}

/// Compare RMSNorm implementations
fn bench_rms_norm_comparison(c: &mut Criterion) {
    let mut group = c.benchmark_group("rmsnorm_ane_vs_neon");
    group.sample_size(50);

    let configs = [(1, 4096), (8, 4096), (32, 4096), (64, 4096), (128, 4096)];

    for (batch_size, dim) in configs {
        let size = batch_size * dim;
        let x_orig = random_tensor(size);
        let weight = vec![1.0f32; dim];

        let ops = size * 3; // Approximate: sum_sq, normalize, scale
        let id_suffix = format!("{}x{}", batch_size, dim);

        group.throughput(Throughput::Elements(ops as u64));

        // NEON
        #[cfg(target_arch = "aarch64")]
        {
            let mut x = x_orig.clone();
            let id = BenchmarkId::new("neon", &id_suffix);
            group.bench_function(id, |bencher| {
                bencher.iter(|| {
                    x.copy_from_slice(&x_orig);
                    ruvllm::kernels::norm::batched_rms_norm_neon(
                        black_box(&mut x),
                        black_box(&weight),
                        batch_size,
                        dim,
                        1e-6,
                    );
                })
            });
        }

        // ANE
        #[cfg(all(target_os = "macos", feature = "coreml"))]
        {
            let mut x = x_orig.clone();
            let id = BenchmarkId::new("ane", &id_suffix);
            group.bench_function(id, |bencher| {
                bencher.iter(|| {
                    x.copy_from_slice(&x_orig);
                    ruvllm::kernels::ane_ops::rms_norm_ane(
                        black_box(&mut x),
                        black_box(&weight),
                        batch_size,
                        dim,
                        1e-6,
                    );
                })
            });
        }
    }

    group.finish();
}

// ============================================================================
// Auto-Dispatch Benchmarks
// ============================================================================

/// Test the auto-dispatch functions that select the best backend
fn bench_auto_dispatch(c: &mut Criterion) {
    let mut group = c.benchmark_group("auto_dispatch");
    group.sample_size(50);

    let batch_size = 32;
    let dim = 4096;
    let size = batch_size * dim;

    let x_orig = random_tensor(size);
    let weight = vec![1.0f32; dim];
    let bias = vec![0.0f32; dim];

    // Auto-dispatch GELU
    {
        let mut x = x_orig.clone();
        group.bench_function("gelu_auto", |bencher| {
            bencher.iter(|| {
                x.copy_from_slice(&x_orig);
                #[cfg(all(target_os = "macos", feature = "coreml"))]
                ruvllm::kernels::ane_ops::gelu_auto(black_box(&mut x), batch_size, dim);
                #[cfg(not(all(target_os = "macos", feature = "coreml")))]
                ruvllm::kernels::activations::batch_gelu(black_box(&mut x), dim);
            })
        });
    }

    // Auto-dispatch SiLU
    {
        let mut x = x_orig.clone();
        group.bench_function("silu_auto", |bencher| {
            bencher.iter(|| {
                x.copy_from_slice(&x_orig);
                #[cfg(all(target_os = "macos", feature = "coreml"))]
                ruvllm::kernels::ane_ops::silu_auto(black_box(&mut x), batch_size, dim);
                #[cfg(not(all(target_os = "macos", feature = "coreml")))]
                ruvllm::kernels::activations::batch_silu(black_box(&mut x), dim);
            })
        });
    }

    // Auto-dispatch LayerNorm
    {
        let mut x = x_orig.clone();
        group.bench_function("layernorm_auto", |bencher| {
            bencher.iter(|| {
                x.copy_from_slice(&x_orig);
                #[cfg(all(target_os = "macos", feature = "coreml"))]
                ruvllm::kernels::ane_ops::layer_norm_auto(
                    black_box(&mut x),
                    black_box(&weight),
                    black_box(&bias),
                    batch_size,
                    dim,
                    1e-6,
                );
                #[cfg(not(all(target_os = "macos", feature = "coreml")))]
                ruvllm::kernels::norm::batched_layer_norm_neon(
                    black_box(&mut x),
                    black_box(&weight),
                    black_box(&bias),
                    batch_size,
                    dim,
                    1e-6,
                );
            })
        });
    }

    group.finish();
}

// ============================================================================
// LLM Workload Benchmarks (Realistic Scenarios)
// ============================================================================

/// Benchmark typical MLP block operations
fn bench_mlp_block(c: &mut Criterion) {
    let mut group = c.benchmark_group("mlp_block");
    group.sample_size(20);

    // Llama2-7B MLP: hidden_dim=4096, intermediate=11008
    let batch_size = 1;
    let hidden_dim = 4096;
    let intermediate_dim = 11008;

    // Up-projection weights
    let w_up = random_tensor(hidden_dim * intermediate_dim);
    // Down-projection weights
    let w_down = random_tensor(intermediate_dim * hidden_dim);

    let input = random_tensor(batch_size * hidden_dim);
    let mut intermediate = vec![0.0f32; batch_size * intermediate_dim];
    let mut output = vec![0.0f32; batch_size * hidden_dim];

    let total_flops = 2 * batch_size * hidden_dim * intermediate_dim // Up
        + batch_size * intermediate_dim // Activation
        + 2 * batch_size * intermediate_dim * hidden_dim; // Down

    group.throughput(Throughput::Elements(total_flops as u64));

    // NEON path
    #[cfg(target_arch = "aarch64")]
    {
        group.bench_function("neon", |bencher| {
            bencher.iter(|| {
                // Up projection
                gemm_neon_local(
                    black_box(&input),
                    black_box(&w_up),
                    black_box(&mut intermediate),
                    batch_size,
                    hidden_dim,
                    intermediate_dim,
                );
                // SiLU activation
                ruvllm::kernels::activations::batch_silu(
                    black_box(&mut intermediate),
                    intermediate_dim,
                );
                // Down projection
                gemm_neon_local(
                    black_box(&intermediate),
                    black_box(&w_down),
                    black_box(&mut output),
                    batch_size,
                    intermediate_dim,
                    hidden_dim,
                );
            })
        });
    }

    // ANE path
    #[cfg(all(target_os = "macos", feature = "coreml"))]
    {
        group.bench_function("ane", |bencher| {
            bencher.iter(|| {
                // Up projection
                ruvllm::kernels::ane_ops::matmul_ane(
                    black_box(&input),
                    black_box(&w_up),
                    black_box(&mut intermediate),
                    batch_size,
                    hidden_dim,
                    intermediate_dim,
                );
                // SiLU activation
                ruvllm::kernels::ane_ops::silu_ane(
                    black_box(&mut intermediate),
                    batch_size,
                    intermediate_dim,
                );
                // Down projection
                ruvllm::kernels::ane_ops::matmul_ane(
                    black_box(&intermediate),
                    black_box(&w_down),
                    black_box(&mut output),
                    batch_size,
                    intermediate_dim,
                    hidden_dim,
                );
            })
        });
    }

    group.finish();
}

// ============================================================================
// Local NEON GEMM Implementation (to avoid module dependency issues)
// ============================================================================

#[cfg(target_arch = "aarch64")]
fn gemm_neon_local(a: &[f32], b: &[f32], c: &mut [f32], m: usize, k: usize, n: usize) {
    c.fill(0.0);

    unsafe {
        use std::arch::aarch64::*;

        let a_ptr = a.as_ptr();
        let b_ptr = b.as_ptr();
        let c_ptr = c.as_mut_ptr();

        for i in 0..m {
            let mut j = 0usize;
            // Main loop: compute 4 output columns at a time, broadcasting one
            // element of A against a 4-lane row slice of B with fused multiply-add.
            while j + 4 <= n {
                let mut acc = vdupq_n_f32(0.0);

                for kk in 0..k {
                    let a_val = vdupq_n_f32(*a_ptr.add(i * k + kk));
                    let b_v = vld1q_f32(b_ptr.add(kk * n + j));
                    acc = vfmaq_f32(acc, a_val, b_v);
                }

                vst1q_f32(c_ptr.add(i * n + j), acc);
                j += 4;
            }

            // Handle remaining columns (scalar tail when n is not a multiple of 4)
            while j < n {
                let mut sum = 0.0f32;
                for kk in 0..k {
                    sum += *a_ptr.add(i * k + kk) * *b_ptr.add(kk * n + j);
                }
                *c_ptr.add(i * n + j) = sum;
                j += 1;
            }
        }
    }
}

#[cfg(not(target_arch = "aarch64"))]
fn gemm_neon_local(a: &[f32], b: &[f32], c: &mut [f32], m: usize, k: usize, n: usize) {
    c.fill(0.0);
    for i in 0..m {
        for j in 0..n {
            let mut sum = 0.0f32;
            for kk in 0..k {
                sum += a[i * k + kk] * b[kk * n + j];
            }
            c[i * n + j] = sum;
        }
    }
}
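
// A small sanity check (an addition, not part of the original suite):
// compares `gemm_neon_local` against a naive triple-loop reference on a
// shape whose column count is not a multiple of 4, so both the SIMD main
// loop and the scalar tail path are exercised.
#[cfg(test)]
mod gemm_sanity {
    use super::*;

    #[test]
    fn neon_matches_naive_reference() {
        let (m, k, n) = (3, 5, 7); // n = 7 forces the scalar tail path
        let a = random_tensor(m * k);
        let b = random_tensor(k * n);

        let mut c_fast = vec![0.0f32; m * n];
        gemm_neon_local(&a, &b, &mut c_fast, m, k, n);

        // Naive reference: C[i][j] = sum over kk of A[i][kk] * B[kk][j]
        let mut c_ref = vec![0.0f32; m * n];
        for i in 0..m {
            for j in 0..n {
                for kk in 0..k {
                    c_ref[i * n + j] += a[i * k + kk] * b[kk * n + j];
                }
            }
        }

        for (f, r) in c_fast.iter().zip(&c_ref) {
            assert!((f - r).abs() < 1e-4, "got {f}, expected {r}");
        }
    }
}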

// ============================================================================
// Crossover Point Detection Benchmark
// ============================================================================

/// Benchmark to identify the crossover point where the GPU beats the ANE
///
/// This benchmark sweeps matrix sizes in increments to find where:
/// 1. ANE is clearly faster (small matrices)
/// 2. Performance is similar (crossover zone)
/// 3. GPU is clearly faster (large matrices)
///
/// Expected M4 Pro results:
/// - ANE wins: dim < 1024
/// - Crossover: 1024 <= dim <= 2048
/// - GPU wins: dim > 2048
fn bench_crossover_detection(c: &mut Criterion) {
    let mut group = c.benchmark_group("crossover_detection");
    group.sample_size(20);

    // Sweep dimensions (powers of two plus intermediate sizes) to bracket the crossover
    let dimensions = [64, 128, 256, 512, 768, 1024, 1536, 2048, 3072, 4096];

    for dim in dimensions {
        let a = random_tensor(dim * dim);
        let b = random_tensor(dim * dim);
        let mut c_out = vec![0.0f32; dim * dim];

        let flops = 2 * dim * dim * dim;
        let id_suffix = format!("{}x{}", dim, dim);

        group.throughput(Throughput::Elements(flops as u64));

        // NEON baseline
        #[cfg(target_arch = "aarch64")]
        {
            let id = BenchmarkId::new("neon", &id_suffix);
            group.bench_function(id, |bencher| {
                bencher.iter(|| {
                    gemm_neon_local(
                        black_box(&a),
                        black_box(&b),
                        black_box(&mut c_out),
                        dim,
                        dim,
                        dim,
                    );
                })
            });
        }

        // ANE via BNNS
        #[cfg(all(target_os = "macos", feature = "coreml"))]
        {
            let id = BenchmarkId::new("ane", &id_suffix);
            group.bench_function(id, |bencher| {
                bencher.iter(|| {
                    ruvllm::kernels::ane_ops::matmul_ane(
                        black_box(&a),
                        black_box(&b),
                        black_box(&mut c_out),
                        dim,
                        dim,
                        dim,
                    );
                })
            });
        }

        // Accelerate (AMX)
        #[cfg(all(target_os = "macos", feature = "accelerate"))]
        {
            let id = BenchmarkId::new("accelerate", &id_suffix);
            group.bench_function(id, |bencher| {
                bencher.iter(|| {
                    ruvllm::kernels::accelerate::gemm_accelerate(
                        black_box(&a),
                        black_box(&b),
                        black_box(&mut c_out),
                        dim,
                        dim,
                        dim,
                    );
                })
            });
        }
    }

    group.finish();
}

// ============================================================================
// Hybrid Pipeline Benchmarks (ANE for MLP, GPU for Attention)
// ============================================================================

/// Benchmark a hybrid ANE+GPU pipeline for transformer inference
///
/// Real transformer layers have different compute patterns:
/// - Attention: memory-bound, GPU-friendly (high parallelism)
/// - MLP: compute-bound, ANE-friendly (batch operations)
///
/// This benchmark simulates a hybrid pipeline where:
/// 1. ANE handles MLP layers (activations, small projections)
/// 2. GPU/NEON handles attention (Q*K^T, softmax*V)
///
/// Note: the attention score computation itself (Q*K^T, softmax, attn*V) is
/// elided in this simulation; only the projections and the MLP are timed.
#[cfg(all(target_os = "macos", feature = "coreml"))]
fn bench_hybrid_pipeline(c: &mut Criterion) {
    let mut group = c.benchmark_group("hybrid_pipeline");
    group.sample_size(15);

    // Transformer configuration (Llama-7B-like)
    let configs = [
        // (batch, seq_len, hidden, heads, head_dim, intermediate)
        (1, 128, 4096, 32, 128, 11008),  // Short context
        (1, 512, 4096, 32, 128, 11008),  // Medium context
        (1, 2048, 4096, 32, 128, 11008), // Long context
    ];

    for (batch, seq_len, hidden_dim, num_heads, head_dim, intermediate_dim) in configs {
        let id_suffix = format!("batch{}_seq{}", batch, seq_len);

        // Pre-allocate tensors
        let hidden = random_tensor(batch * seq_len * hidden_dim);
        let w_q = random_tensor(hidden_dim * hidden_dim);
        let w_k = random_tensor(hidden_dim * hidden_dim);
        let w_v = random_tensor(hidden_dim * hidden_dim);
        let w_o = random_tensor(hidden_dim * hidden_dim);
        let w_up = random_tensor(hidden_dim * intermediate_dim);
        let w_down = random_tensor(intermediate_dim * hidden_dim);

        let mut q = vec![0.0f32; batch * seq_len * hidden_dim];
        let mut k = vec![0.0f32; batch * seq_len * hidden_dim];
        let mut v = vec![0.0f32; batch * seq_len * hidden_dim];
        let mut attn_output = vec![0.0f32; batch * seq_len * hidden_dim];
        let mut intermediate = vec![0.0f32; batch * seq_len * intermediate_dim];
        let mut mlp_output = vec![0.0f32; batch * seq_len * hidden_dim];

        let total_ops =
            // Q, K, V projections
            3 * 2 * batch * seq_len * hidden_dim * hidden_dim +
            // Attention (Q*K^T + softmax + attn*V); counted although elided below
            2 * batch * num_heads * seq_len * seq_len * head_dim * 2 +
            // O projection
            2 * batch * seq_len * hidden_dim * hidden_dim +
            // MLP up + down
            2 * batch * seq_len * hidden_dim * intermediate_dim * 2 +
            // Activations
            batch * seq_len * intermediate_dim;

        group.throughput(Throughput::Elements(total_ops as u64));

        // Pure NEON path
        #[cfg(target_arch = "aarch64")]
        {
            let id = BenchmarkId::new("pure_neon", &id_suffix);
            group.bench_function(id, |bencher| {
                bencher.iter(|| {
                    // Q, K, V projections
                    gemm_neon_local(
                        &hidden,
                        &w_q,
                        &mut q,
                        batch * seq_len,
                        hidden_dim,
                        hidden_dim,
                    );
                    gemm_neon_local(
                        &hidden,
                        &w_k,
                        &mut k,
                        batch * seq_len,
                        hidden_dim,
                        hidden_dim,
                    );
                    gemm_neon_local(
                        &hidden,
                        &w_v,
                        &mut v,
                        batch * seq_len,
                        hidden_dim,
                        hidden_dim,
                    );

                    // O projection (consumes V directly; attention scores elided)
                    gemm_neon_local(
                        &v,
                        &w_o,
                        &mut attn_output,
                        batch * seq_len,
                        hidden_dim,
                        hidden_dim,
                    );

                    // MLP: up projection
                    gemm_neon_local(
                        &attn_output,
                        &w_up,
                        &mut intermediate,
                        batch * seq_len,
                        hidden_dim,
                        intermediate_dim,
                    );

                    // MLP: SiLU activation (in-place)
                    ruvllm::kernels::activations::batch_silu(
                        black_box(&mut intermediate),
                        intermediate_dim,
                    );

                    // MLP: down projection
                    gemm_neon_local(
                        &intermediate,
                        &w_down,
                        &mut mlp_output,
                        batch * seq_len,
                        intermediate_dim,
                        hidden_dim,
                    );
                })
            });
        }

        // Pure ANE path
        let id = BenchmarkId::new("pure_ane", &id_suffix);
        group.bench_function(id, |bencher| {
            bencher.iter(|| {
                // Q, K, V projections
                ruvllm::kernels::ane_ops::matmul_ane(
                    &hidden,
                    &w_q,
                    &mut q,
                    batch * seq_len,
                    hidden_dim,
                    hidden_dim,
                );
                ruvllm::kernels::ane_ops::matmul_ane(
                    &hidden,
                    &w_k,
                    &mut k,
                    batch * seq_len,
                    hidden_dim,
                    hidden_dim,
                );
                ruvllm::kernels::ane_ops::matmul_ane(
                    &hidden,
                    &w_v,
                    &mut v,
                    batch * seq_len,
                    hidden_dim,
                    hidden_dim,
                );

                // O projection
                ruvllm::kernels::ane_ops::matmul_ane(
                    &v,
                    &w_o,
                    &mut attn_output,
                    batch * seq_len,
                    hidden_dim,
                    hidden_dim,
                );

                // MLP: up projection
                ruvllm::kernels::ane_ops::matmul_ane(
                    &attn_output,
                    &w_up,
                    &mut intermediate,
                    batch * seq_len,
                    hidden_dim,
                    intermediate_dim,
                );

                // MLP: SiLU activation (ANE)
                ruvllm::kernels::ane_ops::silu_ane(
                    black_box(&mut intermediate),
                    batch * seq_len,
                    intermediate_dim,
                );

                // MLP: down projection
                ruvllm::kernels::ane_ops::matmul_ane(
                    &intermediate,
                    &w_down,
                    &mut mlp_output,
                    batch * seq_len,
                    intermediate_dim,
                    hidden_dim,
                );
            })
        });

        // Hybrid path: ANE for MLP activations, auto-dispatch for matmul
        let id = BenchmarkId::new("hybrid", &id_suffix);
        group.bench_function(id, |bencher| {
            bencher.iter(|| {
                // Q, K, V projections (auto-dispatch based on size)
                ruvllm::kernels::ane_ops::matmul_auto(
                    &hidden,
                    &w_q,
                    &mut q,
                    batch * seq_len,
                    hidden_dim,
                    hidden_dim,
                );
                ruvllm::kernels::ane_ops::matmul_auto(
                    &hidden,
                    &w_k,
                    &mut k,
                    batch * seq_len,
                    hidden_dim,
                    hidden_dim,
                );
                ruvllm::kernels::ane_ops::matmul_auto(
                    &hidden,
                    &w_v,
                    &mut v,
                    batch * seq_len,
                    hidden_dim,
                    hidden_dim,
                );

                // O projection (auto-dispatch)
                ruvllm::kernels::ane_ops::matmul_auto(
                    &v,
                    &w_o,
                    &mut attn_output,
                    batch * seq_len,
                    hidden_dim,
                    hidden_dim,
                );

                // MLP: up projection (auto-dispatch)
                ruvllm::kernels::ane_ops::matmul_auto(
                    &attn_output,
                    &w_up,
                    &mut intermediate,
                    batch * seq_len,
                    hidden_dim,
                    intermediate_dim,
                );

                // MLP: SiLU activation (auto-dispatch - typically ANE)
                ruvllm::kernels::ane_ops::silu_auto(
                    black_box(&mut intermediate),
                    batch * seq_len,
                    intermediate_dim,
                );

                // MLP: down projection (auto-dispatch)
                ruvllm::kernels::ane_ops::matmul_auto(
                    &intermediate,
                    &w_down,
                    &mut mlp_output,
                    batch * seq_len,
                    intermediate_dim,
                    hidden_dim,
                );
            })
        });
    }

    group.finish();
}

// ============================================================================
// Activation Crossover Benchmark
// ============================================================================

/// Benchmark activation functions to find the ANE vs NEON crossover
fn bench_activation_crossover(c: &mut Criterion) {
    let mut group = c.benchmark_group("activation_crossover");
    group.sample_size(50);

    // Test various sizes to find where ANE beats NEON
    let sizes = [
        (1, 128),    // Tiny
        (1, 512),    // Small
        (1, 2048),   // Medium
        (1, 4096),   // Llama hidden
        (1, 11008),  // Llama intermediate
        (32, 4096),  // Batch
        (64, 4096),  // Larger batch
        (128, 4096), // Big batch
    ];

    for (batch_size, dim) in sizes {
        let size = batch_size * dim;
        let x_orig = random_tensor(size);

        let id_suffix = format!("{}x{}", batch_size, dim);
        group.throughput(Throughput::Elements(size as u64));

        // NEON SiLU
        #[cfg(target_arch = "aarch64")]
        {
            let mut x = x_orig.clone();
            let id = BenchmarkId::new("silu_neon", &id_suffix);
            group.bench_function(id, |bencher| {
                bencher.iter(|| {
                    x.copy_from_slice(&x_orig);
                    ruvllm::kernels::activations::batch_silu(black_box(&mut x), dim);
                })
            });
        }

        // ANE SiLU
        #[cfg(all(target_os = "macos", feature = "coreml"))]
        {
            let mut x = x_orig.clone();
            let id = BenchmarkId::new("silu_ane", &id_suffix);
            group.bench_function(id, |bencher| {
                bencher.iter(|| {
                    x.copy_from_slice(&x_orig);
                    ruvllm::kernels::ane_ops::silu_ane(black_box(&mut x), batch_size, dim);
                })
            });
        }

        // Auto-dispatch SiLU
        #[cfg(all(target_os = "macos", feature = "coreml"))]
        {
            let mut x = x_orig.clone();
            let id = BenchmarkId::new("silu_auto", &id_suffix);
            group.bench_function(id, |bencher| {
                bencher.iter(|| {
                    x.copy_from_slice(&x_orig);
                    ruvllm::kernels::ane_ops::silu_auto(black_box(&mut x), batch_size, dim);
                })
            });
        }
    }

    group.finish();
}

// ============================================================================
// Criterion Groups
// ============================================================================

// Full benchmark group for macOS builds with the coreml feature
#[cfg(all(target_os = "macos", feature = "coreml"))]
criterion_group!(
    benches,
    bench_gemm_comparison,
    bench_batched_gemm_comparison,
    bench_gelu_comparison,
    bench_silu_comparison,
    bench_softmax_comparison,
    bench_layer_norm_comparison,
    bench_rms_norm_comparison,
    bench_auto_dispatch,
    bench_mlp_block,
    bench_crossover_detection,
    bench_hybrid_pipeline,
    bench_activation_crossover,
);

// Reduced benchmark group for non-coreml builds
#[cfg(not(all(target_os = "macos", feature = "coreml")))]
criterion_group!(
    benches,
    bench_gemm_comparison,
    bench_batched_gemm_comparison,
    bench_gelu_comparison,
    bench_silu_comparison,
    bench_softmax_comparison,
    bench_layer_norm_comparison,
    bench_rms_norm_comparison,
    bench_mlp_block,
    bench_crossover_detection,
    bench_activation_crossover,
);

criterion_main!(benches);