Files
wifi-densepose/vendor/ruvector/crates/ruvllm/benches/metal_bench.rs

597 lines
18 KiB
Rust

// Blanket lint suppression for this benchmark target.
// NOTE(review): no per-lint justification is recorded here, and
// `clippy::approx_constant` is presumably redundant next to `clippy::all` —
// confirm before trimming the list.
#![allow(
clippy::all,
unused_imports,
unused_variables,
dead_code,
unused_mut,
unused_assignments,
non_camel_case_types,
clippy::approx_constant,
unexpected_cfgs,
unused_must_use,
unused_parens
)]
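//! Metal GPU acceleration benchmarks
//!
//! Benchmarks Metal compute shaders for LLM operations.
//! The Metal benchmarks are only compiled on macOS with the `metal-compute`
//! feature enabled; on every other configuration only the naive CPU GEMM
//! baseline is built and run.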
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
#[cfg(all(target_os = "macos", feature = "metal-compute"))]
use ruvllm::kernels::AttentionConfig;
#[cfg(all(target_os = "macos", feature = "metal-compute"))]
use ruvllm::metal::{MetalConfig, MetalContext};
/// Benchmarks `MetalContext::flash_attention` over several `(seq_len, kv_len)` shapes.
///
/// Every case uses 32 query heads, 8 KV heads and a head dimension of 128, with
/// the `causal` flag set. Query, key and value buffers are filled with the ramp
/// `i * 0.001`. If the Metal context cannot be created, the error is printed to
/// stderr and the benchmark is skipped.
#[cfg(all(target_os = "macos", feature = "metal-compute"))]
fn bench_flash_attention_metal(c: &mut Criterion) {
    let ctx = match MetalContext::new(MetalConfig::default()) {
        Err(err) => {
            eprintln!("Failed to create Metal context: {}", err);
            return;
        }
        Ok(context) => context,
    };

    // (query length, key/value length) pairs to measure.
    let shapes = [(1, 512), (1, 2048), (1, 4096), (4, 512), (4, 2048)];

    let mut group = c.benchmark_group("metal_flash_attention");
    for (seq_len, kv_len) in shapes {
        let config = AttentionConfig {
            num_heads: 32,
            num_kv_heads: 8,
            head_dim: 128,
            max_seq_len: seq_len,
            causal: true,
            scale: 0.0,
        };

        let q_elems = seq_len * config.num_heads * config.head_dim;
        let kv_elems = kv_len * config.num_kv_heads * config.head_dim;
        let query: Vec<f32> = (0..q_elems).map(|idx| (idx as f32) * 0.001).collect();
        let key: Vec<f32> = (0..kv_elems).map(|idx| (idx as f32) * 0.001).collect();
        let value: Vec<f32> = (0..kv_elems).map(|idx| (idx as f32) * 0.001).collect();

        let bench_id = BenchmarkId::new("metal", format!("seq{}_kv{}", seq_len, kv_len));
        let inputs = (&query, &key, &value, &config);
        group.bench_with_input(bench_id, &inputs, |bencher, &(q, k, v, cfg)| {
            bencher.iter(|| {
                ctx.flash_attention(black_box(q), black_box(k), black_box(v), black_box(cfg))
            })
        });
    }
    group.finish();
}
/// Benchmarks `MetalContext::gemm_f32` on square `size x size` operands for
/// sizes 128, 256, 512, 1024 and 2048.
///
/// Both operands are filled with the ramp `i * 0.001`. If the Metal context
/// cannot be created, the error is printed to stderr and the benchmark is skipped.
#[cfg(all(target_os = "macos", feature = "metal-compute"))]
fn bench_gemm_metal(c: &mut Criterion) {
    let ctx = match MetalContext::new(MetalConfig::default()) {
        Err(err) => {
            eprintln!("Failed to create Metal context: {}", err);
            return;
        }
        Ok(context) => context,
    };

    let mut group = c.benchmark_group("metal_gemm");
    for size in [128, 256, 512, 1024, 2048] {
        // Square problem: all three GEMM dimensions are equal.
        let (m, n, k) = (size, size, size);
        let lhs: Vec<f32> = (0..m * k).map(|idx| (idx as f32) * 0.001).collect();
        let rhs: Vec<f32> = (0..k * n).map(|idx| (idx as f32) * 0.001).collect();

        let bench_id = BenchmarkId::new("metal_f32", format!("{}x{}", size, size));
        group.bench_with_input(
            bench_id,
            &(&lhs, &rhs, m, n, k),
            |bencher, &(lhs, rhs, m, n, k)| {
                bencher.iter(|| ctx.gemm_f32(black_box(lhs), black_box(rhs), m, n, k))
            },
        );
    }
    group.finish();
}
/// Benchmarks `MetalContext::rms_norm` for hidden sizes 1024, 2048, 4096 and 8192.
///
/// The input holds `4 * hidden_size` elements filled with the ramp `i * 0.001`;
/// the weight vector is `hidden_size` ones. If the Metal context cannot be
/// created, the error is printed to stderr and the benchmark is skipped.
///
/// `rms_norm` receives its input by mutable reference, so the pristine input is
/// restored before every call. This is done with `Vec::clone_from` into a
/// scratch buffer allocated once per benchmark, so the timed region contains a
/// copy but no heap allocation or deallocation.
#[cfg(all(target_os = "macos", feature = "metal-compute"))]
fn bench_rms_norm_metal(c: &mut Criterion) {
    let ctx = match MetalContext::new(MetalConfig::default()) {
        Ok(ctx) => ctx,
        Err(e) => {
            eprintln!("Failed to create Metal context: {}", e);
            return;
        }
    };
    let mut group = c.benchmark_group("metal_rms_norm");
    for hidden_size in [1024, 2048, 4096, 8192] {
        let batch_size = 4;
        // Never mutated directly: every iteration works on a copy.
        let x: Vec<f32> = (0..batch_size * hidden_size)
            .map(|i| (i as f32) * 0.001)
            .collect();
        let weight: Vec<f32> = vec![1.0; hidden_size];
        group.bench_with_input(
            BenchmarkId::new("metal", format!("hidden{}", hidden_size)),
            &(hidden_size, batch_size),
            |bench, _| {
                // Allocated once, outside the timed closure.
                let mut scratch = x.clone();
                bench.iter(|| {
                    // Reset the input while reusing the existing allocation.
                    scratch.clone_from(&x);
                    ctx.rms_norm(black_box(&mut scratch), black_box(&weight), 1e-6)
                })
            },
        );
    }
    group.finish();
}
/// Benchmarks `MetalContext::apply_rope` for 8, 16 and 32 heads.
///
/// The input holds `4 * num_heads * 128` elements filled with the ramp
/// `i * 0.001`. If the Metal context cannot be created, the error is printed
/// to stderr and the benchmark is skipped.
///
/// `apply_rope` receives its input by mutable reference, so the pristine input
/// is restored before every call. This is done with `Vec::clone_from` into a
/// scratch buffer allocated once per benchmark, so the timed region contains a
/// copy but no heap allocation or deallocation.
#[cfg(all(target_os = "macos", feature = "metal-compute"))]
fn bench_rope_metal(c: &mut Criterion) {
    let ctx = match MetalContext::new(MetalConfig::default()) {
        Ok(ctx) => ctx,
        Err(e) => {
            eprintln!("Failed to create Metal context: {}", e);
            return;
        }
    };
    let mut group = c.benchmark_group("metal_rope");
    for num_heads in [8, 16, 32] {
        let head_dim = 128;
        let batch_size = 4;
        // Never mutated directly: every iteration works on a copy.
        let x: Vec<f32> = (0..batch_size * num_heads * head_dim)
            .map(|i| (i as f32) * 0.001)
            .collect();
        group.bench_with_input(
            BenchmarkId::new("metal", format!("heads{}", num_heads)),
            &(num_heads, head_dim, batch_size),
            // The batch size only determines the buffer length; the call does not take it.
            |bench, &(nh, hd, _bs)| {
                // Allocated once, outside the timed closure.
                let mut scratch = x.clone();
                bench.iter(|| {
                    // Reset the input while reusing the existing allocation.
                    scratch.clone_from(&x);
                    ctx.apply_rope(black_box(&mut scratch), 0, nh, hd, 10000.0)
                })
            },
        );
    }
    group.finish();
}
// ============ M4 Pro Optimized Benchmarks ============
/// Compares `MetalContext::gemm_f16` against `MetalContext::gemm_optimized` on
/// square half-precision operands for sizes 128 through 4096.
///
/// The benchmark is skipped (with a message on stderr) when the Metal context
/// cannot be created or when `has_m4_pro_optimizations()` reports `false`.
/// Otherwise the list returned by `available_optimizations()` is printed to
/// stdout before measuring.
#[cfg(all(target_os = "macos", feature = "metal-compute"))]
fn bench_optimized_gemm_metal(c: &mut Criterion) {
    let ctx = match MetalContext::new(MetalConfig::default()) {
        Err(err) => {
            eprintln!("Failed to create Metal context: {}", err);
            return;
        }
        Ok(context) => context,
    };

    if !ctx.has_m4_pro_optimizations() {
        eprintln!("M4 Pro optimizations not available, skipping optimized GEMM benchmark");
        return;
    }
    println!(
        "Available optimizations: {:?}",
        ctx.available_optimizations()
    );

    let mut group = c.benchmark_group("metal_gemm_optimized");
    for size in [128, 256, 512, 1024, 2048, 4096] {
        // Square problem: all three GEMM dimensions are equal.
        let (m, n, k) = (size, size, size);
        let lhs: Vec<half::f16> = (0..m * k)
            .map(|idx| (idx as f32) * 0.001)
            .map(half::f16::from_f32)
            .collect();
        let rhs: Vec<half::f16> = (0..k * n)
            .map(|idx| (idx as f32) * 0.001)
            .map(half::f16::from_f32)
            .collect();

        // Both variants are measured on exactly the same operands.
        let inputs = (&lhs, &rhs, m, n, k);

        // Benchmark standard GEMM
        group.bench_with_input(
            BenchmarkId::new("standard_f16", format!("{}x{}", size, size)),
            &inputs,
            |bencher, &(lhs, rhs, m, n, k)| {
                bencher.iter(|| ctx.gemm_f16(black_box(lhs), black_box(rhs), m, n, k))
            },
        );

        // Benchmark M4 Pro optimized GEMM (BM=128, BN=128, BK=32)
        group.bench_with_input(
            BenchmarkId::new("m4_optimized", format!("{}x{}", size, size)),
            &inputs,
            |bencher, &(lhs, rhs, m, n, k)| {
                bencher.iter(|| ctx.gemm_optimized(black_box(lhs), black_box(rhs), m, n, k))
            },
        );
    }
    group.finish();
}
/// Compares `MetalContext::flash_attention` ("standard") with
/// `MetalContext::fused_attention` ("fused_fa2") on identical inputs.
///
/// Each `(seq_len, kv_len)` case uses 32 query heads, 8 KV heads and a head
/// dimension of 128; buffers are filled with the ramp `i * 0.001`. If the Metal
/// context cannot be created, the error is printed to stderr and the benchmark
/// is skipped.
#[cfg(all(target_os = "macos", feature = "metal-compute"))]
fn bench_fused_attention_metal(c: &mut Criterion) {
    let ctx = match MetalContext::new(MetalConfig::default()) {
        Err(err) => {
            eprintln!("Failed to create Metal context: {}", err);
            return;
        }
        Ok(context) => context,
    };

    // (query length, key/value length) pairs to measure.
    let shapes = [
        (1, 512),
        (1, 2048),
        (1, 4096),
        (4, 512),
        (4, 2048),
        (16, 2048),
    ];

    let mut group = c.benchmark_group("metal_fused_attention");
    for (seq_len, kv_len) in shapes {
        let (num_heads, num_kv_heads, head_dim) = (32, 8, 128);

        let q_elems = seq_len * num_heads * head_dim;
        let kv_elems = kv_len * num_kv_heads * head_dim;
        let query: Vec<f32> = (0..q_elems).map(|idx| (idx as f32) * 0.001).collect();
        let key: Vec<f32> = (0..kv_elems).map(|idx| (idx as f32) * 0.001).collect();
        let value: Vec<f32> = (0..kv_elems).map(|idx| (idx as f32) * 0.001).collect();

        // Standard attention (legacy)
        let config = AttentionConfig {
            num_heads,
            num_kv_heads,
            head_dim,
            max_seq_len: seq_len,
            causal: true,
            scale: 0.0,
        };
        group.bench_with_input(
            BenchmarkId::new("standard", format!("seq{}_kv{}", seq_len, kv_len)),
            &(&query, &key, &value, &config),
            |bencher, &(q, k, v, cfg)| {
                bencher.iter(|| {
                    ctx.flash_attention(black_box(q), black_box(k), black_box(v), black_box(cfg))
                })
            },
        );

        // Fused Flash Attention 2
        group.bench_with_input(
            BenchmarkId::new("fused_fa2", format!("seq{}_kv{}", seq_len, kv_len)),
            &(&query, &key, &value, num_heads, num_kv_heads, head_dim),
            |bencher, &(q, k, v, nh, nkv, hd)| {
                bencher.iter(|| {
                    ctx.fused_attention(
                        black_box(q),
                        black_box(k),
                        black_box(v),
                        nh,
                        nkv,
                        hd,
                        true,
                    )
                })
            },
        );
    }
    group.finish();
}
/// Compares three ways of combining a residual add with normalization, for
/// hidden sizes 1024, 2048, 4096 and 8192:
///
/// * `separate_rmsnorm` — residual added on the CPU, then `MetalContext::rms_norm`;
/// * `fused_rmsnorm_residual` — `MetalContext::fused_rmsnorm_residual`;
/// * `fused_layernorm_residual` — `MetalContext::fused_layernorm_residual`.
///
/// The benchmark is skipped (with a message on stderr) when the Metal context
/// cannot be created or when `available_optimizations()` does not list
/// `"fused_layernorm_residual"`.
///
/// All three variants receive their input by mutable reference, so the pristine
/// input is restored before every call. This is done with `Vec::clone_from`
/// into a scratch buffer allocated once per benchmark, so the timed region
/// contains a copy but no heap allocation or deallocation.
#[cfg(all(target_os = "macos", feature = "metal-compute"))]
fn bench_fused_norm_residual_metal(c: &mut Criterion) {
    let ctx = match MetalContext::new(MetalConfig::default()) {
        Ok(ctx) => ctx,
        Err(e) => {
            eprintln!("Failed to create Metal context: {}", e);
            return;
        }
    };
    if !ctx
        .available_optimizations()
        .iter()
        .any(|&s| s == "fused_layernorm_residual")
    {
        eprintln!("Fused LayerNorm+Residual not available, skipping benchmark");
        return;
    }
    let mut group = c.benchmark_group("metal_fused_norm");
    for hidden_size in [1024, 2048, 4096, 8192] {
        let batch_size = 4;
        let x: Vec<f32> = (0..batch_size * hidden_size)
            .map(|i| (i as f32) * 0.001)
            .collect();
        let residual: Vec<f32> = (0..batch_size * hidden_size)
            .map(|i| (i as f32) * 0.0005)
            .collect();
        let weight: Vec<f32> = vec![1.0; hidden_size];
        let bias: Vec<f32> = vec![0.0; hidden_size];
        // Separate RMSNorm
        group.bench_with_input(
            BenchmarkId::new("separate_rmsnorm", format!("hidden{}", hidden_size)),
            &(hidden_size, batch_size),
            |bench, _| {
                // Allocated once, outside the timed closure.
                let mut scratch = x.clone();
                bench.iter(|| {
                    // Reset the input while reusing the existing allocation.
                    scratch.clone_from(&x);
                    // Add residual manually then normalize; the CPU add is
                    // part of what this baseline measures.
                    for (xi, ri) in scratch.iter_mut().zip(residual.iter()) {
                        *xi += *ri;
                    }
                    ctx.rms_norm(black_box(&mut scratch), black_box(&weight), 1e-6)
                })
            },
        );
        // Fused RMSNorm + Residual
        group.bench_with_input(
            BenchmarkId::new("fused_rmsnorm_residual", format!("hidden{}", hidden_size)),
            &(hidden_size, batch_size),
            |bench, _| {
                let mut scratch = x.clone();
                bench.iter(|| {
                    scratch.clone_from(&x);
                    ctx.fused_rmsnorm_residual(
                        black_box(&mut scratch),
                        black_box(&residual),
                        black_box(&weight),
                        1e-6,
                    )
                })
            },
        );
        // Fused LayerNorm + Residual
        group.bench_with_input(
            BenchmarkId::new("fused_layernorm_residual", format!("hidden{}", hidden_size)),
            &(hidden_size, batch_size),
            |bench, _| {
                let mut scratch = x.clone();
                bench.iter(|| {
                    scratch.clone_from(&x);
                    ctx.fused_layernorm_residual(
                        black_box(&mut scratch),
                        black_box(&residual),
                        black_box(&weight),
                        black_box(&bias),
                        1e-6,
                    )
                })
            },
        );
    }
    group.finish();
}
/// Compares applying RoPE and attention as separate calls ("separate") with the
/// single `MetalContext::rope_then_attention` call ("fused").
///
/// Each `(seq_len, kv_len)` case uses 32 query heads, 8 KV heads, a head
/// dimension of 128 and `rope_theta = 10000.0`; buffers are filled with the
/// ramp `i * 0.001`. If the Metal context cannot be created, the error is
/// printed to stderr and the benchmark is skipped.
///
/// In the "separate" baseline `apply_rope` receives query and key by mutable
/// reference, so both are restored from the pristine inputs before every
/// iteration. This is done with `Vec::clone_from` into scratch buffers
/// allocated once per benchmark, so the baseline's timed region contains the
/// copies but no heap allocation or deallocation.
#[cfg(all(target_os = "macos", feature = "metal-compute"))]
fn bench_rope_attention_fusion_metal(c: &mut Criterion) {
    let ctx = match MetalContext::new(MetalConfig::default()) {
        Ok(ctx) => ctx,
        Err(e) => {
            eprintln!("Failed to create Metal context: {}", e);
            return;
        }
    };
    let mut group = c.benchmark_group("metal_rope_attention_fusion");
    for (seq_len, kv_len) in [(1, 512), (1, 2048), (4, 2048)] {
        let num_heads = 32;
        let num_kv_heads = 8;
        let head_dim = 128;
        let rope_theta = 10000.0;
        let query: Vec<f32> = (0..seq_len * num_heads * head_dim)
            .map(|i| (i as f32) * 0.001)
            .collect();
        let key: Vec<f32> = (0..kv_len * num_kv_heads * head_dim)
            .map(|i| (i as f32) * 0.001)
            .collect();
        let value: Vec<f32> = (0..kv_len * num_kv_heads * head_dim)
            .map(|i| (i as f32) * 0.001)
            .collect();
        // Separate RoPE + Attention (baseline)
        group.bench_with_input(
            BenchmarkId::new("separate", format!("seq{}_kv{}", seq_len, kv_len)),
            &(&query, &key, &value, num_heads, num_kv_heads, head_dim),
            |b, (q, k, v, nh, nkv, hd)| {
                // Allocated once, outside the timed closure.
                let mut q_scratch = (*q).clone();
                let mut k_scratch = (*k).clone();
                b.iter(|| {
                    // Reset both inputs while reusing the existing allocations.
                    q_scratch.clone_from(*q);
                    k_scratch.clone_from(*k);
                    // NOTE(review): the results of `apply_rope` are discarded
                    // here, so a failing call would go unnoticed — confirm
                    // this is intended.
                    let _ = ctx.apply_rope(&mut q_scratch, 0, *nh, *hd, rope_theta);
                    let _ = ctx.apply_rope(&mut k_scratch, 0, *nkv, *hd, rope_theta);
                    ctx.fused_attention(
                        black_box(&q_scratch),
                        black_box(&k_scratch),
                        black_box(*v),
                        *nh,
                        *nkv,
                        *hd,
                        true,
                    )
                })
            },
        );
        // Fused RoPE + Attention
        group.bench_with_input(
            BenchmarkId::new("fused", format!("seq{}_kv{}", seq_len, kv_len)),
            &(&query, &key, &value, num_heads, num_kv_heads, head_dim),
            |b, (q, k, v, nh, nkv, hd)| {
                b.iter(|| {
                    ctx.rope_then_attention(
                        black_box(*q),
                        black_box(*k),
                        black_box(*v),
                        *nh,
                        *nkv,
                        *hd,
                        0,
                        rope_theta,
                        true,
                    )
                })
            },
        );
    }
    group.finish();
}
/// Compares `MetalContext::fused_swiglu` ("fused") with a scalar CPU
/// implementation ("cpu_baseline") for vector lengths 1024, 4096, 11008 and 14336.
///
/// The benchmark is skipped (with a message on stderr) when the Metal context
/// cannot be created or when `available_optimizations()` does not list
/// `"fused_swiglu"`.
#[cfg(all(target_os = "macos", feature = "metal-compute"))]
fn bench_swiglu_metal(c: &mut Criterion) {
    let ctx = match MetalContext::new(MetalConfig::default()) {
        Ok(ctx) => ctx,
        Err(e) => {
            eprintln!("Failed to create Metal context: {}", e);
            return;
        }
    };
    if !ctx
        .available_optimizations()
        .iter()
        .any(|&s| s == "fused_swiglu")
    {
        eprintln!("Fused SwiGLU not available, skipping benchmark");
        return;
    }
    let mut group = c.benchmark_group("metal_swiglu");
    for size in [1024, 4096, 11008, 14336] {
        // The gate ramp is shifted by -0.5, so it starts negative.
        let gate: Vec<f32> = (0..size).map(|i| (i as f32) * 0.001 - 0.5).collect();
        let up: Vec<f32> = (0..size).map(|i| (i as f32) * 0.001).collect();
        // Fused SwiGLU
        group.bench_with_input(
            BenchmarkId::new("fused", format!("size{}", size)),
            &(&gate, &up),
            |b, (g, u)| b.iter(|| ctx.fused_swiglu(black_box(*g), black_box(*u))),
        );
        // CPU baseline for comparison
        group.bench_with_input(
            BenchmarkId::new("cpu_baseline", format!("size{}", size)),
            &(&gate, &up),
            |b, (g, u)| {
                b.iter(|| {
                    let result: Vec<f32> = g
                        .iter()
                        .zip(u.iter())
                        .map(|(&g_val, &u_val)| {
                            // SwiGLU: swish(gate) * up, with swish(x) = x / (1 + e^-x)
                            let swish = g_val / (1.0 + (-g_val).exp());
                            swish * u_val
                        })
                        .collect();
                    black_box(result)
                })
            },
        );
    }
    group.finish();
}
// CPU baseline comparison
/// Benchmarks a naive triple-loop `f32` matrix multiply on the CPU for square
/// sizes 128, 256 and 512.
///
/// Operands are stored row-major (`lhs[row * k + inner]`, `rhs[inner * n + col]`)
/// and filled with the ramp `i * 0.001`. This function has no `cfg` gate, so it
/// is available on every configuration.
fn bench_cpu_gemm(c: &mut Criterion) {
    let mut group = c.benchmark_group("cpu_gemm");
    for size in [128, 256, 512] {
        // Square problem: all three GEMM dimensions are equal.
        let (m, n, k) = (size, size, size);
        let lhs: Vec<f32> = (0..m * k).map(|idx| (idx as f32) * 0.001).collect();
        let rhs: Vec<f32> = (0..k * n).map(|idx| (idx as f32) * 0.001).collect();

        let bench_id = BenchmarkId::new("naive", format!("{}x{}", size, size));
        group.bench_with_input(
            bench_id,
            &(&lhs, &rhs, m, n, k),
            |bencher, &(lhs, rhs, m, n, k)| {
                bencher.iter(|| {
                    // A fresh output matrix is allocated on every iteration.
                    let mut out = vec![0.0f32; m * n];
                    for row in 0..m {
                        for col in 0..n {
                            let mut acc = 0.0f32;
                            for inner in 0..k {
                                acc += lhs[row * k + inner] * rhs[inner * n + col];
                            }
                            out[row * n + col] = acc;
                        }
                    }
                    black_box(out)
                })
            },
        );
    }
    group.finish();
}
// Full suite: only built on macOS with the `metal-compute` feature enabled.
#[cfg(all(target_os = "macos", feature = "metal-compute"))]
criterion_group!(
metal_benches,
// Legacy benchmarks
bench_flash_attention_metal,
bench_gemm_metal,
bench_rms_norm_metal,
bench_rope_metal,
// M4 Pro optimized benchmarks
bench_optimized_gemm_metal,
bench_fused_attention_metal,
bench_fused_norm_residual_metal,
bench_rope_attention_fusion_metal,
bench_swiglu_metal,
// CPU baseline
bench_cpu_gemm,
);
// Fallback for every other configuration: only the CPU GEMM baseline is
// registered, under the same group name so `criterion_main!` below is shared.
#[cfg(not(all(target_os = "macos", feature = "metal-compute")))]
criterion_group!(metal_benches, bench_cpu_gemm,);
// Entry point for the benchmark binary, running the `metal_benches` group.
criterion_main!(metal_benches);