// File: wifi-densepose/vendor/ruvector/crates/prime-radiant/benches/simd_benchmarks.rs
// (801 lines, 24 KiB, Rust)

//! SIMD-Specific Benchmarks for Prime-Radiant Coherence Engine
//!
//! This benchmark suite compares naive/scalar implementations against
//! SIMD-optimized versions for core coherence operations.
//!
//! ## Benchmark Categories
//! 1. Dense Matrix-Vector Multiply - naive vs unrolled vs SIMD
//! 2. Vector Norm Computation - naive vs SIMD
//! 3. Batch Residual Computation - naive vs SIMD
//! 4. Dot Products and Reductions
//!
//! ## Architecture Notes
//! - x86_64: AVX2 (256-bit, f32x8) or AVX-512 (512-bit, f32x16)
//! - aarch64: NEON (128-bit, f32x4)
//! - WASM: SIMD128 (128-bit)
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
// ============================================================================
// TEST DATA GENERATION
// ============================================================================
/// Builds a deterministic pseudo-random vector of `len` values in `[-0.5, 0.5)`.
///
/// Element `i` is obtained by hashing `(seed, i)` with a fresh `DefaultHasher`,
/// reducing the hash modulo 1000 and scaling the result into the target range,
/// so the same `(len, seed)` pair always reproduces the same data.
fn generate_vec(len: usize, seed: u64) -> Vec<f32> {
    let mut values = Vec::with_capacity(len);
    for index in 0..len {
        let mut state = DefaultHasher::new();
        (seed, index).hash(&mut state);
        // Only the low three decimal digits of the hash are kept.
        let bucket = state.finish() % 1000;
        values.push(bucket as f32 / 1000.0 - 0.5);
    }
    values
}
/// Builds a deterministic pseudo-random flat buffer of `rows * cols` values in
/// `[-0.5, 0.5)`.
///
/// The value at flat index `i` is derived from the hash of `(seed, i)` reduced
/// modulo 1000, so a given `(rows, cols, seed)` triple always yields the same
/// buffer.
fn generate_matrix(rows: usize, cols: usize, seed: u64) -> Vec<f32> {
    let total = rows * cols;
    let mut cells = Vec::with_capacity(total);
    cells.extend((0..total).map(|flat_index| {
        let mut state = DefaultHasher::new();
        (seed, flat_index).hash(&mut state);
        let bucket = state.finish() % 1000;
        bucket as f32 / 1000.0 - 0.5
    }));
    cells
}
// ============================================================================
// NAIVE IMPLEMENTATIONS (BASELINE)
// ============================================================================
/// Baseline matrix-vector product `y = A * x` computed one scalar at a time.
///
/// `matrix` is read in row-major order: element `(i, j)` lives at index
/// `i * cols + j`. The first `rows` entries of `y` are overwritten with the dot
/// product of the matching row and the first `cols` entries of `x`.
///
/// # Panics
/// Panics on out-of-bounds indexing when `matrix`, `x` or `y` is too short for
/// the requested `rows`/`cols`.
#[inline(never)]
fn matmul_naive(matrix: &[f32], x: &[f32], y: &mut [f32], rows: usize, cols: usize) {
    for row in 0..rows {
        let offset = row * cols;
        // Strictly sequential accumulation, column by column.
        y[row] = (0..cols).fold(0.0f32, |dot, col| dot + matrix[offset + col] * x[col]);
    }
}
/// Baseline squared Euclidean norm `|v|^2`: the sum of every element squared,
/// accumulated sequentially in a single scalar.
///
/// Returns `0.0` for an empty slice.
#[inline(never)]
fn norm_sq_naive(v: &[f32]) -> f32 {
    let mut energy = 0.0f32;
    for value in v.iter().copied() {
        energy += value * value;
    }
    energy
}
/// Baseline dot product `a . b`, accumulated sequentially in a single scalar.
///
/// Iterates over every element of `a` and reads the element of `b` at the same
/// index; extra trailing elements of `b` are ignored.
///
/// # Panics
/// Panics on out-of-bounds indexing if `b` is shorter than `a`.
#[inline(never)]
fn dot_naive(a: &[f32], b: &[f32]) -> f32 {
    let mut total = 0.0f32;
    for (i, &lhs) in a.iter().enumerate() {
        total += lhs * b[i];
    }
    total
}
/// Baseline squared residual norm `|a - b|^2`, accumulated sequentially in a
/// single scalar.
///
/// Iterates over every element of `a` and subtracts the element of `b` at the
/// same index; extra trailing elements of `b` are ignored.
///
/// # Panics
/// Panics on out-of-bounds indexing if `b` is shorter than `a`.
#[inline(never)]
fn residual_norm_naive(a: &[f32], b: &[f32]) -> f32 {
    let mut total = 0.0f32;
    for (i, &lhs) in a.iter().enumerate() {
        let delta = lhs - b[i];
        total += delta * delta;
    }
    total
}
/// Baseline batch residual: sums `residual_norm_naive(src, tgt)` over the
/// pairwise zip of `sources` and `targets`.
///
/// Pairs beyond the shorter of the two batches are ignored.
#[inline(never)]
fn batch_residual_naive(sources: &[Vec<f32>], targets: &[Vec<f32>]) -> f32 {
    sources
        .iter()
        .zip(targets)
        .fold(0.0f32, |acc, (src, tgt)| acc + residual_norm_naive(src, tgt))
}
// ============================================================================
// SIMD-FRIENDLY IMPLEMENTATIONS
// ============================================================================
/// Matrix-vector product `y = A * x` with eight independent accumulators per
/// row, written to be friendly to auto-vectorization.
///
/// `matrix` is read in row-major order (`i * cols + j`). Columns are consumed
/// in groups of eight, with lane `k` accumulating columns `k, k + 8, k + 16, ...`;
/// the lanes are then added left to right and any leftover `cols % 8` columns
/// are folded in sequentially.
///
/// # Panics
/// Panics on out-of-bounds indexing when `matrix`, `x` or `y` is too short for
/// the requested `rows`/`cols`.
#[inline(never)]
fn matmul_unrolled(matrix: &[f32], x: &[f32], y: &mut [f32], rows: usize, cols: usize) {
    const LANES: usize = 8;
    let full_blocks = cols / LANES;
    let tail_start = full_blocks * LANES;

    for row in 0..rows {
        let offset = row * cols;
        let mut lanes = [0.0f32; LANES];

        for block in 0..full_blocks {
            let col = block * LANES;
            for lane in 0..LANES {
                lanes[lane] += matrix[offset + col + lane] * x[col + lane];
            }
        }

        // Reduce lane 0 through lane 7 strictly left to right.
        let mut dot = lanes[1..].iter().fold(lanes[0], |acc, &lane| acc + lane);

        // Columns that did not fill a whole group of eight.
        for col in tail_start..cols {
            dot += matrix[offset + col] * x[col];
        }
        y[row] = dot;
    }
}
/// Squared norm `|v|^2` computed with four independent accumulators.
///
/// Lane `k` accumulates the squares of elements `k, k + 4, k + 8, ...` of the
/// largest prefix whose length is a multiple of four. The lanes are added left
/// to right and the remaining `len % 4` elements are folded in sequentially.
#[inline(never)]
fn norm_sq_unrolled(v: &[f32]) -> f32 {
    let split = v.len() - v.len() % 4;
    let (body, tail) = v.split_at(split);

    let mut lanes = [0.0f32; 4];
    for quad in body.chunks_exact(4) {
        for (lane, &value) in lanes.iter_mut().zip(quad) {
            *lane += value * value;
        }
    }

    let head = lanes[0] + lanes[1] + lanes[2] + lanes[3];
    tail.iter().fold(head, |acc, &value| acc + value * value)
}
/// Squared norm `|v|^2` computed with eight independent accumulators, matching
/// the lane count of 256-bit `f32` SIMD registers.
///
/// Lane `k` accumulates the squares of elements `k, k + 8, k + 16, ...` of the
/// full eight-element chunks. The lanes are summed with the standard iterator
/// `sum`, and the trailing `len % 8` elements are then added sequentially.
#[inline(never)]
fn norm_sq_unrolled_8(v: &[f32]) -> f32 {
    let mut lanes = [0.0f32; 8];
    let mut blocks = v.chunks_exact(8);
    for block in blocks.by_ref() {
        for (lane, &value) in lanes.iter_mut().zip(block) {
            *lane += value * value;
        }
    }

    let mut total: f32 = lanes.iter().sum();
    for &value in blocks.remainder() {
        total += value * value;
    }
    total
}
/// Squared norm `|v|^2` expressed as an iterator pipeline: each element is
/// squared and the results are reduced with the standard `sum`, leaving any
/// vectorization entirely to the compiler.
#[inline(never)]
fn norm_sq_iter(v: &[f32]) -> f32 {
    v.iter().copied().map(|value| value * value).sum::<f32>()
}
/// Dot product `a . b` computed with four independent accumulators.
///
/// Lane `k` accumulates the products at indices `k, k + 4, k + 8, ...` of the
/// full four-element chunks; the lanes are added left to right and the
/// remaining elements are folded in sequentially.
///
/// When the slices differ in length, only their common prefix is used (the
/// same convention as `Iterator::zip`). Without that clamp the two
/// `chunks_exact` remainders would start at different offsets and the tail
/// would pair elements from mismatched positions.
#[inline(never)]
fn dot_unrolled(a: &[f32], b: &[f32]) -> f32 {
    // Clamp to the common prefix so chunks and remainders line up index-for-index.
    let len = a.len().min(b.len());
    let (a, b) = (&a[..len], &b[..len]);

    let chunks_a = a.chunks_exact(4);
    let chunks_b = b.chunks_exact(4);
    let rem_a = chunks_a.remainder();
    let rem_b = chunks_b.remainder();

    let mut acc0 = 0.0f32;
    let mut acc1 = 0.0f32;
    let mut acc2 = 0.0f32;
    let mut acc3 = 0.0f32;
    for (ca, cb) in chunks_a.zip(chunks_b) {
        acc0 += ca[0] * cb[0];
        acc1 += ca[1] * cb[1];
        acc2 += ca[2] * cb[2];
        acc3 += ca[3] * cb[3];
    }

    let mut sum = acc0 + acc1 + acc2 + acc3;
    // Trailing `len % 4` elements.
    for (&x, &y) in rem_a.iter().zip(rem_b.iter()) {
        sum += x * y;
    }
    sum
}
/// Squared residual norm `|a - b|^2` computed with four independent
/// accumulators.
///
/// Lane `k` accumulates the squared differences at indices `k, k + 4, ...` of
/// the full four-element chunks; the lanes are added left to right and the
/// remaining elements are folded in sequentially.
///
/// When the slices differ in length, only their common prefix is used (the
/// same convention as `Iterator::zip`). Without that clamp the two
/// `chunks_exact` remainders would start at different offsets and the tail
/// would subtract elements from mismatched positions.
#[inline(never)]
fn residual_norm_unrolled(a: &[f32], b: &[f32]) -> f32 {
    // Clamp to the common prefix so chunks and remainders line up index-for-index.
    let len = a.len().min(b.len());
    let (a, b) = (&a[..len], &b[..len]);

    let chunks_a = a.chunks_exact(4);
    let chunks_b = b.chunks_exact(4);
    let rem_a = chunks_a.remainder();
    let rem_b = chunks_b.remainder();

    let mut acc0 = 0.0f32;
    let mut acc1 = 0.0f32;
    let mut acc2 = 0.0f32;
    let mut acc3 = 0.0f32;
    for (ca, cb) in chunks_a.zip(chunks_b) {
        let d0 = ca[0] - cb[0];
        let d1 = ca[1] - cb[1];
        let d2 = ca[2] - cb[2];
        let d3 = ca[3] - cb[3];
        acc0 += d0 * d0;
        acc1 += d1 * d1;
        acc2 += d2 * d2;
        acc3 += d3 * d3;
    }

    let mut sum = acc0 + acc1 + acc2 + acc3;
    // Trailing `len % 4` elements.
    for (&x, &y) in rem_a.iter().zip(rem_b.iter()) {
        let d = x - y;
        sum += d * d;
    }
    sum
}
/// Batch residual using the four-accumulator kernel: sums
/// `residual_norm_unrolled(src, tgt)` over the pairwise zip of `sources` and
/// `targets`.
///
/// Pairs beyond the shorter of the two batches are ignored.
#[inline(never)]
fn batch_residual_unrolled(sources: &[Vec<f32>], targets: &[Vec<f32>]) -> f32 {
    sources
        .iter()
        .zip(targets)
        .fold(0.0f32, |acc, (src, tgt)| acc + residual_norm_unrolled(src, tgt))
}
// ============================================================================
// EXPLICIT SIMD (compiled only with the `simd` feature; uses the `wide` crate)
// ============================================================================
#[cfg(feature = "simd")]
mod simd_impl {
    //! Explicit eight-lane kernels built on `wide::f32x8`.
    //!
    //! Every kernel consumes its input in full eight-element chunks with a
    //! vector accumulator, reduces the accumulator horizontally, and then folds
    //! the leftover `len % 8` elements in with scalar arithmetic.
    use wide::f32x8;

    /// Number of `f32` lanes held by one `f32x8`.
    const LANES: usize = 8;

    /// Loads one `LANES`-element chunk into a vector register.
    ///
    /// Forced inline so that sharing this helper adds no call overhead inside
    /// the benchmarked loops.
    #[inline(always)]
    fn load(chunk: &[f32]) -> f32x8 {
        let lanes = <[f32; LANES]>::try_from(chunk)
            .expect("chunks_exact(LANES) always yields LANES-element chunks");
        f32x8::from(lanes)
    }

    /// SIMD squared norm `|v|^2` using `f32x8`.
    #[inline(never)]
    pub fn norm_sq_simd(v: &[f32]) -> f32 {
        let chunks = v.chunks_exact(LANES);
        let remainder = chunks.remainder();

        let mut acc = f32x8::ZERO;
        for chunk in chunks {
            let vals = load(chunk);
            acc += vals * vals;
        }

        let mut sum: f32 = acc.reduce_add();
        for &x in remainder {
            sum += x * x;
        }
        sum
    }

    /// SIMD dot product `a . b` using `f32x8`.
    ///
    /// When the slices differ in length only their common prefix is used;
    /// otherwise the two `chunks_exact` remainders would start at different
    /// offsets and the scalar tail would pair mismatched positions.
    #[inline(never)]
    pub fn dot_simd(a: &[f32], b: &[f32]) -> f32 {
        let len = a.len().min(b.len());
        let (a, b) = (&a[..len], &b[..len]);

        let chunks_a = a.chunks_exact(LANES);
        let chunks_b = b.chunks_exact(LANES);
        let rem_a = chunks_a.remainder();
        let rem_b = chunks_b.remainder();

        let mut acc = f32x8::ZERO;
        for (ca, cb) in chunks_a.zip(chunks_b) {
            acc += load(ca) * load(cb);
        }

        let mut sum: f32 = acc.reduce_add();
        for (&x, &y) in rem_a.iter().zip(rem_b.iter()) {
            sum += x * y;
        }
        sum
    }

    /// SIMD squared residual norm `|a - b|^2` using `f32x8`.
    ///
    /// When the slices differ in length only their common prefix is used;
    /// otherwise the two `chunks_exact` remainders would start at different
    /// offsets and the scalar tail would subtract mismatched positions.
    #[inline(never)]
    pub fn residual_norm_simd(a: &[f32], b: &[f32]) -> f32 {
        let len = a.len().min(b.len());
        let (a, b) = (&a[..len], &b[..len]);

        let chunks_a = a.chunks_exact(LANES);
        let chunks_b = b.chunks_exact(LANES);
        let rem_a = chunks_a.remainder();
        let rem_b = chunks_b.remainder();

        let mut acc = f32x8::ZERO;
        for (ca, cb) in chunks_a.zip(chunks_b) {
            let diff = load(ca) - load(cb);
            acc += diff * diff;
        }

        let mut sum: f32 = acc.reduce_add();
        for (&x, &y) in rem_a.iter().zip(rem_b.iter()) {
            let d = x - y;
            sum += d * d;
        }
        sum
    }

    /// SIMD matrix-vector multiply `y = A * x` over a row-major `matrix`.
    ///
    /// Each row is paired with exactly the first `cols` entries of `x`, so the
    /// scalar tail lines up with the row tail even when `x` is longer than
    /// `cols`.
    ///
    /// # Panics
    /// Panics if `matrix`, `x` or `y` is too short for the requested
    /// `rows`/`cols`.
    #[inline(never)]
    pub fn matmul_simd(matrix: &[f32], x: &[f32], y: &mut [f32], rows: usize, cols: usize) {
        for i in 0..rows {
            let row_start = i * cols;
            let row = &matrix[row_start..row_start + cols];
            // Chunking all of `x` would misalign its remainder with the row's
            // remainder whenever `x.len() != cols`.
            let x_row = &x[..cols];

            let chunks_m = row.chunks_exact(LANES);
            let chunks_x = x_row.chunks_exact(LANES);
            let rem_m = chunks_m.remainder();
            let rem_x = chunks_x.remainder();

            let mut acc = f32x8::ZERO;
            for (cm, cx) in chunks_m.zip(chunks_x) {
                acc += load(cm) * load(cx);
            }

            let mut sum: f32 = acc.reduce_add();
            for (&m, &xv) in rem_m.iter().zip(rem_x.iter()) {
                sum += m * xv;
            }
            y[i] = sum;
        }
    }

    /// SIMD batch residual: sums `residual_norm_simd` over the pairwise zip of
    /// `sources` and `targets`.
    #[inline(never)]
    pub fn batch_residual_simd(sources: &[Vec<f32>], targets: &[Vec<f32>]) -> f32 {
        let mut total = 0.0f32;
        for (src, tgt) in sources.iter().zip(targets.iter()) {
            total += residual_norm_simd(src, tgt);
        }
        total
    }
}
// ============================================================================
// DENSE MATRIX MULTIPLY BENCHMARKS
// ============================================================================
/// Benchmarks square matrix-vector products (64x64, 128x128, 256x256).
///
/// Registers the `naive` and `unrolled` kernels (plus `simd` when the `simd`
/// feature is enabled) in the `simd_matmul` group, reporting throughput as the
/// number of matrix elements touched per iteration.
fn bench_dense_matmul(c: &mut Criterion) {
    const SIZES: [usize; 3] = [64, 128, 256];

    let mut group = c.benchmark_group("simd_matmul");
    for &n in SIZES.iter() {
        let matrix = generate_matrix(n, n, 42);
        let x = generate_vec(n, 123);
        // Output buffer reused by every variant; only `y[0]` is observed.
        let mut y = vec![0.0f32; n];

        group.throughput(Throughput::Elements((n * n) as u64));

        group.bench_with_input(BenchmarkId::new("naive", n), &n, |bencher, &dim| {
            bencher.iter(|| {
                matmul_naive(black_box(&matrix), black_box(&x), &mut y, dim, dim);
                black_box(y[0])
            })
        });

        group.bench_with_input(BenchmarkId::new("unrolled", n), &n, |bencher, &dim| {
            bencher.iter(|| {
                matmul_unrolled(black_box(&matrix), black_box(&x), &mut y, dim, dim);
                black_box(y[0])
            })
        });

        #[cfg(feature = "simd")]
        group.bench_with_input(BenchmarkId::new("simd", n), &n, |bencher, &dim| {
            bencher.iter(|| {
                simd_impl::matmul_simd(black_box(&matrix), black_box(&x), &mut y, dim, dim);
                black_box(y[0])
            })
        });
    }
    group.finish();
}
/// Benchmarks non-square (projection) matrix-vector products.
///
/// Each shape maps an `in_dim`-element input to an `out_dim`-element output
/// (64->32, 128->64, 256->128), so the matrix has `out_dim` rows and `in_dim`
/// columns. Benchmarks are labelled `"{in_dim}x{out_dim}"` within the
/// `simd_matmul_projection` group.
fn bench_projection_matmul(c: &mut Criterion) {
    const SHAPES: [(usize, usize); 3] = [(64, 32), (128, 64), (256, 128)];

    let mut group = c.benchmark_group("simd_matmul_projection");
    for &(in_dim, out_dim) in SHAPES.iter() {
        let shape = (in_dim, out_dim);
        // One label shared by every variant of this shape.
        let label = format!("{}x{}", in_dim, out_dim);

        let matrix = generate_matrix(out_dim, in_dim, 42);
        let x = generate_vec(in_dim, 123);
        let mut y = vec![0.0f32; out_dim];

        group.throughput(Throughput::Elements((out_dim * in_dim) as u64));

        group.bench_with_input(
            BenchmarkId::new("naive", &label),
            &shape,
            |bencher, &(cols, rows)| {
                bencher.iter(|| {
                    matmul_naive(black_box(&matrix), black_box(&x), &mut y, rows, cols);
                    black_box(y[0])
                })
            },
        );

        group.bench_with_input(
            BenchmarkId::new("unrolled", &label),
            &shape,
            |bencher, &(cols, rows)| {
                bencher.iter(|| {
                    matmul_unrolled(black_box(&matrix), black_box(&x), &mut y, rows, cols);
                    black_box(y[0])
                })
            },
        );

        #[cfg(feature = "simd")]
        group.bench_with_input(
            BenchmarkId::new("simd", &label),
            &shape,
            |bencher, &(cols, rows)| {
                bencher.iter(|| {
                    simd_impl::matmul_simd(
                        black_box(&matrix),
                        black_box(&x),
                        &mut y,
                        rows,
                        cols,
                    );
                    black_box(y[0])
                })
            },
        );
    }
    group.finish();
}
// ============================================================================
// NORM COMPUTATION BENCHMARKS
// ============================================================================
/// Benchmarks every squared-norm kernel over lengths that are multiples of
/// eight (64 through 1024).
///
/// Registers `naive`, `iter`, `unrolled_4` and `unrolled_8` (plus `simd_f32x8`
/// when the `simd` feature is enabled) in the `simd_norm` group, with
/// throughput reported in elements.
fn bench_norm_computation(c: &mut Criterion) {
    const DIMS: [usize; 5] = [64, 128, 256, 512, 1024];

    let mut group = c.benchmark_group("simd_norm");
    for &dim in DIMS.iter() {
        let v = generate_vec(dim, 42);
        let data = &v;

        group.throughput(Throughput::Elements(dim as u64));

        group.bench_with_input(BenchmarkId::new("naive", dim), &dim, |bencher, _| {
            bencher.iter(|| black_box(norm_sq_naive(black_box(data))))
        });

        group.bench_with_input(BenchmarkId::new("iter", dim), &dim, |bencher, _| {
            bencher.iter(|| black_box(norm_sq_iter(black_box(data))))
        });

        group.bench_with_input(BenchmarkId::new("unrolled_4", dim), &dim, |bencher, _| {
            bencher.iter(|| black_box(norm_sq_unrolled(black_box(data))))
        });

        group.bench_with_input(BenchmarkId::new("unrolled_8", dim), &dim, |bencher, _| {
            bencher.iter(|| black_box(norm_sq_unrolled_8(black_box(data))))
        });

        #[cfg(feature = "simd")]
        group.bench_with_input(BenchmarkId::new("simd_f32x8", dim), &dim, |bencher, _| {
            bencher.iter(|| black_box(simd_impl::norm_sq_simd(black_box(data))))
        });
    }
    group.finish();
}
// ============================================================================
// DOT PRODUCT BENCHMARKS
// ============================================================================
/// Benchmarks the dot-product kernels on vector pairs of length 64, 256 and
/// 1024.
///
/// Registers `naive` and `unrolled` (plus `simd` when the `simd` feature is
/// enabled) in the `simd_dot` group, with throughput reported in elements.
fn bench_dot_product(c: &mut Criterion) {
    const DIMS: [usize; 3] = [64, 256, 1024];

    let mut group = c.benchmark_group("simd_dot");
    for &dim in DIMS.iter() {
        let lhs = generate_vec(dim, 42);
        let rhs = generate_vec(dim, 123);

        group.throughput(Throughput::Elements(dim as u64));

        group.bench_with_input(BenchmarkId::new("naive", dim), &dim, |bencher, _| {
            bencher.iter(|| black_box(dot_naive(black_box(&lhs), black_box(&rhs))))
        });

        group.bench_with_input(BenchmarkId::new("unrolled", dim), &dim, |bencher, _| {
            bencher.iter(|| black_box(dot_unrolled(black_box(&lhs), black_box(&rhs))))
        });

        #[cfg(feature = "simd")]
        group.bench_with_input(BenchmarkId::new("simd", dim), &dim, |bencher, _| {
            bencher.iter(|| black_box(simd_impl::dot_simd(black_box(&lhs), black_box(&rhs))))
        });
    }
    group.finish();
}
// ============================================================================
// RESIDUAL NORM BENCHMARKS (CORE COHERENCE OPERATION)
// ============================================================================
/// Benchmarks the squared-residual-norm kernels on vector pairs of length 64,
/// 256 and 1024.
///
/// Registers `naive` and `unrolled` (plus `simd` when the `simd` feature is
/// enabled) in the `simd_residual_norm` group, with throughput reported in
/// elements.
fn bench_residual_norm(c: &mut Criterion) {
    const DIMS: [usize; 3] = [64, 256, 1024];

    let mut group = c.benchmark_group("simd_residual_norm");
    for &dim in DIMS.iter() {
        let lhs = generate_vec(dim, 42);
        let rhs = generate_vec(dim, 123);

        group.throughput(Throughput::Elements(dim as u64));

        group.bench_with_input(BenchmarkId::new("naive", dim), &dim, |bencher, _| {
            bencher.iter(|| black_box(residual_norm_naive(black_box(&lhs), black_box(&rhs))))
        });

        group.bench_with_input(BenchmarkId::new("unrolled", dim), &dim, |bencher, _| {
            bencher.iter(|| black_box(residual_norm_unrolled(black_box(&lhs), black_box(&rhs))))
        });

        #[cfg(feature = "simd")]
        group.bench_with_input(BenchmarkId::new("simd", dim), &dim, |bencher, _| {
            bencher.iter(|| {
                black_box(simd_impl::residual_norm_simd(
                    black_box(&lhs),
                    black_box(&rhs),
                ))
            })
        });
    }
    group.finish();
}
// ============================================================================
// BATCH RESIDUAL BENCHMARKS
// ============================================================================
/// Benchmarks the batch residual kernels over batches of 100, 1000 and 10000
/// pairs of 64-element vectors.
///
/// Source vector `i` is seeded with `i` and target vector `i` with
/// `i + 10000`. Throughput is reported as pairs per iteration in the
/// `simd_batch_residual` group.
fn bench_batch_residual(c: &mut Criterion) {
    const DIM: usize = 64;
    const BATCH_SIZES: [u64; 3] = [100, 1000, 10000];
    // Seed offset that distinguishes target vectors from source vectors.
    const TARGET_SEED_OFFSET: u64 = 10000;

    let mut group = c.benchmark_group("simd_batch_residual");
    for &batch_size in BATCH_SIZES.iter() {
        let sources: Vec<Vec<f32>> = (0..batch_size)
            .map(|seed| generate_vec(DIM, seed))
            .collect();
        let targets: Vec<Vec<f32>> = (0..batch_size)
            .map(|seed| generate_vec(DIM, seed + TARGET_SEED_OFFSET))
            .collect();

        group.throughput(Throughput::Elements(batch_size));

        group.bench_with_input(
            BenchmarkId::new("naive", batch_size),
            &batch_size,
            |bencher, _| {
                bencher.iter(|| {
                    let total = batch_residual_naive(black_box(&sources), black_box(&targets));
                    black_box(total)
                })
            },
        );

        group.bench_with_input(
            BenchmarkId::new("unrolled", batch_size),
            &batch_size,
            |bencher, _| {
                bencher.iter(|| {
                    let total = batch_residual_unrolled(black_box(&sources), black_box(&targets));
                    black_box(total)
                })
            },
        );

        #[cfg(feature = "simd")]
        group.bench_with_input(
            BenchmarkId::new("simd", batch_size),
            &batch_size,
            |bencher, _| {
                bencher.iter(|| {
                    let total = simd_impl::batch_residual_simd(
                        black_box(&sources),
                        black_box(&targets),
                    );
                    black_box(total)
                })
            },
        );
    }
    group.finish();
}
// ============================================================================
// MEMORY ALIGNMENT BENCHMARKS
// ============================================================================
/// Benchmarks `norm_sq_unrolled_8` on inputs whose length does or does not
/// divide evenly into eight-element chunks.
///
/// "Aligned" here refers to the vector length being a multiple of eight, not
/// to the memory address of the buffer:
/// - `aligned_256`: 256 elements, no scalar tail;
/// - `misaligned_259`: 259 elements, three-element scalar tail;
/// - `small_7`: 7 elements, handled entirely by the scalar tail.
fn bench_alignment_impact(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_alignment");
    let dim = 256;

    let cases: [(&str, usize); 3] = [
        ("aligned_256", dim),
        ("misaligned_259", dim + 3),
        ("small_7", 7),
    ];

    for &(label, len) in cases.iter() {
        let v = generate_vec(len, 42);
        group.bench_function(label, |bencher| {
            bencher.iter(|| black_box(norm_sq_unrolled_8(black_box(&v))))
        });
    }
    group.finish();
}
// ============================================================================
// THROUGHPUT SCALING BENCHMARKS
// ============================================================================
/// Measures how residual-norm throughput scales with vector length, from 16 to
/// 4096 elements.
///
/// Throughput is reported in bytes read per iteration (two `f32` vectors of
/// `size` elements) in the `simd_throughput_scaling` group. Registers
/// `residual_unrolled`, plus `residual_simd` when the `simd` feature is
/// enabled.
fn bench_throughput_scaling(c: &mut Criterion) {
    const SIZES: [usize; 9] = [16, 32, 64, 128, 256, 512, 1024, 2048, 4096];
    // Two input vectors, four bytes per `f32` element.
    const BYTES_PER_INDEX: usize = 4 * 2;

    let mut group = c.benchmark_group("simd_throughput_scaling");
    for &size in SIZES.iter() {
        let lhs = generate_vec(size, 42);
        let rhs = generate_vec(size, 123);

        group.throughput(Throughput::Bytes((size * BYTES_PER_INDEX) as u64));

        group.bench_with_input(
            BenchmarkId::new("residual_unrolled", size),
            &size,
            |bencher, _| {
                bencher.iter(|| black_box(residual_norm_unrolled(black_box(&lhs), black_box(&rhs))))
            },
        );

        #[cfg(feature = "simd")]
        group.bench_with_input(
            BenchmarkId::new("residual_simd", size),
            &size,
            |bencher, _| {
                bencher.iter(|| {
                    black_box(simd_impl::residual_norm_simd(
                        black_box(&lhs),
                        black_box(&rhs),
                    ))
                })
            },
        );
    }
    group.finish();
}
// ============================================================================
// COHERENCE-SPECIFIC SIMD PATTERNS
// ============================================================================
/// Compares two formulations of the weighted squared-residual sum
/// `weight * sum((a[i] - b[i])^2)` on 256-element vectors.
///
/// - `separate_ops`: one scalar accumulator with a separate multiply and add.
/// - `fma_friendly`: four accumulators updated with `f32::mul_add`, which
///   computes `d * d + acc` with a single rounding. Whether this maps to a
///   hardware FMA instruction depends on the target features the benchmark is
///   compiled with.
///
/// Both kernels read their inputs through `black_box` on every iteration, like
/// the other benchmarks in this file, so the optimizer cannot treat the
/// computation as loop-invariant and hoist it out of the timed loop.
fn bench_fma_pattern(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_fma_pattern");
    let dim = 256;
    let a = generate_vec(dim, 42);
    let b = generate_vec(dim, 123);
    let weight = 1.5f32;

    // Without FMA (separate multiply and add)
    group.bench_function("separate_ops", |bench| {
        bench.iter(|| {
            let lhs = black_box(a.as_slice());
            let rhs = black_box(b.as_slice());
            let mut sum = 0.0f32;
            for i in 0..dim {
                let diff = lhs[i] - rhs[i];
                let sq = diff * diff;
                sum += sq;
            }
            black_box(weight * sum)
        })
    });

    // With explicit fused multiply-add
    group.bench_function("fma_friendly", |bench| {
        bench.iter(|| {
            let lhs = black_box(a.as_slice());
            let rhs = black_box(b.as_slice());
            let mut acc0 = 0.0f32;
            let mut acc1 = 0.0f32;
            let mut acc2 = 0.0f32;
            let mut acc3 = 0.0f32;
            // `dim` is a multiple of 4, so no scalar tail is needed here.
            let chunks = dim / 4;
            for c in 0..chunks {
                let base = c * 4;
                let d0 = lhs[base] - rhs[base];
                let d1 = lhs[base + 1] - rhs[base + 1];
                let d2 = lhs[base + 2] - rhs[base + 2];
                let d3 = lhs[base + 3] - rhs[base + 3];
                // Fused: d * d + acc with a single rounding step.
                acc0 = d0.mul_add(d0, acc0);
                acc1 = d1.mul_add(d1, acc1);
                acc2 = d2.mul_add(d2, acc2);
                acc3 = d3.mul_add(d3, acc3);
            }
            black_box(weight * (acc0 + acc1 + acc2 + acc3))
        })
    });
    group.finish();
}
// ============================================================================
// CRITERION CONFIGURATION
// ============================================================================
// Matrix-vector multiply benchmarks (square and projection shapes).
criterion_group!(matmul_benches, bench_dense_matmul, bench_projection_matmul,);
// Vector kernels: squared norm, dot product and residual norm.
criterion_group!(
vector_ops_benches,
bench_norm_computation,
bench_dot_product,
bench_residual_norm,
);
// Batched residual computation over many vector pairs.
criterion_group!(batch_benches, bench_batch_residual,);
// Studies of length alignment, throughput scaling and the FMA pattern.
criterion_group!(
optimization_benches,
bench_alignment_impact,
bench_throughput_scaling,
bench_fma_pattern,
);
// Benchmark binary entry point: runs every group declared above.
criterion_main!(
matmul_benches,
vector_ops_benches,
batch_benches,
optimization_benches
);