Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'
This commit is contained in:
800
vendor/ruvector/crates/prime-radiant/benches/simd_benchmarks.rs
vendored
Normal file
800
vendor/ruvector/crates/prime-radiant/benches/simd_benchmarks.rs
vendored
Normal file
@@ -0,0 +1,800 @@
|
||||
//! SIMD-Specific Benchmarks for Prime-Radiant Coherence Engine
|
||||
//!
|
||||
//! This benchmark suite compares naive/scalar implementations against
|
||||
//! SIMD-optimized versions for core coherence operations.
|
||||
//!
|
||||
//! ## Benchmark Categories
|
||||
//! 1. Dense Matrix Multiply - naive vs SIMD
|
||||
//! 2. Vector Norm Computation - naive vs SIMD
|
||||
//! 3. Batch Residual Computation - naive vs SIMD
|
||||
//! 4. Dot Products and Reductions
|
||||
//!
|
||||
//! ## Architecture Notes
|
||||
//! - x86_64: AVX2 (256-bit, f32x8) or AVX-512 (512-bit, f32x16)
|
||||
//! - aarch64: NEON (128-bit, f32x4)
|
||||
//! - WASM: SIMD128 (128-bit)
|
||||
|
||||
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
|
||||
use std::collections::hash_map::DefaultHasher;
|
||||
use std::hash::{Hash, Hasher};
|
||||
|
||||
// ============================================================================
|
||||
// TEST DATA GENERATION
|
||||
// ============================================================================
|
||||
|
||||
/// Builds a deterministic pseudo-random vector of `len` values in roughly
/// [-0.5, 0.5), by hashing each `(seed, index)` pair with the std
/// `DefaultHasher`. Deterministic per build, so benchmark inputs are
/// reproducible without pulling in an RNG crate.
fn generate_vec(len: usize, seed: u64) -> Vec<f32> {
    let mut values = Vec::with_capacity(len);
    for idx in 0..len {
        let mut hasher = DefaultHasher::new();
        (seed, idx).hash(&mut hasher);
        // Map the hash onto a small symmetric range around zero.
        values.push((hasher.finish() % 1000) as f32 / 1000.0 - 0.5);
    }
    values
}
|
||||
|
||||
/// Builds a deterministic pseudo-random row-major matrix (`rows * cols`
/// values in roughly [-0.5, 0.5)) using the same hash-based scheme as
/// `generate_vec`, keyed by `(seed, flat_index)`.
fn generate_matrix(rows: usize, cols: usize, seed: u64) -> Vec<f32> {
    let total = rows * cols;
    (0..total)
        .map(|flat| {
            let mut hasher = DefaultHasher::new();
            (seed, flat).hash(&mut hasher);
            // Same hash-to-float mapping as generate_vec.
            (hasher.finish() % 1000) as f32 / 1000.0 - 0.5
        })
        .collect()
}
|
||||
|
||||
// ============================================================================
|
||||
// NAIVE IMPLEMENTATIONS (BASELINE)
|
||||
// ============================================================================
|
||||
|
||||
/// Scalar baseline matrix-vector multiply: `y = A * x`.
///
/// `matrix` is row-major with `rows * cols` elements. Each output entry is a
/// plain left-to-right dot product of one matrix row with `x`, accumulated in
/// a single f32 — the reference point the unrolled/SIMD variants are compared
/// against. Panics (via slice indexing) if any buffer is shorter than the
/// `rows`/`cols` arguments imply.
#[inline(never)]
fn matmul_naive(matrix: &[f32], x: &[f32], y: &mut [f32], rows: usize, cols: usize) {
    for r in 0..rows {
        let base = r * cols;
        let mut acc = 0.0f32;
        for c in 0..cols {
            acc += matrix[base + c] * x[c];
        }
        y[r] = acc;
    }
}
|
||||
|
||||
/// Scalar baseline squared Euclidean norm: `|v|^2`.
///
/// One accumulator, one multiply-add per element, strict left-to-right
/// summation order. Returns 0.0 for an empty slice.
#[inline(never)]
fn norm_sq_naive(v: &[f32]) -> f32 {
    let mut total = 0.0f32;
    let mut idx = 0;
    while idx < v.len() {
        total += v[idx] * v[idx];
        idx += 1;
    }
    total
}
|
||||
|
||||
/// Scalar baseline dot product: `a · b`.
///
/// Iterates over `a`'s indices, so it panics (via indexing) when `b` is
/// shorter than `a`; surplus elements of a longer `b` are ignored.
#[inline(never)]
fn dot_naive(a: &[f32], b: &[f32]) -> f32 {
    let mut total = 0.0f32;
    let mut idx = 0;
    while idx < a.len() {
        total += a[idx] * b[idx];
        idx += 1;
    }
    total
}
|
||||
|
||||
/// Scalar baseline squared residual: `|a - b|^2`.
///
/// Indexes `b` with `a`'s indices, so it panics when `b` is shorter than `a`;
/// extra elements of a longer `b` are ignored.
#[inline(never)]
fn residual_norm_naive(a: &[f32], b: &[f32]) -> f32 {
    let mut total = 0.0f32;
    let mut idx = 0;
    while idx < a.len() {
        let delta = a[idx] - b[idx];
        total += delta * delta;
        idx += 1;
    }
    total
}
|
||||
|
||||
/// Naive batch residual computation
|
||||
#[inline(never)]
|
||||
fn batch_residual_naive(sources: &[Vec<f32>], targets: &[Vec<f32>]) -> f32 {
|
||||
let mut total = 0.0f32;
|
||||
for (src, tgt) in sources.iter().zip(targets.iter()) {
|
||||
total += residual_norm_naive(src, tgt);
|
||||
}
|
||||
total
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// SIMD-FRIENDLY IMPLEMENTATIONS
|
||||
// ============================================================================
|
||||
|
||||
/// Unrolled matrix-vector multiply (auto-vectorization friendly)
///
/// Computes `y = A * x` (row-major `A`, `rows * cols` elements) like
/// `matmul_naive`, but accumulates each row's dot product into eight
/// independent scalar accumulators. Breaking the single floating-point
/// dependency chain this way lets the compiler keep several multiply-adds
/// in flight and maps naturally onto 8-lane SIMD.
/// Columns beyond the last multiple of 8 are handled by a scalar tail loop.
/// NOTE: merging the eight lanes changes the f32 summation order vs. the
/// naive version, so results may differ by rounding error.
#[inline(never)]
fn matmul_unrolled(matrix: &[f32], x: &[f32], y: &mut [f32], rows: usize, cols: usize) {
    for i in 0..rows {
        let row_start = i * cols;

        // Process in chunks of 8 — one accumulator per "lane".
        let chunks = cols / 8;
        let mut acc0 = 0.0f32;
        let mut acc1 = 0.0f32;
        let mut acc2 = 0.0f32;
        let mut acc3 = 0.0f32;
        let mut acc4 = 0.0f32;
        let mut acc5 = 0.0f32;
        let mut acc6 = 0.0f32;
        let mut acc7 = 0.0f32;

        for c in 0..chunks {
            let base = row_start + c * 8;
            acc0 += matrix[base] * x[c * 8];
            acc1 += matrix[base + 1] * x[c * 8 + 1];
            acc2 += matrix[base + 2] * x[c * 8 + 2];
            acc3 += matrix[base + 3] * x[c * 8 + 3];
            acc4 += matrix[base + 4] * x[c * 8 + 4];
            acc5 += matrix[base + 5] * x[c * 8 + 5];
            acc6 += matrix[base + 6] * x[c * 8 + 6];
            acc7 += matrix[base + 7] * x[c * 8 + 7];
        }

        // Horizontal reduction of the eight lane accumulators.
        let mut sum = acc0 + acc1 + acc2 + acc3 + acc4 + acc5 + acc6 + acc7;

        // Handle remainder (cols % 8 trailing columns) scalar-wise.
        for j in (chunks * 8)..cols {
            sum += matrix[row_start + j] * x[j];
        }

        y[i] = sum;
    }
}
|
||||
|
||||
/// Unrolled squared norm with 4 accumulators
///
/// Uses `chunks_exact(4)` so every chunk is provably 4 elements long
/// (allowing the compiler to drop per-element bounds checks) and four
/// independent accumulators to break the f32 add dependency chain.
/// The `len % 4` tail from `remainder()` is folded in scalar-wise.
/// The lane merge changes summation order vs. `norm_sq_naive`, so the
/// result can differ by rounding error.
#[inline(never)]
fn norm_sq_unrolled(v: &[f32]) -> f32 {
    let chunks = v.chunks_exact(4);
    let remainder = chunks.remainder();

    let mut acc0 = 0.0f32;
    let mut acc1 = 0.0f32;
    let mut acc2 = 0.0f32;
    let mut acc3 = 0.0f32;

    for chunk in chunks {
        acc0 += chunk[0] * chunk[0];
        acc1 += chunk[1] * chunk[1];
        acc2 += chunk[2] * chunk[2];
        acc3 += chunk[3] * chunk[3];
    }

    // Horizontal reduction, then the scalar tail.
    let mut sum = acc0 + acc1 + acc2 + acc3;
    for &x in remainder {
        sum += x * x;
    }
    sum
}
|
||||
|
||||
/// Unrolled squared norm with 8 accumulators (better for wider SIMD)
///
/// Same scheme as `norm_sq_unrolled` but with an 8-wide accumulator array,
/// matching 256-bit f32x8 registers (e.g. AVX2). `chunks_exact(8)` gives
/// fixed-size chunks the optimizer can vectorize without bounds checks;
/// the `len % 8` tail is handled scalar-wise. Lane merge order differs
/// from the naive sum, so rounding may differ slightly.
#[inline(never)]
fn norm_sq_unrolled_8(v: &[f32]) -> f32 {
    let chunks = v.chunks_exact(8);
    let remainder = chunks.remainder();

    let mut acc = [0.0f32; 8];

    for chunk in chunks {
        acc[0] += chunk[0] * chunk[0];
        acc[1] += chunk[1] * chunk[1];
        acc[2] += chunk[2] * chunk[2];
        acc[3] += chunk[3] * chunk[3];
        acc[4] += chunk[4] * chunk[4];
        acc[5] += chunk[5] * chunk[5];
        acc[6] += chunk[6] * chunk[6];
        acc[7] += chunk[7] * chunk[7];
    }

    // Horizontal reduction over the 8 lanes, then the scalar tail.
    let mut sum: f32 = acc.iter().sum();
    for &x in remainder {
        sum += x * x;
    }
    sum
}
|
||||
|
||||
/// Iterator-pipeline squared norm: `|v|^2` via a `map`/`sum` chain, relying
/// on the compiler to auto-vectorize the lazy pipeline. Returns 0.0 for an
/// empty slice.
#[inline(never)]
fn norm_sq_iter(v: &[f32]) -> f32 {
    v.iter().copied().map(|x| x * x).sum()
}
|
||||
|
||||
/// Unrolled dot product
///
/// Walks both slices in lockstep through `chunks_exact(4)` with four
/// independent accumulators (breaks the f32 add dependency chain and
/// enables bounds-check-free vectorization). Tail elements are paired by
/// `zip`, so — unlike `dot_naive`, which panics when `b` is shorter than
/// `a` — mismatched lengths silently truncate to the shorter slice.
/// Lane merge order differs from the naive sum (rounding may differ).
#[inline(never)]
fn dot_unrolled(a: &[f32], b: &[f32]) -> f32 {
    let chunks_a = a.chunks_exact(4);
    let chunks_b = b.chunks_exact(4);
    let rem_a = chunks_a.remainder();
    let rem_b = chunks_b.remainder();

    let mut acc0 = 0.0f32;
    let mut acc1 = 0.0f32;
    let mut acc2 = 0.0f32;
    let mut acc3 = 0.0f32;

    for (ca, cb) in chunks_a.zip(chunks_b) {
        acc0 += ca[0] * cb[0];
        acc1 += ca[1] * cb[1];
        acc2 += ca[2] * cb[2];
        acc3 += ca[3] * cb[3];
    }

    // Horizontal reduction, then the zipped scalar tails.
    let mut sum = acc0 + acc1 + acc2 + acc3;
    for (&a, &b) in rem_a.iter().zip(rem_b.iter()) {
        sum += a * b;
    }
    sum
}
|
||||
|
||||
/// Unrolled residual norm
///
/// Computes `|a - b|^2` with the same 4-accumulator `chunks_exact(4)`
/// pattern as `dot_unrolled`: differences are taken per lane, squared, and
/// accumulated independently to break the f32 dependency chain. Tails are
/// paired by `zip`, so mismatched slice lengths truncate to the shorter
/// slice instead of panicking like `residual_norm_naive`. Lane merge order
/// differs from the naive sum (rounding may differ).
#[inline(never)]
fn residual_norm_unrolled(a: &[f32], b: &[f32]) -> f32 {
    let chunks_a = a.chunks_exact(4);
    let chunks_b = b.chunks_exact(4);
    let rem_a = chunks_a.remainder();
    let rem_b = chunks_b.remainder();

    let mut acc0 = 0.0f32;
    let mut acc1 = 0.0f32;
    let mut acc2 = 0.0f32;
    let mut acc3 = 0.0f32;

    for (ca, cb) in chunks_a.zip(chunks_b) {
        let d0 = ca[0] - cb[0];
        let d1 = ca[1] - cb[1];
        let d2 = ca[2] - cb[2];
        let d3 = ca[3] - cb[3];
        acc0 += d0 * d0;
        acc1 += d1 * d1;
        acc2 += d2 * d2;
        acc3 += d3 * d3;
    }

    // Horizontal reduction, then the zipped scalar tails.
    let mut sum = acc0 + acc1 + acc2 + acc3;
    for (&a, &b) in rem_a.iter().zip(rem_b.iter()) {
        let d = a - b;
        sum += d * d;
    }
    sum
}
|
||||
|
||||
/// Batch residual with unrolled inner loop
|
||||
#[inline(never)]
|
||||
fn batch_residual_unrolled(sources: &[Vec<f32>], targets: &[Vec<f32>]) -> f32 {
|
||||
let mut total = 0.0f32;
|
||||
for (src, tgt) in sources.iter().zip(targets.iter()) {
|
||||
total += residual_norm_unrolled(src, tgt);
|
||||
}
|
||||
total
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// EXPLICIT SIMD (when wide crate is available)
|
||||
// ============================================================================
|
||||
|
||||
// Explicit-SIMD variants built on the `wide` crate's `f32x8` (8 f32 lanes,
// 256 bits). Only compiled when the `simd` cargo feature is enabled.
// All functions share one pattern: process `chunks_exact(8)` with a single
// vector accumulator, horizontally reduce with `reduce_add`, then finish
// the `len % 8` tail scalar-wise. Mismatched slice lengths truncate to the
// shorter slice (zip semantics). The vector lane merge changes f32
// summation order vs. the naive kernels, so results can differ by rounding.
#[cfg(feature = "simd")]
mod simd_impl {
    use wide::f32x8;

    /// SIMD squared norm using f32x8
    #[inline(never)]
    pub fn norm_sq_simd(v: &[f32]) -> f32 {
        let chunks = v.chunks_exact(8);
        let remainder = chunks.remainder();

        let mut acc = f32x8::ZERO;

        for chunk in chunks {
            // chunks_exact guarantees len 8, so try_from cannot fail here.
            let vals = f32x8::from(<[f32; 8]>::try_from(chunk).unwrap());
            acc += vals * vals;
        }

        // Horizontal lane reduction, then scalar tail.
        let mut sum: f32 = acc.reduce_add();
        for &x in remainder {
            sum += x * x;
        }
        sum
    }

    /// SIMD dot product using f32x8
    #[inline(never)]
    pub fn dot_simd(a: &[f32], b: &[f32]) -> f32 {
        let chunks_a = a.chunks_exact(8);
        let chunks_b = b.chunks_exact(8);
        let rem_a = chunks_a.remainder();
        let rem_b = chunks_b.remainder();

        let mut acc = f32x8::ZERO;

        for (ca, cb) in chunks_a.zip(chunks_b) {
            let va = f32x8::from(<[f32; 8]>::try_from(ca).unwrap());
            let vb = f32x8::from(<[f32; 8]>::try_from(cb).unwrap());
            acc += va * vb;
        }

        let mut sum: f32 = acc.reduce_add();
        for (&a, &b) in rem_a.iter().zip(rem_b.iter()) {
            sum += a * b;
        }
        sum
    }

    /// SIMD residual norm using f32x8
    #[inline(never)]
    pub fn residual_norm_simd(a: &[f32], b: &[f32]) -> f32 {
        let chunks_a = a.chunks_exact(8);
        let chunks_b = b.chunks_exact(8);
        let rem_a = chunks_a.remainder();
        let rem_b = chunks_b.remainder();

        let mut acc = f32x8::ZERO;

        for (ca, cb) in chunks_a.zip(chunks_b) {
            let va = f32x8::from(<[f32; 8]>::try_from(ca).unwrap());
            let vb = f32x8::from(<[f32; 8]>::try_from(cb).unwrap());
            let diff = va - vb;
            acc += diff * diff;
        }

        let mut sum: f32 = acc.reduce_add();
        for (&a, &b) in rem_a.iter().zip(rem_b.iter()) {
            let d = a - b;
            sum += d * d;
        }
        sum
    }

    /// SIMD matrix-vector multiply
    ///
    /// Row-major `matrix`; each row is dotted with `x` using the same
    /// vector-accumulator pattern as `dot_simd`.
    #[inline(never)]
    pub fn matmul_simd(matrix: &[f32], x: &[f32], y: &mut [f32], rows: usize, cols: usize) {
        for i in 0..rows {
            let row_start = i * cols;
            let row = &matrix[row_start..row_start + cols];

            let chunks_m = row.chunks_exact(8);
            let chunks_x = x.chunks_exact(8);
            let rem_m = chunks_m.remainder();
            let rem_x = chunks_x.remainder();

            let mut acc = f32x8::ZERO;

            for (cm, cx) in chunks_m.zip(chunks_x) {
                let vm = f32x8::from(<[f32; 8]>::try_from(cm).unwrap());
                let vx = f32x8::from(<[f32; 8]>::try_from(cx).unwrap());
                acc += vm * vx;
            }

            let mut sum: f32 = acc.reduce_add();
            for (&m, &xv) in rem_m.iter().zip(rem_x.iter()) {
                sum += m * xv;
            }

            y[i] = sum;
        }
    }

    /// SIMD batch residual
    ///
    /// Sums `residual_norm_simd` over positionally-paired vectors; surplus
    /// vectors in the longer batch are ignored (zip semantics).
    #[inline(never)]
    pub fn batch_residual_simd(sources: &[Vec<f32>], targets: &[Vec<f32>]) -> f32 {
        let mut total = 0.0f32;
        for (src, tgt) in sources.iter().zip(targets.iter()) {
            total += residual_norm_simd(src, tgt);
        }
        total
    }
}
|
||||
|
||||
// ============================================================================
|
||||
// DENSE MATRIX MULTIPLY BENCHMARKS
|
||||
// ============================================================================
|
||||
|
||||
/// Benchmarks square matrix-vector multiply (naive vs unrolled vs SIMD)
/// under the `simd_matmul` group. Throughput is reported per matrix
/// element (`size * size`). The output buffer `y` is reused across
/// iterations; `black_box(y[0])` keeps the result observable so the
/// optimizer cannot discard the work.
fn bench_dense_matmul(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_matmul");

    // Test matrix sizes: 64x64, 128x128, 256x256
    for size in [64, 128, 256] {
        let matrix = generate_matrix(size, size, 42);
        let x = generate_vec(size, 123);
        let mut y = vec![0.0f32; size];

        group.throughput(Throughput::Elements((size * size) as u64));

        group.bench_with_input(BenchmarkId::new("naive", size), &size, |b, _| {
            b.iter(|| {
                matmul_naive(black_box(&matrix), black_box(&x), &mut y, size, size);
                black_box(y[0])
            })
        });

        group.bench_with_input(BenchmarkId::new("unrolled", size), &size, |b, _| {
            b.iter(|| {
                matmul_unrolled(black_box(&matrix), black_box(&x), &mut y, size, size);
                black_box(y[0])
            })
        });

        // Only measured when the `simd` cargo feature is on.
        #[cfg(feature = "simd")]
        group.bench_with_input(BenchmarkId::new("simd", size), &size, |b, _| {
            b.iter(|| {
                simd_impl::matmul_simd(black_box(&matrix), black_box(&x), &mut y, size, size);
                black_box(y[0])
            })
        });
    }

    group.finish();
}
|
||||
|
||||
/// Benchmark non-square matrix multiply (projection)
///
/// Same three implementations as `bench_dense_matmul`, but with rectangular
/// `out_dim x in_dim` matrices (each halves the dimension), grouped under
/// `simd_matmul_projection`. Throughput is per matrix element.
fn bench_projection_matmul(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_matmul_projection");

    // Common projection sizes in coherence: 64->32, 128->64, 256->128
    for (in_dim, out_dim) in [(64, 32), (128, 64), (256, 128)] {
        let matrix = generate_matrix(out_dim, in_dim, 42);
        let x = generate_vec(in_dim, 123);
        let mut y = vec![0.0f32; out_dim];

        group.throughput(Throughput::Elements((out_dim * in_dim) as u64));

        group.bench_with_input(
            BenchmarkId::new("naive", format!("{}x{}", in_dim, out_dim)),
            &(in_dim, out_dim),
            |b, _| {
                b.iter(|| {
                    matmul_naive(black_box(&matrix), black_box(&x), &mut y, out_dim, in_dim);
                    black_box(y[0])
                })
            },
        );

        group.bench_with_input(
            BenchmarkId::new("unrolled", format!("{}x{}", in_dim, out_dim)),
            &(in_dim, out_dim),
            |b, _| {
                b.iter(|| {
                    matmul_unrolled(black_box(&matrix), black_box(&x), &mut y, out_dim, in_dim);
                    black_box(y[0])
                })
            },
        );

        // Only measured when the `simd` cargo feature is on.
        #[cfg(feature = "simd")]
        group.bench_with_input(
            BenchmarkId::new("simd", format!("{}x{}", in_dim, out_dim)),
            &(in_dim, out_dim),
            |b, _| {
                b.iter(|| {
                    simd_impl::matmul_simd(
                        black_box(&matrix),
                        black_box(&x),
                        &mut y,
                        out_dim,
                        in_dim,
                    );
                    black_box(y[0])
                })
            },
        );
    }

    group.finish();
}
|
||||
|
||||
// ============================================================================
|
||||
// NORM COMPUTATION BENCHMARKS
|
||||
// ============================================================================
|
||||
|
||||
/// Benchmarks the squared-norm kernels (naive loop, iterator chain,
/// 4-way unroll, 8-way unroll, and f32x8 SIMD when enabled) under the
/// `simd_norm` group. Throughput is per vector element.
fn bench_norm_computation(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_norm");

    // Test dimensions aligned for SIMD
    for dim in [64, 128, 256, 512, 1024] {
        let v = generate_vec(dim, 42);

        group.throughput(Throughput::Elements(dim as u64));

        group.bench_with_input(BenchmarkId::new("naive", dim), &dim, |b, _| {
            b.iter(|| black_box(norm_sq_naive(black_box(&v))))
        });

        group.bench_with_input(BenchmarkId::new("iter", dim), &dim, |b, _| {
            b.iter(|| black_box(norm_sq_iter(black_box(&v))))
        });

        group.bench_with_input(BenchmarkId::new("unrolled_4", dim), &dim, |b, _| {
            b.iter(|| black_box(norm_sq_unrolled(black_box(&v))))
        });

        group.bench_with_input(BenchmarkId::new("unrolled_8", dim), &dim, |b, _| {
            b.iter(|| black_box(norm_sq_unrolled_8(black_box(&v))))
        });

        // Only measured when the `simd` cargo feature is on.
        #[cfg(feature = "simd")]
        group.bench_with_input(BenchmarkId::new("simd_f32x8", dim), &dim, |b, _| {
            b.iter(|| black_box(simd_impl::norm_sq_simd(black_box(&v))))
        });
    }

    group.finish();
}
|
||||
|
||||
// ============================================================================
|
||||
// DOT PRODUCT BENCHMARKS
|
||||
// ============================================================================
|
||||
|
||||
/// Benchmarks the dot-product kernels (naive, unrolled, SIMD) under the
/// `simd_dot` group; throughput per element. The closure parameter is
/// named `b_iter` because `b` is taken by the second input vector.
fn bench_dot_product(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_dot");

    for dim in [64, 256, 1024] {
        let a = generate_vec(dim, 42);
        let b = generate_vec(dim, 123);

        group.throughput(Throughput::Elements(dim as u64));

        group.bench_with_input(BenchmarkId::new("naive", dim), &dim, |b_iter, _| {
            b_iter.iter(|| black_box(dot_naive(black_box(&a), black_box(&b))))
        });

        group.bench_with_input(BenchmarkId::new("unrolled", dim), &dim, |b_iter, _| {
            b_iter.iter(|| black_box(dot_unrolled(black_box(&a), black_box(&b))))
        });

        // Only measured when the `simd` cargo feature is on.
        #[cfg(feature = "simd")]
        group.bench_with_input(BenchmarkId::new("simd", dim), &dim, |b_iter, _| {
            b_iter.iter(|| black_box(simd_impl::dot_simd(black_box(&a), black_box(&b))))
        });
    }

    group.finish();
}
|
||||
|
||||
// ============================================================================
|
||||
// RESIDUAL NORM BENCHMARKS (CORE COHERENCE OPERATION)
|
||||
// ============================================================================
|
||||
|
||||
/// Benchmarks the single-pair residual-norm kernels (naive, unrolled,
/// SIMD) under the `simd_residual_norm` group; throughput per element.
fn bench_residual_norm(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_residual_norm");

    for dim in [64, 256, 1024] {
        let a = generate_vec(dim, 42);
        let b = generate_vec(dim, 123);

        group.throughput(Throughput::Elements(dim as u64));

        group.bench_with_input(BenchmarkId::new("naive", dim), &dim, |b_iter, _| {
            b_iter.iter(|| black_box(residual_norm_naive(black_box(&a), black_box(&b))))
        });

        group.bench_with_input(BenchmarkId::new("unrolled", dim), &dim, |b_iter, _| {
            b_iter.iter(|| black_box(residual_norm_unrolled(black_box(&a), black_box(&b))))
        });

        // Only measured when the `simd` cargo feature is on.
        #[cfg(feature = "simd")]
        group.bench_with_input(BenchmarkId::new("simd", dim), &dim, |b_iter, _| {
            b_iter.iter(|| black_box(simd_impl::residual_norm_simd(black_box(&a), black_box(&b))))
        });
    }

    group.finish();
}
|
||||
|
||||
// ============================================================================
|
||||
// BATCH RESIDUAL BENCHMARKS
|
||||
// ============================================================================
|
||||
|
||||
/// Benchmarks batch residual accumulation over 100/1000/10000 pairs of
/// 64-dim vectors, under the `simd_batch_residual` group. Throughput is
/// per vector pair (not per element). Each source/target vector gets a
/// distinct hash seed so pairs are not trivially equal.
fn bench_batch_residual(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_batch_residual");

    let dim = 64;

    for batch_size in [100, 1000, 10000] {
        let sources: Vec<Vec<f32>> = (0..batch_size)
            .map(|i| generate_vec(dim, i as u64))
            .collect();
        // Offset the seed so targets differ from sources.
        let targets: Vec<Vec<f32>> = (0..batch_size)
            .map(|i| generate_vec(dim, i as u64 + 10000))
            .collect();

        group.throughput(Throughput::Elements(batch_size as u64));

        group.bench_with_input(
            BenchmarkId::new("naive", batch_size),
            &batch_size,
            |b, _| {
                b.iter(|| {
                    black_box(batch_residual_naive(
                        black_box(&sources),
                        black_box(&targets),
                    ))
                })
            },
        );

        group.bench_with_input(
            BenchmarkId::new("unrolled", batch_size),
            &batch_size,
            |b, _| {
                b.iter(|| {
                    black_box(batch_residual_unrolled(
                        black_box(&sources),
                        black_box(&targets),
                    ))
                })
            },
        );

        // Only measured when the `simd` cargo feature is on.
        #[cfg(feature = "simd")]
        group.bench_with_input(BenchmarkId::new("simd", batch_size), &batch_size, |b, _| {
            b.iter(|| {
                black_box(simd_impl::batch_residual_simd(
                    black_box(&sources),
                    black_box(&targets),
                ))
            })
        });
    }

    group.finish();
}
|
||||
|
||||
// ============================================================================
|
||||
// MEMORY ALIGNMENT BENCHMARKS
|
||||
// ============================================================================
|
||||
|
||||
/// Measures how `norm_sq_unrolled_8` behaves when the vector length is a
/// multiple of 8 (no scalar tail), slightly off (scalar tail of 3), or
/// smaller than one chunk (pure scalar path). Note this probes length
/// alignment of the chunking, not memory-address alignment of the buffer.
fn bench_alignment_impact(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_alignment");

    let dim = 256;

    // Aligned (multiple of 8)
    {
        let v = generate_vec(dim, 42);
        group.bench_function("aligned_256", |b| {
            b.iter(|| black_box(norm_sq_unrolled_8(black_box(&v))))
        });
    }

    // Misaligned (not multiple of 8) — forces the remainder loop.
    {
        let v = generate_vec(dim + 3, 42);
        group.bench_function("misaligned_259", |b| {
            b.iter(|| black_box(norm_sq_unrolled_8(black_box(&v))))
        });
    }

    // Small vector (below SIMD threshold) — chunks_exact(8) yields nothing,
    // so only the scalar tail runs.
    {
        let v = generate_vec(7, 42);
        group.bench_function("small_7", |b| {
            b.iter(|| black_box(norm_sq_unrolled_8(black_box(&v))))
        });
    }

    group.finish();
}
|
||||
|
||||
// ============================================================================
|
||||
// THROUGHPUT SCALING BENCHMARKS
|
||||
// ============================================================================
|
||||
|
||||
/// Measures how residual-norm throughput scales with vector size (16 to
/// 4096 elements), under the `simd_throughput_scaling` group. Throughput
/// is reported in bytes: both input vectors at 4 bytes per f32.
fn bench_throughput_scaling(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_throughput_scaling");

    // Test how throughput scales with vector size
    let sizes = [16, 32, 64, 128, 256, 512, 1024, 2048, 4096];

    for &size in &sizes {
        let a = generate_vec(size, 42);
        let b = generate_vec(size, 123);

        group.throughput(Throughput::Bytes((size * 4 * 2) as u64)); // 2 vectors, 4 bytes each

        group.bench_with_input(
            BenchmarkId::new("residual_unrolled", size),
            &size,
            |bench, _| {
                bench.iter(|| black_box(residual_norm_unrolled(black_box(&a), black_box(&b))))
            },
        );

        // Only measured when the `simd` cargo feature is on.
        #[cfg(feature = "simd")]
        group.bench_with_input(
            BenchmarkId::new("residual_simd", size),
            &size,
            |bench, _| {
                bench
                    .iter(|| black_box(simd_impl::residual_norm_simd(black_box(&a), black_box(&b))))
            },
        );
    }

    group.finish();
}
|
||||
|
||||
// ============================================================================
|
||||
// COHERENCE-SPECIFIC SIMD PATTERNS
|
||||
// ============================================================================
|
||||
|
||||
/// Fused multiply-add pattern for coherence energy
///
/// Compares a weighted squared-residual computed two ways: a plain scalar
/// loop with separate subtract/multiply/add, versus a 4-way unrolled loop
/// using `f32::mul_add`, which the compiler can lower to hardware FMA.
/// Note `mul_add` rounds once (fused), so the two variants can differ
/// slightly in the last bits as well as in speed. The `fma_friendly` path
/// assumes `dim` is a multiple of 4 (dim = 256 here; no tail loop).
fn bench_fma_pattern(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_fma_pattern");

    let dim = 256;
    let a = generate_vec(dim, 42);
    let b = generate_vec(dim, 123);
    let weight = 1.5f32;

    // Without FMA (separate multiply and add)
    group.bench_function("separate_ops", |bench| {
        bench.iter(|| {
            let mut sum = 0.0f32;
            for i in 0..dim {
                let diff = a[i] - b[i];
                let sq = diff * diff;
                sum += sq;
            }
            black_box(weight * sum)
        })
    });

    // With potential FMA (compiler may optimize)
    group.bench_function("fma_friendly", |bench| {
        bench.iter(|| {
            let mut acc0 = 0.0f32;
            let mut acc1 = 0.0f32;
            let mut acc2 = 0.0f32;
            let mut acc3 = 0.0f32;

            let chunks = dim / 4;
            for c in 0..chunks {
                let base = c * 4;
                let d0 = a[base] - b[base];
                let d1 = a[base + 1] - b[base + 1];
                let d2 = a[base + 2] - b[base + 2];
                let d3 = a[base + 3] - b[base + 3];

                // These can become FMA operations
                acc0 = d0.mul_add(d0, acc0);
                acc1 = d1.mul_add(d1, acc1);
                acc2 = d2.mul_add(d2, acc2);
                acc3 = d3.mul_add(d3, acc3);
            }

            black_box(weight * (acc0 + acc1 + acc2 + acc3))
        })
    });

    group.finish();
}
|
||||
|
||||
// ============================================================================
|
||||
// CRITERION CONFIGURATION
|
||||
// ============================================================================
|
||||
|
||||
// Matrix-vector multiply comparisons (square and projection shapes).
criterion_group!(matmul_benches, bench_dense_matmul, bench_projection_matmul,);

// Element-wise vector kernels: norms, dot products, single-pair residuals.
criterion_group!(
    vector_ops_benches,
    bench_norm_computation,
    bench_dot_product,
    bench_residual_norm,
);

// Batched residual accumulation over many vector pairs.
criterion_group!(batch_benches, bench_batch_residual,);

// Cross-cutting studies: length alignment, size scaling, FMA lowering.
criterion_group!(
    optimization_benches,
    bench_alignment_impact,
    bench_throughput_scaling,
    bench_fma_pattern,
);

// Entry point: runs all groups when the bench binary is executed.
criterion_main!(
    matmul_benches,
    vector_ops_benches,
    batch_benches,
    optimization_benches
);
|
||||
Reference in New Issue
Block a user