Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'
This commit is contained in:
1228
vendor/ruvector/crates/ruvllm/benches/ane_bench.rs
vendored
Normal file
1228
vendor/ruvector/crates/ruvllm/benches/ane_bench.rs
vendored
Normal file
File diff suppressed because it is too large
Load Diff
738
vendor/ruvector/crates/ruvllm/benches/attention_bench.rs
vendored
Normal file
738
vendor/ruvector/crates/ruvllm/benches/attention_bench.rs
vendored
Normal file
@@ -0,0 +1,738 @@
|
||||
#![allow(
|
||||
clippy::all,
|
||||
unused_imports,
|
||||
unused_variables,
|
||||
dead_code,
|
||||
unused_mut,
|
||||
unused_assignments,
|
||||
non_camel_case_types,
|
||||
clippy::approx_constant,
|
||||
unexpected_cfgs,
|
||||
unused_must_use,
|
||||
unused_parens
|
||||
)]
|
||||
//! Attention Kernel Benchmarks for M4 Pro
|
||||
//!
|
||||
//! Benchmarks for Flash Attention 2, Paged Attention, MQA, and GQA implementations.
|
||||
//!
|
||||
//! Performance targets for M4 Pro:
|
||||
//! - Flash attention (256 seq): <2ms
|
||||
//! - Flash attention (512 seq): <5ms
|
||||
//! - Flash attention (1024 seq): <15ms
|
||||
//! - Paged attention: Similar to flash attention + 10% overhead
|
||||
|
||||
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
|
||||
use rand::Rng;
|
||||
|
||||
// Re-create the kernel functions inline since we can't import from the crate easily in benches
|
||||
// In production, these would be imported from ruvllm::kernels
|
||||
|
||||
/// SIMD lane width for NEON (128-bit = 4 floats)
const NEON_LANE_WIDTH: usize = 4;
/// Manual unroll factor for the inner dot-product loops (4 NEON vectors,
/// i.e. 16 floats, per unrolled iteration).
const UNROLL_FACTOR: usize = 4;
|
||||
|
||||
/// Paged KV cache for efficient memory management.
///
/// Keys and values are stored in fixed-size blocks of `block_size` tokens;
/// a new block pair is allocated lazily whenever the current one fills up.
/// Layout within a block is `[token][kv_head][head_dim]`.
#[derive(Clone)]
struct PagedKvCache {
    /// One entry per allocated block; each holds `block_size * num_kv_heads * head_dim` floats.
    key_blocks: Vec<Vec<f32>>,
    value_blocks: Vec<Vec<f32>>,
    /// Tokens per block.
    block_size: usize,
    num_kv_heads: usize,
    head_dim: usize,
    /// Total tokens appended so far (append cursor).
    num_tokens: usize,
}

impl PagedKvCache {
    /// Creates an empty cache; no blocks are allocated until the first `append`.
    fn new(block_size: usize, num_kv_heads: usize, head_dim: usize) -> Self {
        Self {
            key_blocks: Vec::new(),
            value_blocks: Vec::new(),
            block_size,
            num_kv_heads,
            head_dim,
            num_tokens: 0,
        }
    }

    /// Appends one or more tokens' worth of keys/values.
    ///
    /// `keys` and `values` are laid out as `[token][kv_head][head_dim]`; their
    /// length is expected to be a multiple of `num_kv_heads * head_dim`
    /// (trailing partial tokens are ignored by the integer division).
    fn append(&mut self, keys: &[f32], values: &[f32]) {
        let stride = self.num_kv_heads * self.head_dim;
        let num_tokens = keys.len() / stride;

        for i in 0..num_tokens {
            let offset = i * stride;

            // Allocate a fresh block pair whenever the previous one is exactly full.
            if self.num_tokens % self.block_size == 0 {
                let block_capacity = self.block_size * stride;
                self.key_blocks.push(vec![0.0; block_capacity]);
                self.value_blocks.push(vec![0.0; block_capacity]);
            }

            let block_idx = self.num_tokens / self.block_size;
            let pos_in_block = (self.num_tokens % self.block_size) * stride;

            self.key_blocks[block_idx][pos_in_block..pos_in_block + stride]
                .copy_from_slice(&keys[offset..offset + stride]);
            self.value_blocks[block_idx][pos_in_block..pos_in_block + stride]
                .copy_from_slice(&values[offset..offset + stride]);

            self.num_tokens += 1;
        }
    }

    /// Gathers the valid prefix of every block into one contiguous buffer.
    ///
    /// Shared implementation for `get_keys`/`get_values`, which previously
    /// duplicated this logic verbatim.
    fn gather(&self, blocks: &[Vec<f32>]) -> Vec<f32> {
        let stride = self.num_kv_heads * self.head_dim;
        let mut result = Vec::with_capacity(self.num_tokens * stride);
        for (block_idx, block) in blocks.iter().enumerate() {
            // Only the last block can be partially filled.
            let tokens_in_block = if block_idx == blocks.len() - 1 {
                let rem = self.num_tokens % self.block_size;
                if rem == 0 {
                    self.block_size
                } else {
                    rem
                }
            } else {
                self.block_size
            };
            result.extend_from_slice(&block[..tokens_in_block * stride]);
        }
        result
    }

    /// Returns all cached keys as one contiguous `[token][kv_head][head_dim]` buffer.
    fn get_keys(&self) -> Vec<f32> {
        self.gather(&self.key_blocks)
    }

    /// Returns all cached values as one contiguous `[token][kv_head][head_dim]` buffer.
    fn get_values(&self) -> Vec<f32> {
        self.gather(&self.value_blocks)
    }
}
|
||||
|
||||
/// Attention configuration shared by the MQA/GQA benchmark kernels.
#[derive(Clone, Copy)]
struct AttentionConfig {
    num_heads: usize,
    num_kv_heads: usize,
    head_dim: usize,
    max_seq_len: usize,
    causal: bool,
    /// Softmax scale; 0.0 means "derive 1/sqrt(head_dim)" (see `effective_scale`).
    scale: f32,
}

impl Default for AttentionConfig {
    /// Llama-style defaults: 32 query heads sharing 8 KV heads of dim 128.
    fn default() -> Self {
        AttentionConfig {
            num_heads: 32,
            num_kv_heads: 8,
            head_dim: 128,
            max_seq_len: 4096,
            causal: true,
            scale: 0.0,
        }
    }
}

impl AttentionConfig {
    /// Resolves the softmax scale, falling back to 1/sqrt(head_dim) when the
    /// configured scale is the 0.0 sentinel.
    fn effective_scale(&self) -> f32 {
        match self.scale {
            s if s == 0.0 => 1.0 / (self.head_dim as f32).sqrt(),
            s => s,
        }
    }

    /// Number of query heads sharing each KV head.
    fn gqa_ratio(&self) -> usize {
        self.num_heads / self.num_kv_heads
    }
}
|
||||
|
||||
/// Flash Attention 2 with NEON SIMD optimization
|
||||
#[inline(always)]
|
||||
fn flash_attention_neon(
|
||||
query: &[f32],
|
||||
key: &[f32],
|
||||
value: &[f32],
|
||||
scale: f32,
|
||||
causal: bool,
|
||||
) -> Vec<f32> {
|
||||
let head_dim = if !query.is_empty() && !key.is_empty() {
|
||||
query.len()
|
||||
} else {
|
||||
return vec![];
|
||||
};
|
||||
|
||||
let kv_len = key.len() / head_dim;
|
||||
if kv_len == 0 {
|
||||
return vec![0.0; head_dim];
|
||||
}
|
||||
|
||||
#[cfg(target_arch = "aarch64")]
|
||||
unsafe {
|
||||
flash_attention_neon_impl(query, key, value, head_dim, kv_len, scale, causal)
|
||||
}
|
||||
|
||||
#[cfg(not(target_arch = "aarch64"))]
|
||||
{
|
||||
flash_attention_scalar(query, key, value, head_dim, kv_len, scale, causal)
|
||||
}
|
||||
}
|
||||
|
||||
/// Core NEON flash-attention loop: one query against `kv_len` K/V rows using
/// an online (streaming) softmax, so per-row scores are never materialized.
///
/// Running state: `max_score` (largest score seen), `sum_exp` (softmax
/// normalizer relative to `max_score`), and `output` (unnormalized weighted
/// sum of value rows, rescaled whenever a new maximum appears).
///
/// # Safety
/// Caller must guarantee `query.len() >= head_dim` and that `key` and `value`
/// each contain at least `kv_len * head_dim` elements; every raw-pointer
/// access below stays inside those bounds. aarch64 only.
#[cfg(target_arch = "aarch64")]
#[inline(always)]
unsafe fn flash_attention_neon_impl(
    query: &[f32],
    key: &[f32],
    value: &[f32],
    head_dim: usize,
    kv_len: usize,
    scale: f32,
    _causal: bool, // NOTE(review): unused — with a single query all KV rows are visible
) -> Vec<f32> {
    use std::arch::aarch64::*;

    let q_ptr = query.as_ptr();
    let k_ptr = key.as_ptr();
    let v_ptr = value.as_ptr();

    // Online-softmax running state.
    let mut max_score = f32::NEG_INFINITY;
    let mut sum_exp = 0.0f32;
    let mut output = vec![0.0f32; head_dim];
    let out_ptr = output.as_mut_ptr();

    let scale_vec = vdupq_n_f32(scale);

    for t in 0..kv_len {
        let k_offset = t * head_dim;

        // --- q · k[t]: 4x-unrolled NEON FMA, 16 floats per iteration ---
        let mut dot = vdupq_n_f32(0.0);
        let chunks = head_dim / (NEON_LANE_WIDTH * UNROLL_FACTOR);

        let mut idx = 0usize;
        for _ in 0..chunks {
            let q0 = vld1q_f32(q_ptr.add(idx));
            let k0 = vld1q_f32(k_ptr.add(k_offset + idx));
            dot = vfmaq_f32(dot, q0, k0);

            let q1 = vld1q_f32(q_ptr.add(idx + 4));
            let k1 = vld1q_f32(k_ptr.add(k_offset + idx + 4));
            dot = vfmaq_f32(dot, q1, k1);

            let q2 = vld1q_f32(q_ptr.add(idx + 8));
            let k2 = vld1q_f32(k_ptr.add(k_offset + idx + 8));
            dot = vfmaq_f32(dot, q2, k2);

            let q3 = vld1q_f32(q_ptr.add(idx + 12));
            let k3 = vld1q_f32(k_ptr.add(k_offset + idx + 12));
            dot = vfmaq_f32(dot, q3, k3);

            idx += 16;
        }

        // Leftover 4-wide chunks after unrolling.
        let remaining_chunks = (head_dim - idx) / NEON_LANE_WIDTH;
        for _ in 0..remaining_chunks {
            let q_v = vld1q_f32(q_ptr.add(idx));
            let k_v = vld1q_f32(k_ptr.add(k_offset + idx));
            dot = vfmaq_f32(dot, q_v, k_v);
            idx += 4;
        }

        // Horizontal sum of the scaled accumulator lanes.
        let mut score = vaddvq_f32(vmulq_f32(dot, scale_vec));

        // Scalar tail (head_dim not a multiple of 4).
        for i in idx..head_dim {
            score += *q_ptr.add(i) * *k_ptr.add(k_offset + i) * scale;
        }

        if score > max_score {
            // New maximum: rescale the accumulated output and normalizer by
            // exp(old_max - new_max) so both stay relative to the new maximum.
            let exp_diff = (max_score - score).exp();
            sum_exp = sum_exp * exp_diff + 1.0; // +1.0 == exp(score - new_max)
            max_score = score;

            let rescale = vdupq_n_f32(exp_diff);
            let mut out_idx = 0usize;
            let out_chunks = head_dim / NEON_LANE_WIDTH;
            for _ in 0..out_chunks {
                let out_v = vld1q_f32(out_ptr.add(out_idx));
                vst1q_f32(out_ptr.add(out_idx), vmulq_f32(out_v, rescale));
                out_idx += 4;
            }
            for i in out_idx..head_dim {
                *out_ptr.add(i) *= exp_diff;
            }
        } else {
            sum_exp += (score - max_score).exp();
        }

        // Accumulate weight * v[t] into the output. On the new-maximum path
        // score == max_score, so weight is exactly 1.0 there.
        let weight = (score - max_score).exp();
        let weight_vec = vdupq_n_f32(weight);

        let mut out_idx = 0usize;
        let out_chunks = head_dim / (NEON_LANE_WIDTH * UNROLL_FACTOR);
        for _ in 0..out_chunks {
            let v0 = vld1q_f32(v_ptr.add(t * head_dim + out_idx));
            let o0 = vld1q_f32(out_ptr.add(out_idx));
            vst1q_f32(out_ptr.add(out_idx), vfmaq_f32(o0, v0, weight_vec));

            let v1 = vld1q_f32(v_ptr.add(t * head_dim + out_idx + 4));
            let o1 = vld1q_f32(out_ptr.add(out_idx + 4));
            vst1q_f32(out_ptr.add(out_idx + 4), vfmaq_f32(o1, v1, weight_vec));

            let v2 = vld1q_f32(v_ptr.add(t * head_dim + out_idx + 8));
            let o2 = vld1q_f32(out_ptr.add(out_idx + 8));
            vst1q_f32(out_ptr.add(out_idx + 8), vfmaq_f32(o2, v2, weight_vec));

            let v3 = vld1q_f32(v_ptr.add(t * head_dim + out_idx + 12));
            let o3 = vld1q_f32(out_ptr.add(out_idx + 12));
            vst1q_f32(out_ptr.add(out_idx + 12), vfmaq_f32(o3, v3, weight_vec));

            out_idx += 16;
        }

        let remaining_out = (head_dim - out_idx) / NEON_LANE_WIDTH;
        for _ in 0..remaining_out {
            let v_v = vld1q_f32(v_ptr.add(t * head_dim + out_idx));
            let o_v = vld1q_f32(out_ptr.add(out_idx));
            vst1q_f32(out_ptr.add(out_idx), vfmaq_f32(o_v, v_v, weight_vec));
            out_idx += 4;
        }

        // Scalar tail of the value accumulation.
        for i in out_idx..head_dim {
            *out_ptr.add(i) += weight * *v_ptr.add(t * head_dim + i);
        }
    }

    // Final normalization by the softmax denominator.
    if sum_exp > 0.0 {
        let inv_sum = 1.0 / sum_exp;
        let inv_sum_vec = vdupq_n_f32(inv_sum);

        let mut idx = 0usize;
        let chunks = head_dim / NEON_LANE_WIDTH;
        for _ in 0..chunks {
            let o = vld1q_f32(out_ptr.add(idx));
            vst1q_f32(out_ptr.add(idx), vmulq_f32(o, inv_sum_vec));
            idx += 4;
        }
        for i in idx..head_dim {
            *out_ptr.add(i) *= inv_sum;
        }
    }

    output
}
|
||||
|
||||
/// Scalar reference implementation of single-query attention.
///
/// Computes `softmax(q·Kᵀ · scale) · V` with the numerically stable
/// max-subtraction formulation. `_causal` is accepted for signature parity
/// with the NEON path but ignored (one query attends to every row).
#[allow(dead_code)]
fn flash_attention_scalar(
    query: &[f32],
    key: &[f32],
    value: &[f32],
    head_dim: usize,
    kv_len: usize,
    scale: f32,
    _causal: bool,
) -> Vec<f32> {
    // Raw scaled dot-product scores, one per KV row.
    let scores: Vec<f32> = (0..kv_len)
        .map(|t| {
            let row = &key[t * head_dim..(t + 1) * head_dim];
            query.iter().zip(row).map(|(q, k)| q * k * scale).sum()
        })
        .collect();

    // Numerically stable softmax.
    let max_score = scores.iter().copied().fold(f32::NEG_INFINITY, f32::max);
    let exp_scores: Vec<f32> = scores.iter().map(|s| (s - max_score).exp()).collect();
    let sum_exp: f32 = exp_scores.iter().sum();
    let attn_weights: Vec<f32> = exp_scores.iter().map(|e| e / sum_exp).collect();

    // Weighted sum of value rows.
    let mut output = vec![0.0; head_dim];
    for (t, weight) in attn_weights.iter().enumerate() {
        let row = &value[t * head_dim..(t + 1) * head_dim];
        for (out, v) in output.iter_mut().zip(row) {
            *out += weight * v;
        }
    }

    output
}
|
||||
|
||||
fn paged_attention_neon(
|
||||
query: &[f32],
|
||||
kv_cache: &PagedKvCache,
|
||||
_block_tables: &[usize],
|
||||
scale: f32,
|
||||
) -> Vec<f32> {
|
||||
if kv_cache.num_tokens == 0 {
|
||||
return vec![0.0; query.len()];
|
||||
}
|
||||
|
||||
let keys = kv_cache.get_keys();
|
||||
let values = kv_cache.get_values();
|
||||
|
||||
flash_attention_neon(query, &keys, &values, scale, false)
|
||||
}
|
||||
|
||||
fn multi_query_attention_neon(
|
||||
queries: &[f32],
|
||||
key: &[f32],
|
||||
value: &[f32],
|
||||
config: &AttentionConfig,
|
||||
) -> Vec<f32> {
|
||||
let head_dim = config.head_dim;
|
||||
let num_heads = config.num_heads;
|
||||
let scale = config.effective_scale();
|
||||
|
||||
let mut output = vec![0.0; num_heads * head_dim];
|
||||
|
||||
for h in 0..num_heads {
|
||||
let q_offset = h * head_dim;
|
||||
let q_slice = &queries[q_offset..q_offset + head_dim];
|
||||
|
||||
let head_output = flash_attention_neon(q_slice, key, value, scale, config.causal);
|
||||
|
||||
output[q_offset..q_offset + head_dim].copy_from_slice(&head_output);
|
||||
}
|
||||
|
||||
output
|
||||
}
|
||||
|
||||
fn grouped_query_attention_neon(
|
||||
queries: &[f32],
|
||||
keys: &[f32],
|
||||
values: &[f32],
|
||||
config: &AttentionConfig,
|
||||
) -> Vec<f32> {
|
||||
let head_dim = config.head_dim;
|
||||
let num_heads = config.num_heads;
|
||||
let num_kv_heads = config.num_kv_heads;
|
||||
let gqa_ratio = config.gqa_ratio();
|
||||
let scale = config.effective_scale();
|
||||
|
||||
let kv_len = keys.len() / (num_kv_heads * head_dim);
|
||||
let mut output = vec![0.0; num_heads * head_dim];
|
||||
|
||||
for h in 0..num_heads {
|
||||
let kv_head = h / gqa_ratio;
|
||||
let q_offset = h * head_dim;
|
||||
let q_slice = &queries[q_offset..q_offset + head_dim];
|
||||
|
||||
let mut kv_keys = Vec::with_capacity(kv_len * head_dim);
|
||||
let mut kv_values = Vec::with_capacity(kv_len * head_dim);
|
||||
|
||||
for t in 0..kv_len {
|
||||
let kv_offset = (t * num_kv_heads + kv_head) * head_dim;
|
||||
kv_keys.extend_from_slice(&keys[kv_offset..kv_offset + head_dim]);
|
||||
kv_values.extend_from_slice(&values[kv_offset..kv_offset + head_dim]);
|
||||
}
|
||||
|
||||
let head_output = flash_attention_neon(q_slice, &kv_keys, &kv_values, scale, config.causal);
|
||||
|
||||
output[q_offset..q_offset + head_dim].copy_from_slice(&head_output);
|
||||
}
|
||||
|
||||
output
|
||||
}
|
||||
|
||||
// Helper function to generate random tensor data
/// Returns `size` samples drawn uniformly from [-1.0, 1.0) using the
/// thread-local RNG (non-deterministic across runs — fine for benchmarks).
fn random_tensor(size: usize) -> Vec<f32> {
    let mut rng = rand::thread_rng();
    (0..size).map(|_| rng.gen_range(-1.0..1.0)).collect()
}
|
||||
|
||||
// === Benchmark Functions ===
|
||||
|
||||
/// Benchmarks the single-query flash-attention kernel across sequence lengths
/// and head dimensions; throughput is reported in KV elements per second.
fn bench_flash_attention(c: &mut Criterion) {
    let mut group = c.benchmark_group("flash_attention");
    group.sample_size(50);

    // Test various sequence lengths and head dimensions
    for seq_len in [128, 256, 512, 1024, 2048] {
        for head_dim in [64, 128] {
            let query = random_tensor(head_dim);
            let key = random_tensor(seq_len * head_dim);
            let value = random_tensor(seq_len * head_dim);
            // Standard 1/sqrt(d) attention scale.
            let scale = 1.0 / (head_dim as f32).sqrt();

            let id = BenchmarkId::new(
                format!("seq_{}_head_{}", seq_len, head_dim),
                seq_len * head_dim,
            );

            group.throughput(Throughput::Elements((seq_len * head_dim) as u64));
            group.bench_with_input(
                id,
                &(query.clone(), key.clone(), value.clone()),
                |b, (q, k, v)| {
                    b.iter(|| {
                        flash_attention_neon(black_box(q), black_box(k), black_box(v), scale, true)
                    })
                },
            );
        }
    }

    group.finish();
}
|
||||
|
||||
/// Benchmarks a 32-head batched decode: the flash kernel is invoked once per
/// head inside the timed loop, so per-call overhead is included.
fn bench_flash_attention_batched(c: &mut Criterion) {
    let mut group = c.benchmark_group("flash_attention_batched");
    group.sample_size(30);

    // Test batch processing for multi-head attention
    let head_dim = 128;
    let num_heads = 32;

    for seq_len in [128, 256, 512] {
        let queries = random_tensor(num_heads * head_dim);
        let key = random_tensor(seq_len * head_dim);
        let value = random_tensor(seq_len * head_dim);
        let scale = 1.0 / (head_dim as f32).sqrt();

        let id = BenchmarkId::new(format!("heads_{}_seq_{}", num_heads, seq_len), seq_len);

        group.throughput(Throughput::Elements(
            (num_heads * seq_len * head_dim) as u64,
        ));
        group.bench_with_input(
            id,
            &(queries.clone(), key.clone(), value.clone()),
            |b, (q, k, v)| {
                b.iter(|| {
                    // Process all heads
                    let mut outputs = Vec::with_capacity(num_heads * head_dim);
                    for h in 0..num_heads {
                        let q_offset = h * head_dim;
                        let q_slice = &q[q_offset..q_offset + head_dim];
                        let out = flash_attention_neon(
                            black_box(q_slice),
                            black_box(k),
                            black_box(v),
                            scale,
                            true,
                        );
                        outputs.extend(out);
                    }
                    outputs
                })
            },
        );
    }

    group.finish();
}
|
||||
|
||||
/// Benchmarks paged attention across block sizes and cache fill levels.
/// The timed region covers the gather-from-blocks step plus the flash kernel,
/// so it measures the paging overhead relative to contiguous attention.
fn bench_paged_attention(c: &mut Criterion) {
    let mut group = c.benchmark_group("paged_attention");
    group.sample_size(50);

    // Test various block sizes and sequence lengths
    for block_size in [16, 32, 64] {
        for num_tokens in [64, 128, 256, 512] {
            let head_dim = 128;
            let num_kv_heads = 8;

            // Create and populate KV cache
            let mut kv_cache = PagedKvCache::new(block_size, num_kv_heads, head_dim);
            let stride = num_kv_heads * head_dim;

            for _ in 0..num_tokens {
                let keys = random_tensor(stride);
                let values = random_tensor(stride);
                kv_cache.append(&keys, &values);
            }

            let query = random_tensor(head_dim);
            let scale = 1.0 / (head_dim as f32).sqrt();

            let id = BenchmarkId::new(
                format!("block_{}_tokens_{}", block_size, num_tokens),
                num_tokens,
            );

            group.throughput(Throughput::Elements((num_tokens * head_dim) as u64));
            group.bench_with_input(id, &(query.clone(), kv_cache.clone()), |b, (q, cache)| {
                b.iter(|| paged_attention_neon(black_box(q), black_box(cache), &[], scale))
            });
        }
    }

    group.finish();
}
|
||||
|
||||
/// Benchmarks multi-query attention (all query heads sharing one KV head)
/// across head counts and sequence lengths.
fn bench_mqa(c: &mut Criterion) {
    let mut group = c.benchmark_group("multi_query_attention");
    group.sample_size(30);

    for num_heads in [8, 16, 32] {
        for seq_len in [128, 256, 512] {
            let head_dim = 128;

            let config = AttentionConfig {
                num_heads,
                num_kv_heads: 1, // MQA: single KV head
                head_dim,
                causal: true,
                ..Default::default()
            };

            let queries = random_tensor(num_heads * head_dim);
            let key = random_tensor(seq_len * head_dim);
            let value = random_tensor(seq_len * head_dim);

            let id = BenchmarkId::new(format!("heads_{}_seq_{}", num_heads, seq_len), seq_len);

            group.throughput(Throughput::Elements(
                (num_heads * seq_len * head_dim) as u64,
            ));
            group.bench_with_input(
                id,
                &(queries.clone(), key.clone(), value.clone(), config),
                |b, (q, k, v, cfg)| {
                    b.iter(|| {
                        multi_query_attention_neon(black_box(q), black_box(k), black_box(v), cfg)
                    })
                },
            );
        }
    }

    group.finish();
}
|
||||
|
||||
/// Benchmarks grouped-query attention across several query:KV head ratios and
/// sequence lengths. K/V tensors are interleaved `[token][kv_head][head_dim]`.
fn bench_gqa(c: &mut Criterion) {
    let mut group = c.benchmark_group("grouped_query_attention");
    group.sample_size(30);

    // Test various GQA ratios (num_heads / num_kv_heads)
    for (num_heads, num_kv_heads) in [(32, 8), (32, 4), (16, 4), (16, 2)] {
        for seq_len in [128, 256, 512] {
            let head_dim = 128;

            let config = AttentionConfig {
                num_heads,
                num_kv_heads,
                head_dim,
                causal: true,
                ..Default::default()
            };

            let queries = random_tensor(num_heads * head_dim);
            let keys = random_tensor(seq_len * num_kv_heads * head_dim);
            let values = random_tensor(seq_len * num_kv_heads * head_dim);

            let ratio = num_heads / num_kv_heads;
            let id = BenchmarkId::new(format!("ratio_{}_seq_{}", ratio, seq_len), seq_len);

            group.throughput(Throughput::Elements(
                (num_heads * seq_len * head_dim) as u64,
            ));
            group.bench_with_input(
                id,
                &(queries.clone(), keys.clone(), values.clone(), config),
                |b, (q, k, v, cfg)| {
                    b.iter(|| {
                        grouped_query_attention_neon(black_box(q), black_box(k), black_box(v), cfg)
                    })
                },
            );
        }
    }

    group.finish();
}
|
||||
|
||||
/// Benchmarks flash attention with throughput expressed in bytes of Q/K/V
/// touched, to show how timing scales with working-set size.
fn bench_attention_memory_efficiency(c: &mut Criterion) {
    let mut group = c.benchmark_group("attention_memory");
    group.sample_size(20);

    // Compare memory usage at different sequence lengths
    for seq_len in [256, 512, 1024, 2048, 4096] {
        let head_dim = 128;

        let query = random_tensor(head_dim);
        let key = random_tensor(seq_len * head_dim);
        let value = random_tensor(seq_len * head_dim);
        let scale = 1.0 / (head_dim as f32).sqrt();

        // Memory for Q, K, V in bytes: 1 query row + seq_len rows each of K and V.
        let memory_bytes = (1 + seq_len * 2) * head_dim * 4; // f32 = 4 bytes

        let id = BenchmarkId::new(
            format!("seq_{}_mem_{}KB", seq_len, memory_bytes / 1024),
            seq_len,
        );

        group.throughput(Throughput::Bytes(memory_bytes as u64));
        group.bench_with_input(
            id,
            &(query.clone(), key.clone(), value.clone()),
            |b, (q, k, v)| {
                b.iter(|| {
                    flash_attention_neon(black_box(q), black_box(k), black_box(v), scale, true)
                })
            },
        );
    }

    group.finish();
}
|
||||
|
||||
/// Benchmarks how the kernel scales as sequence length doubles (128..4096),
/// with throughput as an approximate FLOP count.
fn bench_attention_scaling(c: &mut Criterion) {
    let mut group = c.benchmark_group("attention_scaling");
    group.sample_size(20);

    // Test scaling behavior with increasing sequence length
    let head_dim = 128;
    let scale = 1.0 / (head_dim as f32).sqrt();

    for power in 7..=12 {
        // 128 to 4096
        let seq_len = 1 << power;

        let query = random_tensor(head_dim);
        let key = random_tensor(seq_len * head_dim);
        let value = random_tensor(seq_len * head_dim);

        let id = BenchmarkId::new(format!("seq_{}", seq_len), seq_len);

        // Measure FLOPs: 2*seq_len*head_dim for QK^T + 2*seq_len*head_dim for AV
        // NOTE(review): reported via Throughput::Elements, so criterion labels
        // this "elements/sec" even though the count is FLOPs.
        let flops = 4 * seq_len * head_dim;
        group.throughput(Throughput::Elements(flops as u64));

        group.bench_with_input(
            id,
            &(query.clone(), key.clone(), value.clone()),
            |b, (q, k, v)| {
                b.iter(|| {
                    flash_attention_neon(black_box(q), black_box(k), black_box(v), scale, true)
                })
            },
        );
    }

    group.finish();
}
|
||||
|
||||
// Register every attention benchmark group with Criterion's harness and emit
// the benchmark `main`.
criterion_group!(
    benches,
    bench_flash_attention,
    bench_flash_attention_batched,
    bench_paged_attention,
    bench_mqa,
    bench_gqa,
    bench_attention_memory_efficiency,
    bench_attention_scaling,
);

criterion_main!(benches);
|
||||
707
vendor/ruvector/crates/ruvllm/benches/e2e_bench.rs
vendored
Normal file
707
vendor/ruvector/crates/ruvllm/benches/e2e_bench.rs
vendored
Normal file
@@ -0,0 +1,707 @@
|
||||
#![allow(
|
||||
clippy::all,
|
||||
unused_imports,
|
||||
unused_variables,
|
||||
dead_code,
|
||||
unused_mut,
|
||||
unused_assignments,
|
||||
non_camel_case_types,
|
||||
clippy::approx_constant,
|
||||
unexpected_cfgs,
|
||||
unused_must_use,
|
||||
unused_parens
|
||||
)]
|
||||
//! End-to-End LLM Inference Benchmarks for M4 Pro
|
||||
//!
|
||||
//! Comprehensive benchmarks for complete inference pipeline:
|
||||
//! - Time to first token (TTFT)
|
||||
//! - Tokens per second (throughput)
|
||||
//! - Memory usage tracking
|
||||
//! - Full transformer layer forward pass
|
||||
//!
|
||||
//! Performance targets for M4 Pro:
|
||||
//! - TTFT: <100ms for 7B model
|
||||
//! - Throughput: 100+ tokens/sec for 7B model
|
||||
//! - Memory: <16GB for 7B model inference
|
||||
|
||||
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
|
||||
use rand::Rng;
|
||||
use std::time::Instant;
|
||||
|
||||
// Simulated model configuration
/// Hyperparameters of a simulated decoder-only transformer, used to size the
/// synthetic weights and to estimate parameter counts / memory footprints.
#[derive(Clone, Copy)]
struct ModelConfig {
    hidden_size: usize,
    intermediate_size: usize,
    num_attention_heads: usize,
    /// Number of KV heads; fewer than `num_attention_heads` means GQA.
    num_kv_heads: usize,
    head_dim: usize,
    num_layers: usize,
    vocab_size: usize,
    max_seq_len: usize,
}

impl ModelConfig {
    /// Llama-2-7B shapes (MHA: 32 query heads, 32 KV heads).
    fn llama2_7b() -> Self {
        Self {
            hidden_size: 4096,
            intermediate_size: 11008,
            num_attention_heads: 32,
            num_kv_heads: 32,
            head_dim: 128,
            num_layers: 32,
            vocab_size: 32000,
            max_seq_len: 4096,
        }
    }

    /// Llama-2-13B shapes (MHA).
    fn llama2_13b() -> Self {
        Self {
            hidden_size: 5120,
            intermediate_size: 13824,
            num_attention_heads: 40,
            num_kv_heads: 40,
            head_dim: 128,
            num_layers: 40,
            vocab_size: 32000,
            max_seq_len: 4096,
        }
    }

    /// Llama-3-8B shapes (GQA: 32 query heads, 8 KV heads, 128K vocab).
    fn llama3_8b() -> Self {
        Self {
            hidden_size: 4096,
            intermediate_size: 14336,
            num_attention_heads: 32,
            num_kv_heads: 8, // GQA
            head_dim: 128,
            num_layers: 32,
            vocab_size: 128256,
            max_seq_len: 8192,
        }
    }

    /// Mistral-7B shapes (GQA: 32 query heads, 8 KV heads).
    fn mistral_7b() -> Self {
        Self {
            hidden_size: 4096,
            intermediate_size: 14336,
            num_attention_heads: 32,
            num_kv_heads: 8, // GQA
            head_dim: 128,
            num_layers: 32,
            vocab_size: 32000,
            max_seq_len: 32768,
        }
    }

    /// Output dimension of the K/V projections: smaller than `hidden_size`
    /// under GQA. Matches the K/V weight sizing in `TransformerLayer::new`.
    fn kv_dim(&self) -> usize {
        self.hidden_size / self.num_attention_heads * self.num_kv_heads
    }

    fn params_per_layer(&self) -> usize {
        // Attention: Q and O are hidden x hidden, but K and V shrink with GQA.
        // The previous count of 4 * hidden^2 overcounted GQA models — e.g.
        // Llama-3-8B came out at ~8.84B instead of its actual ~8.03B.
        let attn_params =
            self.hidden_size * self.hidden_size * 2 + self.hidden_size * self.kv_dim() * 2;

        // MLP: gate, up, down projections
        let mlp_params = self.hidden_size * self.intermediate_size * 3;

        // Norms (2 per layer)
        let norm_params = self.hidden_size * 2;

        attn_params + mlp_params + norm_params
    }

    fn total_params(&self) -> usize {
        // Embedding
        let embed_params = self.vocab_size * self.hidden_size;

        // All layers
        let layer_params = self.params_per_layer() * self.num_layers;

        // Final norm + LM head (untied)
        let final_params = self.hidden_size + self.vocab_size * self.hidden_size;

        embed_params + layer_params + final_params
    }

    /// Weight memory at 2 bytes/param.
    fn memory_bytes_fp16(&self) -> usize {
        self.total_params() * 2 // FP16
    }

    /// Weight memory at 0.5 bytes/param.
    fn memory_bytes_int4(&self) -> usize {
        self.total_params() / 2 // INT4
    }
}
|
||||
|
||||
// Simulated transformer layer operations
/// One decoder layer's worth of synthetic weights plus a simulated
/// single-token forward pass, used to approximate per-layer decode cost.
struct TransformerLayer {
    // Weights (simulated)
    q_proj: Vec<f32>,
    // K/V projections are sized for num_kv_heads (smaller than q_proj under GQA).
    k_proj: Vec<f32>,
    v_proj: Vec<f32>,
    o_proj: Vec<f32>,
    gate_proj: Vec<f32>,
    up_proj: Vec<f32>,
    down_proj: Vec<f32>,
    input_norm_weight: Vec<f32>,
    post_attn_norm_weight: Vec<f32>,
    config: ModelConfig,
}

impl TransformerLayer {
    /// Allocates randomly initialized weights matching `config`'s shapes.
    fn new(config: ModelConfig) -> Self {
        let hidden = config.hidden_size;
        let intermediate = config.intermediate_size;

        Self {
            q_proj: random_tensor(hidden * hidden),
            // hidden x kv_dim, where kv_dim = head_dim * num_kv_heads.
            k_proj: random_tensor(
                hidden * (hidden / config.num_attention_heads * config.num_kv_heads),
            ),
            v_proj: random_tensor(
                hidden * (hidden / config.num_attention_heads * config.num_kv_heads),
            ),
            o_proj: random_tensor(hidden * hidden),
            gate_proj: random_tensor(hidden * intermediate),
            up_proj: random_tensor(hidden * intermediate),
            down_proj: random_tensor(intermediate * hidden),
            input_norm_weight: random_tensor(hidden),
            post_attn_norm_weight: random_tensor(hidden),
            config,
        }
    }

    // Simulated forward pass for a single token
    /// Runs one layer's worth of decode arithmetic on `hidden_state` in place.
    ///
    /// NOTE(review): this is a *cost* simulation, not a faithful pre-norm
    /// transformer — the residual is added onto the normalized state (the
    /// pre-norm input is overwritten by `rms_norm_inplace`), and attention
    /// uses the freshly projected K/V rather than a populated KV cache.
    fn forward_single_token(&self, hidden_state: &mut [f32], kv_cache_len: usize) {
        let hidden = self.config.hidden_size;

        // 1. Input LayerNorm/RMSNorm
        rms_norm_inplace(hidden_state, &self.input_norm_weight, 1e-6);

        // 2. Attention projections (Q, K, V)
        let mut q = gemv(&self.q_proj, hidden_state, hidden, hidden);
        let k = gemv(
            &self.k_proj,
            hidden_state,
            hidden,
            hidden / self.config.num_attention_heads * self.config.num_kv_heads,
        );
        let v = gemv(
            &self.v_proj,
            hidden_state,
            hidden,
            hidden / self.config.num_attention_heads * self.config.num_kv_heads,
        );

        // 3. Apply RoPE (simplified) — position taken from the current cache length.
        apply_rope_simple(&mut q, self.config.head_dim, kv_cache_len);

        // 4. Attention (simplified - would use flash attention in practice)
        // For single token decode, this is essentially a dot product with cached KV
        let attn_output = attention_decode(
            &q,
            &k,
            &v,
            self.config.num_attention_heads,
            self.config.head_dim,
        );

        // 5. Output projection
        let attn_projected = gemv(&self.o_proj, &attn_output, hidden, hidden);

        // 6. Residual connection
        for i in 0..hidden {
            hidden_state[i] += attn_projected[i];
        }

        // 7. Post-attention LayerNorm
        rms_norm_inplace(hidden_state, &self.post_attn_norm_weight, 1e-6);

        // 8. MLP forward
        let gate_out = gemv(
            &self.gate_proj,
            hidden_state,
            hidden,
            self.config.intermediate_size,
        );
        let up_out = gemv(
            &self.up_proj,
            hidden_state,
            hidden,
            self.config.intermediate_size,
        );

        // SiLU activation and element-wise multiply (SwiGLU gating)
        let mut mlp_intermediate = Vec::with_capacity(self.config.intermediate_size);
        for i in 0..self.config.intermediate_size {
            let silu = gate_out[i] / (1.0 + (-gate_out[i]).exp());
            mlp_intermediate.push(silu * up_out[i]);
        }

        // Down projection
        let mlp_output = gemv(
            &self.down_proj,
            &mlp_intermediate,
            self.config.intermediate_size,
            hidden,
        );

        // 9. Residual connection
        for i in 0..hidden {
            hidden_state[i] += mlp_output[i];
        }
    }
}
|
||||
|
||||
// Helper functions
/// Returns `size` samples drawn uniformly from [-0.1, 0.1); small magnitude
/// keeps the simulated activations numerically tame across 32+ layers.
fn random_tensor(size: usize) -> Vec<f32> {
    let mut rng = rand::thread_rng();
    (0..size).map(|_| rng.gen_range(-0.1..0.1)).collect()
}
|
||||
|
||||
/// In-place RMSNorm: `x[i] <- x[i] / rms(x) * weight[i]`.
///
/// `weight` must not be longer than `x` (each weight element indexes `x`).
fn rms_norm_inplace(x: &mut [f32], weight: &[f32], eps: f32) {
    let mean_sq = x.iter().map(|v| v * v).sum::<f32>() / x.len() as f32;
    let inv_rms = 1.0 / (mean_sq + eps).sqrt();
    for (i, w) in weight.iter().enumerate() {
        x[i] = x[i] * inv_rms * w;
    }
}
|
||||
|
||||
/// Dense vector–matrix product: `output[j] = Σ_i matrix[i*n + j] * vector[i]`.
///
/// `matrix` is row-major `m x n`, `vector` has length `m`; returns length `n`.
///
/// Iterates row-by-row so every memory access is sequential. The previous
/// version kept `j` in the outer loop, so the inner loop strided by `n`
/// floats per step — a cache miss per element for these layer-sized matrices.
/// The per-element floating-point accumulation order (ascending `i`) is
/// unchanged, so results are bit-identical.
fn gemv(matrix: &[f32], vector: &[f32], m: usize, n: usize) -> Vec<f32> {
    let mut output = vec![0.0f32; n];
    for i in 0..m {
        let x_i = vector[i];
        let row = &matrix[i * n..(i + 1) * n];
        for (out, &w) in output.iter_mut().zip(row) {
            *out += w * x_i;
        }
    }
    output
}
|
||||
|
||||
/// Applies rotary position embedding (RoPE) to consecutive pairs of `x`.
///
/// Pair `i` is rotated by angle `position * 10000^(-2i/head_dim)`; only the
/// first `head_dim` elements (one head) are touched.
fn apply_rope_simple(x: &mut [f32], head_dim: usize, position: usize) {
    for i in 0..head_dim / 2 {
        let freq = 1.0 / 10000.0f32.powf((2 * i) as f32 / head_dim as f32);
        let theta = position as f32 * freq;
        let (sin_theta, cos_theta) = theta.sin_cos();

        let (a, b) = (x[i * 2], x[i * 2 + 1]);
        x[i * 2] = a * cos_theta - b * sin_theta;
        x[i * 2 + 1] = b * cos_theta + a * sin_theta;
    }
}
|
||||
|
||||
/// Deliberately simplified single-token attention for decode benchmarks.
///
/// Only one K/V entry per head is consulted, and the "softmax" over a
/// single score reduces to exp(score * scale), so the result is not a
/// normalized attention output — it only exercises the same arithmetic
/// (dot product, scale, exp, weighted V). The modulo maps query heads
/// onto the available KV heads, GQA-style.
fn attention_decode(
    q: &[f32],
    k: &[f32],
    v: &[f32],
    num_heads: usize,
    head_dim: usize,
) -> Vec<f32> {
    let mut out = vec![0.0f32; num_heads * head_dim];
    let scale = 1.0 / (head_dim as f32).sqrt();
    let kv_heads_k = k.len() / head_dim;
    let kv_heads_v = v.len() / head_dim;

    for head in 0..num_heads {
        let q_base = head * head_dim;
        let query = &q[q_base..q_base + head_dim];

        // Score against the single cached key for this (mapped) KV head.
        let k_base = (head % kv_heads_k) * head_dim;
        let key = &k[k_base..k_base + head_dim];
        let score: f32 = query.iter().zip(key).map(|(a, b)| a * b).sum();
        let weight = (score * scale).exp(); // degenerate one-token softmax

        let v_base = (head % kv_heads_v) * head_dim;
        let value = &v[v_base..v_base + head_dim];
        for (dst, &val) in out[q_base..q_base + head_dim].iter_mut().zip(value) {
            *dst = val * weight;
        }
    }

    out
}
|
||||
|
||||
// KV Cache simulation
/// Flat (non-paged) KV cache: one contiguous buffer each for keys and
/// values, laid out as [token][kv_head][head_dim].
struct KvCache {
    keys: Vec<f32>,    // cached keys, token-major
    values: Vec<f32>,  // cached values, token-major
    num_tokens: usize, // tokens currently stored
    num_kv_heads: usize, // KV heads per token
    head_dim: usize,   // elements per head
    max_seq_len: usize, // capacity in tokens
}
|
||||
|
||||
impl KvCache {
|
||||
fn new(config: &ModelConfig) -> Self {
|
||||
let capacity = config.max_seq_len * config.num_kv_heads * config.head_dim;
|
||||
Self {
|
||||
keys: vec![0.0; capacity],
|
||||
values: vec![0.0; capacity],
|
||||
num_tokens: 0,
|
||||
num_kv_heads: config.num_kv_heads,
|
||||
head_dim: config.head_dim,
|
||||
max_seq_len: config.max_seq_len,
|
||||
}
|
||||
}
|
||||
|
||||
fn append(&mut self, k: &[f32], v: &[f32]) {
|
||||
if self.num_tokens >= self.max_seq_len {
|
||||
return;
|
||||
}
|
||||
|
||||
let stride = self.num_kv_heads * self.head_dim;
|
||||
let offset = self.num_tokens * stride;
|
||||
|
||||
self.keys[offset..offset + stride].copy_from_slice(&k[..stride.min(k.len())]);
|
||||
self.values[offset..offset + stride].copy_from_slice(&v[..stride.min(v.len())]);
|
||||
self.num_tokens += 1;
|
||||
}
|
||||
|
||||
fn memory_bytes(&self) -> usize {
|
||||
(self.keys.len() + self.values.len()) * std::mem::size_of::<f32>()
|
||||
}
|
||||
}
|
||||
|
||||
// === Benchmark Functions ===
|
||||
|
||||
/// Benchmark one transformer layer's single-token forward pass for several
/// model configurations; throughput is reported as parameters per layer.
fn bench_single_layer_forward(c: &mut Criterion) {
    let mut group = c.benchmark_group("single_layer_forward");
    group.sample_size(30);

    let configs = [
        ("llama2_7b", ModelConfig::llama2_7b()),
        ("llama3_8b", ModelConfig::llama3_8b()),
        ("mistral_7b", ModelConfig::mistral_7b()),
    ];

    for (name, config) in configs {
        let layer = TransformerLayer::new(config);
        let mut hidden_state = random_tensor(config.hidden_size);

        let id = BenchmarkId::new(name, config.hidden_size);

        group.throughput(Throughput::Elements(config.params_per_layer() as u64));
        group.bench_function(id, |b| {
            b.iter(|| {
                // Clone so every iteration starts from identical activations;
                // the clone cost is included in the measured time.
                let mut h = hidden_state.clone();
                layer.forward_single_token(black_box(&mut h), 100);
                h
            })
        });
    }

    group.finish();
}
|
||||
|
||||
/// Benchmark stacking 1..32 identical Llama-2-7B layers on one token to
/// show how latency scales with depth.
fn bench_multi_layer_forward(c: &mut Criterion) {
    let mut group = c.benchmark_group("multi_layer_forward");
    group.sample_size(20);

    let config = ModelConfig::llama2_7b();

    for num_layers in [1, 4, 8, 16, 32] {
        let layers: Vec<TransformerLayer> = (0..num_layers)
            .map(|_| TransformerLayer::new(config))
            .collect();
        let mut hidden_state = random_tensor(config.hidden_size);

        let id = BenchmarkId::new(format!("{}_layers", num_layers), num_layers);

        group.throughput(Throughput::Elements(
            (config.params_per_layer() * num_layers) as u64,
        ));
        group.bench_function(id, |b| {
            b.iter(|| {
                let mut h = hidden_state.clone();
                // Sequentially thread the hidden state through every layer.
                for layer in &layers {
                    layer.forward_single_token(black_box(&mut h), 100);
                }
                h
            })
        });
    }

    group.finish();
}
|
||||
|
||||
/// Benchmark KV-cache single-token append latency and whole-sequence fill
/// cost; per-sequence benchmark names embed the cache footprint in MB.
fn bench_kv_cache_operations(c: &mut Criterion) {
    let mut group = c.benchmark_group("kv_cache");
    group.sample_size(50);

    let configs = [
        ("llama2_7b", ModelConfig::llama2_7b()),
        ("llama3_8b", ModelConfig::llama3_8b()),
    ];

    for (name, config) in configs {
        // Append operation
        let mut cache = KvCache::new(&config);
        let k = random_tensor(config.num_kv_heads * config.head_dim);
        let v = random_tensor(config.num_kv_heads * config.head_dim);

        group.bench_function(
            BenchmarkId::new(format!("{}_append", name), config.num_kv_heads),
            |b| {
                b.iter_batched(
                    // Fresh cache per sample; setup cost is not timed.
                    || KvCache::new(&config),
                    |mut cache| {
                        cache.append(black_box(&k), black_box(&v));
                        cache
                    },
                    criterion::BatchSize::SmallInput,
                )
            },
        );

        // Memory footprint at various sequence lengths
        for seq_len in [256, 512, 1024, 2048] {
            // Fill once up front only to compute the footprint for the label.
            let mut cache = KvCache::new(&config);
            for _ in 0..seq_len {
                cache.append(&k, &v);
            }

            let memory_mb = cache.memory_bytes() / (1024 * 1024);
            let id = BenchmarkId::new(format!("{}_seq_{}_{}MB", name, seq_len, memory_mb), seq_len);

            group.throughput(Throughput::Bytes(cache.memory_bytes() as u64));
            group.bench_function(id, |b| {
                b.iter(|| {
                    // Timed body allocates AND fills the whole sequence.
                    let mut c = KvCache::new(&config);
                    for _ in 0..seq_len {
                        c.append(black_box(&k), black_box(&v));
                    }
                    c
                })
            });
        }
    }

    group.finish();
}
|
||||
|
||||
/// Benchmark the decode phase: run 1..100 sequential tokens through the
/// full layer stack; throughput is tokens per second.
fn bench_decode_throughput(c: &mut Criterion) {
    let mut group = c.benchmark_group("decode_throughput");
    group.sample_size(20);

    // Measure tokens per second for decode phase
    let config = ModelConfig::llama2_7b();
    let layers: Vec<TransformerLayer> = (0..config.num_layers)
        .map(|_| TransformerLayer::new(config))
        .collect();

    // Simulate decoding multiple tokens
    for num_tokens in [1, 10, 50, 100] {
        let id = BenchmarkId::new(format!("{}_tokens", num_tokens), num_tokens);

        group.throughput(Throughput::Elements(num_tokens as u64));
        group.bench_function(id, |b| {
            b.iter(|| {
                let mut hidden_state = random_tensor(config.hidden_size);
                // token_idx doubles as the RoPE position for each step.
                for token_idx in 0..num_tokens {
                    for layer in &layers {
                        layer.forward_single_token(black_box(&mut hidden_state), token_idx);
                    }
                }
                hidden_state
            })
        });
    }

    group.finish();
}
|
||||
|
||||
/// Benchmark prompt prefill cost by pushing `seq_len` positions through a
/// single layer one at a time (no batching); throughput is positions/sec.
fn bench_prefill_latency(c: &mut Criterion) {
    let mut group = c.benchmark_group("prefill_latency");
    group.sample_size(10);

    // Simulate prefill phase (processing prompt)
    let config = ModelConfig::llama2_7b();
    let layer = TransformerLayer::new(config);

    for seq_len in [32, 64, 128, 256] {
        // Process multiple tokens (simplified - in practice would batch)
        let id = BenchmarkId::new(format!("seq_{}", seq_len), seq_len);

        group.throughput(Throughput::Elements(seq_len as u64));
        group.bench_function(id, |b| {
            b.iter(|| {
                let mut total_output = vec![0.0f32; config.hidden_size];
                for pos in 0..seq_len {
                    let mut hidden_state = random_tensor(config.hidden_size);
                    layer.forward_single_token(black_box(&mut hidden_state), pos);
                    // Accumulate (simplified) — a running mean over positions
                    // so the result depends on every forward pass.
                    for i in 0..config.hidden_size {
                        total_output[i] += hidden_state[i] / seq_len as f32;
                    }
                }
                total_output
            })
        });
    }

    group.finish();
}
|
||||
|
||||
/// Print FP16/INT4 memory estimates for each model config, then benchmark
/// a single layer as a proxy for per-layer latency at that scale.
fn bench_model_memory(c: &mut Criterion) {
    let mut group = c.benchmark_group("model_memory_estimate");
    group.sample_size(20);

    let configs = [
        ("llama2_7b", ModelConfig::llama2_7b()),
        ("llama2_13b", ModelConfig::llama2_13b()),
        ("llama3_8b", ModelConfig::llama3_8b()),
        ("mistral_7b", ModelConfig::mistral_7b()),
    ];

    for (name, config) in configs {
        let fp16_gb = config.memory_bytes_fp16() as f64 / (1024.0 * 1024.0 * 1024.0);
        let int4_gb = config.memory_bytes_int4() as f64 / (1024.0 * 1024.0 * 1024.0);

        // One-off console report; not part of any measurement.
        println!(
            "{}: FP16={:.2}GB, INT4={:.2}GB, params={}M",
            name,
            fp16_gb,
            int4_gb,
            config.total_params() / 1_000_000
        );

        // Benchmark single layer to estimate per-layer latency
        let layer = TransformerLayer::new(config);
        let mut hidden_state = random_tensor(config.hidden_size);

        let id = BenchmarkId::new(
            format!("{}_fp16_{:.1}GB", name, fp16_gb),
            config.total_params(),
        );

        group.throughput(Throughput::Elements(config.params_per_layer() as u64));
        group.bench_function(id, |b| {
            b.iter(|| {
                let mut h = hidden_state.clone();
                layer.forward_single_token(black_box(&mut h), 100);
                h
            })
        });
    }

    group.finish();
}
|
||||
|
||||
/// Micro-benchmarks for the individual kernels used inside a layer:
/// RMSNorm, two GEMV shapes (attention- and MLP-sized), and RoPE.
fn bench_inference_components(c: &mut Criterion) {
    let mut group = c.benchmark_group("inference_components");
    group.sample_size(50);

    let config = ModelConfig::llama2_7b();
    let hidden = config.hidden_size;
    let intermediate = config.intermediate_size;

    // Individual component benchmarks
    let input = random_tensor(hidden);
    let weight = random_tensor(hidden);

    // RMSNorm — iter_batched clones the input untimed since the op mutates it.
    group.bench_function("rmsnorm_4096", |b| {
        b.iter_batched(
            || input.clone(),
            |mut x| {
                rms_norm_inplace(black_box(&mut x), black_box(&weight), 1e-6);
                x
            },
            criterion::BatchSize::SmallInput,
        )
    });

    // Linear projection (hidden -> hidden)
    let proj_matrix = random_tensor(hidden * hidden);
    group.bench_function("linear_4096x4096", |b| {
        b.iter(|| gemv(black_box(&proj_matrix), black_box(&input), hidden, hidden))
    });

    // Linear projection (hidden -> intermediate)
    let mlp_up_matrix = random_tensor(hidden * intermediate);
    group.bench_function("linear_4096x11008", |b| {
        b.iter(|| {
            gemv(
                black_box(&mlp_up_matrix),
                black_box(&input),
                hidden,
                intermediate,
            )
        })
    });

    // RoPE — rotate every head of a full Q/K tensor at a fixed position.
    let mut rope_input = random_tensor(config.num_attention_heads * config.head_dim);
    group.bench_function("rope_32heads", |b| {
        b.iter_batched(
            || rope_input.clone(),
            |mut x| {
                for h in 0..config.num_attention_heads {
                    let offset = h * config.head_dim;
                    apply_rope_simple(
                        black_box(&mut x[offset..offset + config.head_dim]),
                        config.head_dim,
                        100,
                    );
                }
                x
            },
            criterion::BatchSize::SmallInput,
        )
    });

    group.finish();
}
|
||||
|
||||
fn bench_tokens_per_second_estimation(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("tokens_per_second");
|
||||
group.sample_size(10);
|
||||
|
||||
// Full model throughput estimation
|
||||
let config = ModelConfig::llama2_7b();
|
||||
|
||||
// Create a simplified full model
|
||||
let layers: Vec<TransformerLayer> = (0..4) // Use 4 layers for faster benchmarking
|
||||
.map(|_| TransformerLayer::new(config))
|
||||
.collect();
|
||||
|
||||
let id = BenchmarkId::new("llama2_7b_4layers", 4);
|
||||
|
||||
// Time how long it takes to process tokens
|
||||
group.bench_function(id, |b| {
|
||||
b.iter_custom(|iters| {
|
||||
let mut total_time = std::time::Duration::ZERO;
|
||||
|
||||
for _ in 0..iters {
|
||||
let mut hidden_state = random_tensor(config.hidden_size);
|
||||
let start = Instant::now();
|
||||
|
||||
for layer in &layers {
|
||||
layer.forward_single_token(black_box(&mut hidden_state), 100);
|
||||
}
|
||||
|
||||
total_time += start.elapsed();
|
||||
}
|
||||
|
||||
total_time
|
||||
})
|
||||
});
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
// Register every benchmark entry point with Criterion and emit the
// standard `main` that runs the whole group.
criterion_group!(
    benches,
    bench_single_layer_forward,
    bench_multi_layer_forward,
    bench_kv_cache_operations,
    bench_decode_throughput,
    bench_prefill_latency,
    bench_model_memory,
    bench_inference_components,
    bench_tokens_per_second_estimation,
);

criterion_main!(benches);
|
||||
710
vendor/ruvector/crates/ruvllm/benches/lora_bench.rs
vendored
Normal file
710
vendor/ruvector/crates/ruvllm/benches/lora_bench.rs
vendored
Normal file
@@ -0,0 +1,710 @@
|
||||
#![allow(
|
||||
clippy::all,
|
||||
unused_imports,
|
||||
unused_variables,
|
||||
dead_code,
|
||||
unused_mut,
|
||||
unused_assignments,
|
||||
non_camel_case_types,
|
||||
clippy::approx_constant,
|
||||
unexpected_cfgs,
|
||||
unused_must_use,
|
||||
unused_parens
|
||||
)]
|
||||
//! MicroLoRA Benchmarks for M4 Pro
|
||||
//!
|
||||
//! Benchmarks for LoRA adapter operations:
|
||||
//! - Forward pass latency
|
||||
//! - SIMD-optimized forward
|
||||
//! - Gradient accumulation
|
||||
//! - EWC++ overhead
|
||||
//! - Adaptation speed
|
||||
//!
|
||||
//! Performance targets for M4 Pro:
|
||||
//! - MicroLoRA forward (rank=2, dim=768): <500us
|
||||
//! - MicroLoRA forward (rank=2, dim=4096): <1ms
|
||||
//! - Gradient accumulation: <100us
|
||||
//! - EWC++ update: <200us
|
||||
|
||||
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
|
||||
use rand::Rng;
|
||||
|
||||
/// Target modules for LoRA adaptation — which base projection an adapter
/// attaches to. Only the attention query and value projections are
/// modelled in this benchmark.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
enum TargetModule {
    /// Attention query projection.
    QProj,
    /// Attention value projection.
    VProj,
}
|
||||
|
||||
/// Single LoRA adapter for benchmarking.
///
/// Holds the low-rank factor pair (A: in_features x rank,
/// B: rank x out_features), both row-major, plus accumulated gradients so
/// the adaptation-speed benchmarks can exercise updates.
#[derive(Clone)]
struct LoraAdapter {
    lora_a: Vec<f32>, // down-projection factor A, in_features x rank, row-major
    lora_b: Vec<f32>, // up-projection factor B, rank x out_features, row-major
    in_features: usize,
    out_features: usize,
    rank: usize,
    scaling: f32, // alpha / rank, applied to the adapter's output
    // Gradients
    grad_a: Vec<f32>,  // accumulated dL/dA
    grad_b: Vec<f32>,  // accumulated dL/dB
    grad_count: usize, // accumulations since the last apply_gradients
}
|
||||
|
||||
impl LoraAdapter {
    /// Build a rank-`rank` adapter for an `in_features -> out_features`
    /// projection. A gets a small deterministic pseudo-random init; B is
    /// zeroed so the adapter initially contributes exactly nothing.
    fn new(in_features: usize, out_features: usize, rank: usize, alpha: f32) -> Self {
        let scaling = alpha / rank as f32;

        // Kaiming initialization for A
        let std_a = (2.0 / in_features as f32).sqrt() * 0.01;
        let lora_a: Vec<f32> = (0..in_features * rank)
            .map(|idx| {
                let seed = idx as f32;
                // Golden-ratio multiply mod 1 yields a repeatable
                // quasi-random value in [0, 1); recentred to [-std_a, std_a).
                ((seed * 0.618033988749895) % 1.0 - 0.5) * 2.0 * std_a
            })
            .collect();

        // Zero initialization for B
        let lora_b = vec![0.0; rank * out_features];

        Self {
            lora_a,
            lora_b,
            in_features,
            out_features,
            rank,
            scaling,
            grad_a: vec![0.0; in_features * rank],
            grad_b: vec![0.0; rank * out_features],
            grad_count: 0,
        }
    }

    /// Forward pass: output = x @ A @ B * scaling
    /// (allocates and OVERWRITES the returned output, unlike forward_simd).
    fn forward(&self, x: &[f32]) -> Vec<f32> {
        debug_assert_eq!(x.len(), self.in_features);

        // Down projection: x @ A -> intermediate (rank,)
        let mut intermediate = vec![0.0f32; self.rank];
        for r in 0..self.rank {
            let mut sum = 0.0f32;
            for i in 0..self.in_features {
                sum += x[i] * self.lora_a[i * self.rank + r];
            }
            intermediate[r] = sum;
        }

        // Up projection: intermediate @ B -> output (out_features,)
        let mut output = vec![0.0f32; self.out_features];
        for o in 0..self.out_features {
            let mut sum = 0.0f32;
            for r in 0..self.rank {
                sum += intermediate[r] * self.lora_b[r * self.out_features + o];
            }
            output[o] = sum * self.scaling;
        }

        output
    }

    /// SIMD-optimized forward for flat f32 slices (adds to output)
    ///
    /// NOTE: this ACCUMULATES the LoRA delta into `output` rather than
    /// overwriting it, so callers can fuse it onto a base projection.
    fn forward_simd(&self, input: &[f32], output: &mut [f32]) {
        debug_assert_eq!(input.len(), self.in_features);
        debug_assert_eq!(output.len(), self.out_features);

        #[cfg(target_arch = "aarch64")]
        // SAFETY: NEON is baseline on aarch64; all pointer offsets in the
        // callee stay within the slice lengths asserted above.
        unsafe {
            self.forward_simd_neon(input, output);
        }

        #[cfg(not(target_arch = "aarch64"))]
        {
            self.forward_simd_scalar(input, output);
        }
    }

    /// NEON implementation: 4-lane vectorized down/up projections with
    /// scalar tails for lengths not divisible by 4.
    #[cfg(target_arch = "aarch64")]
    #[inline(always)]
    unsafe fn forward_simd_neon(&self, input: &[f32], output: &mut [f32]) {
        use std::arch::aarch64::*;

        // Down projection with NEON
        let mut intermediate = vec![0.0f32; self.rank];

        for r in 0..self.rank {
            let mut sum = vdupq_n_f32(0.0);
            let chunks = self.in_features / 4;
            let mut i = 0;

            for _ in 0..chunks {
                let x_v = vld1q_f32(input.as_ptr().add(i));
                // Load A column (strided access - not ideal but works for small rank)
                let a_vals = [
                    self.lora_a[i * self.rank + r],
                    self.lora_a[(i + 1) * self.rank + r],
                    self.lora_a[(i + 2) * self.rank + r],
                    self.lora_a[(i + 3) * self.rank + r],
                ];
                let a_v = vld1q_f32(a_vals.as_ptr());
                // sum += x_v * a_v, fused multiply-add per lane
                sum = vfmaq_f32(sum, x_v, a_v);
                i += 4;
            }

            // Horizontal add across the four lanes, then the scalar tail.
            let mut sum_val = vaddvq_f32(sum);
            for ii in i..self.in_features {
                sum_val += input[ii] * self.lora_a[ii * self.rank + r];
            }
            intermediate[r] = sum_val;
        }

        // Up projection with NEON
        let scaling_vec = vdupq_n_f32(self.scaling);
        let chunks = self.out_features / 4;
        let mut o = 0;

        for _ in 0..chunks {
            // Start from the existing output values (accumulate semantics).
            let mut out_v = vld1q_f32(output.as_ptr().add(o));

            for r in 0..self.rank {
                let inter_val = vdupq_n_f32(intermediate[r]);
                let b_v = vld1q_f32(self.lora_b.as_ptr().add(r * self.out_features + o));
                // out += (intermediate[r] * B[r, o..o+4]) * scaling
                out_v = vfmaq_f32(out_v, vmulq_f32(inter_val, b_v), scaling_vec);
            }

            vst1q_f32(output.as_mut_ptr().add(o), out_v);
            o += 4;
        }

        // Remaining elements
        for oo in o..self.out_features {
            let mut sum = output[oo];
            for r in 0..self.rank {
                sum += intermediate[r] * self.lora_b[r * self.out_features + oo] * self.scaling;
            }
            output[oo] = sum;
        }
    }

    /// Portable fallback with the same accumulate-into-output semantics
    /// as the NEON path.
    #[allow(dead_code)]
    fn forward_simd_scalar(&self, input: &[f32], output: &mut [f32]) {
        let mut intermediate = vec![0.0f32; self.rank];

        for r in 0..self.rank {
            let mut sum = 0.0f32;
            for i in 0..self.in_features {
                sum += input[i] * self.lora_a[i * self.rank + r];
            }
            intermediate[r] = sum;
        }

        for o in 0..self.out_features {
            let mut sum = output[o];
            for r in 0..self.rank {
                sum += intermediate[r] * self.lora_b[r * self.out_features + o] * self.scaling;
            }
            output[o] = sum;
        }
    }

    /// Batched forward pass for efficiency
    ///
    /// Runs `forward_simd` per row of the `batch_size x in_features` input;
    /// outputs start zeroed, so accumulation equals plain assignment here.
    fn forward_batch(&self, x: &[f32], batch_size: usize) -> Vec<f32> {
        debug_assert_eq!(x.len(), batch_size * self.in_features);

        let mut outputs = vec![0.0f32; batch_size * self.out_features];

        for b in 0..batch_size {
            let input_offset = b * self.in_features;
            let output_offset = b * self.out_features;

            let input = &x[input_offset..input_offset + self.in_features];
            let output = &mut outputs[output_offset..output_offset + self.out_features];

            self.forward_simd(input, output);
        }

        outputs
    }

    /// Compute gradients for REINFORCE-style update
    ///
    /// Accumulates reward-weighted gradients for both factors into
    /// grad_a/grad_b; nothing is applied until apply_gradients*.
    fn accumulate_gradient(&mut self, input: &[f32], grad_output: &[f32], reward: f32) {
        debug_assert_eq!(input.len(), self.in_features);
        debug_assert_eq!(grad_output.len(), self.out_features);

        // Compute intermediate activation (x @ A), needed for both factors.
        let mut intermediate = vec![0.0f32; self.rank];
        for r in 0..self.rank {
            let mut sum = 0.0f32;
            for i in 0..self.in_features {
                sum += input[i] * self.lora_a[i * self.rank + r];
            }
            intermediate[r] = sum;
        }

        // Gradient for B: outer(intermediate, grad_output) * reward * scaling
        for r in 0..self.rank {
            for o in 0..self.out_features {
                self.grad_b[r * self.out_features + o] +=
                    intermediate[r] * grad_output[o] * reward * self.scaling;
            }
        }

        // Gradient for A: input outer grad_intermediate
        // grad_intermediate = grad_output @ B.T * reward * scaling
        let mut grad_intermediate = vec![0.0f32; self.rank];
        for r in 0..self.rank {
            let mut sum = 0.0f32;
            for o in 0..self.out_features {
                sum += grad_output[o] * self.lora_b[r * self.out_features + o];
            }
            grad_intermediate[r] = sum * reward * self.scaling;
        }

        for i in 0..self.in_features {
            for r in 0..self.rank {
                self.grad_a[i * self.rank + r] += input[i] * grad_intermediate[r];
            }
        }

        self.grad_count += 1;
    }

    /// Apply accumulated gradients with learning rate
    ///
    /// SGD step averaged over grad_count accumulations; gradients and the
    /// counter are reset afterwards. No-op if nothing was accumulated.
    fn apply_gradients(&mut self, learning_rate: f32) {
        if self.grad_count == 0 {
            return;
        }

        let scale = learning_rate / self.grad_count as f32;

        for i in 0..self.lora_a.len() {
            self.lora_a[i] -= self.grad_a[i] * scale;
            self.grad_a[i] = 0.0;
        }

        for i in 0..self.lora_b.len() {
            self.lora_b[i] -= self.grad_b[i] * scale;
            self.grad_b[i] = 0.0;
        }

        self.grad_count = 0;
    }

    /// Apply gradients with EWC++ regularization
    ///
    /// Same averaged SGD step as apply_gradients, plus a quadratic pull
    /// toward the anchor weights weighted by the diagonal Fisher:
    /// penalty_i = lambda * fisher_i * (w_i - w*_i), scaled by the lr.
    fn apply_gradients_with_ewc(
        &mut self,
        learning_rate: f32,
        fisher_a: &[f32],
        fisher_b: &[f32],
        optimal_a: &[f32],
        optimal_b: &[f32],
        ewc_lambda: f32,
    ) {
        if self.grad_count == 0 {
            return;
        }

        let scale = learning_rate / self.grad_count as f32;

        // Update A with EWC regularization
        for i in 0..self.lora_a.len() {
            let grad = self.grad_a[i] * scale;
            let ewc_penalty = ewc_lambda * fisher_a[i] * (self.lora_a[i] - optimal_a[i]);
            self.lora_a[i] -= grad + ewc_penalty * learning_rate;
            self.grad_a[i] = 0.0;
        }

        // Update B with EWC regularization
        for i in 0..self.lora_b.len() {
            let grad = self.grad_b[i] * scale;
            let ewc_penalty = ewc_lambda * fisher_b[i] * (self.lora_b[i] - optimal_b[i]);
            self.lora_b[i] -= grad + ewc_penalty * learning_rate;
            self.grad_b[i] = 0.0;
        }

        self.grad_count = 0;
    }

    /// Total trainable parameters (A plus B).
    fn param_count(&self) -> usize {
        self.lora_a.len() + self.lora_b.len()
    }

    /// Weight memory in bytes (f32 parameters only; gradients excluded).
    fn memory_bytes(&self) -> usize {
        self.param_count() * std::mem::size_of::<f32>()
    }
}
|
||||
|
||||
/// EWC state for benchmarking: diagonal Fisher estimates plus the anchor
/// ("optimal") weights that the EWC penalty pulls toward, one pair per
/// LoRA factor.
struct EwcState {
    fisher_a: Vec<f32>,  // diagonal Fisher estimate for A
    fisher_b: Vec<f32>,  // diagonal Fisher estimate for B
    optimal_a: Vec<f32>, // anchor weights for A
    optimal_b: Vec<f32>, // anchor weights for B
}
|
||||
|
||||
impl EwcState {
|
||||
fn from_adapter(adapter: &LoraAdapter) -> Self {
|
||||
Self {
|
||||
fisher_a: vec![0.01; adapter.lora_a.len()],
|
||||
fisher_b: vec![0.01; adapter.lora_b.len()],
|
||||
optimal_a: adapter.lora_a.clone(),
|
||||
optimal_b: adapter.lora_b.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
fn update_fisher(&mut self, grad_a: &[f32], grad_b: &[f32], decay: f32) {
|
||||
for i in 0..self.fisher_a.len() {
|
||||
self.fisher_a[i] = decay * self.fisher_a[i] + (1.0 - decay) * grad_a[i] * grad_a[i];
|
||||
}
|
||||
for i in 0..self.fisher_b.len() {
|
||||
self.fisher_b[i] = decay * self.fisher_b[i] + (1.0 - decay) * grad_b[i] * grad_b[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Helper function to generate random tensor data
|
||||
fn random_tensor(size: usize) -> Vec<f32> {
|
||||
let mut rng = rand::thread_rng();
|
||||
(0..size).map(|_| rng.gen_range(-1.0..1.0)).collect()
|
||||
}
|
||||
|
||||
// === Benchmark Functions ===
|
||||
|
||||
/// Benchmark the naive (scalar, allocating) LoRA forward pass across
/// hidden sizes and ranks; throughput counts adapter parameters.
fn bench_lora_forward(c: &mut Criterion) {
    let mut group = c.benchmark_group("lora_forward");
    group.sample_size(100);

    for (in_features, out_features) in [(768, 768), (2048, 2048), (4096, 4096)] {
        for rank in [1, 2] {
            let adapter = LoraAdapter::new(in_features, out_features, rank, 4.0);
            let input = random_tensor(in_features);

            let id = BenchmarkId::new(
                format!("dim_{}_rank_{}", in_features, rank),
                adapter.param_count(),
            );

            group.throughput(Throughput::Elements(adapter.param_count() as u64));
            group.bench_function(id, |b| b.iter(|| adapter.forward(black_box(&input))));
        }
    }

    group.finish();
}
|
||||
|
||||
/// Benchmark the SIMD (accumulating) LoRA forward against the same sizes
/// as bench_lora_forward for a direct scalar-vs-SIMD comparison.
fn bench_lora_forward_simd(c: &mut Criterion) {
    let mut group = c.benchmark_group("lora_forward_simd");
    group.sample_size(100);

    for (in_features, out_features) in [(768, 768), (2048, 2048), (4096, 4096)] {
        for rank in [1, 2] {
            let adapter = LoraAdapter::new(in_features, out_features, rank, 4.0);
            let input = random_tensor(in_features);
            let mut output = vec![0.0f32; out_features];

            let id = BenchmarkId::new(
                format!("dim_{}_rank_{}", in_features, rank),
                adapter.param_count(),
            );

            group.throughput(Throughput::Elements(adapter.param_count() as u64));
            group.bench_function(id, |b| {
                b.iter(|| {
                    // forward_simd accumulates into output, so clear it each
                    // iteration; the fill cost is included in the timing.
                    output.fill(0.0);
                    adapter.forward_simd(black_box(&input), black_box(&mut output));
                })
            });
        }
    }

    group.finish();
}
|
||||
|
||||
/// Benchmark batched LoRA forward (rank 2, 4096x4096) over growing batch
/// sizes to expose per-call overhead vs per-row work.
fn bench_lora_forward_batch(c: &mut Criterion) {
    let mut group = c.benchmark_group("lora_forward_batch");
    group.sample_size(50);

    let in_features = 4096;
    let out_features = 4096;
    let rank = 2;

    let adapter = LoraAdapter::new(in_features, out_features, rank, 4.0);

    for batch_size in [1, 8, 16, 32, 64] {
        let input = random_tensor(batch_size * in_features);

        let id = BenchmarkId::new(format!("batch_{}", batch_size), batch_size);

        group.throughput(Throughput::Elements(
            (batch_size * adapter.param_count()) as u64,
        ));
        group.bench_function(id, |b| {
            b.iter(|| adapter.forward_batch(black_box(&input), batch_size))
        });
    }

    group.finish();
}
|
||||
|
||||
/// Benchmark one REINFORCE-style gradient accumulation step (rank 2).
/// Gradients build up across iterations but are never applied, which only
/// affects the accumulator values, not the per-call work measured.
fn bench_lora_gradient_accumulation(c: &mut Criterion) {
    let mut group = c.benchmark_group("lora_gradient_accumulation");
    group.sample_size(100);

    for (in_features, out_features) in [(768, 768), (2048, 2048), (4096, 4096)] {
        let rank = 2;
        let mut adapter = LoraAdapter::new(in_features, out_features, rank, 4.0);
        let input = random_tensor(in_features);
        let grad_output = random_tensor(out_features);

        let id = BenchmarkId::new(format!("dim_{}", in_features), in_features);

        group.throughput(Throughput::Elements(adapter.param_count() as u64));
        group.bench_function(id, |b| {
            b.iter(|| {
                adapter.accumulate_gradient(black_box(&input), black_box(&grad_output), 0.8);
            })
        });
    }

    group.finish();
}
|
||||
|
||||
/// Benchmark applying accumulated gradients. iter_batched rebuilds an
/// adapter with 10 accumulated steps per sample (untimed setup) since
/// apply_gradients consumes/zeroes the accumulators.
fn bench_lora_apply_gradients(c: &mut Criterion) {
    let mut group = c.benchmark_group("lora_apply_gradients");
    group.sample_size(100);

    for (in_features, out_features) in [(768, 768), (2048, 2048), (4096, 4096)] {
        let rank = 2;
        let mut adapter = LoraAdapter::new(in_features, out_features, rank, 4.0);
        let input = random_tensor(in_features);
        let grad_output = random_tensor(out_features);

        // Accumulate some gradients first
        for _ in 0..10 {
            adapter.accumulate_gradient(&input, &grad_output, 0.8);
        }

        let id = BenchmarkId::new(format!("dim_{}", in_features), in_features);

        group.throughput(Throughput::Elements(adapter.param_count() as u64));
        group.bench_function(id, |b| {
            b.iter_batched(
                || {
                    let mut a = adapter.clone();
                    for _ in 0..10 {
                        a.accumulate_gradient(&input, &grad_output, 0.8);
                    }
                    a
                },
                |mut a| {
                    a.apply_gradients(black_box(0.01));
                },
                criterion::BatchSize::SmallInput,
            )
        });
    }

    group.finish();
}
|
||||
|
||||
/// Benchmark the EWC-regularized gradient apply, to measure the overhead
/// of the Fisher-weighted penalty versus plain apply_gradients.
fn bench_lora_ewc_update(c: &mut Criterion) {
    let mut group = c.benchmark_group("lora_ewc_update");
    group.sample_size(100);

    for (in_features, out_features) in [(768, 768), (2048, 2048), (4096, 4096)] {
        let rank = 2;
        let adapter = LoraAdapter::new(in_features, out_features, rank, 4.0);
        let ewc = EwcState::from_adapter(&adapter);
        let input = random_tensor(in_features);
        let grad_output = random_tensor(out_features);

        let id = BenchmarkId::new(format!("dim_{}", in_features), in_features);

        group.throughput(Throughput::Elements(adapter.param_count() as u64));
        group.bench_function(id, |b| {
            b.iter_batched(
                // Untimed: clone and pre-load 10 accumulated gradient steps.
                || {
                    let mut a = adapter.clone();
                    for _ in 0..10 {
                        a.accumulate_gradient(&input, &grad_output, 0.8);
                    }
                    a
                },
                |mut a| {
                    a.apply_gradients_with_ewc(
                        black_box(0.01),
                        black_box(&ewc.fisher_a),
                        black_box(&ewc.fisher_b),
                        black_box(&ewc.optimal_a),
                        black_box(&ewc.optimal_b),
                        black_box(0.1),
                    );
                },
                criterion::BatchSize::SmallInput,
            )
        });
    }

    group.finish();
}
|
||||
|
||||
/// Benchmark a full end-to-end adaptation step: forward pass, gradient
/// accumulation, then gradient application, on a fresh adapter per sample.
fn bench_lora_adaptation_cycle(c: &mut Criterion) {
    let mut group = c.benchmark_group("lora_adaptation_cycle");
    group.sample_size(50);

    // Full adaptation cycle: forward + gradient + apply
    for (in_features, out_features) in [(768, 768), (2048, 2048), (4096, 4096)] {
        let rank = 2;
        let input = random_tensor(in_features);
        let grad_output = random_tensor(out_features);

        let id = BenchmarkId::new(format!("dim_{}", in_features), in_features);

        group.bench_function(id, |b| {
            b.iter_batched(
                // Untimed: fresh adapter so each cycle starts from init.
                || LoraAdapter::new(in_features, out_features, rank, 4.0),
                |mut adapter| {
                    // Forward
                    let _output = adapter.forward(black_box(&input));
                    // Gradient
                    adapter.accumulate_gradient(black_box(&input), black_box(&grad_output), 0.8);
                    // Apply
                    adapter.apply_gradients(black_box(0.01));
                },
                criterion::BatchSize::SmallInput,
            )
        });
    }

    group.finish();
}
|
||||
|
||||
/// Benchmark forward latency for adapter shapes of varying memory
/// footprint; benchmark names embed the weight size in KB.
fn bench_lora_memory_footprint(c: &mut Criterion) {
    let mut group = c.benchmark_group("lora_memory");
    group.sample_size(100);

    // Test memory efficiency at different scales
    let configs = [
        ("rank1_768", 768, 768, 1),
        ("rank2_768", 768, 768, 2),
        ("rank1_4096", 4096, 4096, 1),
        ("rank2_4096", 4096, 4096, 2),
        ("rank2_4096x11008", 4096, 11008, 2), // MLP-like
    ];

    for (name, in_features, out_features, rank) in configs {
        let adapter = LoraAdapter::new(in_features, out_features, rank, 4.0);
        let input = random_tensor(in_features);

        let memory_bytes = adapter.memory_bytes();

        let id = BenchmarkId::new(format!("{}_{}KB", name, memory_bytes / 1024), memory_bytes);

        group.throughput(Throughput::Bytes(memory_bytes as u64));
        group.bench_function(id, |b| b.iter(|| adapter.forward(black_box(&input))));
    }

    group.finish();
}
|
||||
|
||||
fn bench_ewc_fisher_update(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("ewc_fisher_update");
|
||||
group.sample_size(100);
|
||||
|
||||
for (in_features, out_features) in [(768, 768), (2048, 2048), (4096, 4096)] {
|
||||
let rank = 2;
|
||||
let adapter = LoraAdapter::new(in_features, out_features, rank, 4.0);
|
||||
let mut ewc = EwcState::from_adapter(&adapter);
|
||||
let grad_a = random_tensor(in_features * rank);
|
||||
let grad_b = random_tensor(rank * out_features);
|
||||
|
||||
let id = BenchmarkId::new(format!("dim_{}", in_features), in_features);
|
||||
|
||||
group.throughput(Throughput::Elements(adapter.param_count() as u64));
|
||||
group.bench_function(id, |b| {
|
||||
b.iter(|| {
|
||||
ewc.update_fisher(black_box(&grad_a), black_box(&grad_b), 0.9);
|
||||
})
|
||||
});
|
||||
}
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
fn bench_lora_vs_dense(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("lora_vs_dense_overhead");
|
||||
group.sample_size(50);
|
||||
|
||||
// Compare LoRA overhead vs dense matmul
|
||||
let dim = 4096;
|
||||
let rank = 2;
|
||||
|
||||
let adapter = LoraAdapter::new(dim, dim, rank, 4.0);
|
||||
let input = random_tensor(dim);
|
||||
|
||||
// LoRA forward
|
||||
group.bench_function(BenchmarkId::new("lora_rank2", dim), |b| {
|
||||
b.iter(|| adapter.forward(black_box(&input)))
|
||||
});
|
||||
|
||||
// Equivalent dense GEMV (what LoRA replaces)
|
||||
let dense_weight = random_tensor(dim * dim);
|
||||
|
||||
group.bench_function(BenchmarkId::new("dense_equivalent", dim), |b| {
|
||||
b.iter(|| {
|
||||
let mut dense_output = vec![0.0f32; dim];
|
||||
for i in 0..dim {
|
||||
let mut sum = 0.0f32;
|
||||
for j in 0..dim {
|
||||
sum += input[j] * dense_weight[j * dim + i];
|
||||
}
|
||||
dense_output[i] = sum;
|
||||
}
|
||||
black_box(dense_output)
|
||||
})
|
||||
});
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
fn bench_multiple_adapters(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("multiple_adapters");
|
||||
group.sample_size(50);
|
||||
|
||||
// Test applying multiple LoRA adapters (Q, K, V, O projections)
|
||||
let dim = 4096;
|
||||
let rank = 2;
|
||||
|
||||
let adapters: Vec<LoraAdapter> = (0..4)
|
||||
.map(|_| LoraAdapter::new(dim, dim, rank, 4.0))
|
||||
.collect();
|
||||
let input = random_tensor(dim);
|
||||
|
||||
group.bench_function(BenchmarkId::new("4_adapters_sequential", 4), |b| {
|
||||
b.iter(|| {
|
||||
let mut outputs: Vec<Vec<f32>> = Vec::with_capacity(4);
|
||||
for adapter in &adapters {
|
||||
outputs.push(adapter.forward(black_box(&input)));
|
||||
}
|
||||
outputs
|
||||
})
|
||||
});
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
// Register every LoRA/EWC benchmark entry point defined above with Criterion.
criterion_group!(
    benches,
    bench_lora_forward,
    bench_lora_forward_simd,
    bench_lora_forward_batch,
    bench_lora_gradient_accumulation,
    bench_lora_apply_gradients,
    bench_lora_ewc_update,
    bench_lora_adaptation_cycle,
    bench_lora_memory_footprint,
    bench_ewc_fisher_update,
    bench_lora_vs_dense,
    bench_multiple_adapters,
);

criterion_main!(benches);
|
||||
1278
vendor/ruvector/crates/ruvllm/benches/matmul_bench.rs
vendored
Normal file
1278
vendor/ruvector/crates/ruvllm/benches/matmul_bench.rs
vendored
Normal file
File diff suppressed because it is too large
Load Diff
596
vendor/ruvector/crates/ruvllm/benches/metal_bench.rs
vendored
Normal file
596
vendor/ruvector/crates/ruvllm/benches/metal_bench.rs
vendored
Normal file
@@ -0,0 +1,596 @@
|
||||
#![allow(
|
||||
clippy::all,
|
||||
unused_imports,
|
||||
unused_variables,
|
||||
dead_code,
|
||||
unused_mut,
|
||||
unused_assignments,
|
||||
non_camel_case_types,
|
||||
clippy::approx_constant,
|
||||
unexpected_cfgs,
|
||||
unused_must_use,
|
||||
unused_parens
|
||||
)]
|
||||
//! Metal GPU acceleration benchmarks
|
||||
//!
|
||||
//! Benchmarks Metal compute shaders for LLM operations.
|
||||
//! Only runs on macOS with `metal-compute` feature enabled.
|
||||
|
||||
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
|
||||
|
||||
#[cfg(all(target_os = "macos", feature = "metal-compute"))]
|
||||
use ruvllm::kernels::AttentionConfig;
|
||||
#[cfg(all(target_os = "macos", feature = "metal-compute"))]
|
||||
use ruvllm::metal::{MetalConfig, MetalContext};
|
||||
|
||||
#[cfg(all(target_os = "macos", feature = "metal-compute"))]
/// Benchmarks `MetalContext::flash_attention` over decode- and prefill-style
/// shapes (query length 1 or 4, KV length 512–4096) with a GQA layout of
/// 32 query heads / 8 KV heads / head_dim 128.
///
/// NOTE(review): `scale: 0.0` presumably tells the kernel to use the default
/// 1/sqrt(head_dim) softmax scale — confirm against `AttentionConfig` docs.
fn bench_flash_attention_metal(c: &mut Criterion) {
    // Metal may be unavailable (e.g. CI without a GPU); skip quietly.
    let ctx = match MetalContext::new(MetalConfig::default()) {
        Ok(ctx) => ctx,
        Err(e) => {
            eprintln!("Failed to create Metal context: {}", e);
            return;
        }
    };

    let mut group = c.benchmark_group("metal_flash_attention");

    for (seq_len, kv_len) in [(1, 512), (1, 2048), (1, 4096), (4, 512), (4, 2048)] {
        let config = AttentionConfig {
            num_heads: 32,
            num_kv_heads: 8,
            head_dim: 128,
            max_seq_len: seq_len,
            causal: true,
            scale: 0.0,
        };

        // Deterministic synthetic Q/K/V tensors (no RNG, so runs are stable).
        let query: Vec<f32> = (0..seq_len * config.num_heads * config.head_dim)
            .map(|i| (i as f32) * 0.001)
            .collect();
        let key: Vec<f32> = (0..kv_len * config.num_kv_heads * config.head_dim)
            .map(|i| (i as f32) * 0.001)
            .collect();
        let value: Vec<f32> = (0..kv_len * config.num_kv_heads * config.head_dim)
            .map(|i| (i as f32) * 0.001)
            .collect();

        group.bench_with_input(
            BenchmarkId::new("metal", format!("seq{}_kv{}", seq_len, kv_len)),
            &(&query, &key, &value, &config),
            |b, (q, k, v, cfg)| {
                b.iter(|| {
                    ctx.flash_attention(
                        black_box(*q),
                        black_box(*k),
                        black_box(*v),
                        black_box(*cfg),
                    )
                })
            },
        );
    }

    group.finish();
}
|
||||
|
||||
#[cfg(all(target_os = "macos", feature = "metal-compute"))]
/// Benchmarks f32 square-matrix GEMM (`MetalContext::gemm_f32`) from 128x128
/// up to 2048x2048.
fn bench_gemm_metal(c: &mut Criterion) {
    // Skip quietly if no Metal device is available.
    let ctx = match MetalContext::new(MetalConfig::default()) {
        Ok(ctx) => ctx,
        Err(e) => {
            eprintln!("Failed to create Metal context: {}", e);
            return;
        }
    };

    let mut group = c.benchmark_group("metal_gemm");

    for size in [128, 256, 512, 1024, 2048] {
        let (m, n, k) = (size, size, size);

        // Deterministic synthetic operands.
        let lhs: Vec<f32> = (0..m * k).map(|i| (i as f32) * 0.001).collect();
        let rhs: Vec<f32> = (0..k * n).map(|i| (i as f32) * 0.001).collect();

        group.bench_with_input(
            BenchmarkId::new("metal_f32", format!("{}x{}", size, size)),
            &(&lhs, &rhs, m, n, k),
            |bench, (a, b, m, n, k)| {
                bench.iter(|| ctx.gemm_f32(black_box(*a), black_box(*b), *m, *n, *k))
            },
        );
    }

    group.finish();
}
|
||||
|
||||
#[cfg(all(target_os = "macos", feature = "metal-compute"))]
/// Benchmarks in-place `MetalContext::rms_norm` across common hidden sizes
/// with a fixed batch of 4 rows.
///
/// The timed closure clones the input each iteration because `rms_norm`
/// mutates its buffer; the clone cost is included uniformly for every size,
/// so relative comparisons remain fair.
fn bench_rms_norm_metal(c: &mut Criterion) {
    // Metal may be unavailable (e.g. CI without a GPU); skip quietly.
    let ctx = match MetalContext::new(MetalConfig::default()) {
        Ok(ctx) => ctx,
        Err(e) => {
            eprintln!("Failed to create Metal context: {}", e);
            return;
        }
    };

    let mut group = c.benchmark_group("metal_rms_norm");

    for hidden_size in [1024, 2048, 4096, 8192] {
        let batch_size = 4;
        // Fix: `x` is only ever cloned inside the closure, never mutated
        // through this binding, so it does not need `mut`.
        let x: Vec<f32> = (0..batch_size * hidden_size)
            .map(|i| (i as f32) * 0.001)
            .collect();
        let weight: Vec<f32> = vec![1.0; hidden_size];

        group.bench_with_input(
            BenchmarkId::new("metal", format!("hidden{}", hidden_size)),
            &(hidden_size, batch_size),
            |bench, _| {
                bench.iter(|| {
                    let mut x_clone = x.clone();
                    ctx.rms_norm(black_box(&mut x_clone), black_box(&weight), 1e-6)
                })
            },
        );
    }

    group.finish();
}
|
||||
|
||||
#[cfg(all(target_os = "macos", feature = "metal-compute"))]
/// Benchmarks `MetalContext::apply_rope` (rotary position embedding, theta
/// 10000.0, position offset 0) for 8/16/32 heads at head_dim 128.
///
/// Clones the activation buffer inside the timed closure because `apply_rope`
/// rotates in place.
fn bench_rope_metal(c: &mut Criterion) {
    // Metal may be unavailable (e.g. CI without a GPU); skip quietly.
    let ctx = match MetalContext::new(MetalConfig::default()) {
        Ok(ctx) => ctx,
        Err(e) => {
            eprintln!("Failed to create Metal context: {}", e);
            return;
        }
    };

    let mut group = c.benchmark_group("metal_rope");

    for num_heads in [8, 16, 32] {
        let head_dim = 128;
        let batch_size = 4;
        // Fix: `x` is only cloned inside the closure, so the binding does
        // not need `mut`.
        let x: Vec<f32> = (0..batch_size * num_heads * head_dim)
            .map(|i| (i as f32) * 0.001)
            .collect();

        group.bench_with_input(
            BenchmarkId::new("metal", format!("heads{}", num_heads)),
            &(num_heads, head_dim, batch_size),
            |bench, &(nh, hd, bs)| {
                bench.iter(|| {
                    let mut x_clone = x.clone();
                    ctx.apply_rope(black_box(&mut x_clone), 0, nh, hd, 10000.0)
                })
            },
        );
    }

    group.finish();
}
|
||||
|
||||
// ============ M4 Pro Optimized Benchmarks ============
|
||||
|
||||
#[cfg(all(target_os = "macos", feature = "metal-compute"))]
/// Benchmarks half-precision GEMM, comparing the standard `gemm_f16` path
/// against the M4 Pro tiled `gemm_optimized` path, for square sizes 128–4096.
///
/// Returns early (with a diagnostic) when the context reports no M4 Pro
/// optimizations, so the comparison is only produced where both paths exist.
fn bench_optimized_gemm_metal(c: &mut Criterion) {
    // Metal may be unavailable (e.g. CI without a GPU); skip quietly.
    let ctx = match MetalContext::new(MetalConfig::default()) {
        Ok(ctx) => ctx,
        Err(e) => {
            eprintln!("Failed to create Metal context: {}", e);
            return;
        }
    };

    if !ctx.has_m4_pro_optimizations() {
        eprintln!("M4 Pro optimizations not available, skipping optimized GEMM benchmark");
        return;
    }

    // Surface which optimized kernels are present in the benchmark log.
    println!(
        "Available optimizations: {:?}",
        ctx.available_optimizations()
    );

    let mut group = c.benchmark_group("metal_gemm_optimized");

    for size in [128, 256, 512, 1024, 2048, 4096] {
        let m = size;
        let n = size;
        let k = size;

        // Deterministic f16 operands built via half-precision conversion.
        let a: Vec<half::f16> = (0..m * k)
            .map(|i| half::f16::from_f32((i as f32) * 0.001))
            .collect();
        let b: Vec<half::f16> = (0..k * n)
            .map(|i| half::f16::from_f32((i as f32) * 0.001))
            .collect();

        // Benchmark standard GEMM
        group.bench_with_input(
            BenchmarkId::new("standard_f16", format!("{}x{}", size, size)),
            &(&a, &b, m, n, k),
            |bench, (a, b, m, n, k)| {
                bench.iter(|| ctx.gemm_f16(black_box(*a), black_box(*b), *m, *n, *k))
            },
        );

        // Benchmark M4 Pro optimized GEMM (BM=128, BN=128, BK=32)
        group.bench_with_input(
            BenchmarkId::new("m4_optimized", format!("{}x{}", size, size)),
            &(&a, &b, m, n, k),
            |bench, (a, b, m, n, k)| {
                bench.iter(|| ctx.gemm_optimized(black_box(*a), black_box(*b), *m, *n, *k))
            },
        );
    }

    group.finish();
}
|
||||
|
||||
#[cfg(all(target_os = "macos", feature = "metal-compute"))]
/// Compares the legacy `flash_attention` path against the fused
/// `fused_attention` (Flash Attention 2) kernel across decode and prefill
/// shapes, using a GQA layout of 32 query heads / 8 KV heads / head_dim 128.
///
/// NOTE(review): `scale: 0.0` in `AttentionConfig` presumably selects the
/// default 1/sqrt(head_dim) scale — confirm against the kernel docs.
fn bench_fused_attention_metal(c: &mut Criterion) {
    // Metal may be unavailable (e.g. CI without a GPU); skip quietly.
    let ctx = match MetalContext::new(MetalConfig::default()) {
        Ok(ctx) => ctx,
        Err(e) => {
            eprintln!("Failed to create Metal context: {}", e);
            return;
        }
    };

    let mut group = c.benchmark_group("metal_fused_attention");

    for (seq_len, kv_len) in [
        (1, 512),
        (1, 2048),
        (1, 4096),
        (4, 512),
        (4, 2048),
        (16, 2048),
    ] {
        let num_heads = 32;
        let num_kv_heads = 8;
        let head_dim = 128;

        // Deterministic synthetic Q/K/V tensors shared by both variants so
        // the comparison measures only kernel differences.
        let query: Vec<f32> = (0..seq_len * num_heads * head_dim)
            .map(|i| (i as f32) * 0.001)
            .collect();
        let key: Vec<f32> = (0..kv_len * num_kv_heads * head_dim)
            .map(|i| (i as f32) * 0.001)
            .collect();
        let value: Vec<f32> = (0..kv_len * num_kv_heads * head_dim)
            .map(|i| (i as f32) * 0.001)
            .collect();

        // Standard attention (legacy)
        let config = AttentionConfig {
            num_heads,
            num_kv_heads,
            head_dim,
            max_seq_len: seq_len,
            causal: true,
            scale: 0.0,
        };

        group.bench_with_input(
            BenchmarkId::new("standard", format!("seq{}_kv{}", seq_len, kv_len)),
            &(&query, &key, &value, &config),
            |b, (q, k, v, cfg)| {
                b.iter(|| {
                    ctx.flash_attention(
                        black_box(*q),
                        black_box(*k),
                        black_box(*v),
                        black_box(*cfg),
                    )
                })
            },
        );

        // Fused Flash Attention 2
        group.bench_with_input(
            BenchmarkId::new("fused_fa2", format!("seq{}_kv{}", seq_len, kv_len)),
            &(&query, &key, &value, num_heads, num_kv_heads, head_dim),
            |b, (q, k, v, nh, nkv, hd)| {
                b.iter(|| {
                    ctx.fused_attention(
                        black_box(*q),
                        black_box(*k),
                        black_box(*v),
                        *nh,
                        *nkv,
                        *hd,
                        true,
                    )
                })
            },
        );
    }

    group.finish();
}
|
||||
|
||||
#[cfg(all(target_os = "macos", feature = "metal-compute"))]
/// Compares three residual+normalization strategies across hidden sizes:
/// a manual residual add followed by `rms_norm`, the fused
/// `fused_rmsnorm_residual` kernel, and the fused `fused_layernorm_residual`
/// kernel.
///
/// Skips (with a diagnostic) when the context does not advertise the fused
/// LayerNorm+Residual kernel.
fn bench_fused_norm_residual_metal(c: &mut Criterion) {
    // Metal may be unavailable (e.g. CI without a GPU); skip quietly.
    let ctx = match MetalContext::new(MetalConfig::default()) {
        Ok(ctx) => ctx,
        Err(e) => {
            eprintln!("Failed to create Metal context: {}", e);
            return;
        }
    };

    // Fix: `iter().find(..).is_none()` is the clippy::search_is_some
    // anti-pattern; `!iter().any(..)` states the intent directly.
    if !ctx
        .available_optimizations()
        .iter()
        .any(|&s| s == "fused_layernorm_residual")
    {
        eprintln!("Fused LayerNorm+Residual not available, skipping benchmark");
        return;
    }

    let mut group = c.benchmark_group("metal_fused_norm");

    for hidden_size in [1024, 2048, 4096, 8192] {
        let batch_size = 4;

        // Deterministic activations, residual stream, and affine parameters.
        let x: Vec<f32> = (0..batch_size * hidden_size)
            .map(|i| (i as f32) * 0.001)
            .collect();
        let residual: Vec<f32> = (0..batch_size * hidden_size)
            .map(|i| (i as f32) * 0.0005)
            .collect();
        let weight: Vec<f32> = vec![1.0; hidden_size];
        let bias: Vec<f32> = vec![0.0; hidden_size];

        // Separate RMSNorm
        group.bench_with_input(
            BenchmarkId::new("separate_rmsnorm", format!("hidden{}", hidden_size)),
            &(hidden_size, batch_size),
            |bench, _| {
                bench.iter(|| {
                    let mut x_clone = x.clone();
                    // Add residual manually then normalize
                    for i in 0..x_clone.len() {
                        x_clone[i] += residual[i];
                    }
                    ctx.rms_norm(black_box(&mut x_clone), black_box(&weight), 1e-6)
                })
            },
        );

        // Fused RMSNorm + Residual
        group.bench_with_input(
            BenchmarkId::new("fused_rmsnorm_residual", format!("hidden{}", hidden_size)),
            &(hidden_size, batch_size),
            |bench, _| {
                bench.iter(|| {
                    let mut x_clone = x.clone();
                    ctx.fused_rmsnorm_residual(
                        black_box(&mut x_clone),
                        black_box(&residual),
                        black_box(&weight),
                        1e-6,
                    )
                })
            },
        );

        // Fused LayerNorm + Residual
        group.bench_with_input(
            BenchmarkId::new("fused_layernorm_residual", format!("hidden{}", hidden_size)),
            &(hidden_size, batch_size),
            |bench, _| {
                bench.iter(|| {
                    let mut x_clone = x.clone();
                    ctx.fused_layernorm_residual(
                        black_box(&mut x_clone),
                        black_box(&residual),
                        black_box(&weight),
                        black_box(&bias),
                        1e-6,
                    )
                })
            },
        );
    }

    group.finish();
}
|
||||
|
||||
#[cfg(all(target_os = "macos", feature = "metal-compute"))]
/// Compares applying RoPE and attention as two separate kernel launches
/// (`apply_rope` on Q and K, then `fused_attention`) against the single
/// fused `rope_then_attention` call, at theta 10000.0 and position offset 0.
fn bench_rope_attention_fusion_metal(c: &mut Criterion) {
    // Metal may be unavailable (e.g. CI without a GPU); skip quietly.
    let ctx = match MetalContext::new(MetalConfig::default()) {
        Ok(ctx) => ctx,
        Err(e) => {
            eprintln!("Failed to create Metal context: {}", e);
            return;
        }
    };

    let mut group = c.benchmark_group("metal_rope_attention_fusion");

    for (seq_len, kv_len) in [(1, 512), (1, 2048), (4, 2048)] {
        let num_heads = 32;
        let num_kv_heads = 8;
        let head_dim = 128;
        let rope_theta = 10000.0;

        // Deterministic synthetic Q/K/V tensors shared by both variants.
        let query: Vec<f32> = (0..seq_len * num_heads * head_dim)
            .map(|i| (i as f32) * 0.001)
            .collect();
        let key: Vec<f32> = (0..kv_len * num_kv_heads * head_dim)
            .map(|i| (i as f32) * 0.001)
            .collect();
        let value: Vec<f32> = (0..kv_len * num_kv_heads * head_dim)
            .map(|i| (i as f32) * 0.001)
            .collect();

        // Separate RoPE + Attention (baseline)
        group.bench_with_input(
            BenchmarkId::new("separate", format!("seq{}_kv{}", seq_len, kv_len)),
            &(&query, &key, &value, num_heads, num_kv_heads, head_dim),
            |b, (q, k, v, nh, nkv, hd)| {
                b.iter(|| {
                    // Clones are inside the timed region: the baseline pays
                    // for the in-place RoPE's required copies each iteration.
                    let mut q_clone = (*q).clone();
                    let mut k_clone = (*k).clone();
                    let _ = ctx.apply_rope(&mut q_clone, 0, *nh, *hd, rope_theta);
                    let _ = ctx.apply_rope(&mut k_clone, 0, *nkv, *hd, rope_theta);
                    ctx.fused_attention(
                        black_box(&q_clone),
                        black_box(&k_clone),
                        black_box(*v),
                        *nh,
                        *nkv,
                        *hd,
                        true,
                    )
                })
            },
        );

        // Fused RoPE + Attention
        group.bench_with_input(
            BenchmarkId::new("fused", format!("seq{}_kv{}", seq_len, kv_len)),
            &(&query, &key, &value, num_heads, num_kv_heads, head_dim),
            |b, (q, k, v, nh, nkv, hd)| {
                b.iter(|| {
                    ctx.rope_then_attention(
                        black_box(*q),
                        black_box(*k),
                        black_box(*v),
                        *nh,
                        *nkv,
                        *hd,
                        0,
                        rope_theta,
                        true,
                    )
                })
            },
        );
    }

    group.finish();
}
|
||||
|
||||
#[cfg(all(target_os = "macos", feature = "metal-compute"))]
/// Benchmarks the fused SwiGLU Metal kernel against a scalar CPU baseline
/// (`swish(gate) * up`) for MLP-sized vectors.
///
/// Skips (with a diagnostic) when the context does not advertise the fused
/// SwiGLU kernel.
fn bench_swiglu_metal(c: &mut Criterion) {
    // Metal may be unavailable (e.g. CI without a GPU); skip quietly.
    let ctx = match MetalContext::new(MetalConfig::default()) {
        Ok(ctx) => ctx,
        Err(e) => {
            eprintln!("Failed to create Metal context: {}", e);
            return;
        }
    };

    // Fix: `iter().find(..).is_none()` is the clippy::search_is_some
    // anti-pattern; `!iter().any(..)` states the intent directly.
    if !ctx
        .available_optimizations()
        .iter()
        .any(|&s| s == "fused_swiglu")
    {
        eprintln!("Fused SwiGLU not available, skipping benchmark");
        return;
    }

    let mut group = c.benchmark_group("metal_swiglu");

    // Sizes match common MLP intermediate widths (e.g. Llama 11008/14336).
    for size in [1024, 4096, 11008, 14336] {
        let gate: Vec<f32> = (0..size).map(|i| (i as f32) * 0.001 - 0.5).collect();
        let up: Vec<f32> = (0..size).map(|i| (i as f32) * 0.001).collect();

        // Fused SwiGLU
        group.bench_with_input(
            BenchmarkId::new("fused", format!("size{}", size)),
            &(&gate, &up),
            |b, (g, u)| b.iter(|| ctx.fused_swiglu(black_box(*g), black_box(*u))),
        );

        // CPU baseline for comparison
        group.bench_with_input(
            BenchmarkId::new("cpu_baseline", format!("size{}", size)),
            &(&gate, &up),
            |b, (g, u)| {
                b.iter(|| {
                    let result: Vec<f32> = g
                        .iter()
                        .zip(u.iter())
                        .map(|(&g_val, &u_val)| {
                            // SwiGLU: swish(gate) * up
                            let swish = g_val / (1.0 + (-g_val).exp());
                            swish * u_val
                        })
                        .collect();
                    black_box(result)
                })
            },
        );
    }

    group.finish();
}
|
||||
|
||||
// CPU baseline comparison
|
||||
fn bench_cpu_gemm(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("cpu_gemm");
|
||||
|
||||
for size in [128, 256, 512] {
|
||||
let m = size;
|
||||
let n = size;
|
||||
let k = size;
|
||||
|
||||
let a: Vec<f32> = (0..m * k).map(|i| (i as f32) * 0.001).collect();
|
||||
let b: Vec<f32> = (0..k * n).map(|i| (i as f32) * 0.001).collect();
|
||||
|
||||
group.bench_with_input(
|
||||
BenchmarkId::new("naive", format!("{}x{}", size, size)),
|
||||
&(&a, &b, m, n, k),
|
||||
|bench, (a, b, m, n, k)| {
|
||||
bench.iter(|| {
|
||||
let mut c = vec![0.0f32; *m * *n];
|
||||
for i in 0..*m {
|
||||
for j in 0..*n {
|
||||
let mut sum = 0.0f32;
|
||||
for l in 0..*k {
|
||||
sum += a[i * *k + l] * b[l * *n + j];
|
||||
}
|
||||
c[i * *n + j] = sum;
|
||||
}
|
||||
}
|
||||
black_box(c)
|
||||
})
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
// On macOS with `metal-compute`, register every Metal benchmark plus the CPU
// baseline; everywhere else, only the CPU baseline exists.
#[cfg(all(target_os = "macos", feature = "metal-compute"))]
criterion_group!(
    metal_benches,
    // Legacy benchmarks
    bench_flash_attention_metal,
    bench_gemm_metal,
    bench_rms_norm_metal,
    bench_rope_metal,
    // M4 Pro optimized benchmarks
    bench_optimized_gemm_metal,
    bench_fused_attention_metal,
    bench_fused_norm_residual_metal,
    bench_rope_attention_fusion_metal,
    bench_swiglu_metal,
    // CPU baseline
    bench_cpu_gemm,
);

#[cfg(not(all(target_os = "macos", feature = "metal-compute")))]
criterion_group!(metal_benches, bench_cpu_gemm,);

criterion_main!(metal_benches);
|
||||
648
vendor/ruvector/crates/ruvllm/benches/norm_bench.rs
vendored
Normal file
648
vendor/ruvector/crates/ruvllm/benches/norm_bench.rs
vendored
Normal file
@@ -0,0 +1,648 @@
|
||||
#![allow(
|
||||
clippy::all,
|
||||
unused_imports,
|
||||
unused_variables,
|
||||
dead_code,
|
||||
unused_mut,
|
||||
unused_assignments,
|
||||
non_camel_case_types,
|
||||
clippy::approx_constant,
|
||||
unexpected_cfgs,
|
||||
unused_must_use,
|
||||
unused_parens
|
||||
)]
|
||||
//! Normalization Kernel Benchmarks for M4 Pro
|
||||
//!
|
||||
//! Benchmarks for RMSNorm and LayerNorm implementations.
|
||||
//!
|
||||
//! Performance targets for M4 Pro:
|
||||
//! - RMSNorm (768 dim): <5us
|
||||
//! - RMSNorm (2048 dim): <8us
|
||||
//! - RMSNorm (4096 dim): <10us
|
||||
//! - LayerNorm (4096 dim): <15us
|
||||
|
||||
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
|
||||
use rand::Rng;
|
||||
|
||||
const NEON_LANE_WIDTH: usize = 4;
|
||||
const UNROLL_FACTOR: usize = 4;
|
||||
|
||||
/// RMSNorm with NEON optimization
///
/// Normalizes `x` in place: `x[i] = x[i] / sqrt(mean(x^2) + eps) * weight[i]`.
/// Dispatches to the NEON implementation on aarch64 and to the scalar
/// fallback elsewhere. No-op on empty input.
///
/// `x` and `weight` must have equal lengths (checked only in debug builds).
#[inline(always)]
fn rms_norm_neon(x: &mut [f32], weight: &[f32], eps: f32) {
    debug_assert_eq!(x.len(), weight.len());

    let len = x.len();
    if len == 0 {
        return;
    }

    // SAFETY: NEON is a baseline feature on aarch64, and the impl only
    // accesses indices < x.len(), reading `weight` at the same indices
    // (lengths are equal per the debug assertion above).
    #[cfg(target_arch = "aarch64")]
    unsafe {
        rms_norm_neon_impl(x, weight, eps);
    }

    #[cfg(not(target_arch = "aarch64"))]
    {
        rms_norm_scalar(x, weight, eps);
    }
}
|
||||
|
||||
#[cfg(target_arch = "aarch64")]
#[inline(always)]
/// NEON RMSNorm body: a 4x-unrolled FMA pass to accumulate sum(x^2), then a
/// 4x-unrolled scale pass writing `x * inv_rms * weight` back in place.
///
/// Caller guarantees (see `rms_norm_neon`): `x.len() == weight.len()` and
/// NEON availability. All pointer arithmetic stays below `len`.
unsafe fn rms_norm_neon_impl(x: &mut [f32], weight: &[f32], eps: f32) {
    use std::arch::aarch64::*;

    let len = x.len();
    let x_ptr = x.as_mut_ptr();
    let w_ptr = weight.as_ptr();

    // Four independent accumulators hide FMA latency across the unrolled body.
    let mut sum0 = vdupq_n_f32(0.0);
    let mut sum1 = vdupq_n_f32(0.0);
    let mut sum2 = vdupq_n_f32(0.0);
    let mut sum3 = vdupq_n_f32(0.0);

    // Main loop: 16 floats (4 lanes x 4-way unroll) per iteration.
    let chunks = len / (NEON_LANE_WIDTH * UNROLL_FACTOR);
    let mut idx = 0usize;

    for _ in 0..chunks {
        let v0 = vld1q_f32(x_ptr.add(idx));
        sum0 = vfmaq_f32(sum0, v0, v0);

        let v1 = vld1q_f32(x_ptr.add(idx + 4));
        sum1 = vfmaq_f32(sum1, v1, v1);

        let v2 = vld1q_f32(x_ptr.add(idx + 8));
        sum2 = vfmaq_f32(sum2, v2, v2);

        let v3 = vld1q_f32(x_ptr.add(idx + 12));
        sum3 = vfmaq_f32(sum3, v3, v3);

        idx += 16;
    }

    // Tree-reduce the four accumulators into one vector.
    let sum01 = vaddq_f32(sum0, sum1);
    let sum23 = vaddq_f32(sum2, sum3);
    let sum = vaddq_f32(sum01, sum23);

    // Handle remaining full 4-lane vectors (0..3 of them).
    let remaining_chunks = (len - idx) / NEON_LANE_WIDTH;
    let mut final_sum = sum;
    for _ in 0..remaining_chunks {
        let v = vld1q_f32(x_ptr.add(idx));
        final_sum = vfmaq_f32(final_sum, v, v);
        idx += 4;
    }

    // Horizontal add across lanes, then a scalar tail for the last 0..3 elems.
    let mut sum_sq = vaddvq_f32(final_sum);

    for i in idx..len {
        let v = *x_ptr.add(i);
        sum_sq += v * v;
    }

    // inv_rms = 1 / sqrt(mean(x^2) + eps), broadcast into all four lanes.
    let mean_sq = sum_sq / len as f32;
    let rms = (mean_sq + eps).sqrt();
    let inv_rms = 1.0 / rms;
    let inv_rms_vec = vdupq_n_f32(inv_rms);

    // Second pass, same unrolling: x <- (x * inv_rms) * weight.
    idx = 0;
    for _ in 0..chunks {
        let x0 = vld1q_f32(x_ptr.add(idx));
        let w0 = vld1q_f32(w_ptr.add(idx));
        vst1q_f32(x_ptr.add(idx), vmulq_f32(vmulq_f32(x0, inv_rms_vec), w0));

        let x1 = vld1q_f32(x_ptr.add(idx + 4));
        let w1 = vld1q_f32(w_ptr.add(idx + 4));
        vst1q_f32(
            x_ptr.add(idx + 4),
            vmulq_f32(vmulq_f32(x1, inv_rms_vec), w1),
        );

        let x2 = vld1q_f32(x_ptr.add(idx + 8));
        let w2 = vld1q_f32(w_ptr.add(idx + 8));
        vst1q_f32(
            x_ptr.add(idx + 8),
            vmulq_f32(vmulq_f32(x2, inv_rms_vec), w2),
        );

        let x3 = vld1q_f32(x_ptr.add(idx + 12));
        let w3 = vld1q_f32(w_ptr.add(idx + 12));
        vst1q_f32(
            x_ptr.add(idx + 12),
            vmulq_f32(vmulq_f32(x3, inv_rms_vec), w3),
        );

        idx += 16;
    }

    // Remaining full vectors, then scalar tail, mirroring the first pass.
    for _ in 0..remaining_chunks {
        let x_v = vld1q_f32(x_ptr.add(idx));
        let w_v = vld1q_f32(w_ptr.add(idx));
        vst1q_f32(x_ptr.add(idx), vmulq_f32(vmulq_f32(x_v, inv_rms_vec), w_v));
        idx += 4;
    }

    for i in idx..len {
        *x_ptr.add(i) = *x_ptr.add(i) * inv_rms * *w_ptr.add(i);
    }
}
|
||||
|
||||
/// Scalar reference RMSNorm: `x[i] <- x[i] / sqrt(mean(x^2) + eps) * weight[i]`,
/// in place. Portable fallback for non-aarch64 targets.
#[allow(dead_code)]
fn rms_norm_scalar(x: &mut [f32], weight: &[f32], eps: f32) {
    let n = x.len() as f32;

    // mean(x^2), then its inverse root — the single scale factor applied below.
    let mean_sq = x.iter().map(|v| v * v).sum::<f32>() / n;
    let inv_rms = 1.0 / (mean_sq + eps).sqrt();

    for (xi, w) in x.iter_mut().zip(weight.iter()) {
        *xi = *xi * inv_rms * w;
    }
}
|
||||
|
||||
/// LayerNorm with NEON optimization
///
/// Normalizes `x` in place:
/// `x[i] = (x[i] - mean) / sqrt(var + eps) * weight[i] + bias[i]`.
/// Dispatches to the NEON implementation on aarch64 and to the scalar
/// fallback elsewhere. No-op on empty input.
///
/// `x`, `weight`, and `bias` must have equal lengths (debug-asserted).
#[inline(always)]
fn layer_norm_neon(x: &mut [f32], weight: &[f32], bias: &[f32], eps: f32) {
    debug_assert_eq!(x.len(), weight.len());
    debug_assert_eq!(x.len(), bias.len());

    let len = x.len();
    if len == 0 {
        return;
    }

    // SAFETY: NEON is a baseline feature on aarch64, and the impl only
    // touches indices < x.len(); weight/bias lengths match (debug-asserted).
    #[cfg(target_arch = "aarch64")]
    unsafe {
        layer_norm_neon_impl(x, weight, bias, eps);
    }

    #[cfg(not(target_arch = "aarch64"))]
    {
        layer_norm_scalar(x, weight, bias, eps);
    }
}
|
||||
|
||||
#[cfg(target_arch = "aarch64")]
#[inline(always)]
/// NEON LayerNorm body: one pass accumulates sum(x) and sum(x^2) together
/// (2x-unrolled), variance is derived as E[x^2] - mean^2, and a second
/// 4x-unrolled pass writes `(x - mean) * inv_std * weight + bias` in place.
///
/// NOTE(review): the E[x^2] - mean^2 variance form can lose precision for
/// large-magnitude inputs compared with a two-pass (Welford-style)
/// computation — acceptable for benchmark data, worth confirming for
/// production use.
///
/// Caller guarantees (see `layer_norm_neon`): equal slice lengths and NEON
/// availability; all pointer arithmetic stays below `len`.
unsafe fn layer_norm_neon_impl(x: &mut [f32], weight: &[f32], bias: &[f32], eps: f32) {
    use std::arch::aarch64::*;

    let len = x.len();
    let x_ptr = x.as_mut_ptr();
    let w_ptr = weight.as_ptr();
    let b_ptr = bias.as_ptr();

    // Paired accumulators: sum* collects sum(x), sq* collects sum(x^2).
    let mut sum0 = vdupq_n_f32(0.0);
    let mut sum1 = vdupq_n_f32(0.0);
    let mut sq0 = vdupq_n_f32(0.0);
    let mut sq1 = vdupq_n_f32(0.0);

    // Statistics pass: 8 floats (4 lanes x 2-way unroll) per iteration.
    let chunks = len / (NEON_LANE_WIDTH * 2);
    let mut idx = 0usize;

    for _ in 0..chunks {
        let v0 = vld1q_f32(x_ptr.add(idx));
        sum0 = vaddq_f32(sum0, v0);
        sq0 = vfmaq_f32(sq0, v0, v0);

        let v1 = vld1q_f32(x_ptr.add(idx + 4));
        sum1 = vaddq_f32(sum1, v1);
        sq1 = vfmaq_f32(sq1, v1, v1);

        idx += 8;
    }

    let sum_vec = vaddq_f32(sum0, sum1)
;
    let sq_vec = vaddq_f32(sq0, sq1);

    // Remaining full 4-lane vectors.
    let remaining_chunks = (len - idx) / NEON_LANE_WIDTH;
    let mut final_sum = sum_vec;
    let mut final_sq = sq_vec;
    for _ in 0..remaining_chunks {
        let v = vld1q_f32(x_ptr.add(idx));
        final_sum = vaddq_f32(final_sum, v);
        final_sq = vfmaq_f32(final_sq, v, v);
        idx += 4;
    }

    // Horizontal reductions, then a scalar tail for the last 0..3 elements.
    let mut sum = vaddvq_f32(final_sum);
    let mut sum_sq = vaddvq_f32(final_sq);

    for i in idx..len {
        let v = *x_ptr.add(i);
        sum += v;
        sum_sq += v * v;
    }

    // mean and inv_std from the single-pass statistics.
    let n = len as f32;
    let mean = sum / n;
    let variance = (sum_sq / n) - (mean * mean);
    let inv_std = 1.0 / (variance + eps).sqrt();

    let mean_vec = vdupq_n_f32(mean);
    let inv_std_vec = vdupq_n_f32(inv_std);

    // Normalization pass: 16 floats per iteration; vfmaq computes
    // bias + normalized * weight in one instruction.
    idx = 0;
    let unroll_chunks = len / (NEON_LANE_WIDTH * UNROLL_FACTOR);
    for _ in 0..unroll_chunks {
        let x0 = vld1q_f32(x_ptr.add(idx));
        let n0 = vmulq_f32(vsubq_f32(x0, mean_vec), inv_std_vec);
        let w0 = vld1q_f32(w_ptr.add(idx));
        let b0 = vld1q_f32(b_ptr.add(idx));
        vst1q_f32(x_ptr.add(idx), vfmaq_f32(b0, n0, w0));

        let x1 = vld1q_f32(x_ptr.add(idx + 4));
        let n1 = vmulq_f32(vsubq_f32(x1, mean_vec), inv_std_vec);
        let w1 = vld1q_f32(w_ptr.add(idx + 4));
        let b1 = vld1q_f32(b_ptr.add(idx + 4));
        vst1q_f32(x_ptr.add(idx + 4), vfmaq_f32(b1, n1, w1));

        let x2 = vld1q_f32(x_ptr.add(idx + 8));
        let n2 = vmulq_f32(vsubq_f32(x2, mean_vec), inv_std_vec);
        let w2 = vld1q_f32(w_ptr.add(idx + 8));
        let b2 = vld1q_f32(b_ptr.add(idx + 8));
        vst1q_f32(x_ptr.add(idx + 8), vfmaq_f32(b2, n2, w2));

        let x3 = vld1q_f32(x_ptr.add(idx + 12));
        let n3 = vmulq_f32(vsubq_f32(x3, mean_vec), inv_std_vec);
        let w3 = vld1q_f32(w_ptr.add(idx + 12));
        let b3 = vld1q_f32(b_ptr.add(idx + 12));
        vst1q_f32(x_ptr.add(idx + 12), vfmaq_f32(b3, n3, w3));

        idx += 16;
    }

    // NOTE: recomputed for the 16-wide unroll, distinct from the 4-wide
    // `remaining_chunks` used in the statistics pass above.
    let remaining = (len - idx) / NEON_LANE_WIDTH;
    for _ in 0..remaining {
        let x_v = vld1q_f32(x_ptr.add(idx));
        let n_v = vmulq_f32(vsubq_f32(x_v, mean_vec), inv_std_vec);
        let w_v = vld1q_f32(w_ptr.add(idx));
        let b_v = vld1q_f32(b_ptr.add(idx));
        vst1q_f32(x_ptr.add(idx), vfmaq_f32(b_v, n_v, w_v));
        idx += 4;
    }

    // Scalar tail.
    for i in idx..len {
        let normalized = (*x_ptr.add(i) - mean) * inv_std;
        *x_ptr.add(i) = normalized * *w_ptr.add(i) + *b_ptr.add(i);
    }
}
|
||||
|
||||
/// Scalar reference LayerNorm:
/// `x[i] <- (x[i] - mean) / sqrt(var + eps) * weight[i] + bias[i]`, in place.
/// Portable fallback for non-aarch64 targets.
#[allow(dead_code)]
fn layer_norm_scalar(x: &mut [f32], weight: &[f32], bias: &[f32], eps: f32) {
    let n = x.len() as f32;

    // Population mean and variance (divide by n, not n-1).
    let mean = x.iter().sum::<f32>() / n;
    let variance = x.iter().map(|v| (v - mean).powi(2)).sum::<f32>() / n;
    let inv_std = 1.0 / (variance + eps).sqrt();

    for ((v, &w), &b) in x.iter_mut().zip(weight).zip(bias) {
        *v = (*v - mean) * inv_std * w + b;
    }
}
|
||||
|
||||
fn batched_rms_norm_neon(x: &mut [f32], weight: &[f32], batch_size: usize, dim: usize, eps: f32) {
|
||||
debug_assert_eq!(x.len(), batch_size * dim);
|
||||
debug_assert_eq!(weight.len(), dim);
|
||||
|
||||
for b in 0..batch_size {
|
||||
let offset = b * dim;
|
||||
rms_norm_neon(&mut x[offset..offset + dim], weight, eps);
|
||||
}
|
||||
}
|
||||
|
||||
fn batched_layer_norm_neon(
|
||||
x: &mut [f32],
|
||||
weight: &[f32],
|
||||
bias: &[f32],
|
||||
batch_size: usize,
|
||||
dim: usize,
|
||||
eps: f32,
|
||||
) {
|
||||
debug_assert_eq!(x.len(), batch_size * dim);
|
||||
debug_assert_eq!(weight.len(), dim);
|
||||
debug_assert_eq!(bias.len(), dim);
|
||||
|
||||
for b in 0..batch_size {
|
||||
let offset = b * dim;
|
||||
layer_norm_neon(&mut x[offset..offset + dim], weight, bias, eps);
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns sqrt(mean(x^2)) of the slice without modifying it (no epsilon).
/// Dispatches to NEON on aarch64, scalar elsewhere; returns 0.0 for an
/// empty slice (the NEON path checks; the scalar path yields NaN — callers
/// here pass non-empty data).
#[inline(always)]
fn compute_rms(x: &[f32]) -> f32 {
    // SAFETY: NEON is a baseline feature on aarch64; the impl only reads
    // indices < x.len().
    #[cfg(target_arch = "aarch64")]
    unsafe {
        compute_rms_neon_impl(x)
    }

    #[cfg(not(target_arch = "aarch64"))]
    {
        compute_rms_scalar(x)
    }
}
|
||||
|
||||
#[cfg(target_arch = "aarch64")]
#[inline(always)]
/// NEON sum-of-squares reduction (single accumulator, no unrolling) followed
/// by a scalar tail; returns sqrt(sum(x^2) / len), or 0.0 for empty input.
///
/// Caller guarantees NEON availability; all loads stay below `x.len()`.
unsafe fn compute_rms_neon_impl(x: &[f32]) -> f32 {
    use std::arch::aarch64::*;

    let len = x.len();
    if len == 0 {
        return 0.0;
    }

    let x_ptr = x.as_ptr();
    let mut sum = vdupq_n_f32(0.0);

    // Full 4-lane vectors: accumulate x*x via fused multiply-add.
    let chunks = len / NEON_LANE_WIDTH;
    let mut idx = 0usize;

    for _ in 0..chunks {
        let v = vld1q_f32(x_ptr.add(idx));
        sum = vfmaq_f32(sum, v, v);
        idx += 4;
    }

    // Horizontal add across lanes, then scalar tail for the last 0..3 elems.
    let mut sum_sq = vaddvq_f32(sum);

    for i in idx..len {
        let v = *x_ptr.add(i);
        sum_sq += v * v;
    }

    (sum_sq / len as f32).sqrt()
}
|
||||
|
||||
/// Scalar root-mean-square fallback: `sqrt(mean(x_i^2))`.
///
/// Returns 0.0 for empty input to match `compute_rms_neon_impl`; the naive
/// `0.0 / 0.0` division would otherwise yield NaN.
#[allow(dead_code)]
fn compute_rms_scalar(x: &[f32]) -> f32 {
    if x.is_empty() {
        return 0.0;
    }
    let sum_sq: f32 = x.iter().map(|v| v * v).sum();
    (sum_sq / x.len() as f32).sqrt()
}
|
||||
|
||||
// Helper function to generate random tensor data
|
||||
fn random_tensor(size: usize) -> Vec<f32> {
|
||||
let mut rng = rand::thread_rng();
|
||||
(0..size).map(|_| rng.gen_range(-1.0..1.0)).collect()
|
||||
}
|
||||
|
||||
// === Benchmark Functions ===
|
||||
|
||||
fn bench_rms_norm(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("rms_norm");
|
||||
group.sample_size(100);
|
||||
|
||||
// Test common hidden sizes used in LLMs
|
||||
for dim in [768, 1024, 2048, 4096, 8192] {
|
||||
let mut x = random_tensor(dim);
|
||||
let weight = random_tensor(dim);
|
||||
let eps = 1e-6;
|
||||
|
||||
let id = BenchmarkId::new(format!("dim_{}", dim), dim);
|
||||
|
||||
group.throughput(Throughput::Elements(dim as u64));
|
||||
group.bench_function(id, |b| {
|
||||
b.iter(|| {
|
||||
let mut x_copy = x.clone();
|
||||
rms_norm_neon(black_box(&mut x_copy), black_box(&weight), eps);
|
||||
x_copy
|
||||
})
|
||||
});
|
||||
}
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
fn bench_layer_norm(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("layer_norm");
|
||||
group.sample_size(100);
|
||||
|
||||
for dim in [768, 1024, 2048, 4096, 8192] {
|
||||
let mut x = random_tensor(dim);
|
||||
let weight = random_tensor(dim);
|
||||
let bias = random_tensor(dim);
|
||||
let eps = 1e-6;
|
||||
|
||||
let id = BenchmarkId::new(format!("dim_{}", dim), dim);
|
||||
|
||||
group.throughput(Throughput::Elements(dim as u64));
|
||||
group.bench_function(id, |b| {
|
||||
b.iter(|| {
|
||||
let mut x_copy = x.clone();
|
||||
layer_norm_neon(
|
||||
black_box(&mut x_copy),
|
||||
black_box(&weight),
|
||||
black_box(&bias),
|
||||
eps,
|
||||
);
|
||||
x_copy
|
||||
})
|
||||
});
|
||||
}
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
fn bench_batched_rms_norm(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("batched_rms_norm");
|
||||
group.sample_size(50);
|
||||
|
||||
for batch_size in [1, 8, 32, 128] {
|
||||
for dim in [768, 2048, 4096] {
|
||||
let mut x = random_tensor(batch_size * dim);
|
||||
let weight = random_tensor(dim);
|
||||
let eps = 1e-6;
|
||||
|
||||
let id = BenchmarkId::new(
|
||||
format!("batch_{}_dim_{}", batch_size, dim),
|
||||
batch_size * dim,
|
||||
);
|
||||
|
||||
group.throughput(Throughput::Elements((batch_size * dim) as u64));
|
||||
group.bench_function(id, |b| {
|
||||
b.iter(|| {
|
||||
let mut x_copy = x.clone();
|
||||
batched_rms_norm_neon(
|
||||
black_box(&mut x_copy),
|
||||
black_box(&weight),
|
||||
batch_size,
|
||||
dim,
|
||||
eps,
|
||||
);
|
||||
x_copy
|
||||
})
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
fn bench_batched_layer_norm(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("batched_layer_norm");
|
||||
group.sample_size(50);
|
||||
|
||||
for batch_size in [1, 8, 32, 128] {
|
||||
for dim in [768, 2048, 4096] {
|
||||
let mut x = random_tensor(batch_size * dim);
|
||||
let weight = random_tensor(dim);
|
||||
let bias = random_tensor(dim);
|
||||
let eps = 1e-6;
|
||||
|
||||
let id = BenchmarkId::new(
|
||||
format!("batch_{}_dim_{}", batch_size, dim),
|
||||
batch_size * dim,
|
||||
);
|
||||
|
||||
group.throughput(Throughput::Elements((batch_size * dim) as u64));
|
||||
group.bench_function(id, |b| {
|
||||
b.iter(|| {
|
||||
let mut x_copy = x.clone();
|
||||
batched_layer_norm_neon(
|
||||
black_box(&mut x_copy),
|
||||
black_box(&weight),
|
||||
black_box(&bias),
|
||||
batch_size,
|
||||
dim,
|
||||
eps,
|
||||
);
|
||||
x_copy
|
||||
})
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
fn bench_rms_vs_layer_norm(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("rms_vs_layer");
|
||||
group.sample_size(100);
|
||||
|
||||
for dim in [768, 2048, 4096] {
|
||||
let x = random_tensor(dim);
|
||||
let weight = random_tensor(dim);
|
||||
let bias = random_tensor(dim);
|
||||
let eps = 1e-6;
|
||||
|
||||
group.bench_function(BenchmarkId::new("rms_norm", dim), |b| {
|
||||
b.iter(|| {
|
||||
let mut x_copy = x.clone();
|
||||
rms_norm_neon(black_box(&mut x_copy), black_box(&weight), eps);
|
||||
x_copy
|
||||
})
|
||||
});
|
||||
|
||||
group.bench_function(BenchmarkId::new("layer_norm", dim), |b| {
|
||||
b.iter(|| {
|
||||
let mut x_copy = x.clone();
|
||||
layer_norm_neon(
|
||||
black_box(&mut x_copy),
|
||||
black_box(&weight),
|
||||
black_box(&bias),
|
||||
eps,
|
||||
);
|
||||
x_copy
|
||||
})
|
||||
});
|
||||
}
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
fn bench_compute_rms(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("compute_rms");
|
||||
group.sample_size(100);
|
||||
|
||||
for dim in [768, 2048, 4096, 8192] {
|
||||
let x = random_tensor(dim);
|
||||
|
||||
let id = BenchmarkId::new(format!("dim_{}", dim), dim);
|
||||
|
||||
group.throughput(Throughput::Elements(dim as u64));
|
||||
group.bench_function(id, |b| b.iter(|| compute_rms(black_box(&x))));
|
||||
}
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
fn bench_norm_memory_throughput(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("norm_memory_throughput");
|
||||
group.sample_size(50);
|
||||
|
||||
// Test memory bandwidth at different sizes
|
||||
for dim in [256, 512, 1024, 2048, 4096, 8192, 16384] {
|
||||
let x = random_tensor(dim);
|
||||
let weight = random_tensor(dim);
|
||||
let eps = 1e-6;
|
||||
|
||||
// Memory: read x (dim * 4), read weight (dim * 4), write x (dim * 4)
|
||||
let memory_bytes = dim * 4 * 3;
|
||||
|
||||
let id = BenchmarkId::new(format!("dim_{}", dim), dim);
|
||||
|
||||
group.throughput(Throughput::Bytes(memory_bytes as u64));
|
||||
group.bench_function(id, |b| {
|
||||
b.iter(|| {
|
||||
let mut x_copy = x.clone();
|
||||
rms_norm_neon(black_box(&mut x_copy), black_box(&weight), eps);
|
||||
x_copy
|
||||
})
|
||||
});
|
||||
}
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
fn bench_norm_llm_sizes(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("norm_llm_sizes");
|
||||
group.sample_size(50);
|
||||
|
||||
// Real-world LLM hidden sizes
|
||||
let llm_configs = [
|
||||
("llama2_7b", 4096),
|
||||
("llama2_13b", 5120),
|
||||
("llama2_70b", 8192),
|
||||
("llama3_8b", 4096),
|
||||
("mistral_7b", 4096),
|
||||
("qwen2_7b", 3584),
|
||||
];
|
||||
|
||||
for (name, dim) in llm_configs {
|
||||
let x = random_tensor(dim);
|
||||
let weight = random_tensor(dim);
|
||||
let eps = 1e-6;
|
||||
|
||||
let id = BenchmarkId::new(name, dim);
|
||||
|
||||
group.throughput(Throughput::Elements(dim as u64));
|
||||
group.bench_function(id, |b| {
|
||||
b.iter(|| {
|
||||
let mut x_copy = x.clone();
|
||||
rms_norm_neon(black_box(&mut x_copy), black_box(&weight), eps);
|
||||
x_copy
|
||||
})
|
||||
});
|
||||
}
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
// Register every normalization benchmark with Criterion.
criterion_group!(
    benches,
    bench_rms_norm,
    bench_layer_norm,
    bench_batched_rms_norm,
    bench_batched_layer_norm,
    bench_rms_vs_layer_norm,
    bench_compute_rms,
    bench_norm_memory_throughput,
    bench_norm_llm_sizes,
);

// Expands into the `fn main` that runs the group above.
criterion_main!(benches);
|
||||
716
vendor/ruvector/crates/ruvllm/benches/rope_bench.rs
vendored
Normal file
716
vendor/ruvector/crates/ruvllm/benches/rope_bench.rs
vendored
Normal file
@@ -0,0 +1,716 @@
|
||||
#![allow(
|
||||
clippy::all,
|
||||
unused_imports,
|
||||
unused_variables,
|
||||
dead_code,
|
||||
unused_mut,
|
||||
unused_assignments,
|
||||
non_camel_case_types,
|
||||
clippy::approx_constant,
|
||||
unexpected_cfgs,
|
||||
unused_must_use,
|
||||
unused_parens
|
||||
)]
|
||||
//! RoPE (Rotary Position Embedding) Benchmarks for M4 Pro
|
||||
//!
|
||||
//! Benchmarks for RoPE operations including:
|
||||
//! - Standard RoPE application
|
||||
//! - Table precomputation
|
||||
//! - Scaled RoPE variants (NTK, YaRN)
|
||||
//!
|
||||
//! Performance targets for M4 Pro:
|
||||
//! - RoPE apply (128 head_dim, 1 token): <5us
|
||||
//! - RoPE apply (128 head_dim, 32 tokens): <50us
|
||||
//! - Table precomputation (4096 seq): <1ms
|
||||
|
||||
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
|
||||
use rand::Rng;
|
||||
|
||||
const NEON_LANE_WIDTH: usize = 4;
|
||||
const UNROLL_FACTOR: usize = 4;
|
||||
|
||||
/// RoPE configuration
#[derive(Clone, Copy)]
struct RopeConfig {
    base: f32,               // rotary frequency base (theta), e.g. 10000.0
    head_dim: usize,         // per-head embedding dimension (rotated in pairs, so even)
    max_seq_len: usize,      // number of positions tables are generated for
    scaling_factor: f32,     // linear position-interpolation factor; 1.0 = disabled
    ntk_aware: bool,         // enable NTK-aware base rescaling for context extension
    original_max_len: usize, // pre-extension context length used by NTK scaling
}
|
||||
|
||||
impl Default for RopeConfig {
    // Llama-2-style defaults: base 10000, 128-dim heads, 4k context, no scaling.
    fn default() -> Self {
        Self {
            base: 10000.0,
            head_dim: 128,
            max_seq_len: 4096,
            scaling_factor: 1.0,
            ntk_aware: false,
            original_max_len: 4096,
        }
    }
}
||||
|
||||
impl RopeConfig {
    /// Llama-2-family configuration (base 10000).
    fn llama2(head_dim: usize, max_seq_len: usize) -> Self {
        Self {
            base: 10000.0,
            head_dim,
            max_seq_len,
            ..Default::default()
        }
    }

    /// Llama-3-family configuration (base 500000, used for longer native context).
    fn llama3(head_dim: usize, max_seq_len: usize) -> Self {
        Self {
            base: 500000.0,
            head_dim,
            max_seq_len,
            ..Default::default()
        }
    }

    /// Builder: enable NTK-aware base rescaling relative to the model's
    /// original (pre-extension) context length.
    fn with_ntk(mut self, original_max_len: usize) -> Self {
        self.ntk_aware = true;
        self.original_max_len = original_max_len;
        self
    }

    /// Builder: set the linear position-interpolation factor.
    fn with_scaling(mut self, scaling_factor: f32) -> Self {
        self.scaling_factor = scaling_factor;
        self
    }

    /// Base actually used for table generation.
    ///
    /// With NTK-aware extension enabled and a context longer than the
    /// original, the base is inflated by `scale^(d / (d - 2))`, which
    /// stretches the low-frequency wavelengths to cover the extended range.
    fn effective_base(&self) -> f32 {
        if self.ntk_aware && self.max_seq_len > self.original_max_len {
            let scale = self.max_seq_len as f32 / self.original_max_len as f32;
            self.base * scale.powf((self.head_dim as f32) / (self.head_dim as f32 - 2.0))
        } else {
            self.base
        }
    }
}
|
||||
|
||||
/// Precomputed cos/sin lookup tables for table-driven RoPE.
#[derive(Clone)]
struct RopeTables {
    cos: Vec<f32>,      // row-major [max_seq_len x half_dim]: cos(pos * freq_i)
    sin: Vec<f32>,      // row-major [max_seq_len x half_dim]: sin(pos * freq_i)
    half_dim: usize,    // rotation pairs per token (head_dim / 2)
    max_seq_len: usize, // number of precomputed position rows
}
|
||||
|
||||
impl RopeTables {
|
||||
fn get(&self, position: usize) -> (&[f32], &[f32]) {
|
||||
let offset = position * self.half_dim;
|
||||
(
|
||||
&self.cos[offset..offset + self.half_dim],
|
||||
&self.sin[offset..offset + self.half_dim],
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// Precompute RoPE cos/sin tables for `max_seq_len` positions.
///
/// Returns two row-major `[max_seq_len x head_dim/2]` vectors where entry
/// `(pos, i)` holds `cos(pos * base^(-2i/head_dim))` / `sin(...)`.
fn precompute_rope_tables(max_seq_len: usize, head_dim: usize, base: f32) -> (Vec<f32>, Vec<f32>) {
    let half_dim = head_dim / 2;

    // One inverse frequency per rotation pair.
    let inv_freq: Vec<f32> = (0..half_dim)
        .map(|i| base.powf((2 * i) as f32 / head_dim as f32).recip())
        .collect();

    let mut cos_table = Vec::with_capacity(max_seq_len * half_dim);
    let mut sin_table = Vec::with_capacity(max_seq_len * half_dim);

    for pos in 0..max_seq_len {
        for &freq in &inv_freq {
            let theta = pos as f32 * freq;
            cos_table.push(theta.cos());
            sin_table.push(theta.sin());
        }
    }

    (cos_table, sin_table)
}
|
||||
|
||||
/// Build cos/sin tables for `config`, applying the NTK-adjusted base and/or
/// linear position interpolation.
///
/// With `scaling_factor != 1.0`, each output position `pos` samples the
/// unscaled tables at the fractional position `pos / scaling_factor`,
/// linearly interpolating between the two neighboring integer rows (the
/// upper neighbor is clamped to the last row).
fn precompute_rope_tables_with_config(config: &RopeConfig) -> RopeTables {
    let base = config.effective_base();
    let (cos, sin) = precompute_rope_tables(config.max_seq_len, config.head_dim, base);

    let (cos, sin) = if config.scaling_factor != 1.0 {
        let half_dim = config.head_dim / 2;
        let mut scaled_cos = vec![0.0; config.max_seq_len * half_dim];
        let mut scaled_sin = vec![0.0; config.max_seq_len * half_dim];

        for pos in 0..config.max_seq_len {
            // Fractional source position in the unscaled tables.
            let scaled_pos = pos as f32 / config.scaling_factor;
            let lower_pos = scaled_pos.floor() as usize;
            let upper_pos = (lower_pos + 1).min(config.max_seq_len - 1);
            let frac = scaled_pos - lower_pos as f32;

            let offset = pos * half_dim;
            let lower_offset = lower_pos * half_dim;
            let upper_offset = upper_pos * half_dim;

            // NOTE(review): this lerps cos/sin values rather than the angle;
            // slightly off a true rotation but adequate for benchmarking.
            for i in 0..half_dim {
                scaled_cos[offset + i] =
                    cos[lower_offset + i] * (1.0 - frac) + cos[upper_offset + i] * frac;
                scaled_sin[offset + i] =
                    sin[lower_offset + i] * (1.0 - frac) + sin[upper_offset + i] * frac;
            }
        }

        (scaled_cos, scaled_sin)
    } else {
        (cos, sin)
    };

    RopeTables {
        cos,
        sin,
        half_dim: config.head_dim / 2,
        max_seq_len: config.max_seq_len,
    }
}
|
||||
|
||||
/// Apply interleaved RoPE in place to `x` — `positions.len()` tokens laid out
/// contiguously, `head_dim` floats each — computing frequencies on the fly.
///
/// `inv_freq[i] = base^(-2i / head_dim)` is rebuilt on every call; use
/// `apply_rope_with_tables` to amortize that cost across calls.
#[inline(always)]
fn apply_rope_neon(x: &mut [f32], positions: &[usize], head_dim: usize, base: f32) {
    let half_dim = head_dim / 2;
    let num_tokens = positions.len();
    let stride = head_dim;

    debug_assert_eq!(x.len(), num_tokens * head_dim);

    let inv_freq: Vec<f32> = (0..half_dim)
        .map(|i| 1.0 / base.powf((2 * i) as f32 / head_dim as f32))
        .collect();

    #[cfg(target_arch = "aarch64")]
    // SAFETY: cfg gate guarantees aarch64; all accesses in the impl stay
    // within `x` because x.len() == num_tokens * stride (asserted above).
    unsafe {
        apply_rope_neon_impl(x, positions, &inv_freq, half_dim, stride);
    }

    #[cfg(not(target_arch = "aarch64"))]
    {
        apply_rope_scalar(x, positions, &inv_freq, half_dim, stride);
    }
}
|
||||
|
||||
/// aarch64 RoPE kernel rotating two (even, odd) pairs per loop iteration.
///
/// NOTE(review): despite the `_neon` name, this body contains no NEON
/// intrinsics — it is a 2x-unrolled scalar loop (the compiler may still
/// auto-vectorize it).
///
/// # Safety
/// Caller must guarantee `x.len() >= positions.len() * stride`,
/// `inv_freq.len() == half_dim`, and `2 * half_dim <= stride`, so every
/// `x_ptr.add(..)` / `inv_freq_ptr.add(..)` stays in bounds.
#[cfg(target_arch = "aarch64")]
#[inline(always)]
unsafe fn apply_rope_neon_impl(
    x: &mut [f32],
    positions: &[usize],
    inv_freq: &[f32],
    half_dim: usize,
    stride: usize,
) {
    let x_ptr = x.as_mut_ptr();
    let inv_freq_ptr = inv_freq.as_ptr();

    for (tok_idx, &pos) in positions.iter().enumerate() {
        let tok_offset = tok_idx * stride;

        // Two rotation pairs (4 consecutive floats) per unrolled iteration.
        let chunks = half_dim / (NEON_LANE_WIDTH / 2);

        let mut freq_idx = 0usize;
        for _ in 0..chunks {
            let freq0 = *inv_freq_ptr.add(freq_idx);
            let freq1 = *inv_freq_ptr.add(freq_idx + 1);

            let theta0 = pos as f32 * freq0;
            let theta1 = pos as f32 * freq1;

            let cos0 = theta0.cos();
            let sin0 = theta0.sin();
            let cos1 = theta1.cos();
            let sin1 = theta1.sin();

            let x_offset = tok_offset + freq_idx * 2;
            let x0 = *x_ptr.add(x_offset);
            let x1 = *x_ptr.add(x_offset + 1);
            let x2 = *x_ptr.add(x_offset + 2);
            let x3 = *x_ptr.add(x_offset + 3);

            // 2D rotation of each (even, odd) pair by its angle.
            *x_ptr.add(x_offset) = x0 * cos0 - x1 * sin0;
            *x_ptr.add(x_offset + 1) = x1 * cos0 + x0 * sin0;
            *x_ptr.add(x_offset + 2) = x2 * cos1 - x3 * sin1;
            *x_ptr.add(x_offset + 3) = x3 * cos1 + x2 * sin1;

            freq_idx += 2;
        }

        // Scalar tail when half_dim is odd.
        while freq_idx < half_dim {
            let freq = *inv_freq_ptr.add(freq_idx);
            let theta = pos as f32 * freq;
            let cos_val = theta.cos();
            let sin_val = theta.sin();

            let x_offset = tok_offset + freq_idx * 2;
            let x0 = *x_ptr.add(x_offset);
            let x1 = *x_ptr.add(x_offset + 1);

            *x_ptr.add(x_offset) = x0 * cos_val - x1 * sin_val;
            *x_ptr.add(x_offset + 1) = x1 * cos_val + x0 * sin_val;

            freq_idx += 1;
        }
    }
}
|
||||
|
||||
/// Portable RoPE fallback: rotates each interleaved (even, odd) pair of a
/// token's vector by `pos * inv_freq[i]` radians, in place.
///
/// Note: iterates over `inv_freq` itself (like the SIMD path), so `half_dim`
/// is carried only to mirror the shared signature.
#[allow(dead_code)]
fn apply_rope_scalar(
    x: &mut [f32],
    positions: &[usize],
    inv_freq: &[f32],
    half_dim: usize,
    stride: usize,
) {
    for (tok_idx, &pos) in positions.iter().enumerate() {
        let tok_base = tok_idx * stride;

        for (i, &freq) in inv_freq.iter().enumerate() {
            // sin_cos computes both trig values of the rotation angle at once.
            let (sin_t, cos_t) = (pos as f32 * freq).sin_cos();

            let lo = tok_base + 2 * i;
            let (a, b) = (x[lo], x[lo + 1]);

            x[lo] = a * cos_t - b * sin_t;
            x[lo + 1] = b * cos_t + a * sin_t;
        }
    }
}
|
||||
|
||||
/// Apply interleaved RoPE in place using precomputed cos/sin tables,
/// avoiding the per-call trig of `apply_rope_neon`.
#[inline(always)]
fn apply_rope_with_tables(x: &mut [f32], positions: &[usize], tables: &RopeTables) {
    let half_dim = tables.half_dim;
    let num_tokens = positions.len();
    let head_dim = half_dim * 2;

    debug_assert_eq!(x.len(), num_tokens * head_dim);

    #[cfg(target_arch = "aarch64")]
    // SAFETY: cfg gate guarantees aarch64; x accesses are bounded by the
    // debug_assert above, and the impl debug-asserts each position is within
    // the table range.
    unsafe {
        apply_rope_tables_neon_impl(x, positions, tables, half_dim);
    }

    #[cfg(not(target_arch = "aarch64"))]
    {
        apply_rope_tables_scalar(x, positions, tables, half_dim);
    }
}
|
||||
|
||||
/// NEON RoPE kernel: rotates four (even, odd) pairs per iteration using
/// precomputed tables.
///
/// Layout trick: `vuzp1q/vuzp2q` deinterleave 8 consecutive floats into the
/// even-index and odd-index elements of the 4 pairs, the rotation is applied
/// with fused multiply-add/subtract, and `vzip1q/vzip2q` re-interleave the
/// rotated halves before storing.
///
/// # Safety
/// Caller must guarantee `x.len() >= positions.len() * 2 * half_dim`, that
/// every `pos < tables.max_seq_len` (debug-asserted below), and that each
/// table row holds `half_dim` entries, so all pointer reads/writes are
/// in bounds.
#[cfg(target_arch = "aarch64")]
#[inline(always)]
unsafe fn apply_rope_tables_neon_impl(
    x: &mut [f32],
    positions: &[usize],
    tables: &RopeTables,
    half_dim: usize,
) {
    use std::arch::aarch64::*;

    let x_ptr = x.as_mut_ptr();
    let head_dim = half_dim * 2;

    for (tok_idx, &pos) in positions.iter().enumerate() {
        debug_assert!(pos < tables.max_seq_len);

        let tok_offset = tok_idx * head_dim;
        let table_offset = pos * half_dim;

        let cos_ptr = tables.cos.as_ptr().add(table_offset);
        let sin_ptr = tables.sin.as_ptr().add(table_offset);

        let chunks = half_dim / UNROLL_FACTOR;

        let mut freq_idx = 0usize;
        for _ in 0..chunks {
            let cos_vec = vld1q_f32(cos_ptr.add(freq_idx));
            let sin_vec = vld1q_f32(sin_ptr.add(freq_idx));

            let x_offset = tok_offset + freq_idx * 2;

            // Load 8 interleaved floats = 4 (even, odd) pairs.
            let x_01 = vld1q_f32(x_ptr.add(x_offset));
            let x_23 = vld1q_f32(x_ptr.add(x_offset + 4));

            // Deinterleave into even-index and odd-index lanes.
            let x_even = vuzp1q_f32(x_01, x_23);
            let x_odd = vuzp2q_f32(x_01, x_23);

            // even' = even*cos - odd*sin; odd' = odd*cos + even*sin.
            let x_new_even = vfmsq_f32(vmulq_f32(x_even, cos_vec), x_odd, sin_vec);
            let x_new_odd = vfmaq_f32(vmulq_f32(x_odd, cos_vec), x_even, sin_vec);

            // Re-interleave rotated halves back into pair order.
            let out_01 = vzip1q_f32(x_new_even, x_new_odd);
            let out_23 = vzip2q_f32(x_new_even, x_new_odd);

            vst1q_f32(x_ptr.add(x_offset), out_01);
            vst1q_f32(x_ptr.add(x_offset + 4), out_23);

            freq_idx += 4;
        }

        // Scalar tail when half_dim is not a multiple of 4.
        while freq_idx < half_dim {
            let cos_val = *cos_ptr.add(freq_idx);
            let sin_val = *sin_ptr.add(freq_idx);

            let x_offset = tok_offset + freq_idx * 2;
            let x0 = *x_ptr.add(x_offset);
            let x1 = *x_ptr.add(x_offset + 1);

            *x_ptr.add(x_offset) = x0 * cos_val - x1 * sin_val;
            *x_ptr.add(x_offset + 1) = x1 * cos_val + x0 * sin_val;

            freq_idx += 1;
        }
    }
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
fn apply_rope_tables_scalar(
|
||||
x: &mut [f32],
|
||||
positions: &[usize],
|
||||
tables: &RopeTables,
|
||||
half_dim: usize,
|
||||
) {
|
||||
let head_dim = half_dim * 2;
|
||||
|
||||
for (tok_idx, &pos) in positions.iter().enumerate() {
|
||||
let tok_offset = tok_idx * head_dim;
|
||||
let (cos_slice, sin_slice) = tables.get(pos);
|
||||
|
||||
for i in 0..half_dim {
|
||||
let cos_val = cos_slice[i];
|
||||
let sin_val = sin_slice[i];
|
||||
|
||||
let x_offset = tok_offset + i * 2;
|
||||
let x0 = x[x_offset];
|
||||
let x1 = x[x_offset + 1];
|
||||
|
||||
x[x_offset] = x0 * cos_val - x1 * sin_val;
|
||||
x[x_offset + 1] = x1 * cos_val + x0 * sin_val;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Undo a forward `apply_rope_neon` call in place.
///
/// Implemented by negating every inverse frequency: rotating by `-theta`
/// inverts the forward rotation because cos(-t) = cos(t) and sin(-t) = -sin(t).
fn apply_inverse_rope_neon(x: &mut [f32], positions: &[usize], head_dim: usize, base: f32) {
    let half_dim = head_dim / 2;
    let stride = head_dim;

    // Same frequency spectrum as the forward pass, sign-flipped.
    let inv_freq: Vec<f32> = (0..half_dim)
        .map(|i| -1.0 / base.powf((2 * i) as f32 / head_dim as f32))
        .collect();

    #[cfg(target_arch = "aarch64")]
    // SAFETY: cfg gate guarantees aarch64. Assumes x.len() ==
    // positions.len() * head_dim like the forward path (no debug_assert
    // here — TODO confirm at call sites).
    unsafe {
        apply_rope_neon_impl(x, positions, &inv_freq, half_dim, stride);
    }

    #[cfg(not(target_arch = "aarch64"))]
    {
        apply_rope_scalar(x, positions, &inv_freq, half_dim, stride);
    }
}
|
||||
|
||||
// Helper function to generate random tensor data
|
||||
fn random_tensor(size: usize) -> Vec<f32> {
|
||||
let mut rng = rand::thread_rng();
|
||||
(0..size).map(|_| rng.gen_range(-1.0..1.0)).collect()
|
||||
}
|
||||
|
||||
// === Benchmark Functions ===
|
||||
|
||||
fn bench_apply_rope(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("rope_apply");
|
||||
group.sample_size(100);
|
||||
|
||||
for head_dim in [64, 128] {
|
||||
for num_tokens in [1, 8, 32, 128] {
|
||||
let mut x = random_tensor(num_tokens * head_dim);
|
||||
let positions: Vec<usize> = (0..num_tokens).collect();
|
||||
let base = 10000.0;
|
||||
|
||||
let id = BenchmarkId::new(
|
||||
format!("dim_{}_tokens_{}", head_dim, num_tokens),
|
||||
num_tokens,
|
||||
);
|
||||
|
||||
group.throughput(Throughput::Elements((num_tokens * head_dim) as u64));
|
||||
group.bench_function(id, |b| {
|
||||
b.iter(|| {
|
||||
let mut x_copy = x.clone();
|
||||
apply_rope_neon(
|
||||
black_box(&mut x_copy),
|
||||
black_box(&positions),
|
||||
head_dim,
|
||||
base,
|
||||
);
|
||||
x_copy
|
||||
})
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
fn bench_apply_rope_with_tables(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("rope_apply_tables");
|
||||
group.sample_size(100);
|
||||
|
||||
for head_dim in [64, 128] {
|
||||
let config = RopeConfig {
|
||||
head_dim,
|
||||
max_seq_len: 4096,
|
||||
base: 10000.0,
|
||||
..Default::default()
|
||||
};
|
||||
let tables = precompute_rope_tables_with_config(&config);
|
||||
|
||||
for num_tokens in [1, 8, 32, 128] {
|
||||
let x = random_tensor(num_tokens * head_dim);
|
||||
let positions: Vec<usize> = (0..num_tokens).collect();
|
||||
|
||||
let id = BenchmarkId::new(
|
||||
format!("dim_{}_tokens_{}", head_dim, num_tokens),
|
||||
num_tokens,
|
||||
);
|
||||
|
||||
group.throughput(Throughput::Elements((num_tokens * head_dim) as u64));
|
||||
group.bench_with_input(id, &(x.clone(), tables.clone()), |b, (x, tables)| {
|
||||
b.iter(|| {
|
||||
let mut x_copy = x.clone();
|
||||
apply_rope_with_tables(black_box(&mut x_copy), black_box(&positions), tables);
|
||||
x_copy
|
||||
})
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
fn bench_precompute_tables(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("rope_precompute");
|
||||
group.sample_size(50);
|
||||
|
||||
for max_seq_len in [512, 1024, 2048, 4096, 8192] {
|
||||
for head_dim in [64, 128] {
|
||||
let id = BenchmarkId::new(format!("seq_{}_dim_{}", max_seq_len, head_dim), max_seq_len);
|
||||
|
||||
group.throughput(Throughput::Elements((max_seq_len * head_dim) as u64));
|
||||
group.bench_function(id, |b| {
|
||||
b.iter(|| {
|
||||
precompute_rope_tables(black_box(max_seq_len), black_box(head_dim), 10000.0)
|
||||
})
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
fn bench_precompute_with_config(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("rope_precompute_config");
|
||||
group.sample_size(50);
|
||||
|
||||
// Test different model configurations
|
||||
let configs = [
|
||||
("llama2_4k", RopeConfig::llama2(128, 4096)),
|
||||
("llama3_4k", RopeConfig::llama3(128, 4096)),
|
||||
(
|
||||
"llama2_8k_ntk",
|
||||
RopeConfig::llama2(128, 8192).with_ntk(4096),
|
||||
),
|
||||
(
|
||||
"llama2_8k_scaled",
|
||||
RopeConfig::llama2(128, 8192).with_scaling(2.0),
|
||||
),
|
||||
];
|
||||
|
||||
for (name, config) in configs {
|
||||
let id = BenchmarkId::new(name, config.max_seq_len);
|
||||
|
||||
group.throughput(Throughput::Elements(
|
||||
(config.max_seq_len * config.head_dim) as u64,
|
||||
));
|
||||
group.bench_with_input(id, &config, |b, cfg| {
|
||||
b.iter(|| precompute_rope_tables_with_config(black_box(cfg)))
|
||||
});
|
||||
}
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
fn bench_rope_vs_tables(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("rope_comparison");
|
||||
group.sample_size(100);
|
||||
|
||||
let head_dim = 128;
|
||||
let max_seq_len = 4096;
|
||||
let num_tokens = 32;
|
||||
let base = 10000.0;
|
||||
|
||||
let config = RopeConfig {
|
||||
head_dim,
|
||||
max_seq_len,
|
||||
base,
|
||||
..Default::default()
|
||||
};
|
||||
let tables = precompute_rope_tables_with_config(&config);
|
||||
|
||||
let x = random_tensor(num_tokens * head_dim);
|
||||
let positions: Vec<usize> = (0..num_tokens).collect();
|
||||
|
||||
// Benchmark without tables
|
||||
group.bench_function("without_tables", |b| {
|
||||
b.iter(|| {
|
||||
let mut x_copy = x.clone();
|
||||
apply_rope_neon(
|
||||
black_box(&mut x_copy),
|
||||
black_box(&positions),
|
||||
head_dim,
|
||||
base,
|
||||
);
|
||||
x_copy
|
||||
})
|
||||
});
|
||||
|
||||
// Benchmark with tables
|
||||
group.bench_with_input("with_tables", &tables, |b, tables| {
|
||||
b.iter(|| {
|
||||
let mut x_copy = x.clone();
|
||||
apply_rope_with_tables(black_box(&mut x_copy), black_box(&positions), tables);
|
||||
x_copy
|
||||
})
|
||||
});
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
fn bench_inverse_rope(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("rope_inverse");
|
||||
group.sample_size(100);
|
||||
|
||||
for head_dim in [64, 128] {
|
||||
for num_tokens in [1, 8, 32] {
|
||||
let mut x = random_tensor(num_tokens * head_dim);
|
||||
let positions: Vec<usize> = (0..num_tokens).collect();
|
||||
let base = 10000.0;
|
||||
|
||||
let id = BenchmarkId::new(
|
||||
format!("dim_{}_tokens_{}", head_dim, num_tokens),
|
||||
num_tokens,
|
||||
);
|
||||
|
||||
group.throughput(Throughput::Elements((num_tokens * head_dim) as u64));
|
||||
group.bench_function(id, |b| {
|
||||
b.iter(|| {
|
||||
let mut x_copy = x.clone();
|
||||
apply_inverse_rope_neon(
|
||||
black_box(&mut x_copy),
|
||||
black_box(&positions),
|
||||
head_dim,
|
||||
base,
|
||||
);
|
||||
x_copy
|
||||
})
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
fn bench_rope_roundtrip(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("rope_roundtrip");
|
||||
group.sample_size(50);
|
||||
|
||||
let head_dim = 128;
|
||||
let base = 10000.0;
|
||||
|
||||
for num_tokens in [1, 8, 32] {
|
||||
let x = random_tensor(num_tokens * head_dim);
|
||||
let positions: Vec<usize> = (0..num_tokens).collect();
|
||||
|
||||
let id = BenchmarkId::new(format!("tokens_{}", num_tokens), num_tokens);
|
||||
|
||||
group.throughput(Throughput::Elements((num_tokens * head_dim * 2) as u64));
|
||||
group.bench_function(id, |b| {
|
||||
b.iter(|| {
|
||||
let mut x_copy = x.clone();
|
||||
apply_rope_neon(
|
||||
black_box(&mut x_copy),
|
||||
black_box(&positions),
|
||||
head_dim,
|
||||
base,
|
||||
);
|
||||
apply_inverse_rope_neon(
|
||||
black_box(&mut x_copy),
|
||||
black_box(&positions),
|
||||
head_dim,
|
||||
base,
|
||||
);
|
||||
x_copy
|
||||
})
|
||||
});
|
||||
}
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
fn bench_rope_scaling_variants(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("rope_scaling");
|
||||
group.sample_size(50);
|
||||
|
||||
let head_dim = 128;
|
||||
let num_tokens = 32;
|
||||
let x = random_tensor(num_tokens * head_dim);
|
||||
let positions: Vec<usize> = (0..num_tokens).collect();
|
||||
|
||||
// Different scaling configurations
|
||||
let configs = [
|
||||
("standard", RopeConfig::llama2(head_dim, 4096)),
|
||||
("ntk_2x", RopeConfig::llama2(head_dim, 8192).with_ntk(4096)),
|
||||
("ntk_4x", RopeConfig::llama2(head_dim, 16384).with_ntk(4096)),
|
||||
(
|
||||
"linear_2x",
|
||||
RopeConfig::llama2(head_dim, 8192).with_scaling(2.0),
|
||||
),
|
||||
(
|
||||
"linear_4x",
|
||||
RopeConfig::llama2(head_dim, 16384).with_scaling(4.0),
|
||||
),
|
||||
];
|
||||
|
||||
for (name, config) in configs {
|
||||
let tables = precompute_rope_tables_with_config(&config);
|
||||
|
||||
let id = BenchmarkId::new(name, config.max_seq_len);
|
||||
|
||||
group.bench_with_input(id, &tables, |b, tables| {
|
||||
b.iter(|| {
|
||||
let mut x_copy = x.clone();
|
||||
apply_rope_with_tables(black_box(&mut x_copy), black_box(&positions), tables);
|
||||
x_copy
|
||||
})
|
||||
});
|
||||
}
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
// Register every RoPE benchmark with Criterion.
criterion_group!(
    benches,
    bench_apply_rope,
    bench_apply_rope_with_tables,
    bench_precompute_tables,
    bench_precompute_with_config,
    bench_rope_vs_tables,
    bench_inverse_rope,
    bench_rope_roundtrip,
    bench_rope_scaling_variants,
);

// Expands into the `fn main` that runs the group above.
criterion_main!(benches);
|
||||
1263
vendor/ruvector/crates/ruvllm/benches/ruvltra_benchmark.rs
vendored
Normal file
1263
vendor/ruvector/crates/ruvllm/benches/ruvltra_benchmark.rs
vendored
Normal file
File diff suppressed because it is too large
Load Diff
378
vendor/ruvector/crates/ruvllm/benches/serving_bench.rs
vendored
Normal file
378
vendor/ruvector/crates/ruvllm/benches/serving_bench.rs
vendored
Normal file
@@ -0,0 +1,378 @@
|
||||
#![allow(
|
||||
clippy::all,
|
||||
unused_imports,
|
||||
unused_variables,
|
||||
dead_code,
|
||||
unused_mut,
|
||||
unused_assignments,
|
||||
non_camel_case_types,
|
||||
clippy::approx_constant,
|
||||
unexpected_cfgs,
|
||||
unused_must_use,
|
||||
unused_parens
|
||||
)]
|
||||
//! Benchmarks comparing continuous batching to sequential serving
|
||||
//!
|
||||
//! Run with: cargo bench --bench serving_bench
|
||||
|
||||
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
|
||||
use ruvllm::backends::{GenerateParams, NoopBackend};
|
||||
use ruvllm::serving::{
|
||||
ContinuousBatchScheduler, InferenceRequest, KvCachePoolConfig, RequestQueue, SchedulerConfig,
|
||||
ServingEngine, ServingEngineConfig,
|
||||
};
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
/// Simulates sequential request processing (no batching)
|
||||
fn sequential_process(requests: &[InferenceRequest]) -> Vec<Duration> {
|
||||
let mut latencies = Vec::with_capacity(requests.len());
|
||||
|
||||
for request in requests {
|
||||
let start = Instant::now();
|
||||
|
||||
// Simulate prefill
|
||||
let prefill_time = Duration::from_micros((request.prompt_len() * 100) as u64);
|
||||
std::thread::sleep(prefill_time);
|
||||
|
||||
// Simulate decode (one token at a time)
|
||||
let decode_time = Duration::from_micros((request.params.max_tokens * 50) as u64);
|
||||
std::thread::sleep(decode_time);
|
||||
|
||||
latencies.push(start.elapsed());
|
||||
}
|
||||
|
||||
latencies
|
||||
}
|
||||
|
||||
/// Simulates continuous batching with scheduler
///
/// Drives a `ContinuousBatchScheduler` over an in-memory `RequestQueue`,
/// sleeping to model batched prefill/decode cost, and returns the end-to-end
/// latency of every request that completed within the iteration budget.
///
/// NOTE(review): requests still running after `max_iterations` contribute no
/// latency entry — the returned Vec may be shorter than `requests`.
fn continuous_batching_process(requests: Vec<InferenceRequest>) -> Vec<Duration> {
    let config = SchedulerConfig::default();
    // Fixed KV-cache pool sized generously for the simulated workload.
    let kv_config = KvCachePoolConfig {
        num_slots: 64,
        max_seq_len: 512,
        block_size: 16,
        total_blocks: 1024,
        num_kv_heads: 8,
        head_dim: 128,
        num_layers: 32,
    };

    let mut scheduler = ContinuousBatchScheduler::new(config, kv_config);
    let mut queue = RequestQueue::new();
    let mut latencies = Vec::new();
    // Arrival timestamp per request id, used to compute end-to-end latency.
    let request_times: std::collections::HashMap<_, _> =
        requests.iter().map(|r| (r.id, Instant::now())).collect();

    // Add all requests to queue
    for request in requests {
        queue.add(request);
    }

    // Process iterations until all complete
    let mut iteration = 0;
    let max_iterations = 1000;

    while !queue.is_empty() && iteration < max_iterations {
        let batch = scheduler.schedule(&mut queue);

        if batch.is_empty() {
            break;
        }

        // Simulate batch processing
        // Prefill tokens can be processed in parallel
        let prefill_tokens: usize = batch
            .requests
            .iter()
            .filter(|r| r.is_prefill)
            .map(|r| r.num_tokens())
            .sum();

        // Decode tokens are processed together
        let decode_count = batch.requests.iter().filter(|r| !r.is_prefill).count();

        // Batched prefill is much faster per token
        if prefill_tokens > 0 {
            let batch_prefill_time = Duration::from_micros((prefill_tokens * 20) as u64); // 5x faster
            std::thread::sleep(batch_prefill_time);
        }

        // Batched decode is faster per request
        if decode_count > 0 {
            let batch_decode_time = Duration::from_micros((decode_count * 30) as u64); // ~1.7x faster
            std::thread::sleep(batch_decode_time);

            // Mark completion for decode requests that finished
            for req in &batch.requests {
                if !req.is_prefill {
                    if let Some(running) = queue.running.get_mut(&req.request_id) {
                        running.add_token(0); // Simulate token generation

                        // Record latency the moment the request completes.
                        if running.is_complete() {
                            if let Some(start) = request_times.get(&req.request_id) {
                                latencies.push(start.elapsed());
                            }
                        }
                    }
                }
            }
        }

        iteration += 1;
    }

    latencies
}
|
||||
|
||||
fn create_test_requests(
|
||||
count: usize,
|
||||
prompt_len: usize,
|
||||
max_tokens: usize,
|
||||
) -> Vec<InferenceRequest> {
|
||||
(0..count)
|
||||
.map(|_| {
|
||||
let prompt_tokens: Vec<u32> = (0..prompt_len as u32).collect();
|
||||
let params = GenerateParams::default().with_max_tokens(max_tokens);
|
||||
InferenceRequest::new(prompt_tokens, params)
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn bench_scheduler_overhead(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("scheduler_overhead");
|
||||
|
||||
for batch_size in [1, 4, 16, 64, 128] {
|
||||
group.throughput(Throughput::Elements(batch_size as u64));
|
||||
|
||||
group.bench_with_input(
|
||||
BenchmarkId::new("schedule", batch_size),
|
||||
&batch_size,
|
||||
|b, &size| {
|
||||
let config = SchedulerConfig::default();
|
||||
let kv_config = KvCachePoolConfig::default();
|
||||
let mut scheduler = ContinuousBatchScheduler::new(config, kv_config);
|
||||
|
||||
b.iter(|| {
|
||||
let mut queue = RequestQueue::new();
|
||||
let requests = create_test_requests(size, 100, 50);
|
||||
for request in requests {
|
||||
queue.add(request);
|
||||
}
|
||||
let batch = scheduler.schedule(&mut queue);
|
||||
black_box(batch)
|
||||
});
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
fn bench_batch_creation(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("batch_creation");
|
||||
|
||||
for num_requests in [1, 8, 32, 128] {
|
||||
group.bench_with_input(
|
||||
BenchmarkId::new("create_batch", num_requests),
|
||||
&num_requests,
|
||||
|b, &count| {
|
||||
let config = SchedulerConfig::default();
|
||||
let kv_config = KvCachePoolConfig {
|
||||
num_slots: 256,
|
||||
max_seq_len: 512,
|
||||
block_size: 16,
|
||||
total_blocks: 4096,
|
||||
..Default::default()
|
||||
};
|
||||
let mut scheduler = ContinuousBatchScheduler::new(config, kv_config);
|
||||
|
||||
b.iter(|| {
|
||||
let mut queue = RequestQueue::new();
|
||||
let requests = create_test_requests(count, 64, 32);
|
||||
for request in requests {
|
||||
queue.add(request);
|
||||
}
|
||||
scheduler.schedule(&mut queue)
|
||||
});
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
fn bench_kv_cache_allocation(c: &mut Criterion) {
|
||||
use ruvllm::serving::{KvCacheManager, RequestId};
|
||||
|
||||
let mut group = c.benchmark_group("kv_cache_allocation");
|
||||
|
||||
for max_seq_len in [128, 512, 2048, 4096] {
|
||||
group.bench_with_input(
|
||||
BenchmarkId::new("allocate", max_seq_len),
|
||||
&max_seq_len,
|
||||
|b, &seq_len| {
|
||||
let config = KvCachePoolConfig {
|
||||
num_slots: 128,
|
||||
max_seq_len: seq_len,
|
||||
block_size: 16,
|
||||
total_blocks: 8192,
|
||||
..Default::default()
|
||||
};
|
||||
let mut manager = KvCacheManager::new(config);
|
||||
|
||||
b.iter(|| {
|
||||
let request_id = RequestId::new();
|
||||
let slot = manager.allocate(request_id, seq_len);
|
||||
if let Ok(_) = slot {
|
||||
manager.free(request_id);
|
||||
}
|
||||
black_box(slot)
|
||||
});
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
fn bench_request_throughput(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("request_throughput");
|
||||
group.measurement_time(Duration::from_secs(5));
|
||||
|
||||
for num_requests in [10, 50, 100] {
|
||||
group.throughput(Throughput::Elements(num_requests as u64));
|
||||
|
||||
group.bench_with_input(
|
||||
BenchmarkId::new("continuous_batching", num_requests),
|
||||
&num_requests,
|
||||
|b, &count| {
|
||||
b.iter(|| {
|
||||
let requests = create_test_requests(count, 32, 16);
|
||||
continuous_batching_process(requests)
|
||||
});
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
fn bench_serving_engine(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("serving_engine");
|
||||
|
||||
group.bench_function("submit_request", |b| {
|
||||
let backend = Arc::new(NoopBackend);
|
||||
let config = ServingEngineConfig {
|
||||
kv_cache: KvCachePoolConfig {
|
||||
num_slots: 64,
|
||||
max_seq_len: 256,
|
||||
..Default::default()
|
||||
},
|
||||
..Default::default()
|
||||
};
|
||||
let engine = ServingEngine::new(backend, config);
|
||||
|
||||
b.iter(|| {
|
||||
let params = GenerateParams::default().with_max_tokens(10);
|
||||
let request = InferenceRequest::new(vec![1, 2, 3, 4, 5], params);
|
||||
engine.submit(request)
|
||||
});
|
||||
});
|
||||
|
||||
group.bench_function("run_iteration", |b| {
|
||||
let backend = Arc::new(NoopBackend);
|
||||
let config = ServingEngineConfig {
|
||||
kv_cache: KvCachePoolConfig {
|
||||
num_slots: 64,
|
||||
max_seq_len: 256,
|
||||
..Default::default()
|
||||
},
|
||||
..Default::default()
|
||||
};
|
||||
let engine = ServingEngine::new(backend, config);
|
||||
|
||||
// Pre-populate with some requests
|
||||
for _ in 0..10 {
|
||||
let params = GenerateParams::default().with_max_tokens(5);
|
||||
let request = InferenceRequest::new(vec![1, 2, 3], params);
|
||||
let _ = engine.submit(request);
|
||||
}
|
||||
|
||||
b.iter(|| engine.run_iteration());
|
||||
});
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
fn bench_mixed_workload(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("mixed_workload");
|
||||
group.measurement_time(Duration::from_secs(3));
|
||||
|
||||
// Simulate realistic mixed workload
|
||||
group.bench_function("short_prompts_long_gen", |b| {
|
||||
b.iter(|| {
|
||||
let requests: Vec<_> = (0..20)
|
||||
.map(|_| {
|
||||
let prompt_tokens: Vec<u32> = (0..16).collect();
|
||||
let params = GenerateParams::default().with_max_tokens(128);
|
||||
InferenceRequest::new(prompt_tokens, params)
|
||||
})
|
||||
.collect();
|
||||
continuous_batching_process(requests)
|
||||
});
|
||||
});
|
||||
|
||||
group.bench_function("long_prompts_short_gen", |b| {
|
||||
b.iter(|| {
|
||||
let requests: Vec<_> = (0..20)
|
||||
.map(|_| {
|
||||
let prompt_tokens: Vec<u32> = (0..256).collect();
|
||||
let params = GenerateParams::default().with_max_tokens(16);
|
||||
InferenceRequest::new(prompt_tokens, params)
|
||||
})
|
||||
.collect();
|
||||
continuous_batching_process(requests)
|
||||
});
|
||||
});
|
||||
|
||||
group.bench_function("mixed_lengths", |b| {
|
||||
b.iter(|| {
|
||||
let mut requests = Vec::new();
|
||||
|
||||
// Mix of short, medium, and long prompts
|
||||
for i in 0..30 {
|
||||
let prompt_len = match i % 3 {
|
||||
0 => 16,
|
||||
1 => 64,
|
||||
_ => 256,
|
||||
};
|
||||
let max_tokens = match i % 3 {
|
||||
0 => 100,
|
||||
1 => 50,
|
||||
_ => 20,
|
||||
};
|
||||
|
||||
let prompt_tokens: Vec<u32> = (0..prompt_len).collect();
|
||||
let params = GenerateParams::default().with_max_tokens(max_tokens);
|
||||
requests.push(InferenceRequest::new(prompt_tokens, params));
|
||||
}
|
||||
|
||||
continuous_batching_process(requests)
|
||||
});
|
||||
});
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
// Criterion harness wiring: register every serving benchmark group and emit
// the generated `main` entry point.
criterion_group!(
    benches,
    bench_scheduler_overhead,
    bench_batch_creation,
    bench_kv_cache_allocation,
    bench_request_throughput,
    bench_serving_engine,
    bench_mixed_workload,
);

criterion_main!(benches);
|
||||
Reference in New Issue
Block a user