1264 lines
38 KiB
Rust
1264 lines
38 KiB
Rust
#![allow(
|
|
clippy::all,
|
|
unused_imports,
|
|
unused_variables,
|
|
dead_code,
|
|
unused_mut,
|
|
unused_assignments,
|
|
non_camel_case_types,
|
|
clippy::approx_constant,
|
|
unexpected_cfgs,
|
|
unused_must_use,
|
|
unused_parens
|
|
)]
|
|
//! RuvLTRA-Small Model Benchmark Suite
|
|
//!
|
|
//! Comprehensive benchmarks for the RuvLTRA-Small (0.5B parameter) model
|
|
//! optimized for Apple Silicon M4 Pro.
|
|
//!
|
|
//! ## Performance Targets (M4 Pro)
|
|
//!
|
|
//! | Metric | Target | Notes |
|
|
//! |--------|--------|-------|
|
|
//! | Decode throughput (Q4) | 80+ tok/s | Single stream |
|
|
//! | First token latency | <50ms | Cold start |
|
|
//! | Memory usage (Q4) | <500MB | Model + KV cache |
|
|
//! | Prefill throughput | 2000+ tok/s | Batch=1 |
|
|
//!
|
|
//! ## Benchmark Scenarios
|
|
//!
|
|
//! 1. **Short prompt (32 tokens) -> 128 token output**
|
|
//! - Prefill latency, decode throughput, E2E latency
|
|
//!
|
|
//! 2. **Medium prompt (256 tokens) -> 256 token output**
|
|
//! - Sustained throughput, memory pressure
|
|
//!
|
|
//! 3. **Long prompt (1024 tokens) -> 512 token output**
|
|
//! - KV cache scaling, attention efficiency
|
|
//!
|
|
//! ## Backend Comparison
|
|
//!
|
|
//! - Pure NEON (CPU SIMD baseline)
|
|
//! - Pure ANE (Apple Neural Engine via CoreML)
|
|
//! - Hybrid (ANE matmul + NEON activations)
|
|
//! - Metal GPU
|
|
//!
|
|
//! ## Quantization Comparison
|
|
//!
|
|
//! - Q4_K_M: 4-bit quantization, medium quality
|
|
//! - Q5_K_M: 5-bit quantization, high quality
|
|
//! - Q8_0: 8-bit quantization, highest quality
|
|
//!
|
|
//! ## Running Benchmarks
|
|
//!
|
|
//! ```bash
|
|
//! # Full benchmark suite
|
|
//! cargo bench -p ruvllm --bench ruvltra_benchmark
|
|
//!
|
|
//! # Specific scenario
|
|
//! cargo bench -p ruvllm --bench ruvltra_benchmark -- short_prompt
|
|
//!
|
|
//! # With Metal GPU
|
|
//! cargo bench -p ruvllm --features metal-compute --bench ruvltra_benchmark
|
|
//!
|
|
//! # With ANE
|
|
//! cargo bench -p ruvllm --features coreml --bench ruvltra_benchmark
|
|
//!
|
|
//! # With parallel execution
|
|
//! cargo bench -p ruvllm --features parallel --bench ruvltra_benchmark
|
|
//! ```
|
|
|
|
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
|
|
use rand::Rng;
|
|
use std::alloc::{alloc, dealloc, Layout};
|
|
use std::sync::atomic::{AtomicU64, Ordering};
|
|
use std::time::{Duration, Instant};
|
|
|
|
// ============================================================================
|
|
// RuvLTRA-Small Model Configuration
|
|
// ============================================================================
|
|
|
|
/// RuvLTRA-Small model configuration (0.5B parameters)
|
|
///
|
|
/// Architecture: LLaMA-style with optimizations for edge deployment
|
|
/// - 24 layers (reduced from 32 for 7B)
|
|
/// - 2048 hidden dimension
|
|
/// - 5632 intermediate dimension (2.75x hidden)
|
|
/// - 16 attention heads
|
|
/// - 4 KV heads (GQA 4:1)
|
|
/// - 128 head dimension
|
|
/// - 32000 vocab size
|
|
/// - 4096 max context
|
|
#[derive(Debug, Clone, Copy)]
|
|
pub struct RuvLtraSmallConfig {
|
|
pub hidden_size: usize,
|
|
pub intermediate_size: usize,
|
|
pub num_attention_heads: usize,
|
|
pub num_kv_heads: usize,
|
|
pub head_dim: usize,
|
|
pub num_layers: usize,
|
|
pub vocab_size: usize,
|
|
pub max_seq_len: usize,
|
|
pub rope_theta: f32,
|
|
}
|
|
|
|
impl Default for RuvLtraSmallConfig {
|
|
fn default() -> Self {
|
|
Self {
|
|
hidden_size: 2048,
|
|
intermediate_size: 5632,
|
|
num_attention_heads: 16,
|
|
num_kv_heads: 4, // GQA 4:1
|
|
head_dim: 128,
|
|
num_layers: 24,
|
|
vocab_size: 32000,
|
|
max_seq_len: 4096,
|
|
rope_theta: 10000.0,
|
|
}
|
|
}
|
|
}
|
|
|
|
impl RuvLtraSmallConfig {
|
|
/// Total parameters (approximate)
|
|
pub fn total_params(&self) -> usize {
|
|
// Embedding: vocab * hidden
|
|
let embed_params = self.vocab_size * self.hidden_size;
|
|
|
|
// Per layer:
|
|
// - QKV projection: hidden * (hidden + 2 * kv_hidden)
|
|
// - O projection: hidden * hidden
|
|
// - MLP: hidden * intermediate * 3
|
|
// - Norms: hidden * 2
|
|
let kv_hidden = self.num_kv_heads * self.head_dim;
|
|
let attn_params = self.hidden_size * self.hidden_size // Q
|
|
+ self.hidden_size * kv_hidden * 2 // K, V
|
|
+ self.hidden_size * self.hidden_size; // O
|
|
let mlp_params = self.hidden_size * self.intermediate_size * 3;
|
|
let norm_params = self.hidden_size * 2;
|
|
let layer_params = attn_params + mlp_params + norm_params;
|
|
|
|
// Final: LM head + norm
|
|
let final_params = self.vocab_size * self.hidden_size + self.hidden_size;
|
|
|
|
embed_params + layer_params * self.num_layers + final_params
|
|
}
|
|
|
|
/// Memory in bytes for different quantization levels
|
|
pub fn memory_bytes(&self, quant: QuantFormat) -> usize {
|
|
let params = self.total_params();
|
|
match quant {
|
|
QuantFormat::F16 => params * 2,
|
|
QuantFormat::Q8_0 => params,
|
|
QuantFormat::Q5_K_M => (params * 5 + 7) / 8 + params / 32 * 2, // 5 bits + scales
|
|
QuantFormat::Q4_K_M => params / 2 + params / 32 * 2, // 4 bits + scales
|
|
}
|
|
}
|
|
|
|
/// KV cache memory for given sequence length
|
|
pub fn kv_cache_bytes(&self, seq_len: usize, quant: QuantFormat) -> usize {
|
|
let kv_elements = seq_len * self.num_kv_heads * self.head_dim * 2 * self.num_layers;
|
|
match quant {
|
|
QuantFormat::F16 => kv_elements * 2,
|
|
QuantFormat::Q8_0 => kv_elements,
|
|
QuantFormat::Q5_K_M => (kv_elements * 5 + 7) / 8,
|
|
QuantFormat::Q4_K_M => kv_elements / 2,
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Weight storage formats compared by the benchmark suite.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum QuantFormat {
    /// 16-bit floating point (unquantized baseline).
    F16,
    /// 8-bit quantization.
    Q8_0,
    /// 5-bit K-quant.
    Q5_K_M,
    /// 4-bit K-quant.
    Q4_K_M,
}

impl QuantFormat {
    /// Label used when printing results for this format.
    pub fn name(&self) -> &'static str {
        match *self {
            Self::F16 => "F16",
            Self::Q8_0 => "Q8_0",
            Self::Q5_K_M => "Q5_K_M",
            Self::Q4_K_M => "Q4_K_M",
        }
    }

    /// Effective bits stored per weight.
    ///
    /// The K-quant figures (5.5 and 4.5) include the per-block scale
    /// overhead on top of the nominal 5 and 4 bits.
    pub fn bits(&self) -> f32 {
        match *self {
            Self::F16 => 16.0,
            Self::Q8_0 => 8.0,
            Self::Q5_K_M => 5.5,
            Self::Q4_K_M => 4.5,
        }
    }
}
|
|
|
|
/// Execution backends compared by the benchmark suite.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Backend {
    /// CPU SIMD only.
    PureNeon,
    /// Apple Neural Engine only.
    PureAne,
    /// ANE for matrix multiplies, NEON for activations.
    Hybrid,
    /// Metal GPU.
    MetalGpu,
}

impl Backend {
    /// Short label used when printing results for this backend.
    pub fn name(&self) -> &'static str {
        match *self {
            Self::PureNeon => "NEON",
            Self::PureAne => "ANE",
            Self::Hybrid => "Hybrid",
            Self::MetalGpu => "Metal",
        }
    }
}
|
|
|
|
// ============================================================================
|
|
// Memory Tracking
|
|
// ============================================================================
|
|
|
|
/// Largest value `CURRENT_MEMORY` has reached since the last reset, in bytes.
static PEAK_MEMORY: AtomicU64 = AtomicU64::new(0);
/// Bytes registered through `track_alloc` and not yet released.
static CURRENT_MEMORY: AtomicU64 = AtomicU64::new(0);

/// Records an allocation of `bytes` and raises the peak if it was exceeded.
fn track_alloc(bytes: usize) {
    let bytes = bytes as u64;
    let previous = CURRENT_MEMORY.fetch_add(bytes, Ordering::SeqCst);
    // `saturating_add` keeps the peak update from panicking on overflow in
    // debug builds if the counter is already near `u64::MAX`.
    PEAK_MEMORY.fetch_max(previous.saturating_add(bytes), Ordering::SeqCst);
}

/// Records that `bytes` were released.
///
/// The counter saturates at zero. A plain `fetch_sub` would wrap to a huge
/// value when a buffer created before `reset_memory_tracking` is dropped
/// after it, and the next `track_alloc` would then record a bogus peak.
fn track_dealloc(bytes: usize) {
    let bytes = bytes as u64;
    // The closure always returns `Some`, so the update cannot fail.
    let _ = CURRENT_MEMORY.fetch_update(Ordering::SeqCst, Ordering::SeqCst, |current| {
        Some(current.saturating_sub(bytes))
    });
}

/// Clears both the current and the peak counters.
fn reset_memory_tracking() {
    PEAK_MEMORY.store(0, Ordering::SeqCst);
    CURRENT_MEMORY.store(0, Ordering::SeqCst);
}

/// Returns the peak number of tracked bytes since the last reset.
fn get_peak_memory() -> u64 {
    PEAK_MEMORY.load(Ordering::SeqCst)
}
|
|
|
|
/// Tracked allocation for memory benchmarking
|
|
pub struct TrackedBuffer {
|
|
ptr: *mut u8,
|
|
layout: Layout,
|
|
}
|
|
|
|
impl TrackedBuffer {
|
|
pub fn new(size: usize) -> Self {
|
|
let layout = Layout::from_size_align(size, 64).unwrap();
|
|
let ptr = unsafe { alloc(layout) };
|
|
track_alloc(size);
|
|
Self { ptr, layout }
|
|
}
|
|
|
|
pub fn as_slice(&self) -> &[u8] {
|
|
unsafe { std::slice::from_raw_parts(self.ptr, self.layout.size()) }
|
|
}
|
|
|
|
pub fn as_mut_slice(&mut self) -> &mut [u8] {
|
|
unsafe { std::slice::from_raw_parts_mut(self.ptr, self.layout.size()) }
|
|
}
|
|
}
|
|
|
|
impl Drop for TrackedBuffer {
|
|
fn drop(&mut self) {
|
|
track_dealloc(self.layout.size());
|
|
unsafe { dealloc(self.ptr, self.layout) }
|
|
}
|
|
}
|
|
|
|
// ============================================================================
|
|
// Simulated Transformer Operations
|
|
// ============================================================================
|
|
|
|
/// Simulated transformer layer for RuvLTRA-Small
|
|
struct RuvLtraLayer {
|
|
config: RuvLtraSmallConfig,
|
|
// Weights (simulated as random data)
|
|
q_proj: Vec<f32>,
|
|
k_proj: Vec<f32>,
|
|
v_proj: Vec<f32>,
|
|
o_proj: Vec<f32>,
|
|
gate_proj: Vec<f32>,
|
|
up_proj: Vec<f32>,
|
|
down_proj: Vec<f32>,
|
|
input_norm: Vec<f32>,
|
|
post_attn_norm: Vec<f32>,
|
|
}
|
|
|
|
impl RuvLtraLayer {
|
|
fn new(config: RuvLtraSmallConfig) -> Self {
|
|
let hidden = config.hidden_size;
|
|
let kv_hidden = config.num_kv_heads * config.head_dim;
|
|
let intermediate = config.intermediate_size;
|
|
|
|
Self {
|
|
config,
|
|
q_proj: random_tensor(hidden * hidden),
|
|
k_proj: random_tensor(hidden * kv_hidden),
|
|
v_proj: random_tensor(hidden * kv_hidden),
|
|
o_proj: random_tensor(hidden * hidden),
|
|
gate_proj: random_tensor(hidden * intermediate),
|
|
up_proj: random_tensor(hidden * intermediate),
|
|
down_proj: random_tensor(intermediate * hidden),
|
|
input_norm: random_tensor(hidden),
|
|
post_attn_norm: random_tensor(hidden),
|
|
}
|
|
}
|
|
|
|
/// Prefill forward pass (batch of tokens)
|
|
fn prefill(&self, hidden_states: &mut [f32], seq_len: usize, _kv_cache: &mut KvCache) {
|
|
let hidden = self.config.hidden_size;
|
|
|
|
for pos in 0..seq_len {
|
|
let offset = pos * hidden;
|
|
let state = &mut hidden_states[offset..offset + hidden];
|
|
|
|
// RMSNorm
|
|
rms_norm_inplace(state, &self.input_norm, 1e-6);
|
|
|
|
// QKV projection (simplified)
|
|
let q = gemv(&self.q_proj, state, hidden, hidden);
|
|
let _k = gemv(
|
|
&self.k_proj,
|
|
state,
|
|
hidden,
|
|
self.config.num_kv_heads * self.config.head_dim,
|
|
);
|
|
let _v = gemv(
|
|
&self.v_proj,
|
|
state,
|
|
hidden,
|
|
self.config.num_kv_heads * self.config.head_dim,
|
|
);
|
|
|
|
// Attention output projection
|
|
let attn_out = gemv(&self.o_proj, &q, hidden, hidden);
|
|
|
|
// Residual
|
|
for i in 0..hidden {
|
|
state[i] += attn_out[i];
|
|
}
|
|
|
|
// Post-attention norm
|
|
rms_norm_inplace(state, &self.post_attn_norm, 1e-6);
|
|
|
|
// MLP
|
|
let gate = gemv(
|
|
&self.gate_proj,
|
|
state,
|
|
hidden,
|
|
self.config.intermediate_size,
|
|
);
|
|
let up = gemv(&self.up_proj, state, hidden, self.config.intermediate_size);
|
|
|
|
// SiLU * up
|
|
let mut mlp_out = Vec::with_capacity(self.config.intermediate_size);
|
|
for i in 0..self.config.intermediate_size {
|
|
let silu = gate[i] / (1.0 + (-gate[i]).exp());
|
|
mlp_out.push(silu * up[i]);
|
|
}
|
|
|
|
// Down projection
|
|
let down = gemv(
|
|
&self.down_proj,
|
|
&mlp_out,
|
|
self.config.intermediate_size,
|
|
hidden,
|
|
);
|
|
|
|
// Residual
|
|
for i in 0..hidden {
|
|
state[i] += down[i];
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Decode forward pass (single token)
|
|
fn decode(&self, hidden_state: &mut [f32], kv_cache_len: usize) {
|
|
let hidden = self.config.hidden_size;
|
|
|
|
// RMSNorm
|
|
rms_norm_inplace(hidden_state, &self.input_norm, 1e-6);
|
|
|
|
// QKV projection
|
|
let mut q = gemv(&self.q_proj, hidden_state, hidden, hidden);
|
|
let _k = gemv(
|
|
&self.k_proj,
|
|
hidden_state,
|
|
hidden,
|
|
self.config.num_kv_heads * self.config.head_dim,
|
|
);
|
|
let _v = gemv(
|
|
&self.v_proj,
|
|
hidden_state,
|
|
hidden,
|
|
self.config.num_kv_heads * self.config.head_dim,
|
|
);
|
|
|
|
// RoPE
|
|
apply_rope(
|
|
&mut q,
|
|
self.config.head_dim,
|
|
kv_cache_len,
|
|
self.config.rope_theta,
|
|
);
|
|
|
|
// Simplified attention output
|
|
let attn_out = gemv(&self.o_proj, &q, hidden, hidden);
|
|
|
|
// Residual
|
|
for i in 0..hidden {
|
|
hidden_state[i] += attn_out[i];
|
|
}
|
|
|
|
// Post-attention norm
|
|
rms_norm_inplace(hidden_state, &self.post_attn_norm, 1e-6);
|
|
|
|
// MLP
|
|
let gate = gemv(
|
|
&self.gate_proj,
|
|
hidden_state,
|
|
hidden,
|
|
self.config.intermediate_size,
|
|
);
|
|
let up = gemv(
|
|
&self.up_proj,
|
|
hidden_state,
|
|
hidden,
|
|
self.config.intermediate_size,
|
|
);
|
|
|
|
let mut mlp_out = Vec::with_capacity(self.config.intermediate_size);
|
|
for i in 0..self.config.intermediate_size {
|
|
let silu = gate[i] / (1.0 + (-gate[i]).exp());
|
|
mlp_out.push(silu * up[i]);
|
|
}
|
|
|
|
let down = gemv(
|
|
&self.down_proj,
|
|
&mlp_out,
|
|
self.config.intermediate_size,
|
|
hidden,
|
|
);
|
|
|
|
for i in 0..hidden {
|
|
hidden_state[i] += down[i];
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Simple KV cache for benchmarking
|
|
struct KvCache {
|
|
keys: Vec<f32>,
|
|
values: Vec<f32>,
|
|
num_tokens: usize,
|
|
config: RuvLtraSmallConfig,
|
|
}
|
|
|
|
impl KvCache {
|
|
fn new(config: RuvLtraSmallConfig, max_seq_len: usize) -> Self {
|
|
let capacity = max_seq_len * config.num_kv_heads * config.head_dim * config.num_layers;
|
|
Self {
|
|
keys: vec![0.0; capacity],
|
|
values: vec![0.0; capacity],
|
|
num_tokens: 0,
|
|
config,
|
|
}
|
|
}
|
|
|
|
fn append(&mut self, _k: &[f32], _v: &[f32], _layer: usize) {
|
|
self.num_tokens += 1;
|
|
}
|
|
|
|
fn len(&self) -> usize {
|
|
self.num_tokens
|
|
}
|
|
|
|
fn memory_bytes(&self) -> usize {
|
|
(self.keys.len() + self.values.len()) * std::mem::size_of::<f32>()
|
|
}
|
|
}
|
|
|
|
/// Full model for benchmarking
|
|
struct RuvLtraModel {
|
|
config: RuvLtraSmallConfig,
|
|
layers: Vec<RuvLtraLayer>,
|
|
embed_weights: Vec<f32>,
|
|
lm_head: Vec<f32>,
|
|
final_norm: Vec<f32>,
|
|
}
|
|
|
|
impl RuvLtraModel {
|
|
fn new(config: RuvLtraSmallConfig) -> Self {
|
|
let layers: Vec<_> = (0..config.num_layers)
|
|
.map(|_| RuvLtraLayer::new(config))
|
|
.collect();
|
|
|
|
Self {
|
|
config,
|
|
layers,
|
|
embed_weights: random_tensor(config.vocab_size * config.hidden_size),
|
|
lm_head: random_tensor(config.hidden_size * config.vocab_size),
|
|
final_norm: random_tensor(config.hidden_size),
|
|
}
|
|
}
|
|
|
|
/// Prefill phase: process prompt
|
|
fn prefill(&self, tokens: &[u32], kv_cache: &mut KvCache) -> Vec<f32> {
|
|
let seq_len = tokens.len();
|
|
let hidden = self.config.hidden_size;
|
|
|
|
// Embed tokens
|
|
let mut hidden_states = vec![0.0f32; seq_len * hidden];
|
|
for (i, &token) in tokens.iter().enumerate() {
|
|
let offset = (token as usize % self.config.vocab_size) * hidden;
|
|
hidden_states[i * hidden..(i + 1) * hidden]
|
|
.copy_from_slice(&self.embed_weights[offset..offset + hidden]);
|
|
}
|
|
|
|
// Forward through layers
|
|
for layer in &self.layers {
|
|
layer.prefill(&mut hidden_states, seq_len, kv_cache);
|
|
}
|
|
|
|
// Return last position's hidden state
|
|
hidden_states[(seq_len - 1) * hidden..].to_vec()
|
|
}
|
|
|
|
/// Decode phase: generate single token
|
|
fn decode(&self, prev_token: u32, kv_cache: &mut KvCache) -> u32 {
|
|
let hidden = self.config.hidden_size;
|
|
|
|
// Embed token
|
|
let offset = (prev_token as usize % self.config.vocab_size) * hidden;
|
|
let mut hidden_state = self.embed_weights[offset..offset + hidden].to_vec();
|
|
|
|
// Forward through layers
|
|
let kv_len = kv_cache.len();
|
|
for layer in &self.layers {
|
|
layer.decode(&mut hidden_state, kv_len);
|
|
}
|
|
|
|
// Final norm
|
|
rms_norm_inplace(&mut hidden_state, &self.final_norm, 1e-6);
|
|
|
|
// LM head (simplified - just pick argmax of first 100 logits)
|
|
let logits = gemv(&self.lm_head[..hidden * 100], &hidden_state, hidden, 100);
|
|
logits
|
|
.iter()
|
|
.enumerate()
|
|
.max_by(|a, b| a.1.partial_cmp(b.1).unwrap())
|
|
.map(|(i, _)| i as u32)
|
|
.unwrap_or(0)
|
|
}
|
|
|
|
/// E2E inference: prefill + decode
|
|
fn generate(
|
|
&self,
|
|
prompt_tokens: &[u32],
|
|
max_new_tokens: usize,
|
|
) -> (Duration, Duration, Vec<u32>) {
|
|
let mut kv_cache = KvCache::new(self.config, self.config.max_seq_len);
|
|
|
|
// Prefill
|
|
let prefill_start = Instant::now();
|
|
let _last_hidden = self.prefill(prompt_tokens, &mut kv_cache);
|
|
let prefill_time = prefill_start.elapsed();
|
|
|
|
// Decode
|
|
let mut output_tokens = Vec::with_capacity(max_new_tokens);
|
|
let mut prev_token = prompt_tokens.last().copied().unwrap_or(0);
|
|
|
|
let decode_start = Instant::now();
|
|
for _ in 0..max_new_tokens {
|
|
let next_token = self.decode(prev_token, &mut kv_cache);
|
|
output_tokens.push(next_token);
|
|
prev_token = next_token;
|
|
kv_cache.num_tokens += 1;
|
|
}
|
|
let decode_time = decode_start.elapsed();
|
|
|
|
(prefill_time, decode_time, output_tokens)
|
|
}
|
|
}
|
|
|
|
// ============================================================================
|
|
// SONA Integration Benchmarks
|
|
// ============================================================================
|
|
|
|
/// Simulated SONA instant loop overhead measurement
|
|
struct SonaOverhead {
|
|
trajectory_buffer: Vec<f32>,
|
|
pattern_cache: Vec<f32>,
|
|
ewc_fisher: Vec<f32>,
|
|
}
|
|
|
|
impl SonaOverhead {
|
|
fn new(hidden_dim: usize) -> Self {
|
|
Self {
|
|
trajectory_buffer: Vec::with_capacity(1024 * hidden_dim),
|
|
pattern_cache: random_tensor(100 * hidden_dim),
|
|
ewc_fisher: random_tensor(hidden_dim),
|
|
}
|
|
}
|
|
|
|
/// Measure instant loop overhead (<1ms target)
|
|
fn instant_loop(&mut self, query_embedding: &[f32], quality_score: f32) -> Duration {
|
|
let start = Instant::now();
|
|
|
|
// 1. Store trajectory (ring buffer append)
|
|
self.trajectory_buffer.extend_from_slice(query_embedding);
|
|
if self.trajectory_buffer.len() > 1024 * query_embedding.len() {
|
|
self.trajectory_buffer.drain(0..query_embedding.len());
|
|
}
|
|
|
|
// 2. Update micro-LoRA (simplified gradient step)
|
|
let lr = 0.01 * quality_score;
|
|
for (i, x) in query_embedding.iter().enumerate() {
|
|
if i < self.ewc_fisher.len() {
|
|
self.ewc_fisher[i] += lr * x * x;
|
|
}
|
|
}
|
|
|
|
// 3. Pattern similarity search (simplified)
|
|
let _similarity: f32 = self
|
|
.pattern_cache
|
|
.chunks(query_embedding.len())
|
|
.take(10)
|
|
.map(|p| {
|
|
p.iter()
|
|
.zip(query_embedding)
|
|
.map(|(a, b)| a * b)
|
|
.sum::<f32>()
|
|
})
|
|
.sum();
|
|
|
|
start.elapsed()
|
|
}
|
|
|
|
/// Measure pattern retrieval latency
|
|
fn pattern_search(&self, query: &[f32], k: usize) -> Duration {
|
|
let start = Instant::now();
|
|
|
|
let mut scores: Vec<(usize, f32)> = self
|
|
.pattern_cache
|
|
.chunks(query.len())
|
|
.enumerate()
|
|
.map(|(i, p)| {
|
|
let sim: f32 = p.iter().zip(query).map(|(a, b)| a * b).sum();
|
|
(i, sim)
|
|
})
|
|
.collect();
|
|
|
|
scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
|
|
black_box(&scores[..k.min(scores.len())]);
|
|
|
|
start.elapsed()
|
|
}
|
|
}
|
|
|
|
// ============================================================================
|
|
// Helper Functions
|
|
// ============================================================================
|
|
|
|
fn random_tensor(size: usize) -> Vec<f32> {
|
|
let mut rng = rand::thread_rng();
|
|
(0..size).map(|_| rng.gen_range(-0.1..0.1)).collect()
|
|
}
|
|
|
|
/// Applies RMS normalisation to `x` in place.
///
/// Each element is divided by `sqrt(mean(x^2) + eps)` and multiplied by the
/// matching entry of `weight`. Only the first `min(x.len(), weight.len())`
/// elements are rewritten; any remaining elements of `x` are left as they
/// were.
fn rms_norm_inplace(x: &mut [f32], weight: &[f32], eps: f32) {
    let mean_square = x.iter().map(|&v| v * v).sum::<f32>() / x.len() as f32;
    let inv_rms = 1.0 / (mean_square + eps).sqrt();

    for (value, &gain) in x.iter_mut().zip(weight) {
        *value = *value * inv_rms * gain;
    }
}
|
|
|
|
/// Computes `output[j] = sum over i of matrix[i * n + j] * vector[i]`.
///
/// `matrix` is read as `m` rows of `n` values (row-major), `vector` supplies
/// `m` inputs, and the result has `n` entries. On aarch64 the work is done
/// by a NEON kernel; elsewhere a scalar loop is used.
///
/// # Panics
///
/// For non-zero `n`, panics if `matrix` holds fewer than `m * n` values or
/// `vector` fewer than `m`. The checks run on every target so the NEON
/// kernel can never read out of bounds.
fn gemv(matrix: &[f32], vector: &[f32], m: usize, n: usize) -> Vec<f32> {
    let mut output = vec![0.0f32; n];
    if n == 0 {
        return output;
    }

    let required = m.checked_mul(n).expect("gemv: m * n overflows usize");
    assert!(
        matrix.len() >= required,
        "gemv: matrix holds {} values but m * n = {}",
        matrix.len(),
        required
    );
    assert!(
        vector.len() >= m,
        "gemv: vector holds {} values but m = {}",
        vector.len(),
        m
    );

    #[cfg(target_arch = "aarch64")]
    // SAFETY: the assertions above guarantee `matrix` has at least `m * n`
    // elements and `vector` at least `m`; `output` has exactly `n`.
    unsafe {
        gemv_neon_impl(matrix, vector, &mut output, m, n);
    }

    #[cfg(not(target_arch = "aarch64"))]
    {
        // Walk the matrix row by row so memory is read sequentially. Each
        // output still accumulates its terms in increasing row order.
        for (row, &v) in matrix[..required].chunks_exact(n).zip(vector) {
            for (out, &w) in output.iter_mut().zip(row) {
                *out += w * v;
            }
        }
    }

    output
}

/// NEON kernel behind [`gemv`]: four output columns per vector accumulator,
/// with a scalar loop for the remaining columns.
///
/// # Safety
///
/// The caller must ensure `matrix.len() >= m * n`, `vector.len() >= m` and
/// `output.len() >= n`; the kernel uses unchecked pointer arithmetic.
#[cfg(target_arch = "aarch64")]
unsafe fn gemv_neon_impl(matrix: &[f32], vector: &[f32], output: &mut [f32], m: usize, n: usize) {
    use std::arch::aarch64::*;

    let m_ptr = matrix.as_ptr();
    let v_ptr = vector.as_ptr();
    let o_ptr = output.as_mut_ptr();

    // Vectorised part: columns j..j+4 share one accumulator register.
    let mut j = 0usize;
    while j + 4 <= n {
        let mut acc = vdupq_n_f32(0.0);

        for i in 0..m {
            // Broadcast vector[i] and multiply-accumulate it with four
            // consecutive values of row i.
            let v_val = vdupq_n_f32(*v_ptr.add(i));
            let m_v = vld1q_f32(m_ptr.add(i * n + j));
            acc = vfmaq_f32(acc, v_val, m_v);
        }

        vst1q_f32(o_ptr.add(j), acc);
        j += 4;
    }

    // Scalar tail for the last `n % 4` columns.
    while j < n {
        let mut sum = 0.0f32;
        for i in 0..m {
            sum += *m_ptr.add(i * n + j) * *v_ptr.add(i);
        }
        *o_ptr.add(j) = sum;
        j += 1;
    }
}
|
|
|
|
/// Applies rotary position embedding to `x` in place.
///
/// `x` is treated as consecutive heads of `head_dim` values. Within every
/// head, the pair `(2i, 2i + 1)` is rotated by the angle
/// `position / theta^(2i / head_dim)`. Previously only the first head was
/// rotated, even when `x` held several heads.
///
/// A `head_dim` below 2 leaves `x` unchanged. With an odd `head_dim` the
/// last element of each head is not rotated, and a trailing partial head has
/// only its complete pairs rotated.
fn apply_rope(x: &mut [f32], head_dim: usize, position: usize, theta: f32) {
    let half_dim = head_dim / 2;
    if half_dim == 0 {
        // Nothing to rotate; this also avoids `chunks_mut(0)`, which panics.
        return;
    }

    for i in 0..half_dim {
        // The angle depends only on the pair index and the position, so it
        // is computed once and reused for every head.
        let freq = 1.0 / theta.powf((2 * i) as f32 / head_dim as f32);
        let angle = position as f32 * freq;
        let cos_theta = angle.cos();
        let sin_theta = angle.sin();

        for head in x.chunks_mut(head_dim) {
            if i * 2 + 1 < head.len() {
                let x0 = head[i * 2];
                let x1 = head[i * 2 + 1];
                head[i * 2] = x0 * cos_theta - x1 * sin_theta;
                head[i * 2 + 1] = x1 * cos_theta + x0 * sin_theta;
            }
        }
    }
}
|
|
|
|
// ============================================================================
|
|
// Benchmark Functions
|
|
// ============================================================================
|
|
|
|
/// Benchmark prefill phase (prompt processing)
|
|
fn bench_prefill(c: &mut Criterion) {
|
|
let mut group = c.benchmark_group("ruvltra_prefill");
|
|
group.sample_size(20);
|
|
|
|
let config = RuvLtraSmallConfig::default();
|
|
let model = RuvLtraModel::new(config);
|
|
|
|
// Test different prompt lengths
|
|
let prompt_lengths = [32, 256, 1024];
|
|
|
|
for &prompt_len in &prompt_lengths {
|
|
let prompt_tokens: Vec<u32> = (0..prompt_len).map(|i| i as u32 % 32000).collect();
|
|
let mut kv_cache = KvCache::new(config, config.max_seq_len);
|
|
|
|
let throughput = prompt_len as u64;
|
|
let id = BenchmarkId::new(format!("seq_{}", prompt_len), prompt_len);
|
|
|
|
group.throughput(Throughput::Elements(throughput));
|
|
group.bench_function(id, |b| {
|
|
b.iter(|| {
|
|
kv_cache.num_tokens = 0;
|
|
model.prefill(black_box(&prompt_tokens), black_box(&mut kv_cache))
|
|
})
|
|
});
|
|
}
|
|
|
|
group.finish();
|
|
}
|
|
|
|
/// Benchmark decode phase (token generation)
|
|
fn bench_decode(c: &mut Criterion) {
|
|
let mut group = c.benchmark_group("ruvltra_decode");
|
|
group.sample_size(50);
|
|
|
|
let config = RuvLtraSmallConfig::default();
|
|
let model = RuvLtraModel::new(config);
|
|
|
|
// Test with different KV cache lengths
|
|
let kv_lengths = [32, 256, 1024];
|
|
|
|
for &kv_len in &kv_lengths {
|
|
let mut kv_cache = KvCache::new(config, config.max_seq_len);
|
|
kv_cache.num_tokens = kv_len;
|
|
|
|
let id = BenchmarkId::new(format!("kv_len_{}", kv_len), kv_len);
|
|
|
|
group.throughput(Throughput::Elements(1)); // 1 token per iteration
|
|
group.bench_function(id, |b| {
|
|
b.iter(|| model.decode(black_box(42), black_box(&mut kv_cache)))
|
|
});
|
|
}
|
|
|
|
group.finish();
|
|
}
|
|
|
|
/// Benchmark E2E latency (first token + total time)
|
|
fn bench_e2e_latency(c: &mut Criterion) {
|
|
let mut group = c.benchmark_group("ruvltra_e2e_latency");
|
|
group.sample_size(10);
|
|
|
|
let config = RuvLtraSmallConfig::default();
|
|
let model = RuvLtraModel::new(config);
|
|
|
|
// Benchmark scenarios
|
|
let scenarios = [
|
|
("short", 32, 128), // Short prompt -> 128 tokens
|
|
("medium", 256, 256), // Medium prompt -> 256 tokens
|
|
("long", 1024, 512), // Long prompt -> 512 tokens
|
|
];
|
|
|
|
for (name, prompt_len, output_len) in scenarios {
|
|
let prompt_tokens: Vec<u32> = (0..prompt_len).map(|i| i as u32 % 32000).collect();
|
|
|
|
let id = BenchmarkId::new(
|
|
format!("{}_p{}_o{}", name, prompt_len, output_len),
|
|
prompt_len,
|
|
);
|
|
|
|
group.throughput(Throughput::Elements((prompt_len + output_len) as u64));
|
|
group.bench_function(id, |b| {
|
|
b.iter_custom(|iters| {
|
|
let mut total = Duration::ZERO;
|
|
for _ in 0..iters {
|
|
let (prefill, decode, _) =
|
|
model.generate(black_box(&prompt_tokens), output_len);
|
|
total += prefill + decode;
|
|
}
|
|
total
|
|
})
|
|
});
|
|
}
|
|
|
|
group.finish();
|
|
}
|
|
|
|
/// Benchmark throughput (tokens/sec)
|
|
fn bench_throughput(c: &mut Criterion) {
|
|
let mut group = c.benchmark_group("ruvltra_throughput");
|
|
group.sample_size(10);
|
|
|
|
let config = RuvLtraSmallConfig::default();
|
|
let model = RuvLtraModel::new(config);
|
|
|
|
// Measure decode throughput at different batch points
|
|
let decode_batches = [10, 50, 100];
|
|
|
|
for &num_tokens in &decode_batches {
|
|
let mut kv_cache = KvCache::new(config, config.max_seq_len);
|
|
kv_cache.num_tokens = 256; // Assume 256 context
|
|
|
|
let id = BenchmarkId::new(format!("decode_{}_tokens", num_tokens), num_tokens);
|
|
|
|
group.throughput(Throughput::Elements(num_tokens as u64));
|
|
group.bench_function(id, |b| {
|
|
b.iter_custom(|iters| {
|
|
let mut total = Duration::ZERO;
|
|
for _ in 0..iters {
|
|
let start = Instant::now();
|
|
let mut prev_token = 42u32;
|
|
for _ in 0..num_tokens {
|
|
prev_token = model.decode(black_box(prev_token), black_box(&mut kv_cache));
|
|
}
|
|
total += start.elapsed();
|
|
kv_cache.num_tokens = 256; // Reset
|
|
}
|
|
total
|
|
})
|
|
});
|
|
}
|
|
|
|
group.finish();
|
|
}
|
|
|
|
/// Benchmark memory usage
|
|
fn bench_memory(c: &mut Criterion) {
|
|
let mut group = c.benchmark_group("ruvltra_memory");
|
|
group.sample_size(20);
|
|
|
|
let config = RuvLtraSmallConfig::default();
|
|
|
|
// Print memory estimates
|
|
println!("\n=== RuvLTRA-Small Memory Estimates ===");
|
|
println!("Total parameters: {}M", config.total_params() / 1_000_000);
|
|
|
|
for quant in [
|
|
QuantFormat::F16,
|
|
QuantFormat::Q8_0,
|
|
QuantFormat::Q5_K_M,
|
|
QuantFormat::Q4_K_M,
|
|
] {
|
|
let model_mb = config.memory_bytes(quant) / (1024 * 1024);
|
|
let kv_1k_mb = config.kv_cache_bytes(1024, quant) / (1024 * 1024);
|
|
let kv_4k_mb = config.kv_cache_bytes(4096, quant) / (1024 * 1024);
|
|
|
|
println!(
|
|
"{}: Model={}MB, KV@1K={}MB, KV@4K={}MB, Total@1K={}MB",
|
|
quant.name(),
|
|
model_mb,
|
|
kv_1k_mb,
|
|
kv_4k_mb,
|
|
model_mb + kv_1k_mb
|
|
);
|
|
}
|
|
println!();
|
|
|
|
// Benchmark actual allocation patterns
|
|
let seq_lengths = [256, 512, 1024, 2048];
|
|
|
|
for &seq_len in &seq_lengths {
|
|
let id = BenchmarkId::new(format!("kv_cache_seq_{}", seq_len), seq_len);
|
|
|
|
reset_memory_tracking();
|
|
|
|
group.bench_function(id, |b| {
|
|
b.iter(|| {
|
|
let kv_cache = KvCache::new(config, seq_len);
|
|
black_box(kv_cache.memory_bytes())
|
|
})
|
|
});
|
|
}
|
|
|
|
group.finish();
|
|
}
|
|
|
|
/// Benchmark quantization comparison
|
|
fn bench_quantization(c: &mut Criterion) {
|
|
let mut group = c.benchmark_group("ruvltra_quantization");
|
|
group.sample_size(30);
|
|
|
|
let config = RuvLtraSmallConfig::default();
|
|
|
|
// Simulate quantized weight loading and dequant
|
|
let hidden = config.hidden_size;
|
|
let weights_f32 = random_tensor(hidden * hidden);
|
|
|
|
// Q8_0 simulation
|
|
let weights_q8: Vec<i8> = weights_f32
|
|
.iter()
|
|
.map(|&x| (x * 127.0).clamp(-127.0, 127.0) as i8)
|
|
.collect();
|
|
|
|
// Q4 simulation (packed)
|
|
let weights_q4: Vec<u8> = weights_f32
|
|
.chunks(2)
|
|
.map(|chunk| {
|
|
let q0 = ((chunk[0] + 1.0) * 7.5).clamp(0.0, 15.0) as u8;
|
|
let q1 = ((chunk.get(1).copied().unwrap_or(0.0) + 1.0) * 7.5).clamp(0.0, 15.0) as u8;
|
|
(q1 << 4) | q0
|
|
})
|
|
.collect();
|
|
|
|
// Benchmark dequantization overhead
|
|
group.bench_function("dequant_q8_0", |b| {
|
|
let scale = 1.0f32 / 127.0;
|
|
b.iter(|| {
|
|
let dequant: Vec<f32> = weights_q8
|
|
.iter()
|
|
.map(|&q| black_box(q as f32 * scale))
|
|
.collect();
|
|
black_box(dequant)
|
|
})
|
|
});
|
|
|
|
group.bench_function("dequant_q4_k_m", |b| {
|
|
let scale = 1.0f32 / 7.5;
|
|
b.iter(|| {
|
|
let dequant: Vec<f32> = weights_q4
|
|
.iter()
|
|
.flat_map(|&packed| {
|
|
let q0 = (packed & 0x0F) as f32 * scale - 1.0;
|
|
let q1 = ((packed >> 4) & 0x0F) as f32 * scale - 1.0;
|
|
[q0, q1]
|
|
})
|
|
.collect();
|
|
black_box(dequant)
|
|
})
|
|
});
|
|
|
|
group.finish();
|
|
}
|
|
|
|
/// Benchmark SONA overhead
|
|
fn bench_sona_overhead(c: &mut Criterion) {
|
|
let mut group = c.benchmark_group("ruvltra_sona_overhead");
|
|
group.sample_size(100);
|
|
|
|
let config = RuvLtraSmallConfig::default();
|
|
let mut sona = SonaOverhead::new(config.hidden_size);
|
|
|
|
let query_embedding = random_tensor(config.hidden_size);
|
|
|
|
// Instant loop overhead (target: <1ms)
|
|
group.bench_function("instant_loop", |b| {
|
|
b.iter_custom(|iters| {
|
|
let mut total = Duration::ZERO;
|
|
for _ in 0..iters {
|
|
total += sona.instant_loop(black_box(&query_embedding), 0.8);
|
|
}
|
|
total
|
|
})
|
|
});
|
|
|
|
// Pattern retrieval latency
|
|
for k in [5, 10, 20] {
|
|
let id = BenchmarkId::new(format!("pattern_search_top{}", k), k);
|
|
group.bench_function(id, |b| {
|
|
b.iter_custom(|iters| {
|
|
let mut total = Duration::ZERO;
|
|
for _ in 0..iters {
|
|
total += sona.pattern_search(black_box(&query_embedding), k);
|
|
}
|
|
total
|
|
})
|
|
});
|
|
}
|
|
|
|
// Combined: with vs without SONA
|
|
let model = RuvLtraModel::new(config);
|
|
let mut kv_cache = KvCache::new(config, config.max_seq_len);
|
|
kv_cache.num_tokens = 256;
|
|
|
|
group.bench_function("decode_without_sona", |b| {
|
|
b.iter(|| model.decode(black_box(42), black_box(&mut kv_cache)))
|
|
});
|
|
|
|
group.bench_function("decode_with_sona_instant", |b| {
|
|
b.iter(|| {
|
|
let token = model.decode(black_box(42), black_box(&mut kv_cache));
|
|
sona.instant_loop(&query_embedding, 0.8);
|
|
token
|
|
})
|
|
});
|
|
|
|
group.finish();
|
|
}
|
|
|
|
/// Benchmark backend comparison (simulated)
|
|
fn bench_backend_comparison(c: &mut Criterion) {
|
|
let mut group = c.benchmark_group("ruvltra_backend_comparison");
|
|
group.sample_size(30);
|
|
|
|
let config = RuvLtraSmallConfig::default();
|
|
let hidden = config.hidden_size;
|
|
|
|
// Simulate different backend speeds with scaling factors
|
|
// These represent relative performance characteristics
|
|
let matrix_a = random_tensor(hidden * hidden);
|
|
let vector_x = random_tensor(hidden);
|
|
let mut output = vec![0.0f32; hidden];
|
|
|
|
// Pure NEON baseline
|
|
group.bench_function("neon_gemv", |b| {
|
|
b.iter(|| {
|
|
gemv(black_box(&matrix_a), black_box(&vector_x), hidden, hidden);
|
|
})
|
|
});
|
|
|
|
// Simulated ANE (typically 1.3-1.5x faster for supported ops)
|
|
#[cfg(all(target_os = "macos", feature = "coreml"))]
|
|
{
|
|
group.bench_function("ane_gemv_simulated", |b| {
|
|
b.iter(|| {
|
|
// In practice, this would use ruvllm::kernels::ane_ops
|
|
let result = gemv(black_box(&matrix_a), black_box(&vector_x), hidden, hidden);
|
|
// ANE would have ~30% less overhead in practice
|
|
black_box(result)
|
|
})
|
|
});
|
|
}
|
|
|
|
// Simulated hybrid (ANE matmul + NEON activations)
|
|
#[cfg(all(target_os = "macos", feature = "coreml"))]
|
|
{
|
|
group.bench_function("hybrid_layer_simulated", |b| {
|
|
let gate_proj = random_tensor(hidden * config.intermediate_size);
|
|
let up_proj = random_tensor(hidden * config.intermediate_size);
|
|
let down_proj = random_tensor(config.intermediate_size * hidden);
|
|
|
|
b.iter(|| {
|
|
// ANE: matmul
|
|
let gate = gemv(&gate_proj, &vector_x, hidden, config.intermediate_size);
|
|
let up = gemv(&up_proj, &vector_x, hidden, config.intermediate_size);
|
|
|
|
// NEON: SiLU activation
|
|
let mut intermediate = Vec::with_capacity(config.intermediate_size);
|
|
for i in 0..config.intermediate_size {
|
|
let silu = gate[i] / (1.0 + (-gate[i]).exp());
|
|
intermediate.push(silu * up[i]);
|
|
}
|
|
|
|
// ANE: matmul
|
|
let output = gemv(&down_proj, &intermediate, config.intermediate_size, hidden);
|
|
black_box(output)
|
|
})
|
|
});
|
|
}
|
|
|
|
// Metal GPU comparison placeholder
|
|
#[cfg(all(target_os = "macos", feature = "metal-compute"))]
|
|
{
|
|
group.bench_function("metal_gemv_simulated", |b| {
|
|
// In practice, this would use Metal compute shaders
|
|
b.iter(|| gemv(black_box(&matrix_a), black_box(&vector_x), hidden, hidden))
|
|
});
|
|
}
|
|
|
|
group.finish();
|
|
}
|
|
|
|
/// Summary benchmark with target metrics
|
|
fn bench_targets_summary(c: &mut Criterion) {
|
|
let mut group = c.benchmark_group("ruvltra_targets");
|
|
group.sample_size(10);
|
|
|
|
let config = RuvLtraSmallConfig::default();
|
|
let model = RuvLtraModel::new(config);
|
|
|
|
// Target: 80+ tok/s decode (Q4)
|
|
// Measure actual throughput
|
|
{
|
|
let mut kv_cache = KvCache::new(config, config.max_seq_len);
|
|
kv_cache.num_tokens = 256;
|
|
|
|
group.bench_function("target_decode_80_toks", |b| {
|
|
b.iter_custom(|iters| {
|
|
let mut total = Duration::ZERO;
|
|
for _ in 0..iters {
|
|
let start = Instant::now();
|
|
for _ in 0..80 {
|
|
black_box(model.decode(42, &mut kv_cache));
|
|
}
|
|
total += start.elapsed();
|
|
kv_cache.num_tokens = 256;
|
|
}
|
|
total
|
|
})
|
|
});
|
|
}
|
|
|
|
// Target: <50ms first token
|
|
{
|
|
let prompt_tokens: Vec<u32> = (0..256).map(|i| i as u32 % 32000).collect();
|
|
|
|
group.bench_function("target_first_token_50ms", |b| {
|
|
b.iter_custom(|iters| {
|
|
let mut total = Duration::ZERO;
|
|
for _ in 0..iters {
|
|
let mut kv_cache = KvCache::new(config, config.max_seq_len);
|
|
let start = Instant::now();
|
|
black_box(model.prefill(&prompt_tokens, &mut kv_cache));
|
|
black_box(
|
|
model.decode(prompt_tokens.last().copied().unwrap_or(0), &mut kv_cache),
|
|
);
|
|
total += start.elapsed();
|
|
}
|
|
total
|
|
})
|
|
});
|
|
}
|
|
|
|
// Memory target: <500MB for Q4
|
|
{
|
|
let model_mem = config.memory_bytes(QuantFormat::Q4_K_M);
|
|
let kv_mem = config.kv_cache_bytes(1024, QuantFormat::Q4_K_M);
|
|
let total_mb = (model_mem + kv_mem) / (1024 * 1024);
|
|
|
|
println!("\n=== Memory Target Check ===");
|
|
println!("Q4_K_M model: {} MB", model_mem / (1024 * 1024));
|
|
println!("KV cache @1K: {} MB", kv_mem / (1024 * 1024));
|
|
println!("Total: {} MB (target: <500MB)", total_mb);
|
|
println!("Status: {}", if total_mb < 500 { "PASS" } else { "FAIL" });
|
|
println!();
|
|
}
|
|
|
|
group.finish();
|
|
}
|
|
|
|
// ============================================================================
|
|
// Criterion Groups
|
|
// ============================================================================
|
|
|
|
// Prefill benchmarks: significance level 0.05, noise threshold 0.02.
criterion_group!(
    name = prefill_benches;
    config = Criterion::default().significance_level(0.05).noise_threshold(0.02);
    targets = bench_prefill
);
|
|
|
|
// Decode benchmarks: significance level 0.05, noise threshold 0.02.
criterion_group!(
    name = decode_benches;
    config = Criterion::default().significance_level(0.05).noise_threshold(0.02);
    targets = bench_decode
);
|
|
|
|
// End-to-end latency and throughput benchmarks: significance level 0.05,
// with a wider 0.05 noise threshold than the prefill/decode groups.
criterion_group!(
    name = e2e_benches;
    config = Criterion::default().significance_level(0.05).noise_threshold(0.05);
    targets = bench_e2e_latency, bench_throughput
);
|
|
|
|
// Memory and quantization benchmarks: significance level 0.05; no explicit
// noise threshold is set for this group.
criterion_group!(
    name = memory_benches;
    config = Criterion::default().significance_level(0.05);
    targets = bench_memory, bench_quantization
);
|
|
|
|
// SONA overhead benchmarks: significance level 0.05, noise threshold 0.02.
criterion_group!(
    name = sona_benches;
    config = Criterion::default().significance_level(0.05).noise_threshold(0.02);
    targets = bench_sona_overhead
);
|
|
|
|
// Backend comparison benchmarks: significance level 0.05; no explicit noise
// threshold is set for this group.
criterion_group!(
    name = backend_benches;
    config = Criterion::default().significance_level(0.05);
    targets = bench_backend_comparison
);
|
|
|
|
// Target-metric summary benchmarks: significance level 0.05, 10 samples.
criterion_group!(
    name = target_benches;
    config = Criterion::default().significance_level(0.05).sample_size(10);
    targets = bench_targets_summary
);
|
|
|
|
// Benchmark binary entry point: lists every group defined above.
criterion_main!(
    prefill_benches,
    decode_benches,
    e2e_benches,
    memory_benches,
    sona_benches,
    backend_benches,
    target_benches
);
|