Files
wifi-densepose/examples/ruvLLM/esp32/benches/esp32_simulation.rs
ruv d803bfe2b1 Squashed 'vendor/ruvector/' content from commit b64c2172
git-subtree-dir: vendor/ruvector
git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
2026-02-28 14:39:40 -05:00

316 lines
9.8 KiB
Rust

//! ESP32 Simulation Benchmarks
//!
//! Simulates ESP32 performance constraints to validate the implementation
//! will work on actual hardware.
use criterion::{black_box, criterion_group, criterion_main, Criterion, BenchmarkId};
use std::time::Duration;
// Import the ESP32 crate (compiled for host for simulation)
#[path = "../src/lib.rs"]
mod ruvllm_esp32;
use ruvllm_esp32::prelude::*;
use ruvllm_esp32::model::ModelConfig;
use ruvllm_esp32::quantized::{QuantizationType, matmul_int8, QuantParams};
use ruvllm_esp32::attention::MicroAttention;
/// ESP32 clock speed in MHz
const ESP32_CLOCK_MHZ: u64 = 240;

/// Estimated cycles per INT8 multiply-accumulate on ESP32
const CYCLES_PER_MAC: u64 = 4;

/// Estimate ESP32 execution time from an x86 measurement.
///
/// Two independent estimates are computed and the larger (more pessimistic)
/// one is returned:
///
/// * a cycle model: `mac_ops * CYCLES_PER_MAC` cycles at `ESP32_CLOCK_MHZ`;
/// * the measured `x86_duration` scaled by a fixed slowdown factor.
///
/// # Panics
///
/// Panics if the resulting number of seconds does not fit in a [`Duration`]
/// (inherited from [`Duration::from_secs_f64`]).
fn estimate_esp32_time(x86_duration: Duration, mac_ops: u64) -> Duration {
    // ESP32 is roughly 10-20x slower than modern x86 for pure compute;
    // 15x is used as the scaling factor for the host measurement.
    const X86_SLOWDOWN: f64 = 15.0;

    // The cycle count is formed in f64 rather than u64: an integer multiply
    // overflows for very large `mac_ops`, which panics when overflow checks
    // are enabled. Scaling by a small power of two is exact in f64, so the
    // result is unchanged for every input the integer form could handle.
    let estimated_cycles = mac_ops as f64 * CYCLES_PER_MAC as f64;
    let clock_hz = ESP32_CLOCK_MHZ as f64 * 1_000_000.0;

    let cycle_model_secs = estimated_cycles / clock_hz;
    let scaled_host_secs = x86_duration.as_secs_f64() * X86_SLOWDOWN;

    Duration::from_secs_f64(cycle_model_secs.max(scaled_host_secs))
}
/// Benchmarks `matmul_int8` on the host for a handful of layer shapes.
///
/// For each `(out_dim, in_dim)` pair the weight and input buffers are filled
/// with a deterministic repeating pattern, the kernel is timed by Criterion,
/// and a cycle-count-based ESP32 time estimate is printed next to it.
fn benchmark_matmul_int8(c: &mut Criterion) {
    // Maps an index onto a repeating pattern covering -128..=127.
    //
    // The subtraction is performed in `i32` and only then narrowed to `i8`.
    // Casting to `i8` first and subtracting 128 afterwards overflows for every
    // pattern value >= 128, which panics whenever overflow checks are enabled
    // (debug/test builds). The values produced here equal the wrapped results
    // of that form, so release-mode benchmark inputs are unchanged.
    let pattern = |index: usize, stride: usize| (((index * stride) % 256) as i32 - 128) as i8;

    let mut group = c.benchmark_group("INT8 MatMul");
    group.warm_up_time(Duration::from_millis(500));
    group.measurement_time(Duration::from_secs(3));

    // Test different sizes typical for ESP32 models
    for (out_dim, in_dim) in [(32, 32), (64, 64), (128, 64), (64, 128)] {
        let weights: Vec<i8> = (0..out_dim * in_dim).map(|i| pattern(i, 17)).collect();
        let input: Vec<i8> = (0..in_dim).map(|i| pattern(i, 13)).collect();
        let mut output = vec![0i32; out_dim];
        // The same default parameters are passed for both parameter slots.
        let params = QuantParams::default();

        // One multiply-accumulate per weight element.
        let mac_ops = (out_dim * in_dim) as u64;

        group.bench_with_input(
            BenchmarkId::new("size", format!("{}x{}", out_dim, in_dim)),
            &(out_dim, in_dim),
            |b, _| {
                b.iter(|| {
                    matmul_int8(
                        black_box(&weights),
                        black_box(&params),
                        black_box(&input),
                        black_box(&params),
                        black_box(&mut output),
                        out_dim,
                        in_dim,
                    )
                })
            },
        );

        // Print ESP32 estimate: cycles divided by the clock in MHz gives microseconds.
        println!(
            " {}x{}: {} MAC ops, estimated ESP32 time: {:.1} us",
            out_dim,
            in_dim,
            mac_ops,
            mac_ops as f64 * CYCLES_PER_MAC as f64 / ESP32_CLOCK_MHZ as f64
        );
    }

    group.finish();
}
/// Benchmarks `MicroAttention::compute_scores` for several
/// `(embed_dim, num_heads, seq_len)` configurations.
///
/// A single query of `head_dim` elements is scored against `seq_len` keys of
/// the same length; all buffers hold deterministic values in `0..128`.
fn benchmark_attention(c: &mut Criterion) {
    let mut group = c.benchmark_group("Micro Attention");
    group.warm_up_time(Duration::from_millis(500));
    group.measurement_time(Duration::from_secs(3));

    for (embed_dim, num_heads, seq_len) in [(64, 4, 16), (64, 4, 32), (32, 2, 16)] {
        let head_dim = embed_dim / num_heads;
        let attn = MicroAttention::new(embed_dim, num_heads);

        // Query vector for one head.
        let query: Vec<i8> = (0..head_dim)
            .map(|elem| (elem * 7 % 128) as i8)
            .collect();

        // One key vector per sequence position, each shifted by its position.
        let keys: Vec<Vec<i8>> = (0..seq_len)
            .map(|pos| {
                (0..head_dim)
                    .map(|elem| ((elem + pos) * 11 % 128) as i8)
                    .collect()
            })
            .collect();

        // Borrowed slice views over the owned key vectors.
        let key_refs: Vec<&[i8]> = keys.iter().map(Vec::as_slice).collect();
        let mut scores = vec![0i32; seq_len];

        let label = format!("d{}_h{}_s{}", embed_dim, num_heads, seq_len);
        group.bench_with_input(BenchmarkId::new("config", label), &seq_len, |bencher, _| {
            bencher.iter(|| {
                attn.compute_scores(
                    black_box(&query),
                    black_box(&key_refs),
                    black_box(&mut scores),
                )
            })
        });
    }

    group.finish();
}
/// Benchmarks a single-token forward pass (`MicroEngine::forward_one`) for
/// three hand-written model configurations, one per ESP32 variant label.
///
/// The engine is reset inside the timed closure before every forward call.
/// After each benchmark a one-line summary of the configuration is printed.
fn benchmark_full_forward(c: &mut Criterion) {
    let mut group = c.benchmark_group("Full Forward Pass");
    group.warm_up_time(Duration::from_millis(1000));
    group.measurement_time(Duration::from_secs(5));

    // Test configurations for different ESP32 variants
    let variant_configs = [
        (
            "ESP32",
            ModelConfig {
                vocab_size: 256,
                embed_dim: 64,
                hidden_dim: 128,
                num_layers: 2,
                num_heads: 4,
                max_seq_len: 32,
                quant_type: QuantizationType::Int8,
            },
        ),
        (
            "ESP32-S2",
            ModelConfig {
                vocab_size: 128,
                embed_dim: 32,
                hidden_dim: 64,
                num_layers: 1,
                num_heads: 2,
                max_seq_len: 16,
                quant_type: QuantizationType::Int8,
            },
        ),
        (
            "ESP32-S3",
            ModelConfig {
                vocab_size: 512,
                embed_dim: 64,
                hidden_dim: 128,
                num_layers: 2,
                num_heads: 4,
                max_seq_len: 32,
                quant_type: QuantizationType::Int8,
            },
        ),
    ];

    for (label, model_config) in variant_configs {
        // The config is cloned because it is still needed for the summary below.
        let tiny_model = TinyModel::new(model_config.clone()).unwrap();
        let mut engine = MicroEngine::new(tiny_model).unwrap();
        let estimated_bytes = model_config.estimate_size();

        group.bench_with_input(BenchmarkId::new("variant", label), &label, |bencher, _| {
            bencher.iter(|| {
                engine.reset();
                black_box(engine.forward_one(black_box(42)).unwrap())
            })
        });

        println!(
            " {}: model size {} KB, embed_dim {}, layers {}",
            label,
            estimated_bytes / 1024,
            model_config.embed_dim,
            model_config.num_layers
        );
    }

    group.finish();
}
/// Benchmarks `MicroEngine::generate` with a five-token prompt and a
/// `max_tokens` setting of 10, using the model configuration returned by
/// `ModelConfig::for_variant(Esp32Variant::Esp32)`.
///
/// The engine is reset inside the timed closure before every generation run.
fn benchmark_generation(c: &mut Criterion) {
    // Build the engine and the generation inputs up front.
    let model = TinyModel::new(ModelConfig::for_variant(Esp32Variant::Esp32)).unwrap();
    let mut engine = MicroEngine::new(model).unwrap();

    let prompt = [1u16, 2, 3, 4, 5];
    let gen_config = InferenceConfig {
        max_tokens: 10,
        greedy: true,
        ..Default::default()
    };

    let mut group = c.benchmark_group("Token Generation");
    group.warm_up_time(Duration::from_millis(1000));
    group.measurement_time(Duration::from_secs(5));
    // Fewer samples for slower operation
    group.sample_size(20);

    group.bench_function("generate_10_tokens", |bencher| {
        bencher.iter(|| {
            engine.reset();
            let generated = engine
                .generate(black_box(&prompt), black_box(&gen_config))
                .unwrap();
            black_box(generated)
        })
    });

    group.finish();
}
/// Checks that the per-variant default model fits in that variant's RAM
/// budget, printing a memory breakdown for each variant along the way.
///
/// For every listed `Esp32Variant`, the configuration from
/// `ModelConfig::for_variant` is instantiated and `engine.memory_usage()` is
/// compared against `variant.max_model_ram()`.
///
/// # Panics
///
/// Panics if any variant's total usage exceeds its available RAM, or if the
/// model or engine cannot be constructed.
fn benchmark_memory_constraints(c: &mut Criterion) {
    let mut group = c.benchmark_group("Memory Validation");

    // Validate that models fit within ESP32 memory constraints
    for variant in [
        Esp32Variant::Esp32,
        Esp32Variant::Esp32S2,
        Esp32Variant::Esp32S3,
        Esp32Variant::Esp32C3,
        Esp32Variant::Esp32C6,
    ] {
        let config = ModelConfig::for_variant(variant);
        let model = TinyModel::new(config).unwrap();
        let engine = MicroEngine::new(model).unwrap();

        let usage = engine.memory_usage();
        let available = variant.max_model_ram();

        println!(" {:?}:", variant);
        println!(" Available RAM: {} KB", available / 1024);
        println!(" Model weights: {} KB", usage.model_weights / 1024);
        println!(" Activations: {} KB", usage.activation_buffers / 1024);
        println!(" KV cache: {} KB", usage.kv_cache / 1024);
        println!(" Total used: {} KB", usage.total / 1024);

        // Check the budget BEFORE computing headroom: `available - usage.total`
        // underflows when the model is over budget, which would otherwise
        // surface as an arithmetic panic (or a nonsensical wrapped value)
        // instead of this descriptive failure.
        assert!(
            usage.total <= available,
            "{:?} exceeds memory: {} > {}",
            variant,
            usage.total,
            available
        );

        println!(" Headroom: {} KB", (available - usage.total) / 1024);
        println!();
    }

    // Dummy benchmark to satisfy criterion
    group.bench_function("memory_check", |b| {
        b.iter(|| black_box(Esp32Variant::Esp32.max_model_ram()))
    });

    group.finish();
}
fn benchmark_quantization(c: &mut Criterion) {
let mut group = c.benchmark_group("Quantization");
group.warm_up_time(Duration::from_millis(500));
group.measurement_time(Duration::from_secs(3));
use ruvllm_esp32::quantized::QuantizedTensor;
// Test quantization of different sized tensors
for size in [256, 1024, 4096] {
let data: Vec<f32> = (0..size)
.map(|i| (i as f32 / size as f32) * 2.0 - 1.0)
.collect();
group.bench_with_input(
BenchmarkId::new("int8", size),
&size,
|b, _| {
b.iter(|| {
QuantizedTensor::<16384>::from_f32(
black_box(&data),
&[size],
QuantizationType::Int8,
).unwrap()
})
},
);
group.bench_with_input(
BenchmarkId::new("int4", size),
&size,
|b, _| {
b.iter(|| {
QuantizedTensor::<16384>::from_f32(
black_box(&data),
&[size],
QuantizationType::Int4,
).unwrap()
})
},
);
group.bench_with_input(
BenchmarkId::new("binary", size),
&size,
|b, _| {
b.iter(|| {
QuantizedTensor::<16384>::from_f32(
black_box(&data),
&[size],
QuantizationType::Binary,
).unwrap()
})
},
);
}
group.finish();
}
// Collect every benchmark function in this file into a single Criterion
// group named `benches`. The functions are listed in the same order in which
// they are defined above.
criterion_group!(
benches,
benchmark_matmul_int8,
benchmark_attention,
benchmark_full_forward,
benchmark_generation,
benchmark_memory_constraints,
benchmark_quantization,
);
// Hand the `benches` group to Criterion's harness macro, which (per the
// Criterion documentation) expands to the benchmark executable's entry point.
criterion_main!(benches);