git-subtree-dir: vendor/ruvector git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
316 lines
9.8 KiB
Rust
316 lines
9.8 KiB
Rust
//! ESP32 Simulation Benchmarks
//!
//! Simulates ESP32 performance constraints to validate that the implementation
//! will work on actual hardware.
|
|
|
|
use criterion::{black_box, criterion_group, criterion_main, Criterion, BenchmarkId};
|
|
use std::time::Duration;
|
|
|
|
// Import the ESP32 crate (compiled for host for simulation)
|
|
#[path = "../src/lib.rs"]
|
|
mod ruvllm_esp32;
|
|
|
|
use ruvllm_esp32::prelude::*;
|
|
use ruvllm_esp32::model::ModelConfig;
|
|
use ruvllm_esp32::quantized::{QuantizationType, matmul_int8, QuantParams};
|
|
use ruvllm_esp32::attention::MicroAttention;
|
|
|
|
/// ESP32 clock speed in MHz
const ESP32_CLOCK_MHZ: u64 = 240;

/// Estimated cycles per INT8 multiply-accumulate on ESP32
const CYCLES_PER_MAC: u64 = 4;

/// Estimate ESP32 execution time from an x86 measurement.
///
/// Two independent estimates are computed and the larger (more pessimistic)
/// one is returned:
///
/// * a cycle model: `mac_ops * CYCLES_PER_MAC` cycles executed at
///   `ESP32_CLOCK_MHZ`;
/// * a flat scaling of the measured `x86_duration` by a factor of 15.
///
/// # Panics
///
/// Panics (inside [`Duration::from_secs_f64`]) if the resulting number of
/// seconds does not fit in a `Duration`.
fn estimate_esp32_time(x86_duration: Duration, mac_ops: u64) -> Duration {
    // ESP32 is roughly 10-20x slower than modern x86 for pure compute;
    // the midpoint of that range is used as the scaling factor.
    const X86_TO_ESP32_SLOWDOWN: f64 = 15.0;

    // The cycle count is formed in f64 so that a very large `mac_ops` cannot
    // overflow u64 (which would panic in debug builds and silently wrap to a
    // far-too-small estimate in release builds).
    let estimated_cycles = mac_ops as f64 * CYCLES_PER_MAC as f64;
    let clock_hz = ESP32_CLOCK_MHZ as f64 * 1_000_000.0;
    let cycle_model_secs = estimated_cycles / clock_hz;

    let scaled_secs = x86_duration.as_secs_f64() * X86_TO_ESP32_SLOWDOWN;

    Duration::from_secs_f64(cycle_model_secs.max(scaled_secs))
}
|
|
|
|
fn benchmark_matmul_int8(c: &mut Criterion) {
|
|
let mut group = c.benchmark_group("INT8 MatMul");
|
|
group.warm_up_time(Duration::from_millis(500));
|
|
group.measurement_time(Duration::from_secs(3));
|
|
|
|
// Test different sizes typical for ESP32 models
|
|
for (out_dim, in_dim) in [(32, 32), (64, 64), (128, 64), (64, 128)] {
|
|
let weights: Vec<i8> = (0..out_dim * in_dim)
|
|
.map(|i| ((i * 17) % 256) as i8 - 128)
|
|
.collect();
|
|
let input: Vec<i8> = (0..in_dim)
|
|
.map(|i| ((i * 13) % 256) as i8 - 128)
|
|
.collect();
|
|
let mut output = vec![0i32; out_dim];
|
|
|
|
let params = QuantParams::default();
|
|
|
|
let mac_ops = (out_dim * in_dim) as u64;
|
|
|
|
group.bench_with_input(
|
|
BenchmarkId::new("size", format!("{}x{}", out_dim, in_dim)),
|
|
&(out_dim, in_dim),
|
|
|b, _| {
|
|
b.iter(|| {
|
|
matmul_int8(
|
|
black_box(&weights),
|
|
black_box(¶ms),
|
|
black_box(&input),
|
|
black_box(¶ms),
|
|
black_box(&mut output),
|
|
out_dim,
|
|
in_dim,
|
|
)
|
|
})
|
|
},
|
|
);
|
|
|
|
// Print ESP32 estimate
|
|
println!(
|
|
" {}x{}: {} MAC ops, estimated ESP32 time: {:.1} us",
|
|
out_dim, in_dim, mac_ops,
|
|
mac_ops as f64 * CYCLES_PER_MAC as f64 / ESP32_CLOCK_MHZ as f64
|
|
);
|
|
}
|
|
|
|
group.finish();
|
|
}
|
|
|
|
fn benchmark_attention(c: &mut Criterion) {
|
|
let mut group = c.benchmark_group("Micro Attention");
|
|
group.warm_up_time(Duration::from_millis(500));
|
|
group.measurement_time(Duration::from_secs(3));
|
|
|
|
for (embed_dim, num_heads, seq_len) in [(64, 4, 16), (64, 4, 32), (32, 2, 16)] {
|
|
let head_dim = embed_dim / num_heads;
|
|
let attn = MicroAttention::new(embed_dim, num_heads);
|
|
|
|
let query: Vec<i8> = (0..head_dim).map(|i| (i * 7 % 128) as i8).collect();
|
|
let keys: Vec<Vec<i8>> = (0..seq_len)
|
|
.map(|s| (0..head_dim).map(|i| ((i + s) * 11 % 128) as i8).collect())
|
|
.collect();
|
|
let key_refs: Vec<&[i8]> = keys.iter().map(|k| k.as_slice()).collect();
|
|
let mut scores = vec![0i32; seq_len];
|
|
|
|
group.bench_with_input(
|
|
BenchmarkId::new("config", format!("d{}_h{}_s{}", embed_dim, num_heads, seq_len)),
|
|
&seq_len,
|
|
|b, _| {
|
|
b.iter(|| {
|
|
attn.compute_scores(
|
|
black_box(&query),
|
|
black_box(&key_refs),
|
|
black_box(&mut scores),
|
|
)
|
|
})
|
|
},
|
|
);
|
|
}
|
|
|
|
group.finish();
|
|
}
|
|
|
|
fn benchmark_full_forward(c: &mut Criterion) {
|
|
let mut group = c.benchmark_group("Full Forward Pass");
|
|
group.warm_up_time(Duration::from_millis(1000));
|
|
group.measurement_time(Duration::from_secs(5));
|
|
|
|
// Test configurations for different ESP32 variants
|
|
let configs = [
|
|
("ESP32", ModelConfig {
|
|
vocab_size: 256,
|
|
embed_dim: 64,
|
|
hidden_dim: 128,
|
|
num_layers: 2,
|
|
num_heads: 4,
|
|
max_seq_len: 32,
|
|
quant_type: QuantizationType::Int8,
|
|
}),
|
|
("ESP32-S2", ModelConfig {
|
|
vocab_size: 128,
|
|
embed_dim: 32,
|
|
hidden_dim: 64,
|
|
num_layers: 1,
|
|
num_heads: 2,
|
|
max_seq_len: 16,
|
|
quant_type: QuantizationType::Int8,
|
|
}),
|
|
("ESP32-S3", ModelConfig {
|
|
vocab_size: 512,
|
|
embed_dim: 64,
|
|
hidden_dim: 128,
|
|
num_layers: 2,
|
|
num_heads: 4,
|
|
max_seq_len: 32,
|
|
quant_type: QuantizationType::Int8,
|
|
}),
|
|
];
|
|
|
|
for (variant, config) in configs {
|
|
let model = TinyModel::new(config.clone()).unwrap();
|
|
let mut engine = MicroEngine::new(model).unwrap();
|
|
|
|
let model_size = config.estimate_size();
|
|
|
|
group.bench_with_input(
|
|
BenchmarkId::new("variant", variant),
|
|
&variant,
|
|
|b, _| {
|
|
b.iter(|| {
|
|
engine.reset();
|
|
black_box(engine.forward_one(black_box(42)).unwrap())
|
|
})
|
|
},
|
|
);
|
|
|
|
println!(
|
|
" {}: model size {} KB, embed_dim {}, layers {}",
|
|
variant, model_size / 1024, config.embed_dim, config.num_layers
|
|
);
|
|
}
|
|
|
|
group.finish();
|
|
}
|
|
|
|
fn benchmark_generation(c: &mut Criterion) {
|
|
let mut group = c.benchmark_group("Token Generation");
|
|
group.warm_up_time(Duration::from_millis(1000));
|
|
group.measurement_time(Duration::from_secs(5));
|
|
group.sample_size(20); // Fewer samples for slower operation
|
|
|
|
let config = ModelConfig::for_variant(Esp32Variant::Esp32);
|
|
let model = TinyModel::new(config).unwrap();
|
|
let mut engine = MicroEngine::new(model).unwrap();
|
|
|
|
let prompt = [1u16, 2, 3, 4, 5];
|
|
let gen_config = InferenceConfig {
|
|
max_tokens: 10,
|
|
greedy: true,
|
|
..Default::default()
|
|
};
|
|
|
|
group.bench_function("generate_10_tokens", |b| {
|
|
b.iter(|| {
|
|
engine.reset();
|
|
black_box(engine.generate(black_box(&prompt), black_box(&gen_config)).unwrap())
|
|
})
|
|
});
|
|
|
|
group.finish();
|
|
}
|
|
|
|
fn benchmark_memory_constraints(c: &mut Criterion) {
|
|
let mut group = c.benchmark_group("Memory Validation");
|
|
|
|
// Validate that models fit within ESP32 memory constraints
|
|
for variant in [
|
|
Esp32Variant::Esp32,
|
|
Esp32Variant::Esp32S2,
|
|
Esp32Variant::Esp32S3,
|
|
Esp32Variant::Esp32C3,
|
|
Esp32Variant::Esp32C6,
|
|
] {
|
|
let config = ModelConfig::for_variant(variant);
|
|
let model = TinyModel::new(config.clone()).unwrap();
|
|
let engine = MicroEngine::new(model).unwrap();
|
|
|
|
let usage = engine.memory_usage();
|
|
let available = variant.max_model_ram();
|
|
|
|
println!(" {:?}:", variant);
|
|
println!(" Available RAM: {} KB", available / 1024);
|
|
println!(" Model weights: {} KB", usage.model_weights / 1024);
|
|
println!(" Activations: {} KB", usage.activation_buffers / 1024);
|
|
println!(" KV cache: {} KB", usage.kv_cache / 1024);
|
|
println!(" Total used: {} KB", usage.total / 1024);
|
|
println!(" Headroom: {} KB", (available - usage.total) / 1024);
|
|
println!();
|
|
|
|
assert!(
|
|
usage.total <= available,
|
|
"{:?} exceeds memory: {} > {}",
|
|
variant, usage.total, available
|
|
);
|
|
}
|
|
|
|
// Dummy benchmark to satisfy criterion
|
|
group.bench_function("memory_check", |b| {
|
|
b.iter(|| black_box(Esp32Variant::Esp32.max_model_ram()))
|
|
});
|
|
|
|
group.finish();
|
|
}
|
|
|
|
fn benchmark_quantization(c: &mut Criterion) {
|
|
let mut group = c.benchmark_group("Quantization");
|
|
group.warm_up_time(Duration::from_millis(500));
|
|
group.measurement_time(Duration::from_secs(3));
|
|
|
|
use ruvllm_esp32::quantized::QuantizedTensor;
|
|
|
|
// Test quantization of different sized tensors
|
|
for size in [256, 1024, 4096] {
|
|
let data: Vec<f32> = (0..size)
|
|
.map(|i| (i as f32 / size as f32) * 2.0 - 1.0)
|
|
.collect();
|
|
|
|
group.bench_with_input(
|
|
BenchmarkId::new("int8", size),
|
|
&size,
|
|
|b, _| {
|
|
b.iter(|| {
|
|
QuantizedTensor::<16384>::from_f32(
|
|
black_box(&data),
|
|
&[size],
|
|
QuantizationType::Int8,
|
|
).unwrap()
|
|
})
|
|
},
|
|
);
|
|
|
|
group.bench_with_input(
|
|
BenchmarkId::new("int4", size),
|
|
&size,
|
|
|b, _| {
|
|
b.iter(|| {
|
|
QuantizedTensor::<16384>::from_f32(
|
|
black_box(&data),
|
|
&[size],
|
|
QuantizationType::Int4,
|
|
).unwrap()
|
|
})
|
|
},
|
|
);
|
|
|
|
group.bench_with_input(
|
|
BenchmarkId::new("binary", size),
|
|
&size,
|
|
|b, _| {
|
|
b.iter(|| {
|
|
QuantizedTensor::<16384>::from_f32(
|
|
black_box(&data),
|
|
&[size],
|
|
QuantizationType::Binary,
|
|
).unwrap()
|
|
})
|
|
},
|
|
);
|
|
}
|
|
|
|
group.finish();
|
|
}
|
|
|
|
criterion_group!(
|
|
benches,
|
|
benchmark_matmul_int8,
|
|
benchmark_attention,
|
|
benchmark_full_forward,
|
|
benchmark_generation,
|
|
benchmark_memory_constraints,
|
|
benchmark_quantization,
|
|
);
|
|
|
|
criterion_main!(benches);
|