Files
wifi-densepose/examples/ruvLLM/esp32/tests/simulation_tests.rs
ruv d803bfe2b1 Squashed 'vendor/ruvector/' content from commit b64c2172
git-subtree-dir: vendor/ruvector
git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
2026-02-28 14:39:40 -05:00

385 lines
13 KiB
Rust

//! Simulation Tests for ESP32 RuvLLM
//!
//! These tests validate that the implementation will work correctly
//! on ESP32 hardware by simulating memory constraints and operations.
use std::time::Instant;
// Import the crate
use ruvllm_esp32::prelude::*;
use ruvllm_esp32::model::ModelConfig;
use ruvllm_esp32::quantized::{QuantizationType, QuantizedTensor, matmul_int8, binary_xnor_popcount, QuantParams};
use ruvllm_esp32::attention::{MicroAttention, LinearAttention, SlidingWindowAttention};
use ruvllm_esp32::embedding::{EmbeddingTable, RotaryEmbedding, SimpleTokenizer};
/// Checks that the default model for every supported ESP32 variant fits in RAM.
///
/// For each variant the test builds the variant-specific configuration,
/// validates it, constructs a model and an engine, prints the memory breakdown
/// the engine reports, and then asserts that the total footprint stays within
/// the variant's model RAM budget with at least 10 KB to spare.
#[test]
fn test_memory_constraints_all_variants() {
    println!("\n=== Memory Constraint Validation ===\n");

    let variants = [
        Esp32Variant::Esp32,
        Esp32Variant::Esp32S2,
        Esp32Variant::Esp32S3,
        Esp32Variant::Esp32C3,
        Esp32Variant::Esp32C6,
    ];

    for variant in variants {
        let config = ModelConfig::for_variant(variant);

        // A configuration generated for a variant must validate against it.
        assert!(config.validate(variant).is_ok(), "{:?} config validation failed", variant);

        let engine = MicroEngine::new(TinyModel::new(config.clone()).unwrap()).unwrap();
        let usage = engine.memory_usage();
        let budget = variant.max_model_ram();

        // Saturating so the report below can still be printed when the model
        // overflows the budget; the overflow itself is caught by the assertion
        // that follows the report.
        let headroom = budget.saturating_sub(usage.total);

        println!("{:?}:", variant);
        println!(
            " SRAM: {} KB, Max Model RAM: {} KB",
            variant.sram_bytes() / 1024,
            budget / 1024
        );
        println!(
            " Model: {} KB, Buffers: {} KB, KV: {} KB",
            usage.model_weights / 1024,
            usage.activation_buffers / 1024,
            usage.kv_cache / 1024
        );
        println!(
            " Total: {} KB, Headroom: {} KB\n",
            usage.total / 1024,
            headroom / 1024
        );

        assert!(
            usage.total <= budget,
            "{:?}: Memory overflow! {} > {} bytes",
            variant, usage.total, budget
        );

        // Keep at least 10 KB free for stack/runtime. Past the assertion above
        // `headroom` is exactly `budget - usage.total`.
        assert!(
            headroom >= 10 * 1024,
            "{:?}: Insufficient headroom: {} bytes",
            variant, headroom
        );
    }
}
/// Verifies `matmul_int8` against a hand-computed 3x3 matrix-vector product.
#[test]
fn test_int8_matmul_correctness() {
    // 3x3 weights, three consecutive values per output row; small enough to
    // verify by hand.
    let weights = [1i8, 2, 3, 4, 5, 6, 7, 8, 9];
    let input = [1i8, 2, 3];
    let params = QuantParams::default();

    let mut output = [0i32; 3];
    matmul_int8(&weights, &params, &input, &params, &mut output, 3, 3);

    // Expected values, one per output row:
    //   row 0: 1*1 + 2*2 + 3*3 = 14
    //   row 1: 4*1 + 5*2 + 6*3 = 32
    //   row 2: 7*1 + 8*2 + 9*3 = 50
    let expected = [14i32, 32, 50];
    for (actual, wanted) in output.iter().zip(expected.iter()) {
        assert_eq!(*actual, *wanted);
    }
}
/// Checks `binary_xnor_popcount` on identical and on fully inverted bit patterns.
#[test]
fn test_binary_xnor_correctness() {
    let pattern = [0b11110000u8, 0b10101010];
    let identical = [0b11110000u8, 0b10101010];
    // Bitwise complement of `pattern`, byte by byte.
    let inverted = [0b00001111u8, 0b01010101];

    // Identical inputs: all 16 bits agree, so the score is 16 * 2 - 16 = 16.
    let agreement = binary_xnor_popcount(&pattern, &identical);
    assert_eq!(agreement, 16);

    // Inverted inputs: XNOR yields zero set bits in both bytes, so the score
    // is 0 * 2 - 16 = -16.
    let disagreement = binary_xnor_popcount(&pattern, &inverted);
    assert_eq!(disagreement, -16);
}
/// Checks the compressed size reported for INT8, INT4 and binary quantization
/// of the same 1024-element `f32` input.
#[test]
fn test_quantization_compression() {
    // 1024 evenly spaced samples starting at -1.0 with a step of 1/512.
    let samples: Vec<f32> = (0..1024).map(|i| (i as f32 / 512.0) - 1.0).collect();

    // All three tensors are built from the same data and shape; only the
    // quantization type differs.
    let quantize = |kind: QuantizationType| -> QuantizedTensor<2048> {
        QuantizedTensor::from_f32(&samples, &[1024], kind).unwrap()
    };
    let as_int8 = quantize(QuantizationType::Int8);
    let as_int4 = quantize(QuantizationType::Int4);
    let as_binary = quantize(QuantizationType::Binary);

    println!("\nQuantization compression:");
    println!(
        " INT8: {} bytes, {:.1}% savings",
        as_int8.compressed_size(),
        as_int8.memory_savings() * 100.0
    );
    println!(
        " INT4: {} bytes, {:.1}% savings",
        as_int4.compressed_size(),
        as_int4.memory_savings() * 100.0
    );
    println!(
        " Binary: {} bytes, {:.1}% savings",
        as_binary.compressed_size(),
        as_binary.memory_savings() * 100.0
    );

    // Expected storage for 1024 values: 8, 4 and 1 bit(s) per value.
    assert_eq!(as_int8.compressed_size(), 1024);
    assert_eq!(as_int4.compressed_size(), 512);
    assert_eq!(as_binary.compressed_size(), 128);
}
/// Exercises `MicroAttention`: score ordering for two keys and the sum of the
/// fixed-point softmax output.
#[test]
fn test_attention_mechanisms() {
    let attn = MicroAttention::new(64, 4);

    let query = [32i8; 16];
    let matching_key = [32i8; 16];
    let weaker_key = [16i8; 16];
    let keys: [&[i8]; 2] = [&matching_key, &weaker_key];

    let mut scores = [0i32; 2];
    attn.compute_scores(&query, &keys, &mut scores);

    // The key equal to the query must score higher than the smaller-valued one.
    let (first, second) = (scores[0], scores[1]);
    assert!(first > second, "scores[0]={} should be > scores[1]={}", first, second);

    // After the in-place softmax the weights should add up to roughly 256
    // (presumably the fixed-point representation of 1.0 - confirm against
    // `softmax_fixed`); a deviation below 20 is tolerated.
    attn.softmax_fixed(&mut scores);
    let sum: i32 = scores.iter().sum();
    let deviation = (sum - 256).abs();
    assert!(deviation < 20, "Softmax sum {} should be ~256", sum);
}
/// Runs `LinearAttention::forward` on a single key/value pair and checks that
/// it writes something other than zeros into the output buffer.
#[test]
fn test_linear_attention() {
    let attn = LinearAttention::new(16);

    let query = [10i8; 16];
    let key = [10i8; 16];
    let value = [5i8; 16];
    let keys: [&[i8]; 1] = [&key];
    let values: [&[i8]; 1] = [&value];

    let mut output = [0i32; 16];
    attn.forward(&query, &keys, &values, &mut output);

    // The buffer starts zeroed, so an all-zero result means nothing was produced.
    let all_zero = output.iter().all(|&x| x == 0);
    assert!(!all_zero, "Linear attention output should be non-zero");
}
/// Checks `EmbeddingTable` lookups: a plain lookup returns non-zero data, and
/// accumulating the same row twice yields exactly twice that row.
#[test]
fn test_embedding_operations() {
    let table: EmbeddingTable<256, 64> = EmbeddingTable::random(256, 64, 42).unwrap();

    // Plain lookup of token 42.
    let mut row = [0i8; 64];
    table.lookup(42, &mut row).unwrap();
    assert!(row.iter().any(|&v| v != 0));

    // Accumulate the same token twice into a zeroed i32 buffer.
    let mut accumulated = [0i32; 64];
    for _ in 0..2 {
        table.lookup_add(42, &mut accumulated).unwrap();
    }

    // Every accumulated element must be double the single-lookup element.
    for (&total, &single) in accumulated.iter().zip(row.iter()) {
        assert_eq!(total, 2 * single as i32);
    }
}
/// Applies `RotaryEmbedding` at several positions and checks that the input is
/// changed at every non-zero position.
#[test]
fn test_rotary_embeddings() {
    let mut rope = RotaryEmbedding::new(32, 10000);

    for pos in [0, 5, 10, 20] {
        rope.update_cache(pos);

        let original = [64i8; 32];
        let mut rotated = original;
        rope.apply(&mut rotated, pos);

        // Position 0 is exempt: the values are allowed to stay unchanged there.
        if pos == 0 {
            continue;
        }
        assert!(rotated != original, "RoPE should modify values at position {}", pos);
    }
}
/// Round-trips a short ASCII string through `SimpleTokenizer::ascii()`.
#[test]
fn test_tokenizer() {
    let tokenizer = SimpleTokenizer::ascii();
    let text = "Hello World!";

    // Encoding: one token per character, the first being the code of 'H'.
    let tokens = tokenizer.encode(text);
    assert_eq!(tokens.len(), 12);
    assert_eq!(tokens[0], u16::from(b'H'));

    // Decoding must reproduce the original bytes.
    let round_trip = tokenizer.decode(&tokens);
    assert_eq!(&round_trip[..], b"Hello World!");
}
/// Drives the engine end to end: one single-token forward pass followed by a
/// short greedy generation from a five-token prompt.
#[test]
fn test_full_inference_pipeline() {
    let config = ModelConfig::for_variant(Esp32Variant::Esp32);
    let mut engine = MicroEngine::new(TinyModel::new(config).unwrap()).unwrap();

    // A single forward step must return a token id below 256.
    let next_token = engine.forward_one(10).unwrap();
    assert!(next_token < 256);

    // Reset the engine before starting generation.
    engine.reset();
    let prompt = [1u16, 2, 3, 4, 5];
    let gen_config = InferenceConfig {
        max_tokens: 5,
        greedy: true,
        ..Default::default()
    };
    let result = engine.generate(&prompt, &gen_config).unwrap();

    // At least one token, and never more than `max_tokens`.
    let generated = result.tokens.as_slice();
    assert!(!generated.is_empty());
    assert!(generated.len() <= 5);

    println!("\nGeneration test:");
    println!(" Prompt: {:?}", prompt);
    println!(" Generated: {:?}", generated);
    println!(" Peak memory: {} KB", result.peak_memory_bytes / 1024);
}
/// Checks the byte header produced by `TinyModel::to_bytes` for the default
/// configuration: it must be at least 32 bytes long and begin with the magic
/// bytes `RUVM`.
#[test]
fn test_model_serialization() {
    let config = ModelConfig::default();
    let model = TinyModel::new(config).unwrap();
    let header = model.to_bytes();

    // Check the length before slicing: a truncated header then fails with this
    // diagnostic instead of an out-of-range slice panic on the magic check.
    assert!(
        header.len() >= 32,
        "serialized header is {} bytes, expected at least 32",
        header.len()
    );
    assert_eq!(&header[0..4], b"RUVM", "serialized header must start with the RUVM magic");
}
/// Estimates ESP32 per-token latency from a MAC count and compares it with a
/// measured host run.
///
/// The estimate is purely analytical (MACs per forward pass times an assumed
/// cycle cost at 240 MHz). The host measurement is printed for reference only;
/// the assertions at the end apply to the analytical estimate.
#[test]
fn test_performance_simulation() {
    println!("\n=== Performance Simulation ===\n");

    // ESP32 core clock in MHz.
    const CLOCK_MHZ: f64 = 240.0;
    // Assumed cycle cost of one INT8 multiply-accumulate.
    const MAC_CYCLES: f64 = 4.0;
    // Number of forward passes timed on the host.
    const HOST_RUNS: u32 = 100;

    let config = ModelConfig::for_variant(Esp32Variant::Esp32);
    let embed_dim = config.embed_dim;
    let hidden_dim = config.hidden_dim;
    let num_layers = config.num_layers;
    let num_heads = config.num_heads;

    // MACs per layer:
    //   QKV projection: 3 * embed_dim * embed_dim
    //   attention:      seq_len * head_dim * num_heads (simplified, seq_len = 32)
    //   FFN:            3 * embed_dim * hidden_dim
    let qkv_macs = 3 * embed_dim * embed_dim;
    let attn_macs = 32 * (embed_dim / num_heads) * num_heads;
    let ffn_macs = 3 * embed_dim * hidden_dim;
    let total_macs = (qkv_macs + attn_macs + ffn_macs) * num_layers;

    // Cycles divided by MHz gives microseconds.
    let cycles = total_macs as f64 * MAC_CYCLES;
    let estimated_us = cycles / CLOCK_MHZ;
    let estimated_tokens_per_sec = 1_000_000.0 / estimated_us;

    println!("Model configuration:");
    println!(" Embed dim: {}", embed_dim);
    println!(" Hidden dim: {}", hidden_dim);
    println!(" Layers: {}", num_layers);
    println!(" Heads: {}", num_heads);
    println!();
    println!("Operations per forward pass:");
    println!(" QKV projections: {} MACs", qkv_macs * num_layers);
    println!(" Attention: {} MACs", attn_macs * num_layers);
    println!(" FFN: {} MACs", ffn_macs * num_layers);
    println!(" Total: {} MACs ({:.2}M)", total_macs, total_macs as f64 / 1_000_000.0);
    println!();
    println!("Estimated ESP32 performance:");
    println!(" Cycles: {:.0}", cycles);
    println!(" Time per token: {:.1} us ({:.2} ms)", estimated_us, estimated_us / 1000.0);
    println!(" Tokens per second: {:.1}", estimated_tokens_per_sec);

    // Host-side measurement: reset plus one forward pass, repeated HOST_RUNS times.
    let mut engine = MicroEngine::new(TinyModel::new(config).unwrap()).unwrap();
    let timer = Instant::now();
    for _ in 0..HOST_RUNS {
        engine.reset();
        let _ = engine.forward_one(42).unwrap();
    }
    let host_us_per_token = timer.elapsed().as_micros() as f64 / f64::from(HOST_RUNS);

    println!();
    println!("Host (x86) performance:");
    println!(" Time per token: {:.1} us", host_us_per_token);
    println!(" ESP32/Host ratio: {:.1}x slower", estimated_us / host_us_per_token);

    // Sanity bounds on the analytical estimate.
    assert!(estimated_tokens_per_sec > 10.0, "Should achieve >10 tokens/sec on ESP32");
    assert!(estimated_us < 100_000.0, "Should be <100ms per token");
}
/// Exercises `MicroEngine::generate` with boundary-sized prompts.
///
/// Covered cases:
/// - an empty prompt, which must succeed;
/// - a single-token prompt, which must succeed;
/// - a prompt of `max_seq_len` tokens, which may either fail with an error or
///   succeed; when it succeeds it must not produce more than `max_tokens`
///   tokens.
#[test]
fn test_edge_cases() {
    let config = ModelConfig::for_variant(Esp32Variant::Esp32);
    let model = TinyModel::new(config.clone()).unwrap();
    let mut engine = MicroEngine::new(model).unwrap();

    // Empty prompt.
    let result = engine.generate(&[], &InferenceConfig::default());
    assert!(result.is_ok(), "empty prompt should be accepted");

    // Single-token prompt.
    engine.reset();
    let result = engine.generate(&[1], &InferenceConfig::default());
    assert!(result.is_ok(), "single-token prompt should be accepted");

    // Prompt that is exactly `max_seq_len` tokens long.
    // NOTE(review): the `as u16` cast assumes max_seq_len fits in u16 - confirm
    // against ModelConfig.
    engine.reset();
    let long_prompt: Vec<u16> = (0..config.max_seq_len as u16).collect();
    let single_step = InferenceConfig { max_tokens: 1, ..Default::default() };

    // Either outcome is acceptable (an error or a truncated run), as long as
    // the call returns instead of panicking. A successful run must still
    // honour the one-token limit.
    if let Ok(generated) = engine.generate(&long_prompt, &single_step) {
        assert!(
            generated.tokens.len() <= 1,
            "max_tokens=1 produced {} tokens",
            generated.tokens.len()
        );
    }
}
/// Builds two engines from the same configuration and checks that greedy
/// generation with the same seed and prompt yields identical tokens.
#[test]
fn test_determinism() {
    // Esp32S2 is used here because it is the smallest variant, which keeps the
    // test clear of stack overflows.
    let config = ModelConfig::for_variant(Esp32Variant::Esp32S2);

    // Two models from identical configurations, each boxed so it is held on
    // the heap.
    let first_model = Box::new(TinyModel::new(config.clone()).unwrap());
    let second_model = Box::new(TinyModel::new(config.clone()).unwrap());

    let mut first_engine = Box::new(MicroEngine::new(*first_model).unwrap());
    let mut second_engine = Box::new(MicroEngine::new(*second_model).unwrap());

    let gen_config = InferenceConfig {
        max_tokens: 3,
        greedy: true,
        seed: 42,
        ..Default::default()
    };

    // Identical prompt and settings for both engines.
    let prompt = [1u16, 2, 3];
    let first_run = first_engine.generate(&prompt, &gen_config).unwrap();
    let second_run = second_engine.generate(&prompt, &gen_config).unwrap();

    assert_eq!(first_run.tokens.as_slice(), second_run.tokens.as_slice());
}