//! Simulation Tests for ESP32 RuvLLM //! //! These tests validate that the implementation will work correctly //! on ESP32 hardware by simulating memory constraints and operations. use std::time::Instant; // Import the crate use ruvllm_esp32::prelude::*; use ruvllm_esp32::model::ModelConfig; use ruvllm_esp32::quantized::{QuantizationType, QuantizedTensor, matmul_int8, binary_xnor_popcount, QuantParams}; use ruvllm_esp32::attention::{MicroAttention, LinearAttention, SlidingWindowAttention}; use ruvllm_esp32::embedding::{EmbeddingTable, RotaryEmbedding, SimpleTokenizer}; /// Validate memory fits within ESP32 constraints #[test] fn test_memory_constraints_all_variants() { println!("\n=== Memory Constraint Validation ===\n"); for variant in [ Esp32Variant::Esp32, Esp32Variant::Esp32S2, Esp32Variant::Esp32S3, Esp32Variant::Esp32C3, Esp32Variant::Esp32C6, ] { let config = ModelConfig::for_variant(variant); // Validate config is correct for variant assert!(config.validate(variant).is_ok(), "{:?} config validation failed", variant); let model = TinyModel::new(config.clone()).unwrap(); let engine = MicroEngine::new(model).unwrap(); let usage = engine.memory_usage(); let available = variant.max_model_ram(); println!("{:?}:", variant); println!(" SRAM: {} KB, Max Model RAM: {} KB", variant.sram_bytes() / 1024, available / 1024); println!(" Model: {} KB, Buffers: {} KB, KV: {} KB", usage.model_weights / 1024, usage.activation_buffers / 1024, usage.kv_cache / 1024 ); println!(" Total: {} KB, Headroom: {} KB\n", usage.total / 1024, (available.saturating_sub(usage.total)) / 1024 ); assert!( usage.total <= available, "{:?}: Memory overflow! {} > {} bytes", variant, usage.total, available ); // Ensure at least 10KB headroom for stack/runtime assert!( available - usage.total >= 10 * 1024, "{:?}: Insufficient headroom: {} bytes", variant, available - usage.total ); } } /// Test INT8 matmul correctness #[test] fn test_int8_matmul_correctness() { // Small matrix for verification let weights = [1i8, 2, 3, 4, 5, 6, 7, 8, 9]; // 3x3 let input = [1i8, 2, 3]; let mut output = [0i32; 3]; let params = QuantParams::default(); matmul_int8(&weights, ¶ms, &input, ¶ms, &mut output, 3, 3); // Manual calculation: // output[0] = 1*1 + 2*2 + 3*3 = 14 // output[1] = 4*1 + 5*2 + 6*3 = 32 // output[2] = 7*1 + 8*2 + 9*3 = 50 assert_eq!(output[0], 14); assert_eq!(output[1], 32); assert_eq!(output[2], 50); } /// Test binary XNOR popcount #[test] fn test_binary_xnor_correctness() { let a = [0b11110000u8, 0b10101010]; let b = [0b11110000u8, 0b10101010]; // Perfect match: all 16 bits same -> popcount = 16 // Result = 16 * 2 - 16 = 16 let result = binary_xnor_popcount(&a, &b); assert_eq!(result, 16); // Complete mismatch let c = [0b00001111u8, 0b01010101]; let result2 = binary_xnor_popcount(&a, &c); // XNOR of 0b11110000 and 0b00001111 = 0b00000000 -> 0 bits // XNOR of 0b10101010 and 0b01010101 = 0b00000000 -> 0 bits // Result = 0 * 2 - 16 = -16 assert_eq!(result2, -16); } /// Test quantization compression ratios #[test] fn test_quantization_compression() { let data: Vec = (0..1024).map(|i| (i as f32 / 512.0) - 1.0).collect(); let int8: QuantizedTensor<2048> = QuantizedTensor::from_f32(&data, &[1024], QuantizationType::Int8).unwrap(); let int4: QuantizedTensor<2048> = QuantizedTensor::from_f32(&data, &[1024], QuantizationType::Int4).unwrap(); let binary: QuantizedTensor<2048> = QuantizedTensor::from_f32(&data, &[1024], QuantizationType::Binary).unwrap(); println!("\nQuantization compression:"); println!(" INT8: {} bytes, {:.1}% savings", int8.compressed_size(), int8.memory_savings() * 100.0); println!(" INT4: {} bytes, {:.1}% savings", int4.compressed_size(), int4.memory_savings() * 100.0); println!(" Binary: {} bytes, {:.1}% savings", binary.compressed_size(), binary.memory_savings() * 100.0); // Verify compression assert_eq!(int8.compressed_size(), 1024); // 1 byte per value assert_eq!(int4.compressed_size(), 512); // 0.5 bytes per value assert_eq!(binary.compressed_size(), 128); // 0.125 bytes per value } /// Test attention mechanisms #[test] fn test_attention_mechanisms() { // Micro attention let attn = MicroAttention::new(64, 4); let query = [32i8; 16]; let key1 = [32i8; 16]; let key2 = [16i8; 16]; let keys: [&[i8]; 2] = [&key1, &key2]; let mut scores = [0i32; 2]; attn.compute_scores(&query, &keys, &mut scores); // First key should have higher score (more similar) assert!(scores[0] > scores[1], "scores[0]={} should be > scores[1]={}", scores[0], scores[1]); // Softmax should normalize attn.softmax_fixed(&mut scores); let sum: i32 = scores.iter().sum(); assert!((sum - 256).abs() < 20, "Softmax sum {} should be ~256", sum); } /// Test linear attention #[test] fn test_linear_attention() { let attn = LinearAttention::new(16); let query = [10i8; 16]; let key = [10i8; 16]; let value = [5i8; 16]; let keys: [&[i8]; 1] = [&key]; let values: [&[i8]; 1] = [&value]; let mut output = [0i32; 16]; attn.forward(&query, &keys, &values, &mut output); // Output should be non-zero assert!(output.iter().any(|&x| x != 0), "Linear attention output should be non-zero"); } /// Test embedding operations #[test] fn test_embedding_operations() { let embed: EmbeddingTable<256, 64> = EmbeddingTable::random(256, 64, 42).unwrap(); let mut output = [0i8; 64]; embed.lookup(42, &mut output).unwrap(); // Should have non-zero values assert!(output.iter().any(|&x| x != 0)); // Test accumulation let mut accum = [0i32; 64]; embed.lookup_add(42, &mut accum).unwrap(); embed.lookup_add(42, &mut accum).unwrap(); // Should be 2x the single lookup for i in 0..64 { assert_eq!(accum[i], 2 * output[i] as i32); } } /// Test rotary embeddings #[test] fn test_rotary_embeddings() { let mut rope = RotaryEmbedding::new(32, 10000); // Test different positions for pos in [0, 5, 10, 20] { rope.update_cache(pos); let mut x = [64i8; 32]; let original = x; rope.apply(&mut x, pos); // Values should change (except possibly at position 0) if pos > 0 { assert!(x != original, "RoPE should modify values at position {}", pos); } } } /// Test tokenizer #[test] fn test_tokenizer() { let tokenizer = SimpleTokenizer::ascii(); // Test encoding let tokens = tokenizer.encode("Hello World!"); assert_eq!(tokens.len(), 12); assert_eq!(tokens[0], 'H' as u16); // Test decoding let decoded = tokenizer.decode(&tokens); assert_eq!(&decoded[..], b"Hello World!"); } /// Test full inference pipeline #[test] fn test_full_inference_pipeline() { let config = ModelConfig::for_variant(Esp32Variant::Esp32); let model = TinyModel::new(config).unwrap(); let mut engine = MicroEngine::new(model).unwrap(); // Single token forward pass let next_token = engine.forward_one(10).unwrap(); assert!(next_token < 256); // Full generation engine.reset(); let prompt = [1u16, 2, 3, 4, 5]; let gen_config = InferenceConfig { max_tokens: 5, greedy: true, ..Default::default() }; let result = engine.generate(&prompt, &gen_config).unwrap(); assert!(!result.tokens.is_empty()); assert!(result.tokens.len() <= 5); println!("\nGeneration test:"); println!(" Prompt: {:?}", prompt); println!(" Generated: {:?}", result.tokens.as_slice()); println!(" Peak memory: {} KB", result.peak_memory_bytes / 1024); } /// Test model serialization #[test] fn test_model_serialization() { let config = ModelConfig::default(); let model = TinyModel::new(config).unwrap(); let header = model.to_bytes(); assert_eq!(&header[0..4], b"RUVM"); assert!(header.len() >= 32); } /// Performance simulation test #[test] fn test_performance_simulation() { println!("\n=== Performance Simulation ===\n"); // ESP32 runs at 240MHz const ESP32_CLOCK_MHZ: f64 = 240.0; // Estimated cycles per INT8 MAC operation const CYCLES_PER_MAC: f64 = 4.0; let config = ModelConfig::for_variant(Esp32Variant::Esp32); // Count operations per forward pass let embed_dim = config.embed_dim; let hidden_dim = config.hidden_dim; let num_layers = config.num_layers; let num_heads = config.num_heads; // Per layer: // - QKV projection: 3 * embed_dim * embed_dim MACs // - Attention: seq_len * head_dim * num_heads MACs (simplified) // - FFN: 3 * embed_dim * hidden_dim MACs let qkv_macs = 3 * embed_dim * embed_dim; let attn_macs = 32 * (embed_dim / num_heads) * num_heads; // Assuming seq_len=32 let ffn_macs = 3 * embed_dim * hidden_dim; let layer_macs = qkv_macs + attn_macs + ffn_macs; let total_macs = layer_macs * num_layers; // Estimate time let cycles = total_macs as f64 * CYCLES_PER_MAC; let estimated_us = cycles / ESP32_CLOCK_MHZ; let estimated_tokens_per_sec = 1_000_000.0 / estimated_us; println!("Model configuration:"); println!(" Embed dim: {}", embed_dim); println!(" Hidden dim: {}", hidden_dim); println!(" Layers: {}", num_layers); println!(" Heads: {}", num_heads); println!(); println!("Operations per forward pass:"); println!(" QKV projections: {} MACs", qkv_macs * num_layers); println!(" Attention: {} MACs", attn_macs * num_layers); println!(" FFN: {} MACs", ffn_macs * num_layers); println!(" Total: {} MACs ({:.2}M)", total_macs, total_macs as f64 / 1_000_000.0); println!(); println!("Estimated ESP32 performance:"); println!(" Cycles: {:.0}", cycles); println!(" Time per token: {:.1} us ({:.2} ms)", estimated_us, estimated_us / 1000.0); println!(" Tokens per second: {:.1}", estimated_tokens_per_sec); // Actual benchmark on host let model = TinyModel::new(config).unwrap(); let mut engine = MicroEngine::new(model).unwrap(); let start = Instant::now(); for _ in 0..100 { engine.reset(); let _ = engine.forward_one(42).unwrap(); } let elapsed = start.elapsed(); let host_us_per_token = elapsed.as_micros() as f64 / 100.0; println!(); println!("Host (x86) performance:"); println!(" Time per token: {:.1} us", host_us_per_token); println!(" ESP32/Host ratio: {:.1}x slower", estimated_us / host_us_per_token); // Validate reasonable performance assert!(estimated_tokens_per_sec > 10.0, "Should achieve >10 tokens/sec on ESP32"); assert!(estimated_us < 100_000.0, "Should be <100ms per token"); } /// Test edge cases #[test] fn test_edge_cases() { let config = ModelConfig::for_variant(Esp32Variant::Esp32); let model = TinyModel::new(config.clone()).unwrap(); let mut engine = MicroEngine::new(model).unwrap(); // Empty prompt let result = engine.generate(&[], &InferenceConfig::default()); assert!(result.is_ok()); // Single token prompt engine.reset(); let result = engine.generate(&[1], &InferenceConfig::default()); assert!(result.is_ok()); // Max sequence length engine.reset(); let long_prompt: Vec = (0..config.max_seq_len as u16).collect(); let result = engine.generate(&long_prompt, &InferenceConfig { max_tokens: 1, ..Default::default() }); // Should handle gracefully (may error or truncate) } /// Test determinism #[test] fn test_determinism() { // Use smallest variant to avoid stack overflow in tests let config = ModelConfig::for_variant(Esp32Variant::Esp32S2); // Same seed should produce same model - use Box for heap allocation let model1 = Box::new(TinyModel::new(config.clone()).unwrap()); let model2 = Box::new(TinyModel::new(config.clone()).unwrap()); // Same input should produce same output let mut engine1 = Box::new(MicroEngine::new(*model1).unwrap()); let mut engine2 = Box::new(MicroEngine::new(*model2).unwrap()); let gen_config = InferenceConfig { max_tokens: 3, greedy: true, seed: 42, ..Default::default() }; let result1 = engine1.generate(&[1, 2, 3], &gen_config).unwrap(); let result2 = engine2.generate(&[1, 2, 3], &gen_config).unwrap(); assert_eq!(result1.tokens.as_slice(), result2.tokens.as_slice()); }