#![allow(
    clippy::all,
    unused_imports,
    unused_variables,
    dead_code,
    unused_mut,
    unused_assignments,
    non_camel_case_types,
    clippy::approx_constant,
    unexpected_cfgs,
    unused_must_use,
    unused_parens
)]
//! RuvLTRA-Small Model Tests
//!
//! This module provides comprehensive tests for the RuvLTRA-Small inference engine,
//! validating model loading, quantization accuracy, SONA integration, and ANE dispatch.
//!
//! ## Test Categories
//!
//! - **Model Loading**: Validate GGUF/SafeTensors loading and configuration
//! - **Quantization**: Test dequantization accuracy across all quantization formats
//! - **SONA Integration**: Test Self-Optimizing Neural Architecture adaptation
//! - **ANE Dispatch**: Test Apple Neural Engine routing and fallback behavior
//!
//! ## Running Tests
//!
//! ```bash
//! # Run all RuvLTRA tests
//! cargo test --package ruvllm ruvltra_tests
//!
//! # Run with ANE support (Apple Silicon only)
//! cargo test --package ruvllm --features coreml ruvltra_tests
//!
//! # Run with full feature set
//! cargo test --package ruvllm --all-features ruvltra_tests
//! ```

use ruvllm::backends::{
    AneCapabilities, ComputeUnits, ModelArchitecture, ModelConfig, Quantization,
};
use ruvllm::gguf::quantization::{dequantize_tensor, GgufQuantType, QuantizedTensor};
use ruvllm::kernels::ane_ops::{
    get_ane_recommendation, is_ane_available, should_use_ane, should_use_ane_activation,
    should_use_ane_matmul,
};
use std::time::{Duration, Instant};

// ============================================================================
// Test Fixtures and Constants
// ============================================================================

/// RuvLTRA-Small model configuration for testing.
///
/// Shared by the memory-estimation and output-validation tests so that every
/// size calculation is derived from one set of dimensions.
const RUVLTRA_SMALL_CONFIG: RuvLtraTestConfig = RuvLtraTestConfig {
    vocab_size: 32000,
    hidden_size: 2048,
    intermediate_size: 5504,
    num_hidden_layers: 22,
    num_attention_heads: 32,
    num_key_value_heads: 8,
    max_position_embeddings: 8192,
    rope_theta: 10000.0,
    layer_norm_eps: 1e-5,
};

/// Test configuration for RuvLTRA-Small.
///
/// A plain value type local to this test file; it only carries the model
/// dimensions the tests compute with.
#[derive(Debug, Clone, Copy)]
#[allow(dead_code)] // Not every field is read by a test.
struct RuvLtraTestConfig {
    /// Number of entries in the token vocabulary.
    vocab_size: usize,
    /// Width of the hidden state.
    hidden_size: usize,
    /// Width of the feed-forward intermediate projection.
    intermediate_size: usize,
    /// Number of transformer layers.
    num_hidden_layers: usize,
    /// Number of attention (query) heads.
    num_attention_heads: usize,
    /// Number of key/value heads.
    num_key_value_heads: usize,
    /// Maximum sequence length used for KV-cache sizing.
    max_position_embeddings: usize,
    /// Frequency base for rotary position embeddings.
    rope_theta: f32,
    /// Epsilon used by layer normalization.
    layer_norm_eps: f32,
}

/// Memory bounds for validation (in bytes).
const MEMORY_BOUNDS: MemoryBounds = MemoryBounds {
    // Q4_K quantization: ~1.2GB for small model
    max_model_memory: 1_500_000_000,
    // KV cache for 8K context
    max_kv_cache_memory: 500_000_000,
    // Working memory for inference
    max_working_memory: 200_000_000,
};

/// Upper limits, in bytes, that the memory-management tests assert against.
#[derive(Debug, Clone, Copy)]
struct MemoryBounds {
    /// Limit for the model weights.
    max_model_memory: usize,
    /// Limit for the key/value cache.
    max_kv_cache_memory: usize,
    /// Limit for transient activations.
    max_working_memory: usize,
}

/// Test tolerance levels
const EPSILON: f32 = 1e-4;
const LOOSE_EPSILON: f32 = 0.01;
const QUANTIZATION_EPSILON: f32 = 0.1; // Higher tolerance for quantized values

// ============================================================================
// Model Loading Tests
// ============================================================================

/// Tests that build `ModelConfig` values and check that the fields read back
/// as written. No model file is opened by any test in this module.
mod model_loading {
    use super::*;

    /// A fully specified config keeps every field it was given.
    #[test]
    fn test_model_config_creation() {
        let config = ModelConfig {
            architecture: ModelArchitecture::Llama,
            quantization: Some(Quantization::Q4K),
            max_sequence_length: 8192,
            vocab_size: Some(RUVLTRA_SMALL_CONFIG.vocab_size),
            use_flash_attention: true,
            ..Default::default()
        };

        assert_eq!(config.architecture, ModelArchitecture::Llama);
        assert_eq!(config.quantization, Some(Quantization::Q4K));
        assert_eq!(config.max_sequence_length, 8192);
        assert_eq!(config.vocab_size, Some(RUVLTRA_SMALL_CONFIG.vocab_size));
        assert!(config.use_flash_attention);
    }

    /// Every listed architecture variant can be stored in a config and debug-printed.
    #[test]
    fn test_model_architecture_variants() {
        let architectures = [
            ModelArchitecture::Llama,
            ModelArchitecture::Mistral,
            ModelArchitecture::Phi,
            ModelArchitecture::Qwen,
        ];

        for arch in architectures {
            let config = ModelConfig {
                architecture: arch,
                quantization: Some(Quantization::Q4K),
                max_sequence_length: 4096,
                vocab_size: Some(32000),
                use_flash_attention: false,
                ..Default::default()
            };

            assert_eq!(config.architecture, arch);

            // Verify architecture can be formatted/debugged
            let _ = format!("{:?}", arch);
        }
    }

    /// Every quantization variant round-trips through a config and has a
    /// recognizable `Debug` name.
    #[test]
    fn test_quantization_format_selection() {
        // (variant, expected substring of the Debug name, nominal bits per weight)
        let quantizations = [
            (Quantization::None, "None", 32.0),
            (Quantization::F16, "F16", 16.0),
            (Quantization::Bf16, "Bf16", 16.0),
            (Quantization::Q8, "Q8", 8.0),
            (Quantization::Q4K, "Q4K", 4.5),
            (Quantization::Q4, "Q4", 4.0),
            (Quantization::Q2K, "Q2K", 2.56),
        ];

        for (quant, name, _expected_bits) in quantizations {
            let config = ModelConfig {
                architecture: ModelArchitecture::Llama,
                quantization: Some(quant),
                max_sequence_length: 4096,
                vocab_size: Some(32000),
                use_flash_attention: false,
                ..Default::default()
            };

            // Verify quantization is set correctly
            assert_eq!(config.quantization, Some(quant));

            // Verify name format. The previous check was
            // `contains(name) || !is_empty()`, which can never fail; only the
            // meaningful half is kept.
            // NOTE(review): assumes `Quantization` derives `Debug` so the
            // output is the variant identifier - confirm.
            let quant_name = format!("{:?}", quant);
            assert!(
                quant_name.contains(name),
                "Quantization {:?} should have recognizable name",
                quant
            );
        }
    }

    /// The default config has a usable (non-zero) sequence length.
    #[test]
    fn test_model_config_default_values() {
        let config = ModelConfig::default();

        // Verify sensible defaults
        assert!(config.max_sequence_length > 0);
        // vocab_size is now Option, so check it's present or use default behavior
    }

    /// Querying metadata for a path that does not exist yields an error.
    #[test]
    fn test_invalid_model_path_error() {
        // This test validates error handling for non-existent paths
        let result = std::fs::metadata("/nonexistent/path/to/model.gguf");
        assert!(result.is_err(), "Non-existent path should fail");
    }

    /// The case-insensitive `gguf` suffix check accepts GGUF extensions and
    /// rejects other model formats.
    #[test]
    fn test_gguf_extension_validation() {
        let valid_extensions = [".gguf", ".GGUF"];
        let invalid_extensions = [".bin", ".safetensors", ".pt", ".pth"];

        for ext in valid_extensions {
            assert!(
                ext.to_lowercase().ends_with("gguf"),
                "Extension {} should be valid GGUF",
                ext
            );
        }

        for ext in invalid_extensions {
            assert!(
                !ext.to_lowercase().ends_with("gguf"),
                "Extension {} should not be GGUF",
                ext
            );
        }
    }

    /// An explicit `rope_theta` is stored unchanged.
    #[test]
    fn test_rope_theta_configuration() {
        // Test rope theta configuration
        let config_with_theta = ModelConfig {
            architecture: ModelArchitecture::Llama,
            quantization: Some(Quantization::Q4K),
            max_sequence_length: 4096,
            vocab_size: Some(32000),
            rope_theta: Some(10000.0),
            use_flash_attention: false,
            ..Default::default()
        };

        assert_eq!(config_with_theta.rope_theta, Some(10000.0));

        // Rope theta is the frequency base for rotary position embeddings
        // The actual implementation depends on the model architecture
    }

    /// A range of power-of-two context lengths can be stored in a config.
    #[test]
    fn test_context_length_bounds() {
        let context_lengths = [512, 1024, 2048, 4096, 8192, 16384, 32768];

        for ctx_len in context_lengths {
            let config = ModelConfig {
                architecture: ModelArchitecture::Llama,
                quantization: Some(Quantization::Q4K),
                max_sequence_length: ctx_len,
                vocab_size: Some(32000),
                use_flash_attention: false,
                ..Default::default()
            };

            assert_eq!(config.max_sequence_length, ctx_len);
            assert!(ctx_len > 0, "Context length must be positive");
        }
    }
}

// ============================================================================
// Quantization Accuracy Tests
// ============================================================================

/// Tests for `GgufQuantType` metadata and for `dequantize_tensor` on
/// hand-built blocks.
mod quantization_accuracy {
    use super::*;

    /// Test Q4_0 dequantization accuracy
    #[test]
    fn test_q4_0_dequantization_accuracy() {
        // Create test Q4_0 block: scale + packed 4-bit values
        let mut block = vec![0u8; 18];

        // Set scale = 0.5 (f16: 0x3800), little-endian
        block[0] = 0x00;
        block[1] = 0x38;

        // Pack values: (8 - offset) gives 0, (9 - offset) gives 1, etc.
        // Q4_0 uses offset of 8
        let low = 8u8; // Will become 0 after offset
        let high = 9u8; // Will become 1 after offset
        for packed in block[2..18].iter_mut() {
            *packed = low | (high << 4);
        }

        let dtype = GgufQuantType::Q4_0;

        // Verify block size
        assert_eq!(dtype.block_size(), 32);
        assert_eq!(dtype.type_size(), 18);

        // Dequantize
        let result = dequantize_tensor(&block, dtype, 32);
        assert!(result.is_ok(), "Dequantization should succeed");

        let output = result.unwrap();

        // Verify pattern: alternating 0.0, 0.5
        // NOTE(review): this expects the low and high nibble of each byte to
        // land on adjacent output indices - confirm against the
        // dequantizer's element ordering.
        for (i, value) in output.iter().take(32).enumerate() {
            if i % 2 == 0 {
                assert!(
                    value.abs() < QUANTIZATION_EPSILON,
                    "Even index {} should be ~0.0, got {}",
                    i,
                    value
                );
            } else {
                assert!(
                    (value - 0.5).abs() < QUANTIZATION_EPSILON,
                    "Odd index {} should be ~0.5, got {}",
                    i,
                    value
                );
            }
        }
    }

    /// Test Q8_0 dequantization accuracy
    #[test]
    fn test_q8_0_dequantization_accuracy() {
        // Create test Q8_0 block: scale (2 bytes) + 32 int8 values
        let mut block = vec![0u8; 34];

        // Set scale = 1.0 (f16: 0x3C00)
        block[0] = 0x00;
        block[1] = 0x3C;

        // Set values 1, 2, 3, ..., 32 as signed int8
        for (i, byte) in block[2..34].iter_mut().enumerate() {
            *byte = (i + 1) as u8;
        }

        let result = dequantize_tensor(&block, GgufQuantType::Q8_0, 32);
        assert!(result.is_ok());

        let output = result.unwrap();

        // Verify: values should be 1.0, 2.0, ..., 32.0
        for (i, value) in output.iter().take(32).enumerate() {
            let expected = (i + 1) as f32;
            assert!(
                (value - expected).abs() < EPSILON,
                "Index {}: expected {}, got {}",
                i,
                expected,
                value
            );
        }
    }

    /// Test Q4_K dequantization (most common format)
    ///
    /// Only the type metadata is checked here; no block is dequantized.
    #[test]
    fn test_q4_k_dequantization_accuracy() {
        let dtype = GgufQuantType::Q4_K;

        // Verify Q4_K properties
        assert_eq!(dtype.block_size(), 256);
        assert_eq!(dtype.type_size(), 144);
        assert!(dtype.is_quantized());

        let bits = dtype.bits_per_weight();
        assert!((bits - 4.5).abs() < 0.1, "Q4_K should be ~4.5 bits/weight");
    }

    /// Test all quantization types have valid properties
    #[test]
    fn test_all_quant_types_valid() {
        let quant_types = [
            GgufQuantType::F32,
            GgufQuantType::F16,
            GgufQuantType::Q8_0,
            GgufQuantType::Q4_0,
            GgufQuantType::Q4_1,
            GgufQuantType::Q5_0,
            GgufQuantType::Q5_1,
            GgufQuantType::Q2_K,
            GgufQuantType::Q3_K,
            GgufQuantType::Q4_K,
            GgufQuantType::Q5_K,
            GgufQuantType::Q6_K,
        ];

        for dtype in quant_types {
            // Block size must be positive
            assert!(
                dtype.block_size() > 0,
                "{:?} must have positive block size",
                dtype
            );

            // Type size must be positive
            assert!(
                dtype.type_size() > 0,
                "{:?} must have positive type size",
                dtype
            );

            // Bits per weight should be in reasonable range (1-32)
            let bits = dtype.bits_per_weight();
            assert!(
                bits >= 1.0 && bits <= 32.0,
                "{:?} bits/weight {} out of range",
                dtype,
                bits
            );

            // Name should be non-empty
            assert!(
                !dtype.name().is_empty(),
                "{:?} must have non-empty name",
                dtype
            );
        }
    }

    /// Test tensor size calculation
    #[test]
    fn test_tensor_size_calculation() {
        // F32: 256 elements = 256 * 4 = 1024 bytes
        assert_eq!(GgufQuantType::F32.tensor_size(256), 1024);

        // F16: 256 elements = 256 * 2 = 512 bytes
        assert_eq!(GgufQuantType::F16.tensor_size(256), 512);

        // Q4_0: 256 elements = 8 blocks * 18 bytes = 144 bytes
        assert_eq!(GgufQuantType::Q4_0.tensor_size(256), 144);

        // Q4_K: 256 elements = 1 block * 144 bytes = 144 bytes
        assert_eq!(GgufQuantType::Q4_K.tensor_size(256), 144);
    }

    /// Test quantized vs non-quantized detection
    #[test]
    fn test_is_quantized() {
        // Non-quantized types
        assert!(!GgufQuantType::F32.is_quantized());
        assert!(!GgufQuantType::F16.is_quantized());
        assert!(!GgufQuantType::Bf16.is_quantized());

        // Quantized types
        assert!(GgufQuantType::Q4_0.is_quantized());
        assert!(GgufQuantType::Q8_0.is_quantized());
        assert!(GgufQuantType::Q4_K.is_quantized());
        assert!(GgufQuantType::Q2_K.is_quantized());
    }

    /// Test QuantizedTensor container
    #[test]
    fn test_quantized_tensor_container() {
        let tensor = QuantizedTensor {
            data: vec![0u8; 144], // One Q4_K block
            dtype: GgufQuantType::Q4_K,
            shape: vec![256],
            num_elements: 256,
        };

        assert_eq!(tensor.block_count(), 1);
        assert!(tensor.dtype.is_quantized());
        assert_eq!(tensor.shape, vec![256]);
    }

    /// Test dequantization roundtrip sanity
    #[test]
    fn test_dequantization_finite_values() {
        // Create valid Q4_0 quantized data
        // Q4_0 format: 2 bytes scale (f16) + 16 bytes packed 4-bit values = 18 bytes per block
        // Each block represents 32 elements
        const BLOCK_BYTES: usize = 18;
        let mut data = vec![0u8; BLOCK_BYTES * 8]; // 8 Q4_0 blocks = 256 elements

        for block in data.chunks_exact_mut(BLOCK_BYTES) {
            // Set a valid f16 scale: 0x3C00 = 1.0
            block[0] = 0x00; // Low byte of f16 scale
            block[1] = 0x3C; // High byte: 0x3C00 = 1.0

            // Fill packed 4-bit values with valid patterns (0-15)
            for (i, packed) in block[2..].iter_mut().enumerate() {
                let low_nibble = (i % 16) as u8;
                let high_nibble = ((i + 1) % 16) as u8;
                *packed = low_nibble | (high_nibble << 4);
            }
        }

        let result = dequantize_tensor(&data, GgufQuantType::Q4_0, 256);
        assert!(result.is_ok());

        let output = result.unwrap();

        // All values should be finite
        for (i, val) in output.iter().enumerate() {
            assert!(
                val.is_finite(),
                "Value at index {} should be finite, got {}",
                i,
                val
            );
        }
    }

    /// Test quantization type conversion from u32
    #[test]
    fn test_quant_type_try_from() {
        // Valid conversions
        assert_eq!(GgufQuantType::try_from(0).unwrap(), GgufQuantType::F32);
        assert_eq!(GgufQuantType::try_from(1).unwrap(), GgufQuantType::F16);
        assert_eq!(GgufQuantType::try_from(8).unwrap(), GgufQuantType::Q8_0);
        assert_eq!(GgufQuantType::try_from(12).unwrap(), GgufQuantType::Q4_K);

        // Invalid conversion
        assert!(GgufQuantType::try_from(100).is_err());
        assert!(GgufQuantType::try_from(255).is_err());
    }
}
// ============================================================================
// SONA Integration Tests
// ============================================================================

/// Tests around SONA (Self-Optimizing Neural Architecture) adaptation.
///
/// These tests simulate the adaptation logic with local stand-in types; the
/// only crate functions called are the ANE routing helpers.
mod sona_integration {
    use super::*;

    /// SONA configuration for testing
    #[derive(Debug, Clone)]
    struct SonaTestConfig {
        learning_rate: f32,
        momentum: f32,
        adaptation_threshold: f32,
        max_adaptations_per_step: usize,
    }

    impl Default for SonaTestConfig {
        fn default() -> Self {
            Self {
                learning_rate: 0.001,
                momentum: 0.9,
                adaptation_threshold: 0.05,
                max_adaptations_per_step: 3,
            }
        }
    }

    /// The default test config has hyper-parameters inside their valid ranges.
    #[test]
    fn test_sona_config_defaults() {
        let config = SonaTestConfig::default();

        assert!(
            config.learning_rate > 0.0 && config.learning_rate < 1.0,
            "Learning rate should be in (0, 1)"
        );
        assert!(
            config.momentum >= 0.0 && config.momentum < 1.0,
            "Momentum should be in [0, 1)"
        );
        assert!(
            config.adaptation_threshold > 0.0,
            "Adaptation threshold must be positive"
        );
        assert!(
            config.max_adaptations_per_step > 0,
            "Max adaptations must be positive"
        );
    }

    /// A simulated 1000-element gradient step completes in under 1ms.
    #[test]
    fn test_sona_adaptation_timing() {
        // SONA adaptation should be fast (<0.05ms target)
        let start = Instant::now();

        // Simulate SONA adaptation calculation
        let mut weights = vec![0.5f32; 1000];
        let gradients = vec![0.01f32; 1000];

        // Simple gradient update (simulating SONA)
        for (w, g) in weights.iter_mut().zip(gradients.iter()) {
            *w -= 0.001 * g;
        }

        let duration = start.elapsed();

        // Should be very fast; the asserted bound is deliberately looser than
        // the target above.
        assert!(
            duration < Duration::from_millis(1),
            "SONA adaptation took {:?}, expected <1ms",
            duration
        );
    }

    /// A local routing policy always selects some backend with confidence > 0.5.
    #[test]
    fn test_sona_routing_decision() {
        // Test routing decision logic
        struct RoutingDecision {
            use_ane: bool,
            use_neon: bool,
            confidence: f32,
        }

        /// Picks ANE when it is available and recommended, otherwise NEON.
        fn make_routing_decision(batch_size: usize, dim: usize) -> RoutingDecision {
            let ane_available = is_ane_available();

            if ane_available && should_use_ane(batch_size, dim) {
                RoutingDecision {
                    use_ane: true,
                    use_neon: false,
                    confidence: 0.9,
                }
            } else {
                RoutingDecision {
                    use_ane: false,
                    use_neon: true,
                    confidence: 0.95,
                }
            }
        }

        // Small dimensions: NEON preferred
        let decision = make_routing_decision(1, 32);
        assert!(
            decision.use_neon || decision.use_ane,
            "Must use some compute backend"
        );

        // Large batch with aligned dims: ANE may be preferred on Apple Silicon
        let decision = make_routing_decision(32, 256);
        assert!(decision.confidence > 0.5);
    }

    /// Stored patterns carry a score in [0, 1] and a non-empty config label.
    #[test]
    fn test_sona_pattern_learning() {
        // Simulate SONA pattern storage
        #[derive(Debug)]
        #[allow(dead_code)]
        struct SonaPattern {
            input_hash: u64,
            optimal_config: String,
            performance_score: f32,
        }

        let patterns = vec![
            SonaPattern {
                input_hash: 12345,
                optimal_config: "ANE+NEON".to_string(),
                performance_score: 0.95,
            },
            SonaPattern {
                input_hash: 67890,
                optimal_config: "NEON-only".to_string(),
                performance_score: 0.88,
            },
        ];

        for pattern in &patterns {
            assert!(pattern.performance_score >= 0.0 && pattern.performance_score <= 1.0);
            assert!(!pattern.optimal_config.is_empty());
        }
    }

    /// Collects simulated timings and computes warm-up vs. steady-state variance.
    ///
    /// Nothing is asserted about the variances; the test only checks that the
    /// computation runs.
    #[test]
    fn test_sona_warmup_iterations() {
        // SONA typically needs a few iterations to warm up
        const WARMUP_ITERATIONS: usize = 3;

        let mut metrics = Vec::new();

        for i in 0..10 {
            // Simulate inference timing
            let start = Instant::now();
            std::thread::sleep(Duration::from_micros(100 + i as u64 * 10));
            let duration = start.elapsed();
            metrics.push(duration);
        }

        // Post-warmup iterations should be more stable
        let warmup_variance = calculate_variance(&metrics[..WARMUP_ITERATIONS]);
        let stable_variance = calculate_variance(&metrics[WARMUP_ITERATIONS..]);

        // Note: This is a simplified test; in real scenarios,
        // stable variance should typically be lower
        let _ = (warmup_variance, stable_variance);
    }

    /// Returns the population variance, in seconds squared, of `durations`.
    ///
    /// An empty slice yields `0.0` rather than dividing by zero.
    fn calculate_variance(durations: &[Duration]) -> f64 {
        if durations.is_empty() {
            return 0.0;
        }

        let count = durations.len() as f64;
        // The turbofish is required: `sum` cannot infer its output type here.
        let mean = durations.iter().map(|d| d.as_secs_f64()).sum::<f64>() / count;

        durations
            .iter()
            .map(|d| (d.as_secs_f64() - mean).powi(2))
            .sum::<f64>()
            / count
    }

    /// The EWC settings used for testing are positive and use enough samples.
    #[test]
    fn test_sona_ewc_consolidation() {
        // Test EWC++ (Elastic Weight Consolidation) behavior
        // This prevents catastrophic forgetting in SONA

        struct EwcConfig {
            lambda: f32, // Importance weight
            fisher_samples: usize,
        }

        let config = EwcConfig {
            lambda: 1000.0,
            fisher_samples: 100,
        };

        // Lambda should be positive for weight importance
        assert!(config.lambda > 0.0);

        // Need enough samples for Fisher information
        assert!(config.fisher_samples >= 10);
    }
}

// ============================================================================
// ANE Dispatch Tests
// ============================================================================

/// Tests for the ANE (Apple Neural Engine) detection and routing helpers.
///
/// Most tests only require that the helpers return without panicking, since
/// the routing outcome depends on the host hardware.
mod ane_dispatch {
    use super::*;

    /// Repeated availability queries return the same answer.
    #[test]
    fn test_ane_availability_detection() {
        // Should not panic
        let available = is_ane_available();

        // Result should be consistent
        assert_eq!(is_ane_available(), available);
        assert_eq!(is_ane_available(), available);
    }

    /// Detected capabilities are populated on Apple Silicon and zeroed when
    /// the ANE is reported unavailable elsewhere.
    #[test]
    fn test_ane_capabilities_detection() {
        let caps = AneCapabilities::detect();

        #[cfg(all(target_os = "macos", target_arch = "aarch64"))]
        {
            // On Apple Silicon, ANE should be available
            assert!(caps.available, "ANE should be available on Apple Silicon");
            assert!(caps.tops > 0.0, "TOPS should be positive");
            assert!(
                caps.max_model_size_mb > 0,
                "Max model size should be positive"
            );
            assert!(!caps.supported_ops.is_empty(), "Should have supported ops");
        }

        #[cfg(not(all(target_os = "macos", target_arch = "aarch64")))]
        {
            // On non-Apple Silicon, ANE may not be available
            if !caps.available {
                assert_eq!(caps.tops, 0.0);
                assert_eq!(caps.max_model_size_mb, 0);
            }
        }
    }

    /// `should_use_ane` handles a spread of batch/dimension pairs without panicking.
    #[test]
    fn test_ane_routing_thresholds() {
        // Test various dimension combinations
        let test_cases = [
            // (batch, dim, description)
            (1, 64, "minimum ANE dimensions"),
            (1, 128, "small aligned tensor"),
            (32, 256, "typical LLM dimensions"),
            (64, 4096, "large batch with large dim"),
            (1, 32, "below minimum dim"),
            (100, 128, "above max batch"),
        ];

        for (batch, dim, desc) in test_cases {
            let should_use = should_use_ane(batch, dim);
            // Just verify no panic
            let _ = (should_use, desc);
        }
    }

    /// Matmul recommendations report a confidence in [0, 1] and a speedup in (0, 10).
    #[test]
    fn test_ane_matmul_routing() {
        let test_cases = [
            // (m, k, n, description)
            (1, 64, 64, "small square matmul"),
            (32, 256, 128, "medium matmul"),
            (1, 4096, 4096, "large matmul"),
            (64, 512, 512, "optimal ANE size"),
            (1, 8192, 8192, "very large matmul"),
        ];

        for (m, k, n, desc) in test_cases {
            let _should_use = should_use_ane_matmul(m, k, n);
            let recommendation = get_ane_recommendation(m, k, n);

            // Recommendation should be consistent
            assert!(
                recommendation.confidence >= 0.0 && recommendation.confidence <= 1.0,
                "Confidence for {} should be in [0, 1]",
                desc
            );

            // Expected speedup should be reasonable
            assert!(
                recommendation.expected_speedup > 0.0 && recommendation.expected_speedup < 10.0,
                "Speedup for {} should be reasonable",
                desc
            );
        }
    }

    /// `should_use_ane_activation` handles small through very large tensors
    /// without panicking.
    #[test]
    fn test_ane_activation_routing() {
        let test_cases = [
            (1, 64),
            (32, 256),
            (64, 4096),
            (100, 128),   // Above typical ANE batch limit
            (1, 1000000), // Very large tensor
        ];

        for (batch, dim) in test_cases {
            let should_use = should_use_ane_activation(batch, dim);
            // Just verify no panic and reasonable result
            let _ = should_use;
        }
    }

    /// A recommendation has valid fields and supports `Clone` and `Debug`.
    #[test]
    fn test_ane_recommendation_structure() {
        let rec = get_ane_recommendation(1, 256, 256);

        // All fields should be valid
        assert!(rec.confidence >= 0.0 && rec.confidence <= 1.0);
        assert!(!rec.reason.is_empty());
        assert!(rec.expected_speedup > 0.0);

        // Test Clone
        let cloned = rec.clone();
        assert_eq!(rec.use_ane, cloned.use_ane);
        assert_eq!(rec.confidence, cloned.confidence);

        // Test Debug
        let debug = format!("{:?}", rec);
        assert!(debug.contains("use_ane"));
    }

    /// Every compute-unit variant answers the usage queries and has a description.
    #[test]
    fn test_compute_units_configuration() {
        let units = [
            ComputeUnits::CpuOnly,
            ComputeUnits::CpuAndGpu,
            ComputeUnits::CpuAndNeuralEngine,
            ComputeUnits::All,
        ];

        for unit in units {
            // Test ANE usage flag
            let _uses_ane = unit.uses_ane();
            let _uses_gpu = unit.uses_gpu();

            // At least CPU should always be used
            // (implied by all compute unit configurations)

            // Test description
            let desc = unit.description();
            assert!(!desc.is_empty());
        }
    }

    /// Sanity-checks the fixture lists of 16-aligned and unaligned dimensions.
    #[test]
    fn test_ane_dimension_alignment() {
        // ANE prefers 16-aligned dimensions
        let aligned_dims = [16, 32, 64, 128, 256, 512, 1024, 2048, 4096];
        let unaligned_dims = [17, 33, 65, 100, 255, 1000];

        for dim in aligned_dims {
            assert_eq!(dim % 16, 0, "{} should be 16-aligned", dim);
        }

        for dim in unaligned_dims {
            assert_ne!(dim % 16, 0, "{} should not be 16-aligned", dim);
        }
    }

    /// All three routing helpers accept the same shapes without panicking.
    #[test]
    fn test_ane_no_dispatch_errors() {
        // Simulate dispatch to verify no errors occur
        let test_tensors = [(1, 64), (32, 256), (64, 4096)];

        for (batch, dim) in test_tensors {
            // These should never panic
            let _ = should_use_ane(batch, dim);
            let _ = should_use_ane_activation(batch, dim);
            let _ = should_use_ane_matmul(batch, dim, dim);
        }
    }

    /// A scalar SiLU over a small buffer produces only finite values.
    #[test]
    fn test_fallback_behavior() {
        // Test that fallback to NEON works when ANE is unavailable
        let mut data = vec![1.0f32; 64];

        // This should work regardless of ANE availability
        // by falling back to scalar/NEON implementation
        for v in data.iter_mut() {
            *v = *v / (1.0 + (-*v).exp()); // SiLU
        }

        // All values should be valid
        assert!(data.iter().all(|v| v.is_finite()));
    }
}

// ============================================================================
// Memory Management Tests
// ============================================================================

/// Tests that size estimates derived from `RUVLTRA_SMALL_CONFIG` stay within
/// `MEMORY_BOUNDS`.
mod memory_management {
    use super::*;

    /// Each bound is positive and their sum stays below 8GB.
    #[test]
    fn test_memory_bounds_validation() {
        // Verify memory bounds are reasonable
        assert!(MEMORY_BOUNDS.max_model_memory > 0);
        assert!(MEMORY_BOUNDS.max_kv_cache_memory > 0);
        assert!(MEMORY_BOUNDS.max_working_memory > 0);

        // Total should be reasonable for device
        let total = MEMORY_BOUNDS.max_model_memory
            + MEMORY_BOUNDS.max_kv_cache_memory
            + MEMORY_BOUNDS.max_working_memory;

        // Should fit in 8GB device memory
        assert!(total < 8_000_000_000, "Total memory {} exceeds 8GB", total);
    }

    /// The Q4_K embedding table is less than a quarter the size of its F32 form.
    #[test]
    fn test_tensor_memory_estimation() {
        // Estimate memory for RuvLTRA-Small tensors
        let hidden_size = RUVLTRA_SMALL_CONFIG.hidden_size;
        let vocab_size = RUVLTRA_SMALL_CONFIG.vocab_size;

        // Embedding: vocab_size * hidden_size * bytes_per_element
        let embedding_size_f32 = vocab_size * hidden_size * 4;
        let embedding_size_q4k = GgufQuantType::Q4_K.tensor_size(vocab_size * hidden_size);

        // Q4_K should be much smaller
        assert!(
            embedding_size_q4k < embedding_size_f32 / 4,
            "Q4_K should be at least 4x smaller than F32"
        );
    }

    /// The f16 KV cache at full context length fits inside the KV-cache bound.
    #[test]
    fn test_kv_cache_sizing() {
        let hidden_size = RUVLTRA_SMALL_CONFIG.hidden_size;
        let num_layers = RUVLTRA_SMALL_CONFIG.num_hidden_layers;
        let num_kv_heads = RUVLTRA_SMALL_CONFIG.num_key_value_heads;
        let max_seq_len = RUVLTRA_SMALL_CONFIG.max_position_embeddings;

        let head_dim = hidden_size / RUVLTRA_SMALL_CONFIG.num_attention_heads;

        // KV cache per layer: 2 * seq_len * num_kv_heads * head_dim * sizeof(f16)
        let kv_per_layer = 2 * max_seq_len * num_kv_heads * head_dim * 2;
        let total_kv_cache = kv_per_layer * num_layers;

        assert!(
            total_kv_cache < MEMORY_BOUNDS.max_kv_cache_memory,
            "KV cache {} exceeds bound {}",
            total_kv_cache,
            MEMORY_BOUNDS.max_kv_cache_memory
        );
    }

    /// F32 activations for one 1024-token sequence fit inside the working-memory bound.
    #[test]
    fn test_working_memory_allocation() {
        // Simulate working memory allocation
        let batch_size = 1;
        let seq_len = 1024;
        let hidden_size = RUVLTRA_SMALL_CONFIG.hidden_size;

        // Activations: batch * seq * hidden * sizeof(f32)
        let activation_memory = batch_size * seq_len * hidden_size * 4;

        // Should fit in working memory
        assert!(activation_memory < MEMORY_BOUNDS.max_working_memory);
    }
}

// ============================================================================
// Output Validation Tests
// ============================================================================

/// Tests over simulated model outputs (logits, probabilities, tokens,
/// attention weights). No model is run.
mod output_validation {
    use super::*;

    /// A synthetic logits vector of vocabulary size contains only finite values.
    #[test]
    fn test_logits_finite() {
        // Simulated logits output
        let logits: Vec<f32> = (0..RUVLTRA_SMALL_CONFIG.vocab_size)
            .map(|i| (i as f32) * 0.001 - 16.0)
            .collect();

        // All logits should be finite
        for (i, logit) in logits.iter().enumerate() {
            assert!(
                logit.is_finite(),
                "Logit at index {} should be finite, got {}",
                i,
                logit
            );
        }
    }

    /// A hand-rolled softmax yields values in [0, 1] that sum to 1.
    #[test]
    fn test_softmax_probabilities() {
        // Simulated softmax output
        let mut probs = vec![0.1f32; 10];

        // Apply softmax normalization; subtracting the max keeps `exp` stable.
        let max_val = probs.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
        let mut sum = 0.0;
        for p in probs.iter_mut() {
            *p = (*p - max_val).exp();
            sum += *p;
        }
        for p in probs.iter_mut() {
            *p /= sum;
        }

        // Probabilities should sum to 1.0
        let prob_sum: f32 = probs.iter().sum();
        assert!(
            (prob_sum - 1.0).abs() < EPSILON,
            "Probabilities should sum to 1.0, got {}",
            prob_sum
        );

        // All probabilities should be in [0, 1]
        for (i, p) in probs.iter().enumerate() {
            assert!(
                *p >= 0.0 && *p <= 1.0,
                "Probability at {} should be in [0, 1], got {}",
                i,
                p
            );
        }
    }

    /// Sample token ids are inside the vocabulary and not all identical.
    #[test]
    fn test_token_generation_coherence() {
        // Test that token sequences have reasonable patterns
        let sample_tokens: Vec<u32> = vec![1, 234, 567, 89, 1234, 5678];

        // All tokens should be valid (within vocab range)
        for token in &sample_tokens {
            assert!(
                *token < RUVLTRA_SMALL_CONFIG.vocab_size as u32,
                "Token {} exceeds vocab size",
                token
            );
        }

        // No repeated padding tokens at start (unless intentional)
        // This is a basic coherence check
        let has_varied_tokens = sample_tokens.windows(2).any(|w| w[0] != w[1]);
        assert!(
            has_varied_tokens || sample_tokens.len() <= 1,
            "Token sequence should have variety"
        );
    }

    /// Uniform causal attention rows each sum to approximately 1.
    #[test]
    fn test_attention_weights_valid() {
        let seq_len = 32;

        // Simulated attention weights (should sum to 1 per row after softmax)
        let mut attention = vec![0.0f32; seq_len * seq_len];

        // Initialize with causal mask pattern: row i attends uniformly to 0..=i
        for i in 0..seq_len {
            for j in 0..=i {
                attention[i * seq_len + j] = 1.0 / (i + 1) as f32;
            }
        }

        // Verify row sums are approximately 1.0
        for (i, row) in attention.chunks_exact(seq_len).enumerate() {
            let row_sum: f32 = row.iter().sum();
            assert!(
                (row_sum - 1.0).abs() < LOOSE_EPSILON,
                "Attention row {} should sum to 1.0, got {}",
                i,
                row_sum
            );
        }
    }
}

// ============================================================================
// Performance Validation Tests
// ============================================================================

/// Coarse timing checks over simulated workloads. No model is run.
mod performance_validation {
    use super::*;

    /// A 4096-element element-wise pass finishes in under 10ms.
    #[test]
    fn test_inference_timing_reasonable() {
        // Basic timing test for operations
        let start = Instant::now();

        // Simulate a basic forward pass calculation
        let data: Vec<f32> = (0..4096).map(|i| i as f32 * 0.001).collect();
        let mut output = vec![0.0f32; 4096];

        for (i, (o, d)) in output.iter_mut().zip(data.iter()).enumerate() {
            *o = *d * (i as f32 % 10.0 + 1.0);
        }

        let duration = start.elapsed();

        // Basic operations should be very fast
        assert!(
            duration < Duration::from_millis(10),
            "Basic ops took {:?}",
            duration
        );
    }

    /// Records timings for increasing batch sizes; nothing is asserted about them.
    #[test]
    fn test_batch_processing_scaling() {
        let batch_sizes = [1, 2, 4, 8, 16, 32];
        let dim = 256;

        let mut timings = Vec::new();

        for batch_size in batch_sizes {
            let start = Instant::now();

            // Simulate batch processing
            let data = vec![1.0f32; batch_size * dim];
            let _: f32 = data.iter().sum();

            timings.push((batch_size, start.elapsed()));
        }

        // Larger batches should take more time (linear or better scaling)
        // This is a sanity check that batch size affects timing
        let _ = timings;
    }

    /// Sum-of-squares throughput exceeds one million elements per second.
    #[test]
    #[ignore] // Run with: cargo test --release -- --ignored
    fn test_throughput_benchmark() {
        let iterations = 100;
        let dim = 4096;

        let data: Vec<f32> = (0..dim).map(|i| i as f32 * 0.001).collect();

        let start = Instant::now();

        for _ in 0..iterations {
            let _: f32 = data.iter().map(|x| x * x).sum();
        }

        let duration = start.elapsed();
        let ops_per_second = (iterations * dim) as f64 / duration.as_secs_f64();

        println!("Throughput: {:.2e} ops/sec", ops_per_second);

        // Should achieve reasonable throughput
        assert!(
            ops_per_second > 1_000_000.0,
            "Throughput {:.2e} below minimum",
            ops_per_second
        );
    }
}

// ============================================================================
// Thread Safety Tests
// ============================================================================

/// Tests that call the detection, dequantization, and routing helpers from
/// four threads at once and require every thread to finish without panicking.
mod thread_safety {
    use super::*;
    use std::thread;

    /// ANE detection can be called repeatedly from several threads.
    #[test]
    fn test_ane_detection_thread_safe() {
        let handles: Vec<_> = (0..4)
            .map(|_| {
                thread::spawn(|| {
                    for _ in 0..100 {
                        let _ = is_ane_available();
                        let _ = AneCapabilities::detect();
                    }
                })
            })
            .collect();

        for handle in handles {
            handle.join().expect("Thread should complete");
        }
    }

    /// Each thread dequantizes its own Q4_0 block and gets finite output.
    #[test]
    fn test_quantization_thread_safe() {
        let handles: Vec<_> = (0..4)
            .map(|i| {
                thread::spawn(move || {
                    // One Q4_0 block: f16 scale 1.0 (0x3C00) + 16 packed bytes
                    let mut data = vec![0u8; 18];
                    data[0] = 0x00;
                    data[1] = 0x3C;
                    // Vary the payload per thread so the inputs differ.
                    for j in 2..18 {
                        data[j] = ((i + j) % 256) as u8;
                    }

                    let result = dequantize_tensor(&data, GgufQuantType::Q4_0, 32);
                    assert!(result.is_ok());

                    let output = result.unwrap();
                    assert!(output.iter().all(|v| v.is_finite()));
                })
            })
            .collect();

        for handle in handles {
            handle.join().expect("Thread should complete");
        }
    }

    /// Routing helpers can be queried concurrently with varying shapes.
    #[test]
    fn test_concurrent_routing_decisions() {
        let handles: Vec<_> = (0..4)
            .map(|i| {
                thread::spawn(move || {
                    for j in 0..100 {
                        // Derive a batch in 1..=64 and a dim in 64..4160 from (i, j).
                        let batch = (i + 1) * (j + 1) % 64 + 1;
                        let dim = ((i + j) * 16 + 64) % 4096 + 64;

                        let _ = should_use_ane(batch, dim);
                        let _ = should_use_ane_matmul(batch, dim, dim);
                    }
                })
            })
            .collect();

        for handle in handles {
            handle.join().expect("Thread should complete");
        }
    }
}