// NOTE(review): the lines below were a file-listing header accidentally pasted
// into the source (extraction artifact). Kept as a comment so the file remains
// valid Rust:
//   wifi-densepose/vendor/ruvector/crates/ruvllm/tests/ruvltra_tests.rs
//   1258 lines, 38 KiB, Rust
#![allow(
clippy::all,
unused_imports,
unused_variables,
dead_code,
unused_mut,
unused_assignments,
non_camel_case_types,
clippy::approx_constant,
unexpected_cfgs,
unused_must_use,
unused_parens
)]
//! RuvLTRA-Small Model Tests
//!
//! This module provides comprehensive tests for the RuvLTRA-Small inference engine,
//! validating model loading, quantization accuracy, SONA integration, and ANE dispatch.
//!
//! ## Test Categories
//!
//! - **Model Loading**: Validate GGUF/SafeTensors loading and configuration
//! - **Quantization**: Test dequantization accuracy across all quantization formats
//! - **SONA Integration**: Test Self-Optimizing Neural Architecture adaptation
//! - **ANE Dispatch**: Test Apple Neural Engine routing and fallback behavior
//!
//! ## Running Tests
//!
//! ```bash
//! # Run all RuvLTRA tests
//! cargo test --package ruvllm ruvltra_tests
//!
//! # Run with ANE support (Apple Silicon only)
//! cargo test --package ruvllm --features coreml ruvltra_tests
//!
//! # Run with full feature set
//! cargo test --package ruvllm --all-features ruvltra_tests
//! ```
use ruvllm::backends::{
AneCapabilities, ComputeUnits, ModelArchitecture, ModelConfig, Quantization,
};
use ruvllm::gguf::quantization::{dequantize_tensor, GgufQuantType, QuantizedTensor};
use ruvllm::kernels::ane_ops::{
get_ane_recommendation, is_ane_available, should_use_ane, should_use_ane_activation,
should_use_ane_matmul,
};
use std::time::{Duration, Instant};
// ============================================================================
// Test Fixtures and Constants
// ============================================================================
/// RuvLTRA-Small model configuration for testing
///
/// Llama-style decoder shape: 22 layers, hidden dim 2048, grouped-query
/// attention (32 query heads sharing 8 KV heads) and an 8K context window.
const RUVLTRA_SMALL_CONFIG: RuvLtraTestConfig = RuvLtraTestConfig {
    vocab_size: 32000,
    hidden_size: 2048,
    intermediate_size: 5504,
    num_hidden_layers: 22,
    num_attention_heads: 32,
    // GQA: 8 KV heads shared across the 32 query heads.
    num_key_value_heads: 8,
    max_position_embeddings: 8192,
    // RoPE frequency base.
    rope_theta: 10000.0,
    layer_norm_eps: 1e-5,
};
/// Test configuration for RuvLTRA-Small
#[derive(Debug, Clone, Copy)]
#[allow(dead_code)]
struct RuvLtraTestConfig {
    // Number of entries in the token vocabulary.
    vocab_size: usize,
    // Model (embedding) dimension.
    hidden_size: usize,
    // FFN inner dimension.
    intermediate_size: usize,
    // Transformer layer count.
    num_hidden_layers: usize,
    // Query heads.
    num_attention_heads: usize,
    // KV heads (< num_attention_heads implies grouped-query attention).
    num_key_value_heads: usize,
    // Maximum supported context length.
    max_position_embeddings: usize,
    // RoPE frequency base.
    rope_theta: f32,
    // Epsilon used by layer normalization.
    layer_norm_eps: f32,
}
/// Memory bounds for validation (in bytes)
///
/// Upper limits the memory-management tests assert against; chosen so the
/// total budget fits a small (8GB) device.
const MEMORY_BOUNDS: MemoryBounds = MemoryBounds {
    // Q4_K quantization: ~1.2GB for small model
    max_model_memory: 1_500_000_000,
    // KV cache for 8K context
    max_kv_cache_memory: 500_000_000,
    // Working memory for inference
    max_working_memory: 200_000_000,
};
/// Per-component memory budget (all values in bytes).
#[derive(Debug, Clone, Copy)]
struct MemoryBounds {
    // Budget for quantized model weights.
    max_model_memory: usize,
    // Budget for the attention KV cache.
    max_kv_cache_memory: usize,
    // Budget for transient activations / scratch buffers.
    max_working_memory: usize,
}
/// Test tolerance levels
// Tight tolerance for exact (non-quantized) float comparisons.
const EPSILON: f32 = 1e-4;
// Relaxed tolerance for accumulated float sums (e.g. attention row sums).
const LOOSE_EPSILON: f32 = 0.01;
const QUANTIZATION_EPSILON: f32 = 0.1; // Higher tolerance for quantized values
// ============================================================================
// Model Loading Tests
// ============================================================================
mod model_loading {
    use super::*;

    /// A `ModelConfig` built with the RuvLTRA-Small vocab round-trips every
    /// explicitly-set field unchanged.
    #[test]
    fn test_model_config_creation() {
        let config = ModelConfig {
            architecture: ModelArchitecture::Llama,
            quantization: Some(Quantization::Q4K),
            max_sequence_length: 8192,
            vocab_size: Some(RUVLTRA_SMALL_CONFIG.vocab_size),
            use_flash_attention: true,
            ..Default::default()
        };
        assert_eq!(config.architecture, ModelArchitecture::Llama);
        assert_eq!(config.quantization, Some(Quantization::Q4K));
        assert_eq!(config.max_sequence_length, 8192);
        assert_eq!(config.vocab_size, Some(RUVLTRA_SMALL_CONFIG.vocab_size));
        assert!(config.use_flash_attention);
    }

    /// Every supported architecture variant can be stored in a config and
    /// formatted with `Debug`.
    #[test]
    fn test_model_architecture_variants() {
        let architectures = [
            ModelArchitecture::Llama,
            ModelArchitecture::Mistral,
            ModelArchitecture::Phi,
            ModelArchitecture::Qwen,
        ];
        for arch in architectures {
            let config = ModelConfig {
                architecture: arch,
                quantization: Some(Quantization::Q4K),
                max_sequence_length: 4096,
                vocab_size: Some(32000),
                use_flash_attention: false,
                ..Default::default()
            };
            assert_eq!(config.architecture, arch);
            // Verify architecture can be formatted/debugged
            let _ = format!("{:?}", arch);
        }
    }

    /// Each quantization format can be selected and its Debug name matches
    /// the expected label.
    #[test]
    fn test_quantization_format_selection() {
        let quantizations = [
            (Quantization::None, "None", 32.0),
            (Quantization::F16, "F16", 16.0),
            (Quantization::Bf16, "Bf16", 16.0),
            (Quantization::Q8, "Q8", 8.0),
            (Quantization::Q4K, "Q4K", 4.5),
            (Quantization::Q4, "Q4", 4.0),
            (Quantization::Q2K, "Q2K", 2.56),
        ];
        for (quant, name, _expected_bits) in quantizations {
            let config = ModelConfig {
                architecture: ModelArchitecture::Llama,
                quantization: Some(quant),
                max_sequence_length: 4096,
                vocab_size: Some(32000),
                use_flash_attention: false,
                ..Default::default()
            };
            // Verify quantization is set correctly
            assert_eq!(config.quantization, Some(quant));
            // Verify the Debug name actually contains the expected label.
            // BUG FIX: the previous form `contains(name) || !quant_name.is_empty()`
            // was a tautology (Debug output is never empty), so it could never
            // fail regardless of the name.
            let quant_name = format!("{:?}", quant);
            assert!(
                quant_name.contains(name),
                "Quantization {:?} should have recognizable name",
                quant
            );
        }
    }

    /// `ModelConfig::default()` provides usable values.
    #[test]
    fn test_model_config_default_values() {
        let config = ModelConfig::default();
        // Verify sensible defaults
        assert!(config.max_sequence_length > 0);
        // vocab_size is now Option, so check it's present or use default behavior
    }

    /// Opening a non-existent model path must surface an error.
    #[test]
    fn test_invalid_model_path_error() {
        // This test validates error handling for non-existent paths
        let result = std::fs::metadata("/nonexistent/path/to/model.gguf");
        assert!(result.is_err(), "Non-existent path should fail");
    }

    /// Case-insensitive `.gguf` extension detection accepts only GGUF files.
    #[test]
    fn test_gguf_extension_validation() {
        let valid_extensions = [".gguf", ".GGUF"];
        let invalid_extensions = [".bin", ".safetensors", ".pt", ".pth"];
        for ext in valid_extensions {
            assert!(
                ext.to_lowercase().ends_with("gguf"),
                "Extension {} should be valid GGUF",
                ext
            );
        }
        for ext in invalid_extensions {
            assert!(
                !ext.to_lowercase().ends_with("gguf"),
                "Extension {} should not be GGUF",
                ext
            );
        }
    }

    /// The RoPE frequency base can be set explicitly on a config.
    #[test]
    fn test_rope_theta_configuration() {
        // Test rope theta configuration
        let config_with_theta = ModelConfig {
            architecture: ModelArchitecture::Llama,
            quantization: Some(Quantization::Q4K),
            max_sequence_length: 4096,
            vocab_size: Some(32000),
            rope_theta: Some(10000.0),
            use_flash_attention: false,
            ..Default::default()
        };
        assert_eq!(config_with_theta.rope_theta, Some(10000.0));
        // Rope theta is the frequency base for rotary position embeddings
        // The actual implementation depends on the model architecture
    }

    /// A range of context lengths round-trips through the config.
    #[test]
    fn test_context_length_bounds() {
        let context_lengths = [512, 1024, 2048, 4096, 8192, 16384, 32768];
        for ctx_len in context_lengths {
            let config = ModelConfig {
                architecture: ModelArchitecture::Llama,
                quantization: Some(Quantization::Q4K),
                max_sequence_length: ctx_len,
                vocab_size: Some(32000),
                use_flash_attention: false,
                ..Default::default()
            };
            assert_eq!(config.max_sequence_length, ctx_len);
            assert!(ctx_len > 0, "Context length must be positive");
        }
    }
}
// ============================================================================
// Quantization Accuracy Tests
// ============================================================================
mod quantization_accuracy {
    use super::*;

    /// Test Q4_0 dequantization accuracy
    ///
    /// Q4_0 block layout: 2-byte f16 scale followed by 16 bytes of packed
    /// 4-bit values (32 elements per block). Dequantized value = scale * (q - 8).
    #[test]
    fn test_q4_0_dequantization_accuracy() {
        // Create test Q4_0 block: scale + packed 4-bit values
        let mut block = vec![0u8; 18];
        // Set scale = 0.5 (f16: 0x3800)
        block[0] = 0x00;
        block[1] = 0x38;
        // Pack values: (8 - offset) gives 0, (9 - offset) gives 1, etc.
        // Q4_0 uses offset of 8
        for i in 0..16 {
            let low = 8u8; // Will become 0 after offset
            let high = 9u8; // Will become 1 after offset
            block[2 + i] = low | (high << 4);
        }
        let _output = vec![0.0f32; 32];
        let dtype = GgufQuantType::Q4_0;
        // Verify block size
        assert_eq!(dtype.block_size(), 32);
        assert_eq!(dtype.type_size(), 18);
        // Dequantize
        let result = dequantize_tensor(&block, dtype, 32);
        assert!(result.is_ok(), "Dequantization should succeed");
        let output = result.unwrap();
        // Verify pattern: alternating 0.0, 0.5
        // (low nibbles land on even output indices, high nibbles on odd ones —
        // assumes the dequantizer interleaves low/high; confirmed by the asserts)
        for i in 0..32 {
            if i % 2 == 0 {
                assert!(
                    output[i].abs() < QUANTIZATION_EPSILON,
                    "Even index {} should be ~0.0, got {}",
                    i,
                    output[i]
                );
            } else {
                assert!(
                    (output[i] - 0.5).abs() < QUANTIZATION_EPSILON,
                    "Odd index {} should be ~0.5, got {}",
                    i,
                    output[i]
                );
            }
        }
    }

    /// Test Q8_0 dequantization accuracy
    ///
    /// Q8_0 block layout: 2-byte f16 scale + 32 signed int8 values; with
    /// scale = 1.0 the dequantized output equals the stored integers.
    #[test]
    fn test_q8_0_dequantization_accuracy() {
        // Create test Q8_0 block: scale (2 bytes) + 32 int8 values
        let mut block = vec![0u8; 34];
        // Set scale = 1.0 (f16: 0x3C00)
        block[0] = 0x00;
        block[1] = 0x3C;
        // Set values 1, 2, 3, ..., 32 as signed int8
        for i in 0..32 {
            block[2 + i] = (i + 1) as u8;
        }
        let result = dequantize_tensor(&block, GgufQuantType::Q8_0, 32);
        assert!(result.is_ok());
        let output = result.unwrap();
        // Verify: values should be 1.0, 2.0, ..., 32.0
        for i in 0..32 {
            let expected = (i + 1) as f32;
            assert!(
                (output[i] - expected).abs() < EPSILON,
                "Index {}: expected {}, got {}",
                i,
                expected,
                output[i]
            );
        }
    }

    /// Test Q4_K dequantization (most common format)
    ///
    /// Only checks static block properties: 256 elements per 144-byte block,
    /// i.e. ~4.5 bits/weight including the per-block scales.
    #[test]
    fn test_q4_k_dequantization_accuracy() {
        let dtype = GgufQuantType::Q4_K;
        // Verify Q4_K properties
        assert_eq!(dtype.block_size(), 256);
        assert_eq!(dtype.type_size(), 144);
        assert!(dtype.is_quantized());
        let bits = dtype.bits_per_weight();
        assert!((bits - 4.5).abs() < 0.1, "Q4_K should be ~4.5 bits/weight");
    }

    /// Test all quantization types have valid properties
    #[test]
    fn test_all_quant_types_valid() {
        let quant_types = [
            GgufQuantType::F32,
            GgufQuantType::F16,
            GgufQuantType::Q8_0,
            GgufQuantType::Q4_0,
            GgufQuantType::Q4_1,
            GgufQuantType::Q5_0,
            GgufQuantType::Q5_1,
            GgufQuantType::Q2_K,
            GgufQuantType::Q3_K,
            GgufQuantType::Q4_K,
            GgufQuantType::Q5_K,
            GgufQuantType::Q6_K,
        ];
        for dtype in quant_types {
            // Block size must be positive
            assert!(
                dtype.block_size() > 0,
                "{:?} must have positive block size",
                dtype
            );
            // Type size must be positive
            assert!(
                dtype.type_size() > 0,
                "{:?} must have positive type size",
                dtype
            );
            // Bits per weight should be in reasonable range (1-32)
            let bits = dtype.bits_per_weight();
            assert!(
                bits >= 1.0 && bits <= 32.0,
                "{:?} bits/weight {} out of range",
                dtype,
                bits
            );
            // Name should be non-empty
            assert!(
                !dtype.name().is_empty(),
                "{:?} must have non-empty name",
                dtype
            );
        }
    }

    /// Test tensor size calculation
    ///
    /// tensor_size(n) = ceil(n / block_size) * type_size for quantized types,
    /// n * element_size for float types.
    #[test]
    fn test_tensor_size_calculation() {
        // F32: 256 elements = 256 * 4 = 1024 bytes
        assert_eq!(GgufQuantType::F32.tensor_size(256), 1024);
        // F16: 256 elements = 256 * 2 = 512 bytes
        assert_eq!(GgufQuantType::F16.tensor_size(256), 512);
        // Q4_0: 256 elements = 8 blocks * 18 bytes = 144 bytes
        assert_eq!(GgufQuantType::Q4_0.tensor_size(256), 144);
        // Q4_K: 256 elements = 1 block * 144 bytes = 144 bytes
        assert_eq!(GgufQuantType::Q4_K.tensor_size(256), 144);
    }

    /// Test quantized vs non-quantized detection
    #[test]
    fn test_is_quantized() {
        // Non-quantized types
        assert!(!GgufQuantType::F32.is_quantized());
        assert!(!GgufQuantType::F16.is_quantized());
        assert!(!GgufQuantType::Bf16.is_quantized());
        // Quantized types
        assert!(GgufQuantType::Q4_0.is_quantized());
        assert!(GgufQuantType::Q8_0.is_quantized());
        assert!(GgufQuantType::Q4_K.is_quantized());
        assert!(GgufQuantType::Q2_K.is_quantized());
    }

    /// Test QuantizedTensor container
    #[test]
    fn test_quantized_tensor_container() {
        let tensor = QuantizedTensor {
            data: vec![0u8; 144], // One Q4_K block
            dtype: GgufQuantType::Q4_K,
            shape: vec![256],
            num_elements: 256,
        };
        // 256 elements / 256-element Q4_K blocks = exactly one block.
        assert_eq!(tensor.block_count(), 1);
        assert!(tensor.dtype.is_quantized());
        assert_eq!(tensor.shape, vec![256]);
    }

    /// Test dequantization roundtrip sanity
    ///
    /// Builds 8 well-formed Q4_0 blocks and checks the dequantizer never
    /// produces NaN/inf for valid input.
    #[test]
    fn test_dequantization_finite_values() {
        // Create valid Q4_0 quantized data
        // Q4_0 format: 2 bytes scale (f16) + 16 bytes packed 4-bit values = 18 bytes per block
        // Each block represents 32 elements
        let mut data = vec![0u8; 18 * 8]; // 8 Q4_0 blocks = 256 elements
        for block in 0..8 {
            let base = block * 18;
            // Set a valid f16 scale: 0x3C00 = 1.0f16, small positive value
            data[base] = 0x00; // Low byte of f16 scale
            data[base + 1] = 0x3C; // High byte: 0x3C00 = 1.0
            // Fill packed 4-bit values with valid patterns (0-15)
            for i in 0..16 {
                let low_nibble = (i % 16) as u8;
                let high_nibble = ((i + 1) % 16) as u8;
                data[base + 2 + i] = low_nibble | (high_nibble << 4);
            }
        }
        let result = dequantize_tensor(&data, GgufQuantType::Q4_0, 256);
        assert!(result.is_ok());
        let output = result.unwrap();
        // All values should be finite
        for (i, val) in output.iter().enumerate() {
            assert!(
                val.is_finite(),
                "Value at index {} should be finite, got {}",
                i,
                val
            );
        }
    }

    /// Test quantization type conversion from u32
    ///
    /// The u32 discriminants follow the GGUF file-format type ids.
    #[test]
    fn test_quant_type_try_from() {
        // Valid conversions
        assert_eq!(GgufQuantType::try_from(0).unwrap(), GgufQuantType::F32);
        assert_eq!(GgufQuantType::try_from(1).unwrap(), GgufQuantType::F16);
        assert_eq!(GgufQuantType::try_from(8).unwrap(), GgufQuantType::Q8_0);
        assert_eq!(GgufQuantType::try_from(12).unwrap(), GgufQuantType::Q4_K);
        // Invalid conversion
        assert!(GgufQuantType::try_from(100).is_err());
        assert!(GgufQuantType::try_from(255).is_err());
    }
}
// ============================================================================
// SONA Integration Tests
// ============================================================================
mod sona_integration {
    use super::*;

    /// SONA configuration for testing
    #[derive(Debug, Clone)]
    struct SonaTestConfig {
        learning_rate: f32,
        momentum: f32,
        adaptation_threshold: f32,
        max_adaptations_per_step: usize,
    }

    impl Default for SonaTestConfig {
        fn default() -> Self {
            SonaTestConfig {
                learning_rate: 0.001,
                momentum: 0.9,
                adaptation_threshold: 0.05,
                max_adaptations_per_step: 3,
            }
        }
    }

    /// Default SONA hyper-parameters must sit inside their conventional ranges.
    #[test]
    fn test_sona_config_defaults() {
        let cfg = SonaTestConfig::default();
        assert!(
            cfg.learning_rate > 0.0 && cfg.learning_rate < 1.0,
            "Learning rate should be in (0, 1)"
        );
        assert!(
            cfg.momentum >= 0.0 && cfg.momentum < 1.0,
            "Momentum should be in [0, 1)"
        );
        assert!(
            cfg.adaptation_threshold > 0.0,
            "Adaptation threshold must be positive"
        );
        assert!(
            cfg.max_adaptations_per_step > 0,
            "Max adaptations must be positive"
        );
    }

    /// A simulated SONA gradient update over 1000 weights should finish well
    /// under a millisecond.
    #[test]
    fn test_sona_adaptation_timing() {
        let start = Instant::now();
        let mut weights = vec![0.5f32; 1000];
        let gradients = vec![0.01f32; 1000];
        // Plain SGD step standing in for the SONA adaptation.
        weights
            .iter_mut()
            .zip(gradients.iter())
            .for_each(|(w, g)| *w -= 0.001 * g);
        let duration = start.elapsed();
        assert!(
            duration < Duration::from_millis(1),
            "SONA adaptation took {:?}, expected <1ms",
            duration
        );
    }

    /// Routing must always pick exactly one backend (ANE or NEON) with a
    /// confidence above chance.
    #[test]
    fn test_sona_routing_decision() {
        struct RoutingDecision {
            use_ane: bool,
            use_neon: bool,
            confidence: f32,
        }
        fn make_routing_decision(batch_size: usize, dim: usize) -> RoutingDecision {
            // Same policy as before: ANE only when available AND recommended;
            // otherwise fall back to NEON with slightly higher confidence.
            let prefer_ane = is_ane_available() && should_use_ane(batch_size, dim);
            RoutingDecision {
                use_ane: prefer_ane,
                use_neon: !prefer_ane,
                confidence: if prefer_ane { 0.9 } else { 0.95 },
            }
        }
        // Small dimensions: NEON preferred.
        let decision = make_routing_decision(1, 32);
        assert!(
            decision.use_neon || decision.use_ane,
            "Must use some compute backend"
        );
        // Large batch with aligned dims: ANE may be preferred on Apple Silicon.
        let decision = make_routing_decision(32, 256);
        assert!(decision.confidence > 0.5);
    }

    /// Stored SONA patterns keep scores in [0, 1] and a non-empty config tag.
    #[test]
    fn test_sona_pattern_learning() {
        #[derive(Debug)]
        #[allow(dead_code)]
        struct SonaPattern {
            input_hash: u64,
            optimal_config: String,
            performance_score: f32,
        }
        let patterns = [
            SonaPattern {
                input_hash: 12345,
                optimal_config: "ANE+NEON".to_string(),
                performance_score: 0.95,
            },
            SonaPattern {
                input_hash: 67890,
                optimal_config: "NEON-only".to_string(),
                performance_score: 0.88,
            },
        ];
        for entry in patterns.iter() {
            assert!(entry.performance_score >= 0.0 && entry.performance_score <= 1.0);
            assert!(!entry.optimal_config.is_empty());
        }
    }

    /// Collects per-iteration timings and computes warmup vs. steady-state
    /// variance; the comparison is informational only.
    #[test]
    fn test_sona_warmup_iterations() {
        const WARMUP_ITERATIONS: usize = 3;
        let metrics: Vec<Duration> = (0..10)
            .map(|i| {
                let start = Instant::now();
                std::thread::sleep(Duration::from_micros(100 + i as u64 * 10));
                start.elapsed()
            })
            .collect();
        let warmup_variance = calculate_variance(&metrics[..WARMUP_ITERATIONS]);
        let stable_variance = calculate_variance(&metrics[WARMUP_ITERATIONS..]);
        // Not asserted: real workloads usually show lower steady-state variance,
        // but a unit test on wall-clock timing would be flaky.
        let _ = (warmup_variance, stable_variance);
    }

    /// Population variance of a slice of durations, in seconds^2.
    /// Returns 0.0 for an empty slice.
    fn calculate_variance(durations: &[Duration]) -> f64 {
        let count = durations.len();
        if count == 0 {
            return 0.0;
        }
        let mean = durations.iter().map(|d| d.as_secs_f64()).sum::<f64>() / count as f64;
        let sum_sq: f64 = durations
            .iter()
            .map(|d| (d.as_secs_f64() - mean).powi(2))
            .sum();
        sum_sq / count as f64
    }

    /// EWC++ (Elastic Weight Consolidation) settings must be sane: positive
    /// importance weight and enough samples for the Fisher estimate.
    #[test]
    fn test_sona_ewc_consolidation() {
        struct EwcConfig {
            lambda: f32, // Importance weight
            fisher_samples: usize,
        }
        let ewc = EwcConfig {
            lambda: 1000.0,
            fisher_samples: 100,
        };
        assert!(ewc.lambda > 0.0);
        assert!(ewc.fisher_samples >= 10);
    }
}
// ============================================================================
// ANE Dispatch Tests
// ============================================================================
mod ane_dispatch {
    use super::*;

    /// `is_ane_available()` must not panic and must return the same answer on
    /// repeated calls (detection is expected to be cached/deterministic).
    #[test]
    fn test_ane_availability_detection() {
        // Should not panic
        let available = is_ane_available();
        // Result should be consistent
        assert_eq!(is_ane_available(), available);
        assert_eq!(is_ane_available(), available);
    }

    /// Capability probing: on Apple Silicon the ANE must report positive
    /// TOPS/model-size/op-list; elsewhere an unavailable ANE reports zeros.
    #[test]
    fn test_ane_capabilities_detection() {
        let caps = AneCapabilities::detect();
        #[cfg(all(target_os = "macos", target_arch = "aarch64"))]
        {
            // On Apple Silicon, ANE should be available
            assert!(caps.available, "ANE should be available on Apple Silicon");
            assert!(caps.tops > 0.0, "TOPS should be positive");
            assert!(
                caps.max_model_size_mb > 0,
                "Max model size should be positive"
            );
            assert!(!caps.supported_ops.is_empty(), "Should have supported ops");
        }
        #[cfg(not(all(target_os = "macos", target_arch = "aarch64")))]
        {
            // On non-Apple Silicon, ANE may not be available
            if !caps.available {
                assert_eq!(caps.tops, 0.0);
                assert_eq!(caps.max_model_size_mb, 0);
            }
        }
    }

    /// Smoke-test `should_use_ane` across small/large/edge-case shapes; only
    /// checks that no input panics, not which backend is chosen.
    #[test]
    fn test_ane_routing_thresholds() {
        // Test various dimension combinations
        let test_cases = [
            // (batch, dim, description)
            (1, 64, "minimum ANE dimensions"),
            (1, 128, "small aligned tensor"),
            (32, 256, "typical LLM dimensions"),
            (64, 4096, "large batch with large dim"),
            (1, 32, "below minimum dim"),
            (100, 128, "above max batch"),
        ];
        for (batch, dim, desc) in test_cases {
            let should_use = should_use_ane(batch, dim);
            // Just verify no panic
            let _ = (should_use, desc);
        }
    }

    /// Matmul routing recommendations must carry a confidence in [0, 1] and a
    /// plausible (finite, <10x) expected speedup for every shape tried.
    #[test]
    fn test_ane_matmul_routing() {
        let test_cases = [
            // (m, k, n, description)
            (1, 64, 64, "small square matmul"),
            (32, 256, 128, "medium matmul"),
            (1, 4096, 4096, "large matmul"),
            (64, 512, 512, "optimal ANE size"),
            (1, 8192, 8192, "very large matmul"),
        ];
        for (m, k, n, desc) in test_cases {
            let _should_use = should_use_ane_matmul(m, k, n);
            let recommendation = get_ane_recommendation(m, k, n);
            // Recommendation should be consistent
            assert!(
                recommendation.confidence >= 0.0 && recommendation.confidence <= 1.0,
                "Confidence for {} should be in [0, 1]",
                desc
            );
            // Expected speedup should be reasonable
            assert!(
                recommendation.expected_speedup > 0.0 && recommendation.expected_speedup < 10.0,
                "Speedup for {} should be reasonable",
                desc
            );
        }
    }

    /// Activation routing is exercised at boundary shapes to ensure no panic.
    #[test]
    fn test_ane_activation_routing() {
        let test_cases = [
            (1, 64),
            (32, 256),
            (64, 4096),
            (100, 128), // Above typical ANE batch limit
            (1, 1000000), // Very large tensor
        ];
        for (batch, dim) in test_cases {
            let should_use = should_use_ane_activation(batch, dim);
            // Just verify no panic and reasonable result
            let _ = should_use;
        }
    }

    /// The recommendation struct exposes valid fields and supports
    /// `Clone`/`Debug`.
    #[test]
    fn test_ane_recommendation_structure() {
        let rec = get_ane_recommendation(1, 256, 256);
        // All fields should be valid
        assert!(rec.confidence >= 0.0 && rec.confidence <= 1.0);
        assert!(!rec.reason.is_empty());
        assert!(rec.expected_speedup > 0.0);
        // Test Clone
        let cloned = rec.clone();
        assert_eq!(rec.use_ane, cloned.use_ane);
        assert_eq!(rec.confidence, cloned.confidence);
        // Test Debug
        let debug = format!("{:?}", rec);
        assert!(debug.contains("use_ane"));
    }

    /// Every `ComputeUnits` variant answers the ANE/GPU flags and has a
    /// non-empty description.
    #[test]
    fn test_compute_units_configuration() {
        let units = [
            ComputeUnits::CpuOnly,
            ComputeUnits::CpuAndGpu,
            ComputeUnits::CpuAndNeuralEngine,
            ComputeUnits::All,
        ];
        for unit in units {
            // Test ANE usage flag
            let _uses_ane = unit.uses_ane();
            let _uses_gpu = unit.uses_gpu();
            // At least CPU should always be used
            // (implied by all compute unit configurations)
            // Test description
            let desc = unit.description();
            assert!(!desc.is_empty());
        }
    }

    /// Sanity check of the 16-alignment heuristic the routing layer prefers.
    #[test]
    fn test_ane_dimension_alignment() {
        // ANE prefers 16-aligned dimensions
        let aligned_dims = [16, 32, 64, 128, 256, 512, 1024, 2048, 4096];
        let unaligned_dims = [17, 33, 65, 100, 255, 1000];
        for dim in aligned_dims {
            assert_eq!(dim % 16, 0, "{} should be 16-aligned", dim);
        }
        for dim in unaligned_dims {
            assert_ne!(dim % 16, 0, "{} should not be 16-aligned", dim);
        }
    }

    /// All three dispatch predicates must be panic-free for typical shapes.
    #[test]
    fn test_ane_no_dispatch_errors() {
        // Simulate dispatch to verify no errors occur
        let test_tensors = [(1, 64), (32, 256), (64, 4096)];
        for (batch, dim) in test_tensors {
            // These should never panic
            let _ = should_use_ane(batch, dim);
            let _ = should_use_ane_activation(batch, dim);
            let _ = should_use_ane_matmul(batch, dim, dim);
        }
    }

    /// A scalar SiLU pass stands in for the NEON fallback path; the results
    /// must be finite regardless of ANE availability.
    #[test]
    fn test_fallback_behavior() {
        // Test that fallback to NEON works when ANE is unavailable
        let mut data = vec![1.0f32; 64];
        // This should work regardless of ANE availability
        // by falling back to scalar/NEON implementation
        for v in data.iter_mut() {
            *v = *v / (1.0 + (-*v).exp()); // SiLU
        }
        // All values should be valid
        assert!(data.iter().all(|v| v.is_finite()));
    }
}
// ============================================================================
// Memory Management Tests
// ============================================================================
mod memory_management {
    use super::*;

    /// Every budget component is positive and the combined budget fits an
    /// 8GB device.
    #[test]
    fn test_memory_bounds_validation() {
        let MemoryBounds {
            max_model_memory,
            max_kv_cache_memory,
            max_working_memory,
        } = MEMORY_BOUNDS;
        assert!(max_model_memory > 0);
        assert!(max_kv_cache_memory > 0);
        assert!(max_working_memory > 0);
        let total = max_model_memory + max_kv_cache_memory + max_working_memory;
        assert!(total < 8_000_000_000, "Total memory {} exceeds 8GB", total);
    }

    /// The embedding table in Q4_K must be more than 4x smaller than in F32.
    #[test]
    fn test_tensor_memory_estimation() {
        let elements = RUVLTRA_SMALL_CONFIG.vocab_size * RUVLTRA_SMALL_CONFIG.hidden_size;
        // F32 stores 4 bytes per element.
        let embedding_size_f32 = elements * 4;
        let embedding_size_q4k = GgufQuantType::Q4_K.tensor_size(elements);
        assert!(
            embedding_size_q4k < embedding_size_f32 / 4,
            "Q4_K should be at least 4x smaller than F32"
        );
    }

    /// A full-context f16 KV cache for all layers must fit the KV budget.
    #[test]
    fn test_kv_cache_sizing() {
        let cfg = RUVLTRA_SMALL_CONFIG;
        let head_dim = cfg.hidden_size / cfg.num_attention_heads;
        // K and V (factor 2) * seq_len * kv_heads * head_dim * 2 bytes (f16).
        let kv_per_layer = 2 * cfg.max_position_embeddings * cfg.num_key_value_heads * head_dim * 2;
        let total_kv_cache = kv_per_layer * cfg.num_hidden_layers;
        assert!(
            total_kv_cache < MEMORY_BOUNDS.max_kv_cache_memory as usize,
            "KV cache {} exceeds bound {}",
            total_kv_cache,
            MEMORY_BOUNDS.max_kv_cache_memory
        );
    }

    /// Activations for a 1x1024 forward pass must fit the working budget.
    #[test]
    fn test_working_memory_allocation() {
        let (batch_size, seq_len) = (1, 1024);
        // batch * seq * hidden * 4 bytes (f32 activations).
        let activation_memory = batch_size * seq_len * RUVLTRA_SMALL_CONFIG.hidden_size * 4;
        assert!(activation_memory < MEMORY_BOUNDS.max_working_memory as usize);
    }
}
// ============================================================================
// Output Validation Tests
// ============================================================================
mod output_validation {
    use super::*;

    /// Simulated logits over the full vocab must contain no NaN/inf.
    #[test]
    fn test_logits_finite() {
        let logits: Vec<f32> = (0..RUVLTRA_SMALL_CONFIG.vocab_size)
            .map(|i| (i as f32) * 0.001 - 16.0)
            .collect();
        logits.iter().enumerate().for_each(|(i, logit)| {
            assert!(
                logit.is_finite(),
                "Logit at index {} should be finite, got {}",
                i,
                logit
            );
        });
    }

    /// Max-subtracted softmax over a uniform input must yield a proper
    /// probability distribution (sums to 1, each value in [0, 1]).
    #[test]
    fn test_softmax_probabilities() {
        let raw = vec![0.1f32; 10];
        // Subtract the max for numerical stability before exponentiating.
        let max_val = raw.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
        let exps: Vec<f32> = raw.iter().map(|p| (p - max_val).exp()).collect();
        let denom: f32 = exps.iter().sum();
        let probs: Vec<f32> = exps.iter().map(|e| e / denom).collect();
        let prob_sum: f32 = probs.iter().sum();
        assert!(
            (prob_sum - 1.0).abs() < EPSILON,
            "Probabilities should sum to 1.0, got {}",
            prob_sum
        );
        for (i, p) in probs.iter().enumerate() {
            assert!(
                *p >= 0.0 && *p <= 1.0,
                "Probability at {} should be in [0, 1], got {}",
                i,
                p
            );
        }
    }

    /// Generated tokens stay inside the vocab and a multi-token sequence is
    /// not a single repeated value.
    #[test]
    fn test_token_generation_coherence() {
        let sample_tokens: Vec<u32> = vec![1, 234, 567, 89, 1234, 5678];
        sample_tokens.iter().for_each(|token| {
            assert!(
                *token < RUVLTRA_SMALL_CONFIG.vocab_size as u32,
                "Token {} exceeds vocab size",
                token
            );
        });
        // Basic coherence: at least one adjacent pair differs.
        let has_varied_tokens = sample_tokens.windows(2).any(|w| w[0] != w[1]);
        assert!(
            has_varied_tokens || sample_tokens.len() <= 1,
            "Token sequence should have variety"
        );
    }

    /// Rows of a causally-masked, uniformly-weighted attention matrix must
    /// each sum to ~1.0.
    #[test]
    fn test_attention_weights_valid() {
        let seq_len = 32;
        let mut attention = vec![0.0f32; seq_len * seq_len];
        // Row i attends uniformly to positions 0..=i (causal mask).
        for (i, row) in attention.chunks_mut(seq_len).enumerate() {
            let weight = 1.0 / (i + 1) as f32;
            for cell in row[..=i].iter_mut() {
                *cell = weight;
            }
        }
        for (i, row) in attention.chunks(seq_len).enumerate() {
            let row_sum: f32 = row.iter().sum();
            assert!(
                (row_sum - 1.0).abs() < LOOSE_EPSILON,
                "Attention row {} should sum to 1.0, got {}",
                i,
                row_sum
            );
        }
    }
}
// ============================================================================
// Performance Validation Tests
// ============================================================================
mod performance_validation {
    use super::*;

    /// A trivial elementwise pass over 4096 floats must complete in <10ms.
    #[test]
    fn test_inference_timing_reasonable() {
        let start = Instant::now();
        let data: Vec<f32> = (0..4096).map(|i| i as f32 * 0.001).collect();
        let scaled: Vec<f32> = data
            .iter()
            .enumerate()
            .map(|(i, d)| *d * (i as f32 % 10.0 + 1.0))
            .collect();
        let _ = scaled;
        let duration = start.elapsed();
        assert!(
            duration < Duration::from_millis(10),
            "Basic ops took {:?}",
            duration
        );
    }

    /// Records a timing per batch size; sanity-only, no scaling assertion
    /// (wall-clock assertions on tiny workloads would be flaky).
    #[test]
    fn test_batch_processing_scaling() {
        let dim = 256;
        let timings: Vec<(usize, Duration)> = [1, 2, 4, 8, 16, 32]
            .into_iter()
            .map(|batch_size| {
                let start = Instant::now();
                let data = vec![1.0f32; batch_size * dim];
                let _: f32 = data.iter().sum();
                (batch_size, start.elapsed())
            })
            .collect();
        let _ = timings;
    }

    /// Opt-in throughput benchmark over a squared-sum kernel.
    #[test]
    #[ignore] // Run with: cargo test --release -- --ignored
    fn test_throughput_benchmark() {
        let iterations = 100;
        let dim = 4096;
        let data: Vec<f32> = (0..dim).map(|i| i as f32 * 0.001).collect();
        let start = Instant::now();
        for _ in 0..iterations {
            let _: f32 = data.iter().map(|x| x * x).sum();
        }
        let duration = start.elapsed();
        let ops_per_second = (iterations * dim) as f64 / duration.as_secs_f64();
        println!("Throughput: {:.2e} ops/sec", ops_per_second);
        assert!(
            ops_per_second > 1_000_000.0,
            "Throughput {:.2e} below minimum",
            ops_per_second
        );
    }
}
// ============================================================================
// Thread Safety Tests
// ============================================================================
mod thread_safety {
    use super::*;
    use std::thread;

    /// ANE capability probes must be safely callable from several threads.
    #[test]
    fn test_ane_detection_thread_safe() {
        let mut workers = Vec::new();
        for _ in 0..4 {
            workers.push(thread::spawn(|| {
                for _ in 0..100 {
                    let _ = is_ane_available();
                    let _ = AneCapabilities::detect();
                }
            }));
        }
        for worker in workers {
            worker.join().expect("Thread should complete");
        }
    }

    /// Concurrent dequantization of distinct Q4_0 blocks must succeed and
    /// produce finite values on every thread.
    #[test]
    fn test_quantization_thread_safe() {
        let mut workers = Vec::new();
        for i in 0..4 {
            workers.push(thread::spawn(move || {
                let mut data = vec![0u8; 18];
                // f16 scale bytes: 0x3C00 == 1.0.
                data[0] = 0x00;
                data[1] = 0x3C;
                for j in 2..18 {
                    data[j] = ((i + j) % 256) as u8;
                }
                let result = dequantize_tensor(&data, GgufQuantType::Q4_0, 32);
                assert!(result.is_ok());
                let output = result.unwrap();
                assert!(output.iter().all(|v| v.is_finite()));
            }));
        }
        for worker in workers {
            worker.join().expect("Thread should complete");
        }
    }

    /// Routing predicates must tolerate concurrent calls with varied shapes.
    #[test]
    fn test_concurrent_routing_decisions() {
        let mut workers = Vec::new();
        for i in 0..4 {
            workers.push(thread::spawn(move || {
                for j in 0..100 {
                    let batch = (i + 1) * (j + 1) % 64 + 1;
                    let dim = ((i + j) * 16 + 64) % 4096 + 64;
                    let _ = should_use_ane(batch, dim);
                    let _ = should_use_ane_matmul(batch, dim, dim);
                }
            }));
        }
        for worker in workers {
            worker.join().expect("Thread should complete");
        }
    }
}