1258 lines
38 KiB
Rust
1258 lines
38 KiB
Rust
#![allow(
|
|
clippy::all,
|
|
unused_imports,
|
|
unused_variables,
|
|
dead_code,
|
|
unused_mut,
|
|
unused_assignments,
|
|
non_camel_case_types,
|
|
clippy::approx_constant,
|
|
unexpected_cfgs,
|
|
unused_must_use,
|
|
unused_parens
|
|
)]
|
|
//! RuvLTRA-Small Model Tests
|
|
//!
|
|
//! This module provides comprehensive tests for the RuvLTRA-Small inference engine,
|
|
//! validating model loading, quantization accuracy, SONA integration, and ANE dispatch.
|
|
//!
|
|
//! ## Test Categories
|
|
//!
|
|
//! - **Model Loading**: Validate GGUF/SafeTensors loading and configuration
|
|
//! - **Quantization**: Test dequantization accuracy across all quantization formats
|
|
//! - **SONA Integration**: Test Self-Optimizing Neural Architecture adaptation
|
|
//! - **ANE Dispatch**: Test Apple Neural Engine routing and fallback behavior
|
|
//!
|
|
//! ## Running Tests
|
|
//!
|
|
//! ```bash
|
|
//! # Run all RuvLTRA tests
|
|
//! cargo test --package ruvllm ruvltra_tests
|
|
//!
|
|
//! # Run with ANE support (Apple Silicon only)
|
|
//! cargo test --package ruvllm --features coreml ruvltra_tests
|
|
//!
|
|
//! # Run with full feature set
|
|
//! cargo test --package ruvllm --all-features ruvltra_tests
|
|
//! ```
|
|
|
|
use ruvllm::backends::{
|
|
AneCapabilities, ComputeUnits, ModelArchitecture, ModelConfig, Quantization,
|
|
};
|
|
use ruvllm::gguf::quantization::{dequantize_tensor, GgufQuantType, QuantizedTensor};
|
|
use ruvllm::kernels::ane_ops::{
|
|
get_ane_recommendation, is_ane_available, should_use_ane, should_use_ane_activation,
|
|
should_use_ane_matmul,
|
|
};
|
|
|
|
use std::time::{Duration, Instant};
|
|
|
|
// ============================================================================
|
|
// Test Fixtures and Constants
|
|
// ============================================================================
|
|
|
|
/// RuvLTRA-Small model configuration for testing
|
|
const RUVLTRA_SMALL_CONFIG: RuvLtraTestConfig = RuvLtraTestConfig {
|
|
vocab_size: 32000,
|
|
hidden_size: 2048,
|
|
intermediate_size: 5504,
|
|
num_hidden_layers: 22,
|
|
num_attention_heads: 32,
|
|
num_key_value_heads: 8,
|
|
max_position_embeddings: 8192,
|
|
rope_theta: 10000.0,
|
|
layer_norm_eps: 1e-5,
|
|
};
|
|
|
|
/// Test configuration for RuvLTRA-Small
|
|
#[derive(Debug, Clone, Copy)]
|
|
#[allow(dead_code)]
|
|
struct RuvLtraTestConfig {
|
|
vocab_size: usize,
|
|
hidden_size: usize,
|
|
intermediate_size: usize,
|
|
num_hidden_layers: usize,
|
|
num_attention_heads: usize,
|
|
num_key_value_heads: usize,
|
|
max_position_embeddings: usize,
|
|
rope_theta: f32,
|
|
layer_norm_eps: f32,
|
|
}
|
|
|
|
/// Memory bounds for validation (in bytes)
|
|
const MEMORY_BOUNDS: MemoryBounds = MemoryBounds {
|
|
// Q4_K quantization: ~1.2GB for small model
|
|
max_model_memory: 1_500_000_000,
|
|
// KV cache for 8K context
|
|
max_kv_cache_memory: 500_000_000,
|
|
// Working memory for inference
|
|
max_working_memory: 200_000_000,
|
|
};
|
|
|
|
#[derive(Debug, Clone, Copy)]
|
|
struct MemoryBounds {
|
|
max_model_memory: usize,
|
|
max_kv_cache_memory: usize,
|
|
max_working_memory: usize,
|
|
}
|
|
|
|
/// Test tolerance levels
|
|
const EPSILON: f32 = 1e-4;
|
|
const LOOSE_EPSILON: f32 = 0.01;
|
|
const QUANTIZATION_EPSILON: f32 = 0.1; // Higher tolerance for quantized values
|
|
|
|
// ============================================================================
|
|
// Model Loading Tests
|
|
// ============================================================================
|
|
|
|
mod model_loading {
|
|
use super::*;
|
|
|
|
#[test]
|
|
fn test_model_config_creation() {
|
|
let config = ModelConfig {
|
|
architecture: ModelArchitecture::Llama,
|
|
quantization: Some(Quantization::Q4K),
|
|
max_sequence_length: 8192,
|
|
vocab_size: Some(RUVLTRA_SMALL_CONFIG.vocab_size),
|
|
use_flash_attention: true,
|
|
..Default::default()
|
|
};
|
|
|
|
assert_eq!(config.architecture, ModelArchitecture::Llama);
|
|
assert_eq!(config.quantization, Some(Quantization::Q4K));
|
|
assert_eq!(config.max_sequence_length, 8192);
|
|
assert_eq!(config.vocab_size, Some(RUVLTRA_SMALL_CONFIG.vocab_size));
|
|
assert!(config.use_flash_attention);
|
|
}
|
|
|
|
#[test]
|
|
fn test_model_architecture_variants() {
|
|
let architectures = [
|
|
ModelArchitecture::Llama,
|
|
ModelArchitecture::Mistral,
|
|
ModelArchitecture::Phi,
|
|
ModelArchitecture::Qwen,
|
|
];
|
|
|
|
for arch in architectures {
|
|
let config = ModelConfig {
|
|
architecture: arch,
|
|
quantization: Some(Quantization::Q4K),
|
|
max_sequence_length: 4096,
|
|
vocab_size: Some(32000),
|
|
use_flash_attention: false,
|
|
..Default::default()
|
|
};
|
|
|
|
assert_eq!(config.architecture, arch);
|
|
// Verify architecture can be formatted/debugged
|
|
let _ = format!("{:?}", arch);
|
|
}
|
|
}
|
|
|
|
#[test]
|
|
fn test_quantization_format_selection() {
|
|
let quantizations = [
|
|
(Quantization::None, "None", 32.0),
|
|
(Quantization::F16, "F16", 16.0),
|
|
(Quantization::Bf16, "Bf16", 16.0),
|
|
(Quantization::Q8, "Q8", 8.0),
|
|
(Quantization::Q4K, "Q4K", 4.5),
|
|
(Quantization::Q4, "Q4", 4.0),
|
|
(Quantization::Q2K, "Q2K", 2.56),
|
|
];
|
|
|
|
for (quant, name, _expected_bits) in quantizations {
|
|
let config = ModelConfig {
|
|
architecture: ModelArchitecture::Llama,
|
|
quantization: Some(quant),
|
|
max_sequence_length: 4096,
|
|
vocab_size: Some(32000),
|
|
use_flash_attention: false,
|
|
..Default::default()
|
|
};
|
|
|
|
// Verify quantization is set correctly
|
|
assert_eq!(config.quantization, Some(quant));
|
|
|
|
// Verify name format
|
|
let quant_name = format!("{:?}", quant);
|
|
assert!(
|
|
quant_name.contains(name) || !quant_name.is_empty(),
|
|
"Quantization {:?} should have recognizable name",
|
|
quant
|
|
);
|
|
}
|
|
}
|
|
|
|
#[test]
|
|
fn test_model_config_default_values() {
|
|
let config = ModelConfig::default();
|
|
|
|
// Verify sensible defaults
|
|
assert!(config.max_sequence_length > 0);
|
|
// vocab_size is now Option, so check it's present or use default behavior
|
|
}
|
|
|
|
#[test]
|
|
fn test_invalid_model_path_error() {
|
|
// This test validates error handling for non-existent paths
|
|
let result = std::fs::metadata("/nonexistent/path/to/model.gguf");
|
|
assert!(result.is_err(), "Non-existent path should fail");
|
|
}
|
|
|
|
#[test]
|
|
fn test_gguf_extension_validation() {
|
|
let valid_extensions = [".gguf", ".GGUF"];
|
|
let invalid_extensions = [".bin", ".safetensors", ".pt", ".pth"];
|
|
|
|
for ext in valid_extensions {
|
|
assert!(
|
|
ext.to_lowercase().ends_with("gguf"),
|
|
"Extension {} should be valid GGUF",
|
|
ext
|
|
);
|
|
}
|
|
|
|
for ext in invalid_extensions {
|
|
assert!(
|
|
!ext.to_lowercase().ends_with("gguf"),
|
|
"Extension {} should not be GGUF",
|
|
ext
|
|
);
|
|
}
|
|
}
|
|
|
|
#[test]
|
|
fn test_rope_theta_configuration() {
|
|
// Test rope theta configuration
|
|
let config_with_theta = ModelConfig {
|
|
architecture: ModelArchitecture::Llama,
|
|
quantization: Some(Quantization::Q4K),
|
|
max_sequence_length: 4096,
|
|
vocab_size: Some(32000),
|
|
rope_theta: Some(10000.0),
|
|
use_flash_attention: false,
|
|
..Default::default()
|
|
};
|
|
assert_eq!(config_with_theta.rope_theta, Some(10000.0));
|
|
|
|
// Rope theta is the frequency base for rotary position embeddings
|
|
// The actual implementation depends on the model architecture
|
|
}
|
|
|
|
#[test]
|
|
fn test_context_length_bounds() {
|
|
let context_lengths = [512, 1024, 2048, 4096, 8192, 16384, 32768];
|
|
|
|
for ctx_len in context_lengths {
|
|
let config = ModelConfig {
|
|
architecture: ModelArchitecture::Llama,
|
|
quantization: Some(Quantization::Q4K),
|
|
max_sequence_length: ctx_len,
|
|
vocab_size: Some(32000),
|
|
use_flash_attention: false,
|
|
..Default::default()
|
|
};
|
|
|
|
assert_eq!(config.max_sequence_length, ctx_len);
|
|
assert!(ctx_len > 0, "Context length must be positive");
|
|
}
|
|
}
|
|
}
|
|
|
|
// ============================================================================
|
|
// Quantization Accuracy Tests
|
|
// ============================================================================
|
|
|
|
mod quantization_accuracy {
|
|
use super::*;
|
|
|
|
/// Test Q4_0 dequantization accuracy
|
|
#[test]
|
|
fn test_q4_0_dequantization_accuracy() {
|
|
// Create test Q4_0 block: scale + packed 4-bit values
|
|
let mut block = vec![0u8; 18];
|
|
|
|
// Set scale = 0.5 (f16: 0x3800)
|
|
block[0] = 0x00;
|
|
block[1] = 0x38;
|
|
|
|
// Pack values: (8 - offset) gives 0, (9 - offset) gives 1, etc.
|
|
// Q4_0 uses offset of 8
|
|
for i in 0..16 {
|
|
let low = 8u8; // Will become 0 after offset
|
|
let high = 9u8; // Will become 1 after offset
|
|
block[2 + i] = low | (high << 4);
|
|
}
|
|
|
|
let _output = vec![0.0f32; 32];
|
|
let dtype = GgufQuantType::Q4_0;
|
|
|
|
// Verify block size
|
|
assert_eq!(dtype.block_size(), 32);
|
|
assert_eq!(dtype.type_size(), 18);
|
|
|
|
// Dequantize
|
|
let result = dequantize_tensor(&block, dtype, 32);
|
|
assert!(result.is_ok(), "Dequantization should succeed");
|
|
|
|
let output = result.unwrap();
|
|
|
|
// Verify pattern: alternating 0.0, 0.5
|
|
for i in 0..32 {
|
|
if i % 2 == 0 {
|
|
assert!(
|
|
output[i].abs() < QUANTIZATION_EPSILON,
|
|
"Even index {} should be ~0.0, got {}",
|
|
i,
|
|
output[i]
|
|
);
|
|
} else {
|
|
assert!(
|
|
(output[i] - 0.5).abs() < QUANTIZATION_EPSILON,
|
|
"Odd index {} should be ~0.5, got {}",
|
|
i,
|
|
output[i]
|
|
);
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Test Q8_0 dequantization accuracy
|
|
#[test]
|
|
fn test_q8_0_dequantization_accuracy() {
|
|
// Create test Q8_0 block: scale (2 bytes) + 32 int8 values
|
|
let mut block = vec![0u8; 34];
|
|
|
|
// Set scale = 1.0 (f16: 0x3C00)
|
|
block[0] = 0x00;
|
|
block[1] = 0x3C;
|
|
|
|
// Set values 1, 2, 3, ..., 32 as signed int8
|
|
for i in 0..32 {
|
|
block[2 + i] = (i + 1) as u8;
|
|
}
|
|
|
|
let result = dequantize_tensor(&block, GgufQuantType::Q8_0, 32);
|
|
assert!(result.is_ok());
|
|
|
|
let output = result.unwrap();
|
|
|
|
// Verify: values should be 1.0, 2.0, ..., 32.0
|
|
for i in 0..32 {
|
|
let expected = (i + 1) as f32;
|
|
assert!(
|
|
(output[i] - expected).abs() < EPSILON,
|
|
"Index {}: expected {}, got {}",
|
|
i,
|
|
expected,
|
|
output[i]
|
|
);
|
|
}
|
|
}
|
|
|
|
/// Test Q4_K dequantization (most common format)
|
|
#[test]
|
|
fn test_q4_k_dequantization_accuracy() {
|
|
let dtype = GgufQuantType::Q4_K;
|
|
|
|
// Verify Q4_K properties
|
|
assert_eq!(dtype.block_size(), 256);
|
|
assert_eq!(dtype.type_size(), 144);
|
|
assert!(dtype.is_quantized());
|
|
|
|
let bits = dtype.bits_per_weight();
|
|
assert!((bits - 4.5).abs() < 0.1, "Q4_K should be ~4.5 bits/weight");
|
|
}
|
|
|
|
/// Test all quantization types have valid properties
|
|
#[test]
|
|
fn test_all_quant_types_valid() {
|
|
let quant_types = [
|
|
GgufQuantType::F32,
|
|
GgufQuantType::F16,
|
|
GgufQuantType::Q8_0,
|
|
GgufQuantType::Q4_0,
|
|
GgufQuantType::Q4_1,
|
|
GgufQuantType::Q5_0,
|
|
GgufQuantType::Q5_1,
|
|
GgufQuantType::Q2_K,
|
|
GgufQuantType::Q3_K,
|
|
GgufQuantType::Q4_K,
|
|
GgufQuantType::Q5_K,
|
|
GgufQuantType::Q6_K,
|
|
];
|
|
|
|
for dtype in quant_types {
|
|
// Block size must be positive
|
|
assert!(
|
|
dtype.block_size() > 0,
|
|
"{:?} must have positive block size",
|
|
dtype
|
|
);
|
|
|
|
// Type size must be positive
|
|
assert!(
|
|
dtype.type_size() > 0,
|
|
"{:?} must have positive type size",
|
|
dtype
|
|
);
|
|
|
|
// Bits per weight should be in reasonable range (1-32)
|
|
let bits = dtype.bits_per_weight();
|
|
assert!(
|
|
bits >= 1.0 && bits <= 32.0,
|
|
"{:?} bits/weight {} out of range",
|
|
dtype,
|
|
bits
|
|
);
|
|
|
|
// Name should be non-empty
|
|
assert!(
|
|
!dtype.name().is_empty(),
|
|
"{:?} must have non-empty name",
|
|
dtype
|
|
);
|
|
}
|
|
}
|
|
|
|
/// Test tensor size calculation
|
|
#[test]
|
|
fn test_tensor_size_calculation() {
|
|
// F32: 256 elements = 256 * 4 = 1024 bytes
|
|
assert_eq!(GgufQuantType::F32.tensor_size(256), 1024);
|
|
|
|
// F16: 256 elements = 256 * 2 = 512 bytes
|
|
assert_eq!(GgufQuantType::F16.tensor_size(256), 512);
|
|
|
|
// Q4_0: 256 elements = 8 blocks * 18 bytes = 144 bytes
|
|
assert_eq!(GgufQuantType::Q4_0.tensor_size(256), 144);
|
|
|
|
// Q4_K: 256 elements = 1 block * 144 bytes = 144 bytes
|
|
assert_eq!(GgufQuantType::Q4_K.tensor_size(256), 144);
|
|
}
|
|
|
|
/// Test quantized vs non-quantized detection
|
|
#[test]
|
|
fn test_is_quantized() {
|
|
// Non-quantized types
|
|
assert!(!GgufQuantType::F32.is_quantized());
|
|
assert!(!GgufQuantType::F16.is_quantized());
|
|
assert!(!GgufQuantType::Bf16.is_quantized());
|
|
|
|
// Quantized types
|
|
assert!(GgufQuantType::Q4_0.is_quantized());
|
|
assert!(GgufQuantType::Q8_0.is_quantized());
|
|
assert!(GgufQuantType::Q4_K.is_quantized());
|
|
assert!(GgufQuantType::Q2_K.is_quantized());
|
|
}
|
|
|
|
/// Test QuantizedTensor container
|
|
#[test]
|
|
fn test_quantized_tensor_container() {
|
|
let tensor = QuantizedTensor {
|
|
data: vec![0u8; 144], // One Q4_K block
|
|
dtype: GgufQuantType::Q4_K,
|
|
shape: vec![256],
|
|
num_elements: 256,
|
|
};
|
|
|
|
assert_eq!(tensor.block_count(), 1);
|
|
assert!(tensor.dtype.is_quantized());
|
|
assert_eq!(tensor.shape, vec![256]);
|
|
}
|
|
|
|
/// Test dequantization roundtrip sanity
|
|
#[test]
|
|
fn test_dequantization_finite_values() {
|
|
// Create valid Q4_0 quantized data
|
|
// Q4_0 format: 2 bytes scale (f16) + 16 bytes packed 4-bit values = 18 bytes per block
|
|
// Each block represents 32 elements
|
|
let mut data = vec![0u8; 18 * 8]; // 8 Q4_0 blocks = 256 elements
|
|
|
|
for block in 0..8 {
|
|
let base = block * 18;
|
|
// Set a valid f16 scale: 0x3C00 = 1.0f16, small positive value
|
|
data[base] = 0x00; // Low byte of f16 scale
|
|
data[base + 1] = 0x3C; // High byte: 0x3C00 = 1.0
|
|
|
|
// Fill packed 4-bit values with valid patterns (0-15)
|
|
for i in 0..16 {
|
|
let low_nibble = (i % 16) as u8;
|
|
let high_nibble = ((i + 1) % 16) as u8;
|
|
data[base + 2 + i] = low_nibble | (high_nibble << 4);
|
|
}
|
|
}
|
|
|
|
let result = dequantize_tensor(&data, GgufQuantType::Q4_0, 256);
|
|
assert!(result.is_ok());
|
|
|
|
let output = result.unwrap();
|
|
|
|
// All values should be finite
|
|
for (i, val) in output.iter().enumerate() {
|
|
assert!(
|
|
val.is_finite(),
|
|
"Value at index {} should be finite, got {}",
|
|
i,
|
|
val
|
|
);
|
|
}
|
|
}
|
|
|
|
/// Test quantization type conversion from u32
|
|
#[test]
|
|
fn test_quant_type_try_from() {
|
|
// Valid conversions
|
|
assert_eq!(GgufQuantType::try_from(0).unwrap(), GgufQuantType::F32);
|
|
assert_eq!(GgufQuantType::try_from(1).unwrap(), GgufQuantType::F16);
|
|
assert_eq!(GgufQuantType::try_from(8).unwrap(), GgufQuantType::Q8_0);
|
|
assert_eq!(GgufQuantType::try_from(12).unwrap(), GgufQuantType::Q4_K);
|
|
|
|
// Invalid conversion
|
|
assert!(GgufQuantType::try_from(100).is_err());
|
|
assert!(GgufQuantType::try_from(255).is_err());
|
|
}
|
|
}
|
|
|
|
// ============================================================================
|
|
// SONA Integration Tests
|
|
// ============================================================================
|
|
|
|
mod sona_integration {
|
|
use super::*;
|
|
|
|
/// SONA configuration for testing
|
|
#[derive(Debug, Clone)]
|
|
struct SonaTestConfig {
|
|
learning_rate: f32,
|
|
momentum: f32,
|
|
adaptation_threshold: f32,
|
|
max_adaptations_per_step: usize,
|
|
}
|
|
|
|
impl Default for SonaTestConfig {
|
|
fn default() -> Self {
|
|
Self {
|
|
learning_rate: 0.001,
|
|
momentum: 0.9,
|
|
adaptation_threshold: 0.05,
|
|
max_adaptations_per_step: 3,
|
|
}
|
|
}
|
|
}
|
|
|
|
#[test]
|
|
fn test_sona_config_defaults() {
|
|
let config = SonaTestConfig::default();
|
|
|
|
assert!(
|
|
config.learning_rate > 0.0 && config.learning_rate < 1.0,
|
|
"Learning rate should be in (0, 1)"
|
|
);
|
|
assert!(
|
|
config.momentum >= 0.0 && config.momentum < 1.0,
|
|
"Momentum should be in [0, 1)"
|
|
);
|
|
assert!(
|
|
config.adaptation_threshold > 0.0,
|
|
"Adaptation threshold must be positive"
|
|
);
|
|
assert!(
|
|
config.max_adaptations_per_step > 0,
|
|
"Max adaptations must be positive"
|
|
);
|
|
}
|
|
|
|
#[test]
|
|
fn test_sona_adaptation_timing() {
|
|
// SONA adaptation should be fast (<0.05ms target)
|
|
let start = Instant::now();
|
|
|
|
// Simulate SONA adaptation calculation
|
|
let mut weights = vec![0.5f32; 1000];
|
|
let gradients = vec![0.01f32; 1000];
|
|
|
|
// Simple gradient update (simulating SONA)
|
|
for (w, g) in weights.iter_mut().zip(gradients.iter()) {
|
|
*w -= 0.001 * g;
|
|
}
|
|
|
|
let duration = start.elapsed();
|
|
|
|
// Should be very fast
|
|
assert!(
|
|
duration < Duration::from_millis(1),
|
|
"SONA adaptation took {:?}, expected <1ms",
|
|
duration
|
|
);
|
|
}
|
|
|
|
#[test]
|
|
fn test_sona_routing_decision() {
|
|
// Test routing decision logic
|
|
struct RoutingDecision {
|
|
use_ane: bool,
|
|
use_neon: bool,
|
|
confidence: f32,
|
|
}
|
|
|
|
fn make_routing_decision(batch_size: usize, dim: usize) -> RoutingDecision {
|
|
let ane_available = is_ane_available();
|
|
|
|
if ane_available && should_use_ane(batch_size, dim) {
|
|
RoutingDecision {
|
|
use_ane: true,
|
|
use_neon: false,
|
|
confidence: 0.9,
|
|
}
|
|
} else {
|
|
RoutingDecision {
|
|
use_ane: false,
|
|
use_neon: true,
|
|
confidence: 0.95,
|
|
}
|
|
}
|
|
}
|
|
|
|
// Small dimensions: NEON preferred
|
|
let decision = make_routing_decision(1, 32);
|
|
assert!(
|
|
decision.use_neon || decision.use_ane,
|
|
"Must use some compute backend"
|
|
);
|
|
|
|
// Large batch with aligned dims: ANE may be preferred on Apple Silicon
|
|
let decision = make_routing_decision(32, 256);
|
|
assert!(decision.confidence > 0.5);
|
|
}
|
|
|
|
#[test]
|
|
fn test_sona_pattern_learning() {
|
|
// Simulate SONA pattern storage
|
|
#[derive(Debug)]
|
|
#[allow(dead_code)]
|
|
struct SonaPattern {
|
|
input_hash: u64,
|
|
optimal_config: String,
|
|
performance_score: f32,
|
|
}
|
|
|
|
let patterns = vec![
|
|
SonaPattern {
|
|
input_hash: 12345,
|
|
optimal_config: "ANE+NEON".to_string(),
|
|
performance_score: 0.95,
|
|
},
|
|
SonaPattern {
|
|
input_hash: 67890,
|
|
optimal_config: "NEON-only".to_string(),
|
|
performance_score: 0.88,
|
|
},
|
|
];
|
|
|
|
for pattern in &patterns {
|
|
assert!(pattern.performance_score >= 0.0 && pattern.performance_score <= 1.0);
|
|
assert!(!pattern.optimal_config.is_empty());
|
|
}
|
|
}
|
|
|
|
#[test]
|
|
fn test_sona_warmup_iterations() {
|
|
// SONA typically needs a few iterations to warm up
|
|
const WARMUP_ITERATIONS: usize = 3;
|
|
|
|
let mut metrics = Vec::new();
|
|
|
|
for i in 0..10 {
|
|
// Simulate inference timing
|
|
let start = Instant::now();
|
|
std::thread::sleep(Duration::from_micros(100 + i as u64 * 10));
|
|
let duration = start.elapsed();
|
|
metrics.push(duration);
|
|
}
|
|
|
|
// Post-warmup iterations should be more stable
|
|
let warmup_variance = calculate_variance(&metrics[..WARMUP_ITERATIONS]);
|
|
let stable_variance = calculate_variance(&metrics[WARMUP_ITERATIONS..]);
|
|
|
|
// Note: This is a simplified test; in real scenarios,
|
|
// stable variance should typically be lower
|
|
let _ = (warmup_variance, stable_variance);
|
|
}
|
|
|
|
fn calculate_variance(durations: &[Duration]) -> f64 {
|
|
if durations.is_empty() {
|
|
return 0.0;
|
|
}
|
|
let mean: f64 =
|
|
durations.iter().map(|d| d.as_secs_f64()).sum::<f64>() / durations.len() as f64;
|
|
|
|
durations
|
|
.iter()
|
|
.map(|d| (d.as_secs_f64() - mean).powi(2))
|
|
.sum::<f64>()
|
|
/ durations.len() as f64
|
|
}
|
|
|
|
#[test]
|
|
fn test_sona_ewc_consolidation() {
|
|
// Test EWC++ (Elastic Weight Consolidation) behavior
|
|
// This prevents catastrophic forgetting in SONA
|
|
|
|
struct EwcConfig {
|
|
lambda: f32, // Importance weight
|
|
fisher_samples: usize,
|
|
}
|
|
|
|
let config = EwcConfig {
|
|
lambda: 1000.0,
|
|
fisher_samples: 100,
|
|
};
|
|
|
|
// Lambda should be positive for weight importance
|
|
assert!(config.lambda > 0.0);
|
|
// Need enough samples for Fisher information
|
|
assert!(config.fisher_samples >= 10);
|
|
}
|
|
}
|
|
|
|
// ============================================================================
|
|
// ANE Dispatch Tests
|
|
// ============================================================================
|
|
|
|
mod ane_dispatch {
|
|
use super::*;
|
|
|
|
#[test]
|
|
fn test_ane_availability_detection() {
|
|
// Should not panic
|
|
let available = is_ane_available();
|
|
|
|
// Result should be consistent
|
|
assert_eq!(is_ane_available(), available);
|
|
assert_eq!(is_ane_available(), available);
|
|
}
|
|
|
|
#[test]
|
|
fn test_ane_capabilities_detection() {
|
|
let caps = AneCapabilities::detect();
|
|
|
|
#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
|
|
{
|
|
// On Apple Silicon, ANE should be available
|
|
assert!(caps.available, "ANE should be available on Apple Silicon");
|
|
assert!(caps.tops > 0.0, "TOPS should be positive");
|
|
assert!(
|
|
caps.max_model_size_mb > 0,
|
|
"Max model size should be positive"
|
|
);
|
|
assert!(!caps.supported_ops.is_empty(), "Should have supported ops");
|
|
}
|
|
|
|
#[cfg(not(all(target_os = "macos", target_arch = "aarch64")))]
|
|
{
|
|
// On non-Apple Silicon, ANE may not be available
|
|
if !caps.available {
|
|
assert_eq!(caps.tops, 0.0);
|
|
assert_eq!(caps.max_model_size_mb, 0);
|
|
}
|
|
}
|
|
}
|
|
|
|
#[test]
|
|
fn test_ane_routing_thresholds() {
|
|
// Test various dimension combinations
|
|
let test_cases = [
|
|
// (batch, dim, description)
|
|
(1, 64, "minimum ANE dimensions"),
|
|
(1, 128, "small aligned tensor"),
|
|
(32, 256, "typical LLM dimensions"),
|
|
(64, 4096, "large batch with large dim"),
|
|
(1, 32, "below minimum dim"),
|
|
(100, 128, "above max batch"),
|
|
];
|
|
|
|
for (batch, dim, desc) in test_cases {
|
|
let should_use = should_use_ane(batch, dim);
|
|
// Just verify no panic
|
|
let _ = (should_use, desc);
|
|
}
|
|
}
|
|
|
|
#[test]
|
|
fn test_ane_matmul_routing() {
|
|
let test_cases = [
|
|
// (m, k, n, description)
|
|
(1, 64, 64, "small square matmul"),
|
|
(32, 256, 128, "medium matmul"),
|
|
(1, 4096, 4096, "large matmul"),
|
|
(64, 512, 512, "optimal ANE size"),
|
|
(1, 8192, 8192, "very large matmul"),
|
|
];
|
|
|
|
for (m, k, n, desc) in test_cases {
|
|
let _should_use = should_use_ane_matmul(m, k, n);
|
|
let recommendation = get_ane_recommendation(m, k, n);
|
|
|
|
// Recommendation should be consistent
|
|
assert!(
|
|
recommendation.confidence >= 0.0 && recommendation.confidence <= 1.0,
|
|
"Confidence for {} should be in [0, 1]",
|
|
desc
|
|
);
|
|
|
|
// Expected speedup should be reasonable
|
|
assert!(
|
|
recommendation.expected_speedup > 0.0 && recommendation.expected_speedup < 10.0,
|
|
"Speedup for {} should be reasonable",
|
|
desc
|
|
);
|
|
}
|
|
}
|
|
|
|
#[test]
|
|
fn test_ane_activation_routing() {
|
|
let test_cases = [
|
|
(1, 64),
|
|
(32, 256),
|
|
(64, 4096),
|
|
(100, 128), // Above typical ANE batch limit
|
|
(1, 1000000), // Very large tensor
|
|
];
|
|
|
|
for (batch, dim) in test_cases {
|
|
let should_use = should_use_ane_activation(batch, dim);
|
|
// Just verify no panic and reasonable result
|
|
let _ = should_use;
|
|
}
|
|
}
|
|
|
|
#[test]
|
|
fn test_ane_recommendation_structure() {
|
|
let rec = get_ane_recommendation(1, 256, 256);
|
|
|
|
// All fields should be valid
|
|
assert!(rec.confidence >= 0.0 && rec.confidence <= 1.0);
|
|
assert!(!rec.reason.is_empty());
|
|
assert!(rec.expected_speedup > 0.0);
|
|
|
|
// Test Clone
|
|
let cloned = rec.clone();
|
|
assert_eq!(rec.use_ane, cloned.use_ane);
|
|
assert_eq!(rec.confidence, cloned.confidence);
|
|
|
|
// Test Debug
|
|
let debug = format!("{:?}", rec);
|
|
assert!(debug.contains("use_ane"));
|
|
}
|
|
|
|
#[test]
|
|
fn test_compute_units_configuration() {
|
|
let units = [
|
|
ComputeUnits::CpuOnly,
|
|
ComputeUnits::CpuAndGpu,
|
|
ComputeUnits::CpuAndNeuralEngine,
|
|
ComputeUnits::All,
|
|
];
|
|
|
|
for unit in units {
|
|
// Test ANE usage flag
|
|
let _uses_ane = unit.uses_ane();
|
|
let _uses_gpu = unit.uses_gpu();
|
|
|
|
// At least CPU should always be used
|
|
// (implied by all compute unit configurations)
|
|
|
|
// Test description
|
|
let desc = unit.description();
|
|
assert!(!desc.is_empty());
|
|
}
|
|
}
|
|
|
|
#[test]
|
|
fn test_ane_dimension_alignment() {
|
|
// ANE prefers 16-aligned dimensions
|
|
let aligned_dims = [16, 32, 64, 128, 256, 512, 1024, 2048, 4096];
|
|
let unaligned_dims = [17, 33, 65, 100, 255, 1000];
|
|
|
|
for dim in aligned_dims {
|
|
assert_eq!(dim % 16, 0, "{} should be 16-aligned", dim);
|
|
}
|
|
|
|
for dim in unaligned_dims {
|
|
assert_ne!(dim % 16, 0, "{} should not be 16-aligned", dim);
|
|
}
|
|
}
|
|
|
|
#[test]
|
|
fn test_ane_no_dispatch_errors() {
|
|
// Simulate dispatch to verify no errors occur
|
|
let test_tensors = [(1, 64), (32, 256), (64, 4096)];
|
|
|
|
for (batch, dim) in test_tensors {
|
|
// These should never panic
|
|
let _ = should_use_ane(batch, dim);
|
|
let _ = should_use_ane_activation(batch, dim);
|
|
let _ = should_use_ane_matmul(batch, dim, dim);
|
|
}
|
|
}
|
|
|
|
#[test]
|
|
fn test_fallback_behavior() {
|
|
// Test that fallback to NEON works when ANE is unavailable
|
|
let mut data = vec![1.0f32; 64];
|
|
|
|
// This should work regardless of ANE availability
|
|
// by falling back to scalar/NEON implementation
|
|
for v in data.iter_mut() {
|
|
*v = *v / (1.0 + (-*v).exp()); // SiLU
|
|
}
|
|
|
|
// All values should be valid
|
|
assert!(data.iter().all(|v| v.is_finite()));
|
|
}
|
|
}
|
|
|
|
// ============================================================================
|
|
// Memory Management Tests
|
|
// ============================================================================
|
|
|
|
mod memory_management {
|
|
use super::*;
|
|
|
|
#[test]
|
|
fn test_memory_bounds_validation() {
|
|
// Verify memory bounds are reasonable
|
|
assert!(MEMORY_BOUNDS.max_model_memory > 0);
|
|
assert!(MEMORY_BOUNDS.max_kv_cache_memory > 0);
|
|
assert!(MEMORY_BOUNDS.max_working_memory > 0);
|
|
|
|
// Total should be reasonable for device
|
|
let total = MEMORY_BOUNDS.max_model_memory
|
|
+ MEMORY_BOUNDS.max_kv_cache_memory
|
|
+ MEMORY_BOUNDS.max_working_memory;
|
|
|
|
// Should fit in 8GB device memory
|
|
assert!(total < 8_000_000_000, "Total memory {} exceeds 8GB", total);
|
|
}
|
|
|
|
#[test]
|
|
fn test_tensor_memory_estimation() {
|
|
// Estimate memory for RuvLTRA-Small tensors
|
|
let hidden_size = RUVLTRA_SMALL_CONFIG.hidden_size;
|
|
let _num_layers = RUVLTRA_SMALL_CONFIG.num_hidden_layers;
|
|
let vocab_size = RUVLTRA_SMALL_CONFIG.vocab_size;
|
|
|
|
// Embedding: vocab_size * hidden_size * bytes_per_element
|
|
let embedding_size_f32 = vocab_size * hidden_size * 4;
|
|
let embedding_size_q4k = GgufQuantType::Q4_K.tensor_size(vocab_size * hidden_size);
|
|
|
|
// Q4_K should be much smaller
|
|
assert!(
|
|
embedding_size_q4k < embedding_size_f32 / 4,
|
|
"Q4_K should be at least 4x smaller than F32"
|
|
);
|
|
}
|
|
|
|
#[test]
|
|
fn test_kv_cache_sizing() {
|
|
let hidden_size = RUVLTRA_SMALL_CONFIG.hidden_size;
|
|
let num_layers = RUVLTRA_SMALL_CONFIG.num_hidden_layers;
|
|
let num_kv_heads = RUVLTRA_SMALL_CONFIG.num_key_value_heads;
|
|
let max_seq_len = RUVLTRA_SMALL_CONFIG.max_position_embeddings;
|
|
|
|
let head_dim = hidden_size / RUVLTRA_SMALL_CONFIG.num_attention_heads;
|
|
|
|
// KV cache per layer: 2 * seq_len * num_kv_heads * head_dim * sizeof(f16)
|
|
let kv_per_layer = 2 * max_seq_len * num_kv_heads * head_dim * 2;
|
|
let total_kv_cache = kv_per_layer * num_layers;
|
|
|
|
assert!(
|
|
total_kv_cache < MEMORY_BOUNDS.max_kv_cache_memory as usize,
|
|
"KV cache {} exceeds bound {}",
|
|
total_kv_cache,
|
|
MEMORY_BOUNDS.max_kv_cache_memory
|
|
);
|
|
}
|
|
|
|
#[test]
|
|
fn test_working_memory_allocation() {
|
|
// Simulate working memory allocation
|
|
let batch_size = 1;
|
|
let seq_len = 1024;
|
|
let hidden_size = RUVLTRA_SMALL_CONFIG.hidden_size;
|
|
|
|
// Activations: batch * seq * hidden * sizeof(f32)
|
|
let activation_memory = batch_size * seq_len * hidden_size * 4;
|
|
|
|
// Should fit in working memory
|
|
assert!(activation_memory < MEMORY_BOUNDS.max_working_memory as usize);
|
|
}
|
|
}
|
|
|
|
// ============================================================================
|
|
// Output Validation Tests
|
|
// ============================================================================
|
|
|
|
mod output_validation {
|
|
use super::*;
|
|
|
|
#[test]
|
|
fn test_logits_finite() {
|
|
// Simulated logits output
|
|
let logits: Vec<f32> = (0..RUVLTRA_SMALL_CONFIG.vocab_size)
|
|
.map(|i| (i as f32) * 0.001 - 16.0)
|
|
.collect();
|
|
|
|
// All logits should be finite
|
|
for (i, logit) in logits.iter().enumerate() {
|
|
assert!(
|
|
logit.is_finite(),
|
|
"Logit at index {} should be finite, got {}",
|
|
i,
|
|
logit
|
|
);
|
|
}
|
|
}
|
|
|
|
#[test]
|
|
fn test_softmax_probabilities() {
|
|
// Simulated softmax output
|
|
let mut probs = vec![0.1f32; 10];
|
|
|
|
// Apply softmax normalization
|
|
let max_val = probs.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
|
|
let mut sum = 0.0;
|
|
for p in probs.iter_mut() {
|
|
*p = (*p - max_val).exp();
|
|
sum += *p;
|
|
}
|
|
for p in probs.iter_mut() {
|
|
*p /= sum;
|
|
}
|
|
|
|
// Probabilities should sum to 1.0
|
|
let prob_sum: f32 = probs.iter().sum();
|
|
assert!(
|
|
(prob_sum - 1.0).abs() < EPSILON,
|
|
"Probabilities should sum to 1.0, got {}",
|
|
prob_sum
|
|
);
|
|
|
|
// All probabilities should be in [0, 1]
|
|
for (i, p) in probs.iter().enumerate() {
|
|
assert!(
|
|
*p >= 0.0 && *p <= 1.0,
|
|
"Probability at {} should be in [0, 1], got {}",
|
|
i,
|
|
p
|
|
);
|
|
}
|
|
}
|
|
|
|
#[test]
|
|
fn test_token_generation_coherence() {
|
|
// Test that token sequences have reasonable patterns
|
|
let sample_tokens: Vec<u32> = vec![1, 234, 567, 89, 1234, 5678];
|
|
|
|
// All tokens should be valid (within vocab range)
|
|
for token in &sample_tokens {
|
|
assert!(
|
|
*token < RUVLTRA_SMALL_CONFIG.vocab_size as u32,
|
|
"Token {} exceeds vocab size",
|
|
token
|
|
);
|
|
}
|
|
|
|
// No repeated padding tokens at start (unless intentional)
|
|
// This is a basic coherence check
|
|
let has_varied_tokens = sample_tokens.windows(2).any(|w| w[0] != w[1]);
|
|
assert!(
|
|
has_varied_tokens || sample_tokens.len() <= 1,
|
|
"Token sequence should have variety"
|
|
);
|
|
}
|
|
|
|
#[test]
|
|
fn test_attention_weights_valid() {
|
|
let seq_len = 32;
|
|
|
|
// Simulated attention weights (should sum to 1 per row after softmax)
|
|
let mut attention = vec![0.0f32; seq_len * seq_len];
|
|
|
|
// Initialize with causal mask pattern
|
|
for i in 0..seq_len {
|
|
for j in 0..=i {
|
|
attention[i * seq_len + j] = 1.0 / (i + 1) as f32;
|
|
}
|
|
}
|
|
|
|
// Verify row sums are approximately 1.0
|
|
for i in 0..seq_len {
|
|
let row_sum: f32 = attention[i * seq_len..(i + 1) * seq_len].iter().sum();
|
|
assert!(
|
|
(row_sum - 1.0).abs() < LOOSE_EPSILON,
|
|
"Attention row {} should sum to 1.0, got {}",
|
|
i,
|
|
row_sum
|
|
);
|
|
}
|
|
}
|
|
}
|
|
|
|
// ============================================================================
|
|
// Performance Validation Tests
|
|
// ============================================================================
|
|
|
|
mod performance_validation {
|
|
use super::*;
|
|
|
|
#[test]
|
|
fn test_inference_timing_reasonable() {
|
|
// Basic timing test for operations
|
|
let start = Instant::now();
|
|
|
|
// Simulate a basic forward pass calculation
|
|
let data: Vec<f32> = (0..4096).map(|i| i as f32 * 0.001).collect();
|
|
let mut output = vec![0.0f32; 4096];
|
|
|
|
for (i, (o, d)) in output.iter_mut().zip(data.iter()).enumerate() {
|
|
*o = *d * (i as f32 % 10.0 + 1.0);
|
|
}
|
|
|
|
let duration = start.elapsed();
|
|
|
|
// Basic operations should be very fast
|
|
assert!(
|
|
duration < Duration::from_millis(10),
|
|
"Basic ops took {:?}",
|
|
duration
|
|
);
|
|
}
|
|
|
|
#[test]
|
|
fn test_batch_processing_scaling() {
|
|
let batch_sizes = [1, 2, 4, 8, 16, 32];
|
|
let dim = 256;
|
|
|
|
let mut timings = Vec::new();
|
|
|
|
for batch_size in batch_sizes {
|
|
let start = Instant::now();
|
|
|
|
// Simulate batch processing
|
|
let data = vec![1.0f32; batch_size * dim];
|
|
let _: f32 = data.iter().sum();
|
|
|
|
timings.push((batch_size, start.elapsed()));
|
|
}
|
|
|
|
// Larger batches should take more time (linear or better scaling)
|
|
// This is a sanity check that batch size affects timing
|
|
let _ = timings;
|
|
}
|
|
|
|
#[test]
|
|
#[ignore] // Run with: cargo test --release -- --ignored
|
|
fn test_throughput_benchmark() {
|
|
let iterations = 100;
|
|
let dim = 4096;
|
|
|
|
let data: Vec<f32> = (0..dim).map(|i| i as f32 * 0.001).collect();
|
|
|
|
let start = Instant::now();
|
|
for _ in 0..iterations {
|
|
let _: f32 = data.iter().map(|x| x * x).sum();
|
|
}
|
|
let duration = start.elapsed();
|
|
|
|
let ops_per_second = (iterations * dim) as f64 / duration.as_secs_f64();
|
|
|
|
println!("Throughput: {:.2e} ops/sec", ops_per_second);
|
|
|
|
// Should achieve reasonable throughput
|
|
assert!(
|
|
ops_per_second > 1_000_000.0,
|
|
"Throughput {:.2e} below minimum",
|
|
ops_per_second
|
|
);
|
|
}
|
|
}
|
|
|
|
// ============================================================================
|
|
// Thread Safety Tests
|
|
// ============================================================================
|
|
|
|
mod thread_safety {
|
|
use super::*;
|
|
use std::thread;
|
|
|
|
#[test]
|
|
fn test_ane_detection_thread_safe() {
|
|
let handles: Vec<_> = (0..4)
|
|
.map(|_| {
|
|
thread::spawn(|| {
|
|
for _ in 0..100 {
|
|
let _ = is_ane_available();
|
|
let _ = AneCapabilities::detect();
|
|
}
|
|
})
|
|
})
|
|
.collect();
|
|
|
|
for handle in handles {
|
|
handle.join().expect("Thread should complete");
|
|
}
|
|
}
|
|
|
|
#[test]
|
|
fn test_quantization_thread_safe() {
|
|
let handles: Vec<_> = (0..4)
|
|
.map(|i| {
|
|
thread::spawn(move || {
|
|
let mut data = vec![0u8; 18];
|
|
data[0] = 0x00;
|
|
data[1] = 0x3C;
|
|
for j in 2..18 {
|
|
data[j] = ((i + j) % 256) as u8;
|
|
}
|
|
|
|
let result = dequantize_tensor(&data, GgufQuantType::Q4_0, 32);
|
|
assert!(result.is_ok());
|
|
|
|
let output = result.unwrap();
|
|
assert!(output.iter().all(|v| v.is_finite()));
|
|
})
|
|
})
|
|
.collect();
|
|
|
|
for handle in handles {
|
|
handle.join().expect("Thread should complete");
|
|
}
|
|
}
|
|
|
|
#[test]
|
|
fn test_concurrent_routing_decisions() {
|
|
let handles: Vec<_> = (0..4)
|
|
.map(|i| {
|
|
thread::spawn(move || {
|
|
for j in 0..100 {
|
|
let batch = (i + 1) * (j + 1) % 64 + 1;
|
|
let dim = ((i + j) * 16 + 64) % 4096 + 64;
|
|
|
|
let _ = should_use_ane(batch, dim);
|
|
let _ = should_use_ane_matmul(batch, dim, dim);
|
|
}
|
|
})
|
|
})
|
|
.collect();
|
|
|
|
for handle in handles {
|
|
handle.join().expect("Thread should complete");
|
|
}
|
|
}
|
|
}
|