// Source path: wifi-densepose/vendor/ruvector/crates/ruvllm/tests/backend_integration.rs
// Listing metadata: 650 lines, 19 KiB, Rust

#![allow(
clippy::all,
unused_imports,
unused_variables,
dead_code,
unused_mut,
unused_assignments,
non_camel_case_types,
clippy::approx_constant,
unexpected_cfgs,
unused_must_use,
unused_parens
)]
//! Integration tests for LLM backends
//!
//! Tests the LLM backend infrastructure including model loading,
//! text generation, streaming, and embeddings extraction.
use ruvllm::{
backends::{
create_backend, DType, DeviceType, GenerateParams, LlmBackend, ModelArchitecture,
ModelConfig, ModelInfo, Quantization, SpecialTokens, TokenStream, Tokenizer,
},
error::Result,
};
use std::sync::Arc;
/// Mock backend for testing without requiring actual model files.
///
/// It stores only the state the tests observe: optional model metadata and a
/// flag saying whether a model is currently loaded.
#[derive(Debug, Default)]
struct MockBackend {
    /// Metadata of the loaded model; `None` while nothing is loaded.
    model_info: Option<ModelInfo>,
    /// `true` once a model has been loaded and not yet unloaded.
    loaded: bool,
}

impl MockBackend {
    /// Creates a backend with no model loaded.
    ///
    /// Delegates to the derived `Default`, so `new()` and `default()` always
    /// agree: `model_info` is `None` and `loaded` is `false`.
    fn new() -> Self {
        Self::default()
    }
}
impl MockBackend {
    /// Returns the "Model not loaded" backend error unless a model is loaded.
    fn ensure_loaded(&self) -> Result<()> {
        if self.loaded {
            Ok(())
        } else {
            Err(ruvllm::RuvLLMError::Backend("Model not loaded".to_string()))
        }
    }
}

/// Canned `LlmBackend` implementation: generation, streaming and embedding
/// calls fail with a backend error while no model is loaded and otherwise
/// return fixed data.
impl LlmBackend for MockBackend {
    /// Records synthetic metadata for `model_id` and marks the backend loaded.
    ///
    /// Architecture, context length and quantization are copied from `config`;
    /// every other figure is a hard-coded placeholder. Always returns `Ok(())`.
    fn load_model(&mut self, model_id: &str, config: ModelConfig) -> Result<()> {
        let info = ModelInfo {
            name: model_id.to_string(),
            architecture: config.architecture,
            num_parameters: 100_000,
            vocab_size: 32000,
            hidden_size: 768,
            num_layers: 12,
            max_context_length: config.max_sequence_length,
            quantization: config.quantization,
            memory_usage: 1024 * 1024 * 100, // 100MB
        };
        self.model_info = Some(info);
        self.loaded = true;
        Ok(())
    }

    /// Echoes the prompt back as `"Response to: <prompt>"`; the sampling
    /// parameters are ignored.
    fn generate(&self, prompt: &str, _params: GenerateParams) -> Result<String> {
        self.ensure_loaded()?;
        Ok(format!("Response to: {}", prompt))
    }

    /// Yields three fixed, non-special tokens ("Hello", " world", "!"),
    /// regardless of prompt and parameters.
    fn generate_stream(
        &self,
        _prompt: &str,
        _params: GenerateParams,
    ) -> Result<Box<dyn Iterator<Item = Result<ruvllm::backends::GeneratedToken>> + Send + '_>>
    {
        use ruvllm::backends::GeneratedToken;

        self.ensure_loaded()?;

        // Small constructor so the canned tokens read as a table.
        let token = |id, text: &str, logprob| GeneratedToken {
            id,
            text: text.to_string(),
            logprob: Some(logprob),
            is_special: false,
        };
        let tokens = vec![
            token(1, "Hello", -0.5),
            token(2, " world", -0.3),
            token(3, "!", -0.1),
        ];
        Ok(Box::new(tokens.into_iter().map(Ok)))
    }

    /// Returns a stream whose sending half has already been dropped, so
    /// nothing is ever sent on it.
    fn generate_stream_v2(&self, _prompt: &str, _params: GenerateParams) -> Result<TokenStream> {
        self.ensure_loaded()?;
        let (sender, stream) = TokenStream::channel();
        // The mock never emits tokens, so release the sender right away.
        drop(sender);
        Ok(stream)
    }

    /// Returns a constant 768-element embedding filled with `0.1`.
    fn get_embeddings(&self, _text: &str) -> Result<Vec<f32>> {
        self.ensure_loaded()?;
        Ok(vec![0.1; 768])
    }

    /// The mock has no tokenizer.
    fn tokenizer(&self) -> Option<&dyn Tokenizer> {
        None
    }

    /// Reports whether a model is currently loaded.
    fn is_model_loaded(&self) -> bool {
        self.loaded
    }

    /// Returns a copy of the stored metadata, if any.
    fn model_info(&self) -> Option<ModelInfo> {
        self.model_info.as_ref().cloned()
    }

    /// Clears the metadata and the loaded flag.
    fn unload_model(&mut self) {
        self.model_info = None;
        self.loaded = false;
    }
}
/// A fresh mock exposes no model; loading one sets both the flag and the metadata.
#[test]
fn test_mock_backend_load_model() {
    let mut mock = MockBackend::new();

    // Nothing is loaded straight after construction.
    assert!(!mock.is_model_loaded());
    assert!(mock.model_info().is_none());

    // Loading with the default configuration must succeed ...
    let outcome = mock.load_model("test-model", ModelConfig::default());
    assert!(outcome.is_ok());

    // ... and afterwards the backend reports the model.
    assert!(mock.is_model_loaded());
    assert!(mock.model_info().is_some());
}
/// Generation on a loaded mock returns non-empty text that contains part of the prompt.
#[test]
fn test_backend_generate_basic() {
    let mut mock = MockBackend::new();
    mock.load_model("test-model", ModelConfig::default())
        .unwrap();

    // Every sampling field is spelled out rather than taken from `Default`.
    let sampling = GenerateParams {
        max_tokens: 100,
        temperature: 0.7,
        top_p: 0.9,
        top_k: 40,
        repetition_penalty: 1.1,
        frequency_penalty: 0.0,
        presence_penalty: 0.0,
        stop_sequences: Vec::new(),
        seed: Some(42),
    };

    let outcome = mock.generate("Hello, how are you?", sampling);
    assert!(outcome.is_ok());

    let text = outcome.unwrap();
    assert!(!text.is_empty());
    assert!(text.contains("Hello"));
}
/// Generating before any model has been loaded must return an error.
#[test]
fn test_backend_generate_requires_loaded_model() {
    let unloaded = MockBackend::new();
    let outcome = unloaded.generate("Test prompt", GenerateParams::default());
    assert!(outcome.is_err());
}
/// The iterator-based stream of a loaded mock yields three tokens, the first being "Hello".
#[test]
fn test_backend_streaming() {
    let mut mock = MockBackend::new();
    mock.load_model("test-model", ModelConfig::default())
        .unwrap();

    // Drain the whole stream up front so its length can be checked.
    let collected: Vec<_> = mock
        .generate_stream("Hello", GenerateParams::default())
        .unwrap()
        .collect();
    assert_eq!(collected.len(), 3);

    // Only the leading token is inspected in detail.
    let head = collected[0].as_ref().unwrap();
    assert_eq!(head.text, "Hello");
    assert_eq!(head.id, 1);
    assert!(!head.is_special);
}
/// Embeddings from a loaded mock have 768 components, all of them finite.
#[test]
fn test_backend_embeddings() {
    let mut mock = MockBackend::new();
    mock.load_model("test-model", ModelConfig::default())
        .unwrap();

    let vector = mock.get_embeddings("Test text for embedding").unwrap();
    assert_eq!(vector.len(), 768);
    assert!(vector.iter().copied().all(f32::is_finite));
}
/// Metadata reported after loading reflects the model id and the supplied configuration.
#[test]
fn test_backend_model_info() {
    let llama_config = ModelConfig {
        architecture: ModelArchitecture::Llama,
        max_sequence_length: 4096,
        quantization: Some(Quantization::Q4K),
        ..Default::default()
    };

    let mut mock = MockBackend::new();
    mock.load_model("llama-test", llama_config).unwrap();

    let reported = mock.model_info().unwrap();
    // Name and context length are compared by value ...
    assert_eq!(reported.name, "llama-test");
    assert_eq!(reported.max_context_length, 4096);
    // ... while the enum-valued fields are checked by pattern.
    assert!(matches!(reported.architecture, ModelArchitecture::Llama));
    assert!(matches!(reported.quantization, Some(Quantization::Q4K)));
}
/// Unloading clears the loaded flag and metadata, and generation fails again afterwards.
#[test]
fn test_backend_unload() {
    let mut mock = MockBackend::new();
    mock.load_model("test-model", ModelConfig::default())
        .unwrap();
    assert!(mock.is_model_loaded());

    mock.unload_model();

    assert!(!mock.is_model_loaded());
    assert!(mock.model_info().is_none());

    // With the model gone, generation must be rejected.
    let after_unload = mock.generate("Test", GenerateParams::default());
    assert!(after_unload.is_err());
}
/// A fully spelled-out `ModelConfig` keeps the values it was built with.
#[test]
fn test_model_config() {
    let config = ModelConfig {
        architecture: ModelArchitecture::Mistral,
        device: DeviceType::Cpu,
        dtype: DType::F32,
        quantization: Some(Quantization::Q4K),
        use_flash_attention: true,
        max_sequence_length: 4096,
        num_kv_heads: Some(8),
        hidden_size: Some(4096),
        num_layers: Some(32),
        vocab_size: Some(32000),
        rope_theta: Some(10000.0),
        sliding_window: None,
    };

    // Borrow just the fields under test; the remaining ones are not inspected.
    let ModelConfig {
        device,
        dtype,
        quantization,
        use_flash_attention,
        max_sequence_length,
        ..
    } = &config;

    assert!(matches!(device, DeviceType::Cpu));
    assert!(matches!(dtype, DType::F32));
    assert!(matches!(quantization, Some(Quantization::Q4K)));
    assert!(*use_flash_attention);
    assert_eq!(*max_sequence_length, 4096);
}
/// Default generation parameters have a positive token budget, temperature and `top_k`,
/// and a `top_p` of at most 1.0.
#[test]
fn test_generate_params_default() {
    let defaults = GenerateParams::default();
    assert!(0 < defaults.max_tokens);
    assert!(0.0 < defaults.temperature);
    assert!(defaults.top_p <= 1.0);
    assert!(0 < defaults.top_k);
}
/// Each `with_*` builder call overrides exactly the field it names.
#[test]
fn test_generate_params_builder() {
    let built = GenerateParams::default()
        .with_max_tokens(512)
        .with_temperature(0.5)
        .with_top_p(0.95)
        .with_top_k(50)
        .with_repetition_penalty(1.2)
        .with_seed(42);

    // Integer-valued and optional settings.
    assert_eq!(built.max_tokens, 512);
    assert_eq!(built.top_k, 50);
    assert_eq!(built.seed, Some(42));

    // Floating-point settings are expected to round-trip exactly.
    assert_eq!(built.temperature, 0.5);
    assert_eq!(built.top_p, 0.95);
    assert_eq!(built.repetition_penalty, 1.2);
}
/// Checks the GGUF classification and the per-weight byte cost of selected quantizations.
#[test]
fn test_quantization_variants() {
    // The quantized formats are classified as GGUF ...
    for quantized in [Quantization::Q4, Quantization::Q8, Quantization::Q4K] {
        assert!(quantized.is_gguf());
    }
    // ... whereas plain F16 is not.
    assert!(!Quantization::F16.is_gguf());

    // Check bytes per weight
    for (format, expected_bytes) in [
        (Quantization::None, 4.0),
        (Quantization::F16, 2.0),
        (Quantization::Q8, 1.0),
        (Quantization::Q4K, 0.5),
    ] {
        assert_eq!(format.bytes_per_weight(), expected_bytes);
    }
}
/// Checks that each `DeviceType` variant can be constructed and matched,
/// including the index carried by `Cuda`.
#[test]
fn test_device_type_variants() {
    let cpu = DeviceType::Cpu;
    let metal = DeviceType::Metal;
    let cuda = DeviceType::Cuda(0);

    assert!(matches!(cpu, DeviceType::Cpu));
    assert!(matches!(metal, DeviceType::Metal));
    // Variant and payload are matched in one assertion, so a value that is not
    // `Cuda(0)` fails the test instead of silently skipping the index check.
    assert!(matches!(cuda, DeviceType::Cuda(0)));
}
/// Each architecture maps to its expected configuration name (note `Qwen` -> "qwen2").
#[test]
fn test_model_architecture_variants() {
    let expectations = [
        (ModelArchitecture::Llama, "llama"),
        (ModelArchitecture::Mistral, "mistral"),
        (ModelArchitecture::Phi, "phi"),
        (ModelArchitecture::Qwen, "qwen2"),
        (ModelArchitecture::Gemma, "gemma"),
    ];

    for (architecture, expected_name) in expectations {
        assert_eq!(architecture.config_name(), expected_name);
    }
}
/// The three `DType` variants used by the tests exist and match themselves.
#[test]
fn test_dtype_variants() {
    assert!(matches!(DType::F32, DType::F32));
    assert!(matches!(DType::F16, DType::F16));
    assert!(matches!(DType::Bf16, DType::Bf16));
}
/// `SpecialTokens` stores the four token ids exactly as given.
#[test]
fn test_special_tokens() {
    let special = SpecialTokens {
        bos_token_id: Some(1),
        eos_token_id: Some(2),
        pad_token_id: Some(0),
        unk_token_id: Some(3),
    };

    // Borrow every field at once and compare each against the value it was built with.
    let SpecialTokens {
        bos_token_id,
        eos_token_id,
        pad_token_id,
        unk_token_id,
    } = &special;

    assert_eq!(*bos_token_id, Some(1));
    assert_eq!(*eos_token_id, Some(2));
    assert_eq!(*pad_token_id, Some(0));
    assert_eq!(*unk_token_id, Some(3));
}
/// The factory-created backend starts without a model when built without `candle`.
#[test]
fn test_create_backend() {
    // This creates a NoopBackend when candle feature is not enabled
    let backend = create_backend();

    // The check applies only to builds without the candle feature.
    if cfg!(not(feature = "candle")) {
        assert!(!backend.is_model_loaded());
    }
}
// Candle backend tests (only run when the feature is enabled)
#[cfg(feature = "candle")]
mod candle_tests {
    use super::*;
    use ruvllm::backends::CandleBackend;

    /// Constructing the Candle backend should succeed.
    #[test]
    #[ignore] // Requires model download
    fn test_candle_backend_creation() {
        assert!(CandleBackend::new().is_ok());
    }

    /// Placeholder for a real model-loading test: it only builds the backend and a
    /// CPU/Phi configuration, because the actual load needs a model file.
    #[test]
    #[ignore] // Requires model download
    fn test_candle_backend_load_model() {
        let mut backend = CandleBackend::new().unwrap();
        let config = ModelConfig {
            device: DeviceType::Cpu,
            architecture: ModelArchitecture::Phi,
            ..Default::default()
        };
        // This would require an actual model file
        // let result = backend.load_model("microsoft/phi-2", config);
        // assert!(result.is_ok());
    }
}
// ========== V2 Feature Tests: Memory Pool Integration ==========
mod memory_pool_tests {
use ruvllm::memory_pool::{
BufferPool, BufferSize, InferenceArena, MemoryManager, MemoryManagerConfig,
ScratchSpaceManager,
};
/// Exercises a pre-warmed buffer pool across ten acquire/drop rounds and checks its statistics.
#[test]
fn test_memory_pool_integration() {
    let pool = BufferPool::new();
    // Pre-warm the pool before the rounds below.
    pool.prewarm_all(4).expect("prewarm failed");

    // Ten simulated generation steps, each taking one 64 KiB buffer for the KV cache.
    for _ in 0..10 {
        let kv_buffer = pool.acquire(BufferSize::KB64).expect("acquire failed");
        assert_eq!(kv_buffer.capacity(), 65536);
        // Viewing the buffer as `f32` must give a non-empty slice.
        assert!(!kv_buffer.as_slice::<f32>().is_empty());
        // `kv_buffer` is dropped here (expected to hand it back to the pool).
    }

    let stats = pool.stats();
    let lookups = stats.hits + stats.misses;
    assert!(lookups > 0, "Pool should have been used");

    // The hit rate is only judged once at least ten lookups were recorded.
    if lookups >= 10 {
        assert!(
            stats.hit_rate > 0.5,
            "Pool hit rate should be decent: {:.2}",
            stats.hit_rate
        );
    }
}
/// Simulates 100 streaming steps that combine arena allocations, a pooled buffer and scratch space.
#[test]
fn test_streaming_with_pool() {
    let manager = MemoryManager::new().expect("manager creation failed");

    for token_idx in 0..100 {
        let step_value = token_idx as f32;

        // Each step begins with `reset_step`, before anything is allocated.
        manager.reset_step();

        // Per-step temporaries come from the arena.
        let activations: &mut [f32] = manager.arena.alloc(1024).expect("arena alloc failed");
        activations[0] = step_value;
        let logits: &mut [f32] = manager.arena.alloc(32000).expect("arena alloc for logits");
        logits[0] = step_value * 0.1;

        // The KV cache buffer is taken from the pool instead.
        let kv_buf = manager
            .pool
            .acquire(BufferSize::KB16)
            .expect("acquire failed");
        assert!(kv_buf.capacity() >= 16384);

        // Scratch space for intermediate computations.
        // NOTE(review): when `get` returns `None` the checks below are skipped.
        let mut scratch = manager.scratch.get_scratch().expect("get_scratch failed");
        if let Some(temp) = scratch.get::<f32>(256) {
            assert_eq!(temp.len(), 256);
            temp.fill(1.0);
        }

        // After the allocations above the arena must report some usage.
        assert!(manager.arena.used() > 0);
    }

    // Both the pool and the arena must have recorded activity.
    let totals = manager.stats();
    assert!(totals.pool.hits + totals.pool.misses > 0);
    assert!(totals.arena.high_water_mark > 0);
}
/// Repeats an allocate / write / reset cycle on a 4 MB arena and checks its bookkeeping.
#[test]
fn test_arena_allocation_cycle() {
    let arena = InferenceArena::new(4 * 1024 * 1024).expect("arena creation failed"); // 4MB

    for cycle in 0..50 {
        let scale = cycle as f32;

        // Three allocations of different sizes, each written to right after it is obtained.
        let buf1: &mut [f32] = arena.alloc(4096).expect("alloc 4096");
        buf1[0] = scale;
        let buf2: &mut [f32] = arena.alloc(8192).expect("alloc 8192");
        buf2[0] = scale * 2.0;
        let buf3: &mut [f32] = arena.alloc(1024).expect("alloc 1024");
        buf3[0] = scale * 3.0;

        // The arena must account for exactly those three allocations.
        assert_eq!(arena.allocation_count(), 3);
        assert!(arena.used() > 0);

        // Resetting must bring usage and allocation count back to zero.
        arena.reset();
        assert_eq!(arena.used(), 0);
        assert_eq!(arena.allocation_count(), 0);
    }

    // The high water mark is still expected to be non-zero after the final reset.
    assert!(arena.high_water_mark() > 0);
}
/// Acquiring the same size class repeatedly should be served from the pool after the first time.
#[test]
fn test_buffer_pool_reuse() {
    let pool = BufferPool::with_capacity(8);

    // Twenty acquire/drop rounds of the 4 KiB size class.
    for _ in 0..20 {
        let buf = pool.acquire(BufferSize::KB4).expect("acquire failed");
        assert_eq!(buf.capacity(), 4096);
        // `buf` is dropped at the end of each round (expected to return it to the pool).
    }

    // At most the very first acquisition may miss.
    let hits = pool.stats().hits;
    assert!(hits >= 19, "Expected at least 19 hits, got {}", hits);
}
/// Four threads fill their scratch buffers with their own id and must never see another id.
#[test]
fn test_scratch_space_isolation() {
    use std::sync::Arc;
    use std::thread;

    let manager = Arc::new(ScratchSpaceManager::new(8192, 8).expect("manager creation failed"));

    // Start every worker before joining any of them.
    let mut workers = Vec::with_capacity(4);
    for thread_id in 0..4 {
        let manager = Arc::clone(&manager);
        workers.push(thread::spawn(move || {
            for _ in 0..10 {
                let mut scratch = manager.get_scratch().expect("get_scratch failed");
                // NOTE(review): when `get` returns `None` the check below is skipped.
                if let Some(buf) = scratch.get::<u32>(100) {
                    // Stamp the buffer with this thread's id and read it back.
                    buf.fill(thread_id);
                    assert!(!buf.iter().any(|&value| value != thread_id));
                }
                scratch.reset();
            }
        }));
    }

    workers.into_iter().for_each(|worker| {
        worker.join().expect("Thread panicked");
    });

    // Verify 4 threads were tracked
    assert_eq!(manager.active_threads(), 4);
}
/// A manager configured through `for_model` can hold one step's activations and logits.
#[test]
fn test_memory_manager_for_model() {
    // Dimensions of a small LLM (e.g., Phi-2).
    let model_config = MemoryManagerConfig::for_model(
        2560,  // hidden_dim
        51200, // vocab_size
        1,     // batch_size
    );
    let manager = MemoryManager::with_config(model_config).expect("manager creation failed");

    // Arena capacity must exceed hidden_dim * 4 * sizeof(f32).
    let min_capacity = 2560 * 4 * 4;
    assert!(manager.arena.capacity() > min_capacity);

    // One simulated inference step: activations first, then logits.
    let activations: &mut [f32] = manager.arena.alloc(2560).expect("alloc activations");
    let logits: &mut [f32] = manager.arena.alloc(51200).expect("alloc logits");
    assert_eq!(logits.len(), 51200);
    assert_eq!(activations.len(), 2560);

    // Resetting for the next step must leave the arena unused.
    manager.reset_step();
    assert_eq!(manager.arena.used(), 0);
}
/// Requests by byte size should map to the expected size class; oversized requests get no buffer.
///
/// NOTE(review): for the three in-range requests the assertions run only when
/// `acquire_for_size` returns `Ok(Some(_))`; an `Err` or `None` is silently
/// accepted. Confirm whether that leniency is intended.
#[test]
fn test_buffer_size_selection() {
    let pool = BufferPool::new();

    // 500 bytes -> KB1 size class.
    if let Ok(Some(buf)) = pool.acquire_for_size(500) {
        assert!(buf.capacity() >= 500);
        assert_eq!(buf.size_class(), BufferSize::KB1);
    }

    // 3000 bytes -> KB4 size class.
    if let Ok(Some(buf)) = pool.acquire_for_size(3000) {
        assert!(buf.capacity() >= 3000);
        assert_eq!(buf.size_class(), BufferSize::KB4);
    }

    // 100000 bytes -> KB256 size class.
    if let Ok(Some(buf)) = pool.acquire_for_size(100000) {
        assert!(buf.capacity() >= 100000);
        assert_eq!(buf.size_class(), BufferSize::KB256);
    }

    // A request that is too large must not produce a buffer (an error also counts).
    assert!(
        !matches!(pool.acquire_for_size(500000), Ok(Some(_))),
        "Should not find buffer for 500KB"
    );
}
}