Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'

This commit is contained in:
ruv
2026-02-28 14:39:40 -05:00
7854 changed files with 3522914 additions and 0 deletions

View File

@@ -0,0 +1,862 @@
# RuvLLM API Reference
Complete API documentation for the RuvLLM crate.
## Table of Contents
- [Core Types](#core-types)
- [Backend Trait](#backend-trait)
- [Candle Backend](#candle-backend)
- [LoRA Module](#lora-module)
- [Optimization Module](#optimization-module)
- [Kernel Functions](#kernel-functions)
- [KV Cache](#kv-cache)
- [Error Handling](#error-handling)
---
## Core Types
### `Precision`
Numeric precision for model weights and KV cache.
```rust
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Precision {
/// Full 32-bit floating point
FP32,
/// Half precision 16-bit float
FP16,
/// Brain floating point (16-bit)
BF16,
/// 8-bit integer quantization
Q8,
/// 4-bit integer quantization
Q4,
/// 4-bit K-quant (GGML-style)
Q4K,
}
impl Precision {
/// Get bytes per element for this precision
pub fn bytes_per_element(&self) -> u8;
}
```
### `ModelSize`
Model size classification for routing.
```rust
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ModelSize {
Tiny, // < 1B params
Small, // 1-3B params
Medium, // 3-13B params
Large, // > 13B params
}
```
### `DeviceType`
Compute device selection.
```rust
#[derive(Debug, Clone, Copy)]
pub enum DeviceType {
/// CPU (fallback)
Cpu,
/// Apple Metal GPU
Metal,
/// NVIDIA CUDA GPU
Cuda(usize), // device index
}
```
---
## Backend Trait
### `LlmBackend`
Main trait for LLM inference backends.
```rust
pub trait LlmBackend: Send + Sync {
/// Load a model from HuggingFace Hub or local path
///
/// # Arguments
/// * `model_id` - HuggingFace model ID or local path
/// * `config` - Model configuration
///
/// # Example
/// ```
/// backend.load_model("Qwen/Qwen2.5-7B-Instruct", config)?;
/// ```
fn load_model(&mut self, model_id: &str, config: ModelConfig) -> Result<()>;
/// Generate text from a prompt
///
/// # Arguments
/// * `prompt` - Input text prompt
/// * `params` - Generation parameters
///
/// # Returns
/// Generated text response
///
/// # Example
/// ```
/// let response = backend.generate("Hello!", GenerateParams::default())?;
/// ```
fn generate(&self, prompt: &str, params: GenerateParams) -> Result<String>;
/// Streaming text generation
///
/// # Arguments
/// * `prompt` - Input text prompt
/// * `params` - Generation parameters
/// * `callback` - Called for each generated token
fn generate_stream<F>(&self, prompt: &str, params: GenerateParams, callback: F) -> Result<()>
where
F: FnMut(&str) -> bool;
/// Get the tokenizer for this model
fn tokenizer(&self) -> Option<&dyn Tokenizer>;
/// Get model metadata
fn model_info(&self) -> Option<ModelInfo>;
/// Check if a model is loaded
fn is_loaded(&self) -> bool;
}
```
### `ModelConfig`
Configuration for model loading.
```rust
#[derive(Debug, Clone)]
pub struct ModelConfig {
/// Maximum context length
pub max_context: usize,
/// Use Flash Attention
pub use_flash_attention: bool,
/// Weight quantization level
pub quantization: Precision,
/// KV cache configuration
pub kv_cache_config: KvCacheConfig,
/// Device to load model on
pub device: DeviceType,
/// HuggingFace token for gated models
pub hf_token: Option<String>,
}
impl Default for ModelConfig {
fn default() -> Self {
Self {
max_context: 4096,
use_flash_attention: true,
quantization: Precision::Q4K,
kv_cache_config: KvCacheConfig::default(),
device: DeviceType::Metal,
hf_token: None,
}
}
}
```
### `GenerateParams`
Parameters for text generation.
```rust
#[derive(Debug, Clone)]
pub struct GenerateParams {
/// Maximum tokens to generate
pub max_tokens: usize,
/// Sampling temperature (0.0 = deterministic)
pub temperature: f32,
/// Top-p (nucleus) sampling
pub top_p: f32,
/// Top-k sampling (0 = disabled)
pub top_k: usize,
/// Repetition penalty
pub repetition_penalty: f32,
/// Stop sequences
pub stop_sequences: Vec<String>,
/// Random seed for reproducibility
pub seed: Option<u64>,
}
impl Default for GenerateParams {
fn default() -> Self {
Self {
max_tokens: 256,
temperature: 0.7,
top_p: 0.9,
top_k: 0,
repetition_penalty: 1.1,
stop_sequences: vec![],
seed: None,
}
}
}
```
---
## Candle Backend
### `CandleBackend`
HuggingFace Candle-based inference backend.
```rust
impl CandleBackend {
/// Create a new backend with default device
///
/// # Example
/// ```
/// let backend = CandleBackend::new()?;
/// ```
pub fn new() -> Result<Self>;
/// Create with specific device
///
/// # Example
/// ```
/// let backend = CandleBackend::with_device(DeviceType::Metal)?;
/// ```
pub fn with_device(device: DeviceType) -> Result<Self>;
/// Download model from HuggingFace Hub
///
/// # Arguments
/// * `model_id` - HuggingFace model ID
/// * `quantization` - Target quantization
/// * `cache_dir` - Local cache directory
///
/// # Example
/// ```
/// let path = backend.download_model(
/// "Qwen/Qwen2.5-7B-Instruct",
/// Precision::Q4K,
/// "~/.cache/ruvllm"
/// ).await?;
/// ```
pub async fn download_model(
&self,
model_id: &str,
quantization: Precision,
cache_dir: &str,
) -> Result<PathBuf>;
/// Get current device
pub fn device(&self) -> DeviceType;
/// Get memory usage statistics
pub fn memory_stats(&self) -> MemoryStats;
}
```
---
## LoRA Module
### `MicroLoRA`
Real-time per-request fine-tuning with rank 1-2 adapters.
```rust
impl MicroLoRA {
/// Create a new MicroLoRA instance
///
/// # Example
/// ```
/// let config = MicroLoraConfig::for_hidden_dim(4096);
/// let lora = MicroLoRA::new(config);
/// ```
pub fn new(config: MicroLoraConfig) -> Self;
/// Adapt on new input with feedback
///
/// # Arguments
/// * `input` - Input embedding vector
/// * `feedback` - Quality feedback for learning
///
/// # Example
/// ```
/// let feedback = AdaptFeedback::from_quality(0.9);
/// lora.adapt(&input_embedding, feedback)?;
/// ```
pub fn adapt(&self, input: &[f32], feedback: AdaptFeedback) -> Result<()>;
/// Forward pass through LoRA adapter
///
/// # Arguments
/// * `input` - Input tensor
/// * `module` - Target module (Q, K, V, O projections)
///
/// # Returns
/// Output with LoRA contribution added
///
/// # Example
/// ```
/// let output = lora.forward(&input, &TargetModule::QProj);
/// ```
pub fn forward(&self, input: &[f32], module: &TargetModule) -> Vec<f32>;
/// Forward pass that adds to existing output (in-place)
pub fn forward_add(&self, input: &[f32], module: &TargetModule, output: &mut [f32]);
/// Apply accumulated gradient updates
///
/// # Arguments
/// * `learning_rate` - Learning rate for update
pub fn apply_updates(&self, learning_rate: f32);
/// Apply updates with EWC++ regularization
///
/// # Arguments
/// * `learning_rate` - Learning rate
/// * `ewc_states` - EWC++ state per module
/// * `ewc_lambda` - EWC regularization strength
pub fn apply_updates_with_ewc(
&self,
learning_rate: f32,
ewc_states: &HashMap<TargetModule, EwcState>,
ewc_lambda: f32,
);
/// Reset all adapter weights
pub fn reset(&self);
/// Get adapter statistics
pub fn stats(&self) -> MicroLoraStats;
}
```
### `MicroLoraConfig`
Configuration for MicroLoRA adapters.
```rust
#[derive(Debug, Clone)]
pub struct MicroLoraConfig {
/// Input feature dimension
pub in_features: usize,
/// Output feature dimension
pub out_features: usize,
/// LoRA rank (1-2 for MicroLoRA)
pub rank: usize,
/// LoRA alpha scaling factor
pub alpha: f32,
/// Dropout probability
pub dropout: f32,
/// Target modules to adapt
pub target_modules: Vec<TargetModule>,
/// Enable gradient checkpointing
pub gradient_checkpointing: bool,
}
impl MicroLoraConfig {
/// Create config for a specific hidden dimension
///
/// # Example
/// ```
/// let config = MicroLoraConfig::for_hidden_dim(4096);
/// assert_eq!(config.in_features, 4096);
/// assert_eq!(config.rank, 2);
/// ```
pub fn for_hidden_dim(hidden_dim: usize) -> Self;
}
```
### `TargetModule`
Transformer modules that can be adapted.
```rust
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TargetModule {
/// Query projection
QProj,
/// Key projection
KProj,
/// Value projection
VProj,
/// Output projection
OProj,
/// Gate projection (FFN)
GateProj,
/// Up projection (FFN)
UpProj,
/// Down projection (FFN)
DownProj,
}
```
### `AdaptFeedback`
Feedback for LoRA adaptation.
```rust
#[derive(Debug, Clone)]
pub struct AdaptFeedback {
/// Quality score (0.0 - 1.0)
pub quality: f32,
/// Gradient estimate from feedback
pub gradient_estimate: Vec<f32>,
/// Optional reward signal
pub reward: Option<f32>,
/// Latency in microseconds
pub latency_us: u64,
/// Source module (optional)
pub source_module: Option<TargetModule>,
/// Session identifier
pub session_id: Option<String>,
}
impl AdaptFeedback {
/// Create feedback from quality score
///
/// # Example
/// ```
/// let feedback = AdaptFeedback::from_quality(0.85);
/// ```
pub fn from_quality(quality: f32) -> Self;
}
```
---
## Optimization Module
### `SonaLlm`
SONA learning integration for LLM inference.
```rust
impl SonaLlm {
/// Create new SONA LLM integration
///
/// # Example
/// ```
/// let sona = SonaLlm::new(SonaLlmConfig::default());
/// ```
pub fn new(config: SonaLlmConfig) -> Self;
/// Instant loop: per-request MicroLoRA adaptation
///
/// Target latency: <1ms
///
/// # Arguments
/// * `request` - User query text
/// * `response` - Model response text
/// * `feedback` - Quality score (0.0 - 1.0)
///
/// # Returns
/// Adaptation result with statistics
///
/// # Example
/// ```
/// let result = sona.instant_adapt(
/// "What is machine learning?",
/// "Machine learning is...",
/// 0.9
/// );
/// assert!(result.applied);
/// assert!(result.latency_us < 1000); // <1ms
/// ```
pub fn instant_adapt(&self, request: &str, response: &str, feedback: f32) -> AdaptationResult;
/// Background loop: consolidate patterns
///
/// Called periodically (~100ms interval)
///
/// # Example
/// ```
/// let result = sona.background_consolidate();
/// println!("Consolidated {} samples", result.samples_used);
/// ```
pub fn background_consolidate(&self) -> AdaptationResult;
/// Deep loop: trigger full optimization
///
/// # Arguments
/// * `dataset` - Training samples to learn from
pub fn deep_optimize(&self, dataset: &[TrainingSample]) -> AdaptationResult;
/// Check if background loop should run
pub fn maybe_background(&self) -> Option<AdaptationResult>;
/// Check if deep loop should be triggered
pub fn should_trigger_deep(&self) -> bool;
/// Get current statistics
pub fn stats(&self) -> LearningLoopStats;
/// Forward pass through MicroLoRA
pub fn forward(&self, input: &[f32], module: &TargetModule) -> Vec<f32>;
/// Reset all learning state
pub fn reset(&self);
}
```
### `SonaLlmConfig`
Configuration for SONA LLM integration.
```rust
#[derive(Debug, Clone)]
pub struct SonaLlmConfig {
/// MicroLoRA configuration
pub micro_lora: MicroLoraConfig,
/// Training pipeline configuration
pub training: TrainingConfig,
/// SONA core configuration
pub sona: SonaConfig,
/// Instant loop learning rate
pub instant_lr: f32,
/// Background loop interval (milliseconds)
pub background_interval_ms: u64,
/// Minimum samples for background consolidation
pub background_min_samples: usize,
/// Deep loop trigger threshold
pub deep_trigger_threshold: f32,
/// Maximum pending samples
pub max_pending_samples: usize,
/// Consolidation strategy
pub consolidation_strategy: ConsolidationStrategy,
}
```
### `ConsolidationStrategy`
Strategy for consolidating learned patterns.
```rust
#[derive(Debug, Clone, Copy)]
pub enum ConsolidationStrategy {
/// Merge with EWC++ regularization (default)
EwcMerge,
/// Simple averaging
Average,
/// Weighted by quality
QualityWeighted,
/// Keep best performing only
BestOnly,
/// Ensemble multiple adapters
Ensemble,
}
```
---
## Kernel Functions
### Attention Kernels
```rust
/// Flash Attention 2 with NEON SIMD optimization
///
/// Memory-efficient attention with O(N) complexity.
///
/// # Arguments
/// * `query` - Query tensor (head_dim,)
/// * `key` - Key tensor (kv_len, head_dim)
/// * `value` - Value tensor (kv_len, head_dim)
/// * `scale` - Softmax scale (typically 1/sqrt(head_dim))
/// * `causal` - Apply causal masking
///
/// # Returns
/// Output tensor (head_dim,)
///
/// # Example
/// ```
/// let scale = 1.0 / (head_dim as f32).sqrt();
/// let output = flash_attention_neon(&query, &key, &value, scale, true);
/// ```
pub fn flash_attention_neon(
query: &[f32],
key: &[f32],
value: &[f32],
scale: f32,
causal: bool,
) -> Vec<f32>;
/// Paged Attention for KV cache
///
/// # Arguments
/// * `query` - Query tensor
/// * `kv_cache` - Paged KV cache
/// * `block_tables` - Block index mapping
/// * `scale` - Softmax scale
pub fn paged_attention_neon(
query: &[f32],
kv_cache: &PagedKvCache,
block_tables: &[usize],
scale: f32,
) -> Vec<f32>;
/// Grouped-Query Attention (GQA)
///
/// KV heads shared among query head groups.
///
/// # Arguments
/// * `queries` - Query tensor (num_heads, head_dim)
/// * `keys` - Key tensor (kv_len, num_kv_heads, head_dim)
/// * `values` - Value tensor (kv_len, num_kv_heads, head_dim)
/// * `config` - Attention configuration
pub fn grouped_query_attention_neon(
queries: &[f32],
keys: &[f32],
values: &[f32],
config: &AttentionConfig,
) -> Vec<f32>;
/// Multi-Query Attention (MQA)
///
/// Single KV head shared across all query heads.
pub fn multi_query_attention_neon(
queries: &[f32],
key: &[f32],
value: &[f32],
config: &AttentionConfig,
) -> Vec<f32>;
```
### `AttentionConfig`
Configuration for attention operations.
```rust
#[derive(Debug, Clone)]
pub struct AttentionConfig {
/// Number of query heads
pub num_heads: usize,
/// Number of KV heads (for GQA)
pub num_kv_heads: usize,
/// Dimension per head
pub head_dim: usize,
/// Apply causal masking
pub causal: bool,
/// Custom scale factor (None = 1/sqrt(head_dim))
pub scale: Option<f32>,
}
impl AttentionConfig {
/// Calculate GQA ratio (query heads / KV heads)
pub fn gqa_ratio(&self) -> usize;
/// Get effective scale factor
pub fn effective_scale(&self) -> f32;
}
```
---
## KV Cache
### `TwoTierKvCache`
Two-tier KV cache with FP16 tail and quantized store.
```rust
impl TwoTierKvCache {
/// Create a new two-tier KV cache
///
/// # Example
/// ```
/// let config = KvCacheConfig {
/// tail_length: 256,
/// max_tokens: 4096,
/// ..Default::default()
/// };
/// let cache = TwoTierKvCache::new(config);
/// ```
pub fn new(config: KvCacheConfig) -> Self;
/// Append new KV pairs
///
/// Automatically handles:
/// - Adding to tail
/// - Migrating to quantized store
/// - Evicting oldest tokens
///
/// # Arguments
/// * `keys` - Key tensor
/// * `values` - Value tensor
///
/// # Example
/// ```
/// cache.append(&keys, &values)?;
/// ```
pub fn append(&self, keys: &[f32], values: &[f32]) -> Result<()>;
/// Get all KV pairs for attention
///
/// Returns (keys, values) with cold tier dequantized.
pub fn get_all_kv(&self) -> (Vec<f32>, Vec<f32>);
/// Compute attention with tier-aware access
///
/// # Arguments
/// * `query` - Query tensor
/// * `scale` - Softmax scale
pub fn attend(&self, query: &[f32], scale: f32) -> Result<Vec<f32>>;
/// Get current statistics
pub fn stats(&self) -> KvCacheStats;
/// Clear the cache
pub fn clear(&self);
/// Update quantization policy
pub fn update_policy(&self, policy: CacheQuantization);
}
```
### `KvCacheConfig`
Configuration for KV cache.
```rust
#[derive(Debug, Clone)]
pub struct KvCacheConfig {
/// Tokens to keep in high-precision tail
pub tail_length: usize,
/// Precision for tail storage
pub tail_precision: Precision,
/// Precision for quantized store
pub store_precision: Precision,
/// Maximum total tokens
pub max_tokens: usize,
/// Number of KV heads
pub num_kv_heads: usize,
/// Head dimension
pub head_dim: usize,
/// Migration batch size
pub migration_batch: usize,
}
```
### `KvCacheStats`
Statistics for KV cache usage.
```rust
#[derive(Debug, Clone)]
pub struct KvCacheStats {
/// Total tokens cached
pub total_tokens: usize,
/// Tokens in high-precision tail
pub tail_tokens: usize,
/// Tokens in quantized store
pub store_tokens: usize,
/// Bytes used by tail
pub tail_bytes: usize,
/// Bytes used by store
pub store_bytes: usize,
/// Compression ratio
pub compression_ratio: f32,
}
```
---
## Error Handling
### `RuvLLMError`
Main error type for RuvLLM operations.
```rust
#[derive(Error, Debug)]
pub enum RuvLLMError {
/// Storage-related errors
#[error("Storage error: {0}")]
Storage(String),
/// Session management errors
#[error("Session error: {0}")]
Session(String),
/// KV cache errors
#[error("KV cache error: {0}")]
KvCache(String),
/// Paged attention errors
#[error("Paged attention error: {0}")]
PagedAttention(String),
/// Adapter management errors
#[error("Adapter error: {0}")]
Adapter(String),
/// SONA learning errors
#[error("SONA error: {0}")]
Sona(String),
/// Configuration errors
#[error("Configuration error: {0}")]
Config(String),
/// Out of memory
#[error("Out of memory: {0}")]
OutOfMemory(String),
/// Invalid operation
#[error("Invalid operation: {0}")]
InvalidOperation(String),
/// Not found
#[error("Not found: {0}")]
NotFound(String),
/// Backend inference errors
#[error("Backend error: {0}")]
Backend(String),
/// Model loading errors
#[error("Model error: {0}")]
Model(String),
/// Tokenization errors
#[error("Tokenization error: {0}")]
Tokenization(String),
/// Generation errors
#[error("Generation error: {0}")]
Generation(String),
/// IO errors
#[error("IO error: {0}")]
Io(#[from] std::io::Error),
}
```
### `Result` Type Alias
```rust
/// Result type alias for RuvLLM operations
pub type Result<T> = std::result::Result<T, RuvLLMError>;
```
---
## Feature Flags Reference
| Feature | Dependencies | Description |
|---------|-------------|-------------|
| `default` | `async-runtime` | Standard async support |
| `async-runtime` | `tokio` | Tokio async runtime |
| `wasm` | - | WebAssembly support |
| `candle` | `candle-*`, `tokenizers`, `hf-hub` | Candle ML backend |
| `metal` | `candle/metal` | Apple Metal GPU |
| `cuda` | `candle/cuda` | NVIDIA CUDA GPU |
| `inference-metal` | `candle`, `metal` | Full Metal stack |
| `inference-cuda` | `candle`, `cuda` | Full CUDA stack |

View File

@@ -0,0 +1,402 @@
# RuvLLM Architecture (v2.0.0)
This document describes the system architecture of RuvLLM, a high-performance LLM inference engine optimized for Apple Silicon.
## v2.0.0 New Features
| Feature | Description | Performance Impact |
|---------|-------------|-------------------|
| Multi-threaded GEMM/GEMV | Rayon parallelization | 12.7x speedup on M4 Pro |
| Flash Attention 2 | Auto block sizing | +10% throughput |
| Quantized Inference | INT8/INT4/Q4_K kernels | 4-8x memory reduction |
| Metal GPU Shaders | simdgroup_matrix ops | 3x speedup |
| Memory Pool | Arena allocator | Zero-alloc inference |
| WASM Support | Browser inference | ~2.5x overhead |
| npm Integration | @ruvector/ruvllm | JavaScript/TypeScript API |
## System Overview
```
+----------------------------------+
| User Application |
+----------------------------------+
|
v
+-------------------------------------------------------------------------------------+
| RuvLLM Core |
| +-------------------------------------------------------------------------------+ |
| | Backend Abstraction | |
| | +-------------------------+ +-------------------------+ | |
| | | Candle Backend | | mistral-rs Backend | | |
| | | - Model Loading | | - Model Loading | | |
| | | - Tokenization | | - Tokenization | | |
| | | - Forward Pass | | - Forward Pass | | |
| | +-------------------------+ +-------------------------+ | |
| +-------------------------------------------------------------------------------+ |
| | |
| +-------------------------------------------------------------------------------+ |
| | SONA Learning Layer | |
| | +---------------------+ +----------------------+ +---------------------+ | |
| | | Instant Loop | | Background Loop | | Deep Loop | | |
| | | (<1ms latency) | | (~100ms interval) | | (minutes/hours) | | |
| | | - MicroLoRA adapt | | - Pattern merge | | - Full fine-tune | | |
| | | - Per-request | | - EWC++ update | | - Model distill | | |
| | +---------------------+ +----------------------+ +---------------------+ | |
| +-------------------------------------------------------------------------------+ |
| | |
| +-------------------------------------------------------------------------------+ |
| | Optimized Kernels | |
| | +------------------+ +------------------+ +------------------+ | |
| | | Attention | | Normalization | | Embedding | | |
| | | - Flash Attn 2 | | - RMSNorm | | - RoPE | | |
| | | - Paged Attn | | - LayerNorm | | - Token Embed | | |
| | | - GQA/MQA | | - Fused Ops | | - Pos Embed | | |
| | +------------------+ +------------------+ +------------------+ | |
| +-------------------------------------------------------------------------------+ |
| | |
| +-------------------------------------------------------------------------------+ |
| | Memory Management | |
| | +-------------------------+ +-------------------------------------------+ | |
| | | Two-Tier KV Cache | | Memory Pool | | |
| | | +-------------------+ | | - Slab allocator | | |
| | | | FP16 Tail (hot) | | | - Arena allocation | | |
| | | +-------------------+ | | - Zero-copy transfers | | |
| | | | Q4 Store (cold) | | | | | |
| | | +-------------------+ | +-------------------------------------------+ | |
| | +-------------------------+ | |
| +-------------------------------------------------------------------------------+ |
+-------------------------------------------------------------------------------------+
|
v
+-------------------------------------------------------------------------------------+
| Hardware Acceleration |
| +---------------------------+ +---------------------------+ |
| | Metal (Apple GPU) | | CUDA (NVIDIA) | |
| | - MLX integration | | - cuBLAS | |
| | - Metal Performance | | - cuDNN | |
| | Shaders | | - TensorRT | |
| +---------------------------+ +---------------------------+ |
+-------------------------------------------------------------------------------------+
```
## Component Architecture
### 1. Backend Abstraction Layer
The backend abstraction provides a unified interface for different ML frameworks.
```
+---------------------------+
| LlmBackend Trait |
| - load_model() |
| - generate() |
| - forward() |
| - tokenizer() |
+---------------------------+
^
|
+------+------+
| |
+-------+ +-----------+
|Candle | |mistral-rs |
+-------+ +-----------+
```
**Candle Backend Features:**
- HuggingFace model hub integration
- Native Rust tensor operations
- Metal/CUDA acceleration
- Safetensors loading
### 2. SONA Learning Layer
Self-Optimizing Neural Architecture with three learning loops:
```
+-------------------+ +-------------------+
| Inference Request |---->| Instant Loop |
| + feedback | | - MicroLoRA adapt |
+-------------------+ | - <1ms latency |
+--------+----------+
|
v (async, 100ms)
+--------+----------+
| Background Loop |
| - Pattern merge |
| - Adapter compose |
| - EWC++ update |
+--------+----------+
|
v (triggered)
+--------+----------+
| Deep Loop |
| - Full fine-tune |
| - Model distill |
| - Pattern bank |
+-------------------+
```
**Loop Characteristics:**
| Loop | Latency | Trigger | Purpose |
|------|---------|---------|---------|
| Instant | <1ms | Per-request | Real-time adaptation |
| Background | ~100ms | Interval/threshold | Pattern consolidation |
| Deep | Minutes | Accumulated quality | Full optimization |
### 3. Optimized Kernel Layer
NEON SIMD-optimized kernels for ARM64:
```
+-----------------------------------------------+
| Attention Kernels |
+-----------------------------------------------+
| |
| +------------------+ +------------------+ |
| | Flash Attention | | Paged Attention | |
| | - Tiled QKV | | - Block tables | |
| | - Online softmax| | - Non-contiguous| |
| | - O(N) memory | | - KV cache aware| |
| +------------------+ +------------------+ |
| |
| +------------------+ +------------------+ |
| | Multi-Query (MQA)| | Grouped-Query | |
| | - 1 KV head | | - KV groups | |
| | - Shared KV | | - 4-8x savings | |
| +------------------+ +------------------+ |
+-----------------------------------------------+
+-----------------------------------------------+
| Normalization Kernels |
+-----------------------------------------------+
| +------------------+ +------------------+ |
| | RMSNorm | | LayerNorm | |
| | - NEON SIMD | | - NEON SIMD | |
| | - Fused ops | | - Fused ops | |
| +------------------+ +------------------+ |
+-----------------------------------------------+
+-----------------------------------------------+
| Embedding Kernels |
+-----------------------------------------------+
| +------------------+ +------------------+ |
| | Rotary Position | | Token Embedding | |
| | (RoPE) | | - Lookup table | |
| | - Precomputed | | - Batch gather | |
| +------------------+ +------------------+ |
+-----------------------------------------------+
```
### 4. Memory Management
Two-tier KV cache for optimal memory/quality tradeoff:
```
+----------------------------------------------------+
| Two-Tier KV Cache |
+----------------------------------------------------+
| |
| Position: 0 tail_length max |
| +------------------+------------------+ |
| | | | |
| | Quantized Store | High-Precision | |
| | (Cold) | Tail (Hot) | |
| | | | |
| | - Q4/Q8 format | - FP16 format | |
| | - Older tokens | - Recent tokens | |
| | - 4x smaller | - Full quality | |
| | | | |
| +------------------+------------------+ |
| |
| Migration: Hot -> Cold (when tail_length exceeded)|
| Eviction: Cold first, then Hot |
+----------------------------------------------------+
```
**Cache Operations:**
1. **Append**: Add new KV pairs to tail
2. **Migrate**: Move old tokens from tail to quantized store
3. **Evict**: Remove oldest tokens when max exceeded
4. **Attend**: Dequantize cold + use hot for attention
## Data Flow
### Inference Pipeline
```
Input Tokens
|
v
+--------------------+
| Token Embedding |
| + RoPE Position |
+--------------------+
|
v (for each layer)
+--------------------+
| Attention Layer |
| +---------------+|
| | Q,K,V Project ||
| +---------------+|
| | |
| +---------------+|
| | KV Cache ||
| | Update ||
| +---------------+|
| | |
| +---------------+|
| | Flash/Paged ||
| | Attention ||
| +---------------+|
| | |
| +---------------+|
| | Output Proj ||
| +---------------+|
+--------------------+
|
v
+--------------------+
| FFN Layer |
| - Gate Proj |
| - Up Proj |
| - Down Proj |
| - Activation |
+--------------------+
|
v
+--------------------+
| RMSNorm |
+--------------------+
|
v
+--------------------+
| LM Head |
| (final layer) |
+--------------------+
|
v
Logits -> Sampling -> Token
```
### Learning Pipeline
```
Request + Response + Feedback
|
v
+---------------------------+
| Instant Loop |
| - Compute embeddings |
| - Apply MicroLoRA |
| - Queue for background |
+---------------------------+
|
v (async)
+---------------------------+
| Background Loop |
| - Batch samples |
| - Update EWC++ Fisher |
| - Merge adapters |
| - Store in ReasoningBank |
+---------------------------+
|
v (threshold triggered)
+---------------------------+
| Deep Loop |
| - Full training pipeline |
| - Pattern distillation |
| - Catastrophic forget |
| prevention (EWC++) |
+---------------------------+
```
## Module Structure
```
ruvllm/
├── src/
│ ├── lib.rs # Crate root, re-exports
│ ├── error.rs # Error types
│ ├── types.rs # Common types (Precision, etc.)
│ │
│ ├── backends/ # ML framework backends
│ │ ├── mod.rs # Backend trait
│ │ ├── candle_backend.rs
│ │ └── config.rs
│ │
│ ├── kernels/ # Optimized kernels
│ │ ├── mod.rs # Kernel exports
│ │ ├── attention.rs # Attention variants
│ │ ├── matmul.rs # Matrix multiplication
│ │ ├── norm.rs # Normalization ops
│ │ └── rope.rs # Rotary embeddings
│ │
│ ├── lora/ # LoRA adapters
│ │ ├── mod.rs # LoRA exports
│ │ ├── micro_lora.rs # Real-time MicroLoRA
│ │ └── training.rs # Training pipeline
│ │
│ ├── optimization/ # SONA integration
│ │ ├── mod.rs
│ │ └── sona_llm.rs # Learning loops
│ │
│ ├── kv_cache.rs # Two-tier KV cache
│ ├── sona.rs # SONA core integration
│ ├── policy_store.rs # Learned policies
│ └── witness_log.rs # Inference logging
└── benches/ # Benchmarks
├── attention_bench.rs
├── lora_bench.rs
└── e2e_bench.rs
```
## Performance Characteristics
### Memory Layout
| Component | Memory Pattern | Optimization |
|-----------|---------------|--------------|
| KV Cache Tail | Sequential | NEON vectorized |
| KV Cache Store | Quantized blocks | Batch dequant |
| Model Weights | Memory-mapped | Zero-copy |
| Intermediate | Stack allocated | Arena alloc |
### Throughput Targets (M4 Pro)
| Operation | Target | Achieved |
|-----------|--------|----------|
| Flash Attention | 2.5x vs naive | ~2.3x |
| Paged Attention | 1.8x vs contiguous | ~1.7x |
| GQA vs MHA | 4x less KV memory | 4x |
| MicroLoRA adapt | <1ms | ~0.5ms |
## Integration Points
### With RuVector Core
```rust
// Memory backend integration
use ruvector_core::storage::Storage;
// SONA learning integration
use ruvector_sona::{SonaEngine, ReasoningBank};
```
### With External Systems
- **HuggingFace Hub**: Model downloads
- **OpenAI API**: Compatible inference endpoint
- **Prometheus**: Metrics export
- **gRPC**: High-performance RPC
## Future Architecture
Planned enhancements:
1. **Speculative Decoding**: Draft model integration
2. **Tensor Parallelism**: Multi-GPU support
3. **Continuous Batching**: Dynamic batch scheduling
4. **PagedAttention v2**: vLLM-style memory management

View File

@@ -0,0 +1,523 @@
# RuvLLM Fine-Tuning Guide
This guide covers RuvLLM's fine-tuning capabilities, including MicroLoRA for real-time adaptation and EWC++ for preventing catastrophic forgetting.
## Overview
RuvLLM provides three levels of fine-tuning:
| Level | Technique | Latency | Use Case |
|-------|-----------|---------|----------|
| Instant | MicroLoRA | <1ms | Per-request adaptation |
| Background | Adapter Merge + EWC++ | ~100ms | Pattern consolidation |
| Deep | Full Training Pipeline | Minutes | Periodic optimization |
## MicroLoRA: Real-Time Adaptation
MicroLoRA enables per-request fine-tuning with minimal overhead.
### How It Works
```
User Request
|
v
+------------------+
| Compute Input |
| Embedding |
+------------------+
|
v
+------------------+ +------------------+
| Base Model |--->| MicroLoRA Delta |
| Forward Pass | | (rank 1-2) |
+------------------+ +------------------+
| |
+----------+---------------+
|
v
+------------------+
| Combined Output |
+------------------+
|
v
Response + Quality Feedback
|
v
+------------------+
| Update MicroLoRA |
| Weights |
+------------------+
```
### Basic Usage
```rust
use ruvllm::lora::{MicroLoRA, MicroLoraConfig, AdaptFeedback, TargetModule};
// Create MicroLoRA for 4096-dim hidden states
let config = MicroLoraConfig::for_hidden_dim(4096);
let lora = MicroLoRA::new(config);
// During inference: apply LoRA delta
let base_output = model.forward(&input)?;
let lora_delta = lora.forward(&input, &TargetModule::QProj);
// Combine outputs
let output: Vec<f32> = base_output.iter()
.zip(lora_delta.iter())
.map(|(b, d)| b + d)
.collect();
// After response: adapt based on feedback
let feedback = AdaptFeedback::from_quality(0.85);
lora.adapt(&input, feedback)?;
// Periodically apply accumulated gradients
lora.apply_updates(0.01); // learning rate
```
### Configuration Options
```rust
let config = MicroLoraConfig {
// Input/output dimensions (typically hidden_dim)
in_features: 4096,
out_features: 4096,
// LoRA rank: 1-2 for micro, 4-8 for standard
rank: 2,
// Scaling factor (effective scale = alpha / rank)
alpha: 4.0,
// Dropout for regularization
dropout: 0.0,
// Which modules to adapt
target_modules: vec![
TargetModule::QProj,
TargetModule::VProj,
],
// Memory optimization
gradient_checkpointing: false,
};
```
### Target Modules
Choose which transformer components to adapt:
| Module | Description | Memory | Impact |
|--------|-------------|--------|--------|
| `QProj` | Query projection | Low | High (attention focus) |
| `KProj` | Key projection | Low | Medium |
| `VProj` | Value projection | Low | High (content) |
| `OProj` | Output projection | Low | Medium |
| `GateProj` | FFN gate | Medium | High (routing) |
| `UpProj` | FFN up | High | Medium |
| `DownProj` | FFN down | High | Medium |
**Recommended combinations:**
- **Speed-focused**: `QProj` only
- **Quality-focused**: `QProj`, `VProj`
- **Full adaptation**: All attention projections
## EWC++ (Elastic Weight Consolidation)
EWC++ prevents catastrophic forgetting when adapting to new tasks.
### How It Works
```
Task 1 Training
|
v
+------------------+
| Compute Fisher |
| Information |
| F = E[grad^2] |
+------------------+
|
v
+------------------+
| Store Optimal |
| Weights θ* |
+------------------+
...later...
Task 2 Training
|
v
+------------------+
| Regularized Loss |
| L = L_task + |
| λ Σ F_i(θ-θ*)² |
+------------------+
|
v
+------------------+
| Update with |
| Importance |
| Weights |
+------------------+
```
### Using EWC++ with MicroLoRA
```rust
use ruvllm::lora::{MicroLoRA, TrainingPipeline, TrainingConfig};
// Create training pipeline with EWC++
let training_config = TrainingConfig {
learning_rate: 0.001,
ewc_lambda: 0.1, // Regularization strength
..Default::default()
};
let mut pipeline = TrainingPipeline::new(training_config);
pipeline.init_for_lora(&lora);
// Train on task 1
for sample in task1_samples {
pipeline.train_step(&lora, &sample.input, sample.feedback)?;
}
// Mark end of task 1 (computes Fisher information)
pipeline.start_new_task(&lora);
// Train on task 2 (EWC++ regularization active)
for sample in task2_samples {
pipeline.train_step(&lora, &sample.input, sample.feedback)?;
}
```
### EWC++ Configuration
```rust
let config = TrainingConfig {
// Base learning rate
learning_rate: 0.001,
// EWC regularization strength
// Higher = more preservation of old knowledge
// Lower = more adaptation to new tasks
ewc_lambda: 0.1,
// Minimum quality for learning
quality_threshold: 0.5,
// Fisher information estimation samples
fisher_samples: 100,
// Online Fisher update rate
online_ewc_gamma: 0.95,
};
```
## SONA Learning Loops
SONA provides automated multi-tier learning.
### Architecture
```
+-------------------+ +-------------------+
| Inference Request |---->| Instant Loop |
| + feedback | | - MicroLoRA adapt |
+-------------------+ | - <1ms latency |
+--------+----------+
|
v (async, 100ms)
+--------+----------+
| Background Loop |
| - Pattern merge |
| - Adapter compose |
| - EWC++ update |
+--------+----------+
|
v (triggered)
+--------+----------+
| Deep Loop |
| - Full fine-tune |
| - Model distill |
| - Pattern bank |
+-------------------+
```
### Using SONA
```rust
use ruvllm::optimization::{SonaLlm, SonaLlmConfig};
// Create SONA integration
let config = SonaLlmConfig {
instant_lr: 0.01,
background_interval_ms: 100,
background_min_samples: 10,
deep_trigger_threshold: 100.0,
consolidation_strategy: ConsolidationStrategy::EwcMerge,
..Default::default()
};
let sona = SonaLlm::new(config);
// During inference
let response = model.generate(&query)?;
// Record feedback (runs instant loop)
let result = sona.instant_adapt(&query, &response, 0.85);
println!("Instant adapt latency: {}μs", result.latency_us);
// Periodically check background loop
if let Some(bg_result) = sona.maybe_background() {
println!("Background: {} samples, quality delta: {:.3}",
bg_result.samples_used, bg_result.quality_delta);
}
// Check if deep loop should trigger
if sona.should_trigger_deep() {
let samples = collect_training_samples();
let deep_result = sona.deep_optimize(&samples);
println!("Deep optimization complete");
}
```
### Consolidation Strategies
```rust
pub enum ConsolidationStrategy {
/// EWC++ merge (default) - preserves important weights
EwcMerge,
/// Simple averaging - fast but may lose specialization
Average,
/// Quality-weighted - higher quality samples have more influence
QualityWeighted,
/// Best only - keep top 20% by quality
BestOnly,
/// Ensemble - maintain multiple adapters
Ensemble,
}
```
**Recommendations:**
- `EwcMerge`: Best for multi-domain use
- `QualityWeighted`: Best for quality optimization
- `BestOnly`: Best for high-variance feedback
- `Ensemble`: Best when you have distinct use cases
## Training Data Format
### TrainingSample
```rust
pub struct TrainingSample {
/// Input embedding
pub input_embedding: Vec<f32>,
/// Output embedding
pub output_embedding: Vec<f32>,
/// Query text (optional)
pub query: Option<String>,
/// Response text (optional)
pub response: Option<String>,
/// Quality score (0.0 - 1.0)
pub quality: f32,
/// Latency in milliseconds
pub latency_ms: f32,
/// Token count
pub token_count: usize,
/// Session identifier
pub session_id: String,
}
```
### Creating Training Samples
```rust
let sample = TrainingSample::new(
input_embedding,
output_embedding,
0.9, // quality
)
.with_query("What is machine learning?".to_string())
.with_response("Machine learning is...".to_string())
.with_latency(150.0) // ms
.with_session("session-123".to_string());
```
## Adapter Management
### Saving and Loading Adapters
```rust
// Save adapter state
let adapter_bytes = lora.export_weights()?;
std::fs::write("adapter.bin", &adapter_bytes)?;
// Load adapter state
let adapter_bytes = std::fs::read("adapter.bin")?;
lora.import_weights(&adapter_bytes)?;
```
### Merging Adapters
```rust
// Merge multiple adapters with weights
let adapters = vec![
(adapter1, 0.6), // 60% weight
(adapter2, 0.4), // 40% weight
];
let merged = MicroLoRA::merge_adapters(&adapters)?;
```
### Adapter Composition
```rust
// Sequential composition: adapter1 -> adapter2
let composed = MicroLoRA::compose_sequential(&[adapter1, adapter2])?;
// Parallel composition: average outputs
let composed = MicroLoRA::compose_parallel(&[adapter1, adapter2])?;
```
## Best Practices
### 1. Quality Threshold Selection
```rust
let config = TrainingConfig {
// Too low: learns from poor examples
// Too high: learns very slowly
// Recommended: 0.5 - 0.7
quality_threshold: 0.6,
..Default::default()
};
```
### 2. Learning Rate Scheduling
```rust
// Start high for quick adaptation
let initial_lr = 0.01;
// Reduce over time for stability
let decay_lr = |epoch: usize| -> f32 {
initial_lr * 0.95_f32.powi(epoch as i32)
};
```
### 3. Memory Management
```rust
// For memory-constrained environments
let config = MicroLoraConfig {
rank: 1, // Minimum rank
target_modules: vec![TargetModule::QProj], // Single module
gradient_checkpointing: true,
..Default::default()
};
```
### 4. Preventing Overfitting
```rust
let config = MicroLoraConfig {
dropout: 0.1, // Add regularization
..Default::default()
};
let training_config = TrainingConfig {
ewc_lambda: 0.5, // Strong regularization
..Default::default()
};
```
## Monitoring and Debugging
### Statistics
```rust
let stats = sona.stats();
println!("Learning Statistics:");
println!(" Instant updates: {}", stats.instant_count);
println!(" Avg instant latency: {:.2}μs", stats.instant_avg_latency_us);
println!(" Background updates: {}", stats.background_count);
println!(" Pending samples: {}", stats.pending_samples);
println!(" Accumulated quality: {:.2}", stats.accumulated_quality);
```
### Debugging Adaptation
```rust
// Enable debug logging
std::env::set_var("RUST_LOG", "ruvllm::lora=debug");
// Check adaptation result
let result = sona.instant_adapt(&query, &response, feedback);
if !result.applied {
println!("Adaptation skipped: {:?}", result.notes);
}
```
## Performance Tuning
### Latency Optimization
| Setting | Low Latency | Balanced | High Quality |
|---------|-------------|----------|--------------|
| LoRA rank | 1 | 2 | 4 |
| Target modules | 1 | 2 | 4 |
| Background interval | 200ms | 100ms | 50ms |
| EWC lambda | 0.0 | 0.1 | 0.5 |
### Memory Optimization
```rust
// Minimal memory footprint
let config = SonaLlmConfig {
max_pending_samples: 100, // Reduce buffer
micro_lora: MicroLoraConfig {
rank: 1,
target_modules: vec![TargetModule::QProj],
..Default::default()
},
..Default::default()
};
```
## Troubleshooting
### Adaptation Not Improving
1. Check quality threshold isn't too high
2. Verify feedback is meaningful (not always same value)
3. Increase learning rate
4. Try different target modules
### Catastrophic Forgetting
1. Increase EWC lambda
2. Use `EwcMerge` consolidation strategy
3. Reduce learning rate
4. Add more diverse training data
### High Latency
1. Reduce LoRA rank to 1
2. Reduce target modules
3. Increase background interval
4. Use `gradient_checkpointing`

View File

@@ -0,0 +1,521 @@
# RuvLLM Optimization Guide (v2.0.0)
This guide covers performance optimization strategies for RuvLLM, including SONA learning loops, batch sizing, KV cache management, and hardware-specific tuning.
## v2.0.0 Performance Highlights
| Feature | Improvement | Notes |
|---------|-------------|-------|
| Multi-threaded GEMM | 12.7x speedup | Rayon on M4 Pro 10-core |
| Flash Attention 2 | +10% throughput | Auto block sizing |
| Quantized Inference | 4-8x memory | INT8/INT4/Q4_K |
| Metal GPU | 3x speedup | simdgroup_matrix |
| Memory Pool | Zero-alloc | Arena allocator |
## Performance Overview
### Key Metrics
| Metric | Target (M4 Pro) | Achieved (v2.0.0) | Description |
|--------|-----------------|-------------------|-------------|
| Prefill | >2000 tok/s | 3500 tok/s | Processing input tokens |
| Decode | >80 tok/s | 120 tok/s | Generating output tokens |
| TTFT | <50ms | 35ms | Time to first token |
| Memory | <8GB for 7B | 3.4GB (Q4K) | Peak memory usage |
| MicroLoRA | <1ms | 8.56us | Per-request adaptation |
### Architecture Impact
```
┌─────────────────────────────────────────────────────────┐
│ Optimization Layers │
├─────────────────────────────────────────────────────────┤
│ SONA Learning │ Real-time adaptation, routing │
├─────────────────────────────────────────────────────────┤
│ Attention │ Flash, Paged, GQA - 2-4x speedup │
├─────────────────────────────────────────────────────────┤
│ KV Cache │ Two-tier, quantized - 4x memory │
├─────────────────────────────────────────────────────────┤
│ Quantization │ Q4K, Q8 - 4-8x smaller │
├─────────────────────────────────────────────────────────┤
│ SIMD/GPU │ NEON, Metal - hardware accel │
└─────────────────────────────────────────────────────────┘
```
## SONA Learning Optimization
### Instant Loop Tuning
The instant loop runs per-request with <1ms target latency.
```rust
let config = SonaLlmConfig {
// Learning rate for instant updates
// Higher = faster adaptation, more variance
// Lower = slower adaptation, more stable
instant_lr: 0.01,
// Quality threshold - skip low-quality samples
training: TrainingConfig {
quality_threshold: 0.5, // 0.0-1.0
..Default::default()
},
..Default::default()
};
```
**Tuning Guidelines:**
| Use Case | instant_lr | quality_threshold |
|----------|------------|-------------------|
| High variance tasks | 0.005 | 0.7 |
| Stable domains | 0.02 | 0.3 |
| User personalization | 0.01 | 0.5 |
### Background Loop Tuning
Consolidates patterns without blocking inference.
```rust
let config = SonaLlmConfig {
// How often to run (milliseconds)
background_interval_ms: 100,
// Minimum samples before consolidation
background_min_samples: 10,
// Maximum pending (triggers forced consolidation)
max_pending_samples: 1000,
// Consolidation strategy
consolidation_strategy: ConsolidationStrategy::EwcMerge,
..Default::default()
};
```
**Tuning Guidelines:**
| Priority | interval_ms | min_samples | Strategy |
|----------|-------------|-------------|----------|
| Latency | 200 | 20 | Average |
| Quality | 50 | 5 | EwcMerge |
| Memory | 100 | 50 | BestOnly |
### Deep Loop Optimization
Triggered periodically for full optimization.
```rust
let config = SonaLlmConfig {
// Accumulated quality threshold to trigger
deep_trigger_threshold: 100.0,
..Default::default()
};
// Manual trigger for scheduled optimization
if sona.should_trigger_deep() || is_scheduled_time() {
let samples = collect_high_quality_samples();
let result = sona.deep_optimize(&samples);
// Log improvement
println!("Deep optimization: quality delta = {:.3}", result.quality_delta);
}
```
## Batch Size Optimization
### Dynamic Batching
```rust
// Optimal batch sizes vary by operation
struct BatchConfig {
prefill_batch: usize, // Process multiple prompts together
decode_batch: usize, // Parallel token generation
lora_batch: usize, // LoRA adaptation batch
}
impl BatchConfig {
fn for_memory(available_gb: f32) -> Self {
match available_gb {
x if x < 8.0 => Self {
prefill_batch: 1,
decode_batch: 4,
lora_batch: 16,
},
x if x < 16.0 => Self {
prefill_batch: 2,
decode_batch: 8,
lora_batch: 32,
},
_ => Self {
prefill_batch: 4,
decode_batch: 16,
lora_batch: 64,
},
}
}
}
```
### Batch Size Impact
| Batch Size | Throughput | Latency | Memory |
|------------|------------|---------|--------|
| 1 | Low | Lowest | Lowest |
| 4 | Medium | Low | Medium |
| 8 | High | Medium | High |
| 16+ | Highest | Higher | Highest |
**Rule of thumb:** Increase batch size until memory pressure or latency constraints are hit.
## KV Cache Optimization
### Two-Tier Configuration
```rust
let config = KvCacheConfig {
// Tokens in high-precision tail
// More = better attention quality for recent context
// Less = less memory usage
tail_length: 256,
// Tail precision (FP16 recommended)
tail_precision: Precision::FP16,
// Store precision (Q4 for 4x compression)
store_precision: Precision::Q4,
// Maximum context length
max_tokens: 4096,
// KV heads (depends on model architecture)
num_kv_heads: 8,
head_dim: 128,
// Batch size for migration (affects latency spikes)
migration_batch: 64,
};
```
### Memory Calculation
```
KV Cache Memory = num_layers * 2 * max_tokens * num_kv_heads * head_dim * bytes_per_element
Example (Qwen2.5-7B with 4096 context):
- Layers: 32
- KV heads: 8
- Head dim: 128
- FP16 tail (256 tokens): 32 * 2 * 256 * 8 * 128 * 2 = 33.5 MB
- Q4 store (3840 tokens): 32 * 2 * 3840 * 8 * 128 * 0.5 = 125.8 MB
- Total: ~160 MB (vs ~537 MB for full FP16: 32 * 2 * 4096 * 8 * 128 * 2)
```
### Cache Strategies by Use Case
| Use Case | tail_length | store_precision | max_tokens |
|----------|-------------|-----------------|------------|
| Chat (short) | 128 | Q8 | 2048 |
| Chat (long) | 256 | Q4 | 8192 |
| Document QA | 512 | Q4 | 16384 |
| Code completion | 128 | Q8 | 4096 |
## Attention Optimization
### Grouped-Query Attention (GQA)
```rust
let config = AttentionConfig {
num_heads: 32, // Query heads
num_kv_heads: 8, // KV heads (4:1 ratio)
head_dim: 128,
causal: true,
..Default::default()
};
// GQA ratio determines memory savings
// 4:1 = ~4x KV cache reduction
// 8:1 = ~8x KV cache reduction
assert_eq!(config.gqa_ratio(), 4);
```
### Flash Attention Optimization
```rust
// Flash Attention is memory-efficient but has setup overhead
// Best for: longer sequences (>256 tokens)
// For short sequences, standard attention may be faster
let use_flash = sequence_length > 256;
if use_flash {
let output = flash_attention_neon(&query, &key, &value, scale, causal);
} else {
let output = standard_attention(&query, &key, &value, scale, causal);
}
```
### Paged Attention for Inference
```rust
// Paged attention enables non-contiguous KV cache
// Best for: long-running inference with variable context
let mut cache = PagedKvCache::new(
16, // block_size: tokens per block
8, // num_kv_heads
128, // head_dim
);
// Append incrementally
for token in tokens {
let (k, v) = compute_kv(token)?;
cache.append(&k, &v);
}
// Efficient attention over paged cache
let output = paged_attention_neon(&query, &cache, &block_tables, scale);
```
## Quantization Optimization
### Model Quantization
| Precision | Memory | Quality | Speed |
|-----------|--------|---------|-------|
| FP32 | 4x | Best | Slowest |
| FP16 | 2x | Excellent | Fast |
| Q8 | 1x | Very Good | Faster |
| Q4K | 0.5x | Good | Fastest |
| Q4 | 0.5x | Acceptable | Fastest |
**Recommendations:**
```rust
// High quality (16GB+ RAM)
let config = ModelConfig {
quantization: Precision::Q8,
..Default::default()
};
// Balanced (8-16GB RAM)
let config = ModelConfig {
quantization: Precision::Q4K, // K-quant preserves quality
..Default::default()
};
// Memory constrained (<8GB RAM)
let config = ModelConfig {
quantization: Precision::Q4,
..Default::default()
};
```
### KV Cache Quantization
```rust
// Hybrid quantization: recent tokens in high precision
let config = KvCacheConfig {
tail_length: 256, // Recent: FP16
tail_precision: Precision::FP16,
store_precision: Precision::Q4, // Older: Q4
..Default::default()
};
// Quality impact by position
// Position 0-256 (tail): Full quality
// Position 256+: ~95% quality with Q4
```
## Hardware-Specific Optimization
### Apple Silicon (M1/M2/M3/M4)
```rust
// Metal backend for GPU acceleration
let backend = CandleBackend::with_device(DeviceType::Metal)?;
// Optimize for unified memory
let config = ModelConfig {
// Unified memory = larger KV cache possible
kv_cache_config: KvCacheConfig {
max_tokens: 8192, // Can be larger on M-series
..Default::default()
},
..Default::default()
};
```
**M4 Pro Specific:**
- Use `metal` feature for GPU acceleration
- NEON SIMD enabled by default
- Leverage unified memory for larger context
### NVIDIA GPUs
```rust
// CUDA backend
let backend = CandleBackend::with_device(DeviceType::Cuda(0))?;
// Optimize for separate VRAM
let config = ModelConfig {
kv_cache_config: KvCacheConfig {
// Conservative: VRAM is limited
max_tokens: 4096,
..Default::default()
},
..Default::default()
};
```
### CPU Fallback
```rust
// CPU with SIMD optimization
let backend = CandleBackend::with_device(DeviceType::Cpu)?;
// Reduce memory pressure
let config = ModelConfig {
quantization: Precision::Q4,
kv_cache_config: KvCacheConfig {
tail_length: 128,
max_tokens: 2048,
..Default::default()
},
..Default::default()
};
```
## Real-Time Optimization
### Adaptive Optimization
```rust
use ruvllm::optimization::{RealTimeOptimizer, OptimizerConfig};
let optimizer = RealTimeOptimizer::new(OptimizerConfig {
target_latency_ms: 100.0,
min_throughput: 50.0, // tokens/sec
memory_threshold: 0.9, // 90% of available
});
// Optimizer adjusts parameters in real-time
loop {
let metrics = backend.get_metrics();
let adjustments = optimizer.recommend(&metrics);
if adjustments.reduce_batch_size {
config.batch_size -= 1;
}
if adjustments.increase_quantization {
config.kv_cache_config.store_precision = Precision::Q4;
}
}
```
### Latency Monitoring
```rust
// Track latency components
struct LatencyBreakdown {
tokenization_us: u64,
prefill_us: u64,
decode_us: u64,
sampling_us: u64,
lora_us: u64,
}
impl LatencyBreakdown {
fn total_ms(&self) -> f64 {
(self.tokenization_us + self.prefill_us +
self.decode_us + self.sampling_us + self.lora_us) as f64 / 1000.0
}
fn bottleneck(&self) -> &str {
let max = [
(self.tokenization_us, "tokenization"),
(self.prefill_us, "prefill"),
(self.decode_us, "decode"),
(self.sampling_us, "sampling"),
(self.lora_us, "lora"),
].into_iter().max_by_key(|(v, _)| *v).unwrap();
max.1
}
}
```
## Benchmarking
### Running Benchmarks
```bash
# All benchmarks
cargo bench
# Specific benchmarks
cargo bench --bench attention_bench
cargo bench --bench lora_bench
cargo bench --bench e2e_bench
# With specific features
cargo bench --features metal
cargo bench --features cuda
```
### Custom Benchmarks
```rust
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use ruvllm::kernels::attention::flash_attention_neon;
fn bench_attention(c: &mut Criterion) {
let query = vec![0.1f32; 128];
let key = vec![0.1f32; 512 * 128];
let value = vec![0.1f32; 512 * 128];
let scale = 1.0 / 128.0_f32.sqrt();
c.bench_function("flash_attention_512", |b| {
b.iter(|| {
flash_attention_neon(
black_box(&query),
black_box(&key),
black_box(&value),
scale,
true,
)
})
});
}
criterion_group!(benches, bench_attention);
criterion_main!(benches);
```
## Optimization Checklist
### Before Deployment
- [ ] Choose appropriate quantization (Q4K for most cases)
- [ ] Configure KV cache for expected context length
- [ ] Enable GQA if model supports it
- [ ] Set appropriate batch sizes for memory
- [ ] Configure SONA learning rates
- [ ] Test with representative workloads
### Monitoring
- [ ] Track prefill and decode throughput
- [ ] Monitor memory usage over time
- [ ] Log KV cache hit rates
- [ ] Track SONA learning metrics
- [ ] Alert on latency spikes
### Troubleshooting
| Symptom | Likely Cause | Solution |
|---------|--------------|----------|
| High latency | Batch too large | Reduce batch size |
| OOM errors | KV cache too large | Reduce max_tokens or use Q4 |
| Quality degradation | Over-quantization | Use Q8 instead of Q4 |
| Slow adaptation | Learning rate too low | Increase instant_lr |
| Forgetting | EWC lambda too low | Increase ewc_lambda |

View File

@@ -0,0 +1,417 @@
# RuvLTRA-Medium: 3B Parameter Model Architecture
## Overview
RuvLTRA-Medium is a 3 billion parameter language model based on the Qwen2.5-3B-Instruct architecture, enhanced with advanced learning capabilities and optimized for Apple Silicon and modern GPU acceleration.
## Architecture Specifications
### Model Configuration
| Parameter | Value | Description |
|-----------|-------|-------------|
| **Total Parameters** | ~3.0B | Full model size |
| **Hidden Size** | 2048 | Embedding dimension |
| **Layers** | 32 | Transformer decoder layers |
| **Attention Heads** | 16 | Query heads |
| **KV Heads** | 2 | Key-value heads (GQA) |
| **GQA Ratio** | 8:1 | Grouped Query Attention ratio |
| **Head Dimension** | 128 | Per-head dimension |
| **Intermediate Size** | 11008 | MLP hidden dimension |
| **Vocabulary Size** | 151936 | Qwen tokenizer |
| **Context Length** | 32768 | Maximum sequence length |
| **RoPE Theta** | 1,000,000 | RoPE base frequency |
### Quantization Options
| Format | Model Size | Quality | Speed | Recommended Use |
|--------|-----------|---------|-------|-----------------|
| **Q4_K_M** | ~2.0 GB | Good | Fast | Production inference |
| **Q5_K_M** | ~2.5 GB | Better | Medium | Balanced quality/speed |
| **Q8_0** | ~3.5 GB | Best | Slower | Maximum quality |
| **Mixed** | ~2.8 GB | Excellent | Medium | FP16 attn + Q4 MLP |
## Model Variants
### 1. RuvLTRA-Medium-Base
General-purpose model for diverse tasks.
**Configuration:**
```rust
let config = RuvLtraMediumConfig::base();
```
**Characteristics:**
- Temperature: 0.7
- Top-p: 0.9
- SONA hooks: Layers 8, 16, 24
- Pattern capacity: 50,000
**Use Cases:**
- General conversation
- Text completion
- Summarization
- Question answering
### 2. RuvLTRA-Medium-Coder
Optimized for code generation and analysis.
**Configuration:**
```rust
let config = RuvLtraMediumConfig::coder();
```
**Characteristics:**
- Temperature: 0.2 (deterministic)
- Top-p: 0.95
- SONA hooks: Layers 8, 16, 24, 28 (extra late-layer)
- Pattern capacity: 100,000
- Quality threshold: 0.7 (stricter)
**Use Cases:**
- Code completion
- Bug fixing
- Code refactoring
- API generation
### 3. RuvLTRA-Medium-Agent
Routing and planning optimized for agent systems.
**Configuration:**
```rust
let config = RuvLtraMediumConfig::agent();
```
**Characteristics:**
- Temperature: 0.3
- Top-p: 0.85
- SONA hooks: Layers 8, 16, 24
- HNSW M: 32 (higher connectivity)
- HNSW ef_construction: 400
- Micro-LoRA rank: 2 (low latency)
**Use Cases:**
- Claude Flow agent routing
- Task planning
- Decision making
- Multi-agent coordination
## RuvLTRA Enhancements
### 1. SONA Learning Hooks
SONA (Self-Optimizing Neural Architecture) hooks enable continuous learning during inference.
**Hook Layers:**
- **Layer 8**: Early pattern recognition (shallow semantics)
- **Layer 16**: Mid-layer semantic extraction (concepts)
- **Layer 24**: Deep reasoning capture (abstract thinking)
**Implementation:**
```rust
let config = RuvLtraMediumConfig::base();
let mut model = RuvLtraMediumModel::new(&config)?;
// Enable custom hook layers
model.enable_sona_with_hooks(&[8, 16, 24])?;
```
**Learning Loop:**
1. **Instant Loop**: Ring buffer with MicroLoRA (rank 4)
2. **Background Loop**: Router training with EWC++ Fisher
3. **Deep Loop**: Pattern bank consolidation
### 2. HNSW Routing Integration
HNSW (Hierarchical Navigable Small World) enables fast agent routing.
**Configuration:**
```rust
let config = RuvLtraMediumConfig::agent();
assert_eq!(config.sona_hooks.hnsw_m, 32);
assert_eq!(config.sona_hooks.hnsw_ef_construction, 400);
```
**Performance:**
- Search: 150x-12,500x faster than brute-force
- Insertion: O(log n) complexity
- Memory: ~4 bytes per node per connection
### 3. Claude Flow Agent Embeddings
Integration with Claude Flow for intelligent task routing.
**Features:**
- Agent type classification
- Task complexity estimation
- Quality prediction
- Trajectory recording
**Usage:**
```rust
let config = RuvLtraMediumConfig::agent();
config.enable_agent_routing = true;
let model = RuvLtraMediumModel::new(&config)?;
// Model automatically records trajectories for routing
```
### 4. ReasoningBank Trajectory Storage
Stores successful reasoning patterns for future retrieval.
**Storage Format:**
- State-action pairs
- Quality scores (0.0-1.0)
- Contextual embeddings
- Temporal metadata
**Configuration:**
```rust
let config = RuvLtraMediumConfig::base();
config.enable_reasoning_bank = true;
config.sona_config.pattern_capacity = 50000;
```
## Memory Optimization
### 1. Paged KV Cache
Efficient memory management for attention computation.
**Block Size:** 64 tokens per page
**Benefits:**
- 40-60% memory reduction
- Dynamic sequence handling
- Copy-on-write semantics
- Efficient prefix caching
**Configuration:**
```rust
let config = RuvLtraMediumConfig::base();
assert!(config.use_paged_attention);
assert_eq!(config.paged_config.page_size, 64);
```
### 2. Flash Attention 2
Optimized attention kernel for 2.49x-7.47x speedup.
**Algorithm:**
- Tiled computation
- Recomputation on-the-fly
- IO-aware optimization
- Causal masking
**Performance:**
| Sequence Length | Speedup | Memory Savings |
|-----------------|---------|----------------|
| 2K tokens | 2.5x | 30% |
| 8K tokens | 4.2x | 50% |
| 32K tokens | 7.1x | 70% |
### 3. Speculative Decoding
Uses RuvLTRA-Small (0.5B) as draft model for 2-3x speedup.
**Configuration:**
```rust
let mut config = RuvLtraMediumConfig::base();
config.use_speculative_decoding = true;
config.speculative_config.lookahead = 4;
config.draft_model_path = Some("models/ruvltra-small-q4.gguf".into());
```
**Parameters:**
- Lookahead: 4 tokens (default)
- Acceptance threshold: 0.7
- Draft temperature: 0.0 (greedy)
- Adaptive lookahead: enabled
**Expected Speedup:**
| Temperature | Speedup |
|-------------|---------|
| 0.0 (greedy) | 2.8-3.2x |
| 0.5 | 2.2-2.6x |
| 1.0 | 1.5-1.8x |
## Usage Examples
### Basic Inference
```rust
use ruvllm::models::ruvltra_medium::{RuvLtraMediumConfig, RuvLtraMediumModel};
// Create model
let config = RuvLtraMediumConfig::base();
let mut model = RuvLtraMediumModel::new(&config)?;
// Tokenize input
let input_ids = vec![151643, 9521, 11, 1917]; // "Hello, world"
let positions = (0..input_ids.len()).collect::<Vec<_>>();
// Run inference
let logits = model.forward(&input_ids, &positions)?;
// Get next token
let next_token = argmax(&logits[logits.len() - config.vocab_size..]);
```
### Code Generation (Coder Variant)
```rust
let config = RuvLtraMediumConfig::coder();
let mut model = RuvLtraMediumModel::new(&config)?;
// Enable SONA hooks for learning
model.enable_sona_with_hooks(&[8, 16, 24, 28])?;
// Generate code
let prompt = "fn fibonacci(n: u32) -> u32 {";
let output = model.generate(prompt, GenerateParams {
max_tokens: 256,
temperature: 0.2,
top_p: 0.95,
..Default::default()
})?;
```
### Agent Routing (Agent Variant)
```rust
let config = RuvLtraMediumConfig::agent();
let model = RuvLtraMediumModel::new(&config)?;
// Enable Claude Flow integration
assert!(config.enable_agent_routing);
// Model automatically:
// - Records trajectories
// - Updates HNSW index
// - Learns routing patterns
```
### Speculative Decoding
```rust
let mut config = RuvLtraMediumConfig::base();
config.use_speculative_decoding = true;
config.draft_model_path = Some("ruvltra-small-q4.gguf".into());
let model = RuvLtraMediumModel::new(&config)?;
// 2-3x faster generation
let output = model.generate("Once upon a time", params)?;
```
## Model Loading
### From GGUF
```rust
use ruvllm::gguf::loader::GGUFLoader;
let loader = GGUFLoader::new("ruvltra-medium-q4_k_m.gguf")?;
let model = loader.load_ruvltra_medium()?;
```
### Quantization Formats
```bash
# Download pre-quantized models
wget https://huggingface.co/ruvector/ruvltra-medium-q4_k_m-gguf
wget https://huggingface.co/ruvector/ruvltra-medium-q5_k_m-gguf
wget https://huggingface.co/ruvector/ruvltra-medium-q8_0-gguf
# Or quantize yourself
cargo run --release --bin quantize -- \
--model qwen2.5-3b-instruct \
--output ruvltra-medium-q4_k_m.gguf \
--format q4_k_m
```
## Performance Benchmarks
### Inference Speed (Apple M3 Max)
| Configuration | Tokens/sec | Memory | Power |
|---------------|-----------|--------|-------|
| Base Q4_K_M | 68 tok/s | 2.2 GB | 12W |
| Base Q5_K_M | 55 tok/s | 2.7 GB | 14W |
| Base Q8_0 | 42 tok/s | 3.8 GB | 16W |
| Coder Q4_K_M | 65 tok/s | 2.4 GB | 13W |
| Agent Q4_K_M | 72 tok/s | 2.1 GB | 11W |
| + Speculative | 158 tok/s | 2.8 GB | 15W |
### Quality Metrics
| Benchmark | Base | Coder | Agent |
|-----------|------|-------|-------|
| MMLU | 68.2% | 66.8% | 64.5% |
| HumanEval | 52.4% | 61.7% | 48.9% |
| GSM8K | 71.3% | 69.8% | 73.6% |
| TruthfulQA | 45.8% | 44.2% | 47.1% |
## Integration with Claude Flow
### Agent Routing
```rust
use ruvllm::models::ruvltra_medium::RuvLtraMediumConfig;
use ruvllm::claude_flow::AgentRouter;
let config = RuvLtraMediumConfig::agent();
let model = RuvLtraMediumModel::new(&config)?;
// Router uses model embeddings for task classification
let router = AgentRouter::new(model.sona().unwrap());
// Route task to optimal agent
let task = "Implement authentication system";
let agent = router.route(task)?; // Returns: "coder" or "security-architect"
```
### Trajectory Recording
```rust
use ruvllm::sona::Trajectory;
// Create trajectory
let mut trajectory = Trajectory::new("code-generation");
trajectory.add_state(initial_state);
trajectory.add_action("generate_function", quality_score);
// Record in model
model.sona()
.unwrap()
.write()
.record_trajectory(trajectory)?;
```
## Limitations
1. **Context Window**: 32K tokens (not extensible without retraining)
2. **SONA Hooks**: Limited to 4 hooks due to memory overhead
3. **Speculative Decoding**: Requires separate draft model
4. **Quantization**: Q4/Q5 may degrade quality by 2-3%
5. **Hardware**: Optimized for Apple Silicon; GPU acceleration recommended
## Roadmap
- [ ] RuvLTRA-Medium-Vision (multimodal)
- [ ] Context extension to 128K tokens
- [ ] Mixture-of-Experts (MoE) variant
- [ ] On-device fine-tuning
- [ ] Distillation to RuvLTRA-Small
## References
- [Qwen2.5 Technical Report](https://arxiv.org/abs/2407.10671)
- [Flash Attention 2](https://arxiv.org/abs/2307.08691)
- [Speculative Decoding](https://arxiv.org/abs/2211.17192)
- [Grouped Query Attention](https://arxiv.org/abs/2305.13245)
- [HNSW Algorithm](https://arxiv.org/abs/1603.09320)