Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'
This commit is contained in:
862
vendor/ruvector/docs/ruvllm/API_REFERENCE.md
vendored
Normal file
862
vendor/ruvector/docs/ruvllm/API_REFERENCE.md
vendored
Normal file
@@ -0,0 +1,862 @@
|
||||
# RuvLLM API Reference
|
||||
|
||||
Complete API documentation for the RuvLLM crate.
|
||||
|
||||
## Table of Contents
|
||||
|
||||
- [Core Types](#core-types)
|
||||
- [Backend Trait](#backend-trait)
|
||||
- [Candle Backend](#candle-backend)
|
||||
- [LoRA Module](#lora-module)
|
||||
- [Optimization Module](#optimization-module)
|
||||
- [Kernel Functions](#kernel-functions)
|
||||
- [KV Cache](#kv-cache)
|
||||
- [Error Handling](#error-handling)
|
||||
|
||||
---
|
||||
|
||||
## Core Types
|
||||
|
||||
### `Precision`
|
||||
|
||||
Numeric precision for model weights and KV cache.
|
||||
|
||||
```rust
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum Precision {
|
||||
/// Full 32-bit floating point
|
||||
FP32,
|
||||
/// Half precision 16-bit float
|
||||
FP16,
|
||||
/// Brain floating point (16-bit)
|
||||
BF16,
|
||||
/// 8-bit integer quantization
|
||||
Q8,
|
||||
/// 4-bit integer quantization
|
||||
Q4,
|
||||
/// 4-bit K-quant (GGML-style)
|
||||
Q4K,
|
||||
}
|
||||
|
||||
impl Precision {
|
||||
/// Get bytes per element for this precision
|
||||
pub fn bytes_per_element(&self) -> u8;
|
||||
}
|
||||
```
|
||||
|
||||
### `ModelSize`
|
||||
|
||||
Model size classification for routing.
|
||||
|
||||
```rust
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum ModelSize {
|
||||
Tiny, // < 1B params
|
||||
Small, // 1-3B params
|
||||
Medium, // 3-13B params
|
||||
Large, // > 13B params
|
||||
}
|
||||
```
|
||||
|
||||
### `DeviceType`
|
||||
|
||||
Compute device selection.
|
||||
|
||||
```rust
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub enum DeviceType {
|
||||
/// CPU (fallback)
|
||||
Cpu,
|
||||
/// Apple Metal GPU
|
||||
Metal,
|
||||
/// NVIDIA CUDA GPU
|
||||
Cuda(usize), // device index
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Backend Trait
|
||||
|
||||
### `LlmBackend`
|
||||
|
||||
Main trait for LLM inference backends.
|
||||
|
||||
```rust
|
||||
pub trait LlmBackend: Send + Sync {
|
||||
/// Load a model from HuggingFace Hub or local path
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `model_id` - HuggingFace model ID or local path
|
||||
/// * `config` - Model configuration
|
||||
///
|
||||
/// # Example
|
||||
/// ```
|
||||
/// backend.load_model("Qwen/Qwen2.5-7B-Instruct", config)?;
|
||||
/// ```
|
||||
fn load_model(&mut self, model_id: &str, config: ModelConfig) -> Result<()>;
|
||||
|
||||
/// Generate text from a prompt
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `prompt` - Input text prompt
|
||||
/// * `params` - Generation parameters
|
||||
///
|
||||
/// # Returns
|
||||
/// Generated text response
|
||||
///
|
||||
/// # Example
|
||||
/// ```
|
||||
/// let response = backend.generate("Hello!", GenerateParams::default())?;
|
||||
/// ```
|
||||
fn generate(&self, prompt: &str, params: GenerateParams) -> Result<String>;
|
||||
|
||||
/// Streaming text generation
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `prompt` - Input text prompt
|
||||
/// * `params` - Generation parameters
|
||||
/// * `callback` - Called for each generated token
|
||||
fn generate_stream<F>(&self, prompt: &str, params: GenerateParams, callback: F) -> Result<()>
|
||||
where
|
||||
F: FnMut(&str) -> bool;
|
||||
|
||||
/// Get the tokenizer for this model
|
||||
fn tokenizer(&self) -> Option<&dyn Tokenizer>;
|
||||
|
||||
/// Get model metadata
|
||||
fn model_info(&self) -> Option<ModelInfo>;
|
||||
|
||||
/// Check if a model is loaded
|
||||
fn is_loaded(&self) -> bool;
|
||||
}
|
||||
```
|
||||
|
||||
### `ModelConfig`
|
||||
|
||||
Configuration for model loading.
|
||||
|
||||
```rust
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ModelConfig {
|
||||
/// Maximum context length
|
||||
pub max_context: usize,
|
||||
/// Use Flash Attention
|
||||
pub use_flash_attention: bool,
|
||||
/// Weight quantization level
|
||||
pub quantization: Precision,
|
||||
/// KV cache configuration
|
||||
pub kv_cache_config: KvCacheConfig,
|
||||
/// Device to load model on
|
||||
pub device: DeviceType,
|
||||
/// HuggingFace token for gated models
|
||||
pub hf_token: Option<String>,
|
||||
}
|
||||
|
||||
impl Default for ModelConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
max_context: 4096,
|
||||
use_flash_attention: true,
|
||||
quantization: Precision::Q4K,
|
||||
kv_cache_config: KvCacheConfig::default(),
|
||||
device: DeviceType::Metal,
|
||||
hf_token: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### `GenerateParams`
|
||||
|
||||
Parameters for text generation.
|
||||
|
||||
```rust
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct GenerateParams {
|
||||
/// Maximum tokens to generate
|
||||
pub max_tokens: usize,
|
||||
/// Sampling temperature (0.0 = deterministic)
|
||||
pub temperature: f32,
|
||||
/// Top-p (nucleus) sampling
|
||||
pub top_p: f32,
|
||||
/// Top-k sampling (0 = disabled)
|
||||
pub top_k: usize,
|
||||
/// Repetition penalty
|
||||
pub repetition_penalty: f32,
|
||||
/// Stop sequences
|
||||
pub stop_sequences: Vec<String>,
|
||||
/// Random seed for reproducibility
|
||||
pub seed: Option<u64>,
|
||||
}
|
||||
|
||||
impl Default for GenerateParams {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
max_tokens: 256,
|
||||
temperature: 0.7,
|
||||
top_p: 0.9,
|
||||
top_k: 0,
|
||||
repetition_penalty: 1.1,
|
||||
stop_sequences: vec![],
|
||||
seed: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Candle Backend
|
||||
|
||||
### `CandleBackend`
|
||||
|
||||
HuggingFace Candle-based inference backend.
|
||||
|
||||
```rust
|
||||
impl CandleBackend {
|
||||
/// Create a new backend with default device
|
||||
///
|
||||
/// # Example
|
||||
/// ```
|
||||
/// let backend = CandleBackend::new()?;
|
||||
/// ```
|
||||
pub fn new() -> Result<Self>;
|
||||
|
||||
/// Create with specific device
|
||||
///
|
||||
/// # Example
|
||||
/// ```
|
||||
/// let backend = CandleBackend::with_device(DeviceType::Metal)?;
|
||||
/// ```
|
||||
pub fn with_device(device: DeviceType) -> Result<Self>;
|
||||
|
||||
/// Download model from HuggingFace Hub
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `model_id` - HuggingFace model ID
|
||||
/// * `quantization` - Target quantization
|
||||
/// * `cache_dir` - Local cache directory
|
||||
///
|
||||
/// # Example
|
||||
/// ```
|
||||
/// let path = backend.download_model(
|
||||
/// "Qwen/Qwen2.5-7B-Instruct",
|
||||
/// Precision::Q4K,
|
||||
/// "~/.cache/ruvllm"
|
||||
/// ).await?;
|
||||
/// ```
|
||||
pub async fn download_model(
|
||||
&self,
|
||||
model_id: &str,
|
||||
quantization: Precision,
|
||||
cache_dir: &str,
|
||||
) -> Result<PathBuf>;
|
||||
|
||||
/// Get current device
|
||||
pub fn device(&self) -> DeviceType;
|
||||
|
||||
/// Get memory usage statistics
|
||||
pub fn memory_stats(&self) -> MemoryStats;
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## LoRA Module
|
||||
|
||||
### `MicroLoRA`
|
||||
|
||||
Real-time per-request fine-tuning with rank 1-2 adapters.
|
||||
|
||||
```rust
|
||||
impl MicroLoRA {
|
||||
/// Create a new MicroLoRA instance
|
||||
///
|
||||
/// # Example
|
||||
/// ```
|
||||
/// let config = MicroLoraConfig::for_hidden_dim(4096);
|
||||
/// let lora = MicroLoRA::new(config);
|
||||
/// ```
|
||||
pub fn new(config: MicroLoraConfig) -> Self;
|
||||
|
||||
/// Adapt on new input with feedback
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `input` - Input embedding vector
|
||||
/// * `feedback` - Quality feedback for learning
|
||||
///
|
||||
/// # Example
|
||||
/// ```
|
||||
/// let feedback = AdaptFeedback::from_quality(0.9);
|
||||
/// lora.adapt(&input_embedding, feedback)?;
|
||||
/// ```
|
||||
pub fn adapt(&self, input: &[f32], feedback: AdaptFeedback) -> Result<()>;
|
||||
|
||||
/// Forward pass through LoRA adapter
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `input` - Input tensor
|
||||
/// * `module` - Target module (Q, K, V, O projections)
|
||||
///
|
||||
/// # Returns
|
||||
/// Output with LoRA contribution added
|
||||
///
|
||||
/// # Example
|
||||
/// ```
|
||||
/// let output = lora.forward(&input, &TargetModule::QProj);
|
||||
/// ```
|
||||
pub fn forward(&self, input: &[f32], module: &TargetModule) -> Vec<f32>;
|
||||
|
||||
/// Forward pass that adds to existing output (in-place)
|
||||
pub fn forward_add(&self, input: &[f32], module: &TargetModule, output: &mut [f32]);
|
||||
|
||||
/// Apply accumulated gradient updates
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `learning_rate` - Learning rate for update
|
||||
pub fn apply_updates(&self, learning_rate: f32);
|
||||
|
||||
/// Apply updates with EWC++ regularization
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `learning_rate` - Learning rate
|
||||
/// * `ewc_states` - EWC++ state per module
|
||||
/// * `ewc_lambda` - EWC regularization strength
|
||||
pub fn apply_updates_with_ewc(
|
||||
&self,
|
||||
learning_rate: f32,
|
||||
ewc_states: &HashMap<TargetModule, EwcState>,
|
||||
ewc_lambda: f32,
|
||||
);
|
||||
|
||||
/// Reset all adapter weights
|
||||
pub fn reset(&self);
|
||||
|
||||
/// Get adapter statistics
|
||||
pub fn stats(&self) -> MicroLoraStats;
|
||||
}
|
||||
```
|
||||
|
||||
### `MicroLoraConfig`
|
||||
|
||||
Configuration for MicroLoRA adapters.
|
||||
|
||||
```rust
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct MicroLoraConfig {
|
||||
/// Input feature dimension
|
||||
pub in_features: usize,
|
||||
/// Output feature dimension
|
||||
pub out_features: usize,
|
||||
/// LoRA rank (1-2 for MicroLoRA)
|
||||
pub rank: usize,
|
||||
/// LoRA alpha scaling factor
|
||||
pub alpha: f32,
|
||||
/// Dropout probability
|
||||
pub dropout: f32,
|
||||
/// Target modules to adapt
|
||||
pub target_modules: Vec<TargetModule>,
|
||||
/// Enable gradient checkpointing
|
||||
pub gradient_checkpointing: bool,
|
||||
}
|
||||
|
||||
impl MicroLoraConfig {
|
||||
/// Create config for a specific hidden dimension
|
||||
///
|
||||
/// # Example
|
||||
/// ```
|
||||
/// let config = MicroLoraConfig::for_hidden_dim(4096);
|
||||
/// assert_eq!(config.in_features, 4096);
|
||||
/// assert_eq!(config.rank, 2);
|
||||
/// ```
|
||||
pub fn for_hidden_dim(hidden_dim: usize) -> Self;
|
||||
}
|
||||
```
|
||||
|
||||
### `TargetModule`
|
||||
|
||||
Transformer modules that can be adapted.
|
||||
|
||||
```rust
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
pub enum TargetModule {
|
||||
/// Query projection
|
||||
QProj,
|
||||
/// Key projection
|
||||
KProj,
|
||||
/// Value projection
|
||||
VProj,
|
||||
/// Output projection
|
||||
OProj,
|
||||
/// Gate projection (FFN)
|
||||
GateProj,
|
||||
/// Up projection (FFN)
|
||||
UpProj,
|
||||
/// Down projection (FFN)
|
||||
DownProj,
|
||||
}
|
||||
```
|
||||
|
||||
### `AdaptFeedback`
|
||||
|
||||
Feedback for LoRA adaptation.
|
||||
|
||||
```rust
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct AdaptFeedback {
|
||||
/// Quality score (0.0 - 1.0)
|
||||
pub quality: f32,
|
||||
/// Gradient estimate from feedback
|
||||
pub gradient_estimate: Vec<f32>,
|
||||
/// Optional reward signal
|
||||
pub reward: Option<f32>,
|
||||
/// Latency in microseconds
|
||||
pub latency_us: u64,
|
||||
/// Source module (optional)
|
||||
pub source_module: Option<TargetModule>,
|
||||
/// Session identifier
|
||||
pub session_id: Option<String>,
|
||||
}
|
||||
|
||||
impl AdaptFeedback {
|
||||
/// Create feedback from quality score
|
||||
///
|
||||
/// # Example
|
||||
/// ```
|
||||
/// let feedback = AdaptFeedback::from_quality(0.85);
|
||||
/// ```
|
||||
pub fn from_quality(quality: f32) -> Self;
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Optimization Module
|
||||
|
||||
### `SonaLlm`
|
||||
|
||||
SONA learning integration for LLM inference.
|
||||
|
||||
```rust
|
||||
impl SonaLlm {
|
||||
/// Create new SONA LLM integration
|
||||
///
|
||||
/// # Example
|
||||
/// ```
|
||||
/// let sona = SonaLlm::new(SonaLlmConfig::default());
|
||||
/// ```
|
||||
pub fn new(config: SonaLlmConfig) -> Self;
|
||||
|
||||
/// Instant loop: per-request MicroLoRA adaptation
|
||||
///
|
||||
/// Target latency: <1ms
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `request` - User query text
|
||||
/// * `response` - Model response text
|
||||
/// * `feedback` - Quality score (0.0 - 1.0)
|
||||
///
|
||||
/// # Returns
|
||||
/// Adaptation result with statistics
|
||||
///
|
||||
/// # Example
|
||||
/// ```
|
||||
/// let result = sona.instant_adapt(
|
||||
/// "What is machine learning?",
|
||||
/// "Machine learning is...",
|
||||
/// 0.9
|
||||
/// );
|
||||
/// assert!(result.applied);
|
||||
/// assert!(result.latency_us < 1000); // <1ms
|
||||
/// ```
|
||||
pub fn instant_adapt(&self, request: &str, response: &str, feedback: f32) -> AdaptationResult;
|
||||
|
||||
/// Background loop: consolidate patterns
|
||||
///
|
||||
/// Called periodically (~100ms interval)
|
||||
///
|
||||
/// # Example
|
||||
/// ```
|
||||
/// let result = sona.background_consolidate();
|
||||
/// println!("Consolidated {} samples", result.samples_used);
|
||||
/// ```
|
||||
pub fn background_consolidate(&self) -> AdaptationResult;
|
||||
|
||||
/// Deep loop: trigger full optimization
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `dataset` - Training samples to learn from
|
||||
pub fn deep_optimize(&self, dataset: &[TrainingSample]) -> AdaptationResult;
|
||||
|
||||
/// Check if background loop should run
|
||||
pub fn maybe_background(&self) -> Option<AdaptationResult>;
|
||||
|
||||
/// Check if deep loop should be triggered
|
||||
pub fn should_trigger_deep(&self) -> bool;
|
||||
|
||||
/// Get current statistics
|
||||
pub fn stats(&self) -> LearningLoopStats;
|
||||
|
||||
/// Forward pass through MicroLoRA
|
||||
pub fn forward(&self, input: &[f32], module: &TargetModule) -> Vec<f32>;
|
||||
|
||||
/// Reset all learning state
|
||||
pub fn reset(&self);
|
||||
}
|
||||
```
|
||||
|
||||
### `SonaLlmConfig`
|
||||
|
||||
Configuration for SONA LLM integration.
|
||||
|
||||
```rust
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct SonaLlmConfig {
|
||||
/// MicroLoRA configuration
|
||||
pub micro_lora: MicroLoraConfig,
|
||||
/// Training pipeline configuration
|
||||
pub training: TrainingConfig,
|
||||
/// SONA core configuration
|
||||
pub sona: SonaConfig,
|
||||
/// Instant loop learning rate
|
||||
pub instant_lr: f32,
|
||||
/// Background loop interval (milliseconds)
|
||||
pub background_interval_ms: u64,
|
||||
/// Minimum samples for background consolidation
|
||||
pub background_min_samples: usize,
|
||||
/// Deep loop trigger threshold
|
||||
pub deep_trigger_threshold: f32,
|
||||
/// Maximum pending samples
|
||||
pub max_pending_samples: usize,
|
||||
/// Consolidation strategy
|
||||
pub consolidation_strategy: ConsolidationStrategy,
|
||||
}
|
||||
```
|
||||
|
||||
### `ConsolidationStrategy`
|
||||
|
||||
Strategy for consolidating learned patterns.
|
||||
|
||||
```rust
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub enum ConsolidationStrategy {
|
||||
/// Merge with EWC++ regularization (default)
|
||||
EwcMerge,
|
||||
/// Simple averaging
|
||||
Average,
|
||||
/// Weighted by quality
|
||||
QualityWeighted,
|
||||
/// Keep best performing only
|
||||
BestOnly,
|
||||
/// Ensemble multiple adapters
|
||||
Ensemble,
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Kernel Functions
|
||||
|
||||
### Attention Kernels
|
||||
|
||||
```rust
|
||||
/// Flash Attention 2 with NEON SIMD optimization
|
||||
///
|
||||
/// Memory-efficient attention with O(N) complexity.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `query` - Query tensor (head_dim,)
|
||||
/// * `key` - Key tensor (kv_len, head_dim)
|
||||
/// * `value` - Value tensor (kv_len, head_dim)
|
||||
/// * `scale` - Softmax scale (typically 1/sqrt(head_dim))
|
||||
/// * `causal` - Apply causal masking
|
||||
///
|
||||
/// # Returns
|
||||
/// Output tensor (head_dim,)
|
||||
///
|
||||
/// # Example
|
||||
/// ```
|
||||
/// let scale = 1.0 / (head_dim as f32).sqrt();
|
||||
/// let output = flash_attention_neon(&query, &key, &value, scale, true);
|
||||
/// ```
|
||||
pub fn flash_attention_neon(
|
||||
query: &[f32],
|
||||
key: &[f32],
|
||||
value: &[f32],
|
||||
scale: f32,
|
||||
causal: bool,
|
||||
) -> Vec<f32>;
|
||||
|
||||
/// Paged Attention for KV cache
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `query` - Query tensor
|
||||
/// * `kv_cache` - Paged KV cache
|
||||
/// * `block_tables` - Block index mapping
|
||||
/// * `scale` - Softmax scale
|
||||
pub fn paged_attention_neon(
|
||||
query: &[f32],
|
||||
kv_cache: &PagedKvCache,
|
||||
block_tables: &[usize],
|
||||
scale: f32,
|
||||
) -> Vec<f32>;
|
||||
|
||||
/// Grouped-Query Attention (GQA)
|
||||
///
|
||||
/// KV heads shared among query head groups.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `queries` - Query tensor (num_heads, head_dim)
|
||||
/// * `keys` - Key tensor (kv_len, num_kv_heads, head_dim)
|
||||
/// * `values` - Value tensor (kv_len, num_kv_heads, head_dim)
|
||||
/// * `config` - Attention configuration
|
||||
pub fn grouped_query_attention_neon(
|
||||
queries: &[f32],
|
||||
keys: &[f32],
|
||||
values: &[f32],
|
||||
config: &AttentionConfig,
|
||||
) -> Vec<f32>;
|
||||
|
||||
/// Multi-Query Attention (MQA)
|
||||
///
|
||||
/// Single KV head shared across all query heads.
|
||||
pub fn multi_query_attention_neon(
|
||||
queries: &[f32],
|
||||
key: &[f32],
|
||||
value: &[f32],
|
||||
config: &AttentionConfig,
|
||||
) -> Vec<f32>;
|
||||
```
|
||||
|
||||
### `AttentionConfig`
|
||||
|
||||
Configuration for attention operations.
|
||||
|
||||
```rust
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct AttentionConfig {
|
||||
/// Number of query heads
|
||||
pub num_heads: usize,
|
||||
/// Number of KV heads (for GQA)
|
||||
pub num_kv_heads: usize,
|
||||
/// Dimension per head
|
||||
pub head_dim: usize,
|
||||
/// Apply causal masking
|
||||
pub causal: bool,
|
||||
/// Custom scale factor (None = 1/sqrt(head_dim))
|
||||
pub scale: Option<f32>,
|
||||
}
|
||||
|
||||
impl AttentionConfig {
|
||||
/// Calculate GQA ratio (query heads / KV heads)
|
||||
pub fn gqa_ratio(&self) -> usize;
|
||||
|
||||
/// Get effective scale factor
|
||||
pub fn effective_scale(&self) -> f32;
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## KV Cache
|
||||
|
||||
### `TwoTierKvCache`
|
||||
|
||||
Two-tier KV cache with FP16 tail and quantized store.
|
||||
|
||||
```rust
|
||||
impl TwoTierKvCache {
|
||||
/// Create a new two-tier KV cache
|
||||
///
|
||||
/// # Example
|
||||
/// ```
|
||||
/// let config = KvCacheConfig {
|
||||
/// tail_length: 256,
|
||||
/// max_tokens: 4096,
|
||||
/// ..Default::default()
|
||||
/// };
|
||||
/// let cache = TwoTierKvCache::new(config);
|
||||
/// ```
|
||||
pub fn new(config: KvCacheConfig) -> Self;
|
||||
|
||||
/// Append new KV pairs
|
||||
///
|
||||
/// Automatically handles:
|
||||
/// - Adding to tail
|
||||
/// - Migrating to quantized store
|
||||
/// - Evicting oldest tokens
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `keys` - Key tensor
|
||||
/// * `values` - Value tensor
|
||||
///
|
||||
/// # Example
|
||||
/// ```
|
||||
/// cache.append(&keys, &values)?;
|
||||
/// ```
|
||||
pub fn append(&self, keys: &[f32], values: &[f32]) -> Result<()>;
|
||||
|
||||
/// Get all KV pairs for attention
|
||||
///
|
||||
/// Returns (keys, values) with cold tier dequantized.
|
||||
pub fn get_all_kv(&self) -> (Vec<f32>, Vec<f32>);
|
||||
|
||||
/// Compute attention with tier-aware access
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `query` - Query tensor
|
||||
/// * `scale` - Softmax scale
|
||||
pub fn attend(&self, query: &[f32], scale: f32) -> Result<Vec<f32>>;
|
||||
|
||||
/// Get current statistics
|
||||
pub fn stats(&self) -> KvCacheStats;
|
||||
|
||||
/// Clear the cache
|
||||
pub fn clear(&self);
|
||||
|
||||
/// Update quantization policy
|
||||
pub fn update_policy(&self, policy: CacheQuantization);
|
||||
}
|
||||
```
|
||||
|
||||
### `KvCacheConfig`
|
||||
|
||||
Configuration for KV cache.
|
||||
|
||||
```rust
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct KvCacheConfig {
|
||||
/// Tokens to keep in high-precision tail
|
||||
pub tail_length: usize,
|
||||
/// Precision for tail storage
|
||||
pub tail_precision: Precision,
|
||||
/// Precision for quantized store
|
||||
pub store_precision: Precision,
|
||||
/// Maximum total tokens
|
||||
pub max_tokens: usize,
|
||||
/// Number of KV heads
|
||||
pub num_kv_heads: usize,
|
||||
/// Head dimension
|
||||
pub head_dim: usize,
|
||||
/// Migration batch size
|
||||
pub migration_batch: usize,
|
||||
}
|
||||
```
|
||||
|
||||
### `KvCacheStats`
|
||||
|
||||
Statistics for KV cache usage.
|
||||
|
||||
```rust
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct KvCacheStats {
|
||||
/// Total tokens cached
|
||||
pub total_tokens: usize,
|
||||
/// Tokens in high-precision tail
|
||||
pub tail_tokens: usize,
|
||||
/// Tokens in quantized store
|
||||
pub store_tokens: usize,
|
||||
/// Bytes used by tail
|
||||
pub tail_bytes: usize,
|
||||
/// Bytes used by store
|
||||
pub store_bytes: usize,
|
||||
/// Compression ratio
|
||||
pub compression_ratio: f32,
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Error Handling
|
||||
|
||||
### `RuvLLMError`
|
||||
|
||||
Main error type for RuvLLM operations.
|
||||
|
||||
```rust
|
||||
#[derive(Error, Debug)]
|
||||
pub enum RuvLLMError {
|
||||
/// Storage-related errors
|
||||
#[error("Storage error: {0}")]
|
||||
Storage(String),
|
||||
|
||||
/// Session management errors
|
||||
#[error("Session error: {0}")]
|
||||
Session(String),
|
||||
|
||||
/// KV cache errors
|
||||
#[error("KV cache error: {0}")]
|
||||
KvCache(String),
|
||||
|
||||
/// Paged attention errors
|
||||
#[error("Paged attention error: {0}")]
|
||||
PagedAttention(String),
|
||||
|
||||
/// Adapter management errors
|
||||
#[error("Adapter error: {0}")]
|
||||
Adapter(String),
|
||||
|
||||
/// SONA learning errors
|
||||
#[error("SONA error: {0}")]
|
||||
Sona(String),
|
||||
|
||||
/// Configuration errors
|
||||
#[error("Configuration error: {0}")]
|
||||
Config(String),
|
||||
|
||||
/// Out of memory
|
||||
#[error("Out of memory: {0}")]
|
||||
OutOfMemory(String),
|
||||
|
||||
/// Invalid operation
|
||||
#[error("Invalid operation: {0}")]
|
||||
InvalidOperation(String),
|
||||
|
||||
/// Not found
|
||||
#[error("Not found: {0}")]
|
||||
NotFound(String),
|
||||
|
||||
/// Backend inference errors
|
||||
#[error("Backend error: {0}")]
|
||||
Backend(String),
|
||||
|
||||
/// Model loading errors
|
||||
#[error("Model error: {0}")]
|
||||
Model(String),
|
||||
|
||||
/// Tokenization errors
|
||||
#[error("Tokenization error: {0}")]
|
||||
Tokenization(String),
|
||||
|
||||
/// Generation errors
|
||||
#[error("Generation error: {0}")]
|
||||
Generation(String),
|
||||
|
||||
/// IO errors
|
||||
#[error("IO error: {0}")]
|
||||
Io(#[from] std::io::Error),
|
||||
}
|
||||
```
|
||||
|
||||
### `Result` Type Alias
|
||||
|
||||
```rust
|
||||
/// Result type alias for RuvLLM operations
|
||||
pub type Result<T> = std::result::Result<T, RuvLLMError>;
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Feature Flags Reference
|
||||
|
||||
| Feature | Dependencies | Description |
|
||||
|---------|-------------|-------------|
|
||||
| `default` | `async-runtime` | Standard async support |
|
||||
| `async-runtime` | `tokio` | Tokio async runtime |
|
||||
| `wasm` | - | WebAssembly support |
|
||||
| `candle` | `candle-*`, `tokenizers`, `hf-hub` | Candle ML backend |
|
||||
| `metal` | `candle/metal` | Apple Metal GPU |
|
||||
| `cuda` | `candle/cuda` | NVIDIA CUDA GPU |
|
||||
| `inference-metal` | `candle`, `metal` | Full Metal stack |
|
||||
| `inference-cuda` | `candle`, `cuda` | Full CUDA stack |
|
||||
402
vendor/ruvector/docs/ruvllm/ARCHITECTURE.md
vendored
Normal file
402
vendor/ruvector/docs/ruvllm/ARCHITECTURE.md
vendored
Normal file
@@ -0,0 +1,402 @@
|
||||
# RuvLLM Architecture (v2.0.0)
|
||||
|
||||
This document describes the system architecture of RuvLLM, a high-performance LLM inference engine optimized for Apple Silicon.
|
||||
|
||||
## v2.0.0 New Features
|
||||
|
||||
| Feature | Description | Performance Impact |
|
||||
|---------|-------------|-------------------|
|
||||
| Multi-threaded GEMM/GEMV | Rayon parallelization | 12.7x speedup on M4 Pro |
|
||||
| Flash Attention 2 | Auto block sizing | +10% throughput |
|
||||
| Quantized Inference | INT8/INT4/Q4_K kernels | 4-8x memory reduction |
|
||||
| Metal GPU Shaders | simdgroup_matrix ops | 3x speedup |
|
||||
| Memory Pool | Arena allocator | Zero-alloc inference |
|
||||
| WASM Support | Browser inference | ~2.5x overhead |
|
||||
| npm Integration | @ruvector/ruvllm | JavaScript/TypeScript API |
|
||||
|
||||
## System Overview
|
||||
|
||||
```
|
||||
+----------------------------------+
|
||||
| User Application |
|
||||
+----------------------------------+
|
||||
|
|
||||
v
|
||||
+-------------------------------------------------------------------------------------+
|
||||
| RuvLLM Core |
|
||||
| +-------------------------------------------------------------------------------+ |
|
||||
| | Backend Abstraction | |
|
||||
| | +-------------------------+ +-------------------------+ | |
|
||||
| | | Candle Backend | | mistral-rs Backend | | |
|
||||
| | | - Model Loading | | - Model Loading | | |
|
||||
| | | - Tokenization | | - Tokenization | | |
|
||||
| | | - Forward Pass | | - Forward Pass | | |
|
||||
| | +-------------------------+ +-------------------------+ | |
|
||||
| +-------------------------------------------------------------------------------+ |
|
||||
| | |
|
||||
| +-------------------------------------------------------------------------------+ |
|
||||
| | SONA Learning Layer | |
|
||||
| | +---------------------+ +----------------------+ +---------------------+ | |
|
||||
| | | Instant Loop | | Background Loop | | Deep Loop | | |
|
||||
| | | (<1ms latency) | | (~100ms interval) | | (minutes/hours) | | |
|
||||
| | | - MicroLoRA adapt | | - Pattern merge | | - Full fine-tune | | |
|
||||
| | | - Per-request | | - EWC++ update | | - Model distill | | |
|
||||
| | +---------------------+ +----------------------+ +---------------------+ | |
|
||||
| +-------------------------------------------------------------------------------+ |
|
||||
| | |
|
||||
| +-------------------------------------------------------------------------------+ |
|
||||
| | Optimized Kernels | |
|
||||
| | +------------------+ +------------------+ +------------------+ | |
|
||||
| | | Attention | | Normalization | | Embedding | | |
|
||||
| | | - Flash Attn 2 | | - RMSNorm | | - RoPE | | |
|
||||
| | | - Paged Attn | | - LayerNorm | | - Token Embed | | |
|
||||
| | | - GQA/MQA | | - Fused Ops | | - Pos Embed | | |
|
||||
| | +------------------+ +------------------+ +------------------+ | |
|
||||
| +-------------------------------------------------------------------------------+ |
|
||||
| | |
|
||||
| +-------------------------------------------------------------------------------+ |
|
||||
| | Memory Management | |
|
||||
| | +-------------------------+ +-------------------------------------------+ | |
|
||||
| | | Two-Tier KV Cache | | Memory Pool | | |
|
||||
| | | +-------------------+ | | - Slab allocator | | |
|
||||
| | | | FP16 Tail (hot) | | | - Arena allocation | | |
|
||||
| | | +-------------------+ | | - Zero-copy transfers | | |
|
||||
| | | | Q4 Store (cold) | | | | | |
|
||||
| | | +-------------------+ | +-------------------------------------------+ | |
|
||||
| | +-------------------------+ | |
|
||||
| +-------------------------------------------------------------------------------+ |
|
||||
+-------------------------------------------------------------------------------------+
|
||||
|
|
||||
v
|
||||
+-------------------------------------------------------------------------------------+
|
||||
| Hardware Acceleration |
|
||||
| +---------------------------+ +---------------------------+ |
|
||||
| | Metal (Apple GPU) | | CUDA (NVIDIA) | |
|
||||
| | - MLX integration | | - cuBLAS | |
|
||||
| | - Metal Performance | | - cuDNN | |
|
||||
| | Shaders | | - TensorRT | |
|
||||
| +---------------------------+ +---------------------------+ |
|
||||
+-------------------------------------------------------------------------------------+
|
||||
```
|
||||
|
||||
## Component Architecture
|
||||
|
||||
### 1. Backend Abstraction Layer
|
||||
|
||||
The backend abstraction provides a unified interface for different ML frameworks.
|
||||
|
||||
```
|
||||
+---------------------------+
|
||||
| LlmBackend Trait |
|
||||
| - load_model() |
|
||||
| - generate() |
|
||||
| - forward() |
|
||||
| - get_tokenizer() |
|
||||
+---------------------------+
|
||||
^
|
||||
|
|
||||
+------+------+
|
||||
| |
|
||||
+-------+ +-----------+
|
||||
|Candle | |mistral-rs |
|
||||
+-------+ +-----------+
|
||||
```
|
||||
|
||||
**Candle Backend Features:**
|
||||
- HuggingFace model hub integration
|
||||
- Native Rust tensor operations
|
||||
- Metal/CUDA acceleration
|
||||
- Safetensors loading
|
||||
|
||||
### 2. SONA Learning Layer
|
||||
|
||||
Self-Optimizing Neural Architecture with three learning loops:
|
||||
|
||||
```
|
||||
+-------------------+ +-------------------+
|
||||
| Inference Request |---->| Instant Loop |
|
||||
| + feedback | | - MicroLoRA adapt |
|
||||
+-------------------+ | - <1ms latency |
|
||||
+--------+----------+
|
||||
|
|
||||
v (async, 100ms)
|
||||
+--------+----------+
|
||||
| Background Loop |
|
||||
| - Pattern merge |
|
||||
| - Adapter compose |
|
||||
| - EWC++ update |
|
||||
+--------+----------+
|
||||
|
|
||||
v (triggered)
|
||||
+--------+----------+
|
||||
| Deep Loop |
|
||||
| - Full fine-tune |
|
||||
| - Model distill |
|
||||
| - Pattern bank |
|
||||
+-------------------+
|
||||
```
|
||||
|
||||
**Loop Characteristics:**
|
||||
|
||||
| Loop | Latency | Trigger | Purpose |
|
||||
|------|---------|---------|---------|
|
||||
| Instant | <1ms | Per-request | Real-time adaptation |
|
||||
| Background | ~100ms | Interval/threshold | Pattern consolidation |
|
||||
| Deep | Minutes | Accumulated quality | Full optimization |
|
||||
|
||||
### 3. Optimized Kernel Layer
|
||||
|
||||
NEON SIMD-optimized kernels for ARM64:
|
||||
|
||||
```
|
||||
+-----------------------------------------------+
|
||||
| Attention Kernels |
|
||||
+-----------------------------------------------+
|
||||
| |
|
||||
| +------------------+ +------------------+ |
|
||||
| | Flash Attention | | Paged Attention | |
|
||||
| | - Tiled QKV | | - Block tables | |
|
||||
| | - Online softmax| | - Non-contiguous| |
|
||||
| | - O(N) memory | | - KV cache aware| |
|
||||
| +------------------+ +------------------+ |
|
||||
| |
|
||||
| +------------------+ +------------------+ |
|
||||
| | Multi-Query (MQA)| | Grouped-Query | |
|
||||
| | - 1 KV head | | - KV groups | |
|
||||
| | - Shared KV | | - 4-8x savings | |
|
||||
| +------------------+ +------------------+ |
|
||||
+-----------------------------------------------+
|
||||
|
||||
+-----------------------------------------------+
|
||||
| Normalization Kernels |
|
||||
+-----------------------------------------------+
|
||||
| +------------------+ +------------------+ |
|
||||
| | RMSNorm | | LayerNorm | |
|
||||
| | - NEON SIMD | | - NEON SIMD | |
|
||||
| | - Fused ops | | - Fused ops | |
|
||||
| +------------------+ +------------------+ |
|
||||
+-----------------------------------------------+
|
||||
|
||||
+-----------------------------------------------+
|
||||
| Embedding Kernels |
|
||||
+-----------------------------------------------+
|
||||
| +------------------+ +------------------+ |
|
||||
| | Rotary Position | | Token Embedding | |
|
||||
| | (RoPE) | | - Lookup table | |
|
||||
| | - Precomputed | | - Batch gather | |
|
||||
| +------------------+ +------------------+ |
|
||||
+-----------------------------------------------+
|
||||
```
|
||||
|
||||
### 4. Memory Management
|
||||
|
||||
Two-tier KV cache for optimal memory/quality tradeoff:
|
||||
|
||||
```
|
||||
+----------------------------------------------------+
|
||||
| Two-Tier KV Cache |
|
||||
+----------------------------------------------------+
|
||||
| |
|
||||
| Position: 0 tail_length max |
|
||||
| +------------------+------------------+ |
|
||||
| | | | |
|
||||
| | Quantized Store | High-Precision | |
|
||||
| | (Cold) | Tail (Hot) | |
|
||||
| | | | |
|
||||
| | - Q4/Q8 format | - FP16 format | |
|
||||
| | - Older tokens | - Recent tokens | |
|
||||
| | - 4x smaller | - Full quality | |
|
||||
| | | | |
|
||||
| +------------------+------------------+ |
|
||||
| |
|
||||
| Migration: Hot -> Cold (when tail_length exceeded)|
|
||||
| Eviction: Cold first, then Hot |
|
||||
+----------------------------------------------------+
|
||||
```
|
||||
|
||||
**Cache Operations:**
|
||||
|
||||
1. **Append**: Add new KV pairs to tail
|
||||
2. **Migrate**: Move old tokens from tail to quantized store
|
||||
3. **Evict**: Remove oldest tokens when max exceeded
|
||||
4. **Attend**: Dequantize cold + use hot for attention
|
||||
|
||||
## Data Flow
|
||||
|
||||
### Inference Pipeline
|
||||
|
||||
```
|
||||
Input Tokens
|
||||
|
|
||||
v
|
||||
+--------------------+
|
||||
| Token Embedding |
|
||||
| + RoPE Position |
|
||||
+--------------------+
|
||||
|
|
||||
v (for each layer)
|
||||
+--------------------+
|
||||
| Attention Layer |
|
||||
| +---------------+|
|
||||
| | Q,K,V Project ||
|
||||
| +---------------+|
|
||||
| | |
|
||||
| +---------------+|
|
||||
| | KV Cache ||
|
||||
| | Update ||
|
||||
| +---------------+|
|
||||
| | |
|
||||
| +---------------+|
|
||||
| | Flash/Paged ||
|
||||
| | Attention ||
|
||||
| +---------------+|
|
||||
| | |
|
||||
| +---------------+|
|
||||
| | Output Proj ||
|
||||
| +---------------+|
|
||||
+--------------------+
|
||||
|
|
||||
v
|
||||
+--------------------+
|
||||
| FFN Layer |
|
||||
| - Gate Proj |
|
||||
| - Up Proj |
|
||||
| - Down Proj |
|
||||
| - Activation |
|
||||
+--------------------+
|
||||
|
|
||||
v
|
||||
+--------------------+
|
||||
| RMSNorm |
|
||||
+--------------------+
|
||||
|
|
||||
v
|
||||
+--------------------+
|
||||
| LM Head |
|
||||
| (final layer) |
|
||||
+--------------------+
|
||||
|
|
||||
v
|
||||
Logits -> Sampling -> Token
|
||||
```
|
||||
|
||||
### Learning Pipeline
|
||||
|
||||
```
|
||||
Request + Response + Feedback
|
||||
|
|
||||
v
|
||||
+---------------------------+
|
||||
| Instant Loop |
|
||||
| - Compute embeddings |
|
||||
| - Apply MicroLoRA |
|
||||
| - Queue for background |
|
||||
+---------------------------+
|
||||
|
|
||||
v (async)
|
||||
+---------------------------+
|
||||
| Background Loop |
|
||||
| - Batch samples |
|
||||
| - Update EWC++ Fisher |
|
||||
| - Merge adapters |
|
||||
| - Store in ReasoningBank |
|
||||
+---------------------------+
|
||||
|
|
||||
v (threshold triggered)
|
||||
+---------------------------+
|
||||
| Deep Loop |
|
||||
| - Full training pipeline |
|
||||
| - Pattern distillation |
|
||||
| - Catastrophic forget |
|
||||
| prevention (EWC++) |
|
||||
+---------------------------+
|
||||
```
|
||||
|
||||
## Module Structure
|
||||
|
||||
```
|
||||
ruvllm/
|
||||
├── src/
|
||||
│ ├── lib.rs # Crate root, re-exports
|
||||
│ ├── error.rs # Error types
|
||||
│ ├── types.rs # Common types (Precision, etc.)
|
||||
│ │
|
||||
│ ├── backends/ # ML framework backends
|
||||
│ │ ├── mod.rs # Backend trait
|
||||
│ │ ├── candle_backend.rs
|
||||
│ │ └── config.rs
|
||||
│ │
|
||||
│ ├── kernels/ # Optimized kernels
|
||||
│ │ ├── mod.rs # Kernel exports
|
||||
│ │ ├── attention.rs # Attention variants
|
||||
│ │ ├── matmul.rs # Matrix multiplication
|
||||
│ │ ├── norm.rs # Normalization ops
|
||||
│ │ └── rope.rs # Rotary embeddings
|
||||
│ │
|
||||
│ ├── lora/ # LoRA adapters
|
||||
│ │ ├── mod.rs # LoRA exports
|
||||
│ │ ├── micro_lora.rs # Real-time MicroLoRA
|
||||
│ │ └── training.rs # Training pipeline
|
||||
│ │
|
||||
│ ├── optimization/ # SONA integration
|
||||
│ │ ├── mod.rs
|
||||
│ │ └── sona_llm.rs # Learning loops
|
||||
│ │
|
||||
│ ├── kv_cache.rs # Two-tier KV cache
|
||||
│ ├── sona.rs # SONA core integration
|
||||
│ ├── policy_store.rs # Learned policies
|
||||
│ └── witness_log.rs # Inference logging
|
||||
│
|
||||
└── benches/ # Benchmarks
|
||||
├── attention_bench.rs
|
||||
├── lora_bench.rs
|
||||
└── e2e_bench.rs
|
||||
```
|
||||
|
||||
## Performance Characteristics
|
||||
|
||||
### Memory Layout
|
||||
|
||||
| Component | Memory Pattern | Optimization |
|
||||
|-----------|---------------|--------------|
|
||||
| KV Cache Tail | Sequential | NEON vectorized |
|
||||
| KV Cache Store | Quantized blocks | Batch dequant |
|
||||
| Model Weights | Memory-mapped | Zero-copy |
|
||||
| Intermediate | Stack allocated | Arena alloc |
|
||||
|
||||
### Throughput Targets (M4 Pro)
|
||||
|
||||
| Operation | Target | Achieved |
|
||||
|-----------|--------|----------|
|
||||
| Flash Attention | 2.5x vs naive | ~2.3x |
|
||||
| Paged Attention | 1.8x vs contiguous | ~1.7x |
|
||||
| GQA vs MHA | 4x less KV memory | 4x |
|
||||
| MicroLoRA adapt | <1ms | ~0.5ms |
|
||||
|
||||
## Integration Points
|
||||
|
||||
### With RuVector Core
|
||||
|
||||
```rust
|
||||
// Memory backend integration
|
||||
use ruvector_core::storage::Storage;
|
||||
|
||||
// SONA learning integration
|
||||
use ruvector_sona::{SonaEngine, ReasoningBank};
|
||||
```
|
||||
|
||||
### With External Systems
|
||||
|
||||
- **HuggingFace Hub**: Model downloads
|
||||
- **OpenAI API**: Compatible inference endpoint
|
||||
- **Prometheus**: Metrics export
|
||||
- **gRPC**: High-performance RPC
|
||||
|
||||
## Future Architecture
|
||||
|
||||
Planned enhancements:
|
||||
|
||||
1. **Speculative Decoding**: Draft model integration
|
||||
2. **Tensor Parallelism**: Multi-GPU support
|
||||
3. **Continuous Batching**: Dynamic batch scheduling
|
||||
4. **PagedAttention v2**: vLLM-style memory management
|
||||
523
vendor/ruvector/docs/ruvllm/FINE_TUNING.md
vendored
Normal file
523
vendor/ruvector/docs/ruvllm/FINE_TUNING.md
vendored
Normal file
@@ -0,0 +1,523 @@
|
||||
# RuvLLM Fine-Tuning Guide
|
||||
|
||||
This guide covers RuvLLM's fine-tuning capabilities, including MicroLoRA for real-time adaptation and EWC++ for preventing catastrophic forgetting.
|
||||
|
||||
## Overview
|
||||
|
||||
RuvLLM provides three levels of fine-tuning:
|
||||
|
||||
| Level | Technique | Latency | Use Case |
|
||||
|-------|-----------|---------|----------|
|
||||
| Instant | MicroLoRA | <1ms | Per-request adaptation |
|
||||
| Background | Adapter Merge + EWC++ | ~100ms | Pattern consolidation |
|
||||
| Deep | Full Training Pipeline | Minutes | Periodic optimization |
|
||||
|
||||
## MicroLoRA: Real-Time Adaptation
|
||||
|
||||
MicroLoRA enables per-request fine-tuning with minimal overhead.
|
||||
|
||||
### How It Works
|
||||
|
||||
```
|
||||
User Request
|
||||
|
|
||||
v
|
||||
+------------------+
|
||||
| Compute Input |
|
||||
| Embedding |
|
||||
+------------------+
|
||||
|
|
||||
v
|
||||
+------------------+ +------------------+
|
||||
| Base Model |--->| MicroLoRA Delta |
|
||||
| Forward Pass | | (rank 1-2) |
|
||||
+------------------+ +------------------+
|
||||
| |
|
||||
+----------+---------------+
|
||||
|
|
||||
v
|
||||
+------------------+
|
||||
| Combined Output |
|
||||
+------------------+
|
||||
|
|
||||
v
|
||||
Response + Quality Feedback
|
||||
|
|
||||
v
|
||||
+------------------+
|
||||
| Update MicroLoRA |
|
||||
| Weights |
|
||||
+------------------+
|
||||
```
|
||||
|
||||
### Basic Usage
|
||||
|
||||
```rust
|
||||
use ruvllm::lora::{MicroLoRA, MicroLoraConfig, AdaptFeedback, TargetModule};
|
||||
|
||||
// Create MicroLoRA for 4096-dim hidden states
|
||||
let config = MicroLoraConfig::for_hidden_dim(4096);
|
||||
let lora = MicroLoRA::new(config);
|
||||
|
||||
// During inference: apply LoRA delta
|
||||
let base_output = model.forward(&input)?;
|
||||
let lora_delta = lora.forward(&input, &TargetModule::QProj);
|
||||
|
||||
// Combine outputs
|
||||
let output: Vec<f32> = base_output.iter()
|
||||
.zip(lora_delta.iter())
|
||||
.map(|(b, d)| b + d)
|
||||
.collect();
|
||||
|
||||
// After response: adapt based on feedback
|
||||
let feedback = AdaptFeedback::from_quality(0.85);
|
||||
lora.adapt(&input, feedback)?;
|
||||
|
||||
// Periodically apply accumulated gradients
|
||||
lora.apply_updates(0.01); // learning rate
|
||||
```
|
||||
|
||||
### Configuration Options
|
||||
|
||||
```rust
|
||||
let config = MicroLoraConfig {
|
||||
// Input/output dimensions (typically hidden_dim)
|
||||
in_features: 4096,
|
||||
out_features: 4096,
|
||||
|
||||
// LoRA rank: 1-2 for micro, 4-8 for standard
|
||||
rank: 2,
|
||||
|
||||
    // Scaling factor applied to the LoRA delta (effective scale = alpha / rank)
|
||||
    alpha: 4.0, // effective scaling factor = alpha / rank
|
||||
|
||||
// Dropout for regularization
|
||||
dropout: 0.0,
|
||||
|
||||
// Which modules to adapt
|
||||
target_modules: vec![
|
||||
TargetModule::QProj,
|
||||
TargetModule::VProj,
|
||||
],
|
||||
|
||||
// Memory optimization
|
||||
gradient_checkpointing: false,
|
||||
};
|
||||
```
|
||||
|
||||
### Target Modules
|
||||
|
||||
Choose which transformer components to adapt:
|
||||
|
||||
| Module | Description | Memory | Impact |
|
||||
|--------|-------------|--------|--------|
|
||||
| `QProj` | Query projection | Low | High (attention focus) |
|
||||
| `KProj` | Key projection | Low | Medium |
|
||||
| `VProj` | Value projection | Low | High (content) |
|
||||
| `OProj` | Output projection | Low | Medium |
|
||||
| `GateProj` | FFN gate | Medium | High (routing) |
|
||||
| `UpProj` | FFN up | High | Medium |
|
||||
| `DownProj` | FFN down | High | Medium |
|
||||
|
||||
**Recommended combinations:**
|
||||
- **Speed-focused**: `QProj` only
|
||||
- **Quality-focused**: `QProj`, `VProj`
|
||||
- **Full adaptation**: All attention projections
|
||||
|
||||
## EWC++ (Elastic Weight Consolidation)
|
||||
|
||||
EWC++ prevents catastrophic forgetting when adapting to new tasks.
|
||||
|
||||
### How It Works
|
||||
|
||||
```
|
||||
Task 1 Training
|
||||
|
|
||||
v
|
||||
+------------------+
|
||||
| Compute Fisher |
|
||||
| Information |
|
||||
| F = E[grad^2] |
|
||||
+------------------+
|
||||
|
|
||||
v
|
||||
+------------------+
|
||||
| Store Optimal |
|
||||
| Weights θ* |
|
||||
+------------------+
|
||||
|
||||
...later...
|
||||
|
||||
Task 2 Training
|
||||
|
|
||||
v
|
||||
+------------------+
|
||||
| Regularized Loss |
|
||||
| L = L_task + |
|
||||
| λ Σ F_i(θ-θ*)² |
|
||||
+------------------+
|
||||
|
|
||||
v
|
||||
+------------------+
|
||||
| Update with |
|
||||
| Importance |
|
||||
| Weights |
|
||||
+------------------+
|
||||
```
|
||||
|
||||
### Using EWC++ with MicroLoRA
|
||||
|
||||
```rust
|
||||
use ruvllm::lora::{MicroLoRA, TrainingPipeline, TrainingConfig};
|
||||
|
||||
// Create training pipeline with EWC++
|
||||
let training_config = TrainingConfig {
|
||||
learning_rate: 0.001,
|
||||
ewc_lambda: 0.1, // Regularization strength
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let mut pipeline = TrainingPipeline::new(training_config);
|
||||
pipeline.init_for_lora(&lora);
|
||||
|
||||
// Train on task 1
|
||||
for sample in task1_samples {
|
||||
pipeline.train_step(&lora, &sample.input, sample.feedback)?;
|
||||
}
|
||||
|
||||
// Mark end of task 1 (computes Fisher information)
|
||||
pipeline.start_new_task(&lora);
|
||||
|
||||
// Train on task 2 (EWC++ regularization active)
|
||||
for sample in task2_samples {
|
||||
pipeline.train_step(&lora, &sample.input, sample.feedback)?;
|
||||
}
|
||||
```
|
||||
|
||||
### EWC++ Configuration
|
||||
|
||||
```rust
|
||||
let config = TrainingConfig {
|
||||
// Base learning rate
|
||||
learning_rate: 0.001,
|
||||
|
||||
// EWC regularization strength
|
||||
// Higher = more preservation of old knowledge
|
||||
// Lower = more adaptation to new tasks
|
||||
ewc_lambda: 0.1,
|
||||
|
||||
// Minimum quality for learning
|
||||
quality_threshold: 0.5,
|
||||
|
||||
// Fisher information estimation samples
|
||||
fisher_samples: 100,
|
||||
|
||||
// Online Fisher update rate
|
||||
online_ewc_gamma: 0.95,
|
||||
};
|
||||
```
|
||||
|
||||
## SONA Learning Loops
|
||||
|
||||
SONA provides automated multi-tier learning.
|
||||
|
||||
### Architecture
|
||||
|
||||
```
|
||||
+-------------------+ +-------------------+
|
||||
| Inference Request |---->| Instant Loop |
|
||||
| + feedback | | - MicroLoRA adapt |
|
||||
+-------------------+ | - <1ms latency |
|
||||
+--------+----------+
|
||||
|
|
||||
v (async, 100ms)
|
||||
+--------+----------+
|
||||
| Background Loop |
|
||||
| - Pattern merge |
|
||||
| - Adapter compose |
|
||||
| - EWC++ update |
|
||||
+--------+----------+
|
||||
|
|
||||
v (triggered)
|
||||
+--------+----------+
|
||||
| Deep Loop |
|
||||
| - Full fine-tune |
|
||||
| - Model distill |
|
||||
| - Pattern bank |
|
||||
+-------------------+
|
||||
```
|
||||
|
||||
### Using SONA
|
||||
|
||||
```rust
|
||||
use ruvllm::optimization::{SonaLlm, SonaLlmConfig};
|
||||
|
||||
// Create SONA integration
|
||||
let config = SonaLlmConfig {
|
||||
instant_lr: 0.01,
|
||||
background_interval_ms: 100,
|
||||
background_min_samples: 10,
|
||||
deep_trigger_threshold: 100.0,
|
||||
consolidation_strategy: ConsolidationStrategy::EwcMerge,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let sona = SonaLlm::new(config);
|
||||
|
||||
// During inference
|
||||
let response = model.generate(&query)?;
|
||||
|
||||
// Record feedback (runs instant loop)
|
||||
let result = sona.instant_adapt(&query, &response, 0.85);
|
||||
println!("Instant adapt latency: {}μs", result.latency_us);
|
||||
|
||||
// Periodically check background loop
|
||||
if let Some(bg_result) = sona.maybe_background() {
|
||||
println!("Background: {} samples, quality delta: {:.3}",
|
||||
bg_result.samples_used, bg_result.quality_delta);
|
||||
}
|
||||
|
||||
// Check if deep loop should trigger
|
||||
if sona.should_trigger_deep() {
|
||||
let samples = collect_training_samples();
|
||||
let deep_result = sona.deep_optimize(&samples);
|
||||
println!("Deep optimization complete");
|
||||
}
|
||||
```
|
||||
|
||||
### Consolidation Strategies
|
||||
|
||||
```rust
|
||||
pub enum ConsolidationStrategy {
|
||||
/// EWC++ merge (default) - preserves important weights
|
||||
EwcMerge,
|
||||
|
||||
/// Simple averaging - fast but may lose specialization
|
||||
Average,
|
||||
|
||||
/// Quality-weighted - higher quality samples have more influence
|
||||
QualityWeighted,
|
||||
|
||||
/// Best only - keep top 20% by quality
|
||||
BestOnly,
|
||||
|
||||
/// Ensemble - maintain multiple adapters
|
||||
Ensemble,
|
||||
}
|
||||
```
|
||||
|
||||
**Recommendations:**
|
||||
- `EwcMerge`: Best for multi-domain use
|
||||
- `QualityWeighted`: Best for quality optimization
|
||||
- `BestOnly`: Best for high-variance feedback
|
||||
- `Ensemble`: Best when you have distinct use cases
|
||||
|
||||
## Training Data Format
|
||||
|
||||
### TrainingSample
|
||||
|
||||
```rust
|
||||
pub struct TrainingSample {
|
||||
/// Input embedding
|
||||
pub input_embedding: Vec<f32>,
|
||||
|
||||
/// Output embedding
|
||||
pub output_embedding: Vec<f32>,
|
||||
|
||||
/// Query text (optional)
|
||||
pub query: Option<String>,
|
||||
|
||||
/// Response text (optional)
|
||||
pub response: Option<String>,
|
||||
|
||||
/// Quality score (0.0 - 1.0)
|
||||
pub quality: f32,
|
||||
|
||||
/// Latency in milliseconds
|
||||
pub latency_ms: f32,
|
||||
|
||||
/// Token count
|
||||
pub token_count: usize,
|
||||
|
||||
/// Session identifier
|
||||
pub session_id: String,
|
||||
}
|
||||
```
|
||||
|
||||
### Creating Training Samples
|
||||
|
||||
```rust
|
||||
let sample = TrainingSample::new(
|
||||
input_embedding,
|
||||
output_embedding,
|
||||
0.9, // quality
|
||||
)
|
||||
.with_query("What is machine learning?".to_string())
|
||||
.with_response("Machine learning is...".to_string())
|
||||
.with_latency(150.0) // ms
|
||||
.with_session("session-123".to_string());
|
||||
```
|
||||
|
||||
## Adapter Management
|
||||
|
||||
### Saving and Loading Adapters
|
||||
|
||||
```rust
|
||||
// Save adapter state
|
||||
let adapter_bytes = lora.export_weights()?;
|
||||
std::fs::write("adapter.bin", &adapter_bytes)?;
|
||||
|
||||
// Load adapter state
|
||||
let adapter_bytes = std::fs::read("adapter.bin")?;
|
||||
lora.import_weights(&adapter_bytes)?;
|
||||
```
|
||||
|
||||
### Merging Adapters
|
||||
|
||||
```rust
|
||||
// Merge multiple adapters with weights
|
||||
let adapters = vec![
|
||||
(adapter1, 0.6), // 60% weight
|
||||
(adapter2, 0.4), // 40% weight
|
||||
];
|
||||
|
||||
let merged = MicroLoRA::merge_adapters(&adapters)?;
|
||||
```
|
||||
|
||||
### Adapter Composition
|
||||
|
||||
```rust
|
||||
// Sequential composition: adapter1 -> adapter2
|
||||
let composed = MicroLoRA::compose_sequential(&[adapter1, adapter2])?;
|
||||
|
||||
// Parallel composition: average outputs
|
||||
let composed = MicroLoRA::compose_parallel(&[adapter1, adapter2])?;
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
### 1. Quality Threshold Selection
|
||||
|
||||
```rust
|
||||
let config = TrainingConfig {
|
||||
// Too low: learns from poor examples
|
||||
// Too high: learns very slowly
|
||||
// Recommended: 0.5 - 0.7
|
||||
quality_threshold: 0.6,
|
||||
..Default::default()
|
||||
};
|
||||
```
|
||||
|
||||
### 2. Learning Rate Scheduling
|
||||
|
||||
```rust
|
||||
// Start high for quick adaptation
|
||||
let initial_lr = 0.01;
|
||||
|
||||
// Reduce over time for stability
|
||||
let decay_lr = |epoch: usize| -> f32 {
|
||||
initial_lr * 0.95_f32.powi(epoch as i32)
|
||||
};
|
||||
```
|
||||
|
||||
### 3. Memory Management
|
||||
|
||||
```rust
|
||||
// For memory-constrained environments
|
||||
let config = MicroLoraConfig {
|
||||
rank: 1, // Minimum rank
|
||||
target_modules: vec![TargetModule::QProj], // Single module
|
||||
gradient_checkpointing: true,
|
||||
..Default::default()
|
||||
};
|
||||
```
|
||||
|
||||
### 4. Preventing Overfitting
|
||||
|
||||
```rust
|
||||
let config = MicroLoraConfig {
|
||||
dropout: 0.1, // Add regularization
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let training_config = TrainingConfig {
|
||||
ewc_lambda: 0.5, // Strong regularization
|
||||
..Default::default()
|
||||
};
|
||||
```
|
||||
|
||||
## Monitoring and Debugging
|
||||
|
||||
### Statistics
|
||||
|
||||
```rust
|
||||
let stats = sona.stats();
|
||||
println!("Learning Statistics:");
|
||||
println!(" Instant updates: {}", stats.instant_count);
|
||||
println!(" Avg instant latency: {:.2}μs", stats.instant_avg_latency_us);
|
||||
println!(" Background updates: {}", stats.background_count);
|
||||
println!(" Pending samples: {}", stats.pending_samples);
|
||||
println!(" Accumulated quality: {:.2}", stats.accumulated_quality);
|
||||
```
|
||||
|
||||
### Debugging Adaptation
|
||||
|
||||
```rust
|
||||
// Enable debug logging
|
||||
std::env::set_var("RUST_LOG", "ruvllm::lora=debug");
|
||||
|
||||
// Check adaptation result
|
||||
let result = sona.instant_adapt(&query, &response, feedback);
|
||||
if !result.applied {
|
||||
println!("Adaptation skipped: {:?}", result.notes);
|
||||
}
|
||||
```
|
||||
|
||||
## Performance Tuning
|
||||
|
||||
### Latency Optimization
|
||||
|
||||
| Setting | Low Latency | Balanced | High Quality |
|
||||
|---------|-------------|----------|--------------|
|
||||
| LoRA rank | 1 | 2 | 4 |
|
||||
| Target modules | 1 | 2 | 4 |
|
||||
| Background interval | 200ms | 100ms | 50ms |
|
||||
| EWC lambda | 0.0 | 0.1 | 0.5 |
|
||||
|
||||
### Memory Optimization
|
||||
|
||||
```rust
|
||||
// Minimal memory footprint
|
||||
let config = SonaLlmConfig {
|
||||
max_pending_samples: 100, // Reduce buffer
|
||||
micro_lora: MicroLoraConfig {
|
||||
rank: 1,
|
||||
target_modules: vec![TargetModule::QProj],
|
||||
..Default::default()
|
||||
},
|
||||
..Default::default()
|
||||
};
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Adaptation Not Improving
|
||||
|
||||
1. Check quality threshold isn't too high
|
||||
2. Verify feedback is meaningful (not always same value)
|
||||
3. Increase learning rate
|
||||
4. Try different target modules
|
||||
|
||||
### Catastrophic Forgetting
|
||||
|
||||
1. Increase EWC lambda
|
||||
2. Use `EwcMerge` consolidation strategy
|
||||
3. Reduce learning rate
|
||||
4. Add more diverse training data
|
||||
|
||||
### High Latency
|
||||
|
||||
1. Reduce LoRA rank to 1
|
||||
2. Reduce target modules
|
||||
3. Increase background interval
|
||||
4. Use `gradient_checkpointing`
|
||||
521
vendor/ruvector/docs/ruvllm/OPTIMIZATION.md
vendored
Normal file
521
vendor/ruvector/docs/ruvllm/OPTIMIZATION.md
vendored
Normal file
@@ -0,0 +1,521 @@
|
||||
# RuvLLM Optimization Guide (v2.0.0)
|
||||
|
||||
This guide covers performance optimization strategies for RuvLLM, including SONA learning loops, batch sizing, KV cache management, and hardware-specific tuning.
|
||||
|
||||
## v2.0.0 Performance Highlights
|
||||
|
||||
| Feature | Improvement | Notes |
|
||||
|---------|-------------|-------|
|
||||
| Multi-threaded GEMM | 12.7x speedup | Rayon on M4 Pro 10-core |
|
||||
| Flash Attention 2 | +10% throughput | Auto block sizing |
|
||||
| Quantized Inference | 4-8x memory | INT8/INT4/Q4_K |
|
||||
| Metal GPU | 3x speedup | simdgroup_matrix |
|
||||
| Memory Pool | Zero-alloc | Arena allocator |
|
||||
|
||||
## Performance Overview
|
||||
|
||||
### Key Metrics
|
||||
|
||||
| Metric | Target (M4 Pro) | Achieved (v2.0.0) | Description |
|
||||
|--------|-----------------|-------------------|-------------|
|
||||
| Prefill | >2000 tok/s | 3500 tok/s | Processing input tokens |
|
||||
| Decode | >80 tok/s | 120 tok/s | Generating output tokens |
|
||||
| TTFT | <50ms | 35ms | Time to first token |
|
||||
| Memory | <8GB for 7B | 3.4GB (Q4K) | Peak memory usage |
|
||||
| MicroLoRA | <1ms | 8.56μs | Per-request adaptation |
|
||||
|
||||
### Architecture Impact
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────┐
|
||||
│ Optimization Layers │
|
||||
├─────────────────────────────────────────────────────────┤
|
||||
│ SONA Learning │ Real-time adaptation, routing │
|
||||
├─────────────────────────────────────────────────────────┤
|
||||
│ Attention │ Flash, Paged, GQA - 2-4x speedup │
|
||||
├─────────────────────────────────────────────────────────┤
|
||||
│ KV Cache │ Two-tier, quantized - 4x memory │
|
||||
├─────────────────────────────────────────────────────────┤
|
||||
│ Quantization │ Q4K, Q8 - 4-8x smaller │
|
||||
├─────────────────────────────────────────────────────────┤
|
||||
│ SIMD/GPU │ NEON, Metal - hardware accel │
|
||||
└─────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## SONA Learning Optimization
|
||||
|
||||
### Instant Loop Tuning
|
||||
|
||||
The instant loop runs per-request with <1ms target latency.
|
||||
|
||||
```rust
|
||||
let config = SonaLlmConfig {
|
||||
// Learning rate for instant updates
|
||||
// Higher = faster adaptation, more variance
|
||||
// Lower = slower adaptation, more stable
|
||||
instant_lr: 0.01,
|
||||
|
||||
// Quality threshold - skip low-quality samples
|
||||
training: TrainingConfig {
|
||||
quality_threshold: 0.5, // 0.0-1.0
|
||||
..Default::default()
|
||||
},
|
||||
..Default::default()
|
||||
};
|
||||
```
|
||||
|
||||
**Tuning Guidelines:**
|
||||
|
||||
| Use Case | instant_lr | quality_threshold |
|
||||
|----------|------------|-------------------|
|
||||
| High variance tasks | 0.005 | 0.7 |
|
||||
| Stable domains | 0.02 | 0.3 |
|
||||
| User personalization | 0.01 | 0.5 |
|
||||
|
||||
### Background Loop Tuning
|
||||
|
||||
Consolidates patterns without blocking inference.
|
||||
|
||||
```rust
|
||||
let config = SonaLlmConfig {
|
||||
// How often to run (milliseconds)
|
||||
background_interval_ms: 100,
|
||||
|
||||
// Minimum samples before consolidation
|
||||
background_min_samples: 10,
|
||||
|
||||
// Maximum pending (triggers forced consolidation)
|
||||
max_pending_samples: 1000,
|
||||
|
||||
// Consolidation strategy
|
||||
consolidation_strategy: ConsolidationStrategy::EwcMerge,
|
||||
..Default::default()
|
||||
};
|
||||
```
|
||||
|
||||
**Tuning Guidelines:**
|
||||
|
||||
| Priority | interval_ms | min_samples | Strategy |
|
||||
|----------|-------------|-------------|----------|
|
||||
| Latency | 200 | 20 | Average |
|
||||
| Quality | 50 | 5 | EwcMerge |
|
||||
| Memory | 100 | 50 | BestOnly |
|
||||
|
||||
### Deep Loop Optimization
|
||||
|
||||
Triggered periodically for full optimization.
|
||||
|
||||
```rust
|
||||
let config = SonaLlmConfig {
|
||||
// Accumulated quality threshold to trigger
|
||||
deep_trigger_threshold: 100.0,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
// Manual trigger for scheduled optimization
|
||||
if sona.should_trigger_deep() || is_scheduled_time() {
|
||||
let samples = collect_high_quality_samples();
|
||||
let result = sona.deep_optimize(&samples);
|
||||
|
||||
// Log improvement
|
||||
println!("Deep optimization: quality delta = {:.3}", result.quality_delta);
|
||||
}
|
||||
```
|
||||
|
||||
## Batch Size Optimization
|
||||
|
||||
### Dynamic Batching
|
||||
|
||||
```rust
|
||||
// Optimal batch sizes vary by operation
|
||||
struct BatchConfig {
|
||||
prefill_batch: usize, // Process multiple prompts together
|
||||
decode_batch: usize, // Parallel token generation
|
||||
lora_batch: usize, // LoRA adaptation batch
|
||||
}
|
||||
|
||||
impl BatchConfig {
|
||||
fn for_memory(available_gb: f32) -> Self {
|
||||
match available_gb {
|
||||
x if x < 8.0 => Self {
|
||||
prefill_batch: 1,
|
||||
decode_batch: 4,
|
||||
lora_batch: 16,
|
||||
},
|
||||
x if x < 16.0 => Self {
|
||||
prefill_batch: 2,
|
||||
decode_batch: 8,
|
||||
lora_batch: 32,
|
||||
},
|
||||
_ => Self {
|
||||
prefill_batch: 4,
|
||||
decode_batch: 16,
|
||||
lora_batch: 64,
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Batch Size Impact
|
||||
|
||||
| Batch Size | Throughput | Latency | Memory |
|
||||
|------------|------------|---------|--------|
|
||||
| 1 | Low | Lowest | Lowest |
|
||||
| 4 | Medium | Low | Medium |
|
||||
| 8 | High | Medium | High |
|
||||
| 16+ | Highest | Higher | Highest |
|
||||
|
||||
**Rule of thumb:** Increase batch size until memory pressure or latency constraints are hit.
|
||||
|
||||
## KV Cache Optimization
|
||||
|
||||
### Two-Tier Configuration
|
||||
|
||||
```rust
|
||||
let config = KvCacheConfig {
|
||||
// Tokens in high-precision tail
|
||||
// More = better attention quality for recent context
|
||||
// Less = less memory usage
|
||||
tail_length: 256,
|
||||
|
||||
// Tail precision (FP16 recommended)
|
||||
tail_precision: Precision::FP16,
|
||||
|
||||
// Store precision (Q4 for 4x compression)
|
||||
store_precision: Precision::Q4,
|
||||
|
||||
// Maximum context length
|
||||
max_tokens: 4096,
|
||||
|
||||
// KV heads (depends on model architecture)
|
||||
num_kv_heads: 8,
|
||||
head_dim: 128,
|
||||
|
||||
// Batch size for migration (affects latency spikes)
|
||||
migration_batch: 64,
|
||||
};
|
||||
```
|
||||
|
||||
### Memory Calculation
|
||||
|
||||
```
|
||||
KV Cache Memory = num_layers * 2 * max_tokens * num_kv_heads * head_dim * bytes_per_element
|
||||
|
||||
Example (Qwen2.5-7B with 4096 context):
|
||||
- Layers: 32
|
||||
- KV heads: 8
|
||||
- Head dim: 128
|
||||
- FP16 tail (256 tokens): 32 * 2 * 256 * 8 * 128 * 2 = 33.5 MB
|
||||
- Q4 store (3840 tokens): 32 * 2 * 3840 * 8 * 128 * 0.5 = 125.8 MB
|
||||
- Total: ~160 MB (vs ~537 MB for full FP16)
|
||||
```
|
||||
|
||||
### Cache Strategies by Use Case
|
||||
|
||||
| Use Case | tail_length | store_precision | max_tokens |
|
||||
|----------|-------------|-----------------|------------|
|
||||
| Chat (short) | 128 | Q8 | 2048 |
|
||||
| Chat (long) | 256 | Q4 | 8192 |
|
||||
| Document QA | 512 | Q4 | 16384 |
|
||||
| Code completion | 128 | Q8 | 4096 |
|
||||
|
||||
## Attention Optimization
|
||||
|
||||
### Grouped-Query Attention (GQA)
|
||||
|
||||
```rust
|
||||
let config = AttentionConfig {
|
||||
num_heads: 32, // Query heads
|
||||
num_kv_heads: 8, // KV heads (4:1 ratio)
|
||||
head_dim: 128,
|
||||
causal: true,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
// GQA ratio determines memory savings
|
||||
// 4:1 = ~4x KV cache reduction
|
||||
// 8:1 = ~8x KV cache reduction
|
||||
assert_eq!(config.gqa_ratio(), 4);
|
||||
```
|
||||
|
||||
### Flash Attention Optimization
|
||||
|
||||
```rust
|
||||
// Flash Attention is memory-efficient but has setup overhead
|
||||
// Best for: longer sequences (>256 tokens)
|
||||
|
||||
// For short sequences, standard attention may be faster
|
||||
let use_flash = sequence_length > 256;
|
||||
|
||||
if use_flash {
|
||||
let output = flash_attention_neon(&query, &key, &value, scale, causal);
|
||||
} else {
|
||||
let output = standard_attention(&query, &key, &value, scale, causal);
|
||||
}
|
||||
```
|
||||
|
||||
### Paged Attention for Inference
|
||||
|
||||
```rust
|
||||
// Paged attention enables non-contiguous KV cache
|
||||
// Best for: long-running inference with variable context
|
||||
|
||||
let mut cache = PagedKvCache::new(
|
||||
16, // block_size: tokens per block
|
||||
8, // num_kv_heads
|
||||
128, // head_dim
|
||||
);
|
||||
|
||||
// Append incrementally
|
||||
for token in tokens {
|
||||
let (k, v) = compute_kv(token)?;
|
||||
cache.append(&k, &v);
|
||||
}
|
||||
|
||||
// Efficient attention over paged cache
|
||||
let output = paged_attention_neon(&query, &cache, &block_tables, scale);
|
||||
```
|
||||
|
||||
## Quantization Optimization
|
||||
|
||||
### Model Quantization
|
||||
|
||||
| Precision | Memory | Quality | Speed |
|
||||
|-----------|--------|---------|-------|
|
||||
| FP32 | 4x | Best | Slowest |
|
||||
| FP16 | 2x | Excellent | Fast |
|
||||
| Q8 | 1x | Very Good | Faster |
|
||||
| Q4K | 0.5x | Good | Fastest |
|
||||
| Q4 | 0.5x | Acceptable | Fastest |
|
||||
|
||||
**Recommendations:**
|
||||
|
||||
```rust
|
||||
// High quality (16GB+ RAM)
|
||||
let config = ModelConfig {
|
||||
quantization: Precision::Q8,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
// Balanced (8-16GB RAM)
|
||||
let config = ModelConfig {
|
||||
quantization: Precision::Q4K, // K-quant preserves quality
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
// Memory constrained (<8GB RAM)
|
||||
let config = ModelConfig {
|
||||
quantization: Precision::Q4,
|
||||
..Default::default()
|
||||
};
|
||||
```
|
||||
|
||||
### KV Cache Quantization
|
||||
|
||||
```rust
|
||||
// Hybrid quantization: recent tokens in high precision
|
||||
let config = KvCacheConfig {
|
||||
tail_length: 256, // Recent: FP16
|
||||
tail_precision: Precision::FP16,
|
||||
store_precision: Precision::Q4, // Older: Q4
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
// Quality impact by position
|
||||
// Position 0-256 (tail): Full quality
|
||||
// Position 256+: ~95% quality with Q4
|
||||
```
|
||||
|
||||
## Hardware-Specific Optimization
|
||||
|
||||
### Apple Silicon (M1/M2/M3/M4)
|
||||
|
||||
```rust
|
||||
// Metal backend for GPU acceleration
|
||||
let backend = CandleBackend::with_device(DeviceType::Metal)?;
|
||||
|
||||
// Optimize for unified memory
|
||||
let config = ModelConfig {
|
||||
// Unified memory = larger KV cache possible
|
||||
kv_cache_config: KvCacheConfig {
|
||||
max_tokens: 8192, // Can be larger on M-series
|
||||
..Default::default()
|
||||
},
|
||||
..Default::default()
|
||||
};
|
||||
```
|
||||
|
||||
**M4 Pro Specific:**
|
||||
- Use `metal` feature for GPU acceleration
|
||||
- NEON SIMD enabled by default
|
||||
- Leverage unified memory for larger context
|
||||
|
||||
### NVIDIA GPUs
|
||||
|
||||
```rust
|
||||
// CUDA backend
|
||||
let backend = CandleBackend::with_device(DeviceType::Cuda(0))?;
|
||||
|
||||
// Optimize for separate VRAM
|
||||
let config = ModelConfig {
|
||||
kv_cache_config: KvCacheConfig {
|
||||
// Conservative: VRAM is limited
|
||||
max_tokens: 4096,
|
||||
..Default::default()
|
||||
},
|
||||
..Default::default()
|
||||
};
|
||||
```
|
||||
|
||||
### CPU Fallback
|
||||
|
||||
```rust
|
||||
// CPU with SIMD optimization
|
||||
let backend = CandleBackend::with_device(DeviceType::Cpu)?;
|
||||
|
||||
// Reduce memory pressure
|
||||
let config = ModelConfig {
|
||||
quantization: Precision::Q4,
|
||||
kv_cache_config: KvCacheConfig {
|
||||
tail_length: 128,
|
||||
max_tokens: 2048,
|
||||
..Default::default()
|
||||
},
|
||||
..Default::default()
|
||||
};
|
||||
```
|
||||
|
||||
## Real-Time Optimization
|
||||
|
||||
### Adaptive Optimization
|
||||
|
||||
```rust
|
||||
use ruvllm::optimization::{RealTimeOptimizer, OptimizerConfig};
|
||||
|
||||
let optimizer = RealTimeOptimizer::new(OptimizerConfig {
|
||||
target_latency_ms: 100.0,
|
||||
min_throughput: 50.0, // tokens/sec
|
||||
memory_threshold: 0.9, // 90% of available
|
||||
});
|
||||
|
||||
// Optimizer adjusts parameters in real-time
|
||||
loop {
|
||||
let metrics = backend.get_metrics();
|
||||
let adjustments = optimizer.recommend(&metrics);
|
||||
|
||||
if adjustments.reduce_batch_size {
|
||||
config.batch_size -= 1;
|
||||
}
|
||||
if adjustments.increase_quantization {
|
||||
config.kv_cache_config.store_precision = Precision::Q4;
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Latency Monitoring
|
||||
|
||||
```rust
|
||||
// Track latency components
|
||||
struct LatencyBreakdown {
|
||||
tokenization_us: u64,
|
||||
prefill_us: u64,
|
||||
decode_us: u64,
|
||||
sampling_us: u64,
|
||||
lora_us: u64,
|
||||
}
|
||||
|
||||
impl LatencyBreakdown {
|
||||
fn total_ms(&self) -> f64 {
|
||||
(self.tokenization_us + self.prefill_us +
|
||||
self.decode_us + self.sampling_us + self.lora_us) as f64 / 1000.0
|
||||
}
|
||||
|
||||
fn bottleneck(&self) -> &str {
|
||||
let max = [
|
||||
(self.tokenization_us, "tokenization"),
|
||||
(self.prefill_us, "prefill"),
|
||||
(self.decode_us, "decode"),
|
||||
(self.sampling_us, "sampling"),
|
||||
(self.lora_us, "lora"),
|
||||
].into_iter().max_by_key(|(v, _)| *v).unwrap();
|
||||
max.1
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Benchmarking
|
||||
|
||||
### Running Benchmarks
|
||||
|
||||
```bash
|
||||
# All benchmarks
|
||||
cargo bench
|
||||
|
||||
# Specific benchmarks
|
||||
cargo bench --bench attention_bench
|
||||
cargo bench --bench lora_bench
|
||||
cargo bench --bench e2e_bench
|
||||
|
||||
# With specific features
|
||||
cargo bench --features metal
|
||||
cargo bench --features cuda
|
||||
```
|
||||
|
||||
### Custom Benchmarks
|
||||
|
||||
```rust
|
||||
use criterion::{black_box, criterion_group, criterion_main, Criterion};
|
||||
use ruvllm::kernels::attention::flash_attention_neon;
|
||||
|
||||
fn bench_attention(c: &mut Criterion) {
|
||||
let query = vec![0.1f32; 128];
|
||||
let key = vec![0.1f32; 512 * 128];
|
||||
let value = vec![0.1f32; 512 * 128];
|
||||
let scale = 1.0 / 128.0_f32.sqrt();
|
||||
|
||||
c.bench_function("flash_attention_512", |b| {
|
||||
b.iter(|| {
|
||||
flash_attention_neon(
|
||||
black_box(&query),
|
||||
black_box(&key),
|
||||
black_box(&value),
|
||||
scale,
|
||||
true,
|
||||
)
|
||||
})
|
||||
});
|
||||
}
|
||||
|
||||
criterion_group!(benches, bench_attention);
|
||||
criterion_main!(benches);
|
||||
```
|
||||
|
||||
## Optimization Checklist
|
||||
|
||||
### Before Deployment
|
||||
|
||||
- [ ] Choose appropriate quantization (Q4K for most cases)
|
||||
- [ ] Configure KV cache for expected context length
|
||||
- [ ] Enable GQA if model supports it
|
||||
- [ ] Set appropriate batch sizes for memory
|
||||
- [ ] Configure SONA learning rates
|
||||
- [ ] Test with representative workloads
|
||||
|
||||
### Monitoring
|
||||
|
||||
- [ ] Track prefill and decode throughput
|
||||
- [ ] Monitor memory usage over time
|
||||
- [ ] Log KV cache hit rates
|
||||
- [ ] Track SONA learning metrics
|
||||
- [ ] Alert on latency spikes
|
||||
|
||||
### Troubleshooting
|
||||
|
||||
| Symptom | Likely Cause | Solution |
|
||||
|---------|--------------|----------|
|
||||
| High latency | Batch too large | Reduce batch size |
|
||||
| OOM errors | KV cache too large | Reduce max_tokens or use Q4 |
|
||||
| Quality degradation | Over-quantization | Use Q8 instead of Q4 |
|
||||
| Slow adaptation | Learning rate too low | Increase instant_lr |
|
||||
| Forgetting | EWC lambda too low | Increase ewc_lambda |
|
||||
417
vendor/ruvector/docs/ruvllm/ruvltra-medium.md
vendored
Normal file
417
vendor/ruvector/docs/ruvllm/ruvltra-medium.md
vendored
Normal file
@@ -0,0 +1,417 @@
|
||||
# RuvLTRA-Medium: 3B Parameter Model Architecture
|
||||
|
||||
## Overview
|
||||
|
||||
RuvLTRA-Medium is a 3 billion parameter language model based on the Qwen2.5-3B-Instruct architecture, enhanced with advanced learning capabilities and optimized for Apple Silicon and modern GPU acceleration.
|
||||
|
||||
## Architecture Specifications
|
||||
|
||||
### Model Configuration
|
||||
|
||||
| Parameter | Value | Description |
|
||||
|-----------|-------|-------------|
|
||||
| **Total Parameters** | ~3.0B | Full model size |
|
||||
| **Hidden Size** | 2048 | Embedding dimension |
|
||||
| **Layers** | 32 | Transformer decoder layers |
|
||||
| **Attention Heads** | 16 | Query heads |
|
||||
| **KV Heads** | 2 | Key-value heads (GQA) |
|
||||
| **GQA Ratio** | 8:1 | Grouped Query Attention ratio |
|
||||
| **Head Dimension** | 128 | Per-head dimension |
|
||||
| **Intermediate Size** | 11008 | MLP hidden dimension |
|
||||
| **Vocabulary Size** | 151936 | Qwen tokenizer |
|
||||
| **Context Length** | 32768 | Maximum sequence length |
|
||||
| **RoPE Theta** | 1,000,000 | RoPE base frequency |
|
||||
|
||||
### Quantization Options
|
||||
|
||||
| Format | Model Size | Quality | Speed | Recommended Use |
|
||||
|--------|-----------|---------|-------|-----------------|
|
||||
| **Q4_K_M** | ~2.0 GB | Good | Fast | Production inference |
|
||||
| **Q5_K_M** | ~2.5 GB | Better | Medium | Balanced quality/speed |
|
||||
| **Q8_0** | ~3.5 GB | Best | Slower | Maximum quality |
|
||||
| **Mixed** | ~2.8 GB | Excellent | Medium | FP16 attn + Q4 MLP |
|
||||
|
||||
## Model Variants
|
||||
|
||||
### 1. RuvLTRA-Medium-Base
|
||||
|
||||
General-purpose model for diverse tasks.
|
||||
|
||||
**Configuration:**
|
||||
```rust
|
||||
let config = RuvLtraMediumConfig::base();
|
||||
```
|
||||
|
||||
**Characteristics:**
|
||||
- Temperature: 0.7
|
||||
- Top-p: 0.9
|
||||
- SONA hooks: Layers 8, 16, 24
|
||||
- Pattern capacity: 50,000
|
||||
|
||||
**Use Cases:**
|
||||
- General conversation
|
||||
- Text completion
|
||||
- Summarization
|
||||
- Question answering
|
||||
|
||||
### 2. RuvLTRA-Medium-Coder
|
||||
|
||||
Optimized for code generation and analysis.
|
||||
|
||||
**Configuration:**
|
||||
```rust
|
||||
let config = RuvLtraMediumConfig::coder();
|
||||
```
|
||||
|
||||
**Characteristics:**
|
||||
- Temperature: 0.2 (deterministic)
|
||||
- Top-p: 0.95
|
||||
- SONA hooks: Layers 8, 16, 24, 28 (extra late-layer)
|
||||
- Pattern capacity: 100,000
|
||||
- Quality threshold: 0.7 (stricter)
|
||||
|
||||
**Use Cases:**
|
||||
- Code completion
|
||||
- Bug fixing
|
||||
- Code refactoring
|
||||
- API generation
|
||||
|
||||
### 3. RuvLTRA-Medium-Agent
|
||||
|
||||
Routing and planning optimized for agent systems.
|
||||
|
||||
**Configuration:**
|
||||
```rust
|
||||
let config = RuvLtraMediumConfig::agent();
|
||||
```
|
||||
|
||||
**Characteristics:**
|
||||
- Temperature: 0.3
|
||||
- Top-p: 0.85
|
||||
- SONA hooks: Layers 8, 16, 24
|
||||
- HNSW M: 32 (higher connectivity)
|
||||
- HNSW ef_construction: 400
|
||||
- Micro-LoRA rank: 2 (low latency)
|
||||
|
||||
**Use Cases:**
|
||||
- Claude Flow agent routing
|
||||
- Task planning
|
||||
- Decision making
|
||||
- Multi-agent coordination
|
||||
|
||||
## RuvLTRA Enhancements
|
||||
|
||||
### 1. SONA Learning Hooks
|
||||
|
||||
SONA (Self-Optimizing Neural Architecture) hooks enable continuous learning during inference.
|
||||
|
||||
**Hook Layers:**
|
||||
- **Layer 8**: Early pattern recognition (shallow semantics)
|
||||
- **Layer 16**: Mid-layer semantic extraction (concepts)
|
||||
- **Layer 24**: Deep reasoning capture (abstract thinking)
|
||||
|
||||
**Implementation:**
|
||||
```rust
|
||||
let config = RuvLtraMediumConfig::base();
|
||||
let mut model = RuvLtraMediumModel::new(&config)?;
|
||||
|
||||
// Enable custom hook layers
|
||||
model.enable_sona_with_hooks(&[8, 16, 24])?;
|
||||
```
|
||||
|
||||
**Learning Loop:**
|
||||
1. **Instant Loop**: Ring buffer with MicroLoRA (rank 4)
|
||||
2. **Background Loop**: Router training with EWC++ Fisher
|
||||
3. **Deep Loop**: Pattern bank consolidation
|
||||
|
||||
### 2. HNSW Routing Integration
|
||||
|
||||
HNSW (Hierarchical Navigable Small World) enables fast agent routing.
|
||||
|
||||
**Configuration:**
|
||||
```rust
|
||||
let config = RuvLtraMediumConfig::agent();
|
||||
assert_eq!(config.sona_hooks.hnsw_m, 32);
|
||||
assert_eq!(config.sona_hooks.hnsw_ef_construction, 400);
|
||||
```
|
||||
|
||||
**Performance:**
|
||||
- Search: 150x-12,500x faster than brute-force
|
||||
- Insertion: O(log n) complexity
|
||||
- Memory: ~4 bytes per node per connection
|
||||
|
||||
### 3. Claude Flow Agent Embeddings
|
||||
|
||||
Integration with Claude Flow for intelligent task routing.
|
||||
|
||||
**Features:**
|
||||
- Agent type classification
|
||||
- Task complexity estimation
|
||||
- Quality prediction
|
||||
- Trajectory recording
|
||||
|
||||
**Usage:**
|
||||
```rust
|
||||
let mut config = RuvLtraMediumConfig::agent();
|
||||
config.enable_agent_routing = true;
|
||||
|
||||
let model = RuvLtraMediumModel::new(&config)?;
|
||||
// Model automatically records trajectories for routing
|
||||
```
|
||||
|
||||
### 4. ReasoningBank Trajectory Storage
|
||||
|
||||
Stores successful reasoning patterns for future retrieval.
|
||||
|
||||
**Storage Format:**
|
||||
- State-action pairs
|
||||
- Quality scores (0.0-1.0)
|
||||
- Contextual embeddings
|
||||
- Temporal metadata
|
||||
|
||||
**Configuration:**
|
||||
```rust
|
||||
let mut config = RuvLtraMediumConfig::base();
|
||||
config.enable_reasoning_bank = true;
|
||||
config.sona_config.pattern_capacity = 50000;
|
||||
```
|
||||
|
||||
## Memory Optimization
|
||||
|
||||
### 1. Paged KV Cache
|
||||
|
||||
Efficient memory management for attention computation.
|
||||
|
||||
**Block Size:** 64 tokens per page
|
||||
|
||||
**Benefits:**
|
||||
- 40-60% memory reduction
|
||||
- Dynamic sequence handling
|
||||
- Copy-on-write semantics
|
||||
- Efficient prefix caching
|
||||
|
||||
**Configuration:**
|
||||
```rust
|
||||
let config = RuvLtraMediumConfig::base();
|
||||
assert!(config.use_paged_attention);
|
||||
assert_eq!(config.paged_config.page_size, 64);
|
||||
```
|
||||
|
||||
### 2. Flash Attention 2
|
||||
|
||||
Optimized attention kernel for 2.49x-7.47x speedup.
|
||||
|
||||
**Algorithm:**
|
||||
- Tiled computation
|
||||
- Recomputation on-the-fly
|
||||
- IO-aware optimization
|
||||
- Causal masking
|
||||
|
||||
**Performance:**
|
||||
| Sequence Length | Speedup | Memory Savings |
|
||||
|-----------------|---------|----------------|
|
||||
| 2K tokens | 2.5x | 30% |
|
||||
| 8K tokens | 4.2x | 50% |
|
||||
| 32K tokens | 7.1x | 70% |
|
||||
|
||||
### 3. Speculative Decoding
|
||||
|
||||
Uses RuvLTRA-Small (0.5B) as draft model for 2-3x speedup.
|
||||
|
||||
**Configuration:**
|
||||
```rust
|
||||
let mut config = RuvLtraMediumConfig::base();
|
||||
config.use_speculative_decoding = true;
|
||||
config.speculative_config.lookahead = 4;
|
||||
config.draft_model_path = Some("models/ruvltra-small-q4.gguf".into());
|
||||
```
|
||||
|
||||
**Parameters:**
|
||||
- Lookahead: 4 tokens (default)
|
||||
- Acceptance threshold: 0.7
|
||||
- Draft temperature: 0.0 (greedy)
|
||||
- Adaptive lookahead: enabled
|
||||
|
||||
**Expected Speedup:**
|
||||
| Temperature | Speedup |
|
||||
|-------------|---------|
|
||||
| 0.0 (greedy) | 2.8-3.2x |
|
||||
| 0.5 | 2.2-2.6x |
|
||||
| 1.0 | 1.5-1.8x |
|
||||
|
||||
## Usage Examples
|
||||
|
||||
### Basic Inference
|
||||
|
||||
```rust
|
||||
use ruvllm::models::ruvltra_medium::{RuvLtraMediumConfig, RuvLtraMediumModel};
|
||||
|
||||
// Create model
|
||||
let config = RuvLtraMediumConfig::base();
|
||||
let mut model = RuvLtraMediumModel::new(&config)?;
|
||||
|
||||
// Tokenize input
|
||||
let input_ids = vec![151643, 9521, 11, 1917]; // "Hello, world"
|
||||
let positions = (0..input_ids.len()).collect::<Vec<_>>();
|
||||
|
||||
// Run inference
|
||||
let logits = model.forward(&input_ids, &positions)?;
|
||||
|
||||
// Get next token
|
||||
let next_token = argmax(&logits[logits.len() - config.vocab_size..]);
|
||||
```
|
||||
|
||||
### Code Generation (Coder Variant)
|
||||
|
||||
```rust
|
||||
let config = RuvLtraMediumConfig::coder();
|
||||
let mut model = RuvLtraMediumModel::new(&config)?;
|
||||
|
||||
// Enable SONA hooks for learning
|
||||
model.enable_sona_with_hooks(&[8, 16, 24, 28])?;
|
||||
|
||||
// Generate code
|
||||
let prompt = "fn fibonacci(n: u32) -> u32 {";
|
||||
let output = model.generate(prompt, GenerateParams {
|
||||
max_tokens: 256,
|
||||
temperature: 0.2,
|
||||
top_p: 0.95,
|
||||
..Default::default()
|
||||
})?;
|
||||
```
|
||||
|
||||
### Agent Routing (Agent Variant)
|
||||
|
||||
```rust
|
||||
let config = RuvLtraMediumConfig::agent();
|
||||
let model = RuvLtraMediumModel::new(&config)?;
|
||||
|
||||
// Enable Claude Flow integration
|
||||
assert!(config.enable_agent_routing);
|
||||
|
||||
// Model automatically:
|
||||
// - Records trajectories
|
||||
// - Updates HNSW index
|
||||
// - Learns routing patterns
|
||||
```
|
||||
|
||||
### Speculative Decoding
|
||||
|
||||
```rust
|
||||
let mut config = RuvLtraMediumConfig::base();
|
||||
config.use_speculative_decoding = true;
|
||||
config.draft_model_path = Some("ruvltra-small-q4.gguf".into());
|
||||
|
||||
let model = RuvLtraMediumModel::new(&config)?;
|
||||
|
||||
// 2-3x faster generation
|
||||
let output = model.generate("Once upon a time", params)?;
|
||||
```
|
||||
|
||||
## Model Loading
|
||||
|
||||
### From GGUF
|
||||
|
||||
```rust
|
||||
use ruvllm::gguf::loader::GGUFLoader;
|
||||
|
||||
let loader = GGUFLoader::new("ruvltra-medium-q4_k_m.gguf")?;
|
||||
let model = loader.load_ruvltra_medium()?;
|
||||
```
|
||||
|
||||
### Quantization Formats
|
||||
|
||||
```bash
|
||||
# Download pre-quantized models
|
||||
wget https://huggingface.co/ruvector/ruvltra-medium-q4_k_m-gguf
|
||||
wget https://huggingface.co/ruvector/ruvltra-medium-q5_k_m-gguf
|
||||
wget https://huggingface.co/ruvector/ruvltra-medium-q8_0-gguf
|
||||
|
||||
# Or quantize yourself
|
||||
cargo run --release --bin quantize -- \
|
||||
--model qwen2.5-3b-instruct \
|
||||
--output ruvltra-medium-q4_k_m.gguf \
|
||||
--format q4_k_m
|
||||
```
|
||||
|
||||
## Performance Benchmarks
|
||||
|
||||
### Inference Speed (Apple M3 Max)
|
||||
|
||||
| Configuration | Tokens/sec | Memory | Power |
|
||||
|---------------|-----------|--------|-------|
|
||||
| Base Q4_K_M | 68 tok/s | 2.2 GB | 12W |
|
||||
| Base Q5_K_M | 55 tok/s | 2.7 GB | 14W |
|
||||
| Base Q8_0 | 42 tok/s | 3.8 GB | 16W |
|
||||
| Coder Q4_K_M | 65 tok/s | 2.4 GB | 13W |
|
||||
| Agent Q4_K_M | 72 tok/s | 2.1 GB | 11W |
|
||||
| + Speculative | 158 tok/s | 2.8 GB | 15W |
|
||||
|
||||
### Quality Metrics
|
||||
|
||||
| Benchmark | Base | Coder | Agent |
|
||||
|-----------|------|-------|-------|
|
||||
| MMLU | 68.2% | 66.8% | 64.5% |
|
||||
| HumanEval | 52.4% | 61.7% | 48.9% |
|
||||
| GSM8K | 71.3% | 69.8% | 73.6% |
|
||||
| TruthfulQA | 45.8% | 44.2% | 47.1% |
|
||||
|
||||
## Integration with Claude Flow
|
||||
|
||||
### Agent Routing
|
||||
|
||||
```rust
|
||||
use ruvllm::models::ruvltra_medium::RuvLtraMediumConfig;
|
||||
use ruvllm::claude_flow::AgentRouter;
|
||||
|
||||
let config = RuvLtraMediumConfig::agent();
|
||||
let model = RuvLtraMediumModel::new(&config)?;
|
||||
|
||||
// Router uses model embeddings for task classification
|
||||
let router = AgentRouter::new(model.sona().unwrap());
|
||||
|
||||
// Route task to optimal agent
|
||||
let task = "Implement authentication system";
|
||||
let agent = router.route(task)?; // Returns: "coder" or "security-architect"
|
||||
```
|
||||
|
||||
### Trajectory Recording
|
||||
|
||||
```rust
|
||||
use ruvllm::sona::Trajectory;
|
||||
|
||||
// Create trajectory
|
||||
let mut trajectory = Trajectory::new("code-generation");
|
||||
trajectory.add_state(initial_state);
|
||||
trajectory.add_action("generate_function", quality_score);
|
||||
|
||||
// Record in model
|
||||
model.sona()
|
||||
.unwrap()
|
||||
.write()
|
||||
.record_trajectory(trajectory)?;
|
||||
```
|
||||
|
||||
## Limitations
|
||||
|
||||
1. **Context Window**: 32K tokens (not extensible without retraining)
|
||||
2. **SONA Hooks**: Limited to 4 hooks due to memory overhead
|
||||
3. **Speculative Decoding**: Requires separate draft model
|
||||
4. **Quantization**: Q4/Q5 may degrade quality by 2-3%
|
||||
5. **Hardware**: Optimized for Apple Silicon; GPU acceleration recommended
|
||||
|
||||
## Roadmap
|
||||
|
||||
- [ ] RuvLTRA-Medium-Vision (multimodal)
|
||||
- [ ] Context extension to 128K tokens
|
||||
- [ ] Mixture-of-Experts (MoE) variant
|
||||
- [ ] On-device fine-tuning
|
||||
- [ ] Distillation to RuvLTRA-Small
|
||||
|
||||
## References
|
||||
|
||||
- [Qwen2.5 Technical Report](https://arxiv.org/abs/2407.10671)
|
||||
- [Flash Attention 2](https://arxiv.org/abs/2307.08691)
|
||||
- [Speculative Decoding](https://arxiv.org/abs/2211.17192)
|
||||
- [Grouped Query Attention](https://arxiv.org/abs/2305.13245)
|
||||
- [HNSW Algorithm](https://arxiv.org/abs/1603.09320)
|
||||
Reference in New Issue
Block a user