Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'

This commit is contained in:
ruv
2026-02-28 14:39:40 -05:00
7854 changed files with 3522914 additions and 0 deletions

View File

@@ -0,0 +1,862 @@
# RuvLLM API Reference
Complete API documentation for the RuvLLM crate.
## Table of Contents
- [Core Types](#core-types)
- [Backend Trait](#backend-trait)
- [Candle Backend](#candle-backend)
- [LoRA Module](#lora-module)
- [Optimization Module](#optimization-module)
- [Kernel Functions](#kernel-functions)
- [KV Cache](#kv-cache)
- [Error Handling](#error-handling)
---
## Core Types
### `Precision`
Numeric precision for model weights and KV cache.
```rust
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Precision {
/// Full 32-bit floating point
FP32,
/// Half precision 16-bit float
FP16,
/// Brain floating point (16-bit)
BF16,
/// 8-bit integer quantization
Q8,
/// 4-bit integer quantization
Q4,
/// 4-bit K-quant (GGML-style)
Q4K,
}
impl Precision {
/// Get bytes per element for this precision
pub fn bytes_per_element(&self) -> u8;
}
```
### `ModelSize`
Model size classification for routing.
```rust
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ModelSize {
Tiny, // < 1B params
Small, // 1-3B params
Medium, // 3-13B params
Large, // > 13B params
}
```
### `DeviceType`
Compute device selection.
```rust
#[derive(Debug, Clone, Copy)]
pub enum DeviceType {
/// CPU (fallback)
Cpu,
/// Apple Metal GPU
Metal,
/// NVIDIA CUDA GPU
Cuda(usize), // device index
}
```
---
## Backend Trait
### `LlmBackend`
Main trait for LLM inference backends.
```rust
pub trait LlmBackend: Send + Sync {
/// Load a model from HuggingFace Hub or local path
///
/// # Arguments
/// * `model_id` - HuggingFace model ID or local path
/// * `config` - Model configuration
///
/// # Example
/// ```
/// backend.load_model("Qwen/Qwen2.5-7B-Instruct", config)?;
/// ```
fn load_model(&mut self, model_id: &str, config: ModelConfig) -> Result<()>;
/// Generate text from a prompt
///
/// # Arguments
/// * `prompt` - Input text prompt
/// * `params` - Generation parameters
///
/// # Returns
/// Generated text response
///
/// # Example
/// ```
/// let response = backend.generate("Hello!", GenerateParams::default())?;
/// ```
fn generate(&self, prompt: &str, params: GenerateParams) -> Result<String>;
/// Streaming text generation
///
/// # Arguments
/// * `prompt` - Input text prompt
/// * `params` - Generation parameters
/// * `callback` - Called for each generated token
fn generate_stream<F>(&self, prompt: &str, params: GenerateParams, callback: F) -> Result<()>
where
F: FnMut(&str) -> bool;
/// Get the tokenizer for this model
fn tokenizer(&self) -> Option<&dyn Tokenizer>;
/// Get model metadata
fn model_info(&self) -> Option<ModelInfo>;
/// Check if a model is loaded
fn is_loaded(&self) -> bool;
}
```
### `ModelConfig`
Configuration for model loading.
```rust
#[derive(Debug, Clone)]
pub struct ModelConfig {
/// Maximum context length
pub max_context: usize,
/// Use Flash Attention
pub use_flash_attention: bool,
/// Weight quantization level
pub quantization: Precision,
/// KV cache configuration
pub kv_cache_config: KvCacheConfig,
/// Device to load model on
pub device: DeviceType,
/// HuggingFace token for gated models
pub hf_token: Option<String>,
}
impl Default for ModelConfig {
fn default() -> Self {
Self {
max_context: 4096,
use_flash_attention: true,
quantization: Precision::Q4K,
kv_cache_config: KvCacheConfig::default(),
device: DeviceType::Metal,
hf_token: None,
}
}
}
```
### `GenerateParams`
Parameters for text generation.
```rust
#[derive(Debug, Clone)]
pub struct GenerateParams {
/// Maximum tokens to generate
pub max_tokens: usize,
/// Sampling temperature (0.0 = deterministic)
pub temperature: f32,
/// Top-p (nucleus) sampling
pub top_p: f32,
/// Top-k sampling (0 = disabled)
pub top_k: usize,
/// Repetition penalty
pub repetition_penalty: f32,
/// Stop sequences
pub stop_sequences: Vec<String>,
/// Random seed for reproducibility
pub seed: Option<u64>,
}
impl Default for GenerateParams {
fn default() -> Self {
Self {
max_tokens: 256,
temperature: 0.7,
top_p: 0.9,
top_k: 0,
repetition_penalty: 1.1,
stop_sequences: vec![],
seed: None,
}
}
}
```
---
## Candle Backend
### `CandleBackend`
HuggingFace Candle-based inference backend.
```rust
impl CandleBackend {
/// Create a new backend with default device
///
/// # Example
/// ```
/// let backend = CandleBackend::new()?;
/// ```
pub fn new() -> Result<Self>;
/// Create with specific device
///
/// # Example
/// ```
/// let backend = CandleBackend::with_device(DeviceType::Metal)?;
/// ```
pub fn with_device(device: DeviceType) -> Result<Self>;
/// Download model from HuggingFace Hub
///
/// # Arguments
/// * `model_id` - HuggingFace model ID
/// * `quantization` - Target quantization
/// * `cache_dir` - Local cache directory
///
/// # Example
/// ```
/// let path = backend.download_model(
/// "Qwen/Qwen2.5-7B-Instruct",
/// Precision::Q4K,
/// "~/.cache/ruvllm"
/// ).await?;
/// ```
pub async fn download_model(
&self,
model_id: &str,
quantization: Precision,
cache_dir: &str,
) -> Result<PathBuf>;
/// Get current device
pub fn device(&self) -> DeviceType;
/// Get memory usage statistics
pub fn memory_stats(&self) -> MemoryStats;
}
```
---
## LoRA Module
### `MicroLoRA`
Real-time per-request fine-tuning with rank 1-2 adapters.
```rust
impl MicroLoRA {
/// Create a new MicroLoRA instance
///
/// # Example
/// ```
/// let config = MicroLoraConfig::for_hidden_dim(4096);
/// let lora = MicroLoRA::new(config);
/// ```
pub fn new(config: MicroLoraConfig) -> Self;
/// Adapt on new input with feedback
///
/// # Arguments
/// * `input` - Input embedding vector
/// * `feedback` - Quality feedback for learning
///
/// # Example
/// ```
/// let feedback = AdaptFeedback::from_quality(0.9);
/// lora.adapt(&input_embedding, feedback)?;
/// ```
pub fn adapt(&self, input: &[f32], feedback: AdaptFeedback) -> Result<()>;
/// Forward pass through LoRA adapter
///
/// # Arguments
/// * `input` - Input tensor
/// * `module` - Target module (Q, K, V, O projections)
///
/// # Returns
/// Output with LoRA contribution added
///
/// # Example
/// ```
/// let output = lora.forward(&input, &TargetModule::QProj);
/// ```
pub fn forward(&self, input: &[f32], module: &TargetModule) -> Vec<f32>;
/// Forward pass that adds to existing output (in-place)
pub fn forward_add(&self, input: &[f32], module: &TargetModule, output: &mut [f32]);
/// Apply accumulated gradient updates
///
/// # Arguments
/// * `learning_rate` - Learning rate for update
pub fn apply_updates(&self, learning_rate: f32);
/// Apply updates with EWC++ regularization
///
/// # Arguments
/// * `learning_rate` - Learning rate
/// * `ewc_states` - EWC++ state per module
/// * `ewc_lambda` - EWC regularization strength
pub fn apply_updates_with_ewc(
&self,
learning_rate: f32,
ewc_states: &HashMap<TargetModule, EwcState>,
ewc_lambda: f32,
);
/// Reset all adapter weights
pub fn reset(&self);
/// Get adapter statistics
pub fn stats(&self) -> MicroLoraStats;
}
```
### `MicroLoraConfig`
Configuration for MicroLoRA adapters.
```rust
#[derive(Debug, Clone)]
pub struct MicroLoraConfig {
/// Input feature dimension
pub in_features: usize,
/// Output feature dimension
pub out_features: usize,
/// LoRA rank (1-2 for MicroLoRA)
pub rank: usize,
/// LoRA alpha scaling factor
pub alpha: f32,
/// Dropout probability
pub dropout: f32,
/// Target modules to adapt
pub target_modules: Vec<TargetModule>,
/// Enable gradient checkpointing
pub gradient_checkpointing: bool,
}
impl MicroLoraConfig {
/// Create config for a specific hidden dimension
///
/// # Example
/// ```
/// let config = MicroLoraConfig::for_hidden_dim(4096);
/// assert_eq!(config.in_features, 4096);
/// assert_eq!(config.rank, 2);
/// ```
pub fn for_hidden_dim(hidden_dim: usize) -> Self;
}
```
### `TargetModule`
Transformer modules that can be adapted.
```rust
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TargetModule {
/// Query projection
QProj,
/// Key projection
KProj,
/// Value projection
VProj,
/// Output projection
OProj,
/// Gate projection (FFN)
GateProj,
/// Up projection (FFN)
UpProj,
/// Down projection (FFN)
DownProj,
}
```
### `AdaptFeedback`
Feedback for LoRA adaptation.
```rust
#[derive(Debug, Clone)]
pub struct AdaptFeedback {
/// Quality score (0.0 - 1.0)
pub quality: f32,
/// Gradient estimate from feedback
pub gradient_estimate: Vec<f32>,
/// Optional reward signal
pub reward: Option<f32>,
/// Latency in microseconds
pub latency_us: u64,
/// Source module (optional)
pub source_module: Option<TargetModule>,
/// Session identifier
pub session_id: Option<String>,
}
impl AdaptFeedback {
/// Create feedback from quality score
///
/// # Example
/// ```
/// let feedback = AdaptFeedback::from_quality(0.85);
/// ```
pub fn from_quality(quality: f32) -> Self;
}
```
---
## Optimization Module
### `SonaLlm`
SONA learning integration for LLM inference.
```rust
impl SonaLlm {
/// Create new SONA LLM integration
///
/// # Example
/// ```
/// let sona = SonaLlm::new(SonaLlmConfig::default());
/// ```
pub fn new(config: SonaLlmConfig) -> Self;
/// Instant loop: per-request MicroLoRA adaptation
///
/// Target latency: <1ms
///
/// # Arguments
/// * `request` - User query text
/// * `response` - Model response text
/// * `feedback` - Quality score (0.0 - 1.0)
///
/// # Returns
/// Adaptation result with statistics
///
/// # Example
/// ```
/// let result = sona.instant_adapt(
/// "What is machine learning?",
/// "Machine learning is...",
/// 0.9
/// );
/// assert!(result.applied);
/// assert!(result.latency_us < 1000); // <1ms
/// ```
pub fn instant_adapt(&self, request: &str, response: &str, feedback: f32) -> AdaptationResult;
/// Background loop: consolidate patterns
///
/// Called periodically (~100ms interval)
///
/// # Example
/// ```
/// let result = sona.background_consolidate();
/// println!("Consolidated {} samples", result.samples_used);
/// ```
pub fn background_consolidate(&self) -> AdaptationResult;
/// Deep loop: trigger full optimization
///
/// # Arguments
/// * `dataset` - Training samples to learn from
pub fn deep_optimize(&self, dataset: &[TrainingSample]) -> AdaptationResult;
/// Check if background loop should run
pub fn maybe_background(&self) -> Option<AdaptationResult>;
/// Check if deep loop should be triggered
pub fn should_trigger_deep(&self) -> bool;
/// Get current statistics
pub fn stats(&self) -> LearningLoopStats;
/// Forward pass through MicroLoRA
pub fn forward(&self, input: &[f32], module: &TargetModule) -> Vec<f32>;
/// Reset all learning state
pub fn reset(&self);
}
```
### `SonaLlmConfig`
Configuration for SONA LLM integration.
```rust
#[derive(Debug, Clone)]
pub struct SonaLlmConfig {
/// MicroLoRA configuration
pub micro_lora: MicroLoraConfig,
/// Training pipeline configuration
pub training: TrainingConfig,
/// SONA core configuration
pub sona: SonaConfig,
/// Instant loop learning rate
pub instant_lr: f32,
/// Background loop interval (milliseconds)
pub background_interval_ms: u64,
/// Minimum samples for background consolidation
pub background_min_samples: usize,
/// Deep loop trigger threshold
pub deep_trigger_threshold: f32,
/// Maximum pending samples
pub max_pending_samples: usize,
/// Consolidation strategy
pub consolidation_strategy: ConsolidationStrategy,
}
```
### `ConsolidationStrategy`
Strategy for consolidating learned patterns.
```rust
#[derive(Debug, Clone, Copy)]
pub enum ConsolidationStrategy {
/// Merge with EWC++ regularization (default)
EwcMerge,
/// Simple averaging
Average,
/// Weighted by quality
QualityWeighted,
/// Keep best performing only
BestOnly,
/// Ensemble multiple adapters
Ensemble,
}
```
---
## Kernel Functions
### Attention Kernels
```rust
/// Flash Attention 2 with NEON SIMD optimization
///
/// Memory-efficient attention with O(N) complexity.
///
/// # Arguments
/// * `query` - Query tensor (head_dim,)
/// * `key` - Key tensor (kv_len, head_dim)
/// * `value` - Value tensor (kv_len, head_dim)
/// * `scale` - Softmax scale (typically 1/sqrt(head_dim))
/// * `causal` - Apply causal masking
///
/// # Returns
/// Output tensor (head_dim,)
///
/// # Example
/// ```
/// let scale = 1.0 / (head_dim as f32).sqrt();
/// let output = flash_attention_neon(&query, &key, &value, scale, true);
/// ```
pub fn flash_attention_neon(
query: &[f32],
key: &[f32],
value: &[f32],
scale: f32,
causal: bool,
) -> Vec<f32>;
/// Paged Attention for KV cache
///
/// # Arguments
/// * `query` - Query tensor
/// * `kv_cache` - Paged KV cache
/// * `block_tables` - Block index mapping
/// * `scale` - Softmax scale
pub fn paged_attention_neon(
query: &[f32],
kv_cache: &PagedKvCache,
block_tables: &[usize],
scale: f32,
) -> Vec<f32>;
/// Grouped-Query Attention (GQA)
///
/// KV heads shared among query head groups.
///
/// # Arguments
/// * `queries` - Query tensor (num_heads, head_dim)
/// * `keys` - Key tensor (kv_len, num_kv_heads, head_dim)
/// * `values` - Value tensor (kv_len, num_kv_heads, head_dim)
/// * `config` - Attention configuration
pub fn grouped_query_attention_neon(
queries: &[f32],
keys: &[f32],
values: &[f32],
config: &AttentionConfig,
) -> Vec<f32>;
/// Multi-Query Attention (MQA)
///
/// Single KV head shared across all query heads.
pub fn multi_query_attention_neon(
queries: &[f32],
key: &[f32],
value: &[f32],
config: &AttentionConfig,
) -> Vec<f32>;
```
### `AttentionConfig`
Configuration for attention operations.
```rust
#[derive(Debug, Clone)]
pub struct AttentionConfig {
/// Number of query heads
pub num_heads: usize,
/// Number of KV heads (for GQA)
pub num_kv_heads: usize,
/// Dimension per head
pub head_dim: usize,
/// Apply causal masking
pub causal: bool,
/// Custom scale factor (None = 1/sqrt(head_dim))
pub scale: Option<f32>,
}
impl AttentionConfig {
/// Calculate GQA ratio (query heads / KV heads)
pub fn gqa_ratio(&self) -> usize;
/// Get effective scale factor
pub fn effective_scale(&self) -> f32;
}
```
---
## KV Cache
### `TwoTierKvCache`
Two-tier KV cache with FP16 tail and quantized store.
```rust
impl TwoTierKvCache {
/// Create a new two-tier KV cache
///
/// # Example
/// ```
/// let config = KvCacheConfig {
/// tail_length: 256,
/// max_tokens: 4096,
/// ..Default::default()
/// };
/// let cache = TwoTierKvCache::new(config);
/// ```
pub fn new(config: KvCacheConfig) -> Self;
/// Append new KV pairs
///
/// Automatically handles:
/// - Adding to tail
/// - Migrating to quantized store
/// - Evicting oldest tokens
///
/// # Arguments
/// * `keys` - Key tensor
/// * `values` - Value tensor
///
/// # Example
/// ```
/// cache.append(&keys, &values)?;
/// ```
pub fn append(&self, keys: &[f32], values: &[f32]) -> Result<()>;
/// Get all KV pairs for attention
///
/// Returns (keys, values) with cold tier dequantized.
pub fn get_all_kv(&self) -> (Vec<f32>, Vec<f32>);
/// Compute attention with tier-aware access
///
/// # Arguments
/// * `query` - Query tensor
/// * `scale` - Softmax scale
pub fn attend(&self, query: &[f32], scale: f32) -> Result<Vec<f32>>;
/// Get current statistics
pub fn stats(&self) -> KvCacheStats;
/// Clear the cache
pub fn clear(&self);
/// Update quantization policy
pub fn update_policy(&self, policy: CacheQuantization);
}
```
### `KvCacheConfig`
Configuration for KV cache.
```rust
#[derive(Debug, Clone)]
pub struct KvCacheConfig {
/// Tokens to keep in high-precision tail
pub tail_length: usize,
/// Precision for tail storage
pub tail_precision: Precision,
/// Precision for quantized store
pub store_precision: Precision,
/// Maximum total tokens
pub max_tokens: usize,
/// Number of KV heads
pub num_kv_heads: usize,
/// Head dimension
pub head_dim: usize,
/// Migration batch size
pub migration_batch: usize,
}
```
### `KvCacheStats`
Statistics for KV cache usage.
```rust
#[derive(Debug, Clone)]
pub struct KvCacheStats {
/// Total tokens cached
pub total_tokens: usize,
/// Tokens in high-precision tail
pub tail_tokens: usize,
/// Tokens in quantized store
pub store_tokens: usize,
/// Bytes used by tail
pub tail_bytes: usize,
/// Bytes used by store
pub store_bytes: usize,
/// Compression ratio
pub compression_ratio: f32,
}
```
---
## Error Handling
### `RuvLLMError`
Main error type for RuvLLM operations.
```rust
#[derive(Error, Debug)]
pub enum RuvLLMError {
/// Storage-related errors
#[error("Storage error: {0}")]
Storage(String),
/// Session management errors
#[error("Session error: {0}")]
Session(String),
/// KV cache errors
#[error("KV cache error: {0}")]
KvCache(String),
/// Paged attention errors
#[error("Paged attention error: {0}")]
PagedAttention(String),
/// Adapter management errors
#[error("Adapter error: {0}")]
Adapter(String),
/// SONA learning errors
#[error("SONA error: {0}")]
Sona(String),
/// Configuration errors
#[error("Configuration error: {0}")]
Config(String),
/// Out of memory
#[error("Out of memory: {0}")]
OutOfMemory(String),
/// Invalid operation
#[error("Invalid operation: {0}")]
InvalidOperation(String),
/// Not found
#[error("Not found: {0}")]
NotFound(String),
/// Backend inference errors
#[error("Backend error: {0}")]
Backend(String),
/// Model loading errors
#[error("Model error: {0}")]
Model(String),
/// Tokenization errors
#[error("Tokenization error: {0}")]
Tokenization(String),
/// Generation errors
#[error("Generation error: {0}")]
Generation(String),
/// IO errors
#[error("IO error: {0}")]
Io(#[from] std::io::Error),
}
```
### `Result` Type Alias
```rust
/// Result type alias for RuvLLM operations
pub type Result<T> = std::result::Result<T, RuvLLMError>;
```
---
## Feature Flags Reference
| Feature | Dependencies | Description |
|---------|-------------|-------------|
| `default` | `async-runtime` | Standard async support |
| `async-runtime` | `tokio` | Tokio async runtime |
| `wasm` | - | WebAssembly support |
| `candle` | `candle-*`, `tokenizers`, `hf-hub` | Candle ML backend |
| `metal` | `candle/metal` | Apple Metal GPU |
| `cuda` | `candle/cuda` | NVIDIA CUDA GPU |
| `inference-metal` | `candle`, `metal` | Full Metal stack |
| `inference-cuda` | `candle`, `cuda` | Full CUDA stack |

View File

@@ -0,0 +1,402 @@
# RuvLLM Architecture (v2.0.0)
This document describes the system architecture of RuvLLM, a high-performance LLM inference engine optimized for Apple Silicon.
## v2.0.0 New Features
| Feature | Description | Performance Impact |
|---------|-------------|-------------------|
| Multi-threaded GEMM/GEMV | Rayon parallelization | 12.7x speedup on M4 Pro |
| Flash Attention 2 | Auto block sizing | +10% throughput |
| Quantized Inference | INT8/INT4/Q4_K kernels | 4-8x memory reduction |
| Metal GPU Shaders | simdgroup_matrix ops | 3x speedup |
| Memory Pool | Arena allocator | Zero-alloc inference |
| WASM Support | Browser inference | ~2.5x overhead |
| npm Integration | @ruvector/ruvllm | JavaScript/TypeScript API |
## System Overview
```
+----------------------------------+
| User Application |
+----------------------------------+
|
v
+-------------------------------------------------------------------------------------+
| RuvLLM Core |
| +-------------------------------------------------------------------------------+ |
| | Backend Abstraction | |
| | +-------------------------+ +-------------------------+ | |
| | | Candle Backend | | mistral-rs Backend | | |
| | | - Model Loading | | - Model Loading | | |
| | | - Tokenization | | - Tokenization | | |
| | | - Forward Pass | | - Forward Pass | | |
| | +-------------------------+ +-------------------------+ | |
| +-------------------------------------------------------------------------------+ |
| | |
| +-------------------------------------------------------------------------------+ |
| | SONA Learning Layer | |
| | +---------------------+ +----------------------+ +---------------------+ | |
| | | Instant Loop | | Background Loop | | Deep Loop | | |
| | | (<1ms latency) | | (~100ms interval) | | (minutes/hours) | | |
| | | - MicroLoRA adapt | | - Pattern merge | | - Full fine-tune | | |
| | | - Per-request | | - EWC++ update | | - Model distill | | |
| | +---------------------+ +----------------------+ +---------------------+ | |
| +-------------------------------------------------------------------------------+ |
| | |
| +-------------------------------------------------------------------------------+ |
| | Optimized Kernels | |
| | +------------------+ +------------------+ +------------------+ | |
| | | Attention | | Normalization | | Embedding | | |
| | | - Flash Attn 2 | | - RMSNorm | | - RoPE | | |
| | | - Paged Attn | | - LayerNorm | | - Token Embed | | |
| | | - GQA/MQA | | - Fused Ops | | - Pos Embed | | |
| | +------------------+ +------------------+ +------------------+ | |
| +-------------------------------------------------------------------------------+ |
| | |
| +-------------------------------------------------------------------------------+ |
| | Memory Management | |
| | +-------------------------+ +-------------------------------------------+ | |
| | | Two-Tier KV Cache | | Memory Pool | | |
| | | +-------------------+ | | - Slab allocator | | |
| | | | FP16 Tail (hot) | | | - Arena allocation | | |
| | | +-------------------+ | | - Zero-copy transfers | | |
| | | | Q4 Store (cold) | | | | | |
| | | +-------------------+ | +-------------------------------------------+ | |
| | +-------------------------+ | |
| +-------------------------------------------------------------------------------+ |
+-------------------------------------------------------------------------------------+
|
v
+-------------------------------------------------------------------------------------+
| Hardware Acceleration |
| +---------------------------+ +---------------------------+ |
| | Metal (Apple GPU) | | CUDA (NVIDIA) | |
| | - MLX integration | | - cuBLAS | |
| | - Metal Performance | | - cuDNN | |
| | Shaders | | - TensorRT | |
| +---------------------------+ +---------------------------+ |
+-------------------------------------------------------------------------------------+
```
## Component Architecture
### 1. Backend Abstraction Layer
The backend abstraction provides a unified interface for different ML frameworks.
```
+---------------------------+
| LlmBackend Trait |
| - load_model() |
| - generate() |
| - forward() |
| - tokenizer() |
+---------------------------+
^
|
+------+------+
| |
+-------+ +-----------+
|Candle | |mistral-rs |
+-------+ +-----------+
```
**Candle Backend Features:**
- HuggingFace model hub integration
- Native Rust tensor operations
- Metal/CUDA acceleration
- Safetensors loading
### 2. SONA Learning Layer
Self-Optimizing Neural Architecture with three learning loops:
```
+-------------------+ +-------------------+
| Inference Request |---->| Instant Loop |
| + feedback | | - MicroLoRA adapt |
+-------------------+ | - <1ms latency |
+--------+----------+
|
v (async, 100ms)
+--------+----------+
| Background Loop |
| - Pattern merge |
| - Adapter compose |
| - EWC++ update |
+--------+----------+
|
v (triggered)
+--------+----------+
| Deep Loop |
| - Full fine-tune |
| - Model distill |
| - Pattern bank |
+-------------------+
```
**Loop Characteristics:**
| Loop | Latency | Trigger | Purpose |
|------|---------|---------|---------|
| Instant | <1ms | Per-request | Real-time adaptation |
| Background | ~100ms | Interval/threshold | Pattern consolidation |
| Deep | Minutes | Accumulated quality | Full optimization |
### 3. Optimized Kernel Layer
NEON SIMD-optimized kernels for ARM64:
```
+-----------------------------------------------+
| Attention Kernels |
+-----------------------------------------------+
| |
| +------------------+ +------------------+ |
| | Flash Attention | | Paged Attention | |
| | - Tiled QKV | | - Block tables | |
| | - Online softmax| | - Non-contiguous| |
| | - O(N) memory | | - KV cache aware| |
| +------------------+ +------------------+ |
| |
| +------------------+ +------------------+ |
| | Multi-Query (MQA)| | Grouped-Query | |
| | - 1 KV head | | - KV groups | |
| | - Shared KV | | - 4-8x savings | |
| +------------------+ +------------------+ |
+-----------------------------------------------+
+-----------------------------------------------+
| Normalization Kernels |
+-----------------------------------------------+
| +------------------+ +------------------+ |
| | RMSNorm | | LayerNorm | |
| | - NEON SIMD | | - NEON SIMD | |
| | - Fused ops | | - Fused ops | |
| +------------------+ +------------------+ |
+-----------------------------------------------+
+-----------------------------------------------+
| Embedding Kernels |
+-----------------------------------------------+
| +------------------+ +------------------+ |
| | Rotary Position | | Token Embedding | |
| | (RoPE) | | - Lookup table | |
| | - Precomputed | | - Batch gather | |
| +------------------+ +------------------+ |
+-----------------------------------------------+
```
### 4. Memory Management
Two-tier KV cache for optimal memory/quality tradeoff:
```
+----------------------------------------------------+
| Two-Tier KV Cache |
+----------------------------------------------------+
| |
| Position: 0 tail_length max |
| +------------------+------------------+ |
| | | | |
| | Quantized Store | High-Precision | |
| | (Cold) | Tail (Hot) | |
| | | | |
| | - Q4/Q8 format | - FP16 format | |
| | - Older tokens | - Recent tokens | |
| | - 4x smaller | - Full quality | |
| | | | |
| +------------------+------------------+ |
| |
| Migration: Hot -> Cold (when tail_length exceeded)|
| Eviction: Cold first, then Hot |
+----------------------------------------------------+
```
**Cache Operations:**
1. **Append**: Add new KV pairs to tail
2. **Migrate**: Move old tokens from tail to quantized store
3. **Evict**: Remove oldest tokens when max exceeded
4. **Attend**: Dequantize cold + use hot for attention
## Data Flow
### Inference Pipeline
```
Input Tokens
|
v
+--------------------+
| Token Embedding |
| + RoPE Position |
+--------------------+
|
v (for each layer)
+--------------------+
| Attention Layer |
| +---------------+|
| | Q,K,V Project ||
| +---------------+|
| | |
| +---------------+|
| | KV Cache ||
| | Update ||
| +---------------+|
| | |
| +---------------+|
| | Flash/Paged ||
| | Attention ||
| +---------------+|
| | |
| +---------------+|
| | Output Proj ||
| +---------------+|
+--------------------+
|
v
+--------------------+
| FFN Layer |
| - Gate Proj |
| - Up Proj |
| - Down Proj |
| - Activation |
+--------------------+
|
v
+--------------------+
| RMSNorm |
+--------------------+
|
v
+--------------------+
| LM Head |
| (final layer) |
+--------------------+
|
v
Logits -> Sampling -> Token
```
### Learning Pipeline
```
Request + Response + Feedback
|
v
+---------------------------+
| Instant Loop |
| - Compute embeddings |
| - Apply MicroLoRA |
| - Queue for background |
+---------------------------+
|
v (async)
+---------------------------+
| Background Loop |
| - Batch samples |
| - Update EWC++ Fisher |
| - Merge adapters |
| - Store in ReasoningBank |
+---------------------------+
|
v (threshold triggered)
+---------------------------+
| Deep Loop |
| - Full training pipeline |
| - Pattern distillation |
| - Catastrophic forget |
| prevention (EWC++) |
+---------------------------+
```
## Module Structure
```
ruvllm/
├── src/
│ ├── lib.rs # Crate root, re-exports
│ ├── error.rs # Error types
│ ├── types.rs # Common types (Precision, etc.)
│ │
│ ├── backends/ # ML framework backends
│ │ ├── mod.rs # Backend trait
│ │ ├── candle_backend.rs
│ │ └── config.rs
│ │
│ ├── kernels/ # Optimized kernels
│ │ ├── mod.rs # Kernel exports
│ │ ├── attention.rs # Attention variants
│ │ ├── matmul.rs # Matrix multiplication
│ │ ├── norm.rs # Normalization ops
│ │ └── rope.rs # Rotary embeddings
│ │
│ ├── lora/ # LoRA adapters
│ │ ├── mod.rs # LoRA exports
│ │ ├── micro_lora.rs # Real-time MicroLoRA
│ │ └── training.rs # Training pipeline
│ │
│ ├── optimization/ # SONA integration
│ │ ├── mod.rs
│ │ └── sona_llm.rs # Learning loops
│ │
│ ├── kv_cache.rs # Two-tier KV cache
│ ├── sona.rs # SONA core integration
│ ├── policy_store.rs # Learned policies
│ └── witness_log.rs # Inference logging
└── benches/ # Benchmarks
├── attention_bench.rs
├── lora_bench.rs
└── e2e_bench.rs
```
## Performance Characteristics
### Memory Layout
| Component | Memory Pattern | Optimization |
|-----------|---------------|--------------|
| KV Cache Tail | Sequential | NEON vectorized |
| KV Cache Store | Quantized blocks | Batch dequant |
| Model Weights | Memory-mapped | Zero-copy |
| Intermediate | Stack allocated | Arena alloc |
### Throughput Targets (M4 Pro)
| Operation | Target | Achieved |
|-----------|--------|----------|
| Flash Attention | 2.5x vs naive | ~2.3x |
| Paged Attention | 1.8x vs contiguous | ~1.7x |
| GQA vs MHA | 4x less KV memory | 4x |
| MicroLoRA adapt | <1ms | ~0.5ms |
## Integration Points
### With RuVector Core
```rust
// Memory backend integration
use ruvector_core::storage::Storage;
// SONA learning integration
use ruvector_sona::{SonaEngine, ReasoningBank};
```
### With External Systems
- **HuggingFace Hub**: Model downloads
- **OpenAI API**: Compatible inference endpoint
- **Prometheus**: Metrics export
- **gRPC**: High-performance RPC
## Future Architecture
Planned enhancements:
1. **Speculative Decoding**: Draft model integration
2. **Tensor Parallelism**: Multi-GPU support
3. **Continuous Batching**: Dynamic batch scheduling
4. **PagedAttention v2**: vLLM-style memory management

View File

@@ -0,0 +1,523 @@
# RuvLLM Fine-Tuning Guide
This guide covers RuvLLM's fine-tuning capabilities, including MicroLoRA for real-time adaptation and EWC++ for preventing catastrophic forgetting.
## Overview
RuvLLM provides three levels of fine-tuning:
| Level | Technique | Latency | Use Case |
|-------|-----------|---------|----------|
| Instant | MicroLoRA | <1ms | Per-request adaptation |
| Background | Adapter Merge + EWC++ | ~100ms | Pattern consolidation |
| Deep | Full Training Pipeline | Minutes | Periodic optimization |
## MicroLoRA: Real-Time Adaptation
MicroLoRA enables per-request fine-tuning with minimal overhead.
### How It Works
```
User Request
|
v
+------------------+
| Compute Input |
| Embedding |
+------------------+
|
v
+------------------+ +------------------+
| Base Model |--->| MicroLoRA Delta |
| Forward Pass | | (rank 1-2) |
+------------------+ +------------------+
| |
+----------+---------------+
|
v
+------------------+
| Combined Output |
+------------------+
|
v
Response + Quality Feedback
|
v
+------------------+
| Update MicroLoRA |
| Weights |
+------------------+
```
### Basic Usage
```rust
use ruvllm::lora::{MicroLoRA, MicroLoraConfig, AdaptFeedback, TargetModule};
// Create MicroLoRA for 4096-dim hidden states
let config = MicroLoraConfig::for_hidden_dim(4096);
let lora = MicroLoRA::new(config);
// During inference: apply LoRA delta
let base_output = model.forward(&input)?;
let lora_delta = lora.forward(&input, &TargetModule::QProj);
// Combine outputs
let output: Vec<f32> = base_output.iter()
.zip(lora_delta.iter())
.map(|(b, d)| b + d)
.collect();
// After response: adapt based on feedback
let feedback = AdaptFeedback::from_quality(0.85);
lora.adapt(&input, feedback)?;
// Periodically apply accumulated gradients
lora.apply_updates(0.01); // learning rate
```
### Configuration Options
```rust
let config = MicroLoraConfig {
// Input/output dimensions (typically hidden_dim)
in_features: 4096,
out_features: 4096,
// LoRA rank: 1-2 for micro, 4-8 for standard
rank: 2,
// Scaling factor (effective scale = alpha / rank)
alpha: 4.0,
// Dropout for regularization
dropout: 0.0,
// Which modules to adapt
target_modules: vec![
TargetModule::QProj,
TargetModule::VProj,
],
// Memory optimization
gradient_checkpointing: false,
};
```
### Target Modules
Choose which transformer components to adapt:
| Module | Description | Memory | Impact |
|--------|-------------|--------|--------|
| `QProj` | Query projection | Low | High (attention focus) |
| `KProj` | Key projection | Low | Medium |
| `VProj` | Value projection | Low | High (content) |
| `OProj` | Output projection | Low | Medium |
| `GateProj` | FFN gate | Medium | High (routing) |
| `UpProj` | FFN up | High | Medium |
| `DownProj` | FFN down | High | Medium |
**Recommended combinations:**
- **Speed-focused**: `QProj` only
- **Quality-focused**: `QProj`, `VProj`
- **Full adaptation**: All attention projections
## EWC++ (Elastic Weight Consolidation)
EWC++ prevents catastrophic forgetting when adapting to new tasks.
### How It Works
```
Task 1 Training
|
v
+------------------+
| Compute Fisher |
| Information |
| F = E[grad^2] |
+------------------+
|
v
+------------------+
| Store Optimal |
| Weights θ* |
+------------------+
...later...
Task 2 Training
|
v
+------------------+
| Regularized Loss |
| L = L_task + |
| λ Σ F_i(θ-θ*)² |
+------------------+
|
v
+------------------+
| Update with |
| Importance |
| Weights |
+------------------+
```
### Using EWC++ with MicroLoRA
```rust
use ruvllm::lora::{MicroLoRA, TrainingPipeline, TrainingConfig};
// Create training pipeline with EWC++
let training_config = TrainingConfig {
learning_rate: 0.001,
ewc_lambda: 0.1, // Regularization strength
..Default::default()
};
let mut pipeline = TrainingPipeline::new(training_config);
pipeline.init_for_lora(&lora);
// Train on task 1
for sample in task1_samples {
pipeline.train_step(&lora, &sample.input, sample.feedback)?;
}
// Mark end of task 1 (computes Fisher information)
pipeline.start_new_task(&lora);
// Train on task 2 (EWC++ regularization active)
for sample in task2_samples {
pipeline.train_step(&lora, &sample.input, sample.feedback)?;
}
```
### EWC++ Configuration
```rust
let config = TrainingConfig {
// Base learning rate
learning_rate: 0.001,
// EWC regularization strength
// Higher = more preservation of old knowledge
// Lower = more adaptation to new tasks
ewc_lambda: 0.1,
// Minimum quality for learning
quality_threshold: 0.5,
// Fisher information estimation samples
fisher_samples: 100,
// Online Fisher update rate
online_ewc_gamma: 0.95,
};
```
## SONA Learning Loops
SONA provides automated multi-tier learning.
### Architecture
```
+-------------------+ +-------------------+
| Inference Request |---->| Instant Loop |
| + feedback | | - MicroLoRA adapt |
+-------------------+ | - <1ms latency |
+--------+----------+
|
v (async, 100ms)
+--------+----------+
| Background Loop |
| - Pattern merge |
| - Adapter compose |
| - EWC++ update |
+--------+----------+
|
v (triggered)
+--------+----------+
| Deep Loop |
| - Full fine-tune |
| - Model distill |
| - Pattern bank |
+-------------------+
```
### Using SONA
```rust
use ruvllm::optimization::{SonaLlm, SonaLlmConfig};
// Create SONA integration
let config = SonaLlmConfig {
instant_lr: 0.01,
background_interval_ms: 100,
background_min_samples: 10,
deep_trigger_threshold: 100.0,
consolidation_strategy: ConsolidationStrategy::EwcMerge,
..Default::default()
};
let sona = SonaLlm::new(config);
// During inference
let response = model.generate(&query)?;
// Record feedback (runs instant loop)
let result = sona.instant_adapt(&query, &response, 0.85);
println!("Instant adapt latency: {}μs", result.latency_us);
// Periodically check background loop
if let Some(bg_result) = sona.maybe_background() {
println!("Background: {} samples, quality delta: {:.3}",
bg_result.samples_used, bg_result.quality_delta);
}
// Check if deep loop should trigger
if sona.should_trigger_deep() {
let samples = collect_training_samples();
let deep_result = sona.deep_optimize(&samples);
println!("Deep optimization complete");
}
```
### Consolidation Strategies
```rust
pub enum ConsolidationStrategy {
/// EWC++ merge (default) - preserves important weights
EwcMerge,
/// Simple averaging - fast but may lose specialization
Average,
/// Quality-weighted - higher quality samples have more influence
QualityWeighted,
/// Best only - keep top 20% by quality
BestOnly,
/// Ensemble - maintain multiple adapters
Ensemble,
}
```
**Recommendations:**
- `EwcMerge`: Best for multi-domain use
- `QualityWeighted`: Best for quality optimization
- `BestOnly`: Best for high-variance feedback
- `Ensemble`: Best when you have distinct use cases
## Training Data Format
### TrainingSample
```rust
pub struct TrainingSample {
/// Input embedding
pub input_embedding: Vec<f32>,
/// Output embedding
pub output_embedding: Vec<f32>,
/// Query text (optional)
pub query: Option<String>,
/// Response text (optional)
pub response: Option<String>,
/// Quality score (0.0 - 1.0)
pub quality: f32,
/// Latency in milliseconds
pub latency_ms: f32,
/// Token count
pub token_count: usize,
/// Session identifier
pub session_id: String,
}
```
### Creating Training Samples
```rust
let sample = TrainingSample::new(
input_embedding,
output_embedding,
0.9, // quality
)
.with_query("What is machine learning?".to_string())
.with_response("Machine learning is...".to_string())
.with_latency(150.0) // ms
.with_session("session-123".to_string());
```
## Adapter Management
### Saving and Loading Adapters
```rust
// Save adapter state
let adapter_bytes = lora.export_weights()?;
std::fs::write("adapter.bin", &adapter_bytes)?;
// Load adapter state
let adapter_bytes = std::fs::read("adapter.bin")?;
lora.import_weights(&adapter_bytes)?;
```
### Merging Adapters
```rust
// Merge multiple adapters with weights
let adapters = vec![
(adapter1, 0.6), // 60% weight
(adapter2, 0.4), // 40% weight
];
let merged = MicroLoRA::merge_adapters(&adapters)?;
```
### Adapter Composition
```rust
// Sequential composition: adapter1 -> adapter2
let composed = MicroLoRA::compose_sequential(&[adapter1, adapter2])?;
// Parallel composition: average outputs
let composed = MicroLoRA::compose_parallel(&[adapter1, adapter2])?;
```
## Best Practices
### 1. Quality Threshold Selection
```rust
let config = TrainingConfig {
// Too low: learns from poor examples
// Too high: learns very slowly
// Recommended: 0.5 - 0.7
quality_threshold: 0.6,
..Default::default()
};
```
### 2. Learning Rate Scheduling
```rust
// Start high for quick adaptation
let initial_lr = 0.01;
// Reduce over time for stability
let decay_lr = |epoch: usize| -> f32 {
initial_lr * 0.95_f32.powi(epoch as i32)
};
```
### 3. Memory Management
```rust
// For memory-constrained environments
let config = MicroLoraConfig {
rank: 1, // Minimum rank
target_modules: vec![TargetModule::QProj], // Single module
gradient_checkpointing: true,
..Default::default()
};
```
### 4. Preventing Overfitting
```rust
let config = MicroLoraConfig {
dropout: 0.1, // Add regularization
..Default::default()
};
let training_config = TrainingConfig {
ewc_lambda: 0.5, // Strong regularization
..Default::default()
};
```
## Monitoring and Debugging
### Statistics
```rust
let stats = sona.stats();
println!("Learning Statistics:");
println!(" Instant updates: {}", stats.instant_count);
println!(" Avg instant latency: {:.2}μs", stats.instant_avg_latency_us);
println!(" Background updates: {}", stats.background_count);
println!(" Pending samples: {}", stats.pending_samples);
println!(" Accumulated quality: {:.2}", stats.accumulated_quality);
```
### Debugging Adaptation
```rust
// Enable debug logging
std::env::set_var("RUST_LOG", "ruvllm::lora=debug");
// Check adaptation result
let result = sona.instant_adapt(&query, &response, feedback);
if !result.applied {
println!("Adaptation skipped: {:?}", result.notes);
}
```
## Performance Tuning
### Latency Optimization
| Setting | Low Latency | Balanced | High Quality |
|---------|-------------|----------|--------------|
| LoRA rank | 1 | 2 | 4 |
| Target modules | 1 | 2 | 4 |
| Background interval | 200ms | 100ms | 50ms |
| EWC lambda | 0.0 | 0.1 | 0.5 |
### Memory Optimization
```rust
// Minimal memory footprint
let config = SonaLlmConfig {
max_pending_samples: 100, // Reduce buffer
micro_lora: MicroLoraConfig {
rank: 1,
target_modules: vec![TargetModule::QProj],
..Default::default()
},
..Default::default()
};
```
## Troubleshooting
### Adaptation Not Improving
1. Check quality threshold isn't too high
2. Verify feedback is meaningful (not always same value)
3. Increase learning rate
4. Try different target modules
### Catastrophic Forgetting
1. Increase EWC lambda
2. Use `EwcMerge` consolidation strategy
3. Reduce learning rate
4. Add more diverse training data
### High Latency
1. Reduce LoRA rank to 1
2. Reduce target modules
3. Increase background interval
4. Use `gradient_checkpointing`

View File

@@ -0,0 +1,521 @@
# RuvLLM Optimization Guide (v2.0.0)
This guide covers performance optimization strategies for RuvLLM, including SONA learning loops, batch sizing, KV cache management, and hardware-specific tuning.
## v2.0.0 Performance Highlights
| Feature | Improvement | Notes |
|---------|-------------|-------|
| Multi-threaded GEMM | 12.7x speedup | Rayon on M4 Pro 10-core |
| Flash Attention 2 | +10% throughput | Auto block sizing |
| Quantized Inference | 4-8x memory | INT8/INT4/Q4_K |
| Metal GPU | 3x speedup | simdgroup_matrix |
| Memory Pool | Zero-alloc | Arena allocator |
## Performance Overview
### Key Metrics
| Metric | Target (M4 Pro) | Achieved (v2.0.0) | Description |
|--------|-----------------|-------------------|-------------|
| Prefill | >2000 tok/s | 3500 tok/s | Processing input tokens |
| Decode | >80 tok/s | 120 tok/s | Generating output tokens |
| TTFT | <50ms | 35ms | Time to first token |
| Memory | <8GB for 7B | 3.4GB (Q4K) | Peak memory usage |
| MicroLoRA | <1ms | 8.56us | Per-request adaptation |
### Architecture Impact
```
┌─────────────────────────────────────────────────────────┐
│ Optimization Layers │
├─────────────────────────────────────────────────────────┤
│ SONA Learning │ Real-time adaptation, routing │
├─────────────────────────────────────────────────────────┤
│ Attention │ Flash, Paged, GQA - 2-4x speedup │
├─────────────────────────────────────────────────────────┤
│ KV Cache │ Two-tier, quantized - 4x memory │
├─────────────────────────────────────────────────────────┤
│ Quantization │ Q4K, Q8 - 4-8x smaller │
├─────────────────────────────────────────────────────────┤
│ SIMD/GPU │ NEON, Metal - hardware accel │
└─────────────────────────────────────────────────────────┘
```
## SONA Learning Optimization
### Instant Loop Tuning
The instant loop runs per-request with <1ms target latency.
```rust
let config = SonaLlmConfig {
// Learning rate for instant updates
// Higher = faster adaptation, more variance
// Lower = slower adaptation, more stable
instant_lr: 0.01,
// Quality threshold - skip low-quality samples
training: TrainingConfig {
quality_threshold: 0.5, // 0.0-1.0
..Default::default()
},
..Default::default()
};
```
**Tuning Guidelines:**
| Use Case | instant_lr | quality_threshold |
|----------|------------|-------------------|
| High variance tasks | 0.005 | 0.7 |
| Stable domains | 0.02 | 0.3 |
| User personalization | 0.01 | 0.5 |
### Background Loop Tuning
Consolidates patterns without blocking inference.
```rust
let config = SonaLlmConfig {
// How often to run (milliseconds)
background_interval_ms: 100,
// Minimum samples before consolidation
background_min_samples: 10,
// Maximum pending (triggers forced consolidation)
max_pending_samples: 1000,
// Consolidation strategy
consolidation_strategy: ConsolidationStrategy::EwcMerge,
..Default::default()
};
```
**Tuning Guidelines:**
| Priority | interval_ms | min_samples | Strategy |
|----------|-------------|-------------|----------|
| Latency | 200 | 20 | Average |
| Quality | 50 | 5 | EwcMerge |
| Memory | 100 | 50 | BestOnly |
### Deep Loop Optimization
Triggered periodically for full optimization.
```rust
let config = SonaLlmConfig {
// Accumulated quality threshold to trigger
deep_trigger_threshold: 100.0,
..Default::default()
};
// Manual trigger for scheduled optimization
if sona.should_trigger_deep() || is_scheduled_time() {
let samples = collect_high_quality_samples();
let result = sona.deep_optimize(&samples);
// Log improvement
println!("Deep optimization: quality delta = {:.3}", result.quality_delta);
}
```
## Batch Size Optimization
### Dynamic Batching
```rust
// Optimal batch sizes vary by operation
struct BatchConfig {
prefill_batch: usize, // Process multiple prompts together
decode_batch: usize, // Parallel token generation
lora_batch: usize, // LoRA adaptation batch
}
impl BatchConfig {
fn for_memory(available_gb: f32) -> Self {
match available_gb {
x if x < 8.0 => Self {
prefill_batch: 1,
decode_batch: 4,
lora_batch: 16,
},
x if x < 16.0 => Self {
prefill_batch: 2,
decode_batch: 8,
lora_batch: 32,
},
_ => Self {
prefill_batch: 4,
decode_batch: 16,
lora_batch: 64,
},
}
}
}
```
### Batch Size Impact
| Batch Size | Throughput | Latency | Memory |
|------------|------------|---------|--------|
| 1 | Low | Lowest | Lowest |
| 4 | Medium | Low | Medium |
| 8 | High | Medium | High |
| 16+ | Highest | Higher | Highest |
**Rule of thumb:** Increase batch size until memory pressure or latency constraints are hit.
## KV Cache Optimization
### Two-Tier Configuration
```rust
let config = KvCacheConfig {
// Tokens in high-precision tail
// More = better attention quality for recent context
// Less = less memory usage
tail_length: 256,
// Tail precision (FP16 recommended)
tail_precision: Precision::FP16,
// Store precision (Q4 for 4x compression)
store_precision: Precision::Q4,
// Maximum context length
max_tokens: 4096,
// KV heads (depends on model architecture)
num_kv_heads: 8,
head_dim: 128,
// Batch size for migration (affects latency spikes)
migration_batch: 64,
};
```
### Memory Calculation
```
KV Cache Memory = num_layers * 2 * max_tokens * num_kv_heads * head_dim * bytes_per_element
Example (Qwen2.5-7B with 4096 context):
- Layers: 32
- KV heads: 8
- Head dim: 128
- FP16 tail (256 tokens): 32 * 2 * 256 * 8 * 128 * 2 = 33.5 MB
- Q4 store (3840 tokens): 32 * 2 * 3840 * 8 * 128 * 0.5 = 125.8 MB
- Total: ~160 MB (vs ~537 MB for full FP16: 32 * 2 * 4096 * 8 * 128 * 2)
```
### Cache Strategies by Use Case
| Use Case | tail_length | store_precision | max_tokens |
|----------|-------------|-----------------|------------|
| Chat (short) | 128 | Q8 | 2048 |
| Chat (long) | 256 | Q4 | 8192 |
| Document QA | 512 | Q4 | 16384 |
| Code completion | 128 | Q8 | 4096 |
## Attention Optimization
### Grouped-Query Attention (GQA)
```rust
let config = AttentionConfig {
num_heads: 32, // Query heads
num_kv_heads: 8, // KV heads (4:1 ratio)
head_dim: 128,
causal: true,
..Default::default()
};
// GQA ratio determines memory savings
// 4:1 = ~4x KV cache reduction
// 8:1 = ~8x KV cache reduction
assert_eq!(config.gqa_ratio(), 4);
```
### Flash Attention Optimization
```rust
// Flash Attention is memory-efficient but has setup overhead
// Best for: longer sequences (>256 tokens)
// For short sequences, standard attention may be faster
let use_flash = sequence_length > 256;
if use_flash {
let output = flash_attention_neon(&query, &key, &value, scale, causal);
} else {
let output = standard_attention(&query, &key, &value, scale, causal);
}
```
### Paged Attention for Inference
```rust
// Paged attention enables non-contiguous KV cache
// Best for: long-running inference with variable context
let mut cache = PagedKvCache::new(
16, // block_size: tokens per block
8, // num_kv_heads
128, // head_dim
);
// Append incrementally
for token in tokens {
let (k, v) = compute_kv(token)?;
cache.append(&k, &v);
}
// Efficient attention over paged cache
let output = paged_attention_neon(&query, &cache, &block_tables, scale);
```
## Quantization Optimization
### Model Quantization
| Precision | Memory | Quality | Speed |
|-----------|--------|---------|-------|
| FP32 | 4x | Best | Slowest |
| FP16 | 2x | Excellent | Fast |
| Q8 | 1x | Very Good | Faster |
| Q4K | 0.5x | Good | Fastest |
| Q4 | 0.5x | Acceptable | Fastest |
**Recommendations:**
```rust
// High quality (16GB+ RAM)
let config = ModelConfig {
quantization: Precision::Q8,
..Default::default()
};
// Balanced (8-16GB RAM)
let config = ModelConfig {
quantization: Precision::Q4K, // K-quant preserves quality
..Default::default()
};
// Memory constrained (<8GB RAM)
let config = ModelConfig {
quantization: Precision::Q4,
..Default::default()
};
```
### KV Cache Quantization
```rust
// Hybrid quantization: recent tokens in high precision
let config = KvCacheConfig {
tail_length: 256, // Recent: FP16
tail_precision: Precision::FP16,
store_precision: Precision::Q4, // Older: Q4
..Default::default()
};
// Quality impact by position
// Position 0-256 (tail): Full quality
// Position 256+: ~95% quality with Q4
```
## Hardware-Specific Optimization
### Apple Silicon (M1/M2/M3/M4)
```rust
// Metal backend for GPU acceleration
let backend = CandleBackend::with_device(DeviceType::Metal)?;
// Optimize for unified memory
let config = ModelConfig {
// Unified memory = larger KV cache possible
kv_cache_config: KvCacheConfig {
max_tokens: 8192, // Can be larger on M-series
..Default::default()
},
..Default::default()
};
```
**M4 Pro Specific:**
- Use `metal` feature for GPU acceleration
- NEON SIMD enabled by default
- Leverage unified memory for larger context
### NVIDIA GPUs
```rust
// CUDA backend
let backend = CandleBackend::with_device(DeviceType::Cuda(0))?;
// Optimize for separate VRAM
let config = ModelConfig {
kv_cache_config: KvCacheConfig {
// Conservative: VRAM is limited
max_tokens: 4096,
..Default::default()
},
..Default::default()
};
```
### CPU Fallback
```rust
// CPU with SIMD optimization
let backend = CandleBackend::with_device(DeviceType::Cpu)?;
// Reduce memory pressure
let config = ModelConfig {
quantization: Precision::Q4,
kv_cache_config: KvCacheConfig {
tail_length: 128,
max_tokens: 2048,
..Default::default()
},
..Default::default()
};
```
## Real-Time Optimization
### Adaptive Optimization
```rust
use ruvllm::optimization::{RealTimeOptimizer, OptimizerConfig};
let optimizer = RealTimeOptimizer::new(OptimizerConfig {
target_latency_ms: 100.0,
min_throughput: 50.0, // tokens/sec
memory_threshold: 0.9, // 90% of available
});
// Optimizer adjusts parameters in real-time
loop {
let metrics = backend.get_metrics();
let adjustments = optimizer.recommend(&metrics);
if adjustments.reduce_batch_size {
config.batch_size -= 1;
}
if adjustments.increase_quantization {
config.kv_cache_config.store_precision = Precision::Q4;
}
}
```
### Latency Monitoring
```rust
// Track latency components
struct LatencyBreakdown {
tokenization_us: u64,
prefill_us: u64,
decode_us: u64,
sampling_us: u64,
lora_us: u64,
}
impl LatencyBreakdown {
fn total_ms(&self) -> f64 {
(self.tokenization_us + self.prefill_us +
self.decode_us + self.sampling_us + self.lora_us) as f64 / 1000.0
}
fn bottleneck(&self) -> &str {
let max = [
(self.tokenization_us, "tokenization"),
(self.prefill_us, "prefill"),
(self.decode_us, "decode"),
(self.sampling_us, "sampling"),
(self.lora_us, "lora"),
].into_iter().max_by_key(|(v, _)| *v).unwrap();
max.1
}
}
```
## Benchmarking
### Running Benchmarks
```bash
# All benchmarks
cargo bench
# Specific benchmarks
cargo bench --bench attention_bench
cargo bench --bench lora_bench
cargo bench --bench e2e_bench
# With specific features
cargo bench --features metal
cargo bench --features cuda
```
### Custom Benchmarks
```rust
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use ruvllm::kernels::attention::flash_attention_neon;
fn bench_attention(c: &mut Criterion) {
let query = vec![0.1f32; 128];
let key = vec![0.1f32; 512 * 128];
let value = vec![0.1f32; 512 * 128];
let scale = 1.0 / 128.0_f32.sqrt();
c.bench_function("flash_attention_512", |b| {
b.iter(|| {
flash_attention_neon(
black_box(&query),
black_box(&key),
black_box(&value),
scale,
true,
)
})
});
}
criterion_group!(benches, bench_attention);
criterion_main!(benches);
```
## Optimization Checklist
### Before Deployment
- [ ] Choose appropriate quantization (Q4K for most cases)
- [ ] Configure KV cache for expected context length
- [ ] Enable GQA if model supports it
- [ ] Set appropriate batch sizes for memory
- [ ] Configure SONA learning rates
- [ ] Test with representative workloads
### Monitoring
- [ ] Track prefill and decode throughput
- [ ] Monitor memory usage over time
- [ ] Log KV cache hit rates
- [ ] Track SONA learning metrics
- [ ] Alert on latency spikes
### Troubleshooting
| Symptom | Likely Cause | Solution |
|---------|--------------|----------|
| High latency | Batch too large | Reduce batch size |
| OOM errors | KV cache too large | Reduce max_tokens or use Q4 |
| Quality degradation | Over-quantization | Use Q8 instead of Q4 |
| Slow adaptation | Learning rate too low | Increase instant_lr |
| Forgetting | EWC lambda too low | Increase ewc_lambda |

View File

@@ -0,0 +1,417 @@
# RuvLTRA-Medium: 3B Parameter Model Architecture
## Overview
RuvLTRA-Medium is a 3 billion parameter language model based on the Qwen2.5-3B-Instruct architecture, enhanced with advanced learning capabilities and optimized for Apple Silicon and modern GPU acceleration.
## Architecture Specifications
### Model Configuration
| Parameter | Value | Description |
|-----------|-------|-------------|
| **Total Parameters** | ~3.0B | Full model size |
| **Hidden Size** | 2048 | Embedding dimension |
| **Layers** | 32 | Transformer decoder layers |
| **Attention Heads** | 16 | Query heads |
| **KV Heads** | 2 | Key-value heads (GQA) |
| **GQA Ratio** | 8:1 | Grouped Query Attention ratio |
| **Head Dimension** | 128 | Per-head dimension |
| **Intermediate Size** | 11008 | MLP hidden dimension |
| **Vocabulary Size** | 151936 | Qwen tokenizer |
| **Context Length** | 32768 | Maximum sequence length |
| **RoPE Theta** | 1,000,000 | RoPE base frequency |
### Quantization Options
| Format | Model Size | Quality | Speed | Recommended Use |
|--------|-----------|---------|-------|-----------------|
| **Q4_K_M** | ~2.0 GB | Good | Fast | Production inference |
| **Q5_K_M** | ~2.5 GB | Better | Medium | Balanced quality/speed |
| **Q8_0** | ~3.5 GB | Best | Slower | Maximum quality |
| **Mixed** | ~2.8 GB | Excellent | Medium | FP16 attn + Q4 MLP |
## Model Variants
### 1. RuvLTRA-Medium-Base
General-purpose model for diverse tasks.
**Configuration:**
```rust
let config = RuvLtraMediumConfig::base();
```
**Characteristics:**
- Temperature: 0.7
- Top-p: 0.9
- SONA hooks: Layers 8, 16, 24
- Pattern capacity: 50,000
**Use Cases:**
- General conversation
- Text completion
- Summarization
- Question answering
### 2. RuvLTRA-Medium-Coder
Optimized for code generation and analysis.
**Configuration:**
```rust
let config = RuvLtraMediumConfig::coder();
```
**Characteristics:**
- Temperature: 0.2 (deterministic)
- Top-p: 0.95
- SONA hooks: Layers 8, 16, 24, 28 (extra late-layer)
- Pattern capacity: 100,000
- Quality threshold: 0.7 (stricter)
**Use Cases:**
- Code completion
- Bug fixing
- Code refactoring
- API generation
### 3. RuvLTRA-Medium-Agent
Routing and planning optimized for agent systems.
**Configuration:**
```rust
let config = RuvLtraMediumConfig::agent();
```
**Characteristics:**
- Temperature: 0.3
- Top-p: 0.85
- SONA hooks: Layers 8, 16, 24
- HNSW M: 32 (higher connectivity)
- HNSW ef_construction: 400
- Micro-LoRA rank: 2 (low latency)
**Use Cases:**
- Claude Flow agent routing
- Task planning
- Decision making
- Multi-agent coordination
## RuvLTRA Enhancements
### 1. SONA Learning Hooks
SONA (Self-Optimizing Neural Architecture) hooks enable continuous learning during inference.
**Hook Layers:**
- **Layer 8**: Early pattern recognition (shallow semantics)
- **Layer 16**: Mid-layer semantic extraction (concepts)
- **Layer 24**: Deep reasoning capture (abstract thinking)
**Implementation:**
```rust
let config = RuvLtraMediumConfig::base();
let mut model = RuvLtraMediumModel::new(&config)?;
// Enable custom hook layers
model.enable_sona_with_hooks(&[8, 16, 24])?;
```
**Learning Loop:**
1. **Instant Loop**: Ring buffer with MicroLoRA (rank 4)
2. **Background Loop**: Router training with EWC++ Fisher
3. **Deep Loop**: Pattern bank consolidation
### 2. HNSW Routing Integration
HNSW (Hierarchical Navigable Small World) enables fast agent routing.
**Configuration:**
```rust
let config = RuvLtraMediumConfig::agent();
assert_eq!(config.sona_hooks.hnsw_m, 32);
assert_eq!(config.sona_hooks.hnsw_ef_construction, 400);
```
**Performance:**
- Search: 150x-12,500x faster than brute-force
- Insertion: O(log n) complexity
- Memory: ~4 bytes per node per connection
### 3. Claude Flow Agent Embeddings
Integration with Claude Flow for intelligent task routing.
**Features:**
- Agent type classification
- Task complexity estimation
- Quality prediction
- Trajectory recording
**Usage:**
```rust
let config = RuvLtraMediumConfig::agent();
config.enable_agent_routing = true;
let model = RuvLtraMediumModel::new(&config)?;
// Model automatically records trajectories for routing
```
### 4. ReasoningBank Trajectory Storage
Stores successful reasoning patterns for future retrieval.
**Storage Format:**
- State-action pairs
- Quality scores (0.0-1.0)
- Contextual embeddings
- Temporal metadata
**Configuration:**
```rust
let config = RuvLtraMediumConfig::base();
config.enable_reasoning_bank = true;
config.sona_config.pattern_capacity = 50000;
```
## Memory Optimization
### 1. Paged KV Cache
Efficient memory management for attention computation.
**Block Size:** 64 tokens per page
**Benefits:**
- 40-60% memory reduction
- Dynamic sequence handling
- Copy-on-write semantics
- Efficient prefix caching
**Configuration:**
```rust
let config = RuvLtraMediumConfig::base();
assert!(config.use_paged_attention);
assert_eq!(config.paged_config.page_size, 64);
```
### 2. Flash Attention 2
Optimized attention kernel for 2.49x-7.47x speedup.
**Algorithm:**
- Tiled computation
- Recomputation on-the-fly
- IO-aware optimization
- Causal masking
**Performance:**
| Sequence Length | Speedup | Memory Savings |
|-----------------|---------|----------------|
| 2K tokens | 2.5x | 30% |
| 8K tokens | 4.2x | 50% |
| 32K tokens | 7.1x | 70% |
### 3. Speculative Decoding
Uses RuvLTRA-Small (0.5B) as draft model for 2-3x speedup.
**Configuration:**
```rust
let mut config = RuvLtraMediumConfig::base();
config.use_speculative_decoding = true;
config.speculative_config.lookahead = 4;
config.draft_model_path = Some("models/ruvltra-small-q4.gguf".into());
```
**Parameters:**
- Lookahead: 4 tokens (default)
- Acceptance threshold: 0.7
- Draft temperature: 0.0 (greedy)
- Adaptive lookahead: enabled
**Expected Speedup:**
| Temperature | Speedup |
|-------------|---------|
| 0.0 (greedy) | 2.8-3.2x |
| 0.5 | 2.2-2.6x |
| 1.0 | 1.5-1.8x |
## Usage Examples
### Basic Inference
```rust
use ruvllm::models::ruvltra_medium::{RuvLtraMediumConfig, RuvLtraMediumModel};
// Create model
let config = RuvLtraMediumConfig::base();
let mut model = RuvLtraMediumModel::new(&config)?;
// Tokenize input
let input_ids = vec![151643, 9521, 11, 1917]; // "Hello, world"
let positions = (0..input_ids.len()).collect::<Vec<_>>();
// Run inference
let logits = model.forward(&input_ids, &positions)?;
// Get next token
let next_token = argmax(&logits[logits.len() - config.vocab_size..]);
```
### Code Generation (Coder Variant)
```rust
let config = RuvLtraMediumConfig::coder();
let mut model = RuvLtraMediumModel::new(&config)?;
// Enable SONA hooks for learning
model.enable_sona_with_hooks(&[8, 16, 24, 28])?;
// Generate code
let prompt = "fn fibonacci(n: u32) -> u32 {";
let output = model.generate(prompt, GenerateParams {
max_tokens: 256,
temperature: 0.2,
top_p: 0.95,
..Default::default()
})?;
```
### Agent Routing (Agent Variant)
```rust
let config = RuvLtraMediumConfig::agent();
let model = RuvLtraMediumModel::new(&config)?;
// Enable Claude Flow integration
assert!(config.enable_agent_routing);
// Model automatically:
// - Records trajectories
// - Updates HNSW index
// - Learns routing patterns
```
### Speculative Decoding
```rust
let mut config = RuvLtraMediumConfig::base();
config.use_speculative_decoding = true;
config.draft_model_path = Some("ruvltra-small-q4.gguf".into());
let model = RuvLtraMediumModel::new(&config)?;
// 2-3x faster generation
let output = model.generate("Once upon a time", params)?;
```
## Model Loading
### From GGUF
```rust
use ruvllm::gguf::loader::GGUFLoader;
let loader = GGUFLoader::new("ruvltra-medium-q4_k_m.gguf")?;
let model = loader.load_ruvltra_medium()?;
```
### Quantization Formats
```bash
# Download pre-quantized models
wget https://huggingface.co/ruvector/ruvltra-medium-q4_k_m-gguf
wget https://huggingface.co/ruvector/ruvltra-medium-q5_k_m-gguf
wget https://huggingface.co/ruvector/ruvltra-medium-q8_0-gguf
# Or quantize yourself
cargo run --release --bin quantize -- \
--model qwen2.5-3b-instruct \
--output ruvltra-medium-q4_k_m.gguf \
--format q4_k_m
```
## Performance Benchmarks
### Inference Speed (Apple M3 Max)
| Configuration | Tokens/sec | Memory | Power |
|---------------|-----------|--------|-------|
| Base Q4_K_M | 68 tok/s | 2.2 GB | 12W |
| Base Q5_K_M | 55 tok/s | 2.7 GB | 14W |
| Base Q8_0 | 42 tok/s | 3.8 GB | 16W |
| Coder Q4_K_M | 65 tok/s | 2.4 GB | 13W |
| Agent Q4_K_M | 72 tok/s | 2.1 GB | 11W |
| + Speculative | 158 tok/s | 2.8 GB | 15W |
### Quality Metrics
| Benchmark | Base | Coder | Agent |
|-----------|------|-------|-------|
| MMLU | 68.2% | 66.8% | 64.5% |
| HumanEval | 52.4% | 61.7% | 48.9% |
| GSM8K | 71.3% | 69.8% | 73.6% |
| TruthfulQA | 45.8% | 44.2% | 47.1% |
## Integration with Claude Flow
### Agent Routing
```rust
use ruvllm::models::ruvltra_medium::RuvLtraMediumConfig;
use ruvllm::claude_flow::AgentRouter;
let config = RuvLtraMediumConfig::agent();
let model = RuvLtraMediumModel::new(&config)?;
// Router uses model embeddings for task classification
let router = AgentRouter::new(model.sona().unwrap());
// Route task to optimal agent
let task = "Implement authentication system";
let agent = router.route(task)?; // Returns: "coder" or "security-architect"
```
### Trajectory Recording
```rust
use ruvllm::sona::Trajectory;
// Create trajectory
let mut trajectory = Trajectory::new("code-generation");
trajectory.add_state(initial_state);
trajectory.add_action("generate_function", quality_score);
// Record in model
model.sona()
.unwrap()
.write()
.record_trajectory(trajectory)?;
```
## Limitations
1. **Context Window**: 32K tokens (not extensible without retraining)
2. **SONA Hooks**: Limited to 4 hooks due to memory overhead
3. **Speculative Decoding**: Requires separate draft model
4. **Quantization**: Q4/Q5 may degrade quality by 2-3%
5. **Hardware**: Optimized for Apple Silicon; GPU acceleration recommended
## Roadmap
- [ ] RuvLTRA-Medium-Vision (multimodal)
- [ ] Context extension to 128K tokens
- [ ] Mixture-of-Experts (MoE) variant
- [ ] On-device fine-tuning
- [ ] Distillation to RuvLTRA-Small
## References
- [Qwen2.5 Technical Report](https://arxiv.org/abs/2407.10671)
- [Flash Attention 2](https://arxiv.org/abs/2307.08691)
- [Speculative Decoding](https://arxiv.org/abs/2211.17192)
- [Grouped Query Attention](https://arxiv.org/abs/2305.13245)
- [HNSW Algorithm](https://arxiv.org/abs/1603.09320)