Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'

2026-02-28 14:39:40 -05:00
parent 7885bf6278 d803bfe2b1
commit cd5943df23
7854 changed files with 3522914 additions and 0 deletions
--- a/vendor/ruvector/examples/ruvLLM/docs/SONA/01-LORA-ULTRA.md
+++ b/vendor/ruvector/examples/ruvLLM/docs/SONA/01-LORA-ULTRA.md
@@ -0,0 +1,559 @@
+# SONA LoRA-Ultra: Sub-100μs Adaptive Fine-Tuning
+
+## Ultra-Low Latency LoRA for Real-Time Self-Improvement
+
+---
+
+## 1. Architecture Overview
+
+### Traditional LoRA vs SONA LoRA-Ultra
+
+```
+TRADITIONAL LoRA                      SONA LoRA-ULTRA
+─────────────────                     ─────────────────
+• Offline training                    • Online per-request adaptation
+• Full batch updates                  • Single-sample micro-updates
+• GPU required                        • CPU SIMD optimized
+• Minutes to hours                    • <100 microseconds
+• Periodic deployment                 • Continuous integration
+```
+
+### Core Formula
+
+```
+Standard LoRA:
+    W_adapted = W_frozen + ΔW
+    ΔW = α · (A @ B)
+    where A ∈ ℝ^(d×r), B ∈ ℝ^(r×k), r << min(d,k)
+
+SONA LoRA-Ultra Extension:
+    W_adapted = W_frozen + α · (A @ B) + β · (A_micro @ B_micro)
+                          └─────────┘   └───────────────────┘
+                          Base LoRA     Instant Micro-LoRA
+                          (rank 4-16)   (rank 1-2)
+```
+
+---
+
+## 2. Two-Tier LoRA Architecture
+
+### Tier 1: Base LoRA (Updated Hourly)
+
+```rust
+/// Base LoRA adapter for major capability shifts
+pub struct BaseLoRA {
+    /// Low-rank matrix A: d_model × rank
+    pub a: Array2<f32>,
+    /// Low-rank matrix B: rank × d_out
+    pub b: Array2<f32>,
+    /// Scaling factor
+    pub alpha: f32,
+    /// Rank (typically 4-16)
+    pub rank: usize,
+    /// Target layer indices
+    pub target_layers: Vec<usize>,
+}
+
+impl BaseLoRA {
+    /// Compute adapted weights (cached for inference)
+    #[inline]
+    pub fn delta_w(&self) -> Array2<f32> {
+        let scale = self.alpha / self.rank as f32;
+        scale * self.a.dot(&self.b)
+    }
+
+    /// Update from accumulated gradients (hourly)
+    pub fn update(&mut self, grad_a: &Array2<f32>, grad_b: &Array2<f32>, lr: f32) {
+        // SGD with momentum
+        self.a = &self.a - lr * grad_a;
+        self.b = &self.b - lr * grad_b;
+    }
+}
+```
+
+### Tier 2: Micro-LoRA (Updated Per-Request)
+
+```rust
+/// Ultra-fast micro-adapter for instant learning
+pub struct MicroLoRA {
+    /// Micro A: d_model × micro_rank (typically 1-2)
+    pub a_micro: Array2<f32>,
+    /// Micro B: micro_rank × d_out
+    pub b_micro: Array2<f32>,
+    /// Micro scaling (smaller than base)
+    pub beta: f32,
+    /// Micro rank (1-2 for speed)
+    pub micro_rank: usize,
+    /// Decay factor for temporal smoothing
+    pub decay: f32,
+    /// Momentum buffer
+    momentum_a: Array2<f32>,
+    momentum_b: Array2<f32>,
+}
+
+impl MicroLoRA {
+    /// Ultra-fast single-sample update (<50μs target)
+    #[inline]
+    pub fn micro_update(&mut self, signal: &LearningSignal) {
+        // Rank-1 outer product update
+        let grad_direction = signal.to_gradient_direction();
+
+        // Exponential moving average for stability
+        self.momentum_a = self.decay * &self.momentum_a
+            + (1.0 - self.decay) * &grad_direction.a_component;
+        self.momentum_b = self.decay * &self.momentum_b
+            + (1.0 - self.decay) * &grad_direction.b_component;
+
+        // Apply micro-update
+        self.a_micro = &self.a_micro + self.beta * &self.momentum_a;
+        self.b_micro = &self.b_micro + self.beta * &self.momentum_b;
+    }
+
+    /// Periodic consolidation into base LoRA
+    pub fn consolidate_to_base(&mut self, base: &mut BaseLoRA) {
+        // Merge micro adaptations into base
+        // Then reset micro to zero
+        base.a = &base.a + &self.a_micro;
+        base.b = &base.b + &self.b_micro;
+        self.a_micro.fill(0.0);
+        self.b_micro.fill(0.0);
+    }
+}
+```
+
+---
+
+## 3. SIMD-Optimized LoRA Computation
+
+### AVX2 Accelerated Forward Pass
+
+```rust
+#[cfg(target_arch = "x86_64")]
+mod simd {
+    use std::arch::x86_64::*;
+
+    /// SIMD-optimized LoRA forward: x @ (W + A @ B)
+    /// Fuses base weight multiplication with LoRA delta
+    #[target_feature(enable = "avx2", enable = "fma")]
+    pub unsafe fn lora_forward_avx2(
+        x: &[f32],           // Input: [batch, d_in]
+        w_base: &[f32],      // Base weights: [d_in, d_out]
+        lora_a: &[f32],      // LoRA A: [d_in, rank]
+        lora_b: &[f32],      // LoRA B: [rank, d_out]
+        alpha: f32,
+        d_in: usize,
+        d_out: usize,
+        rank: usize,
+        output: &mut [f32],  // Output: [batch, d_out]
+    ) {
+        let scale = alpha / rank as f32;
+        let scale_vec = _mm256_set1_ps(scale);
+
+        // Step 1: Compute x @ A (input projection to rank space)
+        let mut x_projected = vec![0.0f32; rank];
+        for r in 0..rank {
+            let mut sum = _mm256_setzero_ps();
+            let mut i = 0;
+            while i + 8 <= d_in {
+                let x_vec = _mm256_loadu_ps(x.as_ptr().add(i));
+                let a_vec = _mm256_loadu_ps(lora_a.as_ptr().add(r * d_in + i));
+                sum = _mm256_fmadd_ps(x_vec, a_vec, sum);
+                i += 8;
+            }
+            x_projected[r] = horizontal_sum_avx2(sum);
+            // Handle remainder
+            while i < d_in {
+                x_projected[r] += x[i] * lora_a[r * d_in + i];
+                i += 1;
+            }
+        }
+
+        // Step 2: Compute (x @ W_base) + scale * (x_projected @ B)
+        for j in 0..d_out {
+            // Base weight contribution
+            let mut sum = _mm256_setzero_ps();
+            let mut i = 0;
+            while i + 8 <= d_in {
+                let x_vec = _mm256_loadu_ps(x.as_ptr().add(i));
+                let w_vec = _mm256_loadu_ps(w_base.as_ptr().add(j * d_in + i));
+                sum = _mm256_fmadd_ps(x_vec, w_vec, sum);
+                i += 8;
+            }
+            let mut base_result = horizontal_sum_avx2(sum);
+            while i < d_in {
+                base_result += x[i] * w_base[j * d_in + i];
+                i += 1;
+            }
+
+            // LoRA contribution
+            let mut lora_result = 0.0f32;
+            for r in 0..rank {
+                lora_result += x_projected[r] * lora_b[j * rank + r];
+            }
+
+            output[j] = base_result + scale * lora_result;
+        }
+    }
+
+    #[inline]
+    unsafe fn horizontal_sum_avx2(v: __m256) -> f32 {
+        let high = _mm256_extractf128_ps(v, 1);
+        let low = _mm256_castps256_ps128(v);
+        let sum128 = _mm_add_ps(high, low);
+        let sum64 = _mm_add_ps(sum128, _mm_movehl_ps(sum128, sum128));
+        let sum32 = _mm_add_ss(sum64, _mm_shuffle_ps(sum64, sum64, 1));
+        _mm_cvtss_f32(sum32)
+    }
+}
+```
+
+---
+
+## 4. Learning Signal Extraction
+
+### From Query Feedback to Gradient Direction
+
+```rust
+/// Learning signal extracted from each interaction
+#[derive(Clone)]
+pub struct LearningSignal {
+    /// Query embedding
+    pub query_embedding: Vec<f32>,
+    /// Response quality score (0-1)
+    pub quality_score: f32,
+    /// User feedback (explicit)
+    pub explicit_feedback: Option<FeedbackType>,
+    /// Latency deviation from target
+    pub latency_ratio: f32,
+    /// Model tier used
+    pub model_tier: ModelTier,
+    /// Context tokens used
+    pub context_tokens: usize,
+}
+
+impl LearningSignal {
+    /// Convert signal to gradient direction for micro-LoRA
+    pub fn to_gradient_direction(&self) -> GradientDirection {
+        // Reward = quality * (1 - latency_penalty)
+        let reward = self.quality_score * (2.0 - self.latency_ratio).max(0.0);
+
+        // Direction = embedding * reward_sign
+        let direction = if reward > 0.5 {
+            // Reinforce current behavior
+            1.0
+        } else {
+            // Explore alternative
+            -0.1
+        };
+
+        // Scale by uncertainty (more learning when uncertain)
+        let uncertainty = 1.0 - self.quality_score.abs();
+        let learning_rate = 0.001 * (1.0 + uncertainty);
+
+        GradientDirection {
+            a_component: self.compute_a_gradient(direction, learning_rate),
+            b_component: self.compute_b_gradient(direction, learning_rate),
+        }
+    }
+
+    fn compute_a_gradient(&self, direction: f32, lr: f32) -> Array2<f32> {
+        // Outer product of query embedding with hidden state
+        // Approximated via reservoir-sampled historical embeddings
+        let emb = Array1::from_vec(self.query_embedding.clone());
+        let grad = direction * lr * outer_product(&emb, &self.get_hidden_direction());
+        grad
+    }
+
+    fn compute_b_gradient(&self, direction: f32, lr: f32) -> Array2<f32> {
+        // Output gradient based on prediction error
+        let output_error = self.compute_output_error();
+        direction * lr * output_error
+    }
+}
+```
+
+---
+
+## 5. Target Layer Selection
+
+### Which Layers to Apply LoRA
+
+```rust
+/// Layer selection strategy for LoRA application
+pub enum LoRATargetStrategy {
+    /// Apply to all attention layers (Q, K, V, O projections)
+    AllAttention,
+    /// Apply to FFN layers only
+    AllFFN,
+    /// Apply to output heads only (fastest, good for routing)
+    OutputHeadsOnly,
+    /// Apply to specific layers by index
+    SpecificLayers(Vec<usize>),
+    /// Adaptive: select based on gradient magnitude
+    AdaptiveTopK(usize),
+}
+
+impl LoRATargetStrategy {
+    /// For ultra-low latency: output heads only
+    pub fn ultra_fast() -> Self {
+        Self::OutputHeadsOnly
+    }
+
+    /// For moderate adaptation: attention Q and V
+    pub fn attention_qv() -> Self {
+        Self::SpecificLayers(vec![0, 2]) // Q and V typically
+    }
+
+    /// Select layers with highest gradient magnitude
+    pub fn adaptive_top_k(k: usize) -> Self {
+        Self::AdaptiveTopK(k)
+    }
+}
+
+/// SONA default: Output heads for micro, attention for base
+pub const SONA_DEFAULT_TARGETS: [LoRATargetStrategy; 2] = [
+    LoRATargetStrategy::OutputHeadsOnly,  // Micro-LoRA
+    LoRATargetStrategy::AllAttention,     // Base LoRA
+];
+```
+
+---
+
+## 6. Memory-Efficient Storage
+
+### Quantized LoRA Matrices
+
+```rust
+/// Q4-quantized LoRA for memory efficiency
+pub struct QuantizedLoRA {
+    /// Quantized A matrix (4-bit)
+    pub a_q4: Q4Matrix,
+    /// Quantized B matrix (4-bit)
+    pub b_q4: Q4Matrix,
+    /// Full-precision alpha
+    pub alpha: f32,
+    /// Full-precision scaling factors
+    pub a_scales: Vec<f32>,
+    pub b_scales: Vec<f32>,
+}
+
+impl QuantizedLoRA {
+    /// Memory usage comparison
+    ///
+    /// FP32 LoRA (rank 8, 768 dim):
+    ///   A: 768 × 8 × 4 bytes = 24.6 KB
+    ///   B: 8 × 768 × 4 bytes = 24.6 KB
+    ///   Total: ~50 KB per layer
+    ///
+    /// Q4 LoRA (rank 8, 768 dim):
+    ///   A: 768 × 8 × 0.5 bytes = 3.1 KB
+    ///   B: 8 × 768 × 0.5 bytes = 3.1 KB
+    ///   Scales: 2 × 768 × 4 bytes = 6.1 KB
+    ///   Total: ~12 KB per layer (4x reduction)
+
+    pub fn from_fp32(lora: &BaseLoRA) -> Self {
+        Self {
+            a_q4: Q4Matrix::quantize(&lora.a),
+            b_q4: Q4Matrix::quantize(&lora.b),
+            alpha: lora.alpha,
+            a_scales: compute_scales(&lora.a),
+            b_scales: compute_scales(&lora.b),
+        }
+    }
+
+    /// Dequantize on-the-fly during forward pass
+    #[inline]
+    pub fn forward(&self, x: &[f32]) -> Vec<f32> {
+        // Dequantize A, compute x @ A
+        let projected = self.a_q4.matmul_dequant(x, &self.a_scales);
+        // Dequantize B, compute projected @ B
+        let output = self.b_q4.matmul_dequant(&projected, &self.b_scales);
+        // Scale by alpha
+        output.iter().map(|v| v * self.alpha).collect()
+    }
+}
+```
+
+---
+
+## 7. Latency Breakdown
+
+### Target: <100μs Total LoRA Overhead
+
+```
+┌─────────────────────────────────────────────────────────────┐
+│                  LoRA-ULTRA LATENCY BUDGET                  │
+├─────────────────────────────────────────────────────────────┤
+│                                                             │
+│  Signal Extraction:    10μs  ████░░░░░░░░░░░░░░░░░░░░░░░░  │
+│  Gradient Direction:   15μs  ██████░░░░░░░░░░░░░░░░░░░░░░  │
+│  Micro-LoRA Update:    25μs  ██████████░░░░░░░░░░░░░░░░░░  │
+│  Forward Pass Delta:   30μs  ████████████░░░░░░░░░░░░░░░░  │
+│  Momentum Averaging:   10μs  ████░░░░░░░░░░░░░░░░░░░░░░░░  │
+│  Memory Bookkeeping:   10μs  ████░░░░░░░░░░░░░░░░░░░░░░░░  │
+│                        ─────                                │
+│  TOTAL:              ~100μs                                │
+│                                                             │
+│  Amortized (batched):  ~30μs per query                     │
+└─────────────────────────────────────────────────────────────┘
+```
+
+---
+
+## 8. Integration with FastGRNN Router
+
+### Router-Specific LoRA Configuration
+
+```rust
+/// LoRA configuration for FastGRNN router
+pub struct RouterLoRAConfig {
+    /// Base LoRA for hidden state transformations
+    pub hidden_lora: BaseLoRA,
+    /// Micro LoRA for gate adjustments
+    pub gate_micro_lora: MicroLoRA,
+    /// Per-output-head LoRA adapters
+    pub head_loras: Vec<BaseLoRA>,
+}
+
+impl RouterLoRAConfig {
+    pub fn new(hidden_dim: usize, output_dims: &[usize]) -> Self {
+        Self {
+            hidden_lora: BaseLoRA::new(hidden_dim, hidden_dim, 8), // rank 8
+            gate_micro_lora: MicroLoRA::new(hidden_dim, hidden_dim, 2), // rank 2
+            head_loras: output_dims.iter()
+                .map(|&dim| BaseLoRA::new(hidden_dim, dim, 4)) // rank 4
+                .collect(),
+        }
+    }
+
+    /// Apply LoRA to FastGRNN forward pass
+    pub fn apply(&self, base_output: &FastGRNNOutput) -> FastGRNNOutput {
+        let mut output = base_output.clone();
+
+        // Apply hidden state LoRA
+        output.hidden = self.hidden_lora.apply(&output.hidden);
+
+        // Apply micro-LoRA to gates
+        output.update_gate = self.gate_micro_lora.apply(&output.update_gate);
+
+        // Apply per-head LoRA
+        for (i, head_lora) in self.head_loras.iter().enumerate() {
+            output.heads[i] = head_lora.apply(&output.heads[i]);
+        }
+
+        output
+    }
+}
+```
+
+---
+
+## 9. Checkpointing and Recovery
+
+### Efficient LoRA State Management
+
+```rust
+/// LoRA checkpoint for persistence and recovery
+#[derive(Serialize, Deserialize)]
+pub struct LoRACheckpoint {
+    /// Base LoRA matrices (serialized as FP16 for space)
+    pub base_lora: SerializedLoRA,
+    /// Micro LoRA state
+    pub micro_lora: SerializedLoRA,
+    /// Momentum buffers
+    pub momentum_state: MomentumState,
+    /// Training statistics
+    pub stats: LoRAStats,
+    /// Checkpoint version
+    pub version: u32,
+    /// Timestamp
+    pub timestamp: i64,
+}
+
+impl LoRACheckpoint {
+    /// Save checkpoint (async, non-blocking)
+    pub async fn save_async(&self, path: &Path) -> Result<()> {
+        let bytes = bincode::serialize(self)?;
+        tokio::fs::write(path, &bytes).await?;
+        Ok(())
+    }
+
+    /// Load checkpoint
+    pub fn load(path: &Path) -> Result<Self> {
+        let bytes = std::fs::read(path)?;
+        Ok(bincode::deserialize(&bytes)?)
+    }
+
+    /// Incremental checkpoint (only changed matrices)
+    pub fn save_incremental(&self, previous: &Self, path: &Path) -> Result<()> {
+        let delta = self.compute_delta(previous);
+        // Only save changed blocks
+        delta.save(path)
+    }
+}
+```
+
+---
+
+## 10. Benchmark Targets
+
+### Performance Validation
+
+```rust
+#[cfg(test)]
+mod benchmarks {
+    use super::*;
+    use criterion::{black_box, Criterion};
+
+    /// Target: <50μs for micro-LoRA update
+    fn bench_micro_lora_update(c: &mut Criterion) {
+        let mut micro = MicroLoRA::new(768, 768, 2);
+        let signal = LearningSignal::random();
+
+        c.bench_function("micro_lora_update", |b| {
+            b.iter(|| {
+                micro.micro_update(black_box(&signal));
+            })
+        });
+    }
+
+    /// Target: <30μs for LoRA forward pass
+    fn bench_lora_forward(c: &mut Criterion) {
+        let lora = BaseLoRA::new(768, 768, 8);
+        let input = vec![0.0f32; 768];
+
+        c.bench_function("lora_forward", |b| {
+            b.iter(|| {
+                lora.forward(black_box(&input))
+            })
+        });
+    }
+
+    /// Target: <10μs for signal extraction
+    fn bench_signal_extraction(c: &mut Criterion) {
+        let query = "test query".to_string();
+        let response = "test response".to_string();
+
+        c.bench_function("signal_extraction", |b| {
+            b.iter(|| {
+                LearningSignal::extract(black_box(&query), black_box(&response))
+            })
+        });
+    }
+}
+```
+
+---
+
+## Summary
+
+SONA LoRA-Ultra achieves sub-100μs adaptive fine-tuning through:
+
+1. **Two-Tier Architecture**: Base LoRA (hourly) + Micro-LoRA (per-request)
+2. **SIMD Optimization**: AVX2-accelerated forward pass
+3. **Quantized Storage**: Q4 matrices for 4x memory reduction
+4. **Smart Targeting**: Output heads for speed, attention for capability
+5. **Momentum Smoothing**: Stable micro-updates with EMA
+6. **Async Checkpointing**: Non-blocking persistence
+
+This enables true real-time self-improvement where every query makes the model incrementally smarter.