Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'
This commit is contained in:
559
vendor/ruvector/examples/ruvLLM/docs/SONA/01-LORA-ULTRA.md
vendored
Normal file
559
vendor/ruvector/examples/ruvLLM/docs/SONA/01-LORA-ULTRA.md
vendored
Normal file
@@ -0,0 +1,559 @@
|
||||
# SONA LoRA-Ultra: Sub-100μs Adaptive Fine-Tuning
|
||||
|
||||
## Ultra-Low Latency LoRA for Real-Time Self-Improvement
|
||||
|
||||
---
|
||||
|
||||
## 1. Architecture Overview
|
||||
|
||||
### Traditional LoRA vs SONA LoRA-Ultra
|
||||
|
||||
```
|
||||
TRADITIONAL LoRA SONA LoRA-ULTRA
|
||||
───────────────── ─────────────────
|
||||
• Offline training • Online per-request adaptation
|
||||
• Full batch updates • Single-sample micro-updates
|
||||
• GPU required • CPU SIMD optimized
|
||||
• Minutes to hours • <100 microseconds
|
||||
• Periodic deployment • Continuous integration
|
||||
```
|
||||
|
||||
### Core Formula
|
||||
|
||||
```
|
||||
Standard LoRA:
|
||||
W_adapted = W_frozen + ΔW
|
||||
ΔW = α · (A @ B)
|
||||
where A ∈ ℝ^(d×r), B ∈ ℝ^(r×k), r << min(d,k)
|
||||
|
||||
SONA LoRA-Ultra Extension:
|
||||
W_adapted = W_frozen + α · (A @ B) + β · (A_micro @ B_micro)
|
||||
└─────────┘ └───────────────────┘
|
||||
Base LoRA Instant Micro-LoRA
|
||||
(rank 4-16) (rank 1-2)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 2. Two-Tier LoRA Architecture
|
||||
|
||||
### Tier 1: Base LoRA (Updated Hourly)
|
||||
|
||||
```rust
|
||||
/// Base LoRA adapter for major capability shifts
|
||||
pub struct BaseLoRA {
|
||||
/// Low-rank matrix A: d_model × rank
|
||||
pub a: Array2<f32>,
|
||||
/// Low-rank matrix B: rank × d_out
|
||||
pub b: Array2<f32>,
|
||||
/// Scaling factor
|
||||
pub alpha: f32,
|
||||
/// Rank (typically 4-16)
|
||||
pub rank: usize,
|
||||
/// Target layer indices
|
||||
pub target_layers: Vec<usize>,
|
||||
}
|
||||
|
||||
impl BaseLoRA {
|
||||
/// Compute adapted weights (cached for inference)
|
||||
#[inline]
|
||||
pub fn delta_w(&self) -> Array2<f32> {
|
||||
let scale = self.alpha / self.rank as f32;
|
||||
scale * self.a.dot(&self.b)
|
||||
}
|
||||
|
||||
/// Update from accumulated gradients (hourly)
|
||||
pub fn update(&mut self, grad_a: &Array2<f32>, grad_b: &Array2<f32>, lr: f32) {
|
||||
// SGD with momentum
|
||||
self.a = &self.a - lr * grad_a;
|
||||
self.b = &self.b - lr * grad_b;
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Tier 2: Micro-LoRA (Updated Per-Request)
|
||||
|
||||
```rust
|
||||
/// Ultra-fast micro-adapter for instant learning
|
||||
pub struct MicroLoRA {
|
||||
/// Micro A: d_model × micro_rank (typically 1-2)
|
||||
pub a_micro: Array2<f32>,
|
||||
/// Micro B: micro_rank × d_out
|
||||
pub b_micro: Array2<f32>,
|
||||
/// Micro scaling (smaller than base)
|
||||
pub beta: f32,
|
||||
/// Micro rank (1-2 for speed)
|
||||
pub micro_rank: usize,
|
||||
/// Decay factor for temporal smoothing
|
||||
pub decay: f32,
|
||||
/// Momentum buffer
|
||||
momentum_a: Array2<f32>,
|
||||
momentum_b: Array2<f32>,
|
||||
}
|
||||
|
||||
impl MicroLoRA {
|
||||
/// Ultra-fast single-sample update (<50μs target)
|
||||
#[inline]
|
||||
pub fn micro_update(&mut self, signal: &LearningSignal) {
|
||||
// Rank-1 outer product update
|
||||
let grad_direction = signal.to_gradient_direction();
|
||||
|
||||
// Exponential moving average for stability
|
||||
self.momentum_a = self.decay * &self.momentum_a
|
||||
+ (1.0 - self.decay) * &grad_direction.a_component;
|
||||
self.momentum_b = self.decay * &self.momentum_b
|
||||
+ (1.0 - self.decay) * &grad_direction.b_component;
|
||||
|
||||
// Apply micro-update
|
||||
self.a_micro = &self.a_micro + self.beta * &self.momentum_a;
|
||||
self.b_micro = &self.b_micro + self.beta * &self.momentum_b;
|
||||
}
|
||||
|
||||
/// Periodic consolidation into base LoRA
|
||||
pub fn consolidate_to_base(&mut self, base: &mut BaseLoRA) {
|
||||
// Merge micro adaptations into base
|
||||
// Then reset micro to zero
|
||||
base.a = &base.a + &self.a_micro;
|
||||
base.b = &base.b + &self.b_micro;
|
||||
self.a_micro.fill(0.0);
|
||||
self.b_micro.fill(0.0);
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 3. SIMD-Optimized LoRA Computation
|
||||
|
||||
### AVX2 Accelerated Forward Pass
|
||||
|
||||
```rust
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
mod simd {
|
||||
use std::arch::x86_64::*;
|
||||
|
||||
/// SIMD-optimized LoRA forward: x @ (W + A @ B)
|
||||
/// Fuses base weight multiplication with LoRA delta
|
||||
#[target_feature(enable = "avx2", enable = "fma")]
|
||||
pub unsafe fn lora_forward_avx2(
|
||||
x: &[f32], // Input: [batch, d_in]
|
||||
w_base: &[f32], // Base weights: [d_in, d_out]
|
||||
lora_a: &[f32], // LoRA A: [d_in, rank]
|
||||
lora_b: &[f32], // LoRA B: [rank, d_out]
|
||||
alpha: f32,
|
||||
d_in: usize,
|
||||
d_out: usize,
|
||||
rank: usize,
|
||||
output: &mut [f32], // Output: [batch, d_out]
|
||||
) {
|
||||
let scale = alpha / rank as f32;
|
||||
let scale_vec = _mm256_set1_ps(scale);
|
||||
|
||||
// Step 1: Compute x @ A (input projection to rank space)
|
||||
let mut x_projected = vec![0.0f32; rank];
|
||||
for r in 0..rank {
|
||||
let mut sum = _mm256_setzero_ps();
|
||||
let mut i = 0;
|
||||
while i + 8 <= d_in {
|
||||
let x_vec = _mm256_loadu_ps(x.as_ptr().add(i));
|
||||
let a_vec = _mm256_loadu_ps(lora_a.as_ptr().add(r * d_in + i));
|
||||
sum = _mm256_fmadd_ps(x_vec, a_vec, sum);
|
||||
i += 8;
|
||||
}
|
||||
x_projected[r] = horizontal_sum_avx2(sum);
|
||||
// Handle remainder
|
||||
while i < d_in {
|
||||
x_projected[r] += x[i] * lora_a[r * d_in + i];
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Step 2: Compute (x @ W_base) + scale * (x_projected @ B)
|
||||
for j in 0..d_out {
|
||||
// Base weight contribution
|
||||
let mut sum = _mm256_setzero_ps();
|
||||
let mut i = 0;
|
||||
while i + 8 <= d_in {
|
||||
let x_vec = _mm256_loadu_ps(x.as_ptr().add(i));
|
||||
let w_vec = _mm256_loadu_ps(w_base.as_ptr().add(j * d_in + i));
|
||||
sum = _mm256_fmadd_ps(x_vec, w_vec, sum);
|
||||
i += 8;
|
||||
}
|
||||
let mut base_result = horizontal_sum_avx2(sum);
|
||||
while i < d_in {
|
||||
base_result += x[i] * w_base[j * d_in + i];
|
||||
i += 1;
|
||||
}
|
||||
|
||||
// LoRA contribution
|
||||
let mut lora_result = 0.0f32;
|
||||
for r in 0..rank {
|
||||
lora_result += x_projected[r] * lora_b[j * rank + r];
|
||||
}
|
||||
|
||||
output[j] = base_result + scale * lora_result;
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
unsafe fn horizontal_sum_avx2(v: __m256) -> f32 {
|
||||
let high = _mm256_extractf128_ps(v, 1);
|
||||
let low = _mm256_castps256_ps128(v);
|
||||
let sum128 = _mm_add_ps(high, low);
|
||||
let sum64 = _mm_add_ps(sum128, _mm_movehl_ps(sum128, sum128));
|
||||
let sum32 = _mm_add_ss(sum64, _mm_shuffle_ps(sum64, sum64, 1));
|
||||
_mm_cvtss_f32(sum32)
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 4. Learning Signal Extraction
|
||||
|
||||
### From Query Feedback to Gradient Direction
|
||||
|
||||
```rust
|
||||
/// Learning signal extracted from each interaction
|
||||
#[derive(Clone)]
|
||||
pub struct LearningSignal {
|
||||
/// Query embedding
|
||||
pub query_embedding: Vec<f32>,
|
||||
/// Response quality score (0-1)
|
||||
pub quality_score: f32,
|
||||
/// User feedback (explicit)
|
||||
pub explicit_feedback: Option<FeedbackType>,
|
||||
/// Latency deviation from target
|
||||
pub latency_ratio: f32,
|
||||
/// Model tier used
|
||||
pub model_tier: ModelTier,
|
||||
/// Context tokens used
|
||||
pub context_tokens: usize,
|
||||
}
|
||||
|
||||
impl LearningSignal {
|
||||
/// Convert signal to gradient direction for micro-LoRA
|
||||
pub fn to_gradient_direction(&self) -> GradientDirection {
|
||||
// Reward = quality * (1 - latency_penalty)
|
||||
let reward = self.quality_score * (2.0 - self.latency_ratio).max(0.0);
|
||||
|
||||
// Direction = embedding * reward_sign
|
||||
let direction = if reward > 0.5 {
|
||||
// Reinforce current behavior
|
||||
1.0
|
||||
} else {
|
||||
// Explore alternative
|
||||
-0.1
|
||||
};
|
||||
|
||||
// Scale by uncertainty (more learning when uncertain)
|
||||
let uncertainty = 1.0 - self.quality_score.abs();
|
||||
let learning_rate = 0.001 * (1.0 + uncertainty);
|
||||
|
||||
GradientDirection {
|
||||
a_component: self.compute_a_gradient(direction, learning_rate),
|
||||
b_component: self.compute_b_gradient(direction, learning_rate),
|
||||
}
|
||||
}
|
||||
|
||||
fn compute_a_gradient(&self, direction: f32, lr: f32) -> Array2<f32> {
|
||||
// Outer product of query embedding with hidden state
|
||||
// Approximated via reservoir-sampled historical embeddings
|
||||
let emb = Array1::from_vec(self.query_embedding.clone());
|
||||
let grad = direction * lr * outer_product(&emb, &self.get_hidden_direction());
|
||||
grad
|
||||
}
|
||||
|
||||
fn compute_b_gradient(&self, direction: f32, lr: f32) -> Array2<f32> {
|
||||
// Output gradient based on prediction error
|
||||
let output_error = self.compute_output_error();
|
||||
direction * lr * output_error
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 5. Target Layer Selection
|
||||
|
||||
### Which Layers to Apply LoRA
|
||||
|
||||
```rust
|
||||
/// Layer selection strategy for LoRA application
|
||||
pub enum LoRATargetStrategy {
|
||||
/// Apply to all attention layers (Q, K, V, O projections)
|
||||
AllAttention,
|
||||
/// Apply to FFN layers only
|
||||
AllFFN,
|
||||
/// Apply to output heads only (fastest, good for routing)
|
||||
OutputHeadsOnly,
|
||||
/// Apply to specific layers by index
|
||||
SpecificLayers(Vec<usize>),
|
||||
/// Adaptive: select based on gradient magnitude
|
||||
AdaptiveTopK(usize),
|
||||
}
|
||||
|
||||
impl LoRATargetStrategy {
|
||||
/// For ultra-low latency: output heads only
|
||||
pub fn ultra_fast() -> Self {
|
||||
Self::OutputHeadsOnly
|
||||
}
|
||||
|
||||
/// For moderate adaptation: attention Q and V
|
||||
pub fn attention_qv() -> Self {
|
||||
Self::SpecificLayers(vec![0, 2]) // Q and V typically
|
||||
}
|
||||
|
||||
/// Select layers with highest gradient magnitude
|
||||
pub fn adaptive_top_k(k: usize) -> Self {
|
||||
Self::AdaptiveTopK(k)
|
||||
}
|
||||
}
|
||||
|
||||
/// SONA default: Output heads for micro, attention for base
|
||||
pub const SONA_DEFAULT_TARGETS: [LoRATargetStrategy; 2] = [
|
||||
LoRATargetStrategy::OutputHeadsOnly, // Micro-LoRA
|
||||
LoRATargetStrategy::AllAttention, // Base LoRA
|
||||
];
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 6. Memory-Efficient Storage
|
||||
|
||||
### Quantized LoRA Matrices
|
||||
|
||||
```rust
|
||||
/// Q4-quantized LoRA for memory efficiency
|
||||
pub struct QuantizedLoRA {
|
||||
/// Quantized A matrix (4-bit)
|
||||
pub a_q4: Q4Matrix,
|
||||
/// Quantized B matrix (4-bit)
|
||||
pub b_q4: Q4Matrix,
|
||||
/// Full-precision alpha
|
||||
pub alpha: f32,
|
||||
/// Full-precision scaling factors
|
||||
pub a_scales: Vec<f32>,
|
||||
pub b_scales: Vec<f32>,
|
||||
}
|
||||
|
||||
impl QuantizedLoRA {
|
||||
/// Memory usage comparison
|
||||
///
|
||||
/// FP32 LoRA (rank 8, 768 dim):
|
||||
/// A: 768 × 8 × 4 bytes = 24.6 KB
|
||||
/// B: 8 × 768 × 4 bytes = 24.6 KB
|
||||
/// Total: ~50 KB per layer
|
||||
///
|
||||
/// Q4 LoRA (rank 8, 768 dim):
|
||||
/// A: 768 × 8 × 0.5 bytes = 3.1 KB
|
||||
/// B: 8 × 768 × 0.5 bytes = 3.1 KB
|
||||
/// Scales: 2 × 768 × 4 bytes = 6.1 KB
|
||||
/// Total: ~12 KB per layer (4x reduction)
|
||||
|
||||
pub fn from_fp32(lora: &BaseLoRA) -> Self {
|
||||
Self {
|
||||
a_q4: Q4Matrix::quantize(&lora.a),
|
||||
b_q4: Q4Matrix::quantize(&lora.b),
|
||||
alpha: lora.alpha,
|
||||
a_scales: compute_scales(&lora.a),
|
||||
b_scales: compute_scales(&lora.b),
|
||||
}
|
||||
}
|
||||
|
||||
/// Dequantize on-the-fly during forward pass
|
||||
#[inline]
|
||||
pub fn forward(&self, x: &[f32]) -> Vec<f32> {
|
||||
// Dequantize A, compute x @ A
|
||||
let projected = self.a_q4.matmul_dequant(x, &self.a_scales);
|
||||
// Dequantize B, compute projected @ B
|
||||
let output = self.b_q4.matmul_dequant(&projected, &self.b_scales);
|
||||
// Scale by alpha
|
||||
output.iter().map(|v| v * self.alpha).collect()
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 7. Latency Breakdown
|
||||
|
||||
### Target: <100μs Total LoRA Overhead
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ LoRA-ULTRA LATENCY BUDGET │
|
||||
├─────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ Signal Extraction: 10μs ████░░░░░░░░░░░░░░░░░░░░░░░░ │
|
||||
│ Gradient Direction: 15μs ██████░░░░░░░░░░░░░░░░░░░░░░ │
|
||||
│ Micro-LoRA Update: 25μs ██████████░░░░░░░░░░░░░░░░░░ │
|
||||
│ Forward Pass Delta: 30μs ████████████░░░░░░░░░░░░░░░░ │
|
||||
│ Momentum Averaging: 10μs ████░░░░░░░░░░░░░░░░░░░░░░░░ │
|
||||
│ Memory Bookkeeping: 10μs ████░░░░░░░░░░░░░░░░░░░░░░░░ │
|
||||
│ ───── │
|
||||
│ TOTAL: ~100μs │
|
||||
│ │
|
||||
│ Amortized (batched): ~30μs per query │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 8. Integration with FastGRNN Router
|
||||
|
||||
### Router-Specific LoRA Configuration
|
||||
|
||||
```rust
|
||||
/// LoRA configuration for FastGRNN router
|
||||
pub struct RouterLoRAConfig {
|
||||
/// Base LoRA for hidden state transformations
|
||||
pub hidden_lora: BaseLoRA,
|
||||
/// Micro LoRA for gate adjustments
|
||||
pub gate_micro_lora: MicroLoRA,
|
||||
/// Per-output-head LoRA adapters
|
||||
pub head_loras: Vec<BaseLoRA>,
|
||||
}
|
||||
|
||||
impl RouterLoRAConfig {
|
||||
pub fn new(hidden_dim: usize, output_dims: &[usize]) -> Self {
|
||||
Self {
|
||||
hidden_lora: BaseLoRA::new(hidden_dim, hidden_dim, 8), // rank 8
|
||||
gate_micro_lora: MicroLoRA::new(hidden_dim, hidden_dim, 2), // rank 2
|
||||
head_loras: output_dims.iter()
|
||||
.map(|&dim| BaseLoRA::new(hidden_dim, dim, 4)) // rank 4
|
||||
.collect(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Apply LoRA to FastGRNN forward pass
|
||||
pub fn apply(&self, base_output: &FastGRNNOutput) -> FastGRNNOutput {
|
||||
let mut output = base_output.clone();
|
||||
|
||||
// Apply hidden state LoRA
|
||||
output.hidden = self.hidden_lora.apply(&output.hidden);
|
||||
|
||||
// Apply micro-LoRA to gates
|
||||
output.update_gate = self.gate_micro_lora.apply(&output.update_gate);
|
||||
|
||||
// Apply per-head LoRA
|
||||
for (i, head_lora) in self.head_loras.iter().enumerate() {
|
||||
output.heads[i] = head_lora.apply(&output.heads[i]);
|
||||
}
|
||||
|
||||
output
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 9. Checkpointing and Recovery
|
||||
|
||||
### Efficient LoRA State Management
|
||||
|
||||
```rust
|
||||
/// LoRA checkpoint for persistence and recovery
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct LoRACheckpoint {
|
||||
/// Base LoRA matrices (serialized as FP16 for space)
|
||||
pub base_lora: SerializedLoRA,
|
||||
/// Micro LoRA state
|
||||
pub micro_lora: SerializedLoRA,
|
||||
/// Momentum buffers
|
||||
pub momentum_state: MomentumState,
|
||||
/// Training statistics
|
||||
pub stats: LoRAStats,
|
||||
/// Checkpoint version
|
||||
pub version: u32,
|
||||
/// Timestamp
|
||||
pub timestamp: i64,
|
||||
}
|
||||
|
||||
impl LoRACheckpoint {
|
||||
/// Save checkpoint (async, non-blocking)
|
||||
pub async fn save_async(&self, path: &Path) -> Result<()> {
|
||||
let bytes = bincode::serialize(self)?;
|
||||
tokio::fs::write(path, &bytes).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Load checkpoint
|
||||
pub fn load(path: &Path) -> Result<Self> {
|
||||
let bytes = std::fs::read(path)?;
|
||||
Ok(bincode::deserialize(&bytes)?)
|
||||
}
|
||||
|
||||
/// Incremental checkpoint (only changed matrices)
|
||||
pub fn save_incremental(&self, previous: &Self, path: &Path) -> Result<()> {
|
||||
let delta = self.compute_delta(previous);
|
||||
// Only save changed blocks
|
||||
delta.save(path)
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 10. Benchmark Targets
|
||||
|
||||
### Performance Validation
|
||||
|
||||
```rust
|
||||
#[cfg(test)]
|
||||
mod benchmarks {
|
||||
use super::*;
|
||||
use criterion::{black_box, Criterion};
|
||||
|
||||
/// Target: <50μs for micro-LoRA update
|
||||
fn bench_micro_lora_update(c: &mut Criterion) {
|
||||
let mut micro = MicroLoRA::new(768, 768, 2);
|
||||
let signal = LearningSignal::random();
|
||||
|
||||
c.bench_function("micro_lora_update", |b| {
|
||||
b.iter(|| {
|
||||
micro.micro_update(black_box(&signal));
|
||||
})
|
||||
});
|
||||
}
|
||||
|
||||
/// Target: <30μs for LoRA forward pass
|
||||
fn bench_lora_forward(c: &mut Criterion) {
|
||||
let lora = BaseLoRA::new(768, 768, 8);
|
||||
let input = vec![0.0f32; 768];
|
||||
|
||||
c.bench_function("lora_forward", |b| {
|
||||
b.iter(|| {
|
||||
lora.forward(black_box(&input))
|
||||
})
|
||||
});
|
||||
}
|
||||
|
||||
/// Target: <10μs for signal extraction
|
||||
fn bench_signal_extraction(c: &mut Criterion) {
|
||||
let query = "test query".to_string();
|
||||
let response = "test response".to_string();
|
||||
|
||||
c.bench_function("signal_extraction", |b| {
|
||||
b.iter(|| {
|
||||
LearningSignal::extract(black_box(&query), black_box(&response))
|
||||
})
|
||||
});
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Summary
|
||||
|
||||
SONA LoRA-Ultra achieves sub-100μs adaptive fine-tuning through:
|
||||
|
||||
1. **Two-Tier Architecture**: Base LoRA (hourly) + Micro-LoRA (per-request)
|
||||
2. **SIMD Optimization**: AVX2-accelerated forward pass
|
||||
3. **Quantized Storage**: Q4 matrices for 4x memory reduction
|
||||
4. **Smart Targeting**: Output heads for speed, attention for capability
|
||||
5. **Momentum Smoothing**: Stable micro-updates with EMA
|
||||
6. **Async Checkpointing**: Non-blocking persistence
|
||||
|
||||
This enables true real-time self-improvement where every query makes the model incrementally smarter.
|
||||
Reference in New Issue
Block a user