Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'

2026-02-28 14:39:40 -05:00
parent 7885bf6278 d803bfe2b1
commit cd5943df23
7854 changed files with 3522914 additions and 0 deletions
--- a/vendor/ruvector/docs/ruvllm/API_REFERENCE.md
+++ b/vendor/ruvector/docs/ruvllm/API_REFERENCE.md
@@ -0,0 +1,862 @@
+# RuvLLM API Reference
+
+Complete API documentation for the RuvLLM crate.
+
+## Table of Contents
+
+- [Core Types](#core-types)
+- [Backend Trait](#backend-trait)
+- [Candle Backend](#candle-backend)
+- [LoRA Module](#lora-module)
+- [Optimization Module](#optimization-module)
+- [Kernel Functions](#kernel-functions)
+- [KV Cache](#kv-cache)
+- [Error Handling](#error-handling)
+- [Feature Flags Reference](#feature-flags-reference)
+
+---
+
+## Core Types
+
+### `Precision`
+
+Numeric precision for model weights and KV cache.
+
+```rust
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum Precision {
+    /// Full 32-bit floating point
+    FP32,
+    /// Half precision 16-bit float
+    FP16,
+    /// Brain floating point (16-bit)
+    BF16,
+    /// 8-bit integer quantization
+    Q8,
+    /// 4-bit integer quantization
+    Q4,
+    /// 4-bit K-quant (GGML-style)
+    Q4K,
+}
+
+impl Precision {
+    /// Get bytes per element for this precision
+    pub fn bytes_per_element(&self) -> u8;
+}
+```
+
+### `ModelSize`
+
+Model size classification for routing.
+
+```rust
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum ModelSize {
+    Tiny,   // < 1B params
+    Small,  // 1-3B params
+    Medium, // 3-13B params
+    Large,  // > 13B params
+}
+```
+
+### `DeviceType`
+
+Compute device selection.
+
+```rust
+#[derive(Debug, Clone, Copy)]
+pub enum DeviceType {
+    /// CPU (fallback)
+    Cpu,
+    /// Apple Metal GPU
+    Metal,
+    /// NVIDIA CUDA GPU
+    Cuda(usize),  // device index
+}
+```
+
+---
+
+## Backend Trait
+
+### `LlmBackend`
+
+Main trait for LLM inference backends.
+
+```rust
+pub trait LlmBackend: Send + Sync {
+    /// Load a model from HuggingFace Hub or local path
+    ///
+    /// # Arguments
+    /// * `model_id` - HuggingFace model ID or local path
+    /// * `config` - Model configuration
+    ///
+    /// # Example
+    /// ```
+    /// backend.load_model("Qwen/Qwen2.5-7B-Instruct", config)?;
+    /// ```
+    fn load_model(&mut self, model_id: &str, config: ModelConfig) -> Result<()>;
+
+    /// Generate text from a prompt
+    ///
+    /// # Arguments
+    /// * `prompt` - Input text prompt
+    /// * `params` - Generation parameters
+    ///
+    /// # Returns
+    /// Generated text response
+    ///
+    /// # Example
+    /// ```
+    /// let response = backend.generate("Hello!", GenerateParams::default())?;
+    /// ```
+    fn generate(&self, prompt: &str, params: GenerateParams) -> Result<String>;
+
+    /// Streaming text generation
+    ///
+    /// # Arguments
+    /// * `prompt` - Input text prompt
+    /// * `params` - Generation parameters
+    /// * `callback` - Called for each generated token
+    fn generate_stream<F>(&self, prompt: &str, params: GenerateParams, callback: F) -> Result<()>
+    where
+        F: FnMut(&str) -> bool;
+
+    /// Get the tokenizer for this model
+    fn tokenizer(&self) -> Option<&dyn Tokenizer>;
+
+    /// Get model metadata
+    fn model_info(&self) -> Option<ModelInfo>;
+
+    /// Check if a model is loaded
+    fn is_loaded(&self) -> bool;
+}
+```
+
+### `ModelConfig`
+
+Configuration for model loading.
+
+```rust
+#[derive(Debug, Clone)]
+pub struct ModelConfig {
+    /// Maximum context length
+    pub max_context: usize,
+    /// Use Flash Attention
+    pub use_flash_attention: bool,
+    /// Weight quantization level
+    pub quantization: Precision,
+    /// KV cache configuration
+    pub kv_cache_config: KvCacheConfig,
+    /// Device to load model on
+    pub device: DeviceType,
+    /// HuggingFace token for gated models
+    pub hf_token: Option<String>,
+}
+
+impl Default for ModelConfig {
+    fn default() -> Self {
+        Self {
+            max_context: 4096,
+            use_flash_attention: true,
+            quantization: Precision::Q4K,
+            kv_cache_config: KvCacheConfig::default(),
+            device: DeviceType::Metal,
+            hf_token: None,
+        }
+    }
+}
+```
+
+### `GenerateParams`
+
+Parameters for text generation.
+
+```rust
+#[derive(Debug, Clone)]
+pub struct GenerateParams {
+    /// Maximum tokens to generate
+    pub max_tokens: usize,
+    /// Sampling temperature (0.0 = deterministic)
+    pub temperature: f32,
+    /// Top-p (nucleus) sampling
+    pub top_p: f32,
+    /// Top-k sampling (0 = disabled)
+    pub top_k: usize,
+    /// Repetition penalty
+    pub repetition_penalty: f32,
+    /// Stop sequences
+    pub stop_sequences: Vec<String>,
+    /// Random seed for reproducibility
+    pub seed: Option<u64>,
+}
+
+impl Default for GenerateParams {
+    fn default() -> Self {
+        Self {
+            max_tokens: 256,
+            temperature: 0.7,
+            top_p: 0.9,
+            top_k: 0,
+            repetition_penalty: 1.1,
+            stop_sequences: vec![],
+            seed: None,
+        }
+    }
+}
+```
+
+---
+
+## Candle Backend
+
+### `CandleBackend`
+
+HuggingFace Candle-based inference backend.
+
+```rust
+impl CandleBackend {
+    /// Create a new backend with default device
+    ///
+    /// # Example
+    /// ```
+    /// let backend = CandleBackend::new()?;
+    /// ```
+    pub fn new() -> Result<Self>;
+
+    /// Create with specific device
+    ///
+    /// # Example
+    /// ```
+    /// let backend = CandleBackend::with_device(DeviceType::Metal)?;
+    /// ```
+    pub fn with_device(device: DeviceType) -> Result<Self>;
+
+    /// Download model from HuggingFace Hub
+    ///
+    /// # Arguments
+    /// * `model_id` - HuggingFace model ID
+    /// * `quantization` - Target quantization
+    /// * `cache_dir` - Local cache directory
+    ///
+    /// # Example
+    /// ```
+    /// let path = backend.download_model(
+    ///     "Qwen/Qwen2.5-7B-Instruct",
+    ///     Precision::Q4K,
+    ///     "~/.cache/ruvllm"
+    /// ).await?;
+    /// ```
+    pub async fn download_model(
+        &self,
+        model_id: &str,
+        quantization: Precision,
+        cache_dir: &str,
+    ) -> Result<PathBuf>;
+
+    /// Get current device
+    pub fn device(&self) -> DeviceType;
+
+    /// Get memory usage statistics
+    pub fn memory_stats(&self) -> MemoryStats;
+}
+```
+
+---
+
+## LoRA Module
+
+### `MicroLoRA`
+
+Real-time per-request fine-tuning with rank 1-2 adapters.
+
+```rust
+impl MicroLoRA {
+    /// Create a new MicroLoRA instance
+    ///
+    /// # Example
+    /// ```
+    /// let config = MicroLoraConfig::for_hidden_dim(4096);
+    /// let lora = MicroLoRA::new(config);
+    /// ```
+    pub fn new(config: MicroLoraConfig) -> Self;
+
+    /// Adapt on new input with feedback
+    ///
+    /// # Arguments
+    /// * `input` - Input embedding vector
+    /// * `feedback` - Quality feedback for learning
+    ///
+    /// # Example
+    /// ```
+    /// let feedback = AdaptFeedback::from_quality(0.9);
+    /// lora.adapt(&input_embedding, feedback)?;
+    /// ```
+    pub fn adapt(&self, input: &[f32], feedback: AdaptFeedback) -> Result<()>;
+
+    /// Forward pass through LoRA adapter
+    ///
+    /// # Arguments
+    /// * `input` - Input tensor
+    /// * `module` - Target module (any `TargetModule` variant: Q/K/V/O attention or gate/up/down FFN projections)
+    ///
+    /// # Returns
+    /// Output with LoRA contribution added
+    ///
+    /// # Example
+    /// ```
+    /// let output = lora.forward(&input, &TargetModule::QProj);
+    /// ```
+    pub fn forward(&self, input: &[f32], module: &TargetModule) -> Vec<f32>;
+
+    /// Forward pass that adds to existing output (in-place)
+    pub fn forward_add(&self, input: &[f32], module: &TargetModule, output: &mut [f32]);
+
+    /// Apply accumulated gradient updates
+    ///
+    /// # Arguments
+    /// * `learning_rate` - Learning rate for update
+    pub fn apply_updates(&self, learning_rate: f32);
+
+    /// Apply updates with EWC++ regularization
+    ///
+    /// # Arguments
+    /// * `learning_rate` - Learning rate
+    /// * `ewc_states` - EWC++ state per module
+    /// * `ewc_lambda` - EWC regularization strength
+    pub fn apply_updates_with_ewc(
+        &self,
+        learning_rate: f32,
+        ewc_states: &HashMap<TargetModule, EwcState>,
+        ewc_lambda: f32,
+    );
+
+    /// Reset all adapter weights
+    pub fn reset(&self);
+
+    /// Get adapter statistics
+    pub fn stats(&self) -> MicroLoraStats;
+}
+```
+
+### `MicroLoraConfig`
+
+Configuration for MicroLoRA adapters.
+
+```rust
+#[derive(Debug, Clone)]
+pub struct MicroLoraConfig {
+    /// Input feature dimension
+    pub in_features: usize,
+    /// Output feature dimension
+    pub out_features: usize,
+    /// LoRA rank (1-2 for MicroLoRA)
+    pub rank: usize,
+    /// LoRA alpha scaling factor
+    pub alpha: f32,
+    /// Dropout probability
+    pub dropout: f32,
+    /// Target modules to adapt
+    pub target_modules: Vec<TargetModule>,
+    /// Enable gradient checkpointing
+    pub gradient_checkpointing: bool,
+}
+
+impl MicroLoraConfig {
+    /// Create config for a specific hidden dimension
+    ///
+    /// # Example
+    /// ```
+    /// let config = MicroLoraConfig::for_hidden_dim(4096);
+    /// assert_eq!(config.in_features, 4096);
+    /// assert_eq!(config.rank, 2);
+    /// ```
+    pub fn for_hidden_dim(hidden_dim: usize) -> Self;
+}
+```
+
+### `TargetModule`
+
+Transformer modules that can be adapted.
+
+```rust
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub enum TargetModule {
+    /// Query projection
+    QProj,
+    /// Key projection
+    KProj,
+    /// Value projection
+    VProj,
+    /// Output projection
+    OProj,
+    /// Gate projection (FFN)
+    GateProj,
+    /// Up projection (FFN)
+    UpProj,
+    /// Down projection (FFN)
+    DownProj,
+}
+```
+
+### `AdaptFeedback`
+
+Feedback for LoRA adaptation.
+
+```rust
+#[derive(Debug, Clone)]
+pub struct AdaptFeedback {
+    /// Quality score (0.0 - 1.0)
+    pub quality: f32,
+    /// Gradient estimate from feedback
+    pub gradient_estimate: Vec<f32>,
+    /// Optional reward signal
+    pub reward: Option<f32>,
+    /// Latency in microseconds
+    pub latency_us: u64,
+    /// Source module (optional)
+    pub source_module: Option<TargetModule>,
+    /// Session identifier
+    pub session_id: Option<String>,
+}
+
+impl AdaptFeedback {
+    /// Create feedback from quality score
+    ///
+    /// # Example
+    /// ```
+    /// let feedback = AdaptFeedback::from_quality(0.85);
+    /// ```
+    pub fn from_quality(quality: f32) -> Self;
+}
+```
+
+---
+
+## Optimization Module
+
+### `SonaLlm`
+
+SONA learning integration for LLM inference.
+
+```rust
+impl SonaLlm {
+    /// Create new SONA LLM integration
+    ///
+    /// # Example
+    /// ```
+    /// let sona = SonaLlm::new(SonaLlmConfig::default());
+    /// ```
+    pub fn new(config: SonaLlmConfig) -> Self;
+
+    /// Instant loop: per-request MicroLoRA adaptation
+    ///
+    /// Target latency: <1ms
+    ///
+    /// # Arguments
+    /// * `request` - User query text
+    /// * `response` - Model response text
+    /// * `feedback` - Quality score (0.0 - 1.0)
+    ///
+    /// # Returns
+    /// Adaptation result with statistics
+    ///
+    /// # Example
+    /// ```
+    /// let result = sona.instant_adapt(
+    ///     "What is machine learning?",
+    ///     "Machine learning is...",
+    ///     0.9
+    /// );
+    /// assert!(result.applied);
+    /// assert!(result.latency_us < 1000); // <1ms
+    /// ```
+    pub fn instant_adapt(&self, request: &str, response: &str, feedback: f32) -> AdaptationResult;
+
+    /// Background loop: consolidate patterns
+    ///
+    /// Called periodically (~100ms interval)
+    ///
+    /// # Example
+    /// ```
+    /// let result = sona.background_consolidate();
+    /// println!("Consolidated {} samples", result.samples_used);
+    /// ```
+    pub fn background_consolidate(&self) -> AdaptationResult;
+
+    /// Deep loop: trigger full optimization
+    ///
+    /// # Arguments
+    /// * `dataset` - Training samples to learn from
+    pub fn deep_optimize(&self, dataset: &[TrainingSample]) -> AdaptationResult;
+
+    /// Check if background loop should run
+    pub fn maybe_background(&self) -> Option<AdaptationResult>;
+
+    /// Check if deep loop should be triggered
+    pub fn should_trigger_deep(&self) -> bool;
+
+    /// Get current statistics
+    pub fn stats(&self) -> LearningLoopStats;
+
+    /// Forward pass through MicroLoRA
+    pub fn forward(&self, input: &[f32], module: &TargetModule) -> Vec<f32>;
+
+    /// Reset all learning state
+    pub fn reset(&self);
+}
+```
+
+### `SonaLlmConfig`
+
+Configuration for SONA LLM integration.
+
+```rust
+#[derive(Debug, Clone)]
+pub struct SonaLlmConfig {
+    /// MicroLoRA configuration
+    pub micro_lora: MicroLoraConfig,
+    /// Training pipeline configuration
+    pub training: TrainingConfig,
+    /// SONA core configuration
+    pub sona: SonaConfig,
+    /// Instant loop learning rate
+    pub instant_lr: f32,
+    /// Background loop interval (milliseconds)
+    pub background_interval_ms: u64,
+    /// Minimum samples for background consolidation
+    pub background_min_samples: usize,
+    /// Deep loop trigger threshold
+    pub deep_trigger_threshold: f32,
+    /// Maximum pending samples
+    pub max_pending_samples: usize,
+    /// Consolidation strategy
+    pub consolidation_strategy: ConsolidationStrategy,
+}
+```
+
+### `ConsolidationStrategy`
+
+Strategy for consolidating learned patterns.
+
+```rust
+#[derive(Debug, Clone, Copy)]
+pub enum ConsolidationStrategy {
+    /// Merge with EWC++ regularization (default)
+    EwcMerge,
+    /// Simple averaging
+    Average,
+    /// Weighted by quality
+    QualityWeighted,
+    /// Keep best performing only
+    BestOnly,
+    /// Ensemble multiple adapters
+    Ensemble,
+}
+```
+
+---
+
+## Kernel Functions
+
+### Attention Kernels
+
+```rust
+/// Flash Attention 2 with NEON SIMD optimization
+///
+/// Memory-efficient attention with O(N) complexity, where N is the KV length (`kv_len`).
+///
+/// # Arguments
+/// * `query` - Query tensor (head_dim,)
+/// * `key` - Key tensor (kv_len, head_dim)
+/// * `value` - Value tensor (kv_len, head_dim)
+/// * `scale` - Softmax scale (typically 1/sqrt(head_dim))
+/// * `causal` - Apply causal masking
+///
+/// # Returns
+/// Output tensor (head_dim,)
+///
+/// # Example
+/// ```
+/// let scale = 1.0 / (head_dim as f32).sqrt();
+/// let output = flash_attention_neon(&query, &key, &value, scale, true);
+/// ```
+pub fn flash_attention_neon(
+    query: &[f32],
+    key: &[f32],
+    value: &[f32],
+    scale: f32,
+    causal: bool,
+) -> Vec<f32>;
+
+/// Paged Attention for KV cache
+///
+/// # Arguments
+/// * `query` - Query tensor
+/// * `kv_cache` - Paged KV cache
+/// * `block_tables` - Block index mapping
+/// * `scale` - Softmax scale
+pub fn paged_attention_neon(
+    query: &[f32],
+    kv_cache: &PagedKvCache,
+    block_tables: &[usize],
+    scale: f32,
+) -> Vec<f32>;
+
+/// Grouped-Query Attention (GQA)
+///
+/// Each KV head is shared by a group of query heads.
+///
+/// # Arguments
+/// * `queries` - Query tensor (num_heads, head_dim)
+/// * `keys` - Key tensor (kv_len, num_kv_heads, head_dim)
+/// * `values` - Value tensor (kv_len, num_kv_heads, head_dim)
+/// * `config` - Attention configuration
+pub fn grouped_query_attention_neon(
+    queries: &[f32],
+    keys: &[f32],
+    values: &[f32],
+    config: &AttentionConfig,
+) -> Vec<f32>;
+
+/// Multi-Query Attention (MQA)
+///
+/// Single KV head shared across all query heads.
+pub fn multi_query_attention_neon(
+    queries: &[f32],
+    key: &[f32],
+    value: &[f32],
+    config: &AttentionConfig,
+) -> Vec<f32>;
+```
+
+### `AttentionConfig`
+
+Configuration for attention operations.
+
+```rust
+#[derive(Debug, Clone)]
+pub struct AttentionConfig {
+    /// Number of query heads
+    pub num_heads: usize,
+    /// Number of KV heads (for GQA)
+    pub num_kv_heads: usize,
+    /// Dimension per head
+    pub head_dim: usize,
+    /// Apply causal masking
+    pub causal: bool,
+    /// Custom scale factor (None = 1/sqrt(head_dim))
+    pub scale: Option<f32>,
+}
+
+impl AttentionConfig {
+    /// Calculate GQA ratio (query heads / KV heads)
+    pub fn gqa_ratio(&self) -> usize;
+
+    /// Get effective scale factor
+    pub fn effective_scale(&self) -> f32;
+}
+```
+
+---
+
+## KV Cache
+
+### `TwoTierKvCache`
+
+Two-tier KV cache with FP16 tail and quantized store.
+
+```rust
+impl TwoTierKvCache {
+    /// Create a new two-tier KV cache
+    ///
+    /// # Example
+    /// ```
+    /// let config = KvCacheConfig {
+    ///     tail_length: 256,
+    ///     max_tokens: 4096,
+    ///     ..Default::default()
+    /// };
+    /// let cache = TwoTierKvCache::new(config);
+    /// ```
+    pub fn new(config: KvCacheConfig) -> Self;
+
+    /// Append new KV pairs
+    ///
+    /// Automatically handles:
+    /// - Adding to tail
+    /// - Migrating to quantized store
+    /// - Evicting oldest tokens
+    ///
+    /// # Arguments
+    /// * `keys` - Key tensor
+    /// * `values` - Value tensor
+    ///
+    /// # Example
+    /// ```
+    /// cache.append(&keys, &values)?;
+    /// ```
+    pub fn append(&self, keys: &[f32], values: &[f32]) -> Result<()>;
+
+    /// Get all KV pairs for attention
+    ///
+    /// Returns (keys, values) with the quantized store dequantized.
+    pub fn get_all_kv(&self) -> (Vec<f32>, Vec<f32>);
+
+    /// Compute attention with tier-aware access
+    ///
+    /// # Arguments
+    /// * `query` - Query tensor
+    /// * `scale` - Softmax scale
+    pub fn attend(&self, query: &[f32], scale: f32) -> Result<Vec<f32>>;
+
+    /// Get current statistics
+    pub fn stats(&self) -> KvCacheStats;
+
+    /// Clear the cache
+    pub fn clear(&self);
+
+    /// Update quantization policy
+    pub fn update_policy(&self, policy: CacheQuantization);
+}
+```
+
+### `KvCacheConfig`
+
+Configuration for KV cache.
+
+```rust
+#[derive(Debug, Clone)]
+pub struct KvCacheConfig {
+    /// Tokens to keep in high-precision tail
+    pub tail_length: usize,
+    /// Precision for tail storage
+    pub tail_precision: Precision,
+    /// Precision for quantized store
+    pub store_precision: Precision,
+    /// Maximum total tokens
+    pub max_tokens: usize,
+    /// Number of KV heads
+    pub num_kv_heads: usize,
+    /// Head dimension
+    pub head_dim: usize,
+    /// Migration batch size
+    pub migration_batch: usize,
+}
+```
+
+### `KvCacheStats`
+
+Statistics for KV cache usage.
+
+```rust
+#[derive(Debug, Clone)]
+pub struct KvCacheStats {
+    /// Total tokens cached
+    pub total_tokens: usize,
+    /// Tokens in high-precision tail
+    pub tail_tokens: usize,
+    /// Tokens in quantized store
+    pub store_tokens: usize,
+    /// Bytes used by tail
+    pub tail_bytes: usize,
+    /// Bytes used by store
+    pub store_bytes: usize,
+    /// Compression ratio
+    pub compression_ratio: f32,
+}
+```
+
+---
+
+## Error Handling
+
+### `RuvLLMError`
+
+Main error type for RuvLLM operations.
+
+```rust
+#[derive(Error, Debug)]
+pub enum RuvLLMError {
+    /// Storage-related errors
+    #[error("Storage error: {0}")]
+    Storage(String),
+
+    /// Session management errors
+    #[error("Session error: {0}")]
+    Session(String),
+
+    /// KV cache errors
+    #[error("KV cache error: {0}")]
+    KvCache(String),
+
+    /// Paged attention errors
+    #[error("Paged attention error: {0}")]
+    PagedAttention(String),
+
+    /// Adapter management errors
+    #[error("Adapter error: {0}")]
+    Adapter(String),
+
+    /// SONA learning errors
+    #[error("SONA error: {0}")]
+    Sona(String),
+
+    /// Configuration errors
+    #[error("Configuration error: {0}")]
+    Config(String),
+
+    /// Out of memory
+    #[error("Out of memory: {0}")]
+    OutOfMemory(String),
+
+    /// Invalid operation
+    #[error("Invalid operation: {0}")]
+    InvalidOperation(String),
+
+    /// Not found
+    #[error("Not found: {0}")]
+    NotFound(String),
+
+    /// Backend inference errors
+    #[error("Backend error: {0}")]
+    Backend(String),
+
+    /// Model loading errors
+    #[error("Model error: {0}")]
+    Model(String),
+
+    /// Tokenization errors
+    #[error("Tokenization error: {0}")]
+    Tokenization(String),
+
+    /// Generation errors
+    #[error("Generation error: {0}")]
+    Generation(String),
+
+    /// IO errors
+    #[error("IO error: {0}")]
+    Io(#[from] std::io::Error),
+}
+```
+
+### `Result` Type Alias
+
+```rust
+/// Result type alias for RuvLLM operations
+pub type Result<T> = std::result::Result<T, RuvLLMError>;
+```
+
+---
+
+## Feature Flags Reference
+
+| Feature | Dependencies | Description |
+|---------|-------------|-------------|
+| `default` | `async-runtime` | Standard async support |
+| `async-runtime` | `tokio` | Tokio async runtime |
+| `wasm` | - | WebAssembly support |
+| `candle` | `candle-*`, `tokenizers`, `hf-hub` | Candle ML backend |
+| `metal` | `candle/metal` | Apple Metal GPU |
+| `cuda` | `candle/cuda` | NVIDIA CUDA GPU |
+| `inference-metal` | `candle`, `metal` | Full Metal stack |
+| `inference-cuda` | `candle`, `cuda` | Full CUDA stack |