Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'

2026-02-28 14:39:40 -05:00
parent 7885bf6278 d803bfe2b1
commit cd5943df23
7854 changed files with 3522914 additions and 0 deletions
--- a/vendor/ruvector/examples/ruvLLM/src/inference_real.rs
+++ b/vendor/ruvector/examples/ruvLLM/src/inference_real.rs
@@ -0,0 +1,471 @@
+//! Real LLM Inference with CPU SIMD Optimization
+//!
+//! Uses candle for native Rust tensor operations with SIMD support (AVX2/AVX512).
+//! Optimized for CPU sandbox environments with small, efficient models.
+
+#[cfg(feature = "real-inference")]
+mod real {
+    use candle_core::{DType, Device, Tensor, D};
+    use candle_nn::{linear, Linear, Module, VarBuilder};
+    use candle_transformers::models::quantized_llama as llama;
+    use hf_hub::{api::tokio::Api, Repo, RepoType};
+    use tokenizers::Tokenizer;
+
+    use crate::config::InferenceConfig;
+    use crate::error::{Error, InferenceError, Result};
+    use crate::types::ModelSize;
+
+    use dashmap::DashMap;
+    use parking_lot::RwLock;
+    use std::path::PathBuf;
+    use std::sync::Arc;
+    use std::time::Instant;
+
+    /// Catalogue of the compact language models this module can fetch and run.
+    ///
+    /// Each variant is mapped to Hugging Face repository names, a GGUF file name
+    /// and a context size by the inherent methods on this type.
+    #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+    pub enum SmallModel {
+        /// SmolLM, 135M parameters: the smallest entry.
+        SmolLM135M,
+        /// SmolLM, 360M parameters: higher quality while remaining fast.
+        SmolLM360M,
+        /// Qwen2, 0.5B parameters: a middle ground between size and quality.
+        Qwen2_500M,
+        /// TinyLlama, 1.1B parameters: the largest entry.
+        TinyLlama1B,
+    }
+
+    impl SmallModel {
+        /// Hugging Face repository id of the original (non-GGUF) model.
+        ///
+        /// `load_tokenizer` downloads `tokenizer.json` from this repository.
+        pub fn repo_id(&self) -> &'static str {
+            match *self {
+                Self::SmolLM135M => "HuggingFaceTB/SmolLM-135M",
+                Self::SmolLM360M => "HuggingFaceTB/SmolLM-360M",
+                Self::Qwen2_500M => "Qwen/Qwen2-0.5B",
+                Self::TinyLlama1B => "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+            }
+        }
+
+        /// Hugging Face repository id that `load_model` downloads the GGUF file from.
+        pub fn quantized_repo(&self) -> &'static str {
+            match *self {
+                Self::SmolLM135M => "HuggingFaceTB/SmolLM-135M-GGUF",
+                Self::SmolLM360M => "HuggingFaceTB/SmolLM-360M-GGUF",
+                Self::Qwen2_500M => "Qwen/Qwen2-0.5B-GGUF",
+                Self::TinyLlama1B => "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
+            }
+        }
+
+        /// Name of the GGUF file requested from [`Self::quantized_repo`].
+        ///
+        /// NOTE(review): the Qwen2 file name contains "instruct" while both repo ids
+        /// name the base model; confirm the repository/file pairs exist on the Hub.
+        pub fn gguf_file(&self) -> &'static str {
+            match *self {
+                Self::SmolLM135M => "smollm-135m-q4_k_m.gguf",
+                Self::SmolLM360M => "smollm-360m-q4_k_m.gguf",
+                Self::Qwen2_500M => "qwen2-0_5b-instruct-q4_k_m.gguf",
+                Self::TinyLlama1B => "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
+            }
+        }
+
+        /// Context window, in tokens, associated with the model.
+        pub fn context_size(&self) -> usize {
+            match *self {
+                Self::Qwen2_500M => 4096,
+                Self::SmolLM135M | Self::SmolLM360M | Self::TinyLlama1B => 2048,
+            }
+        }
+
+        /// Picks the small model that stands in for a requested [`ModelSize`] tier.
+        pub fn from_model_size(size: ModelSize) -> Self {
+            match size {
+                ModelSize::M350 => Self::SmolLM135M,
+                ModelSize::M700 => Self::SmolLM360M,
+                ModelSize::B1_2 => Self::Qwen2_500M,
+                ModelSize::B2_6 => Self::TinyLlama1B,
+            }
+        }
+    }
+
+    /// Knobs that control how `generate` and `sample_token` pick tokens.
+    #[derive(Debug, Clone)]
+    pub struct GenerationConfig {
+        /// Upper bound on the number of tokens produced by one `generate` call.
+        pub max_tokens: usize,
+        /// Softmax temperature; logits are divided by it when it is positive.
+        pub temperature: f32,
+        /// Nucleus threshold: candidates are kept until their cumulative
+        /// probability exceeds this value.
+        pub top_p: f32,
+        /// Upper bound on the number of highest-probability candidates kept.
+        pub top_k: usize,
+        /// Penalty applied to the logits of tokens that were already generated.
+        pub repeat_penalty: f32,
+        /// Random seed. The sampler in this module draws from
+        /// `rand::thread_rng()`, so this value does not influence sampling here.
+        pub seed: u64,
+    }
+
+    impl Default for GenerationConfig {
+        /// Defaults: 256 tokens, temperature 0.7, top-p 0.9, top-k 40,
+        /// repeat penalty 1.1, seed 42.
+        fn default() -> Self {
+            GenerationConfig {
+                // Output length limit.
+                max_tokens: 256,
+                // Sampling distribution shaping.
+                temperature: 0.7,
+                top_p: 0.9,
+                top_k: 40,
+                repeat_penalty: 1.1,
+                // Fixed seed value.
+                seed: 42,
+            }
+        }
+    }
+
+    /// Outcome of one `generate` call.
+    #[derive(Debug, Clone)]
+    pub struct GenerationResult {
+        /// Decoded text of the generated tokens (the prompt is not included).
+        pub text: String,
+        /// Number of tokens sampled before stopping; the end-of-sequence token
+        /// is not counted.
+        pub tokens_generated: usize,
+        /// The model size tier that was requested for this generation.
+        pub model_used: ModelSize,
+        /// Set to `true` when the caller supplied a session key.
+        pub cache_hit: bool,
+        /// Wall-clock duration of the whole call in milliseconds, measured from
+        /// the start of `generate` (model and tokenizer loading included).
+        pub inference_time_ms: f64,
+        /// `tokens_generated` divided by the elapsed time, in tokens per second.
+        pub tokens_per_second: f64,
+    }
+
+    /// Accumulated key/value tensors for one layer.
+    struct KvCache {
+        /// Keys stored so far; `None` until the first `append`.
+        key: Option<Tensor>,
+        /// Values stored so far; `None` until the first `append`.
+        value: Option<Tensor>,
+        /// Size of dimension 2 of the stored key tensor (0 when empty).
+        seq_len: usize,
+    }
+
+    impl KvCache {
+        /// Creates an empty cache.
+        fn new() -> Self {
+            Self {
+                key: None,
+                value: None,
+                seq_len: 0,
+            }
+        }
+
+        /// Appends `key`/`value` to the stored tensors along dimension 2 and
+        /// returns the full (concatenated) pair.
+        ///
+        /// When the cache is empty the inputs are stored as-is.
+        ///
+        /// # Errors
+        ///
+        /// Returns an error when concatenation fails or when the resulting key
+        /// tensor has no dimension 2. The cache is left untouched in that case.
+        fn append(&mut self, key: Tensor, value: Tensor) -> Result<(Tensor, Tensor)> {
+            let (key, value) = match (self.key.as_ref(), self.value.as_ref()) {
+                (Some(prev_key), Some(prev_value)) => (
+                    Tensor::cat(&[prev_key, &key], 2)?,
+                    Tensor::cat(&[prev_value, &value], 2)?,
+                ),
+                _ => (key, value),
+            };
+
+            // `dim` reports a missing dimension as an error, whereas indexing
+            // `dims()[2]` would panic on a tensor with fewer than three dimensions.
+            // It is evaluated before any field is written so a failure cannot
+            // leave the cache half-updated.
+            let seq_len = key.dim(2)?;
+
+            self.seq_len = seq_len;
+            self.key = Some(key.clone());
+            self.value = Some(value.clone());
+            Ok((key, value))
+        }
+
+        /// Drops the stored tensors and returns the cache to its empty state.
+        fn reset(&mut self) {
+            self.key = None;
+            self.value = None;
+            self.seq_len = 0;
+        }
+    }
+
+    /// Pool that downloads, caches and runs small GGUF models on the CPU.
+    pub struct RealInferencePool {
+        /// Device used for tensors; `new` always sets this to `Device::Cpu`.
+        device: Device,
+        /// GGUF model weights that have already been loaded, keyed by model.
+        models: DashMap<SmallModel, Arc<llama::ModelWeights>>,
+        /// Tokenizers that have already been loaded, keyed by model.
+        tokenizers: DashMap<SmallModel, Arc<Tokenizer>>,
+        /// Per-session lists of KV caches, keyed by a session string.
+        kv_caches: DashMap<String, Vec<KvCache>>,
+        /// Copy of the configuration passed to `new`.
+        config: InferenceConfig,
+        /// Directory created by `new` (`<cache dir>/ruvllm/models`).
+        cache_dir: PathBuf,
+    }
+
+    impl RealInferencePool {
+        /// Builds an empty pool bound to the CPU device.
+        ///
+        /// The directory `<user cache dir>/ruvllm/models` is created if needed;
+        /// when no user cache directory is known, the current directory is used
+        /// as the base instead.
+        ///
+        /// # Errors
+        ///
+        /// Returns `InferenceError::InitFailed` when the directory cannot be
+        /// created.
+        pub async fn new(config: &InferenceConfig) -> Result<Self> {
+            // Resolve the on-disk location first.
+            let mut cache_dir = dirs::cache_dir().unwrap_or_else(|| PathBuf::from("."));
+            cache_dir.push("ruvllm");
+            cache_dir.push("models");
+
+            if let Err(e) = tokio::fs::create_dir_all(&cache_dir).await {
+                return Err(Error::Inference(InferenceError::InitFailed(format!(
+                    "Failed to create cache dir: {}",
+                    e
+                ))));
+            }
+
+            let pool = Self {
+                // CPU device; candle selects its own kernels for it.
+                device: Device::Cpu,
+                models: DashMap::new(),
+                tokenizers: DashMap::new(),
+                kv_caches: DashMap::new(),
+                config: config.clone(),
+                cache_dir,
+            };
+            Ok(pool)
+        }
+
+        /// Returns the weights for `model`, downloading and parsing the GGUF file
+        /// on first use and serving later calls from the in-memory map.
+        ///
+        /// # Errors
+        ///
+        /// Returns `InferenceError::InitFailed` when the Hub client cannot be
+        /// created, the download fails, the file cannot be opened, or the GGUF
+        /// content cannot be parsed or loaded.
+        ///
+        /// NOTE(review): opening and parsing the file is blocking work performed
+        /// inside an async function; consider `tokio::task::spawn_blocking`.
+        async fn load_model(&self, model: SmallModel) -> Result<Arc<llama::ModelWeights>> {
+            // Fast path: already loaded.
+            if let Some(loaded) = self.models.get(&model) {
+                return Ok(Arc::clone(loaded.value()));
+            }
+
+            let init_failed = |msg: String| Error::Inference(InferenceError::InitFailed(msg));
+
+            tracing::info!("Downloading model: {:?}", model);
+
+            // Download from HuggingFace Hub
+            let api = Api::new().map_err(|e| init_failed(format!("HF API error: {}", e)))?;
+
+            let repo = api.repo(Repo::with_revision(
+                model.quantized_repo().to_string(),
+                RepoType::Model,
+                "main".to_string(),
+            ));
+
+            let model_path = repo
+                .get(model.gguf_file())
+                .await
+                .map_err(|e| init_failed(format!("Failed to download model: {}", e)))?;
+
+            tracing::info!("Loading GGUF model from: {:?}", model_path);
+
+            let mut file = std::fs::File::open(&model_path)
+                .map_err(|e| init_failed(format!("Failed to open model: {}", e)))?;
+
+            // `from_gguf` takes the parsed GGUF container description plus a reader
+            // for the tensor data. Parse the description first, then hand the same
+            // file over as the reader (the file must not be moved into the call).
+            let content = candle_core::quantized::gguf_file::Content::read(&mut file)
+                .map_err(|e| init_failed(format!("Failed to parse GGUF: {}", e)))?;
+
+            let model_weights = llama::ModelWeights::from_gguf(content, &mut file, &self.device)
+                .map_err(|e| init_failed(format!("Failed to load GGUF: {}", e)))?;
+
+            let model_arc = Arc::new(model_weights);
+            self.models.insert(model, Arc::clone(&model_arc));
+
+            Ok(model_arc)
+        }
+
+        /// Returns the tokenizer for `model`, fetching `tokenizer.json` from the
+        /// model's base repository on first use and caching it afterwards.
+        ///
+        /// # Errors
+        ///
+        /// Returns `InferenceError::InitFailed` when the Hub client cannot be
+        /// created, the download fails, or the tokenizer file cannot be parsed.
+        async fn load_tokenizer(&self, model: SmallModel) -> Result<Arc<Tokenizer>> {
+            // Serve from the map when this model's tokenizer is already present.
+            if let Some(cached) = self.tokenizers.get(&model) {
+                return Ok(cached.clone());
+            }
+
+            let api = match Api::new() {
+                Ok(client) => client,
+                Err(e) => {
+                    return Err(Error::Inference(InferenceError::InitFailed(format!(
+                        "HF API error: {}",
+                        e
+                    ))))
+                }
+            };
+
+            let repo_id = model.repo_id().to_string();
+            let repo = api.repo(Repo::new(repo_id, RepoType::Model));
+
+            let tokenizer_path = match repo.get("tokenizer.json").await {
+                Ok(path) => path,
+                Err(e) => {
+                    return Err(Error::Inference(InferenceError::InitFailed(format!(
+                        "Failed to download tokenizer: {}",
+                        e
+                    ))))
+                }
+            };
+
+            let tokenizer = match Tokenizer::from_file(tokenizer_path) {
+                Ok(parsed) => parsed,
+                Err(e) => {
+                    return Err(Error::Inference(InferenceError::InitFailed(format!(
+                        "Failed to load tokenizer: {}",
+                        e
+                    ))))
+                }
+            };
+
+            // Publish the tokenizer for later calls and hand back a shared handle.
+            let shared = Arc::new(tokenizer);
+            self.tokenizers.insert(model, shared.clone());
+
+            Ok(shared)
+        }
+
+        /// Picks the next token id from `logits`.
+        ///
+        /// Steps, in order:
+        /// 1. repetition penalty on every distinct token in `generated_tokens`;
+        /// 2. greedy arg-max when `config.temperature` is not positive, otherwise
+        ///    temperature scaling followed by softmax;
+        /// 3. nucleus (top-p) cut-off, then top-k cut-off (`top_k == 0` disables
+        ///    the top-k limit);
+        /// 4. a random draw among the surviving candidates.
+        ///
+        /// `logits` is squeezed twice on dimension 0 and must then be
+        /// one-dimensional (one entry per vocabulary id).
+        ///
+        /// # Errors
+        ///
+        /// Returns an error when the tensor cannot be converted to a flat `f32`
+        /// vector, when it is empty, or when it contains NaN.
+        ///
+        /// NOTE(review): `config.seed` is not used; randomness comes from
+        /// `rand::thread_rng()`, so results are not reproducible across calls.
+        fn sample_token(
+            &self,
+            logits: &Tensor,
+            config: &GenerationConfig,
+            generated_tokens: &[u32],
+        ) -> Result<u32> {
+            let logits = logits.squeeze(0)?.squeeze(0)?;
+            let mut logits_vec: Vec<f32> = logits.to_vec1()?;
+
+            if logits_vec.is_empty() {
+                return Err(Error::Inference(InferenceError::GenerationFailed(
+                    "Cannot sample from empty logits".to_string(),
+                )));
+            }
+            if logits_vec.iter().any(|l| l.is_nan()) {
+                return Err(Error::Inference(InferenceError::GenerationFailed(
+                    "Logits contain NaN".to_string(),
+                )));
+            }
+
+            // Index of the largest value; `total_cmp` never panics.
+            // Vocabulary indices are assumed to fit in `u32`, like the token ids.
+            let argmax = |values: &[f32]| -> u32 {
+                values
+                    .iter()
+                    .enumerate()
+                    .max_by(|a, b| a.1.total_cmp(b.1))
+                    .map(|(i, _)| i as u32)
+                    .unwrap_or(0)
+            };
+
+            // Repetition penalty. Each distinct token is penalised exactly once,
+            // and the penalty always lowers the logit: positive logits are divided,
+            // non-positive ones multiplied (dividing a negative logit would make
+            // the repeated token more likely).
+            let penalty = config.repeat_penalty;
+            if penalty > 0.0 && penalty != 1.0 {
+                let seen: std::collections::HashSet<u32> =
+                    generated_tokens.iter().copied().collect();
+                for token in seen {
+                    if let Some(logit) = logits_vec.get_mut(token as usize) {
+                        if *logit > 0.0 {
+                            *logit /= penalty;
+                        } else {
+                            *logit *= penalty;
+                        }
+                    }
+                }
+            }
+
+            // A non-positive (or NaN) temperature means deterministic decoding.
+            if config.temperature.is_nan() || config.temperature <= 0.0 {
+                return Ok(argmax(&logits_vec));
+            }
+            for l in &mut logits_vec {
+                *l /= config.temperature;
+            }
+
+            // Softmax, shifted by the maximum for numerical stability.
+            let max_logit = logits_vec.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
+            let mut probs: Vec<f32> = logits_vec.iter().map(|l| (l - max_logit).exp()).collect();
+            let sum: f32 = probs.iter().sum();
+            if !(sum.is_finite() && sum > 0.0) {
+                // Degenerate distribution (e.g. infinite logits): fall back to greedy.
+                return Ok(argmax(&logits_vec));
+            }
+            for p in &mut probs {
+                *p /= sum;
+            }
+
+            // Candidate ids ordered by descending probability.
+            let mut sorted_indices: Vec<usize> = (0..probs.len()).collect();
+            sorted_indices.sort_by(|&a, &b| probs[b].total_cmp(&probs[a]));
+
+            // Top-p: keep the shortest prefix whose mass exceeds `top_p`.
+            let mut cumsum = 0.0_f32;
+            let mut cutoff_idx = sorted_indices.len();
+            for (i, &idx) in sorted_indices.iter().enumerate() {
+                cumsum += probs[idx];
+                if cumsum > config.top_p {
+                    cutoff_idx = i + 1;
+                    break;
+                }
+            }
+
+            // Top-k: 0 means "no limit"; at least one candidate always survives.
+            if config.top_k > 0 {
+                cutoff_idx = cutoff_idx.min(config.top_k);
+            }
+            cutoff_idx = cutoff_idx.max(1);
+
+            let candidates = &sorted_indices[..cutoff_idx];
+            let kept_mass: f32 = candidates.iter().map(|&i| probs[i]).sum();
+            if !(kept_mass.is_finite() && kept_mass > 0.0) {
+                return Ok(sorted_indices[0] as u32);
+            }
+
+            // Draw from the kept candidates. Scaling the uniform draw by the kept
+            // mass is equivalent to renormalising the kept probabilities.
+            use rand::Rng;
+            let mut rng = rand::thread_rng();
+            let r: f32 = rng.gen();
+            let threshold = r * kept_mass;
+            let mut cumsum = 0.0_f32;
+            for &idx in candidates {
+                cumsum += probs[idx];
+                if threshold < cumsum {
+                    return Ok(idx as u32);
+                }
+            }
+
+            // Rounding left the draw past the last bucket: take the most likely.
+            Ok(sorted_indices[0] as u32)
+        }
+
+        /// Generates a continuation of `prompt` with the model mapped to
+        /// `model_size`.
+        ///
+        /// The whole prompt is fed to the model in a first pass (position 0);
+        /// every later pass feeds only the token sampled in the previous step.
+        /// Generation stops at `config.max_tokens`, at the model's context size,
+        /// or when the end-of-sequence token is sampled, whichever comes first.
+        ///
+        /// `session_key`, when given, registers a per-session KV-cache entry in
+        /// the pool and sets `cache_hit` in the result. Requests without a key
+        /// register nothing, so the session map does not grow with anonymous
+        /// traffic.
+        ///
+        /// # Errors
+        ///
+        /// Returns an error when the model or tokenizer cannot be loaded, when
+        /// tokenization or decoding fails, when the prompt is empty after
+        /// tokenization or does not fit in the context window, or when a forward
+        /// pass or sampling step fails.
+        ///
+        /// NOTE(review): the forward passes are CPU-bound work executed inside an
+        /// async function; consider `tokio::task::spawn_blocking` for callers that
+        /// share the runtime.
+        pub async fn generate(
+            &self,
+            model_size: ModelSize,
+            prompt: &str,
+            config: GenerationConfig,
+            session_key: Option<&str>,
+        ) -> Result<GenerationResult> {
+            let start = Instant::now();
+            let small_model = SmallModel::from_model_size(model_size);
+
+            // Load model and tokenizer
+            let model = self.load_model(small_model).await?;
+            let tokenizer = self.load_tokenizer(small_model).await?;
+
+            // `forward` needs mutable access (the model keeps its own attention
+            // cache), which a shared `Arc` cannot give. Work on a private copy so
+            // concurrent requests do not share cache state.
+            // NOTE(review): relies on `ModelWeights: Clone` — confirm for the
+            // pinned candle-transformers version.
+            let mut model = model.as_ref().clone();
+
+            // Tokenize input
+            let encoding = tokenizer.encode(prompt, true).map_err(|e| {
+                Error::Inference(InferenceError::GenerationFailed(format!(
+                    "Tokenization failed: {}",
+                    e
+                )))
+            })?;
+
+            let mut tokens: Vec<u32> = encoding.get_ids().to_vec();
+            let input_len = tokens.len();
+            if input_len == 0 {
+                // Without this guard `tokens.len() - 1` below would underflow.
+                return Err(Error::Inference(InferenceError::GenerationFailed(
+                    "Prompt produced no tokens".to_string(),
+                )));
+            }
+
+            let context_size = small_model.context_size();
+            if input_len >= context_size {
+                return Err(Error::Inference(InferenceError::GenerationFailed(format!(
+                    "Prompt has {} tokens, which fills the {}-token context window",
+                    input_len, context_size
+                ))));
+            }
+
+            // Register KV caches for explicit sessions only; a random key per
+            // anonymous request would never be looked up again or removed.
+            if let Some(key) = session_key {
+                if !self.kv_caches.contains_key(key) {
+                    let num_layers = 12; // Typical for small models
+                    let caches: Vec<KvCache> = (0..num_layers).map(|_| KvCache::new()).collect();
+                    self.kv_caches.insert(key.to_string(), caches);
+                }
+            }
+
+            let eos_token = tokenizer
+                .token_to_id("</s>")
+                .or_else(|| tokenizer.token_to_id("<|endoftext|>"))
+                .unwrap_or(2);
+
+            // Never generate past the context window.
+            let budget = config.max_tokens.min(context_size - input_len);
+            let mut generated: Vec<u32> = Vec::with_capacity(budget);
+
+            for step in 0..budget {
+                // First pass: the whole prompt at position 0 (prefill).
+                // Later passes: only the newest token, at its absolute position.
+                let (context, index_pos) = if step == 0 {
+                    (&tokens[..], 0)
+                } else {
+                    (&tokens[tokens.len() - 1..], tokens.len() - 1)
+                };
+
+                let input = Tensor::new(context, &self.device)?.unsqueeze(0)?;
+                let logits = model.forward(&input, index_pos)?;
+
+                // Sample next token
+                let next_token = self.sample_token(&logits, &config, &generated)?;
+
+                if next_token == eos_token {
+                    break;
+                }
+
+                tokens.push(next_token);
+                generated.push(next_token);
+            }
+
+            // Decode output
+            let output_text = tokenizer.decode(&generated, true).map_err(|e| {
+                Error::Inference(InferenceError::GenerationFailed(format!(
+                    "Decoding failed: {}",
+                    e
+                )))
+            })?;
+
+            let elapsed = start.elapsed().as_secs_f64() * 1000.0;
+            let tokens_per_second = if elapsed > 0.0 {
+                (generated.len() as f64 / elapsed) * 1000.0
+            } else {
+                0.0
+            };
+
+            Ok(GenerationResult {
+                text: output_text,
+                tokens_generated: generated.len(),
+                model_used: model_size,
+                cache_hit: session_key.is_some(),
+                inference_time_ms: elapsed,
+                tokens_per_second,
+            })
+        }
+
+        /// Reports how many models, tokenizers and session cache entries the pool
+        /// currently holds, together with a fixed device label.
+        pub async fn health_check(&self) -> Result<HealthInfo> {
+            let loaded_models = self.models.len();
+            let loaded_tokenizers = self.tokenizers.len();
+            let active_sessions = self.kv_caches.len();
+            let device = String::from("CPU (SIMD)");
+
+            Ok(HealthInfo {
+                loaded_models,
+                loaded_tokenizers,
+                active_sessions,
+                device,
+            })
+        }
+    }
+
+    /// Snapshot returned by `RealInferencePool::health_check`.
+    #[derive(Debug, Clone)]
+    pub struct HealthInfo {
+        /// Number of entries in the pool's model map.
+        pub loaded_models: usize,
+        /// Number of entries in the pool's tokenizer map.
+        pub loaded_tokenizers: usize,
+        /// Number of entries in the pool's per-session KV-cache map.
+        pub active_sessions: usize,
+        /// Human-readable device label.
+        pub device: String,
+    }
+}
+
+#[cfg(feature = "real-inference")]
+pub use real::*;
+
+// Re-export types for non-real-inference builds
+#[cfg(not(feature = "real-inference"))]
+pub use crate::inference::{GenerationConfig, GenerationResult, HealthInfo, InferencePool};