Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'
This commit is contained in:
471
vendor/ruvector/examples/ruvLLM/src/inference_real.rs
vendored
Normal file
471
vendor/ruvector/examples/ruvLLM/src/inference_real.rs
vendored
Normal file
@@ -0,0 +1,471 @@
|
||||
//! Real LLM Inference with CPU SIMD Optimization
|
||||
//!
|
||||
//! Uses candle for native Rust tensor operations with SIMD support (AVX2/AVX512).
|
||||
//! Optimized for CPU sandbox environments with small, efficient models.
|
||||
|
||||
#[cfg(feature = "real-inference")]
|
||||
mod real {
|
||||
use candle_core::{DType, Device, Tensor, D};
|
||||
use candle_nn::{linear, Linear, Module, VarBuilder};
|
||||
use candle_transformers::models::quantized_llama as llama;
|
||||
use hf_hub::{api::tokio::Api, Repo, RepoType};
|
||||
use tokenizers::Tokenizer;
|
||||
|
||||
use crate::config::InferenceConfig;
|
||||
use crate::error::{Error, InferenceError, Result};
|
||||
use crate::types::ModelSize;
|
||||
|
||||
use dashmap::DashMap;
|
||||
use parking_lot::RwLock;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
use std::time::Instant;
|
||||
|
||||
/// Supported small models optimized for CPU
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
pub enum SmallModel {
|
||||
/// SmolLM 135M - Smallest viable model
|
||||
SmolLM135M,
|
||||
/// SmolLM 360M - Better quality, still fast
|
||||
SmolLM360M,
|
||||
/// Qwen2 0.5B - Good balance
|
||||
Qwen2_500M,
|
||||
/// TinyLlama 1.1B - Best quality for small
|
||||
TinyLlama1B,
|
||||
}
|
||||
|
||||
impl SmallModel {
|
||||
pub fn repo_id(&self) -> &'static str {
|
||||
match self {
|
||||
SmallModel::SmolLM135M => "HuggingFaceTB/SmolLM-135M",
|
||||
SmallModel::SmolLM360M => "HuggingFaceTB/SmolLM-360M",
|
||||
SmallModel::Qwen2_500M => "Qwen/Qwen2-0.5B",
|
||||
SmallModel::TinyLlama1B => "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
|
||||
}
|
||||
}
|
||||
|
||||
pub fn quantized_repo(&self) -> &'static str {
|
||||
match self {
|
||||
SmallModel::SmolLM135M => "HuggingFaceTB/SmolLM-135M-GGUF",
|
||||
SmallModel::SmolLM360M => "HuggingFaceTB/SmolLM-360M-GGUF",
|
||||
SmallModel::Qwen2_500M => "Qwen/Qwen2-0.5B-GGUF",
|
||||
SmallModel::TinyLlama1B => "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
|
||||
}
|
||||
}
|
||||
|
||||
pub fn gguf_file(&self) -> &'static str {
|
||||
match self {
|
||||
SmallModel::SmolLM135M => "smollm-135m-q4_k_m.gguf",
|
||||
SmallModel::SmolLM360M => "smollm-360m-q4_k_m.gguf",
|
||||
SmallModel::Qwen2_500M => "qwen2-0_5b-instruct-q4_k_m.gguf",
|
||||
SmallModel::TinyLlama1B => "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
|
||||
}
|
||||
}
|
||||
|
||||
pub fn context_size(&self) -> usize {
|
||||
match self {
|
||||
SmallModel::SmolLM135M => 2048,
|
||||
SmallModel::SmolLM360M => 2048,
|
||||
SmallModel::Qwen2_500M => 4096,
|
||||
SmallModel::TinyLlama1B => 2048,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_model_size(size: ModelSize) -> Self {
|
||||
match size {
|
||||
ModelSize::M350 => SmallModel::SmolLM135M,
|
||||
ModelSize::M700 => SmallModel::SmolLM360M,
|
||||
ModelSize::B1_2 => SmallModel::Qwen2_500M,
|
||||
ModelSize::B2_6 => SmallModel::TinyLlama1B,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Generation configuration
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct GenerationConfig {
|
||||
pub max_tokens: usize,
|
||||
pub temperature: f32,
|
||||
pub top_p: f32,
|
||||
pub top_k: usize,
|
||||
pub repeat_penalty: f32,
|
||||
pub seed: u64,
|
||||
}
|
||||
|
||||
impl Default for GenerationConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
max_tokens: 256,
|
||||
temperature: 0.7,
|
||||
top_p: 0.9,
|
||||
top_k: 40,
|
||||
repeat_penalty: 1.1,
|
||||
seed: 42,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Generation result
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct GenerationResult {
|
||||
pub text: String,
|
||||
pub tokens_generated: usize,
|
||||
pub model_used: ModelSize,
|
||||
pub cache_hit: bool,
|
||||
pub inference_time_ms: f64,
|
||||
pub tokens_per_second: f64,
|
||||
}
|
||||
|
||||
/// KV Cache for efficient generation
|
||||
struct KvCache {
|
||||
key: Option<Tensor>,
|
||||
value: Option<Tensor>,
|
||||
seq_len: usize,
|
||||
}
|
||||
|
||||
impl KvCache {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
key: None,
|
||||
value: None,
|
||||
seq_len: 0,
|
||||
}
|
||||
}
|
||||
|
||||
fn append(&mut self, key: Tensor, value: Tensor) -> Result<(Tensor, Tensor)> {
|
||||
let (key, value) = match (&self.key, &self.value) {
|
||||
(Some(k), Some(v)) => {
|
||||
let key = Tensor::cat(&[k, &key], 2)?;
|
||||
let value = Tensor::cat(&[v, &value], 2)?;
|
||||
(key, value)
|
||||
}
|
||||
_ => (key, value),
|
||||
};
|
||||
self.seq_len = key.dims()[2];
|
||||
self.key = Some(key.clone());
|
||||
self.value = Some(value.clone());
|
||||
Ok((key, value))
|
||||
}
|
||||
|
||||
fn reset(&mut self) {
|
||||
self.key = None;
|
||||
self.value = None;
|
||||
self.seq_len = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/// Real inference pool with CPU SIMD optimization
|
||||
pub struct RealInferencePool {
|
||||
/// Device (CPU with SIMD)
|
||||
device: Device,
|
||||
/// Loaded GGUF models
|
||||
models: DashMap<SmallModel, Arc<llama::ModelWeights>>,
|
||||
/// Tokenizers
|
||||
tokenizers: DashMap<SmallModel, Arc<Tokenizer>>,
|
||||
/// KV caches per session
|
||||
kv_caches: DashMap<String, Vec<KvCache>>,
|
||||
/// Configuration
|
||||
config: InferenceConfig,
|
||||
/// Model cache directory
|
||||
cache_dir: PathBuf,
|
||||
}
|
||||
|
||||
impl RealInferencePool {
|
||||
/// Create new inference pool
|
||||
pub async fn new(config: &InferenceConfig) -> Result<Self> {
|
||||
// Use CPU device - candle will auto-detect SIMD capabilities
|
||||
let device = Device::Cpu;
|
||||
|
||||
// Setup cache directory
|
||||
let cache_dir = dirs::cache_dir()
|
||||
.unwrap_or_else(|| PathBuf::from("."))
|
||||
.join("ruvllm")
|
||||
.join("models");
|
||||
|
||||
tokio::fs::create_dir_all(&cache_dir).await.map_err(|e| {
|
||||
Error::Inference(InferenceError::InitFailed(format!(
|
||||
"Failed to create cache dir: {}",
|
||||
e
|
||||
)))
|
||||
})?;
|
||||
|
||||
Ok(Self {
|
||||
device,
|
||||
models: DashMap::new(),
|
||||
tokenizers: DashMap::new(),
|
||||
kv_caches: DashMap::new(),
|
||||
config: config.clone(),
|
||||
cache_dir,
|
||||
})
|
||||
}
|
||||
|
||||
/// Download and load a model
|
||||
async fn load_model(&self, model: SmallModel) -> Result<Arc<llama::ModelWeights>> {
|
||||
// Check if already loaded
|
||||
if let Some(m) = self.models.get(&model) {
|
||||
return Ok(m.clone());
|
||||
}
|
||||
|
||||
tracing::info!("Downloading model: {:?}", model);
|
||||
|
||||
// Download from HuggingFace Hub
|
||||
let api = Api::new().map_err(|e| {
|
||||
Error::Inference(InferenceError::InitFailed(format!("HF API error: {}", e)))
|
||||
})?;
|
||||
|
||||
let repo = api.repo(Repo::with_revision(
|
||||
model.quantized_repo().to_string(),
|
||||
RepoType::Model,
|
||||
"main".to_string(),
|
||||
));
|
||||
|
||||
let model_path = repo.get(model.gguf_file()).await.map_err(|e| {
|
||||
Error::Inference(InferenceError::InitFailed(format!(
|
||||
"Failed to download model: {}",
|
||||
e
|
||||
)))
|
||||
})?;
|
||||
|
||||
tracing::info!("Loading GGUF model from: {:?}", model_path);
|
||||
|
||||
// Load GGUF model with memory mapping for efficiency
|
||||
let mut file = std::fs::File::open(&model_path).map_err(|e| {
|
||||
Error::Inference(InferenceError::InitFailed(format!(
|
||||
"Failed to open model: {}",
|
||||
e
|
||||
)))
|
||||
})?;
|
||||
|
||||
let model_weights = llama::ModelWeights::from_gguf(file, &mut file, &self.device)
|
||||
.map_err(|e| {
|
||||
Error::Inference(InferenceError::InitFailed(format!(
|
||||
"Failed to load GGUF: {}",
|
||||
e
|
||||
)))
|
||||
})?;
|
||||
|
||||
let model_arc = Arc::new(model_weights);
|
||||
self.models.insert(model, model_arc.clone());
|
||||
|
||||
Ok(model_arc)
|
||||
}
|
||||
|
||||
/// Download and load tokenizer
|
||||
async fn load_tokenizer(&self, model: SmallModel) -> Result<Arc<Tokenizer>> {
|
||||
if let Some(t) = self.tokenizers.get(&model) {
|
||||
return Ok(t.clone());
|
||||
}
|
||||
|
||||
let api = Api::new().map_err(|e| {
|
||||
Error::Inference(InferenceError::InitFailed(format!("HF API error: {}", e)))
|
||||
})?;
|
||||
|
||||
let repo = api.repo(Repo::new(model.repo_id().to_string(), RepoType::Model));
|
||||
|
||||
let tokenizer_path = repo.get("tokenizer.json").await.map_err(|e| {
|
||||
Error::Inference(InferenceError::InitFailed(format!(
|
||||
"Failed to download tokenizer: {}",
|
||||
e
|
||||
)))
|
||||
})?;
|
||||
|
||||
let tokenizer = Tokenizer::from_file(tokenizer_path).map_err(|e| {
|
||||
Error::Inference(InferenceError::InitFailed(format!(
|
||||
"Failed to load tokenizer: {}",
|
||||
e
|
||||
)))
|
||||
})?;
|
||||
|
||||
let tokenizer_arc = Arc::new(tokenizer);
|
||||
self.tokenizers.insert(model, tokenizer_arc.clone());
|
||||
|
||||
Ok(tokenizer_arc)
|
||||
}
|
||||
|
||||
/// Sample next token with temperature and top-p
|
||||
fn sample_token(
|
||||
&self,
|
||||
logits: &Tensor,
|
||||
config: &GenerationConfig,
|
||||
generated_tokens: &[u32],
|
||||
) -> Result<u32> {
|
||||
let logits = logits.squeeze(0)?.squeeze(0)?;
|
||||
let mut logits_vec: Vec<f32> = logits.to_vec1()?;
|
||||
|
||||
// Apply repeat penalty
|
||||
for &token in generated_tokens {
|
||||
if (token as usize) < logits_vec.len() {
|
||||
logits_vec[token as usize] /= config.repeat_penalty;
|
||||
}
|
||||
}
|
||||
|
||||
// Apply temperature
|
||||
if config.temperature > 0.0 {
|
||||
for l in &mut logits_vec {
|
||||
*l /= config.temperature;
|
||||
}
|
||||
}
|
||||
|
||||
// Softmax
|
||||
let max_logit = logits_vec.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
|
||||
let mut probs: Vec<f32> = logits_vec.iter().map(|l| (l - max_logit).exp()).collect();
|
||||
let sum: f32 = probs.iter().sum();
|
||||
for p in &mut probs {
|
||||
*p /= sum;
|
||||
}
|
||||
|
||||
// Top-p sampling
|
||||
let mut sorted_indices: Vec<usize> = (0..probs.len()).collect();
|
||||
sorted_indices.sort_by(|&a, &b| probs[b].partial_cmp(&probs[a]).unwrap());
|
||||
|
||||
let mut cumsum = 0.0;
|
||||
let mut cutoff_idx = sorted_indices.len();
|
||||
for (i, &idx) in sorted_indices.iter().enumerate() {
|
||||
cumsum += probs[idx];
|
||||
if cumsum > config.top_p {
|
||||
cutoff_idx = i + 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Top-k limiting
|
||||
cutoff_idx = cutoff_idx.min(config.top_k);
|
||||
|
||||
// Renormalize
|
||||
let valid_indices: Vec<usize> = sorted_indices[..cutoff_idx].to_vec();
|
||||
let mut valid_probs: Vec<f32> = valid_indices.iter().map(|&i| probs[i]).collect();
|
||||
let sum: f32 = valid_probs.iter().sum();
|
||||
for p in &mut valid_probs {
|
||||
*p /= sum;
|
||||
}
|
||||
|
||||
// Sample
|
||||
use rand::Rng;
|
||||
let mut rng = rand::thread_rng();
|
||||
let r: f32 = rng.gen();
|
||||
let mut cumsum = 0.0;
|
||||
for (i, &p) in valid_probs.iter().enumerate() {
|
||||
cumsum += p;
|
||||
if r < cumsum {
|
||||
return Ok(valid_indices[i] as u32);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(valid_indices[0] as u32)
|
||||
}
|
||||
|
||||
/// Generate text with real inference
|
||||
pub async fn generate(
|
||||
&self,
|
||||
model_size: ModelSize,
|
||||
prompt: &str,
|
||||
config: GenerationConfig,
|
||||
session_key: Option<&str>,
|
||||
) -> Result<GenerationResult> {
|
||||
let start = Instant::now();
|
||||
let small_model = SmallModel::from_model_size(model_size);
|
||||
|
||||
// Load model and tokenizer
|
||||
let model = self.load_model(small_model).await?;
|
||||
let tokenizer = self.load_tokenizer(small_model).await?;
|
||||
|
||||
// Tokenize input
|
||||
let encoding = tokenizer.encode(prompt, true).map_err(|e| {
|
||||
Error::Inference(InferenceError::GenerationFailed(format!(
|
||||
"Tokenization failed: {}",
|
||||
e
|
||||
)))
|
||||
})?;
|
||||
|
||||
let mut tokens: Vec<u32> = encoding.get_ids().to_vec();
|
||||
let input_len = tokens.len();
|
||||
|
||||
// Initialize or get KV cache
|
||||
let cache_key = session_key
|
||||
.map(|s| s.to_string())
|
||||
.unwrap_or_else(|| uuid::Uuid::new_v4().to_string());
|
||||
|
||||
let num_layers = 12; // Typical for small models
|
||||
if !self.kv_caches.contains_key(&cache_key) {
|
||||
let caches: Vec<KvCache> = (0..num_layers).map(|_| KvCache::new()).collect();
|
||||
self.kv_caches.insert(cache_key.clone(), caches);
|
||||
}
|
||||
|
||||
// Generate tokens
|
||||
let mut generated = Vec::new();
|
||||
let eos_token = tokenizer
|
||||
.token_to_id("</s>")
|
||||
.or_else(|| tokenizer.token_to_id("<|endoftext|>"))
|
||||
.unwrap_or(2);
|
||||
|
||||
for _ in 0..config.max_tokens {
|
||||
// Create input tensor
|
||||
let input = Tensor::new(&tokens[tokens.len() - 1..], &self.device)?;
|
||||
let input = input.unsqueeze(0)?;
|
||||
|
||||
// Forward pass with SIMD-optimized operations
|
||||
let logits = model.forward(&input, tokens.len() - 1)?;
|
||||
|
||||
// Sample next token
|
||||
let next_token = self.sample_token(&logits, &config, &generated)?;
|
||||
|
||||
if next_token == eos_token {
|
||||
break;
|
||||
}
|
||||
|
||||
tokens.push(next_token);
|
||||
generated.push(next_token);
|
||||
}
|
||||
|
||||
// Decode output
|
||||
let output_text = tokenizer.decode(&generated, true).map_err(|e| {
|
||||
Error::Inference(InferenceError::GenerationFailed(format!(
|
||||
"Decoding failed: {}",
|
||||
e
|
||||
)))
|
||||
})?;
|
||||
|
||||
let elapsed = start.elapsed().as_secs_f64() * 1000.0;
|
||||
let tokens_per_second = if elapsed > 0.0 {
|
||||
(generated.len() as f64 / elapsed) * 1000.0
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
Ok(GenerationResult {
|
||||
text: output_text,
|
||||
tokens_generated: generated.len(),
|
||||
model_used: model_size,
|
||||
cache_hit: session_key.is_some(),
|
||||
inference_time_ms: elapsed,
|
||||
tokens_per_second,
|
||||
})
|
||||
}
|
||||
|
||||
/// Get pool health info
|
||||
pub async fn health_check(&self) -> Result<HealthInfo> {
|
||||
Ok(HealthInfo {
|
||||
loaded_models: self.models.len(),
|
||||
loaded_tokenizers: self.tokenizers.len(),
|
||||
active_sessions: self.kv_caches.len(),
|
||||
device: "CPU (SIMD)".to_string(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Health information
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct HealthInfo {
|
||||
pub loaded_models: usize,
|
||||
pub loaded_tokenizers: usize,
|
||||
pub active_sessions: usize,
|
||||
pub device: String,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "real-inference")]
|
||||
pub use real::*;
|
||||
|
||||
// Re-export types for non-real-inference builds
|
||||
#[cfg(not(feature = "real-inference"))]
|
||||
pub use crate::inference::{GenerationConfig, GenerationResult, HealthInfo, InferencePool};
|
||||
Reference in New Issue
Block a user