//! N-API bindings for RuvLLM
//!
//! Provides Node.js bindings for the RuvLLM self-learning LLM orchestrator.
//!
//! ## v2.0 Features
//!
//! - **Optimized kernels**: Flash Attention 2, NEON GEMM/GEMV
//! - **Parallel inference**: Multi-threaded when `parallel` feature enabled
//! - **Quantization**: INT8, INT4, Q4K support via `quantization` option
//! - **Metal GPU**: Optional Metal acceleration on Apple Silicon
//!
//! ## Example (Node.js)
//!
//! ```javascript
//! const { RuvLLMEngine } = require('@ruvector/ruvllm');
//!
//! // Create engine with parallel inference
//! const engine = new RuvLLMEngine({
//!   useParallel: true,
//!   useMetal: false,
//!   quantization: 'q4k',
//! });
//!
//! // Generate text
//! const response = engine.query("Hello, world!");
//! console.log(response.text);
//!
//! // Check SIMD capabilities
//! console.log(engine.simdCapabilities()); // ['NEON'] on M4 Pro
//! ```

#![cfg(feature = "napi")]

use napi::bindgen_prelude::*;
use napi_derive::napi;

use crate::config::{EmbeddingConfig, MemoryConfig, RouterConfig};
use crate::embedding::EmbeddingService;
use crate::memory::{cosine_distance, MemoryService};
use crate::router::FastGRNNRouter;
use crate::simd_inference::{SimdGenerationConfig, SimdInferenceEngine, SimdOps};
use crate::types::{MemoryNode, NodeType};
use parking_lot::RwLock;
use std::collections::HashMap;
use std::sync::Arc;

// Import optimized kernels for capability detection
use ruvllm_lib::kernels::is_neon_available;
use ruvllm_lib::memory_pool::{MemoryManager, MemoryManagerConfig, MemoryManagerStats};

/// RuvLLM Configuration for Node.js
///
/// All fields are optional; unset fields fall back to the defaults given in
/// [`Default::default`] below.
///
/// NOTE(review): the generic parameters of every `Option` field were lost in
/// the extracted source. They are restored here from the casts at the use
/// sites in `RuvLLMEngine::new` (`as usize` on integers, `as f32` on floats,
/// `Some("none".to_string())`, `Some(true)`), using the napi-friendly widths
/// `u32`/`f64` — confirm against the published TypeScript definitions.
#[napi(object)]
#[derive(Clone, Debug)]
pub struct JsRuvLLMConfig {
    /// Embedding dimension (default: 768)
    pub embedding_dim: Option<u32>,
    /// Router hidden dimension (default: 128)
    pub router_hidden_dim: Option<u32>,
    /// HNSW M parameter (default: 16)
    pub hnsw_m: Option<u32>,
    /// HNSW ef_construction (default: 100)
    pub hnsw_ef_construction: Option<u32>,
    /// HNSW ef_search (default: 64)
    pub hnsw_ef_search: Option<u32>,
    /// Enable learning (default: true)
    pub learning_enabled: Option<bool>,
    /// Quality threshold for learning (default: 0.7)
    pub quality_threshold: Option<f64>,
    /// EWC lambda (default: 2000)
    pub ewc_lambda: Option<f64>,
    // v2.0: New optimization options
    /// Enable parallel inference using rayon (default: true if feature enabled)
    pub use_parallel: Option<bool>,
    /// Quantization type: "none", "int8", "int4", "q4k" (default: "none")
    pub quantization: Option<String>,
    /// Enable Metal GPU acceleration on Apple Silicon (default: false)
    pub use_metal: Option<bool>,
    /// Memory pool capacity in MB (default: 512)
    pub memory_pool_mb: Option<u32>,
}

impl Default for JsRuvLLMConfig {
    fn default() -> Self {
        Self {
            embedding_dim: Some(768),
            router_hidden_dim: Some(128),
            hnsw_m: Some(16),
            hnsw_ef_construction: Some(100),
            hnsw_ef_search: Some(64),
            learning_enabled: Some(true),
            quality_threshold: Some(0.7),
            ewc_lambda: Some(2000.0),
            // v2.0 defaults
            use_parallel: Some(true),
            quantization: Some("none".to_string()),
            use_metal: Some(false),
            memory_pool_mb: Some(512),
        }
    }
}

/// Quantization type for model weights
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum QuantizationType {
    /// No quantization (FP32)
    None,
    /// 8-bit integer quantization
    Int8,
    /// 4-bit integer quantization
    Int4,
    /// Q4K (k-quants, higher quality)
    Q4K,
}

impl From<&str> for QuantizationType {
    /// Parse a user-supplied quantization name (case-insensitive).
    /// Unrecognized strings deliberately fall back to `None` rather than
    /// erroring, so a typo degrades to FP32 instead of failing the call.
    fn from(s: &str) -> Self {
        match s.to_lowercase().as_str() {
            "int8" | "q8" => QuantizationType::Int8,
            "int4" | "q4" => QuantizationType::Int4,
            "q4k" | "q4_k" => QuantizationType::Q4K,
            _ => QuantizationType::None,
        }
    }
}

/// Memory pool statistics (v2.0)
#[napi(object)]
#[derive(Clone, Debug)]
pub struct JsMemoryPoolStats {
    /// Total bytes allocated
    pub bytes_allocated: u32,
    /// Total capacity in bytes
    pub capacity_bytes: u32,
    /// Number of active allocations
    pub active_allocations: u32,
    /// Peak memory usage in bytes
    pub peak_bytes: u32,
    /// Whether NEON SIMD is available
    pub neon_available: bool,
    /// Whether Metal GPU is available
    pub metal_available: bool,
}
/// Generation configuration #[napi(object)] #[derive(Clone, Debug)] pub struct JsGenerationConfig { /// Maximum tokens to generate pub max_tokens: Option, /// Temperature for sampling pub temperature: Option, /// Top-p nucleus sampling pub top_p: Option, /// Top-k sampling pub top_k: Option, /// Repetition penalty pub repetition_penalty: Option, } impl Default for JsGenerationConfig { fn default() -> Self { Self { max_tokens: Some(256), temperature: Some(0.7), top_p: Some(0.9), top_k: Some(50), repetition_penalty: Some(1.1), } } } /// Query response #[napi(object)] #[derive(Clone, Debug)] pub struct JsQueryResponse { /// Generated text pub text: String, /// Confidence score pub confidence: f64, /// Selected model pub model: String, /// Context size used pub context_size: u32, /// Latency in milliseconds pub latency_ms: f64, /// Request ID pub request_id: String, } /// Routing decision #[napi(object)] #[derive(Clone, Debug)] pub struct JsRoutingDecision { /// Selected model size pub model: String, /// Recommended context size pub context_size: u32, /// Temperature pub temperature: f64, /// Top-p pub top_p: f64, /// Confidence pub confidence: f64, } /// Memory search result #[napi(object)] #[derive(Clone, Debug)] pub struct JsMemoryResult { /// Node ID pub id: String, /// Distance (lower is better) pub distance: f64, /// Content text pub content: String, /// Metadata JSON pub metadata: String, } /// RuvLLM Statistics #[napi(object)] #[derive(Clone, Debug)] pub struct JsRuvLLMStats { /// Total queries processed pub total_queries: u32, /// Memory nodes stored pub memory_nodes: u32, /// Patterns learned (training steps) pub patterns_learned: u32, /// Average latency ms pub avg_latency_ms: f64, /// Cache hit rate (0.0 - 1.0) pub cache_hit_rate: f64, /// Router accuracy (0.0 - 1.0) pub router_accuracy: f64, } /// RuvLLM Engine - Main orchestrator for self-learning LLM #[napi] pub struct RuvLLMEngine { embedding_dim: usize, router_hidden: usize, inference_engine: Arc>, 
router: Arc>, memory: Arc>, embedding: Arc>, learning_enabled: bool, quality_threshold: f32, total_queries: u64, total_latency_ms: f64, hnsw_ef_search: usize, } /// Synchronous memory service wrapper struct MemoryServiceSync { inner: MemoryService, runtime: tokio::runtime::Runtime, } impl MemoryServiceSync { fn new(config: &MemoryConfig) -> Result { let runtime = tokio::runtime::Runtime::new() .map_err(|e| Error::from_reason(format!("Failed to create runtime: {}", e)))?; let inner = runtime .block_on(MemoryService::new(config)) .map_err(|e| Error::from_reason(format!("Failed to create memory service: {}", e)))?; Ok(Self { inner, runtime }) } fn insert_node(&self, node: MemoryNode) -> Result { self.inner .insert_node(node) .map_err(|e| Error::from_reason(format!("Insert failed: {}", e))) } fn search(&self, query: &[f32], k: usize, ef_search: usize) -> Vec<(String, f32, String)> { let result = self .runtime .block_on(self.inner.search_with_graph(query, k, ef_search, 1)); match result { Ok(search_result) => search_result .candidates .into_iter() .map(|c| (c.id, c.distance, c.node.text)) .collect(), Err(_) => vec![], } } fn node_count(&self) -> usize { self.inner.node_count() } fn get_stats(&self) -> (u64, u64) { let stats = self.inner.get_stats(); (stats.total_insertions, stats.total_searches) } } #[napi] impl RuvLLMEngine { /// Create a new RuvLLM engine with default configuration #[napi(constructor)] pub fn new(config: Option) -> Result { let cfg = config.unwrap_or_default(); let embedding_dim = cfg.embedding_dim.unwrap_or(768) as usize; let router_hidden = cfg.router_hidden_dim.unwrap_or(128) as usize; let hnsw_m = cfg.hnsw_m.unwrap_or(16) as usize; let hnsw_ef_construction = cfg.hnsw_ef_construction.unwrap_or(100) as usize; let hnsw_ef_search = cfg.hnsw_ef_search.unwrap_or(64) as usize; let learning_enabled = cfg.learning_enabled.unwrap_or(true); let quality_threshold = cfg.quality_threshold.unwrap_or(0.7) as f32; // Create configs let embedding_config = 
EmbeddingConfig { dimension: embedding_dim, max_tokens: 512, batch_size: 8, }; let router_config = RouterConfig { input_dim: embedding_dim, hidden_dim: router_hidden, sparsity: 0.9, rank: 8, confidence_threshold: 0.7, weights_path: None, }; let memory_config = MemoryConfig { db_path: std::path::PathBuf::from("./data/memory.db"), hnsw_m, hnsw_ef_construction, hnsw_ef_search, max_nodes: 100000, writeback_batch_size: 100, writeback_interval_ms: 1000, }; // Initialize components let inference_engine = SimdInferenceEngine::new_demo(); let router = FastGRNNRouter::new(&router_config) .map_err(|e| Error::from_reason(format!("Failed to create router: {}", e)))?; let memory = MemoryServiceSync::new(&memory_config)?; let embedding = EmbeddingService::new(&embedding_config).map_err(|e| { Error::from_reason(format!("Failed to create embedding service: {}", e)) })?; Ok(Self { embedding_dim, router_hidden, inference_engine: Arc::new(RwLock::new(inference_engine)), router: Arc::new(RwLock::new(router)), memory: Arc::new(RwLock::new(memory)), embedding: Arc::new(RwLock::new(embedding)), learning_enabled, quality_threshold, total_queries: 0, total_latency_ms: 0.0, hnsw_ef_search, }) } /// Query the LLM with automatic routing #[napi] pub fn query( &mut self, text: String, config: Option, ) -> Result { let start = std::time::Instant::now(); let gen_config = config.unwrap_or_default(); // Generate embedding let embedding = self .embedding .read() .embed(&text) .map_err(|e| Error::from_reason(format!("Embedding failed: {}", e)))?; // Get routing decision let hidden = vec![0.0f32; self.router_hidden]; let routing = self .router .read() .forward(&embedding.vector, &hidden) .map_err(|e| Error::from_reason(format!("Routing failed: {}", e)))?; // Generate response let simd_config = SimdGenerationConfig { max_tokens: gen_config.max_tokens.unwrap_or(256) as usize, temperature: gen_config.temperature.unwrap_or(0.7) as f32, top_p: gen_config.top_p.unwrap_or(0.9) as f32, top_k: 
gen_config.top_k.unwrap_or(50) as usize, repeat_penalty: gen_config.repetition_penalty.unwrap_or(1.1) as f32, ..Default::default() }; let (text, _tokens, _latency) = self.inference_engine .read() .generate(&text, &simd_config, None); let latency_ms = start.elapsed().as_secs_f64() * 1000.0; self.total_queries += 1; self.total_latency_ms += latency_ms; let request_id = uuid::Uuid::new_v4().to_string(); Ok(JsQueryResponse { text, confidence: routing.confidence as f64, model: format!("{:?}", routing.model), context_size: routing.context_size as u32, latency_ms, request_id, }) } /// Generate text with SIMD-optimized inference #[napi] pub fn generate(&self, prompt: String, config: Option) -> Result { let gen_config = config.unwrap_or_default(); let simd_config = SimdGenerationConfig { max_tokens: gen_config.max_tokens.unwrap_or(256) as usize, temperature: gen_config.temperature.unwrap_or(0.7) as f32, top_p: gen_config.top_p.unwrap_or(0.9) as f32, top_k: gen_config.top_k.unwrap_or(50) as usize, repeat_penalty: gen_config.repetition_penalty.unwrap_or(1.1) as f32, ..Default::default() }; let (text, _tokens, _latency) = self.inference_engine .read() .generate(&prompt, &simd_config, None); Ok(text) } /// Get routing decision for a query #[napi] pub fn route(&self, text: String) -> Result { let embedding = self .embedding .read() .embed(&text) .map_err(|e| Error::from_reason(format!("Embedding failed: {}", e)))?; let hidden = vec![0.0f32; self.router_hidden]; let routing = self .router .read() .forward(&embedding.vector, &hidden) .map_err(|e| Error::from_reason(format!("Routing failed: {}", e)))?; Ok(JsRoutingDecision { model: format!("{:?}", routing.model), context_size: routing.context_size as u32, temperature: routing.temperature as f64, top_p: routing.top_p as f64, confidence: routing.confidence as f64, }) } /// Search memory for similar content #[napi] pub fn search_memory(&self, text: String, k: Option) -> Result> { let embedding = self .embedding .read() .embed(&text) 
.map_err(|e| Error::from_reason(format!("Embedding failed: {}", e)))?; let k = k.unwrap_or(10) as usize; let results = self .memory .read() .search(&embedding.vector, k, self.hnsw_ef_search); Ok(results .into_iter() .map(|(id, distance, content)| JsMemoryResult { id, distance: distance as f64, content, metadata: "{}".to_string(), }) .collect()) } /// Add content to memory #[napi] pub fn add_memory(&self, content: String, metadata: Option) -> Result { let embedding = self .embedding .read() .embed(&content) .map_err(|e| Error::from_reason(format!("Embedding failed: {}", e)))?; let meta: HashMap = metadata .and_then(|s| serde_json::from_str(&s).ok()) .unwrap_or_default(); let node = MemoryNode { id: uuid::Uuid::new_v4().to_string(), vector: embedding.vector, text: content, node_type: NodeType::Fact, source: "napi".to_string(), metadata: meta, }; self.memory.write().insert_node(node) } /// Provide feedback for learning #[napi] pub fn feedback( &mut self, _request_id: String, rating: u32, _correction: Option, ) -> Result { if !self.learning_enabled { return Ok(false); } let quality = rating as f32 / 5.0; Ok(quality >= self.quality_threshold) } /// Get engine statistics #[napi] pub fn stats(&self) -> JsRuvLLMStats { let memory = self.memory.read(); let (insertions, searches) = memory.get_stats(); let router_guard = self.router.read(); let router_stats = router_guard.stats(); let training_steps = router_stats .training_steps .load(std::sync::atomic::Ordering::Relaxed) as u32; // Calculate cache hit rate from memory stats let total_ops = insertions + searches; let cache_hit_rate = if total_ops > 0 { // Estimate: searches that don't result in new insertions are "hits" searches as f64 / total_ops as f64 } else { 0.0 }; // Router accuracy based on training convergence let router_accuracy = if self.total_queries > 0 && training_steps > 0 { // Simple heuristic: more training = better accuracy, capped at 0.95 (0.5 + (training_steps as f64 / (training_steps as f64 + 100.0)) * 
0.45).min(0.95) } else { 0.5 }; JsRuvLLMStats { total_queries: self.total_queries as u32, memory_nodes: memory.node_count() as u32, patterns_learned: training_steps, avg_latency_ms: if self.total_queries > 0 { self.total_latency_ms / self.total_queries as f64 } else { 0.0 }, cache_hit_rate, router_accuracy, } } /// Force router training #[napi] pub fn force_learn(&self) -> String { "Learning triggered".to_string() } /// Get embedding for text #[napi] pub fn embed(&self, text: String) -> Result> { let embedding = self .embedding .read() .embed(&text) .map_err(|e| Error::from_reason(format!("Embedding failed: {}", e)))?; Ok(embedding.vector.into_iter().map(|x| x as f64).collect()) } /// Compute similarity between two texts #[napi] pub fn similarity(&self, text1: String, text2: String) -> Result { let emb1 = self .embedding .read() .embed(&text1) .map_err(|e| Error::from_reason(format!("Embedding failed: {}", e)))?; let emb2 = self .embedding .read() .embed(&text2) .map_err(|e| Error::from_reason(format!("Embedding failed: {}", e)))?; // Cosine similarity = 1 - cosine_distance let distance = cosine_distance(&emb1.vector, &emb2.vector); Ok((1.0 - distance) as f64) } /// Check if SIMD is available #[napi] pub fn has_simd(&self) -> bool { #[cfg(target_arch = "x86_64")] { is_x86_feature_detected!("avx2") || is_x86_feature_detected!("sse4.1") } #[cfg(target_arch = "aarch64")] { true } #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] { false } } /// Get SIMD capabilities #[napi] pub fn simd_capabilities(&self) -> Vec { let mut caps = Vec::new(); #[cfg(target_arch = "x86_64")] { if is_x86_feature_detected!("avx512f") { caps.push("AVX-512".to_string()); } if is_x86_feature_detected!("avx2") { caps.push("AVX2".to_string()); } if is_x86_feature_detected!("sse4.1") { caps.push("SSE4.1".to_string()); } if is_x86_feature_detected!("fma") { caps.push("FMA".to_string()); } } #[cfg(target_arch = "aarch64")] { caps.push("NEON".to_string()); } if caps.is_empty() { 
caps.push("Scalar".to_string()); } caps } // ========================================================================= // v2.0: New optimization methods // ========================================================================= /// Check if NEON SIMD is available (v2.0) /// /// Returns true on all aarch64 (Apple Silicon, ARM) platforms. #[napi] pub fn is_neon_available(&self) -> bool { is_neon_available() } /// Check if parallel inference is enabled (v2.0) /// /// Returns true if the `parallel` feature was enabled at compile time. #[napi] pub fn is_parallel_enabled(&self) -> bool { #[cfg(feature = "parallel")] { true } #[cfg(not(feature = "parallel"))] { false } } /// Get memory pool statistics (v2.0) /// /// Returns current memory usage and allocation stats. #[napi] pub fn memory_pool_stats(&self) -> JsMemoryPoolStats { // For now, return placeholder stats - in a full implementation, // this would connect to the actual MemoryManager JsMemoryPoolStats { bytes_allocated: 0, capacity_bytes: 512 * 1024 * 1024, // 512 MB default active_allocations: 0, peak_bytes: 0, neon_available: is_neon_available(), metal_available: cfg!(feature = "metal"), } } /// Compute Flash Attention (v2.0) /// /// Uses optimized NEON kernels on Apple Silicon with 3-6x speedup. 
/// /// # Arguments /// * `query` - Query vector [head_dim] /// * `key` - Key vectors [kv_len * head_dim] flattened /// * `value` - Value vectors [kv_len * head_dim] flattened /// * `scale` - Softmax scale (typically 1/sqrt(head_dim)) /// * `causal` - Whether to apply causal masking /// /// # Returns /// Output vector [head_dim] #[napi] pub fn flash_attention( &self, query: Vec, key: Vec, value: Vec, scale: f64, causal: bool, ) -> Vec { let q: Vec = query.into_iter().map(|x| x as f32).collect(); let k: Vec = key.into_iter().map(|x| x as f32).collect(); let v: Vec = value.into_iter().map(|x| x as f32).collect(); let output = SimdOps::attention(&q, &k, &v, scale as f32, causal); output.into_iter().map(|x| x as f64).collect() } /// Compute GEMV (matrix-vector multiply) (v2.0) /// /// Uses optimized 12-row micro-kernel on Apple Silicon. /// /// # Arguments /// * `matrix` - Matrix [m * n] in row-major order /// * `vector` - Vector [n] /// * `m` - Number of rows /// * `n` - Number of columns /// /// # Returns /// Result vector [m] #[napi] pub fn gemv(&self, matrix: Vec, vector: Vec, m: u32, n: u32) -> Vec { let mat: Vec = matrix.into_iter().map(|x| x as f32).collect(); let vec: Vec = vector.into_iter().map(|x| x as f32).collect(); let output = SimdOps::gemv(&mat, &vec, m as usize, n as usize); output.into_iter().map(|x| x as f64).collect() } /// Get version information (v2.0) #[napi] pub fn version(&self) -> String { env!("CARGO_PKG_VERSION").to_string() } } /// SIMD Operations utility class #[napi] pub struct SimdOperations; #[napi] impl SimdOperations { /// Create new SIMD operations instance #[napi(constructor)] pub fn new() -> Self { Self } /// Compute dot product of two vectors #[napi] pub fn dot_product(&self, a: Vec, b: Vec) -> f64 { let a_f32: Vec = a.into_iter().map(|x| x as f32).collect(); let b_f32: Vec = b.into_iter().map(|x| x as f32).collect(); SimdOps::dot_product(&a_f32, &b_f32) as f64 } /// Compute cosine similarity #[napi] pub fn 
cosine_similarity(&self, a: Vec, b: Vec) -> f64 { let a_f32: Vec = a.into_iter().map(|x| x as f32).collect(); let b_f32: Vec = b.into_iter().map(|x| x as f32).collect(); 1.0 - cosine_distance(&a_f32, &b_f32) as f64 } /// Compute L2 distance #[napi] pub fn l2_distance(&self, a: Vec, b: Vec) -> f64 { let a_f32: Vec = a.into_iter().map(|x| x as f32).collect(); let b_f32: Vec = b.into_iter().map(|x| x as f32).collect(); let mut sum = 0.0f32; for (x, y) in a_f32.iter().zip(b_f32.iter()) { let diff = x - y; sum += diff * diff; } sum.sqrt() as f64 } /// Matrix-vector multiplication #[napi] pub fn matvec(&self, matrix: Vec>, vector: Vec) -> Vec { let rows = matrix.len(); let cols = if rows > 0 { matrix[0].len() } else { 0 }; let mut result = vec![0.0f64; rows]; for i in 0..rows { for j in 0..cols { result[i] += matrix[i][j] * vector[j]; } } result } /// Softmax activation #[napi] pub fn softmax(&self, input: Vec) -> Vec { let max = input.iter().cloned().fold(f64::NEG_INFINITY, f64::max); let exp_sum: f64 = input.iter().map(|x| (x - max).exp()).sum(); input.iter().map(|x| ((x - max).exp()) / exp_sum).collect() } } /// Version information #[napi] pub fn version() -> String { env!("CARGO_PKG_VERSION").to_string() } /// Check if running with SIMD support #[napi] pub fn has_simd_support() -> bool { #[cfg(target_arch = "x86_64")] { is_x86_feature_detected!("avx2") || is_x86_feature_detected!("sse4.1") } #[cfg(target_arch = "aarch64")] { true // NEON is always available on aarch64 } #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] { false } }