//! N-API bindings for RuvLLM
//!
//! Provides Node.js bindings for the RuvLLM self-learning LLM orchestrator.
//!
//! ## v2.0 Features
//!
//! - **Optimized kernels**: Flash Attention 2, NEON GEMM/GEMV
//! - **Parallel inference**: Multi-threaded when `parallel` feature enabled
//! - **Quantization**: INT8, INT4, Q4K support via `quantization` option
//! - **Metal GPU**: Optional Metal acceleration on Apple Silicon
//!
//! ## Example (Node.js)
//!
//! ```javascript
//! const { RuvLLMEngine } = require('@ruvector/ruvllm');
//!
//! // Create engine with parallel inference
//! const engine = new RuvLLMEngine({
//!   useParallel: true,
//!   useMetal: false,
//!   quantization: 'q4k',
//! });
//!
//! // Generate text
//! const response = engine.query("Hello, world!");
//! console.log(response.text);
//!
//! // Check SIMD capabilities
//! console.log(engine.simdCapabilities()); // ['NEON'] on M4 Pro
//! ```

#![cfg(feature = "napi")]

use napi::bindgen_prelude::*;
use napi_derive::napi;

use crate::config::{EmbeddingConfig, MemoryConfig, RouterConfig};
use crate::embedding::EmbeddingService;
use crate::memory::{cosine_distance, MemoryService};
use crate::router::FastGRNNRouter;
use crate::simd_inference::{SimdGenerationConfig, SimdInferenceEngine, SimdOps};
use crate::types::{MemoryNode, NodeType};
use parking_lot::RwLock;
use std::collections::HashMap;
use std::sync::Arc;

// Import optimized kernels for capability detection
use ruvllm_lib::kernels::is_neon_available;
use ruvllm_lib::memory_pool::{MemoryManager, MemoryManagerConfig, MemoryManagerStats};

/// RuvLLM Configuration for Node.js
///
/// All fields are optional; unset fields fall back to the defaults given in
/// [`Default::default`] below.
///
/// NOTE(review): the generic parameters of every `Option` field were lost in
/// the extracted source. They are restored here from the casts at the use
/// sites in `RuvLLMEngine::new` (`as usize` on integers, `as f32` on floats,
/// `Some("none".to_string())`, `Some(true)`), using the napi-friendly widths
/// `u32`/`f64` — confirm against the published TypeScript definitions.
#[napi(object)]
#[derive(Clone, Debug)]
pub struct JsRuvLLMConfig {
    /// Embedding dimension (default: 768)
    pub embedding_dim: Option<u32>,
    /// Router hidden dimension (default: 128)
    pub router_hidden_dim: Option<u32>,
    /// HNSW M parameter (default: 16)
    pub hnsw_m: Option<u32>,
    /// HNSW ef_construction (default: 100)
    pub hnsw_ef_construction: Option<u32>,
    /// HNSW ef_search (default: 64)
    pub hnsw_ef_search: Option<u32>,
    /// Enable learning (default: true)
    pub learning_enabled: Option<bool>,
    /// Quality threshold for learning (default: 0.7)
    pub quality_threshold: Option<f64>,
    /// EWC lambda (default: 2000)
    pub ewc_lambda: Option<f64>,
    // v2.0: New optimization options
    /// Enable parallel inference using rayon (default: true if feature enabled)
    pub use_parallel: Option<bool>,
    /// Quantization type: "none", "int8", "int4", "q4k" (default: "none")
    pub quantization: Option<String>,
    /// Enable Metal GPU acceleration on Apple Silicon (default: false)
    pub use_metal: Option<bool>,
    /// Memory pool capacity in MB (default: 512)
    pub memory_pool_mb: Option<u32>,
}

impl Default for JsRuvLLMConfig {
    fn default() -> Self {
        Self {
            embedding_dim: Some(768),
            router_hidden_dim: Some(128),
            hnsw_m: Some(16),
            hnsw_ef_construction: Some(100),
            hnsw_ef_search: Some(64),
            learning_enabled: Some(true),
            quality_threshold: Some(0.7),
            ewc_lambda: Some(2000.0),
            // v2.0 defaults
            use_parallel: Some(true),
            quantization: Some("none".to_string()),
            use_metal: Some(false),
            memory_pool_mb: Some(512),
        }
    }
}

/// Quantization type for model weights
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum QuantizationType {
    /// No quantization (FP32)
    None,
    /// 8-bit integer quantization
    Int8,
    /// 4-bit integer quantization
    Int4,
    /// Q4K (k-quants, higher quality)
    Q4K,
}

impl From<&str> for QuantizationType {
    /// Parse a user-supplied quantization name (case-insensitive).
    /// Unrecognized strings deliberately fall back to `None` rather than
    /// erroring, so a typo degrades to FP32 instead of failing the call.
    fn from(s: &str) -> Self {
        match s.to_lowercase().as_str() {
            "int8" | "q8" => QuantizationType::Int8,
            "int4" | "q4" => QuantizationType::Int4,
            "q4k" | "q4_k" => QuantizationType::Q4K,
            _ => QuantizationType::None,
        }
    }
}

/// Memory pool statistics (v2.0)
#[napi(object)]
#[derive(Clone, Debug)]
pub struct JsMemoryPoolStats {
    /// Total bytes allocated
    pub bytes_allocated: u32,
    /// Total capacity in bytes
    pub capacity_bytes: u32,
    /// Number of active allocations
    pub active_allocations: u32,
    /// Peak memory usage in bytes
    pub peak_bytes: u32,
    /// Whether NEON SIMD is available
    pub neon_available: bool,
    /// Whether Metal GPU is available
    pub metal_available: bool,
}
/// Generation configuration #[napi(object)] #[derive(Clone, Debug)] pub struct JsGenerationConfig { /// Maximum tokens to generate pub max_tokens: Option, /// Temperature for sampling pub temperature: Option, /// Top-p nucleus sampling pub top_p: Option, /// Top-k sampling pub top_k: Option, /// Repetition penalty pub repetition_penalty: Option, } impl Default for JsGenerationConfig { fn default() -> Self { Self { max_tokens: Some(256), temperature: Some(0.7), top_p: Some(0.9), top_k: Some(50), repetition_penalty: Some(1.1), } } } /// Query response #[napi(object)] #[derive(Clone, Debug)] pub struct JsQueryResponse { /// Generated text pub text: String, /// Confidence score pub confidence: f64, /// Selected model pub model: String, /// Context size used pub context_size: u32, /// Latency in milliseconds pub latency_ms: f64, /// Request ID pub request_id: String, } /// Routing decision #[napi(object)] #[derive(Clone, Debug)] pub struct JsRoutingDecision { /// Selected model size pub model: String, /// Recommended context size pub context_size: u32, /// Temperature pub temperature: f64, /// Top-p pub top_p: f64, /// Confidence pub confidence: f64, } /// Memory search result #[napi(object)] #[derive(Clone, Debug)] pub struct JsMemoryResult { /// Node ID pub id: String, /// Distance (lower is better) pub distance: f64, /// Content text pub content: String, /// Metadata JSON pub metadata: String, } /// RuvLLM Statistics #[napi(object)] #[derive(Clone, Debug)] pub struct JsRuvLLMStats { /// Total queries processed pub total_queries: u32, /// Memory nodes stored pub memory_nodes: u32, /// Patterns learned (training steps) pub patterns_learned: u32, /// Average latency ms pub avg_latency_ms: f64, /// Cache hit rate (0.0 - 1.0) pub cache_hit_rate: f64, /// Router accuracy (0.0 - 1.0) pub router_accuracy: f64, } /// RuvLLM Engine - Main orchestrator for self-learning LLM #[napi] pub struct RuvLLMEngine { embedding_dim: usize, router_hidden: usize, inference_engine: Arc>, 
router: Arc>, memory: Arc>, embedding: Arc>, learning_enabled: bool, quality_threshold: f32, total_queries: u64, total_latency_ms: f64, hnsw_ef_search: usize, } /// Synchronous memory service wrapper struct MemoryServiceSync { inner: MemoryService, runtime: tokio::runtime::Runtime, } impl MemoryServiceSync { fn new(config: &MemoryConfig) -> Result { let runtime = tokio::runtime::Runtime::new() .map_err(|e| Error::from_reason(format!("Failed to create runtime: {}", e)))?; let inner = runtime .block_on(MemoryService::new(config)) .map_err(|e| Error::from_reason(format!("Failed to create memory service: {}", e)))?; Ok(Self { inner, runtime }) } fn insert_node(&self, node: MemoryNode) -> Result { self.inner .insert_node(node) .map_err(|e| Error::from_reason(format!("Insert failed: {}", e))) } fn search(&self, query: &[f32], k: usize, ef_search: usize) -> Vec<(String, f32, String)> { let result = self .runtime .block_on(self.inner.search_with_graph(query, k, ef_search, 1)); match result { Ok(search_result) => search_result .candidates .into_iter() .map(|c| (c.id, c.distance, c.node.text)) .collect(), Err(_) => vec![], } } fn node_count(&self) -> usize { self.inner.node_count() } fn get_stats(&self) -> (u64, u64) { let stats = self.inner.get_stats(); (stats.total_insertions, stats.total_searches) } } #[napi] impl RuvLLMEngine { /// Create a new RuvLLM engine with default configuration #[napi(constructor)] pub fn new(config: Option) -> Result { let cfg = config.unwrap_or_default(); let embedding_dim = cfg.embedding_dim.unwrap_or(768) as usize; let router_hidden = cfg.router_hidden_dim.unwrap_or(128) as usize; let hnsw_m = cfg.hnsw_m.unwrap_or(16) as usize; let hnsw_ef_construction = cfg.hnsw_ef_construction.unwrap_or(100) as usize; let hnsw_ef_search = cfg.hnsw_ef_search.unwrap_or(64) as usize; let learning_enabled = cfg.learning_enabled.unwrap_or(true); let quality_threshold = cfg.quality_threshold.unwrap_or(0.7) as f32; // Create configs let embedding_config = 
EmbeddingConfig { dimension: embedding_dim, max_tokens: 512, batch_size: 8, }; let router_config = RouterConfig { input_dim: embedding_dim, hidden_dim: router_hidden, sparsity: 0.9, rank: 8, confidence_threshold: 0.7, weights_path: None, }; let memory_config = MemoryConfig { db_path: std::path::PathBuf::from("./data/memory.db"), hnsw_m, hnsw_ef_construction, hnsw_ef_search, max_nodes: 100000, writeback_batch_size: 100, writeback_interval_ms: 1000, }; // Initialize components let inference_engine = SimdInferenceEngine::new_demo(); let router = FastGRNNRouter::new(&router_config) .map_err(|e| Error::from_reason(format!("Failed to create router: {}", e)))?; let memory = MemoryServiceSync::new(&memory_config)?; let embedding = EmbeddingService::new(&embedding_config).map_err(|e| { Error::from_reason(format!("Failed to create embedding service: {}", e)) })?; Ok(Self { embedding_dim, router_hidden, inference_engine: Arc::new(RwLock::new(inference_engine)), router: Arc::new(RwLock::new(router)), memory: Arc::new(RwLock::new(memory)), embedding: Arc::new(RwLock::new(embedding)), learning_enabled, quality_threshold, total_queries: 0, total_latency_ms: 0.0, hnsw_ef_search, }) } /// Query the LLM with automatic routing #[napi] pub fn query( &mut self, text: String, config: Option, ) -> Result { let start = std::time::Instant::now(); let gen_config = config.unwrap_or_default(); // Generate embedding let embedding = self .embedding .read() .embed(&text) .map_err(|e| Error::from_reason(format!("Embedding failed: {}", e)))?; // Get routing decision let hidden = vec![0.0f32; self.router_hidden]; let routing = self .router .read() .forward(&embedding.vector, &hidden) .map_err(|e| Error::from_reason(format!("Routing failed: {}", e)))?; // Generate response let simd_config = SimdGenerationConfig { max_tokens: gen_config.max_tokens.unwrap_or(256) as usize, temperature: gen_config.temperature.unwrap_or(0.7) as f32, top_p: gen_config.top_p.unwrap_or(0.9) as f32, top_k: 
gen_config.top_k.unwrap_or(50) as usize, repeat_penalty: gen_config.repetition_penalty.unwrap_or(1.1) as f32, ..Default::default() }; let (text, _tokens, _latency) = self.inference_engine .read() .generate(&text, &simd_config, None); let latency_ms = start.elapsed().as_secs_f64() * 1000.0; self.total_queries += 1; self.total_latency_ms += latency_ms; let request_id = uuid::Uuid::new_v4().to_string(); Ok(JsQueryResponse { text, confidence: routing.confidence as f64, model: format!("{:?}", routing.model), context_size: routing.context_size as u32, latency_ms, request_id, }) } /// Generate text with SIMD-optimized inference #[napi] pub fn generate(&self, prompt: String, config: Option) -> Result { let gen_config = config.unwrap_or_default(); let simd_config = SimdGenerationConfig { max_tokens: gen_config.max_tokens.unwrap_or(256) as usize, temperature: gen_config.temperature.unwrap_or(0.7) as f32, top_p: gen_config.top_p.unwrap_or(0.9) as f32, top_k: gen_config.top_k.unwrap_or(50) as usize, repeat_penalty: gen_config.repetition_penalty.unwrap_or(1.1) as f32, ..Default::default() }; let (text, _tokens, _latency) = self.inference_engine .read() .generate(&prompt, &simd_config, None); Ok(text) } /// Get routing decision for a query #[napi] pub fn route(&self, text: String) -> Result { let embedding = self .embedding .read() .embed(&text) .map_err(|e| Error::from_reason(format!("Embedding failed: {}", e)))?; let hidden = vec![0.0f32; self.router_hidden]; let routing = self .router .read() .forward(&embedding.vector, &hidden) .map_err(|e| Error::from_reason(format!("Routing failed: {}", e)))?; Ok(JsRoutingDecision { model: format!("{:?}", routing.model), context_size: routing.context_size as u32, temperature: routing.temperature as f64, top_p: routing.top_p as f64, confidence: routing.confidence as f64, }) } /// Search memory for similar content #[napi] pub fn search_memory(&self, text: String, k: Option) -> Result> { let embedding = self .embedding .read() .embed(&text) 
.map_err(|e| Error::from_reason(format!("Embedding failed: {}", e)))?; let k = k.unwrap_or(10) as usize; let results = self .memory .read() .search(&embedding.vector, k, self.hnsw_ef_search); Ok(results .into_iter() .map(|(id, distance, content)| JsMemoryResult { id, distance: distance as f64, content, metadata: "{}".to_string(), }) .collect()) } /// Add content to memory #[napi] pub fn add_memory(&self, content: String, metadata: Option) -> Result { let embedding = self .embedding .read() .embed(&content) .map_err(|e| Error::from_reason(format!("Embedding failed: {}", e)))?; let meta: HashMap = metadata .and_then(|s| serde_json::from_str(&s).ok()) .unwrap_or_default(); let node = MemoryNode { id: uuid::Uuid::new_v4().to_string(), vector: embedding.vector, text: content, node_type: NodeType::Fact, source: "napi".to_string(), metadata: meta, }; self.memory.write().insert_node(node) } /// Provide feedback for learning #[napi] pub fn feedback( &mut self, _request_id: String, rating: u32, _correction: Option, ) -> Result { if !self.learning_enabled { return Ok(false); } let quality = rating as f32 / 5.0; Ok(quality >= self.quality_threshold) } /// Get engine statistics #[napi] pub fn stats(&self) -> JsRuvLLMStats { let memory = self.memory.read(); let (insertions, searches) = memory.get_stats(); let router_guard = self.router.read(); let router_stats = router_guard.stats(); let training_steps = router_stats .training_steps .load(std::sync::atomic::Ordering::Relaxed) as u32; // Calculate cache hit rate from memory stats let total_ops = insertions + searches; let cache_hit_rate = if total_ops > 0 { // Estimate: searches that don't result in new insertions are "hits" searches as f64 / total_ops as f64 } else { 0.0 }; // Router accuracy based on training convergence let router_accuracy = if self.total_queries > 0 && training_steps > 0 { // Simple heuristic: more training = better accuracy, capped at 0.95 (0.5 + (training_steps as f64 / (training_steps as f64 + 100.0)) * 
0.45).min(0.95) } else { 0.5 }; JsRuvLLMStats { total_queries: self.total_queries as u32, memory_nodes: memory.node_count() as u32, patterns_learned: training_steps, avg_latency_ms: if self.total_queries > 0 { self.total_latency_ms / self.total_queries as f64 } else { 0.0 }, cache_hit_rate, router_accuracy, } } /// Force router training #[napi] pub fn force_learn(&self) -> String { "Learning triggered".to_string() } /// Get embedding for text #[napi] pub fn embed(&self, text: String) -> Result> { let embedding = self .embedding .read() .embed(&text) .map_err(|e| Error::from_reason(format!("Embedding failed: {}", e)))?; Ok(embedding.vector.into_iter().map(|x| x as f64).collect()) } /// Compute similarity between two texts #[napi] pub fn similarity(&self, text1: String, text2: String) -> Result { let emb1 = self .embedding .read() .embed(&text1) .map_err(|e| Error::from_reason(format!("Embedding failed: {}", e)))?; let emb2 = self .embedding .read() .embed(&text2) .map_err(|e| Error::from_reason(format!("Embedding failed: {}", e)))?; // Cosine similarity = 1 - cosine_distance let distance = cosine_distance(&emb1.vector, &emb2.vector); Ok((1.0 - distance) as f64) } /// Check if SIMD is available #[napi] pub fn has_simd(&self) -> bool { #[cfg(target_arch = "x86_64")] { is_x86_feature_detected!("avx2") || is_x86_feature_detected!("sse4.1") } #[cfg(target_arch = "aarch64")] { true } #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] { false } } /// Get SIMD capabilities #[napi] pub fn simd_capabilities(&self) -> Vec { let mut caps = Vec::new(); #[cfg(target_arch = "x86_64")] { if is_x86_feature_detected!("avx512f") { caps.push("AVX-512".to_string()); } if is_x86_feature_detected!("avx2") { caps.push("AVX2".to_string()); } if is_x86_feature_detected!("sse4.1") { caps.push("SSE4.1".to_string()); } if is_x86_feature_detected!("fma") { caps.push("FMA".to_string()); } } #[cfg(target_arch = "aarch64")] { caps.push("NEON".to_string()); } if caps.is_empty() { 
caps.push("Scalar".to_string()); } caps } // ========================================================================= // v2.0: New optimization methods // ========================================================================= /// Check if NEON SIMD is available (v2.0) /// /// Returns true on all aarch64 (Apple Silicon, ARM) platforms. #[napi] pub fn is_neon_available(&self) -> bool { is_neon_available() } /// Check if parallel inference is enabled (v2.0) /// /// Returns true if the `parallel` feature was enabled at compile time. #[napi] pub fn is_parallel_enabled(&self) -> bool { #[cfg(feature = "parallel")] { true } #[cfg(not(feature = "parallel"))] { false } } /// Get memory pool statistics (v2.0) /// /// Returns current memory usage and allocation stats. #[napi] pub fn memory_pool_stats(&self) -> JsMemoryPoolStats { // For now, return placeholder stats - in a full implementation, // this would connect to the actual MemoryManager JsMemoryPoolStats { bytes_allocated: 0, capacity_bytes: 512 * 1024 * 1024, // 512 MB default active_allocations: 0, peak_bytes: 0, neon_available: is_neon_available(), metal_available: cfg!(feature = "metal"), } } /// Compute Flash Attention (v2.0) /// /// Uses optimized NEON kernels on Apple Silicon with 3-6x speedup. 
/// /// # Arguments /// * `query` - Query vector [head_dim] /// * `key` - Key vectors [kv_len * head_dim] flattened /// * `value` - Value vectors [kv_len * head_dim] flattened /// * `scale` - Softmax scale (typically 1/sqrt(head_dim)) /// * `causal` - Whether to apply causal masking /// /// # Returns /// Output vector [head_dim] #[napi] pub fn flash_attention( &self, query: Vec, key: Vec, value: Vec, scale: f64, causal: bool, ) -> Vec { let q: Vec = query.into_iter().map(|x| x as f32).collect(); let k: Vec = key.into_iter().map(|x| x as f32).collect(); let v: Vec = value.into_iter().map(|x| x as f32).collect(); let output = SimdOps::attention(&q, &k, &v, scale as f32, causal); output.into_iter().map(|x| x as f64).collect() } /// Compute GEMV (matrix-vector multiply) (v2.0) /// /// Uses optimized 12-row micro-kernel on Apple Silicon. /// /// # Arguments /// * `matrix` - Matrix [m * n] in row-major order /// * `vector` - Vector [n] /// * `m` - Number of rows /// * `n` - Number of columns /// /// # Returns /// Result vector [m] #[napi] pub fn gemv(&self, matrix: Vec, vector: Vec, m: u32, n: u32) -> Vec { let mat: Vec = matrix.into_iter().map(|x| x as f32).collect(); let vec: Vec = vector.into_iter().map(|x| x as f32).collect(); let output = SimdOps::gemv(&mat, &vec, m as usize, n as usize); output.into_iter().map(|x| x as f64).collect() } /// Get version information (v2.0) #[napi] pub fn version(&self) -> String { env!("CARGO_PKG_VERSION").to_string() } } /// SIMD Operations utility class #[napi] pub struct SimdOperations; #[napi] impl SimdOperations { /// Create new SIMD operations instance #[napi(constructor)] pub fn new() -> Self { Self } /// Compute dot product of two vectors #[napi] pub fn dot_product(&self, a: Vec, b: Vec) -> f64 { let a_f32: Vec = a.into_iter().map(|x| x as f32).collect(); let b_f32: Vec = b.into_iter().map(|x| x as f32).collect(); SimdOps::dot_product(&a_f32, &b_f32) as f64 } /// Compute cosine similarity #[napi] pub fn 
cosine_similarity(&self, a: Vec, b: Vec) -> f64 { let a_f32: Vec = a.into_iter().map(|x| x as f32).collect(); let b_f32: Vec = b.into_iter().map(|x| x as f32).collect(); 1.0 - cosine_distance(&a_f32, &b_f32) as f64 } /// Compute L2 distance #[napi] pub fn l2_distance(&self, a: Vec, b: Vec) -> f64 { let a_f32: Vec = a.into_iter().map(|x| x as f32).collect(); let b_f32: Vec = b.into_iter().map(|x| x as f32).collect(); let mut sum = 0.0f32; for (x, y) in a_f32.iter().zip(b_f32.iter()) { let diff = x - y; sum += diff * diff; } sum.sqrt() as f64 } /// Matrix-vector multiplication #[napi] pub fn matvec(&self, matrix: Vec>, vector: Vec) -> Vec { let rows = matrix.len(); let cols = if rows > 0 { matrix[0].len() } else { 0 }; let mut result = vec![0.0f64; rows]; for i in 0..rows { for j in 0..cols { result[i] += matrix[i][j] * vector[j]; } } result } /// Softmax activation #[napi] pub fn softmax(&self, input: Vec) -> Vec { let max = input.iter().cloned().fold(f64::NEG_INFINITY, f64::max); let exp_sum: f64 = input.iter().map(|x| (x - max).exp()).sum(); input.iter().map(|x| ((x - max).exp()) / exp_sum).collect() } } /// Version information #[napi] pub fn version() -> String { env!("CARGO_PKG_VERSION").to_string() } /// Check if running with SIMD support #[napi] pub fn has_simd_support() -> bool { #[cfg(target_arch = "x86_64")] { is_x86_feature_detected!("avx2") || is_x86_feature_detected!("sse4.1") } #[cfg(target_arch = "aarch64")] { true // NEON is always available on aarch64 } #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] { false } }