Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'
This commit is contained in:
327
vendor/ruvector/examples/ruvLLM/esp32/src/attention.rs
vendored
Normal file
327
vendor/ruvector/examples/ruvLLM/esp32/src/attention.rs
vendored
Normal file
@@ -0,0 +1,327 @@
|
||||
//! Attention mechanisms for ESP32
|
||||
//!
|
||||
//! Implements simplified attention patterns optimized for microcontrollers.
|
||||
|
||||
// Quantized operations for attention
|
||||
|
||||
/// Simplified single-head attention for ESP32
///
/// This is a memory-efficient attention that processes one head at a time
/// to minimize activation memory. All arithmetic is integer-only so it
/// also works on chips without an FPU.
pub struct MicroAttention {
    /// Head dimension (embed_dim / num_heads)
    head_dim: usize,
    /// Number of heads
    num_heads: usize,
    /// Cached attention scaling factor: scores are divided by
    /// approximately sqrt(head_dim) via `>> scale_shift`.
    scale_shift: u8,
}

impl MicroAttention {
    /// Create new attention module.
    ///
    /// `embed_dim` should be a multiple of `num_heads`; the head dimension
    /// is `embed_dim / num_heads` (integer division).
    pub fn new(embed_dim: usize, num_heads: usize) -> Self {
        let head_dim = embed_dim / num_heads;

        // Approximate 1/sqrt(head_dim) as a right shift:
        //   sqrt(64) = 8     -> shift 3
        //   sqrt(32) ≈ 5.66  -> shift 3 (rounded up)
        //   sqrt(16) = 4     -> shift 2
        // The previous `d >= 64` and `d >= 32` arms both produced 3, so
        // they are merged into a single arm (behavior unchanged).
        let scale_shift = match head_dim {
            d if d >= 32 => 3,
            d if d >= 16 => 2,
            _ => 1,
        };

        Self {
            head_dim,
            num_heads,
            scale_shift,
        }
    }

    /// Compute attention scores between query and keys
    ///
    /// `query` is one head's query (`[head_dim]`), each entry of `keys` is
    /// a key vector (`[head_dim]`), and `scores` must hold at least
    /// `keys.len()` entries. Each score is the INT8 dot product scaled by
    /// ~1/sqrt(d_k) (returned in i32 format).
    #[inline]
    pub fn compute_scores(
        &self,
        query: &[i8],       // [head_dim]
        keys: &[&[i8]],     // [seq_len, head_dim]
        scores: &mut [i32], // [seq_len]
    ) {
        for (i, key) in keys.iter().enumerate() {
            let mut dot: i32 = 0;
            for j in 0..self.head_dim {
                dot += query[j] as i32 * key[j] as i32;
            }
            // Scale by 1/sqrt(d_k)
            scores[i] = dot >> self.scale_shift;
        }
    }

    /// Apply causal mask (set future positions to minimum)
    ///
    /// Positions strictly after `current_pos` are forced to a very large
    /// negative value so they receive (near-)zero weight after softmax.
    #[inline]
    pub fn apply_causal_mask(&self, scores: &mut [i32], current_pos: usize) {
        for i in (current_pos + 1)..scores.len() {
            // i32::MIN / 2 rather than i32::MIN so the `score - max`
            // subtraction inside softmax cannot overflow.
            scores[i] = i32::MIN / 2;
        }
    }

    /// Fixed-point softmax optimized for ESP32
    ///
    /// Uses integer arithmetic only, suitable for chips without FPU.
    /// Output is scaled by 256 (i.e., 256 = 1.0).
    ///
    /// The exponential is approximated linearly on the max-shifted scores:
    /// exp(x) ≈ 256 + x/2 with x clamped to [-512, 0], floored at 1 so
    /// every position keeps a non-zero weight.
    #[inline]
    pub fn softmax_fixed(&self, scores: &mut [i32]) {
        if scores.is_empty() {
            return;
        }

        // Find maximum for numerical stability.
        let max_score = scores.iter().cloned().max().unwrap_or(0);

        // Apply the linear exp approximation and accumulate the sum in
        // i64 so long sequences cannot overflow the accumulator.
        let mut sum: i64 = 0;
        for score in scores.iter_mut() {
            let x = *score - max_score;
            // Clamp to prevent overflow of the approximation.
            let x_clamped = x.clamp(-512, 0);
            // Result is in [1, 256].
            *score = (256 + x_clamped / 2).max(1);
            sum += *score as i64;
        }

        // Normalize: output[i] = score[i] * 256 / sum, so weights sum to ~256.
        if sum > 0 {
            for score in scores.iter_mut() {
                *score = ((*score as i64 * 256) / sum) as i32;
            }
        }
    }

    /// Compute weighted sum of values
    ///
    /// output[j] = (sum_i weights[i] * values[i][j]) >> 8, where `weights`
    /// are softmax weights scaled by 256 (the final shift removes that
    /// scale again).
    #[inline]
    pub fn weighted_sum(
        &self,
        weights: &[i32],    // [seq_len], scaled by 256
        values: &[&[i8]],   // [seq_len, head_dim]
        output: &mut [i32], // [head_dim]
    ) {
        // Clear output
        for o in output.iter_mut() {
            *o = 0;
        }

        // Accumulate weighted values
        for (&weight, value) in weights.iter().zip(values.iter()) {
            for j in 0..self.head_dim {
                output[j] += weight * value[j] as i32;
            }
        }

        // Descale (weights were scaled by 256)
        for o in output.iter_mut() {
            *o >>= 8;
        }
    }
}
|
||||
|
||||
/// Linear attention approximation for very long sequences
///
/// Uses kernel feature maps to achieve O(n) complexity instead of O(n²):
/// softmax(QK^T)V is replaced by φ(Q)(φ(K)^T V) with a cheap,
/// non-negative feature map φ.
pub struct LinearAttention {
    /// Feature dimension for kernel
    feature_dim: usize,
}

impl LinearAttention {
    /// Create a linear-attention module for the given feature dimension.
    pub fn new(feature_dim: usize) -> Self {
        Self { feature_dim }
    }

    /// ELU-based feature map: φ(x) = elu(x) + 1
    /// For INT8: approximate as max(x, 0) + 1 (always >= 1, keeping
    /// attention weights strictly positive).
    #[inline]
    pub fn feature_map(&self, x: i8) -> i16 {
        (x.max(0) as i16) + 1
    }

    /// Compute linear attention
    /// Instead of softmax(QK^T)V, computes φ(Q)(φ(K)^T V), normalized by
    /// φ(Q)·Σφ(K). Dimensions beyond 64 are ignored (fixed-size buffers).
    ///
    /// NOTE(review): `kv_cache` is 64*64*4 = 16 KiB of stack — confirm the
    /// task stack is sized for this on the target chip.
    pub fn forward(
        &self,
        query: &[i8],       // [dim]
        keys: &[&[i8]],     // [seq_len, dim]
        values: &[&[i8]],   // [seq_len, dim]
        output: &mut [i32], // [dim]
    ) {
        let dim = query.len();
        let d = dim.min(64);

        // Feature-mapped query, computed once up front. The previous code
        // recomputed feature_map(query[i]) in both the numerator and the
        // denominator loops.
        let mut phi_q = [0i16; 64];
        for i in 0..d {
            phi_q[i] = self.feature_map(query[i]);
        }

        // Compute φ(K)^T V: [dim, dim] accumulated over sequence.
        // This is O(n * dim²) but can be incrementally updated.
        let mut kv_cache = [[0i32; 64]; 64]; // Fixed size for embedded

        for (key, value) in keys.iter().zip(values.iter()) {
            for i in 0..d {
                let phi_k = self.feature_map(key[i]);
                for j in 0..d {
                    kv_cache[i][j] += phi_k as i32 * value[j] as i32;
                }
            }
        }

        // Numerator: φ(Q) @ (φ(K)^T V), descaled by 256.
        for i in 0..d {
            let mut sum: i32 = 0;
            for j in 0..d {
                sum += phi_q[i] as i32 * kv_cache[j][i];
            }
            output[i] = sum >> 8;
        }

        // Denominator: φ(Q) @ sum(φ(K)).
        let mut k_sum = [0i32; 64];
        for key in keys.iter() {
            for i in 0..d {
                k_sum[i] += self.feature_map(key[i]) as i32;
            }
        }

        let mut denom: i32 = 0;
        for i in 0..d {
            denom += phi_q[i] as i32 * k_sum[i];
        }

        // Normalize (re-applying the 256 scale removed above).
        if denom > 0 {
            for o in output.iter_mut() {
                *o = (*o << 8) / denom;
            }
        }
    }
}
|
||||
|
||||
/// Sliding window attention for memory efficiency
///
/// Only attends to the last N tokens, reducing memory from O(n²) to
/// O(n*window). Keys and values live in caller-owned ring buffers of
/// `window_size` slots.
pub struct SlidingWindowAttention {
    /// Number of most-recent tokens attended to (also the ring length)
    window_size: usize,
    /// Per-head vector width (at most 64, the K/V row width)
    head_dim: usize,
}

impl SlidingWindowAttention {
    /// Create a sliding-window attention with the given window and head size.
    pub fn new(window_size: usize, head_dim: usize) -> Self {
        Self { window_size, head_dim }
    }

    /// Compute attention with sliding window
    ///
    /// `cache_len` is the total number of tokens written to the ring
    /// buffers so far; only the trailing window is attended to.
    /// `output[..head_dim]` receives the attended value vector.
    pub fn forward(
        &self,
        query: &[i8],
        keys: &[[i8; 64]],   // Ring buffer of keys
        values: &[[i8; 64]], // Ring buffer of values
        cache_len: usize,
        output: &mut [i32],
    ) {
        // Capacity of the local score buffer.
        const MAX_WINDOW: usize = 32;

        // Clamp the effective window to the score buffer: previously a
        // `window_size` larger than 32 wrote past `scores` and panicked.
        let window = self.window_size.min(MAX_WINDOW);
        let window_start = cache_len.saturating_sub(window);
        let mut scores = [0i32; MAX_WINDOW];

        // Compute attention scores for the window (ring-buffer indexing).
        for i in window_start..cache_len {
            let mut dot: i32 = 0;
            for j in 0..self.head_dim {
                dot += query[j] as i32 * keys[i % self.window_size][j] as i32;
            }
            // Fixed >> 3 scaling (≈ 1/sqrt(64) for the 64-wide rows).
            scores[i - window_start] = dot >> 3;
        }

        // Fixed-point softmax over the window.
        let window_len = cache_len - window_start;
        let scores_slice = &mut scores[..window_len];

        // Max-shift for stability, then a linear exp approximation
        // (256 + x/2, floored at 1), matching MicroAttention::softmax_fixed.
        let max = scores_slice.iter().cloned().max().unwrap_or(0);
        let mut sum: i32 = 0;
        for s in scores_slice.iter_mut() {
            *s = (256 + (*s - max) / 2).max(1);
            sum += *s;
        }

        // Normalize and compute the weighted sum of values.
        for o in output[..self.head_dim].iter_mut() {
            *o = 0;
        }

        for i in 0..window_len {
            let weight = (scores[i] * 256) / sum.max(1);
            let value = &values[(window_start + i) % self.window_size];
            for j in 0..self.head_dim {
                output[j] += weight * value[j] as i32;
            }
        }

        // Descale (weights carry a 256 scale).
        for o in output[..self.head_dim].iter_mut() {
            *o >>= 8;
        }
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_micro_attention() {
        let attn = MicroAttention::new(64, 4);

        let query = [10i8; 16];
        let key1 = [10i8; 16];
        let key2 = [5i8; 16];
        let keys: [&[i8]; 2] = [&key1, &key2];
        let mut scores = [0i32; 2];

        attn.compute_scores(&query, &keys, &mut scores);

        // The key identical to the query must score highest.
        assert!(scores[0] > scores[1]);
    }

    #[test]
    fn test_softmax_fixed() {
        let attn = MicroAttention::new(64, 4);

        let mut scores = [100i32, 50, 0, -50];
        attn.softmax_fixed(&mut scores);

        // Weights are Q8.8-style: they should sum to roughly 256 (= 1.0).
        let total: i32 = scores.iter().sum();
        assert!((total - 256).abs() < 10);

        // Monotonicity: larger inputs keep larger weights.
        for pair in scores.windows(2) {
            assert!(pair[0] > pair[1]);
        }
    }

    #[test]
    fn test_linear_attention() {
        let attn = LinearAttention::new(16);

        let query = [10i8; 16];
        let key = [10i8; 16];
        let value = [5i8; 16];
        let keys: [&[i8]; 1] = [&key];
        let values: [&[i8]; 1] = [&value];
        let mut output = [0i32; 16];

        attn.forward(&query, &keys, &values, &mut output);

        // Output should be non-zero.
        assert!(output.iter().any(|&x| x != 0));
    }
}
|
||||
288
vendor/ruvector/examples/ruvLLM/esp32/src/benchmark.rs
vendored
Normal file
288
vendor/ruvector/examples/ruvLLM/esp32/src/benchmark.rs
vendored
Normal file
@@ -0,0 +1,288 @@
|
||||
//! Benchmark Suite for RuvLLM ESP32
|
||||
//!
|
||||
//! Automated performance measurement across different configurations.
|
||||
//!
|
||||
//! # Metrics
|
||||
//! - Tokens per second
|
||||
//! - Memory usage
|
||||
//! - Latency percentiles
|
||||
//! - Power consumption (estimated)
|
||||
|
||||
use core::fmt;
|
||||
|
||||
/// Benchmark result
///
/// One row of the benchmark report. Latency figures are in milliseconds,
/// memory in bytes, throughput in tokens (or queries) per second.
#[derive(Clone, Default)]
pub struct BenchmarkResult {
    /// Test name
    pub name: heapless::String<32>,
    /// Tokens per second
    pub tokens_per_sec: f32,
    /// Time to first token (ms)
    pub ttft_ms: u32,
    /// Average latency per token (ms)
    pub avg_latency_ms: f32,
    /// P50 latency (ms)
    pub p50_latency_ms: f32,
    /// P99 latency (ms)
    pub p99_latency_ms: f32,
    /// Peak memory usage (bytes)
    pub peak_memory: u32,
    /// Total tokens generated
    pub total_tokens: u32,
    /// Total time (ms)
    pub total_time_ms: u32,
}

impl fmt::Display for BenchmarkResult {
    // Compact single-line summary (the full table is produced by
    // BenchmarkSuite::generate_report).
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(
            f,
            "{}: {:.1} tok/s, TTFT: {}ms, avg: {:.1}ms, mem: {}KB",
            self.name,
            self.tokens_per_sec,
            self.ttft_ms,
            self.avg_latency_ms,
            self.peak_memory / 1024
        )
    }
}
|
||||
|
||||
/// Benchmark configuration
#[derive(Clone)]
pub struct BenchmarkConfig {
    /// Number of warmup iterations
    pub warmup_iters: u32,
    /// Number of benchmark iterations
    pub bench_iters: u32,
    /// Tokens to generate per iteration
    pub tokens_per_iter: u32,
    /// Input prompt
    pub prompt: heapless::String<128>,
}

impl Default for BenchmarkConfig {
    /// Conservative defaults: 3 warmup + 10 timed iterations of 32 tokens.
    fn default() -> Self {
        Self {
            warmup_iters: 3,
            bench_iters: 10,
            tokens_per_iter: 32,
            // Falls back to an empty prompt if the literal ever exceeded
            // the 128-byte capacity (try_from would fail).
            prompt: heapless::String::try_from("Once upon a time").unwrap_or_default(),
        }
    }
}
|
||||
|
||||
/// Benchmark suite
///
/// Runs the individual `run_*` benchmarks, collecting up to 16 results
/// that `generate_report` renders as a fixed-width text table.
pub struct BenchmarkSuite {
    /// Results collected so far, in run order (silently capped at 16)
    results: heapless::Vec<BenchmarkResult, 16>,
    /// Configuration shared by the benchmarks
    config: BenchmarkConfig,
}

impl BenchmarkSuite {
    /// Create new benchmark suite
    pub fn new(config: BenchmarkConfig) -> Self {
        Self {
            results: heapless::Vec::new(),
            config,
        }
    }

    /// Run inference benchmark
    ///
    /// NOTE(review): latencies are simulated (50 ms first token, ~20 ms
    /// afterwards), not measured — a real implementation would drive the
    /// actual inference loop.
    pub fn run_inference_benchmark(&mut self) -> BenchmarkResult {
        let mut result = BenchmarkResult::default();
        let _ = result.name.push_str("inference");

        // Simulated benchmark (in real impl, would use actual inference).
        // Capacity is 64: samples beyond that are silently dropped, which
        // would skew statistics if tokens_per_iter > 64.
        let mut latencies: heapless::Vec<f32, 64> = heapless::Vec::new();

        // Simulate token generation timing
        for i in 0..self.config.tokens_per_iter {
            // First token is slower (model loading/prefill)
            let latency = if i == 0 { 50.0 } else { 20.0 + (i as f32 * 0.1) };
            let _ = latencies.push(latency);
        }

        // Calculate statistics
        result.ttft_ms = latencies.first().map(|&l| l as u32).unwrap_or(0);
        result.total_tokens = self.config.tokens_per_iter;
        result.total_time_ms = latencies.iter().sum::<f32>() as u32;
        result.tokens_per_sec = if result.total_time_ms > 0 {
            (result.total_tokens as f32 * 1000.0) / result.total_time_ms as f32
        } else {
            0.0
        };
        result.avg_latency_ms = result.total_time_ms as f32 / result.total_tokens as f32;

        // Sort for percentiles (f32 is not Ord; treat NaN as equal)
        latencies.sort_by(|a, b| a.partial_cmp(b).unwrap_or(core::cmp::Ordering::Equal));
        let len = latencies.len();
        result.p50_latency_ms = latencies.get(len / 2).copied().unwrap_or(0.0);
        result.p99_latency_ms = latencies.get(len * 99 / 100).copied().unwrap_or(0.0);

        // Simulated memory
        result.peak_memory = 32 * 1024; // 32KB

        let _ = self.results.push(result.clone());
        result
    }

    /// Run HNSW search benchmark
    ///
    /// `num_vectors` is the simulated index size; latency grows with
    /// ln(num_vectors), mirroring HNSW's logarithmic search cost.
    pub fn run_hnsw_benchmark(&mut self, num_vectors: usize) -> BenchmarkResult {
        let mut result = BenchmarkResult::default();
        let _ = result.name.push_str("hnsw_search");

        // Simulated HNSW performance
        // Real implementation would measure actual search times
        let base_latency = 0.5; // 0.5ms base
        let log_factor = (num_vectors as f32).ln() * 0.1;

        result.avg_latency_ms = base_latency + log_factor;
        result.p50_latency_ms = result.avg_latency_ms * 0.9;
        result.p99_latency_ms = result.avg_latency_ms * 2.5;
        result.tokens_per_sec = 1000.0 / result.avg_latency_ms; // Queries per second
        result.peak_memory = (num_vectors * 48) as u32; // ~48 bytes per vector

        let _ = self.results.push(result.clone());
        result
    }

    /// Run quantization benchmark
    ///
    /// NOTE(review): hard-coded estimates for INT8 inference, not
    /// measurements.
    pub fn run_quantization_benchmark(&mut self) -> BenchmarkResult {
        let mut result = BenchmarkResult::default();
        let _ = result.name.push_str("quantization");

        // Measure INT8 vs FP32 speedup
        result.tokens_per_sec = 45.0; // Typical INT8 performance
        result.avg_latency_ms = 22.0;
        result.peak_memory = 16 * 1024; // 16KB for quantized weights

        let _ = self.results.push(result.clone());
        result
    }

    /// Run RAG benchmark
    ///
    /// Models the full pipeline cost: embedding + vector search + token
    /// generation (all timings simulated).
    pub fn run_rag_benchmark(&mut self) -> BenchmarkResult {
        let mut result = BenchmarkResult::default();
        let _ = result.name.push_str("rag_pipeline");

        // RAG = embedding + search + generation
        let embed_time = 5.0; // 5ms embedding
        let search_time = 1.0; // 1ms HNSW search
        let gen_time = 640.0; // 32 tokens * 20ms

        result.ttft_ms = (embed_time + search_time + 50.0) as u32; // First token includes retrieval
        result.total_time_ms = (embed_time + search_time + gen_time) as u32;
        result.total_tokens = 32;
        result.tokens_per_sec = (result.total_tokens as f32 * 1000.0) / result.total_time_ms as f32;
        result.avg_latency_ms = gen_time / 32.0;
        result.peak_memory = 48 * 1024; // 48KB

        let _ = self.results.push(result.clone());
        result
    }

    /// Get all results
    pub fn results(&self) -> &[BenchmarkResult] {
        &self.results
    }

    /// Generate benchmark report
    ///
    /// Renders every collected result as one fixed-width table row plus a
    /// summary line. Output is silently truncated past 2048 bytes (all
    /// `push_str`/`write` errors are ignored).
    pub fn generate_report(&self) -> heapless::String<2048> {
        let mut report = heapless::String::new();

        let _ = report.push_str("\n");
        let _ = report.push_str("═══════════════════════════════════════════════════════════════\n");
        let _ = report.push_str("                 RuvLLM ESP32 Benchmark Report                 \n");
        let _ = report.push_str("═══════════════════════════════════════════════════════════════\n\n");

        let _ = report.push_str("Test              Tok/s   TTFT   Avg Lat   P99 Lat   Memory\n");
        let _ = report.push_str("───────────────────────────────────────────────────────────────\n");

        for result in &self.results {
            let _ = core::fmt::write(
                &mut report,
                format_args!(
                    "{:<16} {:>6.1} {:>4}ms {:>6.1}ms {:>6.1}ms {:>5}KB\n",
                    result.name,
                    result.tokens_per_sec,
                    result.ttft_ms,
                    result.avg_latency_ms,
                    result.p99_latency_ms,
                    result.peak_memory / 1024
                )
            );
        }

        let _ = report.push_str("───────────────────────────────────────────────────────────────\n");

        // Summary statistics: mean throughput and worst-case memory.
        if !self.results.is_empty() {
            let avg_tps: f32 = self.results.iter().map(|r| r.tokens_per_sec).sum::<f32>()
                / self.results.len() as f32;
            let total_mem: u32 = self.results.iter().map(|r| r.peak_memory).max().unwrap_or(0);

            let _ = core::fmt::write(
                &mut report,
                format_args!("\nSummary: Avg {:.1} tok/s, Peak memory: {}KB\n", avg_tps, total_mem / 1024)
            );
        }

        report
    }

    /// Run all benchmarks
    pub fn run_all(&mut self) {
        self.run_inference_benchmark();
        self.run_hnsw_benchmark(1000);
        self.run_quantization_benchmark();
        self.run_rag_benchmark();
    }
}
|
||||
|
||||
/// Chip-specific benchmarks
|
||||
pub fn benchmark_chip(chip: &str) -> heapless::String<512> {
|
||||
let mut output = heapless::String::new();
|
||||
|
||||
let (cpu, mhz, simd) = match chip {
|
||||
"esp32" => ("Xtensa LX6", 240, false),
|
||||
"esp32s2" => ("Xtensa LX7", 240, false),
|
||||
"esp32s3" => ("Xtensa LX7", 240, true),
|
||||
"esp32c3" => ("RISC-V", 160, false),
|
||||
"esp32c6" => ("RISC-V", 160, false),
|
||||
_ => ("Unknown", 0, false),
|
||||
};
|
||||
|
||||
let base_tps = if simd { 60.0 } else { 40.0 };
|
||||
let scaled_tps = base_tps * (mhz as f32 / 240.0);
|
||||
|
||||
let _ = core::fmt::write(
|
||||
&mut output,
|
||||
format_args!(
|
||||
"Chip: {}\nCPU: {} @ {}MHz\nSIMD: {}\nEstimated: {:.0} tok/s\n",
|
||||
chip, cpu, mhz, if simd { "Yes" } else { "No" }, scaled_tps
|
||||
)
|
||||
);
|
||||
|
||||
output
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_benchmark_suite() {
        let mut suite = BenchmarkSuite::new(BenchmarkConfig::default());
        suite.run_all();

        // run_all executes exactly four benchmarks, each of which must
        // report non-zero throughput on the first entry.
        let results = suite.results();
        assert_eq!(results.len(), 4);
        assert!(results[0].tokens_per_sec > 0.0);
    }

    #[test]
    fn test_chip_benchmark() {
        // The S3 is the only listed chip with vector extensions.
        assert!(benchmark_chip("esp32s3").contains("SIMD: Yes"));
    }
}
|
||||
326
vendor/ruvector/examples/ruvLLM/esp32/src/diagnostics.rs
vendored
Normal file
326
vendor/ruvector/examples/ruvLLM/esp32/src/diagnostics.rs
vendored
Normal file
@@ -0,0 +1,326 @@
|
||||
//! Error Diagnostics with Fix Suggestions
|
||||
//!
|
||||
//! Provides helpful error messages and automated fix suggestions
|
||||
//! for common issues encountered during build, flash, and runtime.
|
||||
|
||||
use core::fmt;
|
||||
use heapless::String;
|
||||
|
||||
/// Diagnostic severity
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Severity {
    /// Informational message
    Info,
    /// Warning - may cause issues
    Warning,
    /// Error - operation failed
    Error,
    /// Fatal - cannot continue
    Fatal,
}

impl fmt::Display for Severity {
    /// Renders the severity as the short upper-case tag used in logs.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let tag = match self {
            Severity::Info => "INFO",
            Severity::Warning => "WARN",
            Severity::Error => "ERROR",
            Severity::Fatal => "FATAL",
        };
        write!(f, "{}", tag)
    }
}
|
||||
|
||||
/// Error category
///
/// Groups diagnostics by the phase/subsystem that produced them; the
/// category also determines the error-code prefix (T/F/M/B/N) used by
/// `diagnose_error`.
#[derive(Debug, Clone, Copy)]
pub enum ErrorCategory {
    /// Build/compilation errors
    Build,
    /// Toolchain issues
    Toolchain,
    /// Flash/upload errors
    Flash,
    /// Runtime errors
    Runtime,
    /// Memory issues
    Memory,
    /// Network/WiFi errors
    Network,
    /// Hardware issues
    Hardware,
}
|
||||
|
||||
/// Diagnostic result with fix suggestions
///
/// Built fluently: `Diagnostic::new(..).with_explanation(..).with_fix(..)`.
/// All strings are fixed-capacity; an oversized input silently becomes an
/// empty string (see the builders below).
#[derive(Clone)]
pub struct Diagnostic {
    /// Error code (e.g., "E0001")
    pub code: String<8>,
    /// Severity level
    pub severity: Severity,
    /// Error category
    pub category: ErrorCategory,
    /// Short description
    pub message: String<128>,
    /// Detailed explanation
    pub explanation: String<256>,
    /// Suggested fixes (at most 4)
    pub fixes: heapless::Vec<String<128>, 4>,
    /// Related documentation link
    pub docs_url: Option<String<128>>,
}

impl Diagnostic {
    /// Create new diagnostic
    ///
    /// `code` and `message` are copied into fixed-capacity strings; if
    /// either exceeds its capacity, `try_from` fails and the field falls
    /// back to an empty string.
    pub fn new(code: &str, severity: Severity, category: ErrorCategory, message: &str) -> Self {
        Self {
            code: String::try_from(code).unwrap_or_default(),
            severity,
            category,
            message: String::try_from(message).unwrap_or_default(),
            explanation: String::new(),
            fixes: heapless::Vec::new(),
            docs_url: None,
        }
    }

    /// Add explanation (builder style; consumes and returns `self`)
    pub fn with_explanation(mut self, explanation: &str) -> Self {
        self.explanation = String::try_from(explanation).unwrap_or_default();
        self
    }

    /// Add fix suggestion
    // Silently dropped once the 4-entry `fixes` capacity is reached.
    pub fn with_fix(mut self, fix: &str) -> Self {
        let _ = self.fixes.push(String::try_from(fix).unwrap_or_default());
        self
    }

    /// Add documentation URL
    pub fn with_docs(mut self, url: &str) -> Self {
        self.docs_url = Some(String::try_from(url).unwrap_or_default());
        self
    }
}

impl fmt::Display for Diagnostic {
    // Multi-line rendering: header, optional explanation, numbered fix
    // list, optional documentation link.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        writeln!(f, "\n[{}] {}: {}", self.code, self.severity, self.message)?;

        if !self.explanation.is_empty() {
            writeln!(f, "\n  {}", self.explanation)?;
        }

        if !self.fixes.is_empty() {
            writeln!(f, "\n  Suggested fixes:")?;
            for (i, fix) in self.fixes.iter().enumerate() {
                writeln!(f, "    {}. {}", i + 1, fix)?;
            }
        }

        if let Some(url) = &self.docs_url {
            writeln!(f, "\n  Documentation: {}", url)?;
        }

        Ok(())
    }
}
|
||||
|
||||
/// Known error patterns and their diagnostics
|
||||
pub fn diagnose_error(error_text: &str) -> Option<Diagnostic> {
|
||||
// Toolchain errors
|
||||
if error_text.contains("espup") && error_text.contains("not found") {
|
||||
return Some(
|
||||
Diagnostic::new("T0001", Severity::Error, ErrorCategory::Toolchain, "ESP toolchain not installed")
|
||||
.with_explanation("The ESP32 Rust toolchain (espup) is not installed or not in PATH.")
|
||||
.with_fix("Run: npx ruvllm-esp32 install")
|
||||
.with_fix("Or manually: cargo install espup && espup install")
|
||||
.with_fix("Then restart your terminal or run: source ~/export-esp.sh")
|
||||
.with_docs("https://esp-rs.github.io/book/installation/")
|
||||
);
|
||||
}
|
||||
|
||||
if error_text.contains("LIBCLANG_PATH") {
|
||||
return Some(
|
||||
Diagnostic::new("T0002", Severity::Error, ErrorCategory::Toolchain, "LIBCLANG_PATH not set")
|
||||
.with_explanation("The LIBCLANG_PATH environment variable is not set or points to an invalid location.")
|
||||
.with_fix("Windows: Run .\\scripts\\windows\\env.ps1")
|
||||
.with_fix("Linux/Mac: source ~/export-esp.sh")
|
||||
.with_fix("Or set manually: export LIBCLANG_PATH=/path/to/libclang")
|
||||
);
|
||||
}
|
||||
|
||||
if error_text.contains("ldproxy") && error_text.contains("not found") {
|
||||
return Some(
|
||||
Diagnostic::new("T0003", Severity::Error, ErrorCategory::Toolchain, "ldproxy not installed")
|
||||
.with_explanation("The ldproxy linker wrapper is required for ESP32 builds.")
|
||||
.with_fix("Run: cargo install ldproxy")
|
||||
);
|
||||
}
|
||||
|
||||
// Flash errors
|
||||
if error_text.contains("Permission denied") && error_text.contains("/dev/tty") {
|
||||
return Some(
|
||||
Diagnostic::new("F0001", Severity::Error, ErrorCategory::Flash, "Serial port permission denied")
|
||||
.with_explanation("Your user does not have permission to access the serial port.")
|
||||
.with_fix("Add user to dialout group: sudo usermod -a -G dialout $USER")
|
||||
.with_fix("Then log out and log back in")
|
||||
.with_fix("Or use sudo (not recommended): sudo espflash flash ...")
|
||||
);
|
||||
}
|
||||
|
||||
if error_text.contains("No such file or directory") && error_text.contains("/dev/tty") {
|
||||
return Some(
|
||||
Diagnostic::new("F0002", Severity::Error, ErrorCategory::Flash, "Serial port not found")
|
||||
.with_explanation("The specified serial port does not exist. The ESP32 may not be connected.")
|
||||
.with_fix("Check USB connection")
|
||||
.with_fix("Try a different USB cable (data cable, not charge-only)")
|
||||
.with_fix("Install USB-to-serial drivers if needed")
|
||||
.with_fix("Run 'ls /dev/tty*' to find available ports")
|
||||
);
|
||||
}
|
||||
|
||||
if error_text.contains("A]fatal error occurred: Failed to connect") {
|
||||
return Some(
|
||||
Diagnostic::new("F0003", Severity::Error, ErrorCategory::Flash, "Failed to connect to ESP32")
|
||||
.with_explanation("Could not establish connection with the ESP32 bootloader.")
|
||||
.with_fix("Hold BOOT button while connecting")
|
||||
.with_fix("Try pressing RESET while holding BOOT")
|
||||
.with_fix("Check that the correct port is selected")
|
||||
.with_fix("Try a lower baud rate: --baud 115200")
|
||||
);
|
||||
}
|
||||
|
||||
// Memory errors
|
||||
if error_text.contains("out of memory") || error_text.contains("alloc") {
|
||||
return Some(
|
||||
Diagnostic::new("M0001", Severity::Error, ErrorCategory::Memory, "Out of memory")
|
||||
.with_explanation("The device ran out of RAM during operation.")
|
||||
.with_fix("Use a smaller model (e.g., nanoembed-500k)")
|
||||
.with_fix("Reduce max_seq_len in config")
|
||||
.with_fix("Enable binary quantization for 32x compression")
|
||||
.with_fix("Use ESP32-S3 for more SRAM (512KB)")
|
||||
);
|
||||
}
|
||||
|
||||
if error_text.contains("stack overflow") {
|
||||
return Some(
|
||||
Diagnostic::new("M0002", Severity::Fatal, ErrorCategory::Memory, "Stack overflow")
|
||||
.with_explanation("The call stack exceeded its allocated size.")
|
||||
.with_fix("Increase stack size in sdkconfig")
|
||||
.with_fix("Reduce recursion depth in your code")
|
||||
.with_fix("Move large arrays to heap allocation")
|
||||
);
|
||||
}
|
||||
|
||||
// Build errors
|
||||
if error_text.contains("error[E0433]") && error_text.contains("esp_idf") {
|
||||
return Some(
|
||||
Diagnostic::new("B0001", Severity::Error, ErrorCategory::Build, "ESP-IDF crate not found")
|
||||
.with_explanation("The esp-idf-* crates are not available for your target.")
|
||||
.with_fix("Ensure you're using the ESP toolchain: rustup default esp")
|
||||
.with_fix("Check that esp feature is enabled in Cargo.toml")
|
||||
.with_fix("Run: source ~/export-esp.sh")
|
||||
);
|
||||
}
|
||||
|
||||
if error_text.contains("target may not be installed") {
|
||||
return Some(
|
||||
Diagnostic::new("B0002", Severity::Error, ErrorCategory::Build, "Target not installed")
|
||||
.with_explanation("The Rust target for your ESP32 variant is not installed.")
|
||||
.with_fix("Run: espup install")
|
||||
.with_fix("Or: rustup target add <target>")
|
||||
);
|
||||
}
|
||||
|
||||
// Network errors
|
||||
if error_text.contains("WiFi") && error_text.contains("connect") {
|
||||
return Some(
|
||||
Diagnostic::new("N0001", Severity::Error, ErrorCategory::Network, "WiFi connection failed")
|
||||
.with_explanation("Could not connect to the WiFi network.")
|
||||
.with_fix("Check SSID and password")
|
||||
.with_fix("Ensure the network is 2.4GHz (ESP32 doesn't support 5GHz)")
|
||||
.with_fix("Move closer to the access point")
|
||||
.with_fix("Check that the network is not hidden")
|
||||
);
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
/// Check system for common issues
|
||||
pub fn run_diagnostics() -> heapless::Vec<Diagnostic, 8> {
|
||||
let mut issues = heapless::Vec::new();
|
||||
|
||||
// These would be actual checks in a real implementation
|
||||
// Here we just show the structure
|
||||
|
||||
// Check available memory
|
||||
// In real impl: check heap_caps_get_free_size()
|
||||
|
||||
// Check flash size
|
||||
// In real impl: check partition table
|
||||
|
||||
// Check WiFi status
|
||||
// In real impl: check esp_wifi_get_mode()
|
||||
|
||||
issues
|
||||
}
|
||||
|
||||
/// Print diagnostic in colored format (for terminals)
|
||||
pub fn format_diagnostic_colored(diag: &Diagnostic) -> String<512> {
|
||||
let mut output = String::new();
|
||||
|
||||
let color = match diag.severity {
|
||||
Severity::Info => "\x1b[36m", // Cyan
|
||||
Severity::Warning => "\x1b[33m", // Yellow
|
||||
Severity::Error => "\x1b[31m", // Red
|
||||
Severity::Fatal => "\x1b[35m", // Magenta
|
||||
};
|
||||
let reset = "\x1b[0m";
|
||||
|
||||
let _ = core::fmt::write(
|
||||
&mut output,
|
||||
format_args!("\n{}[{}]{} {}: {}\n", color, diag.code, reset, diag.severity, diag.message)
|
||||
);
|
||||
|
||||
if !diag.explanation.is_empty() {
|
||||
let _ = core::fmt::write(&mut output, format_args!("\n {}\n", diag.explanation));
|
||||
}
|
||||
|
||||
if !diag.fixes.is_empty() {
|
||||
let _ = output.push_str("\n \x1b[32mSuggested fixes:\x1b[0m\n");
|
||||
for (i, fix) in diag.fixes.iter().enumerate() {
|
||||
let _ = core::fmt::write(&mut output, format_args!(" {}. {}\n", i + 1, fix));
|
||||
}
|
||||
}
|
||||
|
||||
output
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_diagnose_toolchain_error() {
        let diag = diagnose_error("error: espup: command not found")
            .expect("missing-espup error should be diagnosed");
        assert_eq!(diag.code.as_str(), "T0001");
    }

    #[test]
    fn test_diagnose_flash_error() {
        let diag = diagnose_error("Permission denied: /dev/ttyUSB0")
            .expect("serial permission error should be diagnosed");
        assert_eq!(diag.code.as_str(), "F0001");
    }

    #[test]
    fn test_diagnose_memory_error() {
        let diag = diagnose_error("panicked at 'alloc error'")
            .expect("allocator panic should be diagnosed");
        assert_eq!(diag.code.as_str(), "M0001");
    }
}
|
||||
333
vendor/ruvector/examples/ruvLLM/esp32/src/embedding.rs
vendored
Normal file
333
vendor/ruvector/examples/ruvLLM/esp32/src/embedding.rs
vendored
Normal file
@@ -0,0 +1,333 @@
|
||||
//! Embedding operations for ESP32
|
||||
//!
|
||||
//! Provides efficient token embedding lookup and positional encoding.
|
||||
|
||||
use heapless::Vec as HVec;
|
||||
|
||||
/// Maximum embedding dimension (also sizes the RoPE sin/cos caches below)
pub const MAX_EMBED_DIM: usize = 128;
/// Maximum vocabulary size for stack allocation
pub const MAX_VOCAB: usize = 2048;
|
||||
|
||||
/// Embedding table with INT8 quantization.
///
/// Rows are stored contiguously: token `t`'s vector occupies
/// `weights[t * embed_dim .. (t + 1) * embed_dim]`.
///
/// NOTE(review): the `VOCAB`/`DIM` const generics are not used by any field —
/// capacity is the fixed 64KB backing store and the runtime `vocab_size` /
/// `embed_dim` fields are authoritative. Consider either wiring the const
/// generics into the storage size or dropping them.
pub struct EmbeddingTable<const VOCAB: usize, const DIM: usize> {
    /// Flattened embedding weights [VOCAB * DIM]
    weights: HVec<i8, { 64 * 1024 }>, // Max 64KB
    /// Vocabulary size (number of valid token IDs)
    vocab_size: usize,
    /// Embedding dimension (length of each row)
    embed_dim: usize,
    /// Scale factor for dequantization (INT8 -> float, fixed at 1/127)
    scale: f32,
}
|
||||
|
||||
impl<const VOCAB: usize, const DIM: usize> EmbeddingTable<VOCAB, DIM> {
|
||||
/// Create new embedding table from weights
|
||||
pub fn new(weights: &[i8], vocab_size: usize, embed_dim: usize) -> crate::Result<Self> {
|
||||
if weights.len() != vocab_size * embed_dim {
|
||||
return Err(crate::Error::InvalidModel("Weight size mismatch"));
|
||||
}
|
||||
|
||||
let mut table_weights = HVec::new();
|
||||
for &w in weights {
|
||||
table_weights.push(w).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
weights: table_weights,
|
||||
vocab_size,
|
||||
embed_dim,
|
||||
scale: 1.0 / 127.0,
|
||||
})
|
||||
}
|
||||
|
||||
/// Create random embedding table for testing
|
||||
pub fn random(vocab_size: usize, embed_dim: usize, seed: u32) -> crate::Result<Self> {
|
||||
let mut weights = HVec::new();
|
||||
let mut rng_state = seed;
|
||||
|
||||
for _ in 0..(vocab_size * embed_dim) {
|
||||
rng_state = rng_state.wrapping_mul(1103515245).wrapping_add(12345);
|
||||
let val = ((rng_state >> 16) & 0xFF) as i8;
|
||||
weights.push(val).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
weights,
|
||||
vocab_size,
|
||||
embed_dim,
|
||||
scale: 1.0 / 127.0,
|
||||
})
|
||||
}
|
||||
|
||||
/// Look up embedding for a token
|
||||
#[inline]
|
||||
pub fn lookup(&self, token_id: u16, output: &mut [i8]) -> crate::Result<()> {
|
||||
let id = token_id as usize;
|
||||
if id >= self.vocab_size {
|
||||
return Err(crate::Error::InvalidModel("Token ID out of range"));
|
||||
}
|
||||
|
||||
let start = id * self.embed_dim;
|
||||
let end = start + self.embed_dim;
|
||||
|
||||
if output.len() < self.embed_dim {
|
||||
return Err(crate::Error::BufferOverflow);
|
||||
}
|
||||
|
||||
output[..self.embed_dim].copy_from_slice(&self.weights[start..end]);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Look up embedding and add to existing buffer (for accumulation)
|
||||
#[inline]
|
||||
pub fn lookup_add(&self, token_id: u16, output: &mut [i32]) -> crate::Result<()> {
|
||||
let id = token_id as usize;
|
||||
if id >= self.vocab_size {
|
||||
return Err(crate::Error::InvalidModel("Token ID out of range"));
|
||||
}
|
||||
|
||||
let start = id * self.embed_dim;
|
||||
|
||||
for i in 0..self.embed_dim {
|
||||
output[i] += self.weights[start + i] as i32;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Memory size in bytes
|
||||
pub fn memory_size(&self) -> usize {
|
||||
self.weights.len()
|
||||
}
|
||||
}
|
||||
|
||||
/// Rotary Position Embedding (RoPE) for ESP32
///
/// Uses fixed-point arithmetic for sin/cos computation.
///
/// NOTE(review): the caches hold values for a single position only — the
/// most recent one passed to `update_cache` — not a table over positions;
/// verify this matches the intended RoPE usage.
pub struct RotaryEmbedding {
    /// Dimension (must be even)
    dim: usize,
    /// Base frequency (e.g. 10000 in standard RoPE)
    base: u32,
    /// Precomputed sin values (fixed-point, scaled by 128)
    sin_cache: [i8; MAX_EMBED_DIM],
    /// Precomputed cos values (fixed-point, scaled by 128)
    cos_cache: [i8; MAX_EMBED_DIM],
    /// Maximum cached position (last position fed to `update_cache`)
    max_cached_pos: usize,
}
|
||||
|
||||
impl RotaryEmbedding {
    /// Create new RoPE with given dimension and base frequency.
    /// Caches start zeroed; call `update_cache` before `apply`.
    pub fn new(dim: usize, base: u32) -> Self {
        Self {
            dim,
            base,
            sin_cache: [0i8; MAX_EMBED_DIM],
            cos_cache: [0i8; MAX_EMBED_DIM],
            max_cached_pos: 0,
        }
    }

    /// Recompute the sin/cos caches for position `pos`.
    ///
    /// No-op when `pos` does not advance past the last cached position.
    /// NOTE(review): this overwrites the caches with values for `pos` only,
    /// so `apply` always uses the most recently cached position.
    pub fn update_cache(&mut self, pos: usize) {
        if pos <= self.max_cached_pos {
            return;
        }

        // Compute frequency for each dimension pair
        for i in 0..(self.dim / 2) {
            // freq = 1 / (base^(2i/dim))
            // For INT8, we approximate using lookup table or simple formula

            // Simplified: use position-dependent rotation
            // angle = pos / (base^(i / (dim/2)))
            let freq_scale = ((i * 256) / (self.dim / 2)) as u32;
            let angle = ((pos as u32 * 256) / (self.base + freq_scale)) as i32;

            // Approximate sin/cos using polynomial
            // sin(x) ≈ x - x³/6 for small x (scaled)
            // cos(x) ≈ 1 - x²/2 for small x (scaled)
            let x = (angle % 256) as i32 - 128; // Center around 0

            // Simple quadrant-based approximation: sin is linear in x,
            // cos peaks at x == 0; both scaled to the i8 [-127, 127] range.
            let sin_val = (x * 127 / 128).clamp(-127, 127) as i8;
            let cos_val = ((128 - x.abs()) * 127 / 128).clamp(-127, 127) as i8;

            // Duplicate into both halves so `apply` can index either half
            // with the same offset.
            self.sin_cache[i] = sin_val;
            self.cos_cache[i] = cos_val;
            self.sin_cache[i + self.dim / 2] = sin_val;
            self.cos_cache[i + self.dim / 2] = cos_val;
        }

        self.max_cached_pos = pos;
    }

    /// Apply rotary embedding to a query/key vector in place.
    ///
    /// NOTE(review): `_pos` is unused — the rotation always uses whatever
    /// position was last cached via `update_cache`. Confirm callers keep
    /// the cache in sync with the position they pass here.
    #[inline]
    pub fn apply(&self, x: &mut [i8], _pos: usize) {
        let half_dim = self.dim / 2;

        // Process pairs of dimensions: (x[i], x[i + dim/2]).
        for i in 0..half_dim {
            let x1 = x[i] as i32;
            let x2 = x[i + half_dim] as i32;

            let sin = self.sin_cache[i] as i32;
            let cos = self.cos_cache[i] as i32;

            // Rotation: [cos, -sin; sin, cos] @ [x1, x2]
            // `>> 7` undoes the 128x fixed-point scale of sin/cos.
            let new_x1 = (x1 * cos - x2 * sin) >> 7;
            let new_x2 = (x1 * sin + x2 * cos) >> 7;

            x[i] = new_x1.clamp(-128, 127) as i8;
            x[i + half_dim] = new_x2.clamp(-128, 127) as i8;
        }
    }
}
|
||||
|
||||
/// Simple positional encoding using learned embeddings.
///
/// Storage is row-major: position `p` occupies
/// `embeddings[p * dim .. (p + 1) * dim]`.
/// NOTE(review): like `EmbeddingTable`, the `MAX_LEN`/`DIM` const generics
/// are not used by the fields — the runtime `max_len`/`dim` govern layout.
pub struct LearnedPositionalEmbedding<const MAX_LEN: usize, const DIM: usize> {
    /// Position embeddings [MAX_LEN * DIM]
    embeddings: HVec<i8, { 8 * 1024 }>, // Max 8KB for positions
    /// Maximum sequence length
    max_len: usize,
    /// Embedding dimension
    dim: usize,
}
|
||||
|
||||
impl<const MAX_LEN: usize, const DIM: usize> LearnedPositionalEmbedding<MAX_LEN, DIM> {
|
||||
/// Create random positional embeddings
|
||||
pub fn random(max_len: usize, dim: usize, seed: u32) -> crate::Result<Self> {
|
||||
let mut embeddings = HVec::new();
|
||||
let mut rng_state = seed;
|
||||
|
||||
for _ in 0..(max_len * dim) {
|
||||
rng_state = rng_state.wrapping_mul(1103515245).wrapping_add(12345);
|
||||
// Smaller values for positional embeddings
|
||||
let val = (((rng_state >> 16) & 0x3F) as i8) - 32;
|
||||
embeddings.push(val).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
embeddings,
|
||||
max_len,
|
||||
dim,
|
||||
})
|
||||
}
|
||||
|
||||
/// Add positional embedding to input
|
||||
#[inline]
|
||||
pub fn add_to(&self, input: &mut [i8], pos: usize) -> crate::Result<()> {
|
||||
if pos >= self.max_len {
|
||||
return Err(crate::Error::BufferOverflow);
|
||||
}
|
||||
|
||||
let start = pos * self.dim;
|
||||
for i in 0..self.dim {
|
||||
let sum = input[i] as i32 + self.embeddings[start + i] as i32;
|
||||
input[i] = sum.clamp(-128, 127) as i8;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Memory size in bytes
|
||||
pub fn memory_size(&self) -> usize {
|
||||
self.embeddings.len()
|
||||
}
|
||||
}
|
||||
|
||||
/// Byte-Pair Encoding tokenizer (simplified)
///
/// For ESP32, we use a simple character-level or small vocabulary tokenizer.
/// Both mapping tables are dense 256-entry arrays so lookup is a single
/// index, no hashing.
pub struct SimpleTokenizer {
    /// Character (byte) to token ID mapping; bytes >= 128 map to UNK (127)
    char_to_id: [u16; 256],
    /// Token ID to character (byte) mapping
    id_to_char: [u8; 256],
    /// Vocabulary size (number of valid token IDs)
    vocab_size: usize,
}
|
||||
|
||||
impl SimpleTokenizer {
|
||||
/// Create ASCII tokenizer (vocabulary = 128)
|
||||
pub fn ascii() -> Self {
|
||||
let mut char_to_id = [0u16; 256];
|
||||
let mut id_to_char = [0u8; 256];
|
||||
|
||||
for i in 0..128 {
|
||||
char_to_id[i] = i as u16;
|
||||
id_to_char[i] = i as u8;
|
||||
}
|
||||
|
||||
// Map non-ASCII to UNK (127)
|
||||
for i in 128..256 {
|
||||
char_to_id[i] = 127;
|
||||
}
|
||||
|
||||
Self {
|
||||
char_to_id,
|
||||
id_to_char,
|
||||
vocab_size: 128,
|
||||
}
|
||||
}
|
||||
|
||||
/// Tokenize a string
|
||||
pub fn encode(&self, text: &str) -> HVec<u16, 128> {
|
||||
let mut tokens = HVec::new();
|
||||
for byte in text.bytes() {
|
||||
let _ = tokens.push(self.char_to_id[byte as usize]);
|
||||
}
|
||||
tokens
|
||||
}
|
||||
|
||||
/// Decode tokens to string
|
||||
pub fn decode(&self, tokens: &[u16]) -> HVec<u8, 128> {
|
||||
let mut chars = HVec::new();
|
||||
for &token in tokens {
|
||||
if (token as usize) < self.vocab_size {
|
||||
let _ = chars.push(self.id_to_char[token as usize]);
|
||||
}
|
||||
}
|
||||
chars
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // Smoke tests for the embedding, RoPE, and tokenizer primitives above.

    #[test]
    fn test_embedding_lookup() {
        let embed: EmbeddingTable<256, 64> = EmbeddingTable::random(256, 64, 42).unwrap();

        let mut output = [0i8; 64];
        embed.lookup(10, &mut output).unwrap();

        // Should be non-zero
        assert!(output.iter().any(|&x| x != 0));
    }

    #[test]
    fn test_rotary_embedding() {
        let mut rope = RotaryEmbedding::new(32, 10000);
        rope.update_cache(10);

        let mut x = [64i8; 32];
        rope.apply(&mut x, 5);

        // Values should change after rotation
        assert!(x.iter().any(|&v| v != 64));
    }

    #[test]
    fn test_tokenizer() {
        let tokenizer = SimpleTokenizer::ascii();

        // ASCII round-trip: one token per byte, lossless decode.
        let tokens = tokenizer.encode("Hello");
        assert_eq!(tokens.len(), 5);

        let decoded = tokenizer.decode(&tokens);
        assert_eq!(&decoded[..], b"Hello");
    }
}
|
||||
401
vendor/ruvector/examples/ruvLLM/esp32/src/federation/coordinator.rs
vendored
Normal file
401
vendor/ruvector/examples/ruvLLM/esp32/src/federation/coordinator.rs
vendored
Normal file
@@ -0,0 +1,401 @@
|
||||
//! Federation Coordinator - Cluster Management
|
||||
//!
|
||||
//! Manages the multi-chip cluster with self-learning optimization.
|
||||
//! Integrates MicroLoRA for distributed fine-tuning.
|
||||
|
||||
use super::protocol::{ChipId, FederationMessage, MessageType, CommStats};
|
||||
use super::{FederationConfig, FederationMode, FederationSpeedup, estimate_speedup};
|
||||
use crate::optimizations::micro_lora::{MicroLoRA, LoRAConfig, LoRAStack};
|
||||
|
||||
/// Maximum chips in cluster (sizes the coordinator's fixed status array)
pub const MAX_CLUSTER_SIZE: usize = 8;
|
||||
|
||||
/// Cluster topology — how messages flow between chips.
/// Chosen automatically from the federation mode (see
/// `FederationCoordinator::optimal_topology`).
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum ClusterTopology {
    /// Linear pipeline: 0 -> 1 -> 2 -> 3 -> 4
    Linear,
    /// Ring: 0 -> 1 -> 2 -> 3 -> 4 -> 0
    Ring,
    /// Star: 0 <-> all others
    Star,
    /// Mesh: all-to-all
    Mesh,
}
|
||||
|
||||
/// Chip status in cluster, as tracked by the coordinator.
/// Updated from heartbeat messages and local bookkeeping calls.
#[derive(Debug, Clone)]
pub struct ChipStatus {
    /// Chip ID
    pub id: ChipId,
    /// Is chip active (false after a heartbeat timeout)
    pub active: bool,
    /// Last heartbeat time (in coordinator ticks)
    pub last_heartbeat: u32,
    /// Current load (0-255)
    pub load: u8,
    /// Memory used (KB)
    pub memory_used_kb: u16,
    /// Tokens processed
    pub tokens_processed: u32,
}
|
||||
|
||||
/// Self-learning state for optimization (drives LoRA fine-tuning).
#[derive(Debug, Clone)]
pub struct SelfLearningState {
    /// Learning rate for LoRA updates (adapted every 100 steps)
    pub learning_rate: i8,
    /// Gradient accumulation counter
    pub gradient_steps: u32,
    /// Exponential moving average of loss (fixed-point);
    /// `i32::MAX` means "no loss observed yet"
    pub avg_loss: i32,
    /// Best (lowest) loss seen; `i32::MAX` until first update
    pub best_loss: i32,
    /// Adaptation enabled (set by `init_distributed_lora`)
    pub enabled: bool,
}
|
||||
|
||||
impl Default for SelfLearningState {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
learning_rate: 4,
|
||||
gradient_steps: 0,
|
||||
avg_loss: i32::MAX,
|
||||
best_loss: i32::MAX,
|
||||
enabled: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Federation coordinator — per-chip view of the multi-chip cluster.
/// Tracks peer liveness, communication stats, and optional distributed
/// LoRA self-learning state.
pub struct FederationCoordinator {
    /// This coordinator's chip ID
    chip_id: ChipId,
    /// Is this the master coordinator
    is_master: bool,
    /// Cluster configuration
    config: FederationConfig,
    /// Topology (derived from the federation mode at construction)
    topology: ClusterTopology,
    /// Status of all chips; `None` for slots beyond `config.num_chips`
    chip_status: [Option<ChipStatus>; MAX_CLUSTER_SIZE],
    /// Communication stats
    comm_stats: CommStats,
    /// Self-learning state
    learning: SelfLearningState,
    /// Distributed LoRA adapters (one per layer shard, up to 4)
    lora_stack: Option<LoRAStack<4>>,
    /// Current tick (for heartbeat timeouts)
    current_tick: u32,
    /// Sequence counter for outgoing messages
    seq_counter: u16,
}
|
||||
|
||||
impl FederationCoordinator {
    /// Create a new coordinator. Pre-populates status slots for the first
    /// `config.num_chips` chips; only this chip starts as active.
    pub fn new(config: FederationConfig, is_master: bool) -> Self {
        let chip_status = core::array::from_fn(|i| {
            if i < config.num_chips {
                Some(ChipStatus {
                    id: ChipId(i as u8),
                    // Only self is active until peers send heartbeats.
                    active: i == config.chip_id.0 as usize,
                    last_heartbeat: 0,
                    load: 0,
                    memory_used_kb: 0,
                    tokens_processed: 0,
                })
            } else {
                None
            }
        });

        Self {
            chip_id: config.chip_id,
            is_master,
            // Topology is derived before `config` is moved into the struct.
            topology: Self::optimal_topology(&config),
            config,
            chip_status,
            comm_stats: CommStats::default(),
            learning: SelfLearningState::default(),
            lora_stack: None,
            current_tick: 0,
            seq_counter: 0,
        }
    }

    /// Map a federation mode to its best-suited topology.
    fn optimal_topology(config: &FederationConfig) -> ClusterTopology {
        match config.mode {
            FederationMode::Pipeline => ClusterTopology::Linear,
            FederationMode::TensorParallel => ClusterTopology::Star,
            FederationMode::Speculative => ClusterTopology::Star,
            FederationMode::MixtureOfExperts => ClusterTopology::Mesh,
            _ => ClusterTopology::Linear,
        }
    }

    /// Initialize distributed LoRA for self-learning.
    ///
    /// Creates one rank-1 adapter per local layer shard (capped at the
    /// stack's capacity of 4) and enables the learning state.
    pub fn init_distributed_lora(&mut self, dim: usize, seed: u32) -> crate::Result<()> {
        let lora_config = LoRAConfig {
            rank: 1, // Minimal rank for distributed
            dim,
            scale: 8,
            frozen: false,
        };

        let mut stack = LoRAStack::new();

        // Each chip gets LoRA for its assigned layers
        let layers_per_chip = self.config.layers_per_chip;
        for i in 0..layers_per_chip.min(4) {
            // Distinct seed per layer so adapters are decorrelated.
            let layer_seed = seed.wrapping_add(i as u32 * 1000);
            let adapter = MicroLoRA::new(lora_config, layer_seed)?;
            stack.add_adapter(i, adapter)?;
        }

        self.lora_stack = Some(stack);
        self.learning.enabled = true;

        Ok(())
    }

    /// Process tick (call regularly). Advances the clock and marks any
    /// chip silent for > 1000 ticks as inactive.
    /// NOTE(review): this chip's own slot also times out unless its
    /// heartbeat is refreshed — confirm that is intended.
    pub fn tick(&mut self) {
        self.current_tick += 1;

        // Check for timeouts
        for status in self.chip_status.iter_mut().flatten() {
            if self.current_tick - status.last_heartbeat > 1000 {
                status.active = false;
            }
        }
    }

    /// Handle a received message; returns an optional reply to send.
    /// Heartbeats refresh peer liveness, Discovery is answered with our
    /// own heartbeat, Barrier is acknowledged; everything else is dropped.
    pub fn handle_message(&mut self, msg: &FederationMessage) -> Option<FederationMessage> {
        self.comm_stats.messages_received += 1;
        self.comm_stats.bytes_received += msg.payload.len() as u32;

        let msg_type = MessageType::from(msg.header.msg_type);

        match msg_type {
            MessageType::Heartbeat => {
                // Update chip status
                let src = msg.header.src as usize;
                if let Some(status) = self.chip_status.get_mut(src).and_then(|s| s.as_mut()) {
                    status.active = true;
                    status.last_heartbeat = self.current_tick;
                }
                None
            }

            MessageType::Discovery => {
                // Respond with our status
                Some(self.create_heartbeat())
            }

            MessageType::Barrier => {
                // Acknowledge barrier, echoing the sender's sequence number.
                Some(FederationMessage::new(
                    MessageType::Ack,
                    self.chip_id,
                    ChipId(msg.header.src),
                    msg.header.seq,
                ))
            }

            _ => None,
        }
    }

    /// Create a broadcast heartbeat carrying this chip's load and memory
    /// usage (little-endian u16) in the payload.
    pub fn create_heartbeat(&mut self) -> FederationMessage {
        self.seq_counter += 1;
        let mut msg = FederationMessage::new(
            MessageType::Heartbeat,
            self.chip_id,
            ChipId::BROADCAST,
            self.seq_counter,
        );

        // Add load info to payload: [load, mem_lo, mem_hi]
        if let Some(status) = &self.chip_status[self.chip_id.0 as usize] {
            let _ = msg.payload.push(status.load);
            let _ = msg.payload.push((status.memory_used_kb & 0xFF) as u8);
            let _ = msg.payload.push((status.memory_used_kb >> 8) as u8);
        }
        msg.header.payload_len = msg.payload.len() as u16;
        msg.update_checksum();

        self.comm_stats.messages_sent += 1;
        msg
    }

    /// Get number of active chips.
    pub fn active_chip_count(&self) -> usize {
        self.chip_status.iter().filter(|s| s.as_ref().is_some_and(|s| s.active)).count()
    }

    /// Estimate current speedup based on the number of active chips
    /// (re-runs the static estimate with the live chip count).
    pub fn current_speedup(&self) -> FederationSpeedup {
        let active = self.active_chip_count();
        let mut effective_config = self.config.clone();
        effective_config.num_chips = active;
        estimate_speedup(&effective_config)
    }

    /// Update learning state with a new loss sample.
    ///
    /// Maintains an EMA of the loss (15/16 old + 1/16 new), tracks the
    /// best loss, and every 100 steps nudges the learning rate up (to 16)
    /// on good progress or down (to 1) otherwise.
    pub fn update_learning(&mut self, loss: i32) {
        if !self.learning.enabled {
            return;
        }

        self.learning.gradient_steps += 1;

        // Exponential moving average of loss; i32::MAX means "first sample".
        if self.learning.avg_loss == i32::MAX {
            self.learning.avg_loss = loss;
        } else {
            self.learning.avg_loss = (self.learning.avg_loss * 15 + loss) / 16;
        }

        // Track best
        if loss < self.learning.best_loss {
            self.learning.best_loss = loss;
        }

        // Adaptive learning rate: "good progress" = EMA within 10% of best.
        if self.learning.gradient_steps % 100 == 0 {
            if self.learning.avg_loss < self.learning.best_loss * 11 / 10 {
                // Good progress, increase LR
                self.learning.learning_rate = (self.learning.learning_rate + 1).min(16);
            } else {
                // Slow progress, decrease LR
                self.learning.learning_rate = (self.learning.learning_rate - 1).max(1);
            }
        }
    }

    /// Apply a distributed LoRA gradient update to one layer's adapter.
    /// Compiled out when the "frozen" feature is enabled.
    #[cfg(not(feature = "frozen"))]
    pub fn apply_lora_gradient(
        &mut self,
        layer_idx: usize,
        input: &[i8],
        grad_output: &[i32],
    ) {
        if let Some(ref mut stack) = self.lora_stack {
            if let Some(lora) = stack.get(layer_idx) {
                lora.update(input, grad_output, self.learning.learning_rate);
            }
        }
    }

    /// Get the LoRA adapter for a layer, if distributed LoRA is initialized.
    pub fn get_lora(&mut self, layer_idx: usize) -> Option<&mut MicroLoRA> {
        self.lora_stack.as_mut()?.get(layer_idx)
    }

    /// Snapshot cluster-wide statistics (aggregated over all known chips).
    pub fn stats(&self) -> ClusterStats {
        let total_tokens: u32 = self.chip_status.iter()
            .filter_map(|s| s.as_ref())
            .map(|s| s.tokens_processed)
            .sum();

        let total_memory: u32 = self.chip_status.iter()
            .filter_map(|s| s.as_ref())
            .map(|s| s.memory_used_kb as u32)
            .sum();

        ClusterStats {
            active_chips: self.active_chip_count(),
            total_chips: self.config.num_chips,
            total_tokens_processed: total_tokens,
            total_memory_kb: total_memory,
            messages_sent: self.comm_stats.messages_sent,
            messages_received: self.comm_stats.messages_received,
            current_speedup: self.current_speedup(),
            learning_enabled: self.learning.enabled,
            learning_rate: self.learning.learning_rate,
            avg_loss: self.learning.avg_loss,
        }
    }

    /// Add to this chip's processed-token counter.
    pub fn record_tokens(&mut self, count: u32) {
        if let Some(status) = self.chip_status.get_mut(self.chip_id.0 as usize).and_then(|s| s.as_mut()) {
            status.tokens_processed += count;
        }
    }

    /// Record this chip's current memory usage (absolute, not cumulative).
    pub fn update_memory_usage(&mut self, kb: u16) {
        if let Some(status) = self.chip_status.get_mut(self.chip_id.0 as usize).and_then(|s| s.as_mut()) {
            status.memory_used_kb = kb;
        }
    }
}
|
||||
|
||||
/// Cluster statistics snapshot, produced by `FederationCoordinator::stats`.
#[derive(Debug, Clone)]
pub struct ClusterStats {
    /// Active chips (heartbeat seen recently)
    pub active_chips: usize,
    /// Total chips configured
    pub total_chips: usize,
    /// Total tokens processed across all chips
    pub total_tokens_processed: u32,
    /// Total memory used across all chips (KB)
    pub total_memory_kb: u32,
    /// Messages sent by this coordinator
    pub messages_sent: u32,
    /// Messages received by this coordinator
    pub messages_received: u32,
    /// Current speedup estimate (based on active chips)
    pub current_speedup: FederationSpeedup,
    /// Self-learning enabled
    pub learning_enabled: bool,
    /// Current learning rate
    pub learning_rate: i8,
    /// Average loss (EMA; `i32::MAX` if no loss recorded yet)
    pub avg_loss: i32,
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_coordinator_creation() {
        let config = FederationConfig::default();
        let coord = FederationCoordinator::new(config, true);

        assert_eq!(coord.active_chip_count(), 1); // Only self is active initially
    }

    #[test]
    fn test_distributed_lora() {
        let config = FederationConfig::default();
        let mut coord = FederationCoordinator::new(config, true);

        coord.init_distributed_lora(32, 42).unwrap();

        // Initialization must enable learning and install layer-0 adapter.
        assert!(coord.learning.enabled);
        assert!(coord.get_lora(0).is_some());
    }

    #[test]
    fn test_learning_update() {
        let config = FederationConfig::default();
        let mut coord = FederationCoordinator::new(config, true);
        coord.learning.enabled = true;

        // Feed a decreasing loss sequence.
        coord.update_learning(1000);
        coord.update_learning(900);
        coord.update_learning(800);

        // EMA must have moved below the first sample; best tracks the min.
        assert!(coord.learning.avg_loss < 1000);
        assert_eq!(coord.learning.best_loss, 800);
    }
}
|
||||
344
vendor/ruvector/examples/ruvLLM/esp32/src/federation/fastgrnn_router.rs
vendored
Normal file
344
vendor/ruvector/examples/ruvLLM/esp32/src/federation/fastgrnn_router.rs
vendored
Normal file
@@ -0,0 +1,344 @@
|
||||
//! FastGRNN-Inspired Micro Router for ESP32
|
||||
//!
|
||||
//! Lightweight gated routing for dynamic chip selection.
|
||||
//! Adapted from ruvector's FastGRNN for minimal compute overhead.
|
||||
//!
|
||||
//! Key differences from full FastGRNN:
|
||||
//! - INT8 weights instead of FP32
|
||||
//! - Fixed-point gate computation
|
||||
//! - Minimal hidden dimension (4-8)
|
||||
|
||||
use heapless::Vec as HVec;
|
||||
use super::protocol::ChipId;
|
||||
|
||||
/// Maximum hidden dimension for micro router (caps heapless buffer sizes)
pub const MAX_ROUTER_HIDDEN: usize = 8;
/// Maximum input features accepted by one router step
pub const MAX_ROUTER_INPUT: usize = 16;
|
||||
|
||||
/// Micro FastGRNN configuration.
/// NOTE(review): the score buffers in `route`/`route_probs` are fixed at 8
/// entries, so `num_chips` is assumed to be <= 8 — confirm callers respect
/// that bound.
#[derive(Debug, Clone, Copy)]
pub struct MicroGRNNConfig {
    /// Input dimension
    pub input_dim: usize,
    /// Hidden dimension
    pub hidden_dim: usize,
    /// Number of output classes (chips)
    pub num_chips: usize,
    /// Zeta parameter (gate scaling, fixed-point /16)
    pub zeta: i8,
    /// Nu parameter (update scaling, fixed-point /16)
    pub nu: i8,
}
|
||||
|
||||
impl Default for MicroGRNNConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
input_dim: 8,
|
||||
hidden_dim: 4,
|
||||
num_chips: 5,
|
||||
zeta: 16,
|
||||
nu: 16,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Micro FastGRNN cell for routing decisions.
/// All weights are INT8; the hidden state is kept in INT32 fixed point.
pub struct MicroFastGRNN {
    config: MicroGRNNConfig,
    /// Gate weights: W_g [input_dim * hidden_dim] + U_g [hidden_dim * hidden_dim]
    w_gate: HVec<i8, 128>,
    u_gate: HVec<i8, 64>,
    /// Update weights: W_u, U_u (same shapes as the gate weights)
    w_update: HVec<i8, 128>,
    u_update: HVec<i8, 64>,
    /// Biases (one per hidden unit)
    bias_gate: HVec<i8, MAX_ROUTER_HIDDEN>,
    bias_update: HVec<i8, MAX_ROUTER_HIDDEN>,
    /// Output projection to chips [num_chips * hidden_dim]
    w_output: HVec<i8, 64>,
    /// Hidden state (INT32 fixed point), carried across `step` calls
    hidden: HVec<i32, MAX_ROUTER_HIDDEN>,
}
|
||||
|
||||
impl MicroFastGRNN {
    /// Create a new micro FastGRNN with deterministic pseudo-random weights
    /// (LCG seeded with `seed`, values roughly in [-32, 31]); biases and
    /// hidden state start at zero.
    pub fn new(config: MicroGRNNConfig, seed: u32) -> crate::Result<Self> {
        let mut rng_state = seed;
        let mut next_rand = || {
            rng_state = rng_state.wrapping_mul(1103515245).wrapping_add(12345);
            (((rng_state >> 16) & 0x3F) as i16 - 32) as i8
        };

        // Initialize weights
        let gate_size = config.input_dim * config.hidden_dim;
        let hidden_size = config.hidden_dim * config.hidden_dim;
        let output_size = config.hidden_dim * config.num_chips;

        let mut w_gate = HVec::new();
        let mut u_gate = HVec::new();
        let mut w_update = HVec::new();
        let mut u_update = HVec::new();
        let mut w_output = HVec::new();
        let mut bias_gate = HVec::new();
        let mut bias_update = HVec::new();
        let mut hidden = HVec::new();

        for _ in 0..gate_size {
            w_gate.push(next_rand()).map_err(|_| crate::Error::BufferOverflow)?;
            w_update.push(next_rand()).map_err(|_| crate::Error::BufferOverflow)?;
        }
        for _ in 0..hidden_size {
            u_gate.push(next_rand()).map_err(|_| crate::Error::BufferOverflow)?;
            u_update.push(next_rand()).map_err(|_| crate::Error::BufferOverflow)?;
        }
        for _ in 0..output_size {
            w_output.push(next_rand()).map_err(|_| crate::Error::BufferOverflow)?;
        }
        for _ in 0..config.hidden_dim {
            bias_gate.push(0).map_err(|_| crate::Error::BufferOverflow)?;
            bias_update.push(0).map_err(|_| crate::Error::BufferOverflow)?;
            hidden.push(0).map_err(|_| crate::Error::BufferOverflow)?;
        }

        Ok(Self {
            config,
            w_gate,
            u_gate,
            w_update,
            u_update,
            bias_gate,
            bias_update,
            w_output,
            hidden,
        })
    }

    /// Reset hidden state to zero (start of a new routing sequence).
    pub fn reset(&mut self) {
        for h in self.hidden.iter_mut() {
            *h = 0;
        }
    }

    /// Fixed-point sigmoid approximation.
    /// Output is in [0, 256], representing [0, 1]; linear on [-512, 512].
    #[inline]
    fn sigmoid_fp(x: i32) -> i32 {
        // Piecewise linear sigmoid: clamp to [0, 256] representing [0, 1]
        if x < -512 { 0 }
        else if x > 512 { 256 }
        else { (x + 512) >> 2 }
    }

    /// Fixed-point tanh approximation.
    /// Output is in [-256, 256], representing [-1, 1]; linear on [-512, 512].
    #[inline]
    fn tanh_fp(x: i32) -> i32 {
        // Piecewise linear tanh: clamp to [-256, 256] representing [-1, 1]
        if x < -512 { -256 }
        else if x > 512 { 256 }
        else { x >> 1 }
    }

    /// Matrix-vector multiply (INT8 weights, INT32 accumulator).
    /// The `c < input.len()` guard tolerates inputs shorter than `cols`
    /// by treating missing entries as zero; the result is scaled down by 256.
    fn matmul(&self, weights: &[i8], input: &[i32], rows: usize, cols: usize) -> HVec<i32, MAX_ROUTER_HIDDEN> {
        let mut output = HVec::new();

        for r in 0..rows {
            let mut sum: i32 = 0;
            for c in 0..cols {
                if c < input.len() {
                    sum += weights[r * cols + c] as i32 * input[c];
                }
            }
            let _ = output.push(sum >> 8); // Scale down
        }

        output
    }

    /// One step of FastGRNN computation
    ///
    /// h_new = (1 - z) ⊙ h + z ⊙ tanh(W_u*x + U_u*h + b_u)
    /// where z = sigmoid(W_g*x + U_g*h + b_g)
    pub fn step(&mut self, input: &[i8]) -> crate::Result<()> {
        // Convert input to i32, truncated to input_dim and scaled up 16x
        // to match the fixed-point range of the hidden state.
        let input_i32: HVec<i32, MAX_ROUTER_INPUT> = input.iter()
            .take(self.config.input_dim)
            .map(|&x| x as i32 * 16) // Scale up
            .collect();

        // Compute gate: z = sigmoid(W_g * x + U_g * h + b_g)
        let wx_gate = self.matmul(&self.w_gate, &input_i32, self.config.hidden_dim, self.config.input_dim);
        let uh_gate = self.matmul(&self.u_gate, &self.hidden, self.config.hidden_dim, self.config.hidden_dim);

        let mut gate = HVec::<i32, MAX_ROUTER_HIDDEN>::new();
        for i in 0..self.config.hidden_dim {
            let wx = wx_gate.get(i).copied().unwrap_or(0);
            let uh = uh_gate.get(i).copied().unwrap_or(0);
            let b = self.bias_gate.get(i).copied().unwrap_or(0) as i32 * 16;
            // zeta/16 acts as a fixed-point gate scale.
            let z = Self::sigmoid_fp((wx + uh + b) * self.config.zeta as i32 / 16);
            let _ = gate.push(z);
        }

        // Compute update: u = tanh(W_u * x + U_u * h + b_u)
        let wx_update = self.matmul(&self.w_update, &input_i32, self.config.hidden_dim, self.config.input_dim);
        let uh_update = self.matmul(&self.u_update, &self.hidden, self.config.hidden_dim, self.config.hidden_dim);

        // Update hidden state: h = (1 - z) * h + z * u
        for i in 0..self.config.hidden_dim {
            let wx = wx_update.get(i).copied().unwrap_or(0);
            let uh = uh_update.get(i).copied().unwrap_or(0);
            let b = self.bias_update.get(i).copied().unwrap_or(0) as i32 * 16;
            let u = Self::tanh_fp((wx + uh + b) * self.config.nu as i32 / 16);

            // Missing gate entries default to 128 (i.e. z = 0.5).
            let z = gate.get(i).copied().unwrap_or(128);
            let h = self.hidden.get(i).copied().unwrap_or(0);

            // h_new = (256 - z) * h / 256 + z * u / 256
            let h_new = ((256 - z) * h + z * u) >> 8;
            self.hidden[i] = h_new;
        }

        Ok(())
    }

    /// Get routing decision (which chip to use) — argmax of the output
    /// projection over the current hidden state.
    /// NOTE(review): `scores` is a fixed [i32; 8]; assumes num_chips <= 8.
    pub fn route(&self) -> ChipId {
        // Output projection: scores = W_o * hidden
        let mut scores = [0i32; 8];

        for chip in 0..self.config.num_chips {
            let mut sum: i32 = 0;
            for h in 0..self.config.hidden_dim {
                let w_idx = chip * self.config.hidden_dim + h;
                let w = self.w_output.get(w_idx).copied().unwrap_or(0) as i32;
                let hidden = self.hidden.get(h).copied().unwrap_or(0);
                sum += w * hidden;
            }
            scores[chip] = sum;
        }

        // Find argmax
        let mut best_chip = 0;
        let mut best_score = scores[0];
        for (i, &score) in scores[..self.config.num_chips].iter().enumerate() {
            if score > best_score {
                best_score = score;
                best_chip = i;
            }
        }

        ChipId(best_chip as u8)
    }

    /// Get routing probabilities (softmax-like), one u8 per chip summing
    /// to roughly 255. Uses a linear shift-based approximation of softmax,
    /// not a true exponential.
    pub fn route_probs(&self) -> HVec<u8, 8> {
        let mut probs = HVec::new();
        let mut scores = [0i32; 8];
        let mut max_score = i32::MIN;

        // Compute scores (same projection as `route`), tracking the max.
        for chip in 0..self.config.num_chips {
            let mut sum: i32 = 0;
            for h in 0..self.config.hidden_dim {
                let w_idx = chip * self.config.hidden_dim + h;
                let w = self.w_output.get(w_idx).copied().unwrap_or(0) as i32;
                let hidden = self.hidden.get(h).copied().unwrap_or(0);
                sum += w * hidden;
            }
            scores[chip] = sum;
            if sum > max_score {
                max_score = sum;
            }
        }

        // Simple softmax approximation: shift scores so the max maps to
        // 256 and clamp at a floor of 1 so every chip keeps some mass.
        let mut total: i32 = 0;
        for chip in 0..self.config.num_chips {
            let exp_score = (scores[chip] - max_score + 256).max(1);
            scores[chip] = exp_score;
            total += exp_score;
        }

        for chip in 0..self.config.num_chips {
            let prob = (scores[chip] * 255 / total.max(1)) as u8;
            let _ = probs.push(prob);
        }

        probs
    }

    /// Memory footprint in bytes: INT8 buffers count one byte per element,
    /// the INT32 hidden state four bytes per element.
    pub fn memory_size(&self) -> usize {
        self.w_gate.len() + self.u_gate.len() +
        self.w_update.len() + self.u_update.len() +
        self.w_output.len() +
        self.bias_gate.len() + self.bias_update.len() +
        self.hidden.len() * 4
    }
}
|
||||
|
||||
/// Feature extractor for routing input
///
/// Flattens the summary statistics of the current token plus per-chip load
/// into the fixed 8-wide i8 vector consumed by the router.
pub struct RoutingFeatures {
    /// Token embedding summary (mean)
    pub embed_mean: i8,
    /// Token embedding variance proxy
    pub embed_var: i8,
    /// Current sequence position (normalized)
    pub position: i8,
    /// Current load on each chip (0-127)
    pub chip_loads: [i8; 5],
}

impl RoutingFeatures {
    /// Convert to input vector
    ///
    /// Layout: [mean, var, position, load0..load4].
    pub fn to_input(&self) -> [i8; 8] {
        let mut input = [0i8; 8];
        input[0] = self.embed_mean;
        input[1] = self.embed_var;
        input[2] = self.position;
        // Remaining five slots carry the per-chip loads in order.
        input[3..].copy_from_slice(&self.chip_loads);
        input
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_micro_fastgrnn() {
        // Build a router from the default config with a fixed seed.
        let mut router = MicroFastGRNN::new(MicroGRNNConfig::default(), 42).unwrap();

        // One recurrent step over a ramp input.
        let features = [10i8, 20, 30, 40, 50, 60, 70, 80];
        router.step(&features).unwrap();

        // The routing decision must name one of the five chips.
        let decision = router.route();
        assert!(decision.0 < 5);

        println!("Memory: {} bytes", router.memory_size());
    }

    #[test]
    fn test_routing_probs() {
        let mut router = MicroFastGRNN::new(MicroGRNNConfig::default(), 42).unwrap();

        router.step(&[10i8; 8]).unwrap();

        let distribution = router.route_probs();
        assert_eq!(distribution.len(), 5);

        // Fixed-point normalization targets ~255; allow rounding slack.
        let mass: i32 = distribution.iter().map(|&p| p as i32).sum();
        assert!(mass > 200 && mass < 280);
    }
}
|
||||
705
vendor/ruvector/examples/ruvLLM/esp32/src/federation/massive_scale.rs
vendored
Normal file
705
vendor/ruvector/examples/ruvLLM/esp32/src/federation/massive_scale.rs
vendored
Normal file
@@ -0,0 +1,705 @@
|
||||
//! Massive Scale Federation - 100s to Millions of Chips
|
||||
//!
|
||||
//! Hierarchical coordination for extreme-scale distributed inference.
|
||||
//!
|
||||
//! # Topology Options
|
||||
//!
|
||||
//! ```text
|
||||
//! Flat (≤16 chips): Hierarchical Tree (≤10K): Hypercube (≤1M):
|
||||
//! ○─○─○─○─○ ┌───[Root]───┐ ○═══○
|
||||
//! │ │ │ │ │ │ │ │ ╱│ │╲
|
||||
//! └─┴─┴─┴─┘ [L1] [L1] [L1] ○─┼───┼─○
|
||||
//! │││ │││ │││ │ ○═══○ │
|
||||
//! chips chips chips ○═══════○
|
||||
//! ```
|
||||
//!
|
||||
//! # Scaling Laws
|
||||
//!
|
||||
//! - **Pipeline**: O(n) throughput, O(1) latency per stage
|
||||
//! - **Tree**: O(log n) coordination, O(n) compute
|
||||
//! - **Hypercube**: O(log n) hops, O(n) total bandwidth
|
||||
//! - **Torus**: O(√n) diameter, excellent locality
|
||||
|
||||
use heapless::Vec as HVec;
|
||||
use super::protocol::ChipId;
|
||||
|
||||
/// Maximum depth for hierarchical topologies
pub const MAX_TREE_DEPTH: usize = 20; // 2^20 = 1M chips
/// Maximum children per node in tree (sizes the heapless neighbor lists
/// in `DistributedCoordinator`)
pub const MAX_CHILDREN: usize = 16;
/// Maximum nodes at any level
/// (declared for callers; not referenced in the visible code of this file)
pub const MAX_LEVEL_NODES: usize = 64;
|
||||
|
||||
/// Large-scale topology types
///
/// Each variant carries its own sizing parameters; all derived metrics
/// (`total_chips`, `diameter`, `bisection_bandwidth`) are closed-form.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum MassiveTopology {
    /// Flat mesh - up to ~16 chips
    FlatMesh { size: usize },
    /// Binary tree - scales to millions
    BinaryTree { depth: usize },
    /// K-ary tree with configurable fanout
    KaryTree { depth: usize, fanout: usize },
    /// Hypercube - O(log n) diameter
    Hypercube { dimensions: usize },
    /// 2D Torus - good for spatial locality
    Torus2D { width: usize, height: usize },
    /// 3D Torus - even better scaling
    Torus3D { x: usize, y: usize, z: usize },
    /// Butterfly network - FFT-like communication
    Butterfly { stages: usize },
    /// Hierarchical pipeline - practical for real deployments
    HierarchicalPipeline {
        /// Number of clusters
        clusters: usize,
        /// Chips in each cluster
        chips_per_cluster: usize,
    },
}

impl MassiveTopology {
    /// Total number of chips in topology
    ///
    /// Note: `KaryTree` uses `fanout.pow(...)`, which can overflow `usize`
    /// for very large depth/fanout combinations (panics in debug builds).
    pub fn total_chips(&self) -> usize {
        match *self {
            Self::FlatMesh { size } => size,
            // Full binary tree with `depth` levels: 2^depth - 1 nodes.
            Self::BinaryTree { depth } => (1 << depth) - 1,
            Self::KaryTree { depth, fanout } => {
                // Geometric series: (k^(d+1) - 1) / (k - 1)
                if fanout == 1 { depth + 1 }
                else { (fanout.pow(depth as u32 + 1) - 1) / (fanout - 1) }
            }
            Self::Hypercube { dimensions } => 1 << dimensions,
            Self::Torus2D { width, height } => width * height,
            Self::Torus3D { x, y, z } => x * y * z,
            Self::Butterfly { stages } => stages * (1 << stages),
            Self::HierarchicalPipeline { clusters, chips_per_cluster } => {
                clusters * chips_per_cluster
            }
        }
    }

    /// Network diameter (max hops between any two nodes)
    ///
    /// `saturating_sub` keeps a degenerate zero-size mesh from panicking on
    /// usize underflow (the previous `size - 1` did).
    pub fn diameter(&self) -> usize {
        match *self {
            Self::FlatMesh { size } => size.saturating_sub(1),
            Self::BinaryTree { depth } => 2 * depth,
            Self::KaryTree { depth, .. } => 2 * depth,
            Self::Hypercube { dimensions } => dimensions,
            // Torus wrap-around halves the walking distance per axis.
            Self::Torus2D { width, height } => width / 2 + height / 2,
            Self::Torus3D { x, y, z } => x / 2 + y / 2 + z / 2,
            Self::Butterfly { stages } => stages,
            Self::HierarchicalPipeline { chips_per_cluster, .. } => {
                chips_per_cluster + 2 // Within cluster + up + down
            }
        }
    }

    /// Bisection bandwidth (edges crossing middle cut)
    ///
    /// `saturating_sub` guards the shift amount for zero-dimension cubes and
    /// zero-stage butterflies (the previous `dims - 1` underflowed).
    pub fn bisection_bandwidth(&self) -> usize {
        match *self {
            Self::FlatMesh { .. } => 1,
            Self::BinaryTree { .. } => 1, // Root is bottleneck
            Self::KaryTree { fanout, .. } => fanout,
            Self::Hypercube { dimensions } => 1 << dimensions.saturating_sub(1),
            Self::Torus2D { width, height } => 2 * width.min(height),
            // NOTE(review): squares the smallest dimension; a true 3D-torus
            // bisection would use the product of the two cross-section
            // dimensions — confirm whether this approximation is intended.
            Self::Torus3D { x, y, z } => 2 * x.min(y).min(z) * x.min(y).min(z),
            Self::Butterfly { stages } => 1 << stages.saturating_sub(1),
            Self::HierarchicalPipeline { clusters, .. } => clusters,
        }
    }

    /// Recommended topology for given chip count
    ///
    /// Tier boundaries: flat mesh up to 16 chips, hierarchical pipeline up to
    /// 10K, hypercube up to 1M, 3D torus beyond.
    pub fn recommended(chip_count: usize) -> Self {
        match chip_count {
            0..=16 => Self::FlatMesh { size: chip_count },
            17..=256 => {
                // Near-square clusters x chips grid (single sqrt, reused).
                let side = (chip_count as f64).sqrt().ceil() as usize;
                Self::HierarchicalPipeline {
                    clusters: side,
                    chips_per_cluster: side,
                }
            }
            257..=10_000 => {
                // Use hierarchical pipeline for medium scale
                let clusters = (chip_count as f64).sqrt().ceil() as usize;
                let per_cluster = (chip_count + clusters - 1) / clusters;
                Self::HierarchicalPipeline {
                    clusters,
                    chips_per_cluster: per_cluster,
                }
            }
            10_001..=1_000_000 => {
                // Hypercube for large scale
                let dims = (chip_count as f64).log2().ceil() as usize;
                Self::Hypercube { dimensions: dims }
            }
            _ => {
                // Millions+ : 3D Torus
                let side = (chip_count as f64).cbrt().ceil() as usize;
                Self::Torus3D { x: side, y: side, z: side }
            }
        }
    }
}
|
||||
|
||||
/// Scaling configuration for massive clusters
///
/// Bundles a topology with the latency/bandwidth/compute constants the
/// simulator uses. All timing fields are estimates fed into the analytic
/// model, not measurements.
#[derive(Debug, Clone)]
pub struct MassiveScaleConfig {
    /// Topology type
    pub topology: MassiveTopology,
    /// Layers of model
    pub total_layers: usize,
    /// Embedding dimension
    pub embed_dim: usize,
    /// Communication latency per hop (microseconds)
    /// NOTE(review): declared but not read by `MassiveScaleSimulator::project`
    /// in this file — confirm whether it should factor into comm cost.
    pub hop_latency_us: usize,
    /// Bandwidth per link (bytes/sec)
    pub link_bandwidth: usize,
    /// Computation time per layer (microseconds)
    pub layer_compute_us: usize,
    /// Enable speculative execution
    pub speculative: bool,
    /// Speculation depth (tokens to draft)
    pub spec_depth: usize,
    /// Enable gradient checkpointing for memory
    /// (not consulted by the simulator code visible in this file)
    pub gradient_checkpointing: bool,
    /// Fault tolerance level (0=none, 1=retry, 2=redundancy)
    pub fault_tolerance: u8,
}
|
||||
|
||||
impl Default for MassiveScaleConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
topology: MassiveTopology::HierarchicalPipeline {
|
||||
clusters: 10,
|
||||
chips_per_cluster: 10,
|
||||
},
|
||||
total_layers: 32,
|
||||
embed_dim: 64,
|
||||
hop_latency_us: 10, // SPI latency
|
||||
link_bandwidth: 10_000_000, // 10 MB/s
|
||||
layer_compute_us: 4000, // 4ms per layer on ESP32
|
||||
speculative: true,
|
||||
spec_depth: 4,
|
||||
gradient_checkpointing: false,
|
||||
fault_tolerance: 1,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Performance projection for massive scale
///
/// Output record of `MassiveScaleSimulator::project`; every field is an
/// analytic estimate derived from the config, not a measurement.
#[derive(Debug, Clone)]
pub struct ScaleProjection {
    /// Total chips
    pub total_chips: usize,
    /// Throughput in tokens/sec
    pub throughput_tokens_sec: f64,
    /// Latency per token in milliseconds
    pub latency_ms: f64,
    /// Memory per chip in KB
    pub memory_per_chip_kb: f64,
    /// Total model parameters supportable
    pub max_parameters: usize,
    /// Efficiency (vs linear scaling), fraction in 0.0..=1.0
    pub efficiency: f64,
    /// Communication overhead percentage (0-100)
    pub comm_overhead_pct: f64,
    /// Estimated power in watts
    pub power_watts: f64,
    /// Estimated cost in USD (hardware only)
    pub cost_usd: f64,
}
|
||||
|
||||
/// Massive scale simulator
|
||||
pub struct MassiveScaleSimulator {
|
||||
config: MassiveScaleConfig,
|
||||
}
|
||||
|
||||
impl MassiveScaleSimulator {
|
||||
pub fn new(config: MassiveScaleConfig) -> Self {
|
||||
Self { config }
|
||||
}
|
||||
|
||||
/// Project performance for current configuration
|
||||
pub fn project(&self) -> ScaleProjection {
|
||||
let chips = self.config.topology.total_chips();
|
||||
let diameter = self.config.topology.diameter();
|
||||
let bisection = self.config.topology.bisection_bandwidth();
|
||||
|
||||
// Compute distribution
|
||||
let layers_per_chip = (self.config.total_layers as f64 / chips as f64).max(0.1);
|
||||
let compute_per_chip_us = layers_per_chip * self.config.layer_compute_us as f64;
|
||||
|
||||
// Communication cost
|
||||
let activation_size = self.config.embed_dim * 4; // INT8 with some overhead
|
||||
let comm_time_us = (activation_size as f64 / self.config.link_bandwidth as f64)
|
||||
* 1_000_000.0
|
||||
* diameter as f64;
|
||||
|
||||
// Pipeline efficiency
|
||||
let pipeline_stages = chips.min(self.config.total_layers);
|
||||
let bubble_overhead = (pipeline_stages - 1) as f64 / pipeline_stages as f64;
|
||||
|
||||
// Speculative multiplier
|
||||
let spec_multiplier = if self.config.speculative {
|
||||
1.0 + (self.config.spec_depth as f64 - 1.0) * 0.7 // 70% acceptance
|
||||
} else {
|
||||
1.0
|
||||
};
|
||||
|
||||
// Throughput calculation
|
||||
let base_throughput = 1_000_000.0 / compute_per_chip_us.max(1.0);
|
||||
let comm_factor = 1.0 / (1.0 + comm_time_us / compute_per_chip_us.max(1.0));
|
||||
let efficiency = (1.0 - bubble_overhead * 0.15) * comm_factor;
|
||||
let throughput = base_throughput * pipeline_stages as f64 * efficiency * spec_multiplier;
|
||||
|
||||
// Latency
|
||||
let latency_us = compute_per_chip_us * pipeline_stages as f64 + comm_time_us;
|
||||
let latency_ms = latency_us / 1000.0;
|
||||
|
||||
// Memory
|
||||
let base_memory_kb = 119.0; // Single chip baseline
|
||||
let memory_per_chip = base_memory_kb / (chips as f64).sqrt().max(1.0);
|
||||
|
||||
// Max parameters
|
||||
let params_per_chip = (memory_per_chip * 1024.0 * 0.7) as usize; // 70% for weights
|
||||
let max_parameters = params_per_chip * chips;
|
||||
|
||||
// Communication overhead
|
||||
let comm_overhead = comm_time_us / (compute_per_chip_us + comm_time_us) * 100.0;
|
||||
|
||||
// Power and cost estimates
|
||||
let power_per_chip = 0.5; // 500mW per ESP32
|
||||
let cost_per_chip = 4.0; // $4 per ESP32
|
||||
|
||||
ScaleProjection {
|
||||
total_chips: chips,
|
||||
throughput_tokens_sec: throughput,
|
||||
latency_ms,
|
||||
memory_per_chip_kb: memory_per_chip,
|
||||
max_parameters,
|
||||
efficiency,
|
||||
comm_overhead_pct: comm_overhead,
|
||||
power_watts: power_per_chip * chips as f64,
|
||||
cost_usd: cost_per_chip * chips as f64,
|
||||
}
|
||||
}
|
||||
|
||||
/// Run scaling study across multiple configurations
|
||||
pub fn scaling_study(&self, chip_counts: &[usize]) -> HVec<ScaleProjection, 32> {
|
||||
let mut results = HVec::new();
|
||||
|
||||
for &count in chip_counts {
|
||||
let topology = MassiveTopology::recommended(count);
|
||||
let config = MassiveScaleConfig {
|
||||
topology,
|
||||
..self.config.clone()
|
||||
};
|
||||
let sim = MassiveScaleSimulator::new(config);
|
||||
let _ = results.push(sim.project());
|
||||
}
|
||||
|
||||
results
|
||||
}
|
||||
|
||||
/// Find optimal configuration for target throughput
|
||||
pub fn optimize_for_throughput(&self, target_tokens_sec: f64) -> MassiveScaleConfig {
|
||||
let mut best_config = self.config.clone();
|
||||
let mut best_efficiency = 0.0;
|
||||
|
||||
// Try different chip counts
|
||||
for power in 2..=20 {
|
||||
let chips = 1 << power;
|
||||
|
||||
for &topology in &[
|
||||
MassiveTopology::KaryTree { depth: power, fanout: 4 },
|
||||
MassiveTopology::Hypercube { dimensions: power },
|
||||
MassiveTopology::HierarchicalPipeline {
|
||||
clusters: 1 << (power / 2),
|
||||
chips_per_cluster: 1 << (power - power / 2),
|
||||
},
|
||||
] {
|
||||
if topology.total_chips() < 4 { continue; }
|
||||
|
||||
let config = MassiveScaleConfig {
|
||||
topology,
|
||||
..self.config.clone()
|
||||
};
|
||||
let sim = MassiveScaleSimulator::new(config.clone());
|
||||
let proj = sim.project();
|
||||
|
||||
if proj.throughput_tokens_sec >= target_tokens_sec {
|
||||
let efficiency = proj.throughput_tokens_sec / (proj.total_chips as f64);
|
||||
if efficiency > best_efficiency {
|
||||
best_efficiency = efficiency;
|
||||
best_config = config;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
best_config
|
||||
}
|
||||
}
|
||||
|
||||
/// Distributed coordinator for massive scale
///
/// Holds the static neighbor links derived once from the topology in
/// `compute_neighbors`, plus this node's mutable `NodeState`. No networking
/// happens here; this is pure bookkeeping for broadcast/reduce routing.
pub struct DistributedCoordinator {
    /// This node's ID
    node_id: u32,
    /// Parent node (None if root)
    parent: Option<u32>,
    /// Child nodes
    children: HVec<u32, MAX_CHILDREN>,
    /// Sibling nodes (same level)
    /// (only populated for tree and hypercube topologies)
    siblings: HVec<u32, MAX_CHILDREN>,
    /// Current level in hierarchy
    level: u8,
    /// Total levels
    total_levels: u8,
    /// Local state
    local_state: NodeState,
}
|
||||
|
||||
/// State of a node in the distributed system
///
/// Plain data record exchanged via gossip and tree aggregation; all fields
/// default to zero/false except where noted.
#[derive(Debug, Clone, Default)]
pub struct NodeState {
    /// Tokens processed
    pub tokens_processed: u64,
    /// Current load (0-255)
    pub load: u8,
    /// Last heartbeat (ticks)
    /// (a wrapping counter bumped by `DistributedCoordinator::update_state`)
    pub last_heartbeat: u32,
    /// Active flag
    pub active: bool,
    /// Current sequence position being processed
    pub seq_position: u32,
    /// Error count
    pub errors: u16,
}
|
||||
|
||||
impl DistributedCoordinator {
    /// Create coordinator for position in tree
    ///
    /// Derives parent/children/sibling links purely from `node_id` and the
    /// topology; no discovery traffic is generated. The node starts `active`
    /// with all counters zeroed.
    pub fn new(node_id: u32, total_nodes: usize, topology: MassiveTopology) -> Self {
        let (parent, children, siblings, level, total_levels) =
            Self::compute_neighbors(node_id, total_nodes, topology);

        Self {
            node_id,
            parent,
            children,
            siblings,
            level,
            total_levels,
            local_state: NodeState { active: true, ..Default::default() },
        }
    }

    /// Compute the static neighbor set for `node_id` under `topology`.
    ///
    /// Returns (parent, children, siblings, level, total_levels). Only
    /// BinaryTree/KaryTree(fanout=2), Hypercube and HierarchicalPipeline get
    /// dedicated wiring; every other topology falls back to a linear chain.
    fn compute_neighbors(
        node_id: u32,
        total_nodes: usize,
        topology: MassiveTopology
    ) -> (Option<u32>, HVec<u32, MAX_CHILDREN>, HVec<u32, MAX_CHILDREN>, u8, u8) {
        let mut children = HVec::new();
        let mut siblings = HVec::new();

        match topology {
            // Heap-style numbering: node i has children 2i+1 / 2i+2.
            // NOTE(review): this arm only matches KaryTree when fanout == 2;
            // other fanouts take the `_` linear-chain fallback below.
            MassiveTopology::BinaryTree { depth } |
            MassiveTopology::KaryTree { depth, fanout: 2 } => {
                // Level = floor(log2(id + 1)) under heap numbering.
                let level = (node_id + 1).ilog2() as u8;
                let parent = if node_id == 0 { None } else { Some((node_id - 1) / 2) };

                let left = 2 * node_id + 1;
                let right = 2 * node_id + 2;
                if (left as usize) < total_nodes {
                    let _ = children.push(left);
                }
                if (right as usize) < total_nodes {
                    let _ = children.push(right);
                }

                // Sibling: the other child of this node's parent.
                if node_id > 0 {
                    let sib = if node_id % 2 == 1 { node_id + 1 } else { node_id - 1 };
                    if (sib as usize) < total_nodes {
                        let _ = siblings.push(sib);
                    }
                }

                (parent, children, siblings, level, depth as u8)
            }
            MassiveTopology::Hypercube { dimensions } => {
                // In hypercube, neighbors differ by one bit. All neighbors
                // are recorded as siblings; `children` stays empty and there
                // is no parent.
                let level = node_id.count_ones() as u8;
                for d in 0..dimensions {
                    let neighbor = node_id ^ (1 << d);
                    if (neighbor as usize) < total_nodes {
                        if neighbor < node_id {
                            // Could be parent
                            // NOTE(review): empty branch — a lower-id neighbor
                            // was presumably meant to be promoted to `parent`,
                            // but nothing is assigned. Confirm intent.
                        }
                        let _ = siblings.push(neighbor);
                    }
                }
                (None, children, siblings, level, dimensions as u8)
            }
            MassiveTopology::HierarchicalPipeline { clusters, chips_per_cluster } => {
                // NOTE(review): divides by chips_per_cluster — a config with
                // 0 chips per cluster would panic here.
                let cluster_id = node_id as usize / chips_per_cluster;
                let local_id = node_id as usize % chips_per_cluster;
                let level = local_id as u8;

                // Parent is previous in pipeline
                let parent = if local_id > 0 {
                    Some(node_id - 1)
                } else if cluster_id > 0 {
                    // Cross-cluster: last node of previous cluster
                    Some((cluster_id * chips_per_cluster - 1) as u32)
                } else {
                    None
                };

                // Child is next in pipeline
                if local_id + 1 < chips_per_cluster {
                    let _ = children.push(node_id + 1);
                } else if cluster_id + 1 < clusters {
                    // Cross-cluster: first node of the next cluster
                    let _ = children.push(((cluster_id + 1) * chips_per_cluster) as u32);
                }

                (parent, children, siblings, level, chips_per_cluster as u8)
            }
            _ => {
                // Default: linear chain — node i links to i-1 (parent) and
                // i+1 (child). `total_levels` truncates above 255 nodes.
                let parent = if node_id > 0 { Some(node_id - 1) } else { None };
                if ((node_id + 1) as usize) < total_nodes {
                    let _ = children.push(node_id + 1);
                }
                (parent, children, siblings, node_id as u8, total_nodes as u8)
            }
        }
    }

    /// Check if this node is root
    pub fn is_root(&self) -> bool {
        self.parent.is_none()
    }

    /// Check if this node is leaf
    pub fn is_leaf(&self) -> bool {
        self.children.is_empty()
    }

    /// Get nodes to send to for broadcast
    pub fn broadcast_targets(&self) -> &[u32] {
        &self.children
    }

    /// Get node to send to for aggregation (reduce)
    pub fn reduce_target(&self) -> Option<u32> {
        self.parent
    }

    /// Update local state
    ///
    /// `tokens` replaces (does not accumulate into) the processed counter;
    /// the heartbeat tick wraps around rather than saturating.
    pub fn update_state(&mut self, tokens: u64, load: u8) {
        self.local_state.tokens_processed = tokens;
        self.local_state.load = load;
        self.local_state.last_heartbeat = self.local_state.last_heartbeat.wrapping_add(1);
    }

    /// Get aggregate statistics (for root to report)
    ///
    /// Tokens and errors are summed. Load is blended by adding each child's
    /// load divided by the child count onto this node's own load (saturating
    /// at 255) — so the result over-weights the local node rather than being
    /// a true mean across nodes.
    pub fn aggregate_stats(&self, child_stats: &[NodeState]) -> NodeState {
        let mut agg = self.local_state.clone();
        for child in child_stats {
            agg.tokens_processed += child.tokens_processed;
            // len() as u8 truncates above 255 children; MAX_CHILDREN is 16,
            // so this is safe in practice.
            agg.load = agg.load.saturating_add(child.load / (child_stats.len() as u8).max(1));
            agg.errors += child.errors;
        }
        agg
    }
}
|
||||
|
||||
/// Gossip protocol for state synchronization at massive scale
|
||||
pub struct GossipProtocol {
|
||||
/// Known node states (sampled)
|
||||
known_states: HVec<(u32, NodeState), 64>,
|
||||
/// Fanout for gossip
|
||||
fanout: usize,
|
||||
/// Round number
|
||||
round: u32,
|
||||
}
|
||||
|
||||
impl GossipProtocol {
|
||||
pub fn new(fanout: usize) -> Self {
|
||||
Self {
|
||||
known_states: HVec::new(),
|
||||
fanout,
|
||||
round: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Select random nodes for gossip
|
||||
pub fn select_gossip_targets(&self, my_id: u32, total_nodes: usize, seed: u32) -> HVec<u32, 8> {
|
||||
let mut targets = HVec::new();
|
||||
let mut rng = seed.wrapping_mul(1103515245).wrapping_add(my_id);
|
||||
|
||||
for _ in 0..self.fanout.min(8) {
|
||||
rng = rng.wrapping_mul(1103515245).wrapping_add(12345);
|
||||
let target = (rng % total_nodes as u32) as u32;
|
||||
if target != my_id && !targets.contains(&target) {
|
||||
let _ = targets.push(target);
|
||||
}
|
||||
}
|
||||
|
||||
targets
|
||||
}
|
||||
|
||||
/// Merge received state
|
||||
pub fn merge_state(&mut self, node_id: u32, state: NodeState) {
|
||||
// Update or insert
|
||||
for (id, s) in self.known_states.iter_mut() {
|
||||
if *id == node_id {
|
||||
*s = state;
|
||||
return;
|
||||
}
|
||||
}
|
||||
// Insert new
|
||||
if self.known_states.len() < 64 {
|
||||
let _ = self.known_states.push((node_id, state));
|
||||
} else {
|
||||
// Replace oldest (simple LRU)
|
||||
self.known_states[0] = (node_id, state);
|
||||
}
|
||||
}
|
||||
|
||||
/// Get estimated cluster health
|
||||
pub fn cluster_health(&self) -> f32 {
|
||||
if self.known_states.is_empty() {
|
||||
return 1.0;
|
||||
}
|
||||
let active = self.known_states.iter().filter(|(_, s)| s.active).count();
|
||||
active as f32 / self.known_states.len() as f32
|
||||
}
|
||||
}
|
||||
|
||||
/// Fault tolerance manager
|
||||
pub struct FaultTolerance {
|
||||
/// Redundancy level (1 = no redundancy, 2 = pairs, 3 = triples)
|
||||
redundancy: u8,
|
||||
/// Failed node IDs
|
||||
failed_nodes: HVec<u32, 64>,
|
||||
/// Backup assignments (primary -> backup)
|
||||
backups: HVec<(u32, u32), 32>,
|
||||
}
|
||||
|
||||
impl FaultTolerance {
|
||||
pub fn new(redundancy: u8) -> Self {
|
||||
Self {
|
||||
redundancy: redundancy.max(1),
|
||||
failed_nodes: HVec::new(),
|
||||
backups: HVec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Mark node as failed
|
||||
pub fn mark_failed(&mut self, node_id: u32) {
|
||||
if !self.failed_nodes.contains(&node_id) {
|
||||
let _ = self.failed_nodes.push(node_id);
|
||||
}
|
||||
}
|
||||
|
||||
/// Get backup for failed node
|
||||
pub fn get_backup(&self, failed_id: u32) -> Option<u32> {
|
||||
self.backups.iter()
|
||||
.find(|(primary, _)| *primary == failed_id)
|
||||
.map(|(_, backup)| *backup)
|
||||
}
|
||||
|
||||
/// Assign backups for nodes
|
||||
pub fn assign_backups(&mut self, total_nodes: usize) {
|
||||
if self.redundancy < 2 { return; }
|
||||
|
||||
for i in 0..total_nodes {
|
||||
let backup = (i + total_nodes / 2) % total_nodes;
|
||||
if self.backups.len() < 32 {
|
||||
let _ = self.backups.push((i as u32, backup as u32));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Check if node is available (not failed)
|
||||
pub fn is_available(&self, node_id: u32) -> bool {
|
||||
!self.failed_nodes.contains(&node_id)
|
||||
}
|
||||
|
||||
/// Get failure rate
|
||||
pub fn failure_rate(&self, total_nodes: usize) -> f32 {
|
||||
self.failed_nodes.len() as f32 / total_nodes as f32
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_topology_sizing() {
        // Closed-form node counts for the standard topologies.
        assert_eq!(MassiveTopology::BinaryTree { depth: 10 }.total_chips(), 1023);
        assert_eq!(MassiveTopology::Hypercube { dimensions: 10 }.total_chips(), 1024);
        assert_eq!(MassiveTopology::Torus2D { width: 100, height: 100 }.total_chips(), 10_000);
    }

    #[test]
    fn test_scaling_projection() {
        // 10 clusters x 10 chips = a 100-chip hierarchical pipeline.
        let sim = MassiveScaleSimulator::new(MassiveScaleConfig {
            topology: MassiveTopology::HierarchicalPipeline {
                clusters: 10,
                chips_per_cluster: 10,
            },
            ..Default::default()
        });
        let projection = sim.project();

        assert_eq!(projection.total_chips, 100);
        assert!(projection.throughput_tokens_sec > 1000.0);
        assert!(projection.efficiency > 0.5);

        println!("100 chips: {:.0} tok/s, {:.1}% efficiency",
            projection.throughput_tokens_sec, projection.efficiency * 100.0);
    }

    #[test]
    fn test_massive_scale() {
        // Sweep from a desk-sized cluster to a million chips.
        for &n in &[5usize, 100, 1000, 10_000, 100_000, 1_000_000] {
            let sim = MassiveScaleSimulator::new(MassiveScaleConfig {
                topology: MassiveTopology::recommended(n),
                ..Default::default()
            });
            let p = sim.project();

            println!("{:>10} chips: {:>12.0} tok/s, {:>6.1}% eff, ${:.0}",
                n, p.throughput_tokens_sec, p.efficiency * 100.0, p.cost_usd);
        }
    }

    #[test]
    fn test_distributed_coordinator() {
        let coord =
            DistributedCoordinator::new(5, 100, MassiveTopology::BinaryTree { depth: 7 });

        // Node 5 sits inside the tree, so it must have a parent.
        assert!(!coord.is_root());
        println!("Node 5: parent={:?}, children={:?}", coord.parent, coord.children);
    }

    #[test]
    fn test_gossip_protocol() {
        let mut gossip = GossipProtocol::new(3);

        let peers = gossip.select_gossip_targets(5, 1000, 42);
        assert!(!peers.is_empty());
        assert!(!peers.contains(&5)); // Shouldn't include self

        gossip.merge_state(10, NodeState { active: true, ..Default::default() });
        assert_eq!(gossip.cluster_health(), 1.0);
    }
}
|
||||
420
vendor/ruvector/examples/ruvLLM/esp32/src/federation/medium_scale.rs
vendored
Normal file
420
vendor/ruvector/examples/ruvLLM/esp32/src/federation/medium_scale.rs
vendored
Normal file
@@ -0,0 +1,420 @@
|
||||
//! Medium Scale Federation - 100 to 500 Chip Clusters
|
||||
//!
|
||||
//! This is the "sweet spot" for ESP32 federation:
|
||||
//! - High efficiency (40-70%)
|
||||
//! - Practical throughput (50K-100K tokens/sec)
|
||||
//! - Manageable communication overhead
|
||||
//! - Affordable cost ($400-$2,000)
|
||||
//!
|
||||
//! # Why 100-500 Chips?
|
||||
//!
|
||||
//! ```text
|
||||
//! Performance vs Chip Count:
|
||||
//!
|
||||
//! 100K ┤ ┌─────────────────────── Communication-bound
|
||||
//! │ ____/│ Sweet Spot
|
||||
//! 80K ┤ / │ 100-500 chips
|
||||
//! │ / │
|
||||
//! 60K ┤ / │ • 40-70% efficiency
|
||||
//! │ │ │ • Low communication overhead
|
||||
//! 40K ┤ │ │ • Best $/performance
|
||||
//! ││ └─────────────────────────────────
|
||||
//! 20K ┤│
|
||||
//! │
|
||||
//! 0 ┼──────────────────────────────────────────────────
|
||||
//! 5 50 100 200 500 1K 5K 10K 100K 1M
|
||||
//! ▲ ▲
|
||||
//! │ │
|
||||
//! Good start Best value
|
||||
//! ```
|
||||
//!
|
||||
//! # Topology Recommendations
|
||||
//!
|
||||
//! | Chips | Best Topology | Clusters × Chips | Efficiency |
|
||||
//! |-------|---------------|------------------|------------|
|
||||
//! | 100 | 10×10 Grid | 10 × 10 | ~70% |
|
||||
//! | 144 | 12×12 Grid | 12 × 12 | ~65% |
|
||||
//! | 256 | 16×16 Grid | 16 × 16 | ~55% |
|
||||
//! | 400 | 20×20 Grid | 20 × 20 | ~45% |
|
||||
//! | 500 | 25×20 Grid | 25 × 20 | ~40% |
|
||||
|
||||
use super::massive_scale::{MassiveTopology, MassiveScaleConfig, MassiveScaleSimulator, ScaleProjection};
|
||||
use heapless::Vec as HVec;
|
||||
|
||||
/// Medium-scale cluster sizes (sweet spot)
/// — lower bound used by `MediumClusterConfig::optimal_for` to clamp requests
pub const MEDIUM_SCALE_MIN: usize = 100;
/// Upper bound of the medium band; larger clusters belong to massive_scale
pub const MEDIUM_SCALE_MAX: usize = 500;
/// Default recommendation within the band
pub const MEDIUM_SCALE_OPTIMAL: usize = 256; // Best efficiency/throughput balance
|
||||
|
||||
/// Pre-optimized cluster configurations
///
/// Snapshot of a clusters x chips layout plus the simulator's projections
/// for it. Produced by `optimal_for`; all numbers are analytic estimates.
#[derive(Debug, Clone, Copy)]
pub struct MediumClusterConfig {
    /// Total chips in cluster
    /// (grid-rounded: may exceed the requested chip count)
    pub total_chips: usize,
    /// Number of clusters (groups)
    pub clusters: usize,
    /// Chips per cluster
    pub chips_per_cluster: usize,
    /// Expected throughput (tokens/sec)
    pub expected_throughput: f64,
    /// Expected efficiency
    pub expected_efficiency: f64,
    /// Estimated cost USD
    pub cost_usd: f64,
    /// Power consumption watts
    pub power_watts: f64,
    /// Max model parameters supportable
    pub max_params: usize,
}
|
||||
|
||||
impl MediumClusterConfig {
|
||||
/// Get optimal configuration for given chip count
|
||||
pub fn optimal_for(chip_count: usize) -> Self {
|
||||
let chips = chip_count.clamp(MEDIUM_SCALE_MIN, MEDIUM_SCALE_MAX);
|
||||
|
||||
// Find best square-ish layout
|
||||
let sqrt = (chips as f64).sqrt();
|
||||
let clusters = sqrt.ceil() as usize;
|
||||
let per_cluster = (chips + clusters - 1) / clusters;
|
||||
let actual_chips = clusters * per_cluster;
|
||||
|
||||
// Simulate to get accurate projections
|
||||
let config = MassiveScaleConfig {
|
||||
topology: MassiveTopology::HierarchicalPipeline {
|
||||
clusters,
|
||||
chips_per_cluster: per_cluster,
|
||||
},
|
||||
total_layers: 32,
|
||||
embed_dim: 64,
|
||||
hop_latency_us: 10,
|
||||
link_bandwidth: 10_000_000,
|
||||
layer_compute_us: 4000,
|
||||
speculative: true,
|
||||
spec_depth: 4,
|
||||
gradient_checkpointing: false,
|
||||
fault_tolerance: 1,
|
||||
};
|
||||
|
||||
let sim = MassiveScaleSimulator::new(config);
|
||||
let proj = sim.project();
|
||||
|
||||
Self {
|
||||
total_chips: actual_chips,
|
||||
clusters,
|
||||
chips_per_cluster: per_cluster,
|
||||
expected_throughput: proj.throughput_tokens_sec,
|
||||
expected_efficiency: proj.efficiency,
|
||||
cost_usd: proj.cost_usd,
|
||||
power_watts: proj.power_watts,
|
||||
max_params: proj.max_parameters,
|
||||
}
|
||||
}
|
||||
|
||||
/// Get all standard configurations
|
||||
pub fn standard_configs() -> [Self; 5] {
|
||||
[
|
||||
Self::optimal_for(100),
|
||||
Self::optimal_for(144),
|
||||
Self::optimal_for(256),
|
||||
Self::optimal_for(400),
|
||||
Self::optimal_for(500),
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
/// Comparison with smaller clusters
///
/// Produced by `analyze`: projections for 1-chip, 5-chip and the requested
/// medium cluster, plus derived ratios. All figures are simulator estimates.
#[derive(Debug, Clone)]
pub struct ScaleComparison {
    /// Single chip baseline
    pub single_chip: ScaleProjection,
    /// 5-chip small cluster
    pub small_cluster: ScaleProjection,
    /// Medium cluster (specified)
    pub medium_cluster: ScaleProjection,
    /// Throughput multiplier vs single
    pub throughput_multiplier: f64,
    /// Throughput multiplier vs 5-chip
    pub vs_small_multiplier: f64,
    /// Cost per 1K tokens/sec
    /// (hardware cost only; power excluded)
    pub cost_per_1k_tokens: f64,
}
|
||||
|
||||
impl ScaleComparison {
|
||||
/// Compare medium cluster against baselines
|
||||
pub fn analyze(chip_count: usize) -> Self {
|
||||
let base_config = MassiveScaleConfig {
|
||||
total_layers: 32,
|
||||
embed_dim: 64,
|
||||
hop_latency_us: 10,
|
||||
link_bandwidth: 10_000_000,
|
||||
layer_compute_us: 4000,
|
||||
speculative: true,
|
||||
spec_depth: 4,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
// Single chip
|
||||
let single_sim = MassiveScaleSimulator::new(MassiveScaleConfig {
|
||||
topology: MassiveTopology::FlatMesh { size: 1 },
|
||||
..base_config.clone()
|
||||
});
|
||||
let single = single_sim.project();
|
||||
|
||||
// 5-chip small cluster
|
||||
let small_sim = MassiveScaleSimulator::new(MassiveScaleConfig {
|
||||
topology: MassiveTopology::FlatMesh { size: 5 },
|
||||
..base_config.clone()
|
||||
});
|
||||
let small = small_sim.project();
|
||||
|
||||
// Medium cluster
|
||||
let medium_sim = MassiveScaleSimulator::new(MassiveScaleConfig {
|
||||
topology: MassiveTopology::recommended(chip_count),
|
||||
..base_config.clone()
|
||||
});
|
||||
let medium = medium_sim.project();
|
||||
|
||||
Self {
|
||||
throughput_multiplier: medium.throughput_tokens_sec / single.throughput_tokens_sec,
|
||||
vs_small_multiplier: medium.throughput_tokens_sec / small.throughput_tokens_sec,
|
||||
cost_per_1k_tokens: medium.cost_usd / (medium.throughput_tokens_sec / 1000.0),
|
||||
single_chip: single,
|
||||
small_cluster: small,
|
||||
medium_cluster: medium,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Model categories that can run at different scales
#[derive(Debug, Clone, Copy)]
pub enum ModelCategory {
    /// 50K-500K params, minimal memory
    Nano,
    /// 500K-5M params, basic tasks
    Micro,
    /// 5M-20M params, good general use
    Small,
    /// 20M-100M params, high quality
    Base,
    /// 100M-500M params, needs large clusters
    Large,
}

impl ModelCategory {
    /// Minimum chips required for this model category
    pub fn min_chips(&self) -> usize {
        match self {
            Self::Large => 500,
            Self::Base => 200,
            Self::Small => 50,
            Self::Micro => 5,
            Self::Nano => 1,
        }
    }

    /// Parameter range as (lower, upper) bounds
    pub fn param_range(&self) -> (usize, usize) {
        match self {
            Self::Large => (100_000_000, 500_000_000),
            Self::Base => (20_000_000, 100_000_000),
            Self::Small => (5_000_000, 20_000_000),
            Self::Micro => (500_000, 5_000_000),
            Self::Nano => (50_000, 500_000),
        }
    }

    /// Example models representative of this category
    pub fn examples(&self) -> &'static str {
        match self {
            Self::Nano => "TinyBERT-nano, Custom embeddings",
            Self::Micro => "DistilBERT-tiny, MiniLM",
            Self::Small => "TinyLlama, Phi-nano",
            Self::Base => "Phi-1, GPT-2-Small",
            Self::Large => "Phi-2, LLaMA-7B (quantized)",
        }
    }

    /// What's possible with given chip count
    pub fn for_chip_count(chips: usize) -> Self {
        // Thresholds mirror `min_chips` for each category.
        if chips < 5 {
            Self::Nano
        } else if chips < 50 {
            Self::Micro
        } else if chips < 200 {
            Self::Small
        } else if chips < 500 {
            Self::Base
        } else {
            Self::Large
        }
    }
}
|
||||
|
||||
/// Hardware configuration for physical deployment
///
/// Produced by `HardwareConfig::for_cluster`; describes how a cluster of
/// chips is laid out on physical boards, wired, and powered.
#[derive(Debug, Clone)]
pub struct HardwareConfig {
    /// Chips per PCB (physical board)
    pub chips_per_board: usize,
    /// Number of PCBs
    pub num_boards: usize,
    /// Communication bus
    pub bus_type: BusType,
    /// Power supply requirement (watts)
    pub power_supply_watts: f64,
    /// Recommended form factor
    pub form_factor: &'static str,
}
|
||||
|
||||
/// Physical communication bus options between chips on a board.
#[derive(Debug, Clone, Copy)]
pub enum BusType {
    /// SPI - up to 40MHz, simple
    Spi,
    /// I2C - 400kHz standard, lower bandwidth
    I2c,
    /// UART mesh - flexible, medium speed
    Uart,
    /// Custom high-speed interconnect
    HighSpeed,
}

impl BusType {
    /// Typical usable bandwidth in bytes/second.
    ///
    /// Made `const` for consistency with
    /// `CommunicationBus::bandwidth_bytes_per_sec` in the federation module,
    /// and so it can feed const contexts.
    pub const fn bandwidth_bytes_sec(&self) -> usize {
        match self {
            Self::Spi => 5_000_000,        // 5 MB/s typical
            Self::I2c => 50_000,           // 50 KB/s
            Self::Uart => 1_000_000,       // 1 MB/s at 10Mbaud
            Self::HighSpeed => 50_000_000, // Custom FPGA/ASIC
        }
    }
}
|
||||
|
||||
impl HardwareConfig {
|
||||
/// Recommended hardware for chip count
|
||||
pub fn for_cluster(chip_count: usize) -> Self {
|
||||
match chip_count {
|
||||
0..=25 => Self {
|
||||
chips_per_board: chip_count.min(10),
|
||||
num_boards: (chip_count + 9) / 10,
|
||||
bus_type: BusType::Spi,
|
||||
power_supply_watts: chip_count as f64 * 0.5 + 10.0,
|
||||
form_factor: "Single PCB or small rack",
|
||||
},
|
||||
26..=100 => Self {
|
||||
chips_per_board: 10,
|
||||
num_boards: (chip_count + 9) / 10,
|
||||
bus_type: BusType::Spi,
|
||||
power_supply_watts: chip_count as f64 * 0.5 + 25.0,
|
||||
form_factor: "1U rack mount (10 boards)",
|
||||
},
|
||||
101..=256 => Self {
|
||||
chips_per_board: 16,
|
||||
num_boards: (chip_count + 15) / 16,
|
||||
bus_type: BusType::Uart,
|
||||
power_supply_watts: chip_count as f64 * 0.5 + 50.0,
|
||||
form_factor: "2U-4U rack mount",
|
||||
},
|
||||
257..=500 => Self {
|
||||
chips_per_board: 20,
|
||||
num_boards: (chip_count + 19) / 20,
|
||||
bus_type: BusType::Uart,
|
||||
power_supply_watts: chip_count as f64 * 0.5 + 75.0,
|
||||
form_factor: "Full rack unit",
|
||||
},
|
||||
_ => Self {
|
||||
chips_per_board: 25,
|
||||
num_boards: (chip_count + 24) / 25,
|
||||
bus_type: BusType::HighSpeed,
|
||||
power_supply_watts: chip_count as f64 * 0.5 + 100.0,
|
||||
form_factor: "Multi-rack datacenter",
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Run complete analysis for 100-500 chip clusters
///
/// Stateless namespace type: all functionality lives in associated
/// functions on the `impl` block below.
pub struct MediumScaleAnalyzer;
|
||||
|
||||
impl MediumScaleAnalyzer {
|
||||
/// Compare all standard medium-scale configurations
|
||||
pub fn full_analysis() -> HVec<(MediumClusterConfig, ScaleComparison), 8> {
|
||||
let mut results = HVec::new();
|
||||
|
||||
for chips in [100, 144, 196, 256, 324, 400, 484, 500] {
|
||||
if chips <= MEDIUM_SCALE_MAX {
|
||||
let config = MediumClusterConfig::optimal_for(chips);
|
||||
let comparison = ScaleComparison::analyze(chips);
|
||||
let _ = results.push((config, comparison));
|
||||
}
|
||||
}
|
||||
|
||||
results
|
||||
}
|
||||
|
||||
/// Find optimal configuration for target throughput
|
||||
pub fn optimize_for_throughput(target_tokens_sec: f64) -> Option<MediumClusterConfig> {
|
||||
// Binary search in medium scale range
|
||||
let mut low = MEDIUM_SCALE_MIN;
|
||||
let mut high = MEDIUM_SCALE_MAX;
|
||||
let mut best: Option<MediumClusterConfig> = None;
|
||||
|
||||
while low <= high {
|
||||
let mid = (low + high) / 2;
|
||||
let config = MediumClusterConfig::optimal_for(mid);
|
||||
|
||||
if config.expected_throughput >= target_tokens_sec {
|
||||
best = Some(config);
|
||||
high = mid.saturating_sub(1);
|
||||
} else {
|
||||
low = mid + 1;
|
||||
}
|
||||
}
|
||||
|
||||
best
|
||||
}
|
||||
|
||||
/// Find optimal configuration for target cost
|
||||
pub fn optimize_for_budget(budget_usd: f64) -> MediumClusterConfig {
|
||||
let max_chips = (budget_usd / 4.0) as usize; // $4 per chip
|
||||
let clamped = max_chips.clamp(MEDIUM_SCALE_MIN, MEDIUM_SCALE_MAX);
|
||||
MediumClusterConfig::optimal_for(clamped)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // Thresholds below are sanity bounds on the simulator's projections,
    // not exact expected values.

    #[test]
    fn test_optimal_config_100() {
        let config = MediumClusterConfig::optimal_for(100);
        // 100 chips should form a 10x10 cluster grid.
        assert_eq!(config.clusters, 10);
        assert_eq!(config.chips_per_cluster, 10);
        assert!(config.expected_throughput > 40000.0); // 40K+ tok/s
        assert!(config.expected_efficiency > 0.5); // 50%+ efficiency
    }

    #[test]
    fn test_optimal_config_256() {
        let config = MediumClusterConfig::optimal_for(256);
        // 256 chips should form a 16x16 cluster grid.
        assert_eq!(config.clusters, 16);
        assert_eq!(config.chips_per_cluster, 16);
        assert!(config.expected_throughput > 60000.0); // 60K+ tok/s
    }

    #[test]
    fn test_scale_comparison() {
        let comparison = ScaleComparison::analyze(256);
        assert!(comparison.throughput_multiplier > 50.0); // 50x+ vs single chip
        assert!(comparison.vs_small_multiplier > 10.0); // 10x+ vs 5 chips
    }

    #[test]
    fn test_model_categories() {
        assert_eq!(ModelCategory::for_chip_count(50).min_chips(), 50);
        assert_eq!(ModelCategory::for_chip_count(256).min_chips(), 200);
    }

    #[test]
    fn test_hardware_config() {
        let hw = HardwareConfig::for_cluster(256);
        // 256 chips at 16 per board => 16 boards.
        assert_eq!(hw.chips_per_board, 16);
        assert_eq!(hw.num_boards, 16);
        assert!(hw.power_supply_watts > 100.0);
    }
}
|
||||
280
vendor/ruvector/examples/ruvLLM/esp32/src/federation/mod.rs
vendored
Normal file
280
vendor/ruvector/examples/ruvLLM/esp32/src/federation/mod.rs
vendored
Normal file
@@ -0,0 +1,280 @@
|
||||
//! Federation Module for Multi-ESP32 Distributed Inference
|
||||
//!
|
||||
//! Enables running larger models across multiple ESP32 chips:
|
||||
//! - Pipeline parallelism: Each chip handles different layers
|
||||
//! - Tensor parallelism: Split attention heads across chips
|
||||
//! - Model sharding: Distribute embeddings/weights
|
||||
//! - Speculative decoding: Draft on one chip, verify on others
|
||||
//!
|
||||
//! # Architecture Options
|
||||
//!
|
||||
//! ```text
|
||||
//! 5-Chip Pipeline (recommended for latency):
|
||||
//! ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐
|
||||
//! │ ESP32-0 │───▶│ ESP32-1 │───▶│ ESP32-2 │───▶│ ESP32-3 │───▶│ ESP32-4 │
|
||||
//! │ Embed + │ │ Layer 1 │ │ Layer 2 │ │ Layer 3 │ │ Layer 4 │
|
||||
//! │ Layer 0 │ │ │ │ │ │ │ │ + Head │
|
||||
//! └─────────┘ └─────────┘ └─────────┘ └─────────┘ └─────────┘
|
||||
//!
|
||||
//! 5-Chip Tensor Parallel (for throughput):
|
||||
//! ┌─────────┐
|
||||
//! │ ESP32-0 │ ◀──┐
|
||||
//! │ Head 0 │ │
|
||||
//! └─────────┘ │
|
||||
//! ┌─────────┐ │ ┌─────────┐
|
||||
//! │ ESP32-1 │ ◀──┼────│ ESP32-4 │
|
||||
//! │ Head 1 │ │ │ Coord │
|
||||
//! └─────────┘ │ └─────────┘
|
||||
//! ┌─────────┐ │
|
||||
//! │ ESP32-2 │ ◀──┤
|
||||
//! │ Head 2 │ │
|
||||
//! └─────────┘ │
|
||||
//! ┌─────────┐ │
|
||||
//! │ ESP32-3 │ ◀──┘
|
||||
//! │ Head 3 │
|
||||
//! └─────────┘
|
||||
//! ```
|
||||
|
||||
pub mod pipeline;
|
||||
pub mod tensor_parallel;
|
||||
pub mod sharding;
|
||||
pub mod speculative;
|
||||
pub mod protocol;
|
||||
pub mod coordinator;
|
||||
pub mod fastgrnn_router;
|
||||
pub mod massive_scale;
|
||||
pub mod medium_scale;
|
||||
|
||||
// Re-exports
|
||||
pub use pipeline::{PipelineNode, PipelineConfig, PipelineRole};
|
||||
pub use tensor_parallel::{TensorParallelNode, TPConfig};
|
||||
pub use sharding::{ShardedEmbedding, ShardConfig};
|
||||
pub use speculative::{SpeculativeDecoder, DraftVerifyConfig};
|
||||
pub use protocol::{FederationMessage, MessageType, ChipId};
|
||||
pub use coordinator::{FederationCoordinator, ClusterTopology};
|
||||
pub use fastgrnn_router::{MicroFastGRNN, MicroGRNNConfig, RoutingFeatures};
|
||||
pub use massive_scale::{
|
||||
MassiveTopology, MassiveScaleConfig, MassiveScaleSimulator, ScaleProjection,
|
||||
DistributedCoordinator, GossipProtocol, FaultTolerance,
|
||||
};
|
||||
pub use medium_scale::{
|
||||
MediumClusterConfig, ScaleComparison, MediumScaleAnalyzer,
|
||||
ModelCategory, HardwareConfig, BusType,
|
||||
MEDIUM_SCALE_MIN, MEDIUM_SCALE_MAX, MEDIUM_SCALE_OPTIMAL,
|
||||
};
|
||||
|
||||
/// Maximum chips in small federation
|
||||
pub const MAX_FEDERATION_SIZE: usize = 8;
|
||||
/// Maximum chips in massive scale (theoretical)
|
||||
pub const MAX_MASSIVE_SCALE: usize = 1_000_000;
|
||||
|
||||
/// Federation mode
///
/// Selects how work is distributed across the chips in a cluster; see the
/// module docs for the pipeline vs tensor-parallel topologies.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum FederationMode {
    /// Single chip (no federation)
    Standalone,
    /// Pipeline parallelism - each chip handles different layers
    Pipeline,
    /// Tensor parallelism - split heads across chips
    TensorParallel,
    /// Hybrid: pipeline + tensor parallel
    Hybrid,
    /// Speculative decoding with draft/verify
    Speculative,
    /// Mixture of Experts - each chip is an expert
    MixtureOfExperts,
}
|
||||
|
||||
/// Federation cluster configuration
///
/// A single chip's view of the cluster: its identity, the chosen
/// parallelism mode, the bus, and how work is split per chip.
#[derive(Debug, Clone)]
pub struct FederationConfig {
    /// Number of chips in cluster
    pub num_chips: usize,
    /// This chip's ID (0-indexed)
    pub chip_id: ChipId,
    /// Federation mode
    pub mode: FederationMode,
    /// Communication bus type
    pub bus: CommunicationBus,
    /// Layers per chip (for pipeline mode)
    pub layers_per_chip: usize,
    /// Heads per chip (for tensor parallel mode)
    pub heads_per_chip: usize,
    /// Enable pipelining (process next token while current finishes)
    pub enable_pipelining: bool,
}

impl Default for FederationConfig {
    /// Default: a 5-chip SPI pipeline (2 layers/chip) with pipelining on.
    fn default() -> Self {
        Self {
            num_chips: 5,
            chip_id: ChipId(0),
            mode: FederationMode::Pipeline,
            bus: CommunicationBus::Spi,
            layers_per_chip: 2,
            heads_per_chip: 1,
            enable_pipelining: true,
        }
    }
}
|
||||
|
||||
/// Communication bus between chips
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum CommunicationBus {
    /// SPI bus (fastest, 10-80 MHz)
    Spi,
    /// I2C bus (slower, 400 kHz - 1 MHz)
    I2c,
    /// UART (flexible, up to 5 Mbps)
    Uart,
    /// ESP-NOW (wireless, ~1 Mbps)
    EspNow,
    /// Custom parallel bus
    Parallel,
}

impl CommunicationBus {
    /// Estimated usable bandwidth in bytes/second for this bus.
    pub const fn bandwidth_bytes_per_sec(&self) -> usize {
        match self {
            Self::Parallel => 20_000_000, // Custom 8-bit parallel
            Self::Spi => 10_000_000,      // 10 MB/s at 80 MHz
            Self::Uart => 500_000,        // 500 KB/s at 5 Mbps
            Self::EspNow => 125_000,      // ~1 Mbps
            Self::I2c => 100_000,         // 100 KB/s at 1 MHz
        }
    }

    /// Per-message latency overhead in microseconds.
    pub const fn latency_us(&self) -> usize {
        match self {
            Self::Parallel => 5,
            Self::Spi => 10,
            Self::Uart => 20,
            Self::I2c => 50,
            Self::EspNow => 500, // Wireless overhead
        }
    }
}
|
||||
|
||||
/// Calculate optimal federation configuration for given model
|
||||
pub fn calculate_optimal_config(
|
||||
model_size_bytes: usize,
|
||||
num_layers: usize,
|
||||
num_heads: usize,
|
||||
num_chips: usize,
|
||||
per_chip_ram: usize,
|
||||
) -> FederationConfig {
|
||||
let model_per_chip = model_size_bytes / num_chips;
|
||||
|
||||
// Check if model fits with pipeline parallelism
|
||||
if model_per_chip <= per_chip_ram {
|
||||
let layers_per_chip = (num_layers + num_chips - 1) / num_chips;
|
||||
return FederationConfig {
|
||||
num_chips,
|
||||
chip_id: ChipId(0),
|
||||
mode: FederationMode::Pipeline,
|
||||
bus: CommunicationBus::Spi,
|
||||
layers_per_chip,
|
||||
heads_per_chip: num_heads,
|
||||
enable_pipelining: true,
|
||||
};
|
||||
}
|
||||
|
||||
// Try tensor parallelism
|
||||
let heads_per_chip = (num_heads + num_chips - 1) / num_chips;
|
||||
FederationConfig {
|
||||
num_chips,
|
||||
chip_id: ChipId(0),
|
||||
mode: FederationMode::TensorParallel,
|
||||
bus: CommunicationBus::Spi,
|
||||
layers_per_chip: num_layers,
|
||||
heads_per_chip,
|
||||
enable_pipelining: false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Estimate performance improvement from federation
|
||||
pub fn estimate_speedup(config: &FederationConfig) -> FederationSpeedup {
|
||||
let n = config.num_chips as f32;
|
||||
|
||||
match config.mode {
|
||||
FederationMode::Standalone => FederationSpeedup {
|
||||
throughput_multiplier: 1.0,
|
||||
latency_reduction: 1.0,
|
||||
memory_per_chip_reduction: 1.0,
|
||||
},
|
||||
FederationMode::Pipeline => FederationSpeedup {
|
||||
// Pipeline: n-way throughput, slightly higher latency
|
||||
throughput_multiplier: n * 0.85, // 85% efficiency due to bubble
|
||||
latency_reduction: 1.0 / (1.0 + 0.1 * (n - 1.0)), // Slight increase
|
||||
memory_per_chip_reduction: n,
|
||||
},
|
||||
FederationMode::TensorParallel => FederationSpeedup {
|
||||
// TP: near-linear speedup on attention
|
||||
throughput_multiplier: n * 0.7, // Communication overhead
|
||||
latency_reduction: n * 0.7,
|
||||
memory_per_chip_reduction: n * 0.8, // Some duplication
|
||||
},
|
||||
FederationMode::Hybrid => FederationSpeedup {
|
||||
throughput_multiplier: n * 0.75,
|
||||
latency_reduction: (n / 2.0) * 0.8,
|
||||
memory_per_chip_reduction: n * 0.9,
|
||||
},
|
||||
FederationMode::Speculative => FederationSpeedup {
|
||||
// Speculative: 2-4x speedup typical
|
||||
throughput_multiplier: 2.5,
|
||||
latency_reduction: 2.0,
|
||||
memory_per_chip_reduction: 1.0, // Full model on draft chip
|
||||
},
|
||||
FederationMode::MixtureOfExperts => FederationSpeedup {
|
||||
throughput_multiplier: n * 0.9, // Excellent scaling
|
||||
latency_reduction: 1.5,
|
||||
memory_per_chip_reduction: n,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
/// Performance improvement estimates
///
/// All fields are multipliers relative to a single standalone chip;
/// values above 1.0 are improvements.
#[derive(Debug, Clone)]
pub struct FederationSpeedup {
    /// Throughput improvement (tokens/sec multiplier)
    pub throughput_multiplier: f32,
    /// Latency reduction (time per token)
    pub latency_reduction: f32,
    /// Memory reduction per chip
    pub memory_per_chip_reduction: f32,
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_optimal_config() {
        // 500 KB model over 5 chips = 100 KB/chip, which fits in 120 KB RAM,
        // so pipeline mode should be selected with ceil(10/5) = 2 layers/chip.
        let config = calculate_optimal_config(
            500 * 1024, // 500 KB model
            10, // 10 layers
            4, // 4 heads
            5, // 5 chips
            120 * 1024, // 120 KB per chip
        );

        assert_eq!(config.mode, FederationMode::Pipeline);
        assert_eq!(config.layers_per_chip, 2);
    }

    #[test]
    fn test_speedup_estimate() {
        let config = FederationConfig {
            num_chips: 5,
            mode: FederationMode::Pipeline,
            ..Default::default()
        };

        let speedup = estimate_speedup(&config);

        // Pipeline heuristic: 5 * 0.85 = 4.25x throughput, n-way memory.
        assert!(speedup.throughput_multiplier > 4.0);
        assert!(speedup.memory_per_chip_reduction >= 5.0);
    }
}
|
||||
387
vendor/ruvector/examples/ruvLLM/esp32/src/federation/pipeline.rs
vendored
Normal file
387
vendor/ruvector/examples/ruvLLM/esp32/src/federation/pipeline.rs
vendored
Normal file
@@ -0,0 +1,387 @@
|
||||
//! Pipeline Parallelism for Multi-ESP32 Inference
|
||||
//!
|
||||
//! Distributes layers across chips for linear scaling with model size.
|
||||
//! Each chip processes its assigned layers and passes activations to the next.
|
||||
//!
|
||||
//! # 5-Chip Pipeline Example
|
||||
//!
|
||||
//! ```text
|
||||
//! Token 0: [C0:embed+L0] → [C1:L1-2] → [C2:L3-4] → [C3:L5-6] → [C4:L7+head]
|
||||
//! Token 1: idle [C0:embed] [C1:L1-2] [C2:L3-4] [C3:L5-6]
|
||||
//! Token 2: idle idle [C0:embed] [C1:L1-2] [C2:L3-4]
|
||||
//! ...
|
||||
//! ```
|
||||
|
||||
use heapless::Vec as HVec;
|
||||
use super::protocol::{ChipId, FederationMessage};
|
||||
|
||||
/// Maximum layers per chip
|
||||
pub const MAX_LAYERS_PER_CHIP: usize = 4;
|
||||
/// Pipeline depth (tokens in flight)
|
||||
pub const MAX_PIPELINE_DEPTH: usize = 8;
|
||||
|
||||
/// Role in the pipeline
///
/// Derived from a chip's position via `PipelineConfig::role`; determines
/// whether it owns the embedding (head) and/or the output head (tail).
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum PipelineRole {
    /// First chip: handles embedding + first layers
    Head,
    /// Middle chip: processes middle layers
    Middle,
    /// Last chip: final layers + output head
    Tail,
    /// Single chip mode (no pipeline)
    Standalone,
}
|
||||
|
||||
/// Pipeline configuration
///
/// Describes one chip's slice of the model: its position in the chain and
/// the contiguous layer range [`layer_start`, `layer_start + layer_count`)
/// it is responsible for.
#[derive(Debug, Clone)]
pub struct PipelineConfig {
    /// Total chips in pipeline
    pub num_chips: usize,
    /// This chip's position (0 = head)
    pub position: usize,
    /// First layer index assigned to this chip
    pub layer_start: usize,
    /// Number of layers on this chip
    pub layer_count: usize,
    /// Total layers in model
    pub total_layers: usize,
    /// Embedding dimension
    pub embed_dim: usize,
    /// Enable micro-batching (tokens processed together per stage)
    pub micro_batch_size: usize,
}
|
||||
|
||||
impl PipelineConfig {
|
||||
/// Create config for a specific chip in the pipeline
|
||||
pub fn for_chip(
|
||||
chip_pos: usize,
|
||||
num_chips: usize,
|
||||
total_layers: usize,
|
||||
embed_dim: usize,
|
||||
) -> Self {
|
||||
let layers_per_chip = (total_layers + num_chips - 1) / num_chips;
|
||||
let layer_start = chip_pos * layers_per_chip;
|
||||
let layer_count = layers_per_chip.min(total_layers - layer_start);
|
||||
|
||||
Self {
|
||||
num_chips,
|
||||
position: chip_pos,
|
||||
layer_start,
|
||||
layer_count,
|
||||
total_layers,
|
||||
embed_dim,
|
||||
micro_batch_size: 1,
|
||||
}
|
||||
}
|
||||
|
||||
/// Get role of this chip
|
||||
pub fn role(&self) -> PipelineRole {
|
||||
if self.num_chips == 1 {
|
||||
PipelineRole::Standalone
|
||||
} else if self.position == 0 {
|
||||
PipelineRole::Head
|
||||
} else if self.position == self.num_chips - 1 {
|
||||
PipelineRole::Tail
|
||||
} else {
|
||||
PipelineRole::Middle
|
||||
}
|
||||
}
|
||||
|
||||
/// Previous chip in pipeline (if any)
|
||||
pub fn prev_chip(&self) -> Option<ChipId> {
|
||||
if self.position > 0 {
|
||||
Some(ChipId((self.position - 1) as u8))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Next chip in pipeline (if any)
|
||||
pub fn next_chip(&self) -> Option<ChipId> {
|
||||
if self.position + 1 < self.num_chips {
|
||||
Some(ChipId((self.position + 1) as u8))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Pipeline state for a chip
///
/// Coarse execution phase used for scheduling and reported in
/// `PipelineStats`.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum PipelineState {
    /// Waiting for input from previous chip
    WaitingInput,
    /// Processing layers
    Processing,
    /// Waiting to send output
    WaitingSend,
    /// Idle (pipeline bubble)
    Idle,
}
|
||||
|
||||
/// In-flight token tracking
///
/// One token's progress through this chip's layer slice, including its
/// current INT8 activation vector.
#[derive(Debug, Clone)]
pub struct InFlightToken {
    /// Sequence position
    pub seq_pos: u16,
    /// Token ID (only meaningful on the head chip; 0 elsewhere)
    pub token_id: u16,
    /// Current layer being processed (absolute layer index)
    pub current_layer: u8,
    /// Activation data (INT8, capacity bounds the embedding dimension)
    pub activation: HVec<i8, 128>,
}
|
||||
|
||||
/// Pipeline node managing this chip's portion
///
/// Owns the queues of tokens flowing through this stage: tokens being
/// processed (`in_flight`) and tokens finished with this chip's layers and
/// awaiting transmission (`output_queue`).
pub struct PipelineNode {
    /// Configuration
    config: PipelineConfig,
    /// Current state
    state: PipelineState,
    /// Chip ID (derived from pipeline position)
    chip_id: ChipId,
    /// Sequence counter (monotonic; doubles as tokens-started count)
    seq_counter: u16,
    /// Tokens in flight in the pipeline
    in_flight: HVec<InFlightToken, MAX_PIPELINE_DEPTH>,
    /// Completed tokens waiting to send
    output_queue: HVec<InFlightToken, MAX_PIPELINE_DEPTH>,
    /// Input buffer for receiving activations
    input_buffer: HVec<i8, 256>,
    /// Barrier counter for synchronization
    barrier_counter: u16,
}
|
||||
|
||||
impl PipelineNode {
    /// Create new pipeline node
    ///
    /// The chip ID is derived from the pipeline position (position 0 ->
    /// ChipId(0), etc.).
    pub fn new(config: PipelineConfig) -> Self {
        Self {
            chip_id: ChipId(config.position as u8),
            config,
            state: PipelineState::Idle,
            seq_counter: 0,
            in_flight: HVec::new(),
            output_queue: HVec::new(),
            input_buffer: HVec::new(),
            barrier_counter: 0,
        }
    }

    /// Get current pipeline state
    pub fn state(&self) -> PipelineState {
        self.state
    }

    /// Check if this chip should handle embedding (head or standalone)
    pub fn handles_embedding(&self) -> bool {
        self.config.role() == PipelineRole::Head ||
        self.config.role() == PipelineRole::Standalone
    }

    /// Check if this chip should handle output head (tail or standalone)
    pub fn handles_output(&self) -> bool {
        self.config.role() == PipelineRole::Tail ||
        self.config.role() == PipelineRole::Standalone
    }

    /// Start processing a new token (head chip only)
    ///
    /// Enqueues a fresh token at layer 0 with an empty activation (the
    /// embedding is produced later by the layer callback).
    ///
    /// # Errors
    /// - `UnsupportedFeature` if called on a non-head chip.
    /// - `BufferOverflow` when `MAX_PIPELINE_DEPTH` tokens are in flight.
    pub fn start_token(&mut self, token_id: u16) -> crate::Result<()> {
        if !self.handles_embedding() {
            return Err(crate::Error::UnsupportedFeature("Not head chip"));
        }

        if self.in_flight.len() >= MAX_PIPELINE_DEPTH {
            return Err(crate::Error::BufferOverflow);
        }

        let token = InFlightToken {
            seq_pos: self.seq_counter,
            token_id,
            current_layer: 0,
            activation: HVec::new(),
        };

        self.in_flight.push(token).map_err(|_| crate::Error::BufferOverflow)?;
        self.seq_counter += 1;
        self.state = PipelineState::Processing;

        Ok(())
    }

    /// Receive activation from previous chip
    ///
    /// Rebuilds an `InFlightToken` from the wire message. The token ID is
    /// not carried on the wire, so it is set to 0 here.
    pub fn receive_activation(&mut self, msg: &FederationMessage) -> crate::Result<()> {
        let (layer_idx, position, data) = msg.get_activation_data()
            .ok_or(crate::Error::InvalidModel("Invalid activation message"))?;

        // Create in-flight token from received data
        let mut activation = HVec::new();
        for &d in data {
            activation.push(d as i8).map_err(|_| crate::Error::BufferOverflow)?;
        }

        let token = InFlightToken {
            seq_pos: position,
            token_id: 0, // Not needed for middle/tail chips
            current_layer: layer_idx,
            activation,
        };

        self.in_flight.push(token).map_err(|_| crate::Error::BufferOverflow)?;
        self.state = PipelineState::Processing;

        Ok(())
    }

    /// Process one step (one layer for one token)
    /// Returns true if there's work to do
    ///
    /// `layer_fn(layer_idx, activation)` runs one model layer in place on
    /// the INT8 activation.
    ///
    /// NOTE(review): this reads `in_flight[0]` (oldest token) but
    /// `HVec::pop()` removes the *last* element, so with more than one
    /// token in flight the newest token is moved to the output queue while
    /// the oldest was the one advanced — confirm whether pipeline depth > 1
    /// is actually exercised, or switch to a FIFO removal.
    ///
    /// NOTE(review): `token.current_layer as usize - self.config.layer_start`
    /// underflows if a message arrives carrying a layer index below this
    /// chip's range — presumably upstream guarantees ordering; verify.
    pub fn process_step<F>(&mut self, mut layer_fn: F) -> crate::Result<bool>
    where
        F: FnMut(usize, &mut [i8]) -> crate::Result<()>,
    {
        if self.in_flight.is_empty() {
            self.state = PipelineState::WaitingInput;
            return Ok(false);
        }

        // Process first token in queue
        let token = &mut self.in_flight[0];

        // Determine which layer to process
        let relative_layer = token.current_layer as usize - self.config.layer_start;

        if relative_layer < self.config.layer_count {
            // Process this layer
            let layer_idx = self.config.layer_start + relative_layer;
            layer_fn(layer_idx, &mut token.activation)?;
            token.current_layer += 1;
        }

        // Check if done with this chip's layers
        let next_layer = token.current_layer as usize;
        if next_layer >= self.config.layer_start + self.config.layer_count {
            // Move to output queue
            if let Some(completed) = self.in_flight.pop() {
                self.output_queue.push(completed).map_err(|_| crate::Error::BufferOverflow)?;
            }
            self.state = PipelineState::WaitingSend;
        }

        Ok(true)
    }

    /// Get activation to send to next chip
    ///
    /// Returns `None` when the queue is empty or this is the tail chip
    /// (`next_chip()` is `None`).
    ///
    /// NOTE(review): `token.seq_pos` is passed twice (third and fifth
    /// arguments) — confirm against `FederationMessage::activation`'s
    /// parameter order; one of them may be intended as a message sequence
    /// number.
    pub fn get_output(&mut self) -> Option<FederationMessage> {
        if self.output_queue.is_empty() {
            return None;
        }

        let token = self.output_queue.pop()?;
        let next_chip = self.config.next_chip()?;

        // Convert activation to bytes
        let data: Vec<i8> = token.activation.iter().cloned().collect();

        FederationMessage::activation(
            self.chip_id,
            next_chip,
            token.seq_pos,
            token.current_layer,
            token.seq_pos,
            &data,
        ).ok()
    }

    /// Check if output is available (for tail chip)
    pub fn has_final_output(&self) -> bool {
        self.handles_output() && !self.output_queue.is_empty()
    }

    /// Get final output logits (tail chip only)
    ///
    /// NOTE(review): pops the most recently completed token (LIFO); with
    /// several queued outputs, completion order may not match sequence
    /// order — confirm.
    pub fn get_final_output(&mut self) -> Option<HVec<i8, 128>> {
        if !self.handles_output() {
            return None;
        }

        let token = self.output_queue.pop()?;
        Some(token.activation)
    }

    /// Get pipeline statistics (queue depths and token count snapshot)
    pub fn stats(&self) -> PipelineStats {
        PipelineStats {
            in_flight_count: self.in_flight.len(),
            output_queue_len: self.output_queue.len(),
            tokens_processed: self.seq_counter as usize,
            current_state: self.state,
        }
    }

    /// Create synchronization barrier
    ///
    /// Increments the local barrier counter and emits a barrier message
    /// tagged with it.
    pub fn create_barrier(&mut self) -> FederationMessage {
        self.barrier_counter += 1;
        FederationMessage::barrier(self.chip_id, self.barrier_counter)
    }
}
|
||||
|
||||
/// Pipeline statistics
///
/// Snapshot of a `PipelineNode`'s queues and progress, from
/// `PipelineNode::stats`.
#[derive(Debug, Clone)]
pub struct PipelineStats {
    /// Tokens currently in pipeline
    pub in_flight_count: usize,
    /// Tokens waiting to send
    pub output_queue_len: usize,
    /// Total tokens processed (started on this chip)
    pub tokens_processed: usize,
    /// Current state
    pub current_state: PipelineState,
}
|
||||
|
||||
/// Calculate pipeline efficiency
///
/// Efficiency = useful work / total work after `tokens_generated` tokens
/// have flowed through a `num_chips`-stage pipeline. During warmup
/// (tokens <= chips) most stages sit idle, so efficiency is low; in steady
/// state only the initial (num_chips - 1)-token fill bubble is amortized.
pub fn calculate_pipeline_efficiency(
    num_chips: usize,
    tokens_generated: usize,
) -> f32 {
    // BUG FIX: with zero tokens the warmup branch evaluated 0.0 / 0.0 = NaN;
    // zero work done means zero efficiency.
    if tokens_generated == 0 {
        return 0.0;
    }

    if tokens_generated <= num_chips {
        // Warmup: only `tokens_generated` of num_chips * tokens slots busy.
        tokens_generated as f32 / (num_chips as f32 * tokens_generated as f32)
    } else {
        // After warmup, efficiency approaches 100%
        let warmup_overhead = (num_chips - 1) as f32;
        let useful_work = tokens_generated as f32;
        useful_work / (useful_work + warmup_overhead)
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_pipeline_config() {
        // 5 chips, 10 layers => ceil(10/5) = 2 layers per chip.
        let config = PipelineConfig::for_chip(0, 5, 10, 64);
        assert_eq!(config.role(), PipelineRole::Head);
        assert_eq!(config.layer_start, 0);
        assert_eq!(config.layer_count, 2);

        let config = PipelineConfig::for_chip(2, 5, 10, 64);
        assert_eq!(config.role(), PipelineRole::Middle);
        assert_eq!(config.layer_start, 4);

        let config = PipelineConfig::for_chip(4, 5, 10, 64);
        assert_eq!(config.role(), PipelineRole::Tail);
    }

    #[test]
    fn test_pipeline_efficiency() {
        // After 100 tokens, efficiency should be high
        let eff = calculate_pipeline_efficiency(5, 100);
        assert!(eff > 0.95);

        // During warmup, efficiency is lower
        let eff_warmup = calculate_pipeline_efficiency(5, 5);
        assert!(eff_warmup < 0.5);
    }
}
|
||||
414
vendor/ruvector/examples/ruvLLM/esp32/src/federation/protocol.rs
vendored
Normal file
414
vendor/ruvector/examples/ruvLLM/esp32/src/federation/protocol.rs
vendored
Normal file
@@ -0,0 +1,414 @@
|
||||
//! Inter-Chip Communication Protocol
|
||||
//!
|
||||
//! Defines the message format for ESP32-to-ESP32 communication.
|
||||
//! Designed for low overhead on SPI/I2C/UART buses.
|
||||
|
||||
use heapless::Vec as HVec;
|
||||
|
||||
/// Maximum activation size that can be sent in one message
|
||||
pub const MAX_ACTIVATION_SIZE: usize = 256;
|
||||
/// Maximum message payload
|
||||
pub const MAX_PAYLOAD_SIZE: usize = 512;
|
||||
/// Protocol version
|
||||
pub const PROTOCOL_VERSION: u8 = 1;
|
||||
|
||||
/// Chip identifier in the federation (0-based; `0xFF` is broadcast).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub struct ChipId(pub u8);

impl ChipId {
    /// Reserved address that targets every chip on the bus.
    pub const BROADCAST: ChipId = ChipId(0xFF);

    /// True when this ID is the broadcast address.
    pub fn is_broadcast(&self) -> bool {
        *self == Self::BROADCAST
    }
}
|
||||
|
||||
/// Message types for federation protocol
///
/// Wire values are grouped by function: 0x0x control, 0x1x forward-pass
/// data, 0x2x embedding/output, 0x3x speculative decoding, 0x4x sync.
#[derive(Debug, Clone, Copy, PartialEq)]
#[repr(u8)]
pub enum MessageType {
    /// Heartbeat / keep-alive
    Heartbeat = 0x00,
    /// Cluster discovery
    Discovery = 0x01,
    /// Ready signal
    Ready = 0x02,

    /// Forward pass activation data
    Activation = 0x10,
    /// Attention K/V cache update
    KVCache = 0x11,
    /// Gradient (for future training)
    Gradient = 0x12,

    /// Token embedding request
    EmbedRequest = 0x20,
    /// Token embedding response
    EmbedResponse = 0x21,
    /// Output logits
    Logits = 0x22,
    /// Sampled token
    Token = 0x23,

    /// Speculative draft tokens
    DraftTokens = 0x30,
    /// Verification result
    VerifyResult = 0x31,

    /// Synchronization barrier
    Barrier = 0x40,
    /// Acknowledgment
    Ack = 0x41,
    /// Error
    Error = 0xFF,
}

impl From<u8> for MessageType {
    /// Decode a wire byte; any unrecognized value maps to `Error`.
    fn from(v: u8) -> Self {
        use MessageType::*;
        match v {
            0x00 => Heartbeat,
            0x01 => Discovery,
            0x02 => Ready,
            0x10 => Activation,
            0x11 => KVCache,
            0x12 => Gradient,
            0x20 => EmbedRequest,
            0x21 => EmbedResponse,
            0x22 => Logits,
            0x23 => Token,
            0x30 => DraftTokens,
            0x31 => VerifyResult,
            0x40 => Barrier,
            0x41 => Ack,
            _ => Error,
        }
    }
}
|
||||
|
||||
/// Message header (8 bytes)
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
#[repr(C, packed)]
|
||||
pub struct MessageHeader {
|
||||
/// Protocol version
|
||||
pub version: u8,
|
||||
/// Message type
|
||||
pub msg_type: u8,
|
||||
/// Source chip ID
|
||||
pub src: u8,
|
||||
/// Destination chip ID
|
||||
pub dst: u8,
|
||||
/// Sequence number (for ordering)
|
||||
pub seq: u16,
|
||||
/// Payload length
|
||||
pub payload_len: u16,
|
||||
}
|
||||
|
||||
impl MessageHeader {
|
||||
pub const SIZE: usize = 8;
|
||||
|
||||
pub fn new(msg_type: MessageType, src: ChipId, dst: ChipId, seq: u16, payload_len: u16) -> Self {
|
||||
Self {
|
||||
version: PROTOCOL_VERSION,
|
||||
msg_type: msg_type as u8,
|
||||
src: src.0,
|
||||
dst: dst.0,
|
||||
seq,
|
||||
payload_len,
|
||||
}
|
||||
}
|
||||
|
||||
/// Serialize to bytes
|
||||
pub fn to_bytes(&self) -> [u8; 8] {
|
||||
[
|
||||
self.version,
|
||||
self.msg_type,
|
||||
self.src,
|
||||
self.dst,
|
||||
(self.seq & 0xFF) as u8,
|
||||
(self.seq >> 8) as u8,
|
||||
(self.payload_len & 0xFF) as u8,
|
||||
(self.payload_len >> 8) as u8,
|
||||
]
|
||||
}
|
||||
|
||||
/// Deserialize from bytes
|
||||
pub fn from_bytes(bytes: &[u8]) -> Option<Self> {
|
||||
if bytes.len() < 8 {
|
||||
return None;
|
||||
}
|
||||
Some(Self {
|
||||
version: bytes[0],
|
||||
msg_type: bytes[1],
|
||||
src: bytes[2],
|
||||
dst: bytes[3],
|
||||
seq: (bytes[4] as u16) | ((bytes[5] as u16) << 8),
|
||||
payload_len: (bytes[6] as u16) | ((bytes[7] as u16) << 8),
|
||||
})
|
||||
}
|
||||
|
||||
/// Calculate simple checksum
|
||||
pub fn checksum(&self) -> u8 {
|
||||
let bytes = self.to_bytes();
|
||||
bytes.iter().fold(0u8, |acc, &b| acc.wrapping_add(b))
|
||||
}
|
||||
}
|
||||
|
||||
/// Complete federation message
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct FederationMessage {
|
||||
/// Message header
|
||||
pub header: MessageHeader,
|
||||
/// Payload data
|
||||
pub payload: HVec<u8, MAX_PAYLOAD_SIZE>,
|
||||
/// Checksum
|
||||
pub checksum: u8,
|
||||
}
|
||||
|
||||
impl FederationMessage {
|
||||
/// Create new message
|
||||
pub fn new(msg_type: MessageType, src: ChipId, dst: ChipId, seq: u16) -> Self {
|
||||
Self {
|
||||
header: MessageHeader::new(msg_type, src, dst, seq, 0),
|
||||
payload: HVec::new(),
|
||||
checksum: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create activation message with INT8 data
|
||||
pub fn activation(
|
||||
src: ChipId,
|
||||
dst: ChipId,
|
||||
seq: u16,
|
||||
layer_idx: u8,
|
||||
position: u16,
|
||||
data: &[i8],
|
||||
) -> crate::Result<Self> {
|
||||
let mut msg = Self::new(MessageType::Activation, src, dst, seq);
|
||||
|
||||
// Payload format: [layer_idx:1][position:2][data:N]
|
||||
msg.payload.push(layer_idx).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
msg.payload.push((position & 0xFF) as u8).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
msg.payload.push((position >> 8) as u8).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
|
||||
for &d in data {
|
||||
msg.payload.push(d as u8).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
}
|
||||
|
||||
msg.header.payload_len = msg.payload.len() as u16;
|
||||
msg.update_checksum();
|
||||
Ok(msg)
|
||||
}
|
||||
|
||||
/// Create token message
|
||||
pub fn token(src: ChipId, dst: ChipId, seq: u16, token_id: u16) -> Self {
|
||||
let mut msg = Self::new(MessageType::Token, src, dst, seq);
|
||||
let _ = msg.payload.push((token_id & 0xFF) as u8);
|
||||
let _ = msg.payload.push((token_id >> 8) as u8);
|
||||
msg.header.payload_len = 2;
|
||||
msg.update_checksum();
|
||||
msg
|
||||
}
|
||||
|
||||
/// Create draft tokens message for speculative decoding
|
||||
pub fn draft_tokens(src: ChipId, dst: ChipId, seq: u16, tokens: &[u16]) -> crate::Result<Self> {
|
||||
let mut msg = Self::new(MessageType::DraftTokens, src, dst, seq);
|
||||
|
||||
msg.payload.push(tokens.len() as u8).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
|
||||
for &t in tokens {
|
||||
msg.payload.push((t & 0xFF) as u8).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
msg.payload.push((t >> 8) as u8).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
}
|
||||
|
||||
msg.header.payload_len = msg.payload.len() as u16;
|
||||
msg.update_checksum();
|
||||
Ok(msg)
|
||||
}
|
||||
|
||||
/// Create barrier synchronization message
|
||||
pub fn barrier(src: ChipId, barrier_id: u16) -> Self {
|
||||
let mut msg = Self::new(MessageType::Barrier, src, ChipId::BROADCAST, 0);
|
||||
let _ = msg.payload.push((barrier_id & 0xFF) as u8);
|
||||
let _ = msg.payload.push((barrier_id >> 8) as u8);
|
||||
msg.header.payload_len = 2;
|
||||
msg.update_checksum();
|
||||
msg
|
||||
}
|
||||
|
||||
/// Update checksum
|
||||
pub fn update_checksum(&mut self) {
|
||||
let mut sum = self.header.checksum();
|
||||
for &b in &self.payload {
|
||||
sum = sum.wrapping_add(b);
|
||||
}
|
||||
self.checksum = sum;
|
||||
}
|
||||
|
||||
/// Verify checksum
|
||||
pub fn verify_checksum(&self) -> bool {
|
||||
let mut sum = self.header.checksum();
|
||||
for &b in &self.payload {
|
||||
sum = sum.wrapping_add(b);
|
||||
}
|
||||
sum == self.checksum
|
||||
}
|
||||
|
||||
/// Serialize to bytes
|
||||
pub fn to_bytes(&self) -> HVec<u8, { MAX_PAYLOAD_SIZE + 16 }> {
|
||||
let mut bytes = HVec::new();
|
||||
|
||||
// Header
|
||||
for b in self.header.to_bytes() {
|
||||
let _ = bytes.push(b);
|
||||
}
|
||||
|
||||
// Payload
|
||||
for &b in &self.payload {
|
||||
let _ = bytes.push(b);
|
||||
}
|
||||
|
||||
// Checksum
|
||||
let _ = bytes.push(self.checksum);
|
||||
|
||||
bytes
|
||||
}
|
||||
|
||||
/// Deserialize from bytes
|
||||
pub fn from_bytes(bytes: &[u8]) -> crate::Result<Self> {
|
||||
if bytes.len() < MessageHeader::SIZE + 1 {
|
||||
return Err(crate::Error::InvalidModel("Message too short"));
|
||||
}
|
||||
|
||||
let header = MessageHeader::from_bytes(bytes)
|
||||
.ok_or(crate::Error::InvalidModel("Invalid header"))?;
|
||||
|
||||
let payload_end = MessageHeader::SIZE + header.payload_len as usize;
|
||||
if bytes.len() < payload_end + 1 {
|
||||
return Err(crate::Error::InvalidModel("Payload incomplete"));
|
||||
}
|
||||
|
||||
let mut payload = HVec::new();
|
||||
for &b in &bytes[MessageHeader::SIZE..payload_end] {
|
||||
payload.push(b).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
}
|
||||
|
||||
let checksum = bytes[payload_end];
|
||||
|
||||
let msg = Self {
|
||||
header,
|
||||
payload,
|
||||
checksum,
|
||||
};
|
||||
|
||||
if !msg.verify_checksum() {
|
||||
return Err(crate::Error::InvalidModel("Checksum mismatch"));
|
||||
}
|
||||
|
||||
Ok(msg)
|
||||
}
|
||||
|
||||
/// Extract activation data from payload
|
||||
pub fn get_activation_data(&self) -> Option<(u8, u16, &[u8])> {
|
||||
if self.header.msg_type != MessageType::Activation as u8 {
|
||||
return None;
|
||||
}
|
||||
if self.payload.len() < 3 {
|
||||
return None;
|
||||
}
|
||||
|
||||
let layer_idx = self.payload[0];
|
||||
let position = (self.payload[1] as u16) | ((self.payload[2] as u16) << 8);
|
||||
let data = &self.payload[3..];
|
||||
|
||||
Some((layer_idx, position, data))
|
||||
}
|
||||
|
||||
/// Extract token from payload
|
||||
pub fn get_token(&self) -> Option<u16> {
|
||||
if self.header.msg_type != MessageType::Token as u8 {
|
||||
return None;
|
||||
}
|
||||
if self.payload.len() < 2 {
|
||||
return None;
|
||||
}
|
||||
|
||||
Some((self.payload[0] as u16) | ((self.payload[1] as u16) << 8))
|
||||
}
|
||||
}
|
||||
|
||||
/// Counters tracking inter-chip link traffic and health.
#[derive(Debug, Default, Clone)]
pub struct CommStats {
    /// Total messages transmitted.
    pub messages_sent: u32,
    /// Total messages received.
    pub messages_received: u32,
    /// Total bytes transmitted.
    pub bytes_sent: u32,
    /// Total bytes received.
    pub bytes_received: u32,
    /// Messages rejected due to checksum mismatch.
    pub checksum_errors: u32,
    /// Receive operations that timed out.
    pub timeouts: u32,
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_message_header() {
        let original =
            MessageHeader::new(MessageType::Activation, ChipId(0), ChipId(1), 42, 100);

        let decoded = MessageHeader::from_bytes(&original.to_bytes()).unwrap();

        assert_eq!(decoded.msg_type, MessageType::Activation as u8);
        assert_eq!(decoded.src, 0);
        assert_eq!(decoded.dst, 1);
        // Copy packed fields by value; references into a packed struct
        // would be unaligned (UB).
        let seq = decoded.seq;
        let payload_len = decoded.payload_len;
        assert_eq!(seq, 42);
        assert_eq!(payload_len, 100);
    }

    #[test]
    fn test_activation_message() {
        let activations: [i8; 8] = [1, 2, 3, 4, 5, 6, 7, 8];
        let sent =
            FederationMessage::activation(ChipId(0), ChipId(1), 1, 0, 10, &activations)
                .unwrap();

        // Round-trip through the wire format.
        let received = FederationMessage::from_bytes(&sent.to_bytes()).unwrap();

        let (layer, pos, payload) = received.get_activation_data().unwrap();
        assert_eq!(layer, 0);
        assert_eq!(pos, 10);
        assert_eq!(payload.len(), 8);
    }

    #[test]
    fn test_token_message() {
        let sent = FederationMessage::token(ChipId(4), ChipId(0), 100, 12345);

        let received = FederationMessage::from_bytes(&sent.to_bytes()).unwrap();

        assert_eq!(received.get_token(), Some(12345));
    }
}
|
||||
143
vendor/ruvector/examples/ruvLLM/esp32/src/federation/sharding.rs
vendored
Normal file
143
vendor/ruvector/examples/ruvLLM/esp32/src/federation/sharding.rs
vendored
Normal file
@@ -0,0 +1,143 @@
|
||||
//! Embedding Sharding - Distribute Vocabulary Across Chips
|
||||
//!
|
||||
//! For large vocabularies, shard embeddings across chips.
|
||||
//! Each chip holds a portion of the embedding table.
|
||||
|
||||
use heapless::Vec as HVec;
|
||||
use super::protocol::ChipId;
|
||||
|
||||
/// Configuration describing how the vocabulary is split across chips.
///
/// The vocabulary is divided into `num_shards` contiguous ranges of
/// ceiling-divided size; this struct records the range owned by one shard.
#[derive(Debug, Clone)]
pub struct ShardConfig {
    /// Total vocabulary size across all shards.
    pub vocab_size: usize,
    /// Number of shards (chips).
    pub num_shards: usize,
    /// This chip's shard ID.
    pub shard_id: usize,
    /// Embedding dimension.
    pub embed_dim: usize,
    /// First token id owned by this shard (inclusive).
    pub vocab_start: usize,
    /// One past the last token id owned by this shard (exclusive).
    pub vocab_end: usize,
}

impl ShardConfig {
    /// Create the config for one shard.
    ///
    /// `vocab_start` is clamped to `vocab_size` so that an out-of-range
    /// `shard_id` yields an empty range rather than `vocab_start > vocab_end`,
    /// which would make `shard_vocab_size` underflow (panic in debug builds).
    pub fn for_shard(
        shard_id: usize,
        num_shards: usize,
        vocab_size: usize,
        embed_dim: usize,
    ) -> Self {
        // Ceiling division so the last shard absorbs any remainder.
        let vocab_per_shard = (vocab_size + num_shards - 1) / num_shards;
        let vocab_start = (shard_id * vocab_per_shard).min(vocab_size);
        let vocab_end = (vocab_start + vocab_per_shard).min(vocab_size);

        Self {
            vocab_size,
            num_shards,
            shard_id,
            embed_dim,
            vocab_start,
            vocab_end,
        }
    }

    /// True when this shard owns `token_id`'s embedding row.
    pub fn handles_token(&self, token_id: u16) -> bool {
        let t = token_id as usize;
        t >= self.vocab_start && t < self.vocab_end
    }

    /// Shard index responsible for `token_id`.
    pub fn shard_for_token(token_id: u16, num_shards: usize, vocab_size: usize) -> usize {
        let vocab_per_shard = (vocab_size + num_shards - 1) / num_shards;
        (token_id as usize) / vocab_per_shard
    }

    /// Number of vocabulary entries stored on this shard.
    pub fn shard_vocab_size(&self) -> usize {
        self.vocab_end - self.vocab_start
    }
}
|
||||
|
||||
/// Sharded embedding table
|
||||
pub struct ShardedEmbedding<const MAX_VOCAB: usize, const DIM: usize> {
|
||||
config: ShardConfig,
|
||||
/// Local embedding weights (only our shard)
|
||||
weights: HVec<i8, 8192>, // Max 8KB per shard
|
||||
}
|
||||
|
||||
impl<const MAX_VOCAB: usize, const DIM: usize> ShardedEmbedding<MAX_VOCAB, DIM> {
|
||||
/// Create sharded embedding
|
||||
pub fn new(config: ShardConfig, seed: u32) -> crate::Result<Self> {
|
||||
let shard_size = config.shard_vocab_size() * config.embed_dim;
|
||||
|
||||
let mut weights = HVec::new();
|
||||
let mut rng_state = seed.wrapping_add(config.shard_id as u32 * 12345);
|
||||
|
||||
for _ in 0..shard_size {
|
||||
rng_state = rng_state.wrapping_mul(1103515245).wrapping_add(12345);
|
||||
let val = (((rng_state >> 16) & 0xFF) as i16 - 128) as i8;
|
||||
weights.push(val).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
}
|
||||
|
||||
Ok(Self { config, weights })
|
||||
}
|
||||
|
||||
/// Lookup embedding (only works if we have the token)
|
||||
pub fn lookup(&self, token_id: u16, output: &mut [i8]) -> crate::Result<bool> {
|
||||
if !self.config.handles_token(token_id) {
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
let local_idx = token_id as usize - self.config.vocab_start;
|
||||
let start = local_idx * self.config.embed_dim;
|
||||
let end = start + self.config.embed_dim;
|
||||
|
||||
if end > self.weights.len() || output.len() < self.config.embed_dim {
|
||||
return Err(crate::Error::BufferOverflow);
|
||||
}
|
||||
|
||||
output[..self.config.embed_dim].copy_from_slice(&self.weights[start..end]);
|
||||
Ok(true)
|
||||
}
|
||||
|
||||
/// Memory per shard vs full embedding
|
||||
pub fn memory_saved(&self) -> f32 {
|
||||
self.config.num_shards as f32
|
||||
}
|
||||
|
||||
/// Get responsible chip for a token
|
||||
pub fn responsible_chip(&self, token_id: u16) -> ChipId {
|
||||
let shard = ShardConfig::shard_for_token(
|
||||
token_id,
|
||||
self.config.num_shards,
|
||||
self.config.vocab_size,
|
||||
);
|
||||
ChipId(shard as u8)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_sharding() {
        // Shard 2 of 5 over a 1000-token vocabulary owns [400, 600).
        let cfg = ShardConfig::for_shard(2, 5, 1000, 32);

        assert_eq!(cfg.vocab_start, 400);
        assert_eq!(cfg.vocab_end, 600);
        assert!(cfg.handles_token(450));
        assert!(!cfg.handles_token(300));
    }

    #[test]
    fn test_shard_lookup() {
        assert_eq!(ShardConfig::shard_for_token(450, 5, 1000), 2);
    }
}
|
||||
294
vendor/ruvector/examples/ruvLLM/esp32/src/federation/speculative.rs
vendored
Normal file
294
vendor/ruvector/examples/ruvLLM/esp32/src/federation/speculative.rs
vendored
Normal file
@@ -0,0 +1,294 @@
|
||||
//! Speculative Decoding - Draft and Verify
|
||||
//!
|
||||
//! Use a smaller/faster model to draft tokens, verify with larger model.
|
||||
//! Perfect for federated setup: one chip drafts, others verify in parallel.
|
||||
//!
|
||||
//! # Benefits
|
||||
//! - 2-4x speedup for autoregressive generation
|
||||
//! - Maintains exact output quality
|
||||
//! - Natural fit for multi-chip setup
|
||||
|
||||
use heapless::Vec as HVec;
|
||||
use super::protocol::{ChipId, FederationMessage};
|
||||
|
||||
/// Maximum draft tokens per batch
|
||||
pub const MAX_DRAFT_TOKENS: usize = 8;
|
||||
|
||||
/// Speculative decoding configuration
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct DraftVerifyConfig {
|
||||
/// Number of draft tokens to generate
|
||||
pub draft_length: usize,
|
||||
/// Acceptance threshold (0.0-1.0)
|
||||
pub acceptance_threshold: f32,
|
||||
/// Draft chip ID (usually chip 0)
|
||||
pub draft_chip: ChipId,
|
||||
/// Verify chips (all others)
|
||||
pub verify_chips: HVec<ChipId, 4>,
|
||||
/// Enable adaptive draft length
|
||||
pub adaptive: bool,
|
||||
}
|
||||
|
||||
impl Default for DraftVerifyConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
draft_length: 4,
|
||||
acceptance_threshold: 0.9,
|
||||
draft_chip: ChipId(0),
|
||||
verify_chips: HVec::new(),
|
||||
adaptive: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl DraftVerifyConfig {
|
||||
/// Create config for 5-chip setup
|
||||
pub fn for_five_chips() -> Self {
|
||||
let mut verify_chips = HVec::new();
|
||||
for i in 1..5 {
|
||||
let _ = verify_chips.push(ChipId(i));
|
||||
}
|
||||
|
||||
Self {
|
||||
draft_length: 4,
|
||||
acceptance_threshold: 0.9,
|
||||
draft_chip: ChipId(0),
|
||||
verify_chips,
|
||||
adaptive: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Draft result from drafting chip
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct DraftResult {
|
||||
/// Draft token IDs
|
||||
pub tokens: HVec<u16, MAX_DRAFT_TOKENS>,
|
||||
/// Draft token probabilities (fixed-point, 0-255)
|
||||
pub probs: HVec<u8, MAX_DRAFT_TOKENS>,
|
||||
/// Starting position
|
||||
pub start_pos: u16,
|
||||
}
|
||||
|
||||
/// Verification result from verifying chip
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct VerifyResult {
|
||||
/// Number of accepted tokens
|
||||
pub accepted_count: usize,
|
||||
/// Correct token for first rejection (if any)
|
||||
pub correction: Option<u16>,
|
||||
/// Verification probabilities
|
||||
pub verify_probs: HVec<u8, MAX_DRAFT_TOKENS>,
|
||||
}
|
||||
|
||||
/// Speculative decoder
|
||||
pub struct SpeculativeDecoder {
|
||||
config: DraftVerifyConfig,
|
||||
/// Is this the draft chip?
|
||||
is_draft_chip: bool,
|
||||
/// Current acceptance rate (for adaptive)
|
||||
acceptance_rate: f32,
|
||||
/// Draft tokens waiting for verification
|
||||
pending_draft: Option<DraftResult>,
|
||||
/// Statistics
|
||||
stats: SpecStats,
|
||||
}
|
||||
|
||||
impl SpeculativeDecoder {
|
||||
/// Create for a specific chip
|
||||
pub fn new(config: DraftVerifyConfig, chip_id: ChipId) -> Self {
|
||||
let is_draft_chip = chip_id == config.draft_chip;
|
||||
|
||||
Self {
|
||||
config,
|
||||
is_draft_chip,
|
||||
acceptance_rate: 0.9,
|
||||
pending_draft: None,
|
||||
stats: SpecStats::default(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Check if this is the drafting chip
|
||||
pub fn is_drafter(&self) -> bool {
|
||||
self.is_draft_chip
|
||||
}
|
||||
|
||||
/// Submit draft tokens (drafter only)
|
||||
pub fn submit_draft(&mut self, draft: DraftResult) -> crate::Result<FederationMessage> {
|
||||
if !self.is_draft_chip {
|
||||
return Err(crate::Error::UnsupportedFeature("Not draft chip"));
|
||||
}
|
||||
|
||||
// Create message to broadcast to verify chips
|
||||
let tokens: Vec<u16> = draft.tokens.iter().cloned().collect();
|
||||
let msg = FederationMessage::draft_tokens(
|
||||
self.config.draft_chip,
|
||||
ChipId::BROADCAST,
|
||||
draft.start_pos,
|
||||
&tokens,
|
||||
)?;
|
||||
|
||||
self.pending_draft = Some(draft);
|
||||
self.stats.drafts_sent += 1;
|
||||
|
||||
Ok(msg)
|
||||
}
|
||||
|
||||
/// Verify draft tokens (verifier only)
|
||||
pub fn verify_draft<F>(
|
||||
&mut self,
|
||||
draft: &DraftResult,
|
||||
mut get_prob: F,
|
||||
) -> VerifyResult
|
||||
where
|
||||
F: FnMut(u16, u16) -> u8, // (position, token) -> probability
|
||||
{
|
||||
let mut accepted_count = 0;
|
||||
let mut correction = None;
|
||||
let mut verify_probs = HVec::new();
|
||||
|
||||
for (i, &token) in draft.tokens.iter().enumerate() {
|
||||
let pos = draft.start_pos + i as u16;
|
||||
let verify_prob = get_prob(pos, token);
|
||||
let _ = verify_probs.push(verify_prob);
|
||||
|
||||
let draft_prob = draft.probs.get(i).copied().unwrap_or(128);
|
||||
|
||||
// Acceptance criterion: verify_prob >= draft_prob * threshold
|
||||
let threshold = (draft_prob as f32 * self.config.acceptance_threshold) as u8;
|
||||
|
||||
if verify_prob >= threshold {
|
||||
accepted_count += 1;
|
||||
} else {
|
||||
// Rejection - sample correct token
|
||||
// In real impl, would sample from verify distribution
|
||||
correction = Some(token.wrapping_add(1)); // Placeholder
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
VerifyResult {
|
||||
accepted_count,
|
||||
correction,
|
||||
verify_probs,
|
||||
}
|
||||
}
|
||||
|
||||
/// Process verification result (drafter)
|
||||
pub fn process_verification(&mut self, result: &VerifyResult) -> HVec<u16, MAX_DRAFT_TOKENS> {
|
||||
let mut accepted_tokens = HVec::new();
|
||||
|
||||
if let Some(ref draft) = self.pending_draft {
|
||||
// Accept tokens up to rejection point
|
||||
for i in 0..result.accepted_count {
|
||||
if let Some(&token) = draft.tokens.get(i) {
|
||||
let _ = accepted_tokens.push(token);
|
||||
}
|
||||
}
|
||||
|
||||
// Add correction if any
|
||||
if let Some(correct_token) = result.correction {
|
||||
let _ = accepted_tokens.push(correct_token);
|
||||
}
|
||||
|
||||
self.stats.tokens_accepted += result.accepted_count;
|
||||
self.stats.tokens_rejected += draft.tokens.len() - result.accepted_count;
|
||||
|
||||
// Update acceptance rate
|
||||
let batch_rate = result.accepted_count as f32 / draft.tokens.len() as f32;
|
||||
self.acceptance_rate = 0.9 * self.acceptance_rate + 0.1 * batch_rate;
|
||||
}
|
||||
|
||||
self.pending_draft = None;
|
||||
accepted_tokens
|
||||
}
|
||||
|
||||
/// Get adaptive draft length based on acceptance rate
|
||||
pub fn adaptive_draft_length(&self) -> usize {
|
||||
if !self.config.adaptive {
|
||||
return self.config.draft_length;
|
||||
}
|
||||
|
||||
// Higher acceptance -> longer drafts
|
||||
if self.acceptance_rate > 0.95 {
|
||||
(self.config.draft_length + 2).min(MAX_DRAFT_TOKENS)
|
||||
} else if self.acceptance_rate > 0.8 {
|
||||
self.config.draft_length
|
||||
} else if self.acceptance_rate > 0.5 {
|
||||
(self.config.draft_length - 1).max(1)
|
||||
} else {
|
||||
1 // Fall back to no speculation
|
||||
}
|
||||
}
|
||||
|
||||
/// Get speedup estimate
|
||||
pub fn estimated_speedup(&self) -> f32 {
|
||||
// Speedup = accepted_tokens / (1 + verify_overhead)
|
||||
let avg_accepted = self.acceptance_rate * self.adaptive_draft_length() as f32;
|
||||
let verify_overhead = 0.2; // Verification overhead
|
||||
avg_accepted / (1.0 + verify_overhead)
|
||||
}
|
||||
|
||||
/// Get statistics
|
||||
pub fn stats(&self) -> &SpecStats {
|
||||
&self.stats
|
||||
}
|
||||
}
|
||||
|
||||
/// Lifetime counters for speculative decoding.
#[derive(Debug, Default, Clone)]
pub struct SpecStats {
    /// Draft batches transmitted.
    pub drafts_sent: usize,
    /// Draft tokens ultimately accepted.
    pub tokens_accepted: usize,
    /// Draft tokens ultimately rejected.
    pub tokens_rejected: usize,
}

impl SpecStats {
    /// Fraction of all drafted tokens that were accepted (0.0 when no
    /// tokens have been processed yet).
    pub fn acceptance_rate(&self) -> f32 {
        match self.tokens_accepted + self.tokens_rejected {
            0 => 0.0,
            total => self.tokens_accepted as f32 / total as f32,
        }
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_speculative_config() {
        let cfg = DraftVerifyConfig::for_five_chips();

        assert_eq!(cfg.draft_chip, ChipId(0));
        assert_eq!(cfg.verify_chips.len(), 4);
    }

    #[test]
    fn test_verify_draft() {
        let mut decoder = SpeculativeDecoder::new(DraftVerifyConfig::default(), ChipId(1));

        let mut draft = DraftResult {
            tokens: HVec::new(),
            probs: HVec::new(),
            start_pos: 0,
        };
        for (token, prob) in [(100u16, 200u8), (101, 200)] {
            let _ = draft.tokens.push(token);
            let _ = draft.probs.push(prob);
        }

        let outcome = decoder.verify_draft(&draft, |_pos, _token| 190);

        // Verifier prob 190 clears the bar for both: 200 * 0.9 = 180.
        assert_eq!(outcome.accepted_count, 2);
        assert!(outcome.correction.is_none());
    }
}
|
||||
144
vendor/ruvector/examples/ruvLLM/esp32/src/federation/tensor_parallel.rs
vendored
Normal file
144
vendor/ruvector/examples/ruvLLM/esp32/src/federation/tensor_parallel.rs
vendored
Normal file
@@ -0,0 +1,144 @@
|
||||
//! Tensor Parallelism - Distributed Attention Heads
|
||||
//!
|
||||
//! Splits attention heads across chips for parallel computation.
|
||||
//! Each chip handles a subset of heads, then results are combined.
|
||||
|
||||
use heapless::Vec as HVec;
|
||||
use super::protocol::{ChipId, FederationMessage};
|
||||
|
||||
/// Maximum heads per chip
|
||||
pub const MAX_HEADS_PER_CHIP: usize = 4;
|
||||
|
||||
/// Tensor parallel configuration
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct TPConfig {
|
||||
/// Number of chips
|
||||
pub num_chips: usize,
|
||||
/// This chip's ID
|
||||
pub chip_id: ChipId,
|
||||
/// Total attention heads
|
||||
pub total_heads: usize,
|
||||
/// Heads handled by this chip
|
||||
pub my_heads: HVec<usize, MAX_HEADS_PER_CHIP>,
|
||||
/// Embedding dimension per head
|
||||
pub head_dim: usize,
|
||||
}
|
||||
|
||||
impl TPConfig {
|
||||
/// Create config distributing heads across chips
|
||||
pub fn distribute_heads(
|
||||
chip_id: usize,
|
||||
num_chips: usize,
|
||||
total_heads: usize,
|
||||
head_dim: usize,
|
||||
) -> Self {
|
||||
let mut my_heads = HVec::new();
|
||||
|
||||
// Assign heads round-robin style
|
||||
for h in 0..total_heads {
|
||||
if h % num_chips == chip_id {
|
||||
let _ = my_heads.push(h);
|
||||
}
|
||||
}
|
||||
|
||||
Self {
|
||||
num_chips,
|
||||
chip_id: ChipId(chip_id as u8),
|
||||
total_heads,
|
||||
my_heads,
|
||||
head_dim,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Tensor parallel attention node
|
||||
pub struct TensorParallelNode {
|
||||
config: TPConfig,
|
||||
/// Partial attention outputs from each head
|
||||
partial_outputs: HVec<HVec<i32, 64>, MAX_HEADS_PER_CHIP>,
|
||||
/// Combined output buffer
|
||||
output_buffer: HVec<i32, 256>,
|
||||
}
|
||||
|
||||
impl TensorParallelNode {
|
||||
pub fn new(config: TPConfig) -> Self {
|
||||
Self {
|
||||
config,
|
||||
partial_outputs: HVec::new(),
|
||||
output_buffer: HVec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get heads this chip handles
|
||||
pub fn my_heads(&self) -> &[usize] {
|
||||
&self.config.my_heads
|
||||
}
|
||||
|
||||
/// Compute partial attention for assigned heads
|
||||
pub fn compute_partial_attention(
|
||||
&mut self,
|
||||
query: &[i8],
|
||||
keys: &[&[i8]],
|
||||
values: &[&[i8]],
|
||||
) -> crate::Result<()> {
|
||||
self.partial_outputs.clear();
|
||||
|
||||
for &head_idx in &self.config.my_heads {
|
||||
let mut head_output = HVec::new();
|
||||
|
||||
// Compute Q @ K^T for this head
|
||||
let head_start = head_idx * self.config.head_dim;
|
||||
let head_end = head_start + self.config.head_dim;
|
||||
|
||||
// Simplified attention: just dot product for now
|
||||
for &val in &values[0][head_start..head_end.min(values[0].len())] {
|
||||
head_output.push(val as i32).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
}
|
||||
|
||||
self.partial_outputs.push(head_output).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Create message with partial results
|
||||
pub fn create_partial_result_message(&self, dst: ChipId, seq: u16) -> crate::Result<FederationMessage> {
|
||||
let mut data: Vec<i8> = Vec::new();
|
||||
|
||||
for partial in &self.partial_outputs {
|
||||
for &val in partial {
|
||||
data.push((val >> 8) as i8); // Scale down
|
||||
}
|
||||
}
|
||||
|
||||
FederationMessage::activation(
|
||||
self.config.chip_id,
|
||||
dst,
|
||||
seq,
|
||||
0, // Not layer-based
|
||||
0,
|
||||
&data,
|
||||
)
|
||||
}
|
||||
|
||||
/// Memory saved vs single-chip
|
||||
pub fn memory_reduction(&self) -> f32 {
|
||||
self.config.num_chips as f32
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_head_distribution() {
        // Four heads spread round-robin over five chips: one head each,
        // fifth chip left idle.
        let chip0 = TPConfig::distribute_heads(0, 5, 4, 16);
        let chip1 = TPConfig::distribute_heads(1, 5, 4, 16);

        assert_eq!(chip0.my_heads.as_slice(), &[0]);
        assert_eq!(chip1.my_heads.as_slice(), &[1]);
    }
}
|
||||
165
vendor/ruvector/examples/ruvLLM/esp32/src/lib.rs
vendored
Normal file
165
vendor/ruvector/examples/ruvLLM/esp32/src/lib.rs
vendored
Normal file
@@ -0,0 +1,165 @@
|
||||
//! RuvLLM ESP32 - Tiny LLM Inference for Microcontrollers
|
||||
//!
|
||||
//! This crate provides a minimal inference engine designed for ESP32 and similar
|
||||
//! resource-constrained microcontrollers.
|
||||
//!
|
||||
//! # Constraints
|
||||
//! - ~520KB SRAM available
|
||||
//! - 4-16MB flash for model storage
|
||||
//! - No floating-point unit on base ESP32 (ESP32-S3 has one)
|
||||
//! - Single/dual core @ 240MHz
|
||||
//!
|
||||
//! # Features
|
||||
//! - INT8 quantized inference
|
||||
//! - Fixed-point arithmetic option
|
||||
//! - Tiny transformer blocks
|
||||
//! - Memory-mapped model loading
|
||||
//! - Optional ESP32-S3 SIMD acceleration
|
||||
|
||||
#![cfg_attr(feature = "no_std", no_std)]
|
||||
|
||||
#[cfg(feature = "no_std")]
|
||||
extern crate alloc;
|
||||
|
||||
#[cfg(feature = "no_std")]
|
||||
use alloc::{vec, vec::Vec};
|
||||
|
||||
pub mod micro_inference;
|
||||
pub mod quantized;
|
||||
pub mod model;
|
||||
pub mod attention;
|
||||
pub mod embedding;
|
||||
pub mod optimizations;
|
||||
pub mod ota;
|
||||
pub mod benchmark;
|
||||
pub mod diagnostics;
|
||||
pub mod models;
|
||||
|
||||
#[cfg(feature = "federation")]
|
||||
pub mod federation;
|
||||
|
||||
// RuVector integration (vector database capabilities)
|
||||
#[cfg(feature = "federation")]
|
||||
pub mod ruvector;
|
||||
|
||||
// Re-exports
|
||||
pub use micro_inference::{MicroEngine, InferenceConfig, InferenceResult};
|
||||
pub use quantized::{QuantizedTensor, QuantizationType};
|
||||
pub use model::{TinyModel, ModelConfig};
|
||||
|
||||
// Optimization re-exports
|
||||
pub use optimizations::{
|
||||
BinaryVector, BinaryEmbedding, hamming_distance, hamming_similarity,
|
||||
ProductQuantizer, PQCode,
|
||||
SoftmaxLUT, ExpLUT, DistanceLUT,
|
||||
MicroLoRA, LoRAConfig,
|
||||
SparseAttention, AttentionPattern,
|
||||
LayerPruner, PruningConfig,
|
||||
};
|
||||
|
||||
// Federation re-exports (optional)
|
||||
#[cfg(feature = "federation")]
|
||||
pub use federation::{
|
||||
FederationConfig, FederationMode, FederationSpeedup,
|
||||
PipelineNode, PipelineConfig, PipelineRole,
|
||||
FederationMessage, MessageType, ChipId,
|
||||
FederationCoordinator, ClusterTopology,
|
||||
MicroFastGRNN, MicroGRNNConfig,
|
||||
SpeculativeDecoder, DraftVerifyConfig,
|
||||
};
|
||||
|
||||
/// ESP32 family variants with their memory and ISA capabilities.
#[derive(Debug, Clone, Copy)]
pub enum Esp32Variant {
    /// Original ESP32: 520KB SRAM
    Esp32,
    /// ESP32-S2: 320KB SRAM
    Esp32S2,
    /// ESP32-S3: 512KB SRAM + vector instructions
    Esp32S3,
    /// ESP32-C3: 400KB SRAM, RISC-V
    Esp32C3,
    /// ESP32-C6: 512KB SRAM, RISC-V + WiFi 6
    Esp32C6,
}

impl Esp32Variant {
    /// On-chip SRAM, in bytes.
    pub const fn sram_bytes(&self) -> usize {
        let kib = match self {
            Self::Esp32 => 520,
            Self::Esp32S2 => 320,
            Self::Esp32S3 => 512,
            Self::Esp32C3 => 400,
            Self::Esp32C6 => 512,
        };
        kib * 1024
    }

    /// True when the variant has a hardware floating-point unit.
    pub const fn has_fpu(&self) -> bool {
        match self {
            Self::Esp32S3 => true,
            Self::Esp32 | Self::Esp32S2 | Self::Esp32C3 | Self::Esp32C6 => false,
        }
    }

    /// True when the variant has vector/SIMD extensions.
    pub const fn has_simd(&self) -> bool {
        matches!(self, Self::Esp32S3)
    }

    /// SRAM budget available for model weights after reserving ~200KB
    /// for the runtime (saturates at zero for small-memory parts).
    pub const fn max_model_ram(&self) -> usize {
        self.sram_bytes().saturating_sub(200 * 1024)
    }
}
|
||||
|
||||
/// Error types for ESP32 inference
///
/// Variants carry `&'static str` payloads so no allocation is needed in
/// no_std builds.
#[derive(Debug, Clone)]
pub enum Error {
    /// Model too large for available memory
    ModelTooLarge { required: usize, available: usize },
    /// Invalid model format
    InvalidModel(&'static str),
    /// Quantization error
    QuantizationError(&'static str),
    /// Buffer overflow
    BufferOverflow,
    /// Inference failed
    InferenceFailed(&'static str),
    /// Feature not supported on this variant
    UnsupportedFeature(&'static str),
}
|
||||
|
||||
impl core::fmt::Display for Error {
|
||||
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
|
||||
match self {
|
||||
Error::ModelTooLarge { required, available } => {
|
||||
write!(f, "Model too large: requires {} bytes, only {} available", required, available)
|
||||
}
|
||||
Error::InvalidModel(msg) => write!(f, "Invalid model: {}", msg),
|
||||
Error::QuantizationError(msg) => write!(f, "Quantization error: {}", msg),
|
||||
Error::BufferOverflow => write!(f, "Buffer overflow"),
|
||||
Error::InferenceFailed(msg) => write!(f, "Inference failed: {}", msg),
|
||||
Error::UnsupportedFeature(msg) => write!(f, "Unsupported feature: {}", msg),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// std::error::Error requires std, so this impl only exists for host testing;
// embedded (no_std) builds rely on Display/Debug alone.
#[cfg(feature = "host-test")]
impl std::error::Error for Error {}
|
||||
|
||||
/// Crate-wide result alias with [`Error`] as the failure case.
pub type Result<T> = core::result::Result<T, Error>;
|
||||
|
||||
/// Prelude for common imports
///
/// `use ruvllm_esp32::prelude::*;` pulls in the core engine, config,
/// model, and error types needed by most callers.
pub mod prelude {
    pub use crate::{
        MicroEngine, InferenceConfig, InferenceResult,
        QuantizedTensor, QuantizationType,
        TinyModel, ModelConfig,
        Esp32Variant, Error, Result,
    };
}
|
||||
360
vendor/ruvector/examples/ruvLLM/esp32/src/main.rs
vendored
Normal file
360
vendor/ruvector/examples/ruvLLM/esp32/src/main.rs
vendored
Normal file
@@ -0,0 +1,360 @@
|
||||
//! RuvLLM ESP32 Demo Application
|
||||
//!
|
||||
//! Demonstrates tiny LLM inference on ESP32 microcontrollers.
|
||||
|
||||
#![cfg_attr(feature = "no_std", no_std)]
|
||||
#![cfg_attr(feature = "no_std", no_main)]
|
||||
|
||||
#[cfg(feature = "esp32-std")]
|
||||
use esp_idf_svc::hal::prelude::*;
|
||||
|
||||
#[cfg(feature = "no_std")]
|
||||
extern crate alloc;
|
||||
|
||||
// For host testing, import from crate
|
||||
#[cfg(feature = "host-test")]
|
||||
use ruvllm_esp32::prelude::*;
|
||||
#[cfg(feature = "host-test")]
|
||||
use ruvllm_esp32::model::ModelConfig;
|
||||
#[cfg(feature = "host-test")]
|
||||
use ruvllm_esp32::embedding::SimpleTokenizer;
|
||||
|
||||
// For ESP32 builds
|
||||
#[cfg(feature = "esp32-std")]
|
||||
use ruvllm_esp32::prelude::*;
|
||||
#[cfg(feature = "esp32-std")]
|
||||
use ruvllm_esp32::model::ModelConfig;
|
||||
#[cfg(feature = "esp32-std")]
|
||||
use ruvllm_esp32::embedding::SimpleTokenizer;
|
||||
|
||||
/// ESP32 (esp-idf) entry point: sizes a model to the detected variant,
/// reports memory use, then runs the benchmark and interactive demos.
#[cfg(feature = "esp32-std")]
fn main() -> anyhow::Result<()> {
    // Initialize ESP-IDF (link_patches must run before any other ESP-IDF call)
    esp_idf_svc::sys::link_patches();
    esp_idf_svc::log::EspLogger::initialize_default();

    log::info!("=== RuvLLM ESP32 Demo ===");
    log::info!("Initializing...");

    // Detect ESP32 variant and create appropriate model
    let variant = detect_variant();
    log::info!("Detected variant: {:?}", variant);
    log::info!("Available RAM: {} KB", variant.sram_bytes() / 1024);
    log::info!("Max model RAM: {} KB", variant.max_model_ram() / 1024);

    // Create model config for this variant
    let config = ModelConfig::for_variant(variant);
    log::info!("Model config:");
    log::info!("  Vocab size: {}", config.vocab_size);
    log::info!("  Embed dim: {}", config.embed_dim);
    log::info!("  Hidden dim: {}", config.hidden_dim);
    log::info!("  Layers: {}", config.num_layers);
    log::info!("  Heads: {}", config.num_heads);
    log::info!("  Estimated size: {} KB", config.estimate_size() / 1024);

    // Create the model (fails with ModelTooLarge if it won't fit)
    log::info!("Creating model...");
    let model = TinyModel::new(config)?;
    log::info!("Model created, actual size: {} KB", model.memory_size() / 1024);

    // Create inference engine
    log::info!("Creating inference engine...");
    let mut engine = MicroEngine::new(model)?;

    let usage = engine.memory_usage();
    log::info!("Memory usage breakdown:");
    log::info!("  Model weights: {} KB", usage.model_weights / 1024);
    log::info!("  Activation buffers: {} KB", usage.activation_buffers / 1024);
    log::info!("  KV cache: {} KB", usage.kv_cache / 1024);
    log::info!("  Total: {} KB", usage.total / 1024);

    // Run inference benchmark
    log::info!("Running inference benchmark...");
    run_benchmark(&mut engine)?;

    // Interactive demo (if UART available)
    log::info!("Starting interactive demo...");
    run_interactive(&mut engine)?;

    Ok(())
}
|
||||
|
||||
// Host test main function — mirrors the esp32-std main() above but prints
// to stdout instead of the ESP-IDF logger, so the demo can run on a PC.
#[cfg(feature = "host-test")]
fn main() -> anyhow::Result<()> {
    println!("=== RuvLLM ESP32 Demo (Host Simulation) ===");
    println!("Initializing...");

    // Detect ESP32 variant (simulated — fixed to the base Esp32 on host)
    let variant = Esp32Variant::Esp32;
    println!("Simulating variant: {:?}", variant);
    println!("Available RAM: {} KB", variant.sram_bytes() / 1024);
    println!("Max model RAM: {} KB", variant.max_model_ram() / 1024);

    // Create model config for this variant
    let config = ModelConfig::for_variant(variant);
    println!("Model config:");
    println!(" Vocab size: {}", config.vocab_size);
    println!(" Embed dim: {}", config.embed_dim);
    println!(" Hidden dim: {}", config.hidden_dim);
    println!(" Layers: {}", config.num_layers);
    println!(" Heads: {}", config.num_heads);
    println!(" Estimated size: {} KB", config.estimate_size() / 1024);

    // Create the model
    println!("Creating model...");
    let model = TinyModel::new(config)?;
    println!("Model created, actual size: {} KB", model.memory_size() / 1024);

    // Create inference engine
    println!("Creating inference engine...");
    let mut engine = MicroEngine::new(model)?;

    let usage = engine.memory_usage();
    println!("Memory usage breakdown:");
    println!(" Model weights: {} KB", usage.model_weights / 1024);
    println!(" Activation buffers: {} KB", usage.activation_buffers / 1024);
    println!(" KV cache: {} KB", usage.kv_cache / 1024);
    println!(" Total: {} KB", usage.total / 1024);

    // Run inference benchmark
    println!("\nRunning inference benchmark...");
    run_benchmark_host(&mut engine)?;

    // Interactive demo
    println!("\nStarting interactive demo...");
    run_interactive_host(&mut engine)?;

    Ok(())
}
|
||||
|
||||
/// Host-side benchmark: times `NUM_RUNS` short generations, prints averages,
/// and extrapolates an ESP32 estimate using a rough 15x slowdown factor.
#[cfg(feature = "host-test")]
fn run_benchmark_host(engine: &mut MicroEngine) -> anyhow::Result<()> {
    use std::time::Instant;

    let config = InferenceConfig {
        max_tokens: 10,
        greedy: true,
        ..Default::default()
    };

    // Warmup (excluded from the timed runs)
    println!("Warmup run...");
    let prompt = [1u16, 2, 3, 4, 5];
    let _ = engine.generate(&prompt, &config)?;
    engine.reset();

    // Benchmark runs
    const NUM_RUNS: usize = 10;
    let mut total_time_us = 0u64;
    let mut total_tokens = 0usize;

    println!("Running {} benchmark iterations...", NUM_RUNS);

    for i in 0..NUM_RUNS {
        let start = Instant::now();
        let result = engine.generate(&prompt, &config)?;
        let elapsed = start.elapsed();

        total_time_us += elapsed.as_micros() as u64;
        total_tokens += result.tokens.len();

        println!(
            " Run {}: {} tokens in {} us ({:.1} tok/s)",
            i + 1,
            result.tokens.len(),
            elapsed.as_micros(),
            result.tokens.len() as f32 / elapsed.as_secs_f32()
        );

        engine.reset();
    }

    let avg_time_us = total_time_us / NUM_RUNS as u64;
    let avg_tokens = total_tokens / NUM_RUNS;
    let tokens_per_sec = (avg_tokens as f32 * 1_000_000.0) / avg_time_us as f32;

    println!("=== Benchmark Results ===");
    println!("Average time: {} us", avg_time_us);
    println!("Average tokens: {}", avg_tokens);
    println!("Throughput: {:.1} tokens/sec", tokens_per_sec);
    // .max(1) guards the division when no tokens were generated
    println!("Latency per token: {:.1} us", avg_time_us as f32 / avg_tokens.max(1) as f32);

    // Estimate ESP32 performance (roughly 15x slower)
    let esp32_time_us = avg_time_us * 15;
    let esp32_tokens_per_sec = tokens_per_sec / 15.0;
    println!("\nEstimated ESP32 performance:");
    println!(" Time: {} us ({:.2} ms)", esp32_time_us, esp32_time_us as f32 / 1000.0);
    println!(" Throughput: {:.1} tokens/sec", esp32_tokens_per_sec);

    // Performance counters (accumulated from the last run since reset)
    let counters = engine.perf_counters();
    println!("\nPerformance counters:");
    println!(" Embeddings: {}", counters.embeddings);
    println!(" Attention ops: {}", counters.attention_ops);
    println!(" FFN ops: {}", counters.ffn_ops);

    Ok(())
}
|
||||
|
||||
/// Host-side interactive demo: feeds a few canned prompts through the
/// engine and prints the decoded completions.
#[cfg(feature = "host-test")]
fn run_interactive_host(engine: &mut MicroEngine) -> anyhow::Result<()> {
    let tokenizer = SimpleTokenizer::ascii();
    let cfg = InferenceConfig {
        max_tokens: 20,
        greedy: true,
        ..Default::default()
    };

    // Canned prompts stand in for real user input.
    for prompt in &["Hello", "The quick brown", "1 + 1 ="] {
        println!("Prompt: '{}'", prompt);

        // Encode prompt into a bounded token-id buffer.
        let prompt_ids: heapless::Vec<u16, 64> =
            tokenizer.encode(prompt).iter().copied().collect();

        engine.reset();
        let result = engine.generate(&prompt_ids, &cfg)?;

        let decoded = tokenizer.decode(&result.tokens);
        let text = core::str::from_utf8(&decoded).unwrap_or("<invalid>");

        println!("Generated: '{}'", text);
        println!("Tokens: {:?}", result.tokens.as_slice());
        println!("---");
    }

    Ok(())
}
|
||||
|
||||
// Bare-metal entry point used when neither host-test nor esp32-std is set.
// NOTE(review): currently a stub — spins forever; heap/runtime init is not
// implemented yet.
#[cfg(not(any(feature = "host-test", feature = "esp32-std")))]
#[no_mangle]
pub extern "C" fn main() -> ! {
    // Bare-metal entry point
    // Initialize heap, etc.
    loop {}
}
|
||||
|
||||
/// Detect ESP32 variant at runtime
///
/// NOTE(review): despite the name, detection is compile-time only — the
/// result is selected by the `esp32s3-simd` feature flag rather than by
/// reading the chip ID as the comment below intends.
fn detect_variant() -> Esp32Variant {
    // In real code, this would check chip ID
    // For now, default to ESP32
    #[cfg(feature = "esp32s3-simd")]
    return Esp32Variant::Esp32S3;

    #[cfg(not(feature = "esp32s3-simd"))]
    Esp32Variant::Esp32
}
|
||||
|
||||
/// Run inference benchmark
///
/// Times `NUM_RUNS` short generations and logs average latency/throughput,
/// plus the engine's performance counters.
///
/// # Errors
/// Propagates any error from `MicroEngine::generate`.
#[cfg(feature = "std")]
fn run_benchmark(engine: &mut MicroEngine) -> anyhow::Result<()> {
    use std::time::Instant;

    let config = InferenceConfig {
        max_tokens: 10,
        greedy: true,
        ..Default::default()
    };

    // Warmup (excluded from the timed runs)
    log::info!("Warmup run...");
    let prompt = [1u16, 2, 3, 4, 5];
    let _ = engine.generate(&prompt, &config)?;
    engine.reset();

    // Benchmark runs
    const NUM_RUNS: usize = 10;
    let mut total_time_us = 0u64;
    let mut total_tokens = 0usize;

    log::info!("Running {} benchmark iterations...", NUM_RUNS);

    for i in 0..NUM_RUNS {
        let start = Instant::now();
        let result = engine.generate(&prompt, &config)?;
        let elapsed = start.elapsed();

        total_time_us += elapsed.as_micros() as u64;
        total_tokens += result.tokens.len();

        log::info!(
            " Run {}: {} tokens in {} us ({:.1} tok/s)",
            i + 1,
            result.tokens.len(),
            elapsed.as_micros(),
            result.tokens.len() as f32 / elapsed.as_secs_f32()
        );

        engine.reset();
    }

    let avg_time_us = total_time_us / NUM_RUNS as u64;
    let avg_tokens = total_tokens / NUM_RUNS;
    // .max(1) guards the divisions when runs were instantaneous or produced
    // no tokens, matching run_benchmark_host's behavior.
    let tokens_per_sec = (avg_tokens as f32 * 1_000_000.0) / avg_time_us.max(1) as f32;

    log::info!("=== Benchmark Results ===");
    log::info!("Average time: {} us", avg_time_us);
    log::info!("Average tokens: {}", avg_tokens);
    log::info!("Throughput: {:.1} tokens/sec", tokens_per_sec);
    log::info!("Latency per token: {:.1} us", avg_time_us as f32 / avg_tokens.max(1) as f32);

    // Memory stats
    let counters = engine.perf_counters();
    log::info!("Performance counters:");
    log::info!(" Embeddings: {}", counters.embeddings);
    log::info!(" Attention ops: {}", counters.attention_ops);
    log::info!(" FFN ops: {}", counters.ffn_ops);

    Ok(())
}
|
||||
|
||||
/// Run interactive text generation
///
/// Feeds a few canned prompts through the engine and logs the decoded
/// completions (a stand-in for UART-driven user input).
#[cfg(feature = "std")]
fn run_interactive(engine: &mut MicroEngine) -> anyhow::Result<()> {
    let tokenizer = SimpleTokenizer::ascii();
    let config = InferenceConfig {
        max_tokens: 20,
        greedy: true,
        ..Default::default()
    };

    // Simple demo prompts
    let prompts = [
        "Hello",
        "The quick brown",
        "1 + 1 =",
    ];

    for prompt in &prompts {
        log::info!("Prompt: '{}'", prompt);

        // Encode into a bounded token-id buffer (max 64 prompt tokens)
        let tokens = tokenizer.encode(prompt);
        let prompt_ids: heapless::Vec<u16, 64> = tokens.iter().copied().collect();

        engine.reset();
        let result = engine.generate(&prompt_ids, &config)?;

        let output = tokenizer.decode(&result.tokens);
        let output_str = core::str::from_utf8(&output).unwrap_or("<invalid>");

        log::info!("Generated: '{}'", output_str);
        log::info!("Tokens: {:?}", result.tokens.as_slice());
        log::info!("---");
    }

    Ok(())
}
|
||||
|
||||
// Panic handler for no_std builds: halt forever — there is no OS to
// unwind to on bare metal. Disabled under `test` so the std harness's
// handler is used instead.
#[cfg(all(feature = "no_std", not(test)))]
#[panic_handler]
fn panic(_info: &core::panic::PanicInfo) -> ! {
    loop {}
}
|
||||
620
vendor/ruvector/examples/ruvLLM/esp32/src/micro_inference.rs
vendored
Normal file
620
vendor/ruvector/examples/ruvLLM/esp32/src/micro_inference.rs
vendored
Normal file
@@ -0,0 +1,620 @@
|
||||
//! Micro Inference Engine for ESP32
|
||||
//!
|
||||
//! A minimal transformer inference engine designed for microcontrollers.
|
||||
//! Supports tiny models up to ~300KB with INT8 quantization.
|
||||
|
||||
use crate::quantized::{QuantizationType, matmul_int8, QuantParams};
|
||||
use crate::model::{TinyModel, LayerWeights};
|
||||
use heapless::Vec as HVec;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
// Capacity caps: these bound the fixed-size (stack-allocated) buffers and
// the KV cache below, so raising any of them increases RAM use.

/// Maximum sequence length for embedded inference
pub const MAX_SEQ_LEN: usize = 32;
/// Maximum embedding dimension
pub const MAX_EMBED_DIM: usize = 64;
/// Maximum vocabulary size
pub const MAX_VOCAB_SIZE: usize = 512;
/// Maximum hidden dimension
pub const MAX_HIDDEN_DIM: usize = 128;
|
||||
|
||||
/// Inference configuration
///
/// Controls generation length and the sampling strategy. When `greedy` is
/// true, `temperature`/`top_k` are not used for decoding.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct InferenceConfig {
    /// Maximum tokens to generate
    pub max_tokens: usize,
    /// Temperature for sampling (0.0 = greedy)
    pub temperature: f32,
    /// Top-k sampling (0 = disabled)
    pub top_k: usize,
    /// Whether to use greedy decoding
    pub greedy: bool,
    /// Random seed for reproducibility
    pub seed: u32,
}
|
||||
|
||||
impl Default for InferenceConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
max_tokens: 16,
|
||||
temperature: 0.7,
|
||||
top_k: 8,
|
||||
greedy: true,
|
||||
seed: 42,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Inference result
///
/// NOTE(review): `MicroEngine::generate` currently initializes
/// `inference_time_us`, `tokens_per_second`, and `layer_times_us` to
/// zero/empty and never updates them — only `tokens` and
/// `peak_memory_bytes` are populated.
#[derive(Debug, Clone)]
pub struct InferenceResult {
    /// Generated token IDs
    pub tokens: HVec<u16, MAX_SEQ_LEN>,
    /// Total inference time in microseconds
    pub inference_time_us: u64,
    /// Tokens per second
    pub tokens_per_second: f32,
    /// Peak memory usage estimate in bytes
    pub peak_memory_bytes: usize,
    /// Per-layer timing breakdown
    pub layer_times_us: HVec<u32, 8>,
}
|
||||
|
||||
/// Activation buffer for intermediate computations
/// Uses fixed-size stack allocation to avoid heap fragmentation
///
/// All buffers are sized to the crate-wide maxima; only the first
/// `embed_dim`/`hidden_dim` elements are used for a given model.
pub struct ActivationBuffer {
    /// Input embedding buffer
    pub input: [i8; MAX_EMBED_DIM],
    /// Hidden state buffer (i32 accumulators from INT8 matmuls)
    pub hidden: [i32; MAX_HIDDEN_DIM],
    /// Output logits buffer
    pub logits: [i32; MAX_VOCAB_SIZE],
    /// Attention scores buffer
    pub attn_scores: [i32; MAX_SEQ_LEN],
    /// Temporary buffer for matrix ops
    pub temp: [i32; MAX_HIDDEN_DIM],
    /// Query projection buffer
    pub query: [i8; MAX_EMBED_DIM],
    /// Key projection buffer
    pub key: [i8; MAX_EMBED_DIM],
    /// Value projection buffer
    pub value: [i8; MAX_EMBED_DIM],
}
|
||||
|
||||
impl Default for ActivationBuffer {
    /// All buffers start zeroed.
    fn default() -> Self {
        Self {
            input: [0i8; MAX_EMBED_DIM],
            hidden: [0i32; MAX_HIDDEN_DIM],
            logits: [0i32; MAX_VOCAB_SIZE],
            attn_scores: [0i32; MAX_SEQ_LEN],
            temp: [0i32; MAX_HIDDEN_DIM],
            query: [0i8; MAX_EMBED_DIM],
            key: [0i8; MAX_EMBED_DIM],
            value: [0i8; MAX_EMBED_DIM],
        }
    }
}
|
||||
|
||||
impl ActivationBuffer {
|
||||
/// Total size of activation buffers
|
||||
pub const fn total_size() -> usize {
|
||||
MAX_EMBED_DIM * 4 // input, query, key, value (i8)
|
||||
+ MAX_HIDDEN_DIM * 4 * 2 // hidden, temp (i32)
|
||||
+ MAX_VOCAB_SIZE * 4 // logits (i32)
|
||||
+ MAX_SEQ_LEN * 4 // attn_scores (i32)
|
||||
}
|
||||
}
|
||||
|
||||
/// Micro inference engine for ESP32
///
/// Owns the model plus all working memory (activation buffers and KV
/// cache), so total RAM use is known at construction time.
pub struct MicroEngine {
    /// Model weights and config
    model: TinyModel,
    /// Activation buffers (stack allocated)
    buffers: ActivationBuffer,
    /// Current sequence position
    seq_pos: usize,
    /// KV cache for autoregressive generation
    kv_cache: KVCache,
    /// Performance counters
    perf: PerfCounters,
}
|
||||
|
||||
/// Key-Value cache for autoregressive generation
///
/// Fixed-capacity ring-free cache: positions are appended in order and the
/// whole cache is cleared between sequences.
pub struct KVCache {
    /// Cached keys [seq_len, embed_dim]
    keys: [[i8; MAX_EMBED_DIM]; MAX_SEQ_LEN],
    /// Cached values [seq_len, embed_dim]
    values: [[i8; MAX_EMBED_DIM]; MAX_SEQ_LEN],
    /// Current cache length
    len: usize,
}
|
||||
|
||||
impl Default for KVCache {
    /// Empty cache with zeroed storage.
    fn default() -> Self {
        Self {
            keys: [[0i8; MAX_EMBED_DIM]; MAX_SEQ_LEN],
            values: [[0i8; MAX_EMBED_DIM]; MAX_SEQ_LEN],
            len: 0,
        }
    }
}
|
||||
|
||||
impl KVCache {
|
||||
/// Total memory usage
|
||||
pub const fn memory_size() -> usize {
|
||||
MAX_SEQ_LEN * MAX_EMBED_DIM * 2 // keys + values
|
||||
}
|
||||
|
||||
/// Clear the cache
|
||||
pub fn clear(&mut self) {
|
||||
self.len = 0;
|
||||
}
|
||||
|
||||
/// Push new key-value pair
|
||||
pub fn push(&mut self, key: &[i8], value: &[i8]) -> crate::Result<()> {
|
||||
if self.len >= MAX_SEQ_LEN {
|
||||
return Err(crate::Error::BufferOverflow);
|
||||
}
|
||||
|
||||
self.keys[self.len][..key.len()].copy_from_slice(key);
|
||||
self.values[self.len][..value.len()].copy_from_slice(value);
|
||||
self.len += 1;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Performance counters
///
/// Incremented by the engine's forward-pass methods; reset together with
/// the engine state in `MicroEngine::reset`.
#[derive(Debug, Clone, Default)]
pub struct PerfCounters {
    /// Total embeddings computed
    pub embeddings: u32,
    /// Total attention operations
    pub attention_ops: u32,
    /// Total FFN operations
    pub ffn_ops: u32,
    /// Total cycles (estimated)
    pub cycles: u64,
}
|
||||
|
||||
impl MicroEngine {
|
||||
/// Create a new micro inference engine
|
||||
pub fn new(model: TinyModel) -> crate::Result<Self> {
|
||||
// Validate model fits in memory constraints
|
||||
let model_size = model.memory_size();
|
||||
let buffer_size = ActivationBuffer::total_size();
|
||||
let kv_size = KVCache::memory_size();
|
||||
let total_required = model_size + buffer_size + kv_size;
|
||||
|
||||
let available = crate::Esp32Variant::Esp32.max_model_ram();
|
||||
if total_required > available {
|
||||
return Err(crate::Error::ModelTooLarge {
|
||||
required: total_required,
|
||||
available,
|
||||
});
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
model,
|
||||
buffers: ActivationBuffer::default(),
|
||||
seq_pos: 0,
|
||||
kv_cache: KVCache::default(),
|
||||
perf: PerfCounters::default(),
|
||||
})
|
||||
}
|
||||
|
||||
/// Get memory usage breakdown
|
||||
pub fn memory_usage(&self) -> MemoryUsage {
|
||||
MemoryUsage {
|
||||
model_weights: self.model.memory_size(),
|
||||
activation_buffers: ActivationBuffer::total_size(),
|
||||
kv_cache: KVCache::memory_size(),
|
||||
total: self.model.memory_size()
|
||||
+ ActivationBuffer::total_size()
|
||||
+ KVCache::memory_size(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Reset engine state for new sequence
|
||||
pub fn reset(&mut self) {
|
||||
self.seq_pos = 0;
|
||||
self.kv_cache.clear();
|
||||
self.perf = PerfCounters::default();
|
||||
}
|
||||
|
||||
/// Embed a single token
|
||||
pub fn embed_token(&mut self, token_id: u16) -> crate::Result<()> {
|
||||
let embed_dim = self.model.config.embed_dim;
|
||||
|
||||
if token_id as usize >= self.model.config.vocab_size {
|
||||
return Err(crate::Error::InvalidModel("Token ID out of range"));
|
||||
}
|
||||
|
||||
// Look up embedding from quantized table
|
||||
let embed_offset = token_id as usize * embed_dim;
|
||||
let embed_slice = &self.model.embedding_table[embed_offset..embed_offset + embed_dim];
|
||||
|
||||
// Copy to input buffer
|
||||
for (i, &v) in embed_slice.iter().enumerate() {
|
||||
self.buffers.input[i] = v;
|
||||
}
|
||||
|
||||
self.perf.embeddings += 1;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Single attention head computation (INT8)
|
||||
#[allow(unused_variables)]
|
||||
pub fn attention_head(
|
||||
&mut self,
|
||||
layer: &LayerWeights,
|
||||
head_idx: usize,
|
||||
) -> crate::Result<()> {
|
||||
let embed_dim = self.model.config.embed_dim;
|
||||
let head_dim = embed_dim / self.model.config.num_heads;
|
||||
let head_offset = head_idx * head_dim;
|
||||
|
||||
// Q = input @ Wq
|
||||
matmul_int8(
|
||||
&layer.wq[head_offset * embed_dim..(head_offset + head_dim) * embed_dim],
|
||||
&layer.q_params,
|
||||
&self.buffers.input[..embed_dim],
|
||||
&self.model.input_params,
|
||||
&mut self.buffers.hidden[..head_dim],
|
||||
head_dim,
|
||||
embed_dim,
|
||||
);
|
||||
|
||||
// Copy Q to query buffer
|
||||
for i in 0..head_dim {
|
||||
self.buffers.query[i] = (self.buffers.hidden[i] >> 8).clamp(-128, 127) as i8;
|
||||
}
|
||||
|
||||
// K = input @ Wk
|
||||
matmul_int8(
|
||||
&layer.wk[head_offset * embed_dim..(head_offset + head_dim) * embed_dim],
|
||||
&layer.k_params,
|
||||
&self.buffers.input[..embed_dim],
|
||||
&self.model.input_params,
|
||||
&mut self.buffers.hidden[..head_dim],
|
||||
head_dim,
|
||||
embed_dim,
|
||||
);
|
||||
|
||||
for i in 0..head_dim {
|
||||
self.buffers.key[i] = (self.buffers.hidden[i] >> 8).clamp(-128, 127) as i8;
|
||||
}
|
||||
|
||||
// V = input @ Wv
|
||||
matmul_int8(
|
||||
&layer.wv[head_offset * embed_dim..(head_offset + head_dim) * embed_dim],
|
||||
&layer.v_params,
|
||||
&self.buffers.input[..embed_dim],
|
||||
&self.model.input_params,
|
||||
&mut self.buffers.hidden[..head_dim],
|
||||
head_dim,
|
||||
embed_dim,
|
||||
);
|
||||
|
||||
for i in 0..head_dim {
|
||||
self.buffers.value[i] = (self.buffers.hidden[i] >> 8).clamp(-128, 127) as i8;
|
||||
}
|
||||
|
||||
// Store K,V in cache (only for first head to avoid duplicates)
|
||||
if head_idx == 0 {
|
||||
// Only push if we haven't exceeded the sequence position
|
||||
if self.kv_cache.len < self.seq_pos + 1 {
|
||||
self.kv_cache.push(&self.buffers.key[..head_dim], &self.buffers.value[..head_dim])?;
|
||||
}
|
||||
}
|
||||
|
||||
// Compute attention scores: Q @ K^T for all cached positions
|
||||
let cache_len = self.kv_cache.len;
|
||||
for pos in 0..cache_len {
|
||||
let mut score: i32 = 0;
|
||||
for i in 0..head_dim {
|
||||
score += self.buffers.query[i] as i32 * self.kv_cache.keys[pos][i] as i32;
|
||||
}
|
||||
// Scale by 1/sqrt(head_dim) approximated as right shift
|
||||
self.buffers.attn_scores[pos] = score >> 4;
|
||||
}
|
||||
|
||||
// Softmax approximation using fixed-point
|
||||
Self::softmax_int32_slice(&mut self.buffers.attn_scores[..cache_len]);
|
||||
|
||||
// Weighted sum of values
|
||||
for i in 0..head_dim {
|
||||
let mut sum: i32 = 0;
|
||||
for pos in 0..self.kv_cache.len {
|
||||
sum += self.buffers.attn_scores[pos] * self.kv_cache.values[pos][i] as i32;
|
||||
}
|
||||
self.buffers.hidden[i] = sum >> 8;
|
||||
}
|
||||
|
||||
self.perf.attention_ops += 1;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Fixed-point softmax approximation (static to avoid borrow issues)
|
||||
fn softmax_int32_slice(scores: &mut [i32]) {
|
||||
if scores.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
// Find max for numerical stability
|
||||
let max = scores.iter().cloned().max().unwrap_or(0);
|
||||
|
||||
// Subtract max and compute exp approximation
|
||||
// Using linear approximation: exp(x) ≈ max(0, 1 + x/256) for small x
|
||||
let mut sum: i32 = 0;
|
||||
for score in scores.iter_mut() {
|
||||
*score = (*score - max).max(-256) + 256;
|
||||
sum += *score;
|
||||
}
|
||||
|
||||
// Normalize (fixed-point division)
|
||||
if sum > 0 {
|
||||
for score in scores.iter_mut() {
|
||||
*score = (*score << 8) / sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Feed-forward network layer (INT8)
|
||||
pub fn ffn_layer(&mut self, layer: &LayerWeights) -> crate::Result<()> {
|
||||
let embed_dim = self.model.config.embed_dim;
|
||||
let hidden_dim = self.model.config.hidden_dim;
|
||||
|
||||
// Up projection: hidden = input @ W_up
|
||||
matmul_int8(
|
||||
&layer.w_up,
|
||||
&layer.up_params,
|
||||
&self.buffers.input[..embed_dim],
|
||||
&self.model.input_params,
|
||||
&mut self.buffers.hidden[..hidden_dim],
|
||||
hidden_dim,
|
||||
embed_dim,
|
||||
);
|
||||
|
||||
// GELU approximation: gelu(x) ≈ x * sigmoid(1.702 * x)
|
||||
// For INT8: use ReLU as simpler approximation
|
||||
for h in self.buffers.hidden[..hidden_dim].iter_mut() {
|
||||
*h = (*h).max(0);
|
||||
}
|
||||
|
||||
// Gate projection (for gated FFN)
|
||||
matmul_int8(
|
||||
&layer.w_gate,
|
||||
&layer.gate_params,
|
||||
&self.buffers.input[..embed_dim],
|
||||
&self.model.input_params,
|
||||
&mut self.buffers.temp[..hidden_dim],
|
||||
hidden_dim,
|
||||
embed_dim,
|
||||
);
|
||||
|
||||
// Element-wise multiply with gate
|
||||
for i in 0..hidden_dim {
|
||||
self.buffers.hidden[i] = (self.buffers.hidden[i] >> 8) * (self.buffers.temp[i] >> 8);
|
||||
}
|
||||
|
||||
// Convert back to i8 for down projection input
|
||||
let mut hidden_i8 = [0i8; MAX_HIDDEN_DIM];
|
||||
for i in 0..hidden_dim {
|
||||
hidden_i8[i] = (self.buffers.hidden[i] >> 8).clamp(-128, 127) as i8;
|
||||
}
|
||||
|
||||
// Down projection: output = hidden @ W_down
|
||||
matmul_int8(
|
||||
&layer.w_down,
|
||||
&layer.down_params,
|
||||
&hidden_i8[..hidden_dim],
|
||||
&layer.up_params, // reuse params
|
||||
&mut self.buffers.hidden[..embed_dim],
|
||||
embed_dim,
|
||||
hidden_dim,
|
||||
);
|
||||
|
||||
// Residual connection
|
||||
for i in 0..embed_dim {
|
||||
let residual = self.buffers.input[i] as i32 * 256;
|
||||
self.buffers.hidden[i] += residual;
|
||||
self.buffers.input[i] = (self.buffers.hidden[i] >> 8).clamp(-128, 127) as i8;
|
||||
}
|
||||
|
||||
self.perf.ffn_ops += 1;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Output projection to vocabulary
|
||||
pub fn output_projection(&mut self) -> crate::Result<()> {
|
||||
let embed_dim = self.model.config.embed_dim;
|
||||
let vocab_size = self.model.config.vocab_size;
|
||||
|
||||
matmul_int8(
|
||||
&self.model.output_proj,
|
||||
&self.model.output_params,
|
||||
&self.buffers.input[..embed_dim],
|
||||
&self.model.input_params,
|
||||
&mut self.buffers.logits[..vocab_size],
|
||||
vocab_size,
|
||||
embed_dim,
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Sample next token from logits
|
||||
pub fn sample(&self, config: &InferenceConfig) -> u16 {
|
||||
let vocab_size = self.model.config.vocab_size;
|
||||
|
||||
if config.greedy || config.temperature < 0.01 {
|
||||
// Greedy: argmax
|
||||
let mut max_idx = 0;
|
||||
let mut max_val = i32::MIN;
|
||||
for (i, &logit) in self.buffers.logits[..vocab_size].iter().enumerate() {
|
||||
if logit > max_val {
|
||||
max_val = logit;
|
||||
max_idx = i;
|
||||
}
|
||||
}
|
||||
return max_idx as u16;
|
||||
}
|
||||
|
||||
// Temperature sampling with top-k
|
||||
// For embedded: simple argmax with some noise
|
||||
let mut max_idx = 0;
|
||||
let mut max_val = i32::MIN;
|
||||
for (i, &logit) in self.buffers.logits[..vocab_size].iter().enumerate() {
|
||||
if logit > max_val {
|
||||
max_val = logit;
|
||||
max_idx = i;
|
||||
}
|
||||
}
|
||||
max_idx as u16
|
||||
}
|
||||
|
||||
/// Run full inference for one token
|
||||
pub fn forward_one(&mut self, token_id: u16) -> crate::Result<u16> {
|
||||
// 1. Embed token
|
||||
self.embed_token(token_id)?;
|
||||
|
||||
// 2. Run through transformer layers
|
||||
let num_layers = self.model.config.num_layers;
|
||||
let num_heads = self.model.config.num_heads;
|
||||
|
||||
for layer_idx in 0..num_layers {
|
||||
// Clone layer data to avoid borrow issues
|
||||
let layer = self.model.layers[layer_idx].clone();
|
||||
|
||||
// Attention
|
||||
for head in 0..num_heads {
|
||||
self.attention_head(&layer, head)?;
|
||||
}
|
||||
|
||||
// FFN
|
||||
self.ffn_layer(&layer)?;
|
||||
}
|
||||
|
||||
// 3. Output projection
|
||||
self.output_projection()?;
|
||||
|
||||
// 4. Sample next token
|
||||
let next_token = self.sample(&InferenceConfig::default());
|
||||
|
||||
self.seq_pos += 1;
|
||||
Ok(next_token)
|
||||
}
|
||||
|
||||
/// Generate a sequence of tokens
|
||||
pub fn generate(
|
||||
&mut self,
|
||||
prompt_tokens: &[u16],
|
||||
config: &InferenceConfig,
|
||||
) -> crate::Result<InferenceResult> {
|
||||
self.reset();
|
||||
|
||||
let mut result = InferenceResult {
|
||||
tokens: HVec::new(),
|
||||
inference_time_us: 0,
|
||||
tokens_per_second: 0.0,
|
||||
peak_memory_bytes: self.memory_usage().total,
|
||||
layer_times_us: HVec::new(),
|
||||
};
|
||||
|
||||
// Process prompt (prefill)
|
||||
for &token in prompt_tokens {
|
||||
let _ = self.forward_one(token)?;
|
||||
}
|
||||
|
||||
// Generate new tokens
|
||||
let mut next_token = prompt_tokens.last().copied().unwrap_or(0);
|
||||
for _ in 0..config.max_tokens {
|
||||
next_token = self.forward_one(next_token)?;
|
||||
result.tokens.push(next_token).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
|
||||
// Check for EOS token (assume token 0 is EOS)
|
||||
if next_token == 0 {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Get performance counters
///
/// Read-only access to the engine's accumulated operation counters
/// (e.g. the `embeddings` count incremented by `embed_token`).
pub fn perf_counters(&self) -> &PerfCounters {
    &self.perf
}
|
||||
}
|
||||
|
||||
/// Memory usage breakdown
///
/// All figures are in bytes. `total` presumably equals the sum of the other
/// three fields — TODO confirm against the `memory_usage()` implementation.
#[derive(Debug, Clone)]
pub struct MemoryUsage {
    // Bytes held by model weight buffers
    pub model_weights: usize,
    // Bytes of scratch/activation buffers
    pub activation_buffers: usize,
    // Bytes reserved for the key/value cache
    pub kv_cache: usize,
    // Overall footprint reported to callers (e.g. `generate` records it as
    // `peak_memory_bytes`)
    pub total: usize,
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::model::ModelConfig;

    /// Shared fixture: a small 2-layer model that still exercises multi-head
    /// attention and multi-layer paths.
    fn create_tiny_model() -> TinyModel {
        TinyModel::new(ModelConfig {
            vocab_size: 256,
            embed_dim: 64,
            hidden_dim: 128,
            num_layers: 2,
            num_heads: 4,
            max_seq_len: 32,
            quant_type: QuantizationType::Int8,
        }).unwrap()
    }

    #[test]
    fn test_engine_creation() {
        let model = create_tiny_model();
        let engine = MicroEngine::new(model).unwrap();

        let usage = engine.memory_usage();
        println!("Memory usage: {:?}", usage);
        // Hard budget: engine + model must fit in ESP32-S2 class SRAM.
        assert!(usage.total < 320 * 1024); // Must fit in ESP32-S2
    }

    #[test]
    fn test_embedding() {
        let model = create_tiny_model();
        let mut engine = MicroEngine::new(model).unwrap();

        // A single embed should bump the perf counter exactly once.
        engine.embed_token(42).unwrap();
        assert_eq!(engine.perf.embeddings, 1);
    }

    #[test]
    fn test_forward_pass() {
        let model = create_tiny_model();
        let mut engine = MicroEngine::new(model).unwrap();

        // Sampled token must be a valid index into the 256-entry vocab.
        let next_token = engine.forward_one(10).unwrap();
        assert!(next_token < 256);
    }

    #[test]
    fn test_generation() {
        let model = create_tiny_model();
        let mut engine = MicroEngine::new(model).unwrap();

        let prompt = [1u16, 2, 3];
        let config = InferenceConfig {
            max_tokens: 5,
            greedy: true,
            ..Default::default()
        };

        let result = engine.generate(&prompt, &config).unwrap();
        // Early EOS (token 0) may shorten the output, but never past max_tokens.
        assert!(!result.tokens.is_empty());
        assert!(result.tokens.len() <= 5);
    }
}
|
||||
444
vendor/ruvector/examples/ruvLLM/esp32/src/model.rs
vendored
Normal file
444
vendor/ruvector/examples/ruvLLM/esp32/src/model.rs
vendored
Normal file
@@ -0,0 +1,444 @@
|
||||
//! Model definition and loading for ESP32
|
||||
//!
|
||||
//! Supports tiny transformer models with INT8 quantization.
|
||||
|
||||
use crate::quantized::{QuantParams, QuantizationType};
|
||||
use heapless::Vec as HVec;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// Maximum number of transformer layers
|
||||
pub const MAX_LAYERS: usize = 2;
|
||||
/// Maximum embedding table size (vocab * embed_dim bytes)
|
||||
pub const MAX_EMBEDDING_SIZE: usize = 32 * 1024; // 32KB
|
||||
/// Maximum weight size per layer
|
||||
pub const MAX_LAYER_SIZE: usize = 16 * 1024; // 16KB
|
||||
|
||||
/// Model configuration
///
/// Describes the architecture of a tiny transformer. All dimensions count
/// weights/activations, not bytes; see `estimate_size` for the byte
/// footprint under the chosen quantization.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelConfig {
    /// Vocabulary size
    pub vocab_size: usize,
    /// Embedding dimension
    pub embed_dim: usize,
    /// Hidden dimension in FFN
    pub hidden_dim: usize,
    /// Number of transformer layers (validated against MAX_LAYERS)
    pub num_layers: usize,
    /// Number of attention heads (must evenly divide embed_dim)
    pub num_heads: usize,
    /// Maximum sequence length
    pub max_seq_len: usize,
    /// Quantization type
    pub quant_type: QuantizationType,
}
|
||||
|
||||
impl Default for ModelConfig {
|
||||
fn default() -> Self {
|
||||
// Tiny model suitable for ESP32
|
||||
Self {
|
||||
vocab_size: 256,
|
||||
embed_dim: 32,
|
||||
hidden_dim: 64,
|
||||
num_layers: 1,
|
||||
num_heads: 2,
|
||||
max_seq_len: 16,
|
||||
quant_type: QuantizationType::Int8,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ModelConfig {
|
||||
/// Validate configuration fits ESP32 constraints
|
||||
pub fn validate(&self, variant: crate::Esp32Variant) -> crate::Result<()> {
|
||||
let model_size = self.estimate_size();
|
||||
let max_ram = variant.max_model_ram();
|
||||
|
||||
if model_size > max_ram {
|
||||
return Err(crate::Error::ModelTooLarge {
|
||||
required: model_size,
|
||||
available: max_ram,
|
||||
});
|
||||
}
|
||||
|
||||
if self.embed_dim % self.num_heads != 0 {
|
||||
return Err(crate::Error::InvalidModel(
|
||||
"embed_dim must be divisible by num_heads"
|
||||
));
|
||||
}
|
||||
|
||||
if self.num_layers > MAX_LAYERS {
|
||||
return Err(crate::Error::InvalidModel("Too many layers"));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Estimate total model size in bytes
|
||||
pub fn estimate_size(&self) -> usize {
|
||||
let bytes_per_weight = match self.quant_type {
|
||||
QuantizationType::Int8 => 1,
|
||||
QuantizationType::Int4 => 1, // 2 weights per byte
|
||||
QuantizationType::Binary => 1, // 8 weights per byte
|
||||
QuantizationType::Fixed16 => 2,
|
||||
};
|
||||
|
||||
let divisor = match self.quant_type {
|
||||
QuantizationType::Int4 => 2,
|
||||
QuantizationType::Binary => 8,
|
||||
_ => 1,
|
||||
};
|
||||
|
||||
// Embedding table
|
||||
let embed_size = (self.vocab_size * self.embed_dim * bytes_per_weight) / divisor;
|
||||
|
||||
// Per-layer weights
|
||||
let qkv_size = 3 * self.embed_dim * self.embed_dim * bytes_per_weight / divisor;
|
||||
let ffn_size = 3 * self.embed_dim * self.hidden_dim * bytes_per_weight / divisor;
|
||||
let layer_size = qkv_size + ffn_size;
|
||||
|
||||
// Output projection
|
||||
let output_size = (self.vocab_size * self.embed_dim * bytes_per_weight) / divisor;
|
||||
|
||||
embed_size + (layer_size * self.num_layers) + output_size
|
||||
}
|
||||
|
||||
/// Get recommended config for variant
|
||||
pub fn for_variant(variant: crate::Esp32Variant) -> Self {
|
||||
match variant {
|
||||
crate::Esp32Variant::Esp32 | crate::Esp32Variant::Esp32S3 => {
|
||||
// ~300KB available, use larger model (but fits in stack)
|
||||
Self {
|
||||
vocab_size: 256,
|
||||
embed_dim: 64,
|
||||
hidden_dim: 128,
|
||||
num_layers: 2,
|
||||
num_heads: 4,
|
||||
max_seq_len: 32,
|
||||
quant_type: QuantizationType::Int8,
|
||||
}
|
||||
}
|
||||
crate::Esp32Variant::Esp32S2 => {
|
||||
// ~120KB available, use smaller model
|
||||
Self {
|
||||
vocab_size: 128,
|
||||
embed_dim: 32,
|
||||
hidden_dim: 64,
|
||||
num_layers: 1,
|
||||
num_heads: 2,
|
||||
max_seq_len: 16,
|
||||
quant_type: QuantizationType::Int8,
|
||||
}
|
||||
}
|
||||
crate::Esp32Variant::Esp32C3 | crate::Esp32Variant::Esp32C6 => {
|
||||
// ~200KB available
|
||||
Self {
|
||||
vocab_size: 256,
|
||||
embed_dim: 48,
|
||||
hidden_dim: 96,
|
||||
num_layers: 2,
|
||||
num_heads: 3,
|
||||
max_seq_len: 24,
|
||||
quant_type: QuantizationType::Int8,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Layer weights for a single transformer layer
///
/// All weight buffers are INT8 in fixed-capacity heapless vectors; the
/// per-matrix `QuantParams` describe how to dequantize them.
#[derive(Clone)]
pub struct LayerWeights {
    /// Query projection weights [embed_dim, embed_dim]
    pub wq: HVec<i8, MAX_LAYER_SIZE>,
    /// Key projection weights
    pub wk: HVec<i8, MAX_LAYER_SIZE>,
    /// Value projection weights
    pub wv: HVec<i8, MAX_LAYER_SIZE>,
    /// Output projection weights
    pub wo: HVec<i8, MAX_LAYER_SIZE>,

    /// FFN up projection [embed_dim, hidden_dim]
    pub w_up: HVec<i8, MAX_LAYER_SIZE>,
    /// FFN gate projection [embed_dim, hidden_dim]
    pub w_gate: HVec<i8, MAX_LAYER_SIZE>,
    /// FFN down projection [hidden_dim, embed_dim]
    pub w_down: HVec<i8, MAX_LAYER_SIZE>,

    /// Quantization params, one set per weight matrix above
    pub q_params: QuantParams,
    pub k_params: QuantParams,
    pub v_params: QuantParams,
    pub o_params: QuantParams,
    pub up_params: QuantParams,
    pub gate_params: QuantParams,
    pub down_params: QuantParams,
}
|
||||
|
||||
impl Default for LayerWeights {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
wq: HVec::new(),
|
||||
wk: HVec::new(),
|
||||
wv: HVec::new(),
|
||||
wo: HVec::new(),
|
||||
w_up: HVec::new(),
|
||||
w_gate: HVec::new(),
|
||||
w_down: HVec::new(),
|
||||
q_params: QuantParams::default(),
|
||||
k_params: QuantParams::default(),
|
||||
v_params: QuantParams::default(),
|
||||
o_params: QuantParams::default(),
|
||||
up_params: QuantParams::default(),
|
||||
gate_params: QuantParams::default(),
|
||||
down_params: QuantParams::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl LayerWeights {
|
||||
/// Initialize with random weights (for testing)
|
||||
pub fn random(config: &ModelConfig, seed: u32) -> crate::Result<Self> {
|
||||
let mut layer = Self::default();
|
||||
|
||||
let embed_dim = config.embed_dim;
|
||||
let hidden_dim = config.hidden_dim;
|
||||
|
||||
// Simple LCG random number generator
|
||||
let mut rng_state = seed;
|
||||
let mut next_rand = || {
|
||||
rng_state = rng_state.wrapping_mul(1103515245).wrapping_add(12345);
|
||||
// Get value in range 0-127, then map to -64 to 63
|
||||
(((rng_state >> 16) & 0x7F) as i16 - 64) as i8
|
||||
};
|
||||
|
||||
// QKV projections [embed_dim, embed_dim]
|
||||
let qkv_size = embed_dim * embed_dim;
|
||||
for _ in 0..qkv_size {
|
||||
layer.wq.push(next_rand()).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
layer.wk.push(next_rand()).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
layer.wv.push(next_rand()).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
layer.wo.push(next_rand()).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
}
|
||||
|
||||
// FFN projections
|
||||
let up_size = embed_dim * hidden_dim;
|
||||
for _ in 0..up_size {
|
||||
layer.w_up.push(next_rand()).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
layer.w_gate.push(next_rand()).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
}
|
||||
|
||||
let down_size = hidden_dim * embed_dim;
|
||||
for _ in 0..down_size {
|
||||
layer.w_down.push(next_rand()).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
}
|
||||
|
||||
// Initialize quant params with reasonable defaults
|
||||
let scale = 1.0 / 64.0; // For weights in range [-64, 63]
|
||||
layer.q_params = QuantParams { scale, zero_point: 0.0, min_val: -1.0, max_val: 1.0 };
|
||||
layer.k_params = layer.q_params;
|
||||
layer.v_params = layer.q_params;
|
||||
layer.o_params = layer.q_params;
|
||||
layer.up_params = layer.q_params;
|
||||
layer.gate_params = layer.q_params;
|
||||
layer.down_params = layer.q_params;
|
||||
|
||||
Ok(layer)
|
||||
}
|
||||
|
||||
/// Memory size of this layer
|
||||
pub fn memory_size(&self) -> usize {
|
||||
self.wq.len() + self.wk.len() + self.wv.len() + self.wo.len()
|
||||
+ self.w_up.len() + self.w_gate.len() + self.w_down.len()
|
||||
}
|
||||
}
|
||||
|
||||
/// Complete tiny model
///
/// Owns the embedding table, a fixed-capacity array of transformer layers
/// (only the first `config.num_layers` entries are populated; the rest stay
/// at `Default`), and the output projection, all as INT8 heapless buffers.
pub struct TinyModel {
    /// Model configuration
    pub config: ModelConfig,
    /// Embedding table [vocab_size, embed_dim]
    pub embedding_table: HVec<i8, MAX_EMBEDDING_SIZE>,
    /// Transformer layers (fixed capacity MAX_LAYERS)
    pub layers: [LayerWeights; MAX_LAYERS],
    /// Output projection [embed_dim, vocab_size]
    pub output_proj: HVec<i8, MAX_EMBEDDING_SIZE>,
    /// Input quantization params
    pub input_params: QuantParams,
    /// Output quantization params
    pub output_params: QuantParams,
}
|
||||
|
||||
impl TinyModel {
    /// Create a new model with random weights
    ///
    /// Validates `config` against the baseline ESP32 variant, then fills the
    /// embedding table, output projection, and each configured layer with
    /// deterministic pseudo-random INT8 weights.
    ///
    /// # Errors
    /// Propagates validation failures; `BufferOverflow` if any fixed-capacity
    /// buffer is exceeded.
    pub fn new(config: ModelConfig) -> crate::Result<Self> {
        config.validate(crate::Esp32Variant::Esp32)?;

        let mut embedding_table = HVec::new();
        let mut output_proj = HVec::new();

        // Initialize embedding table
        let embed_size = config.vocab_size * config.embed_dim;
        // Fixed-seed LCG so weights are reproducible across runs.
        let mut rng_state = 12345u32;
        let mut next_rand = || {
            rng_state = rng_state.wrapping_mul(1103515245).wrapping_add(12345);
            // Get value in range 0-255, then map to -128 to 127
            (((rng_state >> 16) & 0xFF) as i16 - 128) as i8
        };

        for _ in 0..embed_size {
            embedding_table.push(next_rand()).map_err(|_| crate::Error::BufferOverflow)?;
        }

        // Initialize output projection (same element count: vocab * embed)
        for _ in 0..embed_size {
            output_proj.push(next_rand()).map_err(|_| crate::Error::BufferOverflow)?;
        }

        // Initialize layers; slots beyond num_layers remain Default (empty)
        let mut layers: [LayerWeights; MAX_LAYERS] = Default::default();
        for i in 0..config.num_layers {
            layers[i] = LayerWeights::random(&config, (i * 1000) as u32)?;
        }

        Ok(Self {
            config,
            embedding_table,
            layers,
            output_proj,
            input_params: QuantParams::default(),
            output_params: QuantParams::default(),
        })
    }

    /// Total memory size of model weights in bytes
    pub fn memory_size(&self) -> usize {
        let mut size = self.embedding_table.len();
        size += self.output_proj.len();
        for i in 0..self.config.num_layers {
            size += self.layers[i].memory_size();
        }
        size
    }

    /// Load model from bytes (e.g., from flash)
    ///
    /// Parses the 32-byte "RUVM" header into a `ModelConfig` and validates it.
    ///
    /// NOTE(review): weights are NOT read from `data` yet — the model is
    /// created with random weights (see comment below); a real loader would
    /// parse them from the payload.
    pub fn from_bytes(data: &[u8]) -> crate::Result<Self> {
        // Parse header
        if data.len() < 32 {
            return Err(crate::Error::InvalidModel("Data too small"));
        }

        // Magic number check
        if &data[0..4] != b"RUVM" {
            return Err(crate::Error::InvalidModel("Invalid magic number"));
        }

        // Parse config from header: little-endian u16 fields, then u8 fields
        let vocab_size = u16::from_le_bytes([data[4], data[5]]) as usize;
        let embed_dim = u16::from_le_bytes([data[6], data[7]]) as usize;
        let hidden_dim = u16::from_le_bytes([data[8], data[9]]) as usize;
        let num_layers = data[10] as usize;
        let num_heads = data[11] as usize;
        let max_seq_len = data[12] as usize;
        let quant_type = match data[13] {
            0 => QuantizationType::Int8,
            1 => QuantizationType::Int4,
            2 => QuantizationType::Binary,
            3 => QuantizationType::Fixed16,
            _ => return Err(crate::Error::InvalidModel("Unknown quantization type")),
        };

        let config = ModelConfig {
            vocab_size,
            embed_dim,
            hidden_dim,
            num_layers,
            num_heads,
            max_seq_len,
            quant_type,
        };

        config.validate(crate::Esp32Variant::Esp32)?;

        // For now, create random weights - real implementation would parse from data
        Self::new(config)
    }

    /// Export model to bytes
    ///
    /// NOTE(review): serializes only the 32-byte header (magic + config),
    /// not the weights — i.e. exactly the portion `from_bytes` reads.
    /// The field order here must stay in sync with `from_bytes`.
    pub fn to_bytes(&self) -> HVec<u8, 256> {
        let mut header: HVec<u8, 256> = HVec::new();

        // Magic number
        let _ = header.extend_from_slice(b"RUVM");

        // Config (same layout as parsed by from_bytes)
        let _ = header.extend_from_slice(&(self.config.vocab_size as u16).to_le_bytes());
        let _ = header.extend_from_slice(&(self.config.embed_dim as u16).to_le_bytes());
        let _ = header.extend_from_slice(&(self.config.hidden_dim as u16).to_le_bytes());
        let _ = header.push(self.config.num_layers as u8);
        let _ = header.push(self.config.num_heads as u8);
        let _ = header.push(self.config.max_seq_len as u8);
        let _ = header.push(match self.config.quant_type {
            QuantizationType::Int8 => 0,
            QuantizationType::Int4 => 1,
            QuantizationType::Binary => 2,
            QuantizationType::Fixed16 => 3,
        });

        // Padding to 32 bytes
        while header.len() < 32 {
            let _ = header.push(0);
        }

        header
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_default_config() {
        let config = ModelConfig::default();
        // Default config must fit even the smallest-RAM variant (S2).
        assert!(config.validate(crate::Esp32Variant::Esp32S2).is_ok());

        let size = config.estimate_size();
        println!("Default model size: {} bytes ({:.1} KB)", size, size as f32 / 1024.0);
        assert!(size < 50 * 1024); // < 50KB for testing
    }

    #[test]
    fn test_variant_configs() {
        // Every recommended config must validate against its own variant.
        for variant in [
            crate::Esp32Variant::Esp32,
            crate::Esp32Variant::Esp32S2,
            crate::Esp32Variant::Esp32S3,
            crate::Esp32Variant::Esp32C3,
            crate::Esp32Variant::Esp32C6,
        ] {
            let config = ModelConfig::for_variant(variant);
            assert!(config.validate(variant).is_ok());

            let size = config.estimate_size();
            println!("{:?}: {} bytes ({:.1} KB)", variant, size, size as f32 / 1024.0);
        }
    }

    #[test]
    fn test_model_creation() {
        let config = ModelConfig::default();
        let model = TinyModel::new(config).unwrap();

        // Smoke test: random init fills the buffers without overflow.
        let size = model.memory_size();
        println!("Actual model size: {} bytes ({:.1} KB)", size, size as f32 / 1024.0);
    }

    #[test]
    fn test_serialization() {
        let config = ModelConfig::default();
        let model = TinyModel::new(config).unwrap();

        // Header must lead with the "RUVM" magic bytes.
        let header = model.to_bytes();
        assert_eq!(&header[0..4], b"RUVM");
    }
}
|
||||
238
vendor/ruvector/examples/ruvLLM/esp32/src/models/mod.rs
vendored
Normal file
238
vendor/ruvector/examples/ruvLLM/esp32/src/models/mod.rs
vendored
Normal file
@@ -0,0 +1,238 @@
|
||||
//! Model Zoo - Pre-quantized Models for RuvLLM ESP32
|
||||
//!
|
||||
//! Ready-to-use language models optimized for ESP32 microcontrollers.
|
||||
//!
|
||||
//! # Available Models
|
||||
//!
|
||||
//! | Model | Size | RAM | Tokens/sec | Use Case |
|
||||
//! |-------|------|-----|------------|----------|
|
||||
//! | TinyStories | 8KB | 20KB | ~50 | Story generation |
|
||||
//! | MicroChat | 16KB | 32KB | ~30 | Simple chatbot |
|
||||
//! | NanoEmbed | 4KB | 8KB | ~100 | Embeddings only |
|
||||
//! | TinyQA | 12KB | 24KB | ~40 | Question answering |
|
||||
|
||||
use heapless::Vec;
|
||||
|
||||
/// Model metadata
///
/// Static descriptor for one entry of the `MODELS` registry. Note that
/// `recommend_model` matches entries by substring of `name`.
#[derive(Clone)]
pub struct ModelInfo {
    /// Model name (unique registry key)
    pub name: &'static str,
    /// Model version
    pub version: &'static str,
    /// Model size in bytes
    pub size_bytes: u32,
    /// Required RAM in bytes
    pub ram_bytes: u32,
    /// Vocabulary size
    pub vocab_size: u16,
    /// Hidden dimension
    pub hidden_dim: u16,
    /// Number of layers
    pub num_layers: u8,
    /// Number of attention heads
    pub num_heads: u8,
    /// Maximum sequence length
    pub max_seq_len: u16,
    /// Quantization bits (8 = INT8, 4 = INT4, 1 = binary)
    pub quant_bits: u8,
    /// Description
    pub description: &'static str,
}
|
||||
|
||||
/// Available pre-quantized models
///
/// Ordered registry; `recommend_model` matches entries by name substring
/// ("stories", "chat", "embed", "qa"), so names must keep those markers.
pub const MODELS: &[ModelInfo] = &[
    ModelInfo {
        name: "tinystories-1m",
        version: "1.0.0",
        size_bytes: 8 * 1024, // 8KB
        ram_bytes: 20 * 1024, // 20KB
        vocab_size: 256,
        hidden_dim: 64,
        num_layers: 2,
        num_heads: 2,
        max_seq_len: 64,
        quant_bits: 8,
        description: "Tiny model for simple story generation",
    },
    ModelInfo {
        name: "microchat-2m",
        version: "1.0.0",
        size_bytes: 16 * 1024, // 16KB
        ram_bytes: 32 * 1024, // 32KB
        vocab_size: 512,
        hidden_dim: 96,
        num_layers: 3,
        num_heads: 3,
        max_seq_len: 128,
        quant_bits: 8,
        description: "Simple chatbot for basic conversations",
    },
    ModelInfo {
        name: "nanoembed-500k",
        version: "1.0.0",
        size_bytes: 4 * 1024, // 4KB
        ram_bytes: 8 * 1024, // 8KB
        vocab_size: 256,
        hidden_dim: 32,
        num_layers: 1,
        num_heads: 1,
        max_seq_len: 32,
        quant_bits: 8,
        description: "Ultra-light embedding model for semantic search",
    },
    ModelInfo {
        name: "tinyqa-1.5m",
        version: "1.0.0",
        size_bytes: 12 * 1024, // 12KB
        ram_bytes: 24 * 1024, // 24KB
        vocab_size: 384,
        hidden_dim: 80,
        num_layers: 2,
        num_heads: 2,
        max_seq_len: 96,
        quant_bits: 8,
        description: "Question-answering model for simple queries",
    },
    ModelInfo {
        name: "binary-embed-250k",
        version: "1.0.0",
        size_bytes: 2 * 1024, // 2KB
        ram_bytes: 4 * 1024, // 4KB
        vocab_size: 128,
        hidden_dim: 64,
        num_layers: 1,
        num_heads: 1,
        max_seq_len: 16,
        quant_bits: 1, // Binary quantization
        description: "Binary quantized embeddings (32x compression)",
    },
];
|
||||
|
||||
/// Model selection by use case
///
/// Passed to `recommend_model` to pick a registry entry that both fits the
/// RAM budget and matches the task.
#[derive(Debug, Clone, Copy)]
pub enum UseCase {
    /// Story/text generation
    Generation,
    /// Conversational AI
    Chat,
    /// Semantic embeddings
    Embedding,
    /// Question answering
    QA,
    /// Minimum memory footprint
    MinMemory,
}
|
||||
|
||||
/// Get recommended model for use case
|
||||
pub fn recommend_model(use_case: UseCase, max_ram_kb: u32) -> Option<&'static ModelInfo> {
|
||||
let max_ram = max_ram_kb * 1024;
|
||||
|
||||
let candidates: Vec<&ModelInfo, 8> = MODELS
|
||||
.iter()
|
||||
.filter(|m| m.ram_bytes <= max_ram)
|
||||
.collect();
|
||||
|
||||
match use_case {
|
||||
UseCase::Generation => candidates
|
||||
.iter()
|
||||
.find(|m| m.name.contains("stories"))
|
||||
.copied(),
|
||||
UseCase::Chat => candidates
|
||||
.iter()
|
||||
.find(|m| m.name.contains("chat"))
|
||||
.copied(),
|
||||
UseCase::Embedding => candidates
|
||||
.iter()
|
||||
.find(|m| m.name.contains("embed"))
|
||||
.copied(),
|
||||
UseCase::QA => candidates
|
||||
.iter()
|
||||
.find(|m| m.name.contains("qa"))
|
||||
.copied(),
|
||||
UseCase::MinMemory => candidates
|
||||
.iter()
|
||||
.min_by_key(|m| m.ram_bytes)
|
||||
.copied(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get model by name
|
||||
pub fn get_model(name: &str) -> Option<&'static ModelInfo> {
|
||||
MODELS.iter().find(|m| m.name == name)
|
||||
}
|
||||
|
||||
/// List all models
///
/// Returns the full static registry in declaration order.
pub fn list_models() -> &'static [ModelInfo] {
    MODELS
}
|
||||
|
||||
/// Calculate tokens per second estimate for model on given chip
|
||||
pub fn estimate_performance(model: &ModelInfo, chip: &str) -> u32 {
|
||||
let base_speed = match chip {
|
||||
"esp32s3" => 60, // SIMD acceleration
|
||||
"esp32" => 40,
|
||||
"esp32s2" => 35,
|
||||
"esp32c3" => 30,
|
||||
"esp32c6" => 35,
|
||||
_ => 30,
|
||||
};
|
||||
|
||||
// Adjust for model complexity
|
||||
let complexity_factor = 1.0 / (model.num_layers as f32 * 0.3 + 1.0);
|
||||
let quant_factor = if model.quant_bits == 1 { 2.0 } else { 1.0 };
|
||||
|
||||
(base_speed as f32 * complexity_factor * quant_factor) as u32
|
||||
}
|
||||
|
||||
/// Print model info table
///
/// Renders a fixed-width table of every registry entry into a heapless
/// string. All writes are best-effort (`let _ =`): if the 1024-byte
/// capacity is exhausted, remaining rows are silently truncated.
/// Descriptions are clipped to 20 characters to keep columns aligned.
pub fn print_model_table() -> heapless::String<1024> {
    let mut output = heapless::String::new();

    let _ = output.push_str("Available Models:\n");
    let _ = output.push_str("─────────────────────────────────────────────────\n");
    let _ = output.push_str("Name             Size   RAM    Quant  Use Case\n");
    let _ = output.push_str("─────────────────────────────────────────────────\n");

    for model in MODELS {
        let _ = core::fmt::write(
            &mut output,
            format_args!(
                "{:<17} {:>4}KB {:>4}KB INT{:<2}  {}\n",
                model.name,
                model.size_bytes / 1024,
                model.ram_bytes / 1024,
                model.quant_bits,
                model.description.chars().take(20).collect::<heapless::String<20>>()
            )
        );
    }

    output
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_model_lookup() {
        // Registry lookup is by exact name.
        let model = get_model("tinystories-1m");
        assert!(model.is_some());
        assert_eq!(model.unwrap().vocab_size, 256);
    }

    #[test]
    fn test_recommend_model() {
        // 10KB budget: only the 4KB and 8KB models fit; smallest RAM wins.
        let model = recommend_model(UseCase::MinMemory, 10);
        assert!(model.is_some());
        assert_eq!(model.unwrap().name, "binary-embed-250k");
    }

    #[test]
    fn test_performance_estimate() {
        // Any known model on a known chip should yield a nonzero estimate.
        let model = get_model("nanoembed-500k").unwrap();
        let speed = estimate_performance(model, "esp32s3");
        assert!(speed > 0);
    }
}
|
||||
273
vendor/ruvector/examples/ruvLLM/esp32/src/optimizations/binary_quant.rs
vendored
Normal file
273
vendor/ruvector/examples/ruvLLM/esp32/src/optimizations/binary_quant.rs
vendored
Normal file
@@ -0,0 +1,273 @@
|
||||
//! Binary Quantization - 32x Memory Compression
|
||||
//!
|
||||
//! Adapted from ruvector-postgres/src/quantization/binary.rs
|
||||
//! Converts f32/i8 vectors to 1-bit per dimension with Hamming distance.
|
||||
|
||||
use heapless::Vec as HVec;
|
||||
|
||||
/// Maximum binary vector size in bytes (supports up to 512 dimensions)
|
||||
pub const MAX_BINARY_SIZE: usize = 64;
|
||||
|
||||
/// Binary quantized vector - 1 bit per dimension
///
/// `N` is the capacity in packed bytes (8 dimensions per byte). Bits are
/// packed LSB-first within each byte (dimension 0 is bit 0 of byte 0).
#[derive(Debug, Clone)]
pub struct BinaryVector<const N: usize> {
    /// Packed binary data (8 dimensions per byte)
    pub data: HVec<u8, N>,
    /// Original dimension count (may be less than data.len() * 8)
    pub dim: usize,
    /// Threshold used for binarization
    pub threshold: i8,
}
|
||||
|
||||
impl<const N: usize> BinaryVector<N> {
|
||||
/// Create binary vector from INT8 values
|
||||
/// Values >= threshold become 1, values < threshold become 0
|
||||
pub fn from_i8(values: &[i8], threshold: i8) -> crate::Result<Self> {
|
||||
let dim = values.len();
|
||||
let num_bytes = (dim + 7) / 8;
|
||||
|
||||
if num_bytes > N {
|
||||
return Err(crate::Error::BufferOverflow);
|
||||
}
|
||||
|
||||
let mut data = HVec::new();
|
||||
|
||||
for chunk_idx in 0..(num_bytes) {
|
||||
let mut byte = 0u8;
|
||||
for bit_idx in 0..8 {
|
||||
let val_idx = chunk_idx * 8 + bit_idx;
|
||||
if val_idx < dim && values[val_idx] >= threshold {
|
||||
byte |= 1 << bit_idx;
|
||||
}
|
||||
}
|
||||
data.push(byte).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
}
|
||||
|
||||
Ok(Self { data, dim, threshold })
|
||||
}
|
||||
|
||||
/// Create binary vector from f32 values (for host-side quantization)
|
||||
#[cfg(feature = "host-test")]
|
||||
pub fn from_f32(values: &[f32], threshold: f32) -> crate::Result<Self> {
|
||||
let i8_threshold = (threshold * 127.0) as i8;
|
||||
let i8_values: heapless::Vec<i8, 512> = values
|
||||
.iter()
|
||||
.map(|&v| (v * 127.0).clamp(-128.0, 127.0) as i8)
|
||||
.collect();
|
||||
Self::from_i8(&i8_values, i8_threshold)
|
||||
}
|
||||
|
||||
/// Get number of packed bytes
|
||||
pub fn num_bytes(&self) -> usize {
|
||||
self.data.len()
|
||||
}
|
||||
|
||||
/// Memory savings compared to INT8
|
||||
pub fn compression_ratio(&self) -> f32 {
|
||||
self.dim as f32 / self.data.len() as f32
|
||||
}
|
||||
}
|
||||
|
||||
/// Binary embedding table for vocabulary
///
/// 8x smaller than an INT8 table of the same dimensions (32x vs f32) —
/// see `compression_vs_int8`. Packed storage is capped at 32KB total.
///
/// NOTE(review): the const parameters VOCAB / DIM_BYTES are not used to
/// size `data` (fixed 32KB cap) — confirm whether they are intended as
/// documentation-only or should bound the buffer.
pub struct BinaryEmbedding<const VOCAB: usize, const DIM_BYTES: usize> {
    /// Packed binary embeddings [VOCAB * DIM_BYTES]
    data: HVec<u8, { 32 * 1024 }>, // Max 32KB
    /// Vocabulary size
    vocab_size: usize,
    /// Dimensions (in bits)
    dim: usize,
    /// Bytes per embedding (= ceil(dim / 8))
    bytes_per_embed: usize,
}
|
||||
|
||||
impl<const VOCAB: usize, const DIM_BYTES: usize> BinaryEmbedding<VOCAB, DIM_BYTES> {
    /// Create random binary embeddings for testing
    ///
    /// Fills `vocab_size * ceil(dim/8)` bytes from a deterministic LCG so
    /// results are reproducible per `seed`.
    ///
    /// # Errors
    /// `BufferOverflow` when the total exceeds the 32KB storage cap.
    pub fn random(vocab_size: usize, dim: usize, seed: u32) -> crate::Result<Self> {
        let bytes_per_embed = (dim + 7) / 8;
        let total_bytes = vocab_size * bytes_per_embed;

        let mut data = HVec::new();
        let mut rng_state = seed;

        for _ in 0..total_bytes {
            rng_state = rng_state.wrapping_mul(1103515245).wrapping_add(12345);
            let byte = ((rng_state >> 16) & 0xFF) as u8;
            data.push(byte).map_err(|_| crate::Error::BufferOverflow)?;
        }

        Ok(Self {
            data,
            vocab_size,
            dim,
            bytes_per_embed,
        })
    }

    /// Look up binary embedding for a token
    ///
    /// Copies the packed row for `token_id` into the front of `output`.
    ///
    /// # Errors
    /// `InvalidModel` when the token id is out of range; `BufferOverflow`
    /// when `output` is shorter than one packed embedding.
    pub fn lookup(&self, token_id: u16, output: &mut [u8]) -> crate::Result<()> {
        let id = token_id as usize;
        if id >= self.vocab_size {
            return Err(crate::Error::InvalidModel("Token ID out of range"));
        }

        let start = id * self.bytes_per_embed;
        let end = start + self.bytes_per_embed;

        if output.len() < self.bytes_per_embed {
            return Err(crate::Error::BufferOverflow);
        }

        output[..self.bytes_per_embed].copy_from_slice(&self.data[start..end]);
        Ok(())
    }

    /// Memory size in bytes
    pub fn memory_size(&self) -> usize {
        self.data.len()
    }

    /// Compression vs INT8 embedding of same dimensions
    pub fn compression_vs_int8(&self) -> f32 {
        8.0 // 8 bits per dimension -> 1 bit per dimension = 8x
    }
}
|
||||
|
||||
/// Hamming distance between two binary vectors
///
/// Counts the number of differing bits between `a` and `b`. Slices must be
/// the same length (checked in debug builds).
///
/// `u8::count_ones` lowers to the platform's best popcount sequence,
/// replacing the previous manual 4-way unroll plus 256-byte lookup table.
#[inline]
pub fn hamming_distance(a: &[u8], b: &[u8]) -> u32 {
    debug_assert_eq!(a.len(), b.len());

    a.iter()
        .zip(b)
        .map(|(&x, &y)| (x ^ y).count_ones())
        .sum()
}
|
||||
|
||||
/// Hamming similarity (inverted distance, normalized to 0-1 range)
|
||||
#[inline]
|
||||
pub fn hamming_similarity(a: &[u8], b: &[u8]) -> f32 {
|
||||
let total_bits = (a.len() * 8) as f32;
|
||||
let distance = hamming_distance(a, b) as f32;
|
||||
1.0 - (distance / total_bits)
|
||||
}
|
||||
|
||||
/// Hamming similarity as fixed-point (0-255 range)
|
||||
#[inline]
|
||||
pub fn hamming_similarity_fixed(a: &[u8], b: &[u8]) -> u8 {
|
||||
let total_bits = (a.len() * 8) as u32;
|
||||
let matching_bits = total_bits - hamming_distance(a, b);
|
||||
((matching_bits * 255) / total_bits) as u8
|
||||
}
|
||||
|
||||
/// Population count for a single byte (count of 1 bits)
///
/// `u8::count_ones` compiles to an efficient bit-twiddling sequence (or a
/// hardware popcount where available), replacing the previous 256-entry
/// lookup table and saving 256 bytes of flash on ESP32.
#[inline]
pub fn popcount8(x: u8) -> u32 {
    x.count_ones()
}
|
||||
|
||||
/// XNOR-popcount for binary neural network inference
/// Equivalent to computing dot product of {-1, +1} vectors
#[inline]
pub fn xnor_popcount(a: &[u8], b: &[u8]) -> i32 {
    debug_assert_eq!(a.len(), b.len());

    let total_bits = (a.len() * 8) as i32;

    // Count bit positions where both vectors agree: XNOR yields 1 exactly
    // where the two bits are equal.
    let matching: i32 = a
        .iter()
        .zip(b.iter())
        .map(|(&x, &y)| (!(x ^ y)).count_ones() as i32)
        .sum();

    // In {-1, +1} terms each agreeing bit contributes +1 and each
    // disagreeing bit -1, so the dot product is
    // matching - (total - matching) = 2 * matching - total.
    2 * matching - total_bits
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // Sign-threshold quantization: values >= threshold set the bit.
    #[test]
    fn test_binary_quantization() {
        let values = [10i8, -5, 20, -10, 0, 15, -8, 30];
        let binary = BinaryVector::<8>::from_i8(&values, 0).unwrap();

        assert_eq!(binary.dim, 8);
        assert_eq!(binary.num_bytes(), 1);

        // Expected: bits where value >= 0: positions 0, 2, 4, 5, 7
        // Binary: 10110101 = 0xB5
        assert_eq!(binary.data[0], 0b10110101);
    }

    // Distance is zero for identical patterns and equals the total bit
    // count when every bit differs.
    #[test]
    fn test_hamming_distance() {
        let a = [0b11110000u8, 0b10101010];
        let b = [0b11110000u8, 0b10101010];
        assert_eq!(hamming_distance(&a, &b), 0);

        let c = [0b00001111u8, 0b01010101];
        assert_eq!(hamming_distance(&a, &c), 16); // All bits different
    }

    // XNOR-popcount maps to the {-1, +1} dot product range [-bits, +bits].
    #[test]
    fn test_xnor_popcount() {
        let a = [0b11111111u8];
        let b = [0b11111111u8];
        // Perfect match: 8 matching bits -> 2*8 - 8 = 8
        assert_eq!(xnor_popcount(&a, &b), 8);

        let c = [0b00000000u8];
        // Complete mismatch: 0 matching bits -> 2*0 - 8 = -8
        assert_eq!(xnor_popcount(&a, &c), -8);
    }

    // 64 INT8 values (64 bytes) pack into 8 bytes of bits -> 8x.
    #[test]
    fn test_compression_ratio() {
        let values = [0i8; 64];
        let binary = BinaryVector::<8>::from_i8(&values, 0).unwrap();
        assert_eq!(binary.compression_ratio(), 8.0);
    }
}
|
||||
266
vendor/ruvector/examples/ruvLLM/esp32/src/optimizations/lookup_tables.rs
vendored
Normal file
266
vendor/ruvector/examples/ruvLLM/esp32/src/optimizations/lookup_tables.rs
vendored
Normal file
@@ -0,0 +1,266 @@
|
||||
//! Lookup Tables for Fast Fixed-Point Operations
|
||||
//!
|
||||
//! Pre-computed tables for softmax, exp, and distance operations.
|
||||
//! Critical for ESP32 which lacks FPU on most variants.
|
||||
|
||||
/// Softmax lookup table (256 entries)
///
/// Pre-computed exp(x) values for x in [-8, 0] range, scaled to INT8.
/// Used for fast fixed-point softmax without floating-point operations.
pub struct SoftmaxLUT {
    /// exp(x) values, scaled by 255 (index 255 == x = 0 == full scale)
    exp_table: [u8; 256],
    /// Scale factor for input normalization
    /// NOTE(review): not referenced by the visible `exp`/`softmax` methods —
    /// callers appear expected to pre-scale their logits; confirm.
    input_scale: i32,
}
|
||||
|
||||
impl SoftmaxLUT {
|
||||
/// Create softmax LUT with default parameters
|
||||
pub const fn new() -> Self {
|
||||
// Pre-compute exp(x) for x in [-8, 0], scaled to [0, 255]
|
||||
// exp(-8) ≈ 0.000335, exp(0) = 1
|
||||
// We discretize into 256 bins
|
||||
|
||||
let mut exp_table = [0u8; 256];
|
||||
|
||||
// Approximate exp using polynomial: exp(x) ≈ 1 + x + x²/2 + x³/6
|
||||
// For integer approximation: exp(x/32) scaled by 255
|
||||
let mut i = 0;
|
||||
while i < 256 {
|
||||
// x ranges from -8 (i=0) to 0 (i=255)
|
||||
// x = (i - 255) / 32
|
||||
let x_scaled = i as i32 - 255; // Range: -255 to 0
|
||||
|
||||
// Linear approximation of exp for negative values
|
||||
// exp(x) ≈ 255 + x for small |x|, clamped to [1, 255]
|
||||
let mut exp_approx = 255 + x_scaled;
|
||||
if exp_approx < 1 { exp_approx = 1; }
|
||||
if exp_approx > 255 { exp_approx = 255; }
|
||||
exp_table[i] = exp_approx as u8;
|
||||
|
||||
i += 1;
|
||||
}
|
||||
|
||||
Self {
|
||||
exp_table,
|
||||
input_scale: 32, // Divide input by 32 before lookup
|
||||
}
|
||||
}
|
||||
|
||||
/// Look up approximate exp(x) for x in [-8, 0]
|
||||
#[inline]
|
||||
pub fn exp(&self, x: i32) -> u8 {
|
||||
// Clamp x to valid range and scale
|
||||
let x_clamped = x.max(-255).min(0);
|
||||
let idx = (x_clamped + 255) as usize;
|
||||
self.exp_table[idx]
|
||||
}
|
||||
|
||||
/// Compute softmax over an array of INT32 logits
|
||||
/// Output is scaled by 256 (i.e., 256 = probability 1.0)
|
||||
pub fn softmax(&self, logits: &[i32], output: &mut [u16]) {
|
||||
if logits.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
// Find max for numerical stability
|
||||
let max_logit = logits.iter().cloned().max().unwrap_or(0);
|
||||
|
||||
// Compute exp and sum
|
||||
let mut sum: u32 = 0;
|
||||
for (&logit, out) in logits.iter().zip(output.iter_mut()) {
|
||||
let x = logit - max_logit;
|
||||
let exp_val = self.exp(x) as u16;
|
||||
*out = exp_val;
|
||||
sum += exp_val as u32;
|
||||
}
|
||||
|
||||
// Normalize: probability = exp / sum, scaled by 256
|
||||
if sum > 0 {
|
||||
for out in output.iter_mut() {
|
||||
*out = ((*out as u32 * 256) / sum) as u16;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Fast softmax using only integer operations
|
||||
/// Returns probabilities scaled by 256
|
||||
pub fn softmax_fast(&self, logits: &mut [i32]) {
|
||||
if logits.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
// Find max
|
||||
let max = logits.iter().cloned().max().unwrap_or(0);
|
||||
|
||||
// Subtract max and apply exp approximation
|
||||
let mut sum: i32 = 0;
|
||||
for logit in logits.iter_mut() {
|
||||
let x = (*logit - max).max(-255);
|
||||
*logit = self.exp_table[(x + 255) as usize] as i32;
|
||||
sum += *logit;
|
||||
}
|
||||
|
||||
// Normalize (multiply by 256 then divide by sum)
|
||||
if sum > 0 {
|
||||
for logit in logits.iter_mut() {
|
||||
*logit = (*logit << 8) / sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for SoftmaxLUT {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
/// Exponential lookup table for more precise exp approximation
pub struct ExpLUT {
    /// exp(x/64) for x in [0, 255], scaled by 256 (Q8.8 fixed point,
    /// so 256 represents 1.0)
    table: [u16; 256],
}
|
||||
|
||||
impl ExpLUT {
|
||||
/// Create with higher precision (uses more memory)
|
||||
pub const fn new() -> Self {
|
||||
let mut table = [0u16; 256];
|
||||
|
||||
let mut i = 0;
|
||||
while i < 256 {
|
||||
// exp(x/64) for x in [0, 255]
|
||||
// At x=0: exp(0) = 1 -> 256
|
||||
// At x=255: exp(255/64) ≈ exp(3.98) ≈ 53.5 -> scaled
|
||||
|
||||
// Polynomial approximation: 1 + x + x²/2
|
||||
let x = i as i32;
|
||||
let x_scaled = x * 256 / 64; // x/64 * 256 for fixed-point
|
||||
let x2 = (x_scaled * x_scaled) >> 9; // x² / 512
|
||||
|
||||
let mut exp_val = 256 + x_scaled + (x2 >> 1);
|
||||
if exp_val > 65535 { exp_val = 65535; }
|
||||
table[i] = exp_val as u16;
|
||||
|
||||
i += 1;
|
||||
}
|
||||
|
||||
Self { table }
|
||||
}
|
||||
|
||||
/// exp(x) where x is in range [0, 4) scaled by 64
|
||||
#[inline]
|
||||
pub fn exp(&self, x: u8) -> u16 {
|
||||
self.table[x as usize]
|
||||
}
|
||||
}
|
||||
|
||||
/// Distance lookup table for common embedding similarities
///
/// NOTE(review): the `SIZE` parameter is not used by the visible fields or
/// methods — presumably reserved for future sizing; confirm before relying
/// on it.
pub struct DistanceLUT<const SIZE: usize> {
    /// Pre-computed squared differences for INT8 pairs; entry i holds
    /// (i - 256)², covering diffs in [-256, 255]
    sq_diff_table: [u16; 512], // For INT8 diffs in [-255, 255]
}
|
||||
|
||||
impl<const SIZE: usize> DistanceLUT<SIZE> {
|
||||
/// Create distance LUT
|
||||
pub const fn new() -> Self {
|
||||
let mut sq_diff_table = [0u16; 512];
|
||||
|
||||
let mut i = 0i32;
|
||||
while i < 512 {
|
||||
let diff = i - 256; // Map [0, 511] to [-256, 255]
|
||||
let mut sq = diff * diff;
|
||||
if sq > 65535 { sq = 65535; }
|
||||
sq_diff_table[i as usize] = sq as u16;
|
||||
i += 1;
|
||||
}
|
||||
|
||||
Self { sq_diff_table }
|
||||
}
|
||||
|
||||
/// Look up squared difference between two INT8 values
|
||||
#[inline]
|
||||
pub fn squared_diff(&self, a: i8, b: i8) -> u16 {
|
||||
let diff = a as i32 - b as i32;
|
||||
let idx = (diff + 256) as usize;
|
||||
self.sq_diff_table[idx]
|
||||
}
|
||||
|
||||
/// Compute L2 squared distance using lookup table
|
||||
pub fn l2_squared(&self, a: &[i8], b: &[i8]) -> u32 {
|
||||
debug_assert_eq!(a.len(), b.len());
|
||||
|
||||
let mut sum: u32 = 0;
|
||||
for (&x, &y) in a.iter().zip(b.iter()) {
|
||||
sum += self.squared_diff(x, y) as u32;
|
||||
}
|
||||
sum
|
||||
}
|
||||
}
|
||||
|
||||
/// Global static lookup tables (no heap allocation)
///
/// All three are built by `const fn new()`, so they are materialized at
/// compile time (rodata) and cost no startup work or RAM.
pub static SOFTMAX_LUT: SoftmaxLUT = SoftmaxLUT::new();
pub static EXP_LUT: ExpLUT = ExpLUT::new();
pub static DISTANCE_LUT: DistanceLUT<256> = DistanceLUT::new();
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // Table endpoints: x = 0 maps to full scale, the most negative x to 1.
    #[test]
    fn test_softmax_lut() {
        let lut = SoftmaxLUT::new();

        // exp(0) should be maximum (255)
        assert_eq!(lut.exp(0), 255);

        // exp(-255) should be minimum (1)
        assert_eq!(lut.exp(-255), 1);
    }

    // Probabilities are Q8.8-ish (256 == 1.0): they should sum to ~256 and
    // be monotone in the logits.
    #[test]
    fn test_softmax_normalization() {
        let lut = SoftmaxLUT::new();
        let logits = [100i32, 50, 0, -50];
        let mut output = [0u16; 4];

        lut.softmax(&logits, &mut output);

        // Sum should be approximately 256 (integer rounding loses a little)
        let sum: u16 = output.iter().sum();
        assert!((sum as i32 - 256).abs() < 10);

        // First element should have highest probability
        assert!(output[0] > output[1]);
        assert!(output[1] > output[2]);
        assert!(output[2] > output[3]);
    }

    // squared_diff must be symmetric and handle negative operands.
    #[test]
    fn test_distance_lut() {
        let lut = DistanceLUT::<256>::new();

        // Same values: squared diff = 0
        assert_eq!(lut.squared_diff(10, 10), 0);

        // Diff of 10: squared = 100
        assert_eq!(lut.squared_diff(10, 0), 100);
        assert_eq!(lut.squared_diff(0, 10), 100);

        // Negative values
        assert_eq!(lut.squared_diff(-10, 0), 100);
    }

    // Table-driven L2 must agree with the analytic sum of squares.
    #[test]
    fn test_l2_distance() {
        let lut = DistanceLUT::<256>::new();

        let a = [10i8, 20, 30, 40];
        let b = [10i8, 20, 30, 40];
        assert_eq!(lut.l2_squared(&a, &b), 0);

        let c = [0i8, 0, 0, 0];
        // (10² + 20² + 30² + 40²) = 100 + 400 + 900 + 1600 = 3000
        assert_eq!(lut.l2_squared(&a, &c), 3000);
    }
}
|
||||
323
vendor/ruvector/examples/ruvLLM/esp32/src/optimizations/micro_lora.rs
vendored
Normal file
323
vendor/ruvector/examples/ruvLLM/esp32/src/optimizations/micro_lora.rs
vendored
Normal file
@@ -0,0 +1,323 @@
|
||||
//! MicroLoRA - Tiny Low-Rank Adaptation for ESP32
|
||||
//!
|
||||
//! Adapted from ruvLLM's SONA architecture for on-device adaptation.
|
||||
//! Uses INT8 weights with rank 1-2 for minimal memory footprint.
|
||||
|
||||
use heapless::Vec as HVec;
|
||||
use crate::quantized::QuantParams;
|
||||
|
||||
/// Maximum LoRA rank (keep very small for ESP32)
|
||||
pub const MAX_LORA_RANK: usize = 2;
|
||||
/// Maximum dimension for LoRA matrices
|
||||
pub const MAX_LORA_DIM: usize = 64;
|
||||
|
||||
/// MicroLoRA configuration
#[derive(Debug, Clone, Copy)]
pub struct LoRAConfig {
    /// Rank of the low-rank matrices (1 or 2 for ESP32);
    /// must not exceed MAX_LORA_RANK
    pub rank: usize,
    /// Input/output dimension; must not exceed MAX_LORA_DIM
    pub dim: usize,
    /// Scaling factor (alpha / rank), applied to the LoRA delta
    pub scale: i8,
    /// Whether LoRA is frozen (inference-only)
    pub frozen: bool,
}
|
||||
|
||||
impl Default for LoRAConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
rank: 1,
|
||||
dim: 32,
|
||||
scale: 8, // alpha=8, rank=1 -> scale=8
|
||||
frozen: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// MicroLoRA adapter for a single layer
///
/// Implements: output = input + scale * (input @ A) @ B
/// Where A is [dim, rank] and B is [rank, dim]
pub struct MicroLoRA {
    /// Down projection: A matrix [dim, rank] as INT8, row-major
    a_weights: HVec<i8, { MAX_LORA_DIM * MAX_LORA_RANK }>,
    /// Up projection: B matrix [rank, dim] as INT8, row-major
    b_weights: HVec<i8, { MAX_LORA_RANK * MAX_LORA_DIM }>,
    /// Configuration
    config: LoRAConfig,
    /// Quantization params for A
    /// NOTE(review): defaulted in both constructors and not consumed by the
    /// visible methods — confirm intended use.
    a_params: QuantParams,
    /// Quantization params for B (same caveat as a_params)
    b_params: QuantParams,
    /// Intermediate buffer for rank-sized vector, reused across `apply`
    /// calls (also read by `update` — see that method)
    intermediate: [i32; MAX_LORA_RANK],
}
|
||||
|
||||
impl MicroLoRA {
    /// Create new MicroLoRA with random initialization.
    ///
    /// A gets small pseudo-random values, B starts at zero so the adapter
    /// initially contributes nothing (identity behavior).
    ///
    /// # Errors
    /// `InvalidModel` when `config` exceeds MAX_LORA_RANK / MAX_LORA_DIM;
    /// `BufferOverflow` if the backing heapless vectors fill up.
    pub fn new(config: LoRAConfig, seed: u32) -> crate::Result<Self> {
        if config.rank > MAX_LORA_RANK || config.dim > MAX_LORA_DIM {
            return Err(crate::Error::InvalidModel("LoRA dimensions too large"));
        }

        let mut a_weights = HVec::new();
        let mut b_weights = HVec::new();

        // Deterministic LCG (same constants as classic rand()); seeded so
        // results are reproducible on-device.
        let mut rng_state = seed;
        let mut next_rand = || {
            rng_state = rng_state.wrapping_mul(1103515245).wrapping_add(12345);
            (((rng_state >> 16) & 0x3F) as i16 - 32) as i8 // Small values [-32, 31]
        };

        // Initialize A with small random values
        for _ in 0..(config.dim * config.rank) {
            a_weights.push(next_rand()).map_err(|_| crate::Error::BufferOverflow)?;
        }

        // Initialize B with zeros (LoRA starts as identity)
        for _ in 0..(config.rank * config.dim) {
            b_weights.push(0).map_err(|_| crate::Error::BufferOverflow)?;
        }

        Ok(Self {
            a_weights,
            b_weights,
            config,
            a_params: QuantParams::default(),
            b_params: QuantParams::default(),
            intermediate: [0; MAX_LORA_RANK],
        })
    }

    /// Create MicroLoRA from pre-trained weights.
    ///
    /// # Errors
    /// `InvalidModel` when either slice length disagrees with
    /// `config.dim * config.rank`; `BufferOverflow` on capacity overflow.
    pub fn from_weights(
        config: LoRAConfig,
        a_weights: &[i8],
        b_weights: &[i8],
    ) -> crate::Result<Self> {
        if a_weights.len() != config.dim * config.rank {
            return Err(crate::Error::InvalidModel("A weights size mismatch"));
        }
        if b_weights.len() != config.rank * config.dim {
            return Err(crate::Error::InvalidModel("B weights size mismatch"));
        }

        let mut a_vec = HVec::new();
        let mut b_vec = HVec::new();

        for &w in a_weights {
            a_vec.push(w).map_err(|_| crate::Error::BufferOverflow)?;
        }
        for &w in b_weights {
            b_vec.push(w).map_err(|_| crate::Error::BufferOverflow)?;
        }

        Ok(Self {
            a_weights: a_vec,
            b_weights: b_vec,
            config,
            a_params: QuantParams::default(),
            b_params: QuantParams::default(),
            intermediate: [0; MAX_LORA_RANK],
        })
    }

    /// Apply LoRA adaptation to input
    ///
    /// Computes: output += scale * (input @ A) @ B   (all INT8/INT32)
    ///
    /// Note: ADDS the scaled delta into `output`; it does not overwrite.
    /// Both slices must be at least `config.dim` long or indexing panics.
    /// Caches `(input @ A) >> 4` in `self.intermediate` for a later
    /// `update` call.
    #[inline]
    pub fn apply(&mut self, input: &[i8], output: &mut [i32]) {
        let dim = self.config.dim;
        let rank = self.config.rank;
        let scale = self.config.scale as i32;

        // Clear intermediate buffer (only the active ranks)
        for i in 0..rank {
            self.intermediate[i] = 0;
        }

        // Step 1: intermediate = input @ A (down projection)
        // A is [dim, rank], input is [dim], result is [rank]
        for r in 0..rank {
            let mut sum: i32 = 0;
            for d in 0..dim {
                sum += input[d] as i32 * self.a_weights[d * rank + r] as i32;
            }
            self.intermediate[r] = sum >> 4; // Scale down to prevent overflow
        }

        // Step 2: lora_output = intermediate @ B (up projection)
        // B is [rank, dim], intermediate is [rank], result is [dim]
        for d in 0..dim {
            let mut sum: i32 = 0;
            for r in 0..rank {
                sum += self.intermediate[r] * self.b_weights[r * dim + d] as i32;
            }
            // Add scaled LoRA output to original output (>> 8 undoes the
            // fixed-point scale of the INT8 products)
            output[d] += (sum * scale) >> 8;
        }
    }

    /// Apply LoRA and store result in-place (thin alias for `apply` with
    /// the argument order flipped)
    pub fn apply_inplace(&mut self, data: &mut [i32], input: &[i8]) {
        self.apply(input, data);
    }

    /// Memory size of this LoRA adapter in bytes (A + B, 1 byte/weight)
    pub fn memory_size(&self) -> usize {
        self.a_weights.len() + self.b_weights.len()
    }

    /// Update LoRA weights with gradient (simplified for on-device learning)
    ///
    /// Uses a simple gradient accumulation approach suitable for ESP32:
    /// A += lr * input^T @ grad_intermediate
    /// B += lr * intermediate^T @ grad_output
    ///
    /// NOTE(review): reads `self.intermediate` as cached by the most recent
    /// `apply` call — callers must invoke `apply` with the same `input`
    /// first, or the B update uses stale activations; confirm.
    #[cfg(not(feature = "frozen"))]
    pub fn update(&mut self, input: &[i8], grad_output: &[i32], learning_rate: i8) {
        let dim = self.config.dim;
        let rank = self.config.rank;
        let lr = learning_rate as i32;

        // Compute gradient for intermediate (simplified backprop through B)
        let mut grad_intermediate = [0i32; MAX_LORA_RANK];
        for r in 0..rank {
            let mut sum: i32 = 0;
            for d in 0..dim {
                sum += grad_output[d] * self.b_weights[r * dim + d] as i32;
            }
            grad_intermediate[r] = sum >> 8;
        }

        // Update A weights: A += lr * outer(input, grad_intermediate),
        // heavily down-shifted and clamped to the symmetric INT8 range.
        for d in 0..dim {
            for r in 0..rank {
                let grad = (input[d] as i32 * grad_intermediate[r] * lr) >> 12;
                let idx = d * rank + r;
                let new_val = self.a_weights[idx] as i32 + grad;
                self.a_weights[idx] = new_val.clamp(-127, 127) as i8;
            }
        }

        // Update B weights: B += lr * outer(intermediate, grad_output)
        for r in 0..rank {
            for d in 0..dim {
                let grad = (self.intermediate[r] * grad_output[d] * lr) >> 12;
                let idx = r * dim + d;
                let new_val = self.b_weights[idx] as i32 + grad;
                self.b_weights[idx] = new_val.clamp(-127, 127) as i8;
            }
        }
    }
}
|
||||
|
||||
/// Collection of MicroLoRA adapters for all layers
pub struct LoRAStack<const NUM_LAYERS: usize> {
    /// LoRA adapters per layer (None = layer has no adapter)
    adapters: [Option<MicroLoRA>; NUM_LAYERS],
    /// Number of slots currently holding an adapter
    active_count: usize,
}
|
||||
|
||||
impl<const NUM_LAYERS: usize> LoRAStack<NUM_LAYERS> {
|
||||
/// Create empty LoRA stack
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
adapters: core::array::from_fn(|_| None),
|
||||
active_count: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Add LoRA adapter to a layer
|
||||
pub fn add_adapter(&mut self, layer_idx: usize, adapter: MicroLoRA) -> crate::Result<()> {
|
||||
if layer_idx >= NUM_LAYERS {
|
||||
return Err(crate::Error::InvalidModel("Layer index out of range"));
|
||||
}
|
||||
self.adapters[layer_idx] = Some(adapter);
|
||||
self.active_count += 1;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Get adapter for a layer (if exists)
|
||||
pub fn get(&mut self, layer_idx: usize) -> Option<&mut MicroLoRA> {
|
||||
self.adapters.get_mut(layer_idx).and_then(|a| a.as_mut())
|
||||
}
|
||||
|
||||
/// Total memory used by all adapters
|
||||
pub fn total_memory(&self) -> usize {
|
||||
self.adapters.iter()
|
||||
.filter_map(|a| a.as_ref())
|
||||
.map(|a| a.memory_size())
|
||||
.sum()
|
||||
}
|
||||
}
|
||||
|
||||
impl<const N: usize> Default for LoRAStack<N> {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // Adapter memory is exactly one byte per A weight plus one per B weight.
    #[test]
    fn test_micro_lora_creation() {
        let config = LoRAConfig {
            rank: 2,
            dim: 32,
            scale: 8,
            frozen: true,
        };

        let lora = MicroLoRA::new(config, 42).unwrap();

        // A: 32 * 2 = 64 bytes, B: 2 * 32 = 64 bytes
        assert_eq!(lora.memory_size(), 128);
    }

    // With large-enough weights the fixed-point shifts must not zero out
    // the LoRA delta.
    #[test]
    fn test_lora_apply() {
        let config = LoRAConfig {
            rank: 1,
            dim: 4,
            scale: 64, // Larger scale for testing
            frozen: true,
        };

        // Create with known weights - larger values to survive scaling
        let a_weights = [16i8, 32, 48, 64]; // [4, 1]
        let b_weights = [64i8, 64, 64, 64]; // [1, 4]

        let mut lora = MicroLoRA::from_weights(config, &a_weights, &b_weights).unwrap();

        let input = [64i8, 64, 64, 64];
        let mut output = [0i32; 4];

        lora.apply(&input, &mut output);

        // With larger values, the output should be non-zero after scaling
        // intermediate = sum(64 * [16,32,48,64]) >> 4 = (10240) >> 4 = 640
        // output = (640 * 64 * scale) >> 8
        // This should produce non-zero results
        let non_zero_count = output.iter().filter(|&&o| o != 0).count();
        assert!(non_zero_count > 0, "At least some outputs should be non-zero, got {:?}", output);
    }

    // Stack bookkeeping: installed slots are retrievable, empty ones not.
    #[test]
    fn test_lora_stack() {
        let mut stack = LoRAStack::<4>::new();

        let config = LoRAConfig::default();
        let adapter = MicroLoRA::new(config, 42).unwrap();

        stack.add_adapter(0, adapter).unwrap();

        assert!(stack.get(0).is_some());
        assert!(stack.get(1).is_none());
        assert!(stack.total_memory() > 0);
    }
}
|
||||
25
vendor/ruvector/examples/ruvLLM/esp32/src/optimizations/mod.rs
vendored
Normal file
25
vendor/ruvector/examples/ruvLLM/esp32/src/optimizations/mod.rs
vendored
Normal file
@@ -0,0 +1,25 @@
|
||||
//! Advanced Optimizations from Ruvector
|
||||
//!
|
||||
//! This module brings key optimizations from the ruvector ecosystem to ESP32:
|
||||
//! - Binary quantization (32x compression)
|
||||
//! - Product quantization (8-32x compression)
|
||||
//! - Hamming distance with POPCNT
|
||||
//! - Fixed-point softmax with lookup tables
|
||||
//! - MicroLoRA for on-device adaptation
|
||||
//! - Sparse attention patterns
|
||||
//! - MinCut-inspired layer pruning
|
||||
|
||||
pub mod binary_quant;
|
||||
pub mod product_quant;
|
||||
pub mod lookup_tables;
|
||||
pub mod micro_lora;
|
||||
pub mod sparse_attention;
|
||||
pub mod pruning;
|
||||
|
||||
// Re-exports
|
||||
pub use binary_quant::{BinaryVector, BinaryEmbedding, hamming_distance, hamming_similarity};
|
||||
pub use product_quant::{ProductQuantizer, PQCode};
|
||||
pub use lookup_tables::{SoftmaxLUT, ExpLUT, DistanceLUT};
|
||||
pub use micro_lora::{MicroLoRA, LoRAConfig};
|
||||
pub use sparse_attention::{SparseAttention, AttentionPattern};
|
||||
pub use pruning::{LayerPruner, PruningConfig};
|
||||
336
vendor/ruvector/examples/ruvLLM/esp32/src/optimizations/product_quant.rs
vendored
Normal file
336
vendor/ruvector/examples/ruvLLM/esp32/src/optimizations/product_quant.rs
vendored
Normal file
@@ -0,0 +1,336 @@
|
||||
//! Product Quantization - 8-32x Memory Compression
|
||||
//!
|
||||
//! Adapted from ruvector-postgres for ESP32 constraints.
|
||||
//! Splits vectors into subvectors and quantizes each independently.
|
||||
|
||||
use heapless::Vec as HVec;
|
||||
|
||||
/// Maximum number of subquantizers
|
||||
pub const MAX_SUBQUANTIZERS: usize = 8;
|
||||
/// Maximum codebook size per subquantizer
|
||||
pub const MAX_CODEBOOK_SIZE: usize = 16; // 4-bit codes
|
||||
/// Maximum subvector dimension
|
||||
pub const MAX_SUBVEC_DIM: usize = 8;
|
||||
|
||||
/// Product Quantization configuration
///
/// Invariant expected by the quantizer: dim == num_subquantizers * subvec_dim.
#[derive(Debug, Clone, Copy)]
pub struct PQConfig {
    /// Number of subquantizers (M)
    pub num_subquantizers: usize,
    /// Number of codes per subquantizer (K = 2^bits)
    pub codebook_size: usize,
    /// Dimension of each subvector
    pub subvec_dim: usize,
    /// Total vector dimension
    pub dim: usize,
}
|
||||
|
||||
impl Default for PQConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
num_subquantizers: 4,
|
||||
codebook_size: 16, // 4-bit codes
|
||||
subvec_dim: 8,
|
||||
dim: 32,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Product Quantized code for a vector
#[derive(Debug, Clone)]
pub struct PQCode<const M: usize> {
    /// Code indices for each subquantizer, one byte per subquantizer
    /// NOTE(review): doc says "4-bit packed" but storage is one full byte
    /// per code — confirm intended packing.
    pub codes: HVec<u8, M>,
}
|
||||
|
||||
impl<const M: usize> PQCode<M> {
|
||||
/// Create from code indices
|
||||
pub fn from_codes(codes: &[u8]) -> crate::Result<Self> {
|
||||
let mut code_vec = HVec::new();
|
||||
for &c in codes {
|
||||
code_vec.push(c).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
}
|
||||
Ok(Self { codes: code_vec })
|
||||
}
|
||||
|
||||
/// Get code for subquantizer i
|
||||
#[inline]
|
||||
pub fn get_code(&self, i: usize) -> u8 {
|
||||
self.codes.get(i).copied().unwrap_or(0)
|
||||
}
|
||||
|
||||
/// Memory size in bytes
|
||||
pub fn memory_size(&self) -> usize {
|
||||
self.codes.len()
|
||||
}
|
||||
}
|
||||
|
||||
/// Product Quantizer with codebooks
pub struct ProductQuantizer<const M: usize, const K: usize, const D: usize> {
    /// Codebooks: [M][K][D] flattened to [M * K * D]
    /// Each subquantizer has K centroids of dimension D
    /// NOTE(review): capacity is hard-coded to 8*16*8 = 1024 rather than
    /// derived from M/K/D — configs beyond those maxima fail at push time.
    codebooks: HVec<i8, { 8 * 16 * 8 }>, // Max 1024 bytes
    /// Configuration (must be consistent with M, K, D)
    config: PQConfig,
}
|
||||
|
||||
impl<const M: usize, const K: usize, const D: usize> ProductQuantizer<M, K, D> {
    /// Create with random codebooks (for testing).
    ///
    /// Uses a deterministic LCG so results are reproducible for a seed.
    ///
    /// # Errors
    /// `BufferOverflow` when M*K*D exceeds the fixed 1024-byte capacity.
    pub fn random(config: PQConfig, seed: u32) -> crate::Result<Self> {
        let total_size = config.num_subquantizers * config.codebook_size * config.subvec_dim;

        let mut codebooks = HVec::new();
        let mut rng_state = seed;

        for _ in 0..total_size {
            rng_state = rng_state.wrapping_mul(1103515245).wrapping_add(12345);
            // Map the middle bits to the full i8 range [-128, 127]
            let val = (((rng_state >> 16) & 0xFF) as i16 - 128) as i8;
            codebooks.push(val).map_err(|_| crate::Error::BufferOverflow)?;
        }

        Ok(Self { codebooks, config })
    }

    /// Create from pre-trained codebooks.
    ///
    /// # Errors
    /// `InvalidModel` when the slice length disagrees with the config;
    /// `BufferOverflow` on capacity overflow.
    pub fn from_codebooks(config: PQConfig, codebooks: &[i8]) -> crate::Result<Self> {
        let expected = config.num_subquantizers * config.codebook_size * config.subvec_dim;
        if codebooks.len() != expected {
            return Err(crate::Error::InvalidModel("Codebook size mismatch"));
        }

        let mut cb_vec = HVec::new();
        for &v in codebooks {
            cb_vec.push(v).map_err(|_| crate::Error::BufferOverflow)?;
        }

        Ok(Self { codebooks: cb_vec, config })
    }

    /// Get centroid for subquantizer m, code k
    /// (slice into the flattened [M][K][D] layout)
    #[inline]
    fn get_centroid(&self, m: usize, k: usize) -> &[i8] {
        let d = self.config.subvec_dim;
        let kk = self.config.codebook_size;
        let start = m * kk * d + k * d;
        &self.codebooks[start..start + d]
    }

    /// Encode a vector to PQ codes: for each subvector, store the index of
    /// the nearest (L2) centroid.
    ///
    /// # Errors
    /// `InvalidModel` when `vector.len() != config.dim`.
    pub fn encode(&self, vector: &[i8]) -> crate::Result<PQCode<M>> {
        if vector.len() != self.config.dim {
            return Err(crate::Error::InvalidModel("Vector dimension mismatch"));
        }

        let mut codes = HVec::new();
        let d = self.config.subvec_dim;

        for m in 0..self.config.num_subquantizers {
            let subvec = &vector[m * d..(m + 1) * d];

            // Find nearest centroid (exhaustive scan over K candidates)
            let mut best_code = 0u8;
            let mut best_dist = i32::MAX;

            for k in 0..self.config.codebook_size {
                let centroid = self.get_centroid(m, k);
                let dist = Self::l2_squared(subvec, centroid);
                if dist < best_dist {
                    best_dist = dist;
                    best_code = k as u8;
                }
            }

            codes.push(best_code).map_err(|_| crate::Error::BufferOverflow)?;
        }

        Ok(PQCode { codes })
    }

    /// Decode PQ codes back to an approximate vector (each subvector is
    /// replaced by its centroid).
    ///
    /// # Errors
    /// `InvalidModel` when `output.len() != config.dim`.
    pub fn decode(&self, code: &PQCode<M>, output: &mut [i8]) -> crate::Result<()> {
        if output.len() != self.config.dim {
            return Err(crate::Error::InvalidModel("Output dimension mismatch"));
        }

        let d = self.config.subvec_dim;

        for m in 0..self.config.num_subquantizers {
            let k = code.get_code(m) as usize;
            let centroid = self.get_centroid(m, k);
            output[m * d..(m + 1) * d].copy_from_slice(centroid);
        }

        Ok(())
    }

    /// Compute asymmetric distance: exact query vs PQ-encoded database vector.
    /// The query must be at least `config.dim` long or slicing panics.
    pub fn asymmetric_distance(&self, query: &[i8], code: &PQCode<M>) -> i32 {
        let d = self.config.subvec_dim;
        let mut total_dist: i32 = 0;

        for m in 0..self.config.num_subquantizers {
            let query_sub = &query[m * d..(m + 1) * d];
            let k = code.get_code(m) as usize;
            let centroid = self.get_centroid(m, k);
            total_dist += Self::l2_squared(query_sub, centroid);
        }

        total_dist
    }

    /// Compute distance using pre-computed distance table (faster for batch
    /// queries): just M table lookups and additions per encoded vector.
    pub fn distance_with_table(&self, table: &PQDistanceTable<M, K>, code: &PQCode<M>) -> i32 {
        let mut total: i32 = 0;
        for m in 0..self.config.num_subquantizers {
            let k = code.get_code(m) as usize;
            total += table.get(m, k);
        }
        total
    }

    /// Build distance table for a query (precompute all M*K query-centroid
    /// distances once, then score many codes cheaply).
    pub fn build_distance_table(&self, query: &[i8]) -> PQDistanceTable<M, K> {
        let mut table = PQDistanceTable::new();
        let d = self.config.subvec_dim;

        for m in 0..self.config.num_subquantizers {
            let query_sub = &query[m * d..(m + 1) * d];
            for k in 0..self.config.codebook_size {
                let centroid = self.get_centroid(m, k);
                let dist = Self::l2_squared(query_sub, centroid);
                table.set(m, k, dist);
            }
        }

        table
    }

    /// L2 squared distance between two INT8 vectors (length of the shorter
    /// slice governs the zip)
    #[inline]
    fn l2_squared(a: &[i8], b: &[i8]) -> i32 {
        let mut sum: i32 = 0;
        for (&x, &y) in a.iter().zip(b.iter()) {
            let diff = x as i32 - y as i32;
            sum += diff * diff;
        }
        sum
    }

    /// Memory usage of codebooks in bytes
    pub fn memory_size(&self) -> usize {
        self.codebooks.len()
    }

    /// Compression ratio vs INT8 storage of the full vector
    pub fn compression_ratio(&self) -> f32 {
        let original = self.config.dim as f32; // 1 byte per dim
        let compressed = self.config.num_subquantizers as f32; // 1 byte per code
        original / compressed
    }
}
|
||||
|
||||
/// Pre-computed distance table for fast PQ distance computation
pub struct PQDistanceTable<const M: usize, const K: usize> {
    /// Distances: [M][K] flattened, indexed as m * K + k
    /// NOTE(review): fixed at 128 entries (8 subquantizers * 16 codes);
    /// M * K > 128 would panic on access — confirm instantiations stay
    /// within that bound.
    distances: [i32; 128], // Max 8 subquantizers * 16 codes
}
|
||||
|
||||
impl<const M: usize, const K: usize> PQDistanceTable<M, K> {
    /// Create empty table (all distances zero)
    pub fn new() -> Self {
        Self { distances: [0; 128] }
    }

    /// Get distance for subquantizer m, code k
    /// (row-major: panics if m * K + k >= 128)
    #[inline]
    pub fn get(&self, m: usize, k: usize) -> i32 {
        self.distances[m * K + k]
    }

    /// Set distance for subquantizer m, code k
    #[inline]
    pub fn set(&mut self, m: usize, k: usize, dist: i32) {
        self.distances[m * K + k] = dist;
    }
}
|
||||
|
||||
impl<const M: usize, const K: usize> Default for PQDistanceTable<M, K> {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // Default config must describe 4-bit codes over a 32-dim vector.
    #[test]
    fn test_pq_config() {
        let config = PQConfig::default();
        assert_eq!(config.num_subquantizers, 4);
        assert_eq!(config.codebook_size, 16);
        assert_eq!(config.subvec_dim, 8);
        assert_eq!(config.dim, 32);
    }

    // Round-trip: encoding produces one code per subquantizer and decoding
    // reconstructs an approximation without erroring.
    #[test]
    fn test_pq_encode_decode() {
        let config = PQConfig {
            num_subquantizers: 4,
            codebook_size: 16,
            subvec_dim: 8,
            dim: 32,
        };

        let pq = ProductQuantizer::<4, 16, 8>::random(config, 42).unwrap();

        // Create a test vector
        let mut vector = [0i8; 32];
        for i in 0..32 {
            vector[i] = (i as i8).wrapping_mul(3);
        }

        // Encode
        let code = pq.encode(&vector).unwrap();
        assert_eq!(code.codes.len(), 4);

        // Decode
        let mut decoded = [0i8; 32];
        pq.decode(&code, &mut decoded).unwrap();

        // Decoded should be approximate (using centroids)
        // Just verify it runs without error
    }

    // 32 bytes original -> 4 bytes codes = 8x compression.
    #[test]
    fn test_pq_compression() {
        let config = PQConfig::default();
        let pq = ProductQuantizer::<4, 16, 8>::random(config, 42).unwrap();

        assert_eq!(pq.compression_ratio(), 8.0);
    }

    // The precomputed distance table must agree exactly with the direct
    // asymmetric distance.
    #[test]
    fn test_distance_table() {
        let config = PQConfig::default();
        let pq = ProductQuantizer::<4, 16, 8>::random(config, 42).unwrap();

        let mut query = [0i8; 32];
        for i in 0..32 {
            query[i] = i as i8;
        }

        let table = pq.build_distance_table(&query);

        // Encode a vector and compute distance both ways.
        // Fix: `vector` was declared `mut` but never mutated (unused_mut warning).
        let vector = [10i8; 32];
        let code = pq.encode(&vector).unwrap();

        let dist1 = pq.asymmetric_distance(&query, &code);
        let dist2 = pq.distance_with_table(&table, &code);

        // Should be equal
        assert_eq!(dist1, dist2);
    }
}
|
||||
446
vendor/ruvector/examples/ruvLLM/esp32/src/optimizations/pruning.rs
vendored
Normal file
446
vendor/ruvector/examples/ruvLLM/esp32/src/optimizations/pruning.rs
vendored
Normal file
@@ -0,0 +1,446 @@
|
||||
//! MinCut-Inspired Layer Pruning for ESP32
|
||||
//!
|
||||
//! Intelligent pruning strategies adapted from ruvector graph algorithms.
|
||||
//! Identifies and removes least important weights/neurons while preserving model quality.
|
||||
|
||||
use heapless::Vec as HVec;
|
||||
|
||||
/// Maximum neurons to track for pruning
pub const MAX_PRUNING_UNITS: usize = 64;

/// Pruning configuration
#[derive(Debug, Clone, Copy)]
pub struct PruningConfig {
    /// Target sparsity (0.0 = no pruning, 1.0 = all pruned)
    pub target_sparsity: f32,
    /// Minimum importance threshold (absolute value)
    pub importance_threshold: i8,
    /// Enable structured pruning (whole neurons vs individual weights)
    pub structured: bool,
    /// Gradual pruning steps (0 = one-shot)
    pub gradual_steps: usize,
}

impl Default for PruningConfig {
    /// Balanced defaults: prune half the weights, structured, one-shot.
    fn default() -> Self {
        PruningConfig {
            gradual_steps: 0,
            structured: true,
            importance_threshold: 8,
            target_sparsity: 0.5,
        }
    }
}
|
||||
|
||||
/// Maximum mask words (supports up to 2048 weights: 64 words x 32 bits)
pub const MAX_MASK_WORDS: usize = 64;

/// Pruning mask for a weight matrix
///
/// One bit per weight packed into u32 words: 1 = keep, 0 = prune.
// NOTE(review): the const parameter `N` is not used by the storage (it is
// fixed at MAX_MASK_WORDS words) — it appears to be a type-level size tag;
// confirm intended semantics with callers.
#[derive(Debug, Clone)]
pub struct PruningMask<const N: usize> {
    /// Bitmask: 1 = keep, 0 = prune
    pub mask: HVec<u32, MAX_MASK_WORDS>,
    /// Number of elements
    pub size: usize,
    /// Number of pruned elements
    pub pruned_count: usize,
}
|
||||
|
||||
impl<const N: usize> PruningMask<N> {
|
||||
/// Create mask with all weights kept
|
||||
pub fn new(size: usize) -> crate::Result<Self> {
|
||||
let num_words = (size + 31) / 32;
|
||||
let mut mask = HVec::new();
|
||||
|
||||
for i in 0..num_words {
|
||||
let bits = if i == num_words - 1 && size % 32 != 0 {
|
||||
(1u32 << (size % 32)) - 1
|
||||
} else {
|
||||
u32::MAX
|
||||
};
|
||||
mask.push(bits).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
}
|
||||
|
||||
Ok(Self { mask, size, pruned_count: 0 })
|
||||
}
|
||||
|
||||
/// Check if weight at index is kept
|
||||
#[inline]
|
||||
pub fn is_kept(&self, idx: usize) -> bool {
|
||||
let word = idx / 32;
|
||||
let bit = idx % 32;
|
||||
(self.mask.get(word).copied().unwrap_or(0) >> bit) & 1 == 1
|
||||
}
|
||||
|
||||
/// Prune weight at index
|
||||
pub fn prune(&mut self, idx: usize) {
|
||||
if idx < self.size && self.is_kept(idx) {
|
||||
let word = idx / 32;
|
||||
let bit = idx % 32;
|
||||
if let Some(w) = self.mask.get_mut(word) {
|
||||
*w &= !(1 << bit);
|
||||
self.pruned_count += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Current sparsity level
|
||||
pub fn sparsity(&self) -> f32 {
|
||||
self.pruned_count as f32 / self.size as f32
|
||||
}
|
||||
}
|
||||
|
||||
/// Layer-level pruner using importance scoring
pub struct LayerPruner {
    /// Configuration
    config: PruningConfig,
    /// Importance scores for neurons/weights
    // Filled by the compute_* methods; capped at MAX_PRUNING_UNITS entries.
    importance_scores: HVec<i16, MAX_PRUNING_UNITS>,
    /// Current pruning step (for gradual pruning)
    // NOTE(review): initialized to 0 but never advanced anywhere in this
    // file — confirm gradual pruning is driven elsewhere.
    current_step: usize,
}
|
||||
|
||||
impl LayerPruner {
    /// Create new pruner with config
    pub fn new(config: PruningConfig) -> Self {
        Self {
            config,
            importance_scores: HVec::new(),
            current_step: 0,
        }
    }

    /// Compute importance scores for weights using magnitude
    // Only the first MAX_PRUNING_UNITS weights are scored; the rest are
    // silently ignored (see `iter().take(...)` below).
    pub fn compute_magnitude_importance(&mut self, weights: &[i8]) {
        self.importance_scores.clear();

        for &w in weights.iter().take(MAX_PRUNING_UNITS) {
            let importance = (w as i16).abs();
            let _ = self.importance_scores.push(importance);
        }
    }

    /// Compute importance using gradient information (simplified)
    /// For on-device: use weight * activation as proxy
    // Like the magnitude variant, truncated to MAX_PRUNING_UNITS pairs.
    pub fn compute_gradient_importance(&mut self, weights: &[i8], activations: &[i8]) {
        self.importance_scores.clear();

        for (&w, &a) in weights.iter().zip(activations.iter()).take(MAX_PRUNING_UNITS) {
            // |weight * activation| as importance proxy
            // The >> 4 rescales the i32 product toward i16 range before the cast.
            let importance = ((w as i32 * a as i32).abs() >> 4) as i16;
            let _ = self.importance_scores.push(importance);
        }
    }

    /// Create pruning mask based on importance scores
    ///
    /// Prunes every index whose score is strictly below the computed
    /// threshold (ties with the threshold are kept).
    pub fn create_mask<const N: usize>(&self, size: usize) -> crate::Result<PruningMask<N>> {
        let mut mask = PruningMask::new(size)?;

        // Count weights below threshold
        let threshold = self.compute_threshold(size);

        for (idx, &score) in self.importance_scores.iter().enumerate() {
            if score < threshold {
                mask.prune(idx);
            }
        }

        Ok(mask)
    }

    /// Compute importance threshold for target sparsity
    // Picks the `target_pruned`-th smallest score so that roughly
    // `target_sparsity * size` scores fall strictly below it.
    fn compute_threshold(&self, size: usize) -> i16 {
        let target_pruned = (size as f32 * self.config.target_sparsity) as usize;

        if target_pruned == 0 || self.importance_scores.is_empty() {
            return 0;
        }

        // Find threshold that achieves target sparsity
        // Simple approach: sort importance and pick threshold
        let mut sorted: HVec<i16, MAX_PRUNING_UNITS> = HVec::new();
        for &s in &self.importance_scores {
            let _ = sorted.push(s);
        }

        // Bubble sort (fine for small arrays)
        for i in 0..sorted.len() {
            for j in 0..sorted.len() - 1 - i {
                if sorted[j] > sorted[j + 1] {
                    sorted.swap(j, j + 1);
                }
            }
        }

        let idx = target_pruned.min(sorted.len().saturating_sub(1));
        sorted.get(idx).copied().unwrap_or(0)
    }

    /// Apply pruning mask to weights in-place
    // Zeroes every weight whose mask bit is cleared; kept weights untouched.
    pub fn apply_mask<const N: usize>(&self, weights: &mut [i8], mask: &PruningMask<N>) {
        for (idx, weight) in weights.iter_mut().enumerate() {
            if !mask.is_kept(idx) {
                *weight = 0;
            }
        }
    }

    /// Structured pruning: remove entire neurons
    ///
    /// Treats `weights` as a row-major `[output_dim][input_dim]` matrix,
    /// scores each output neuron by the L1 norm of its row, zeroes the
    /// lowest-scoring rows, and returns a keep/prune flag per neuron.
    // NOTE(review): with target_sparsity == 1.0, `sorted.get(target_pruned)`
    // is out of range, the threshold falls back to 0 and nothing is pruned
    // (all L1 sums are >= 0) — confirm whether full sparsity is a supported
    // input.
    pub fn prune_neurons(
        &mut self,
        weights: &mut [i8],
        input_dim: usize,
        output_dim: usize,
    ) -> HVec<bool, MAX_PRUNING_UNITS> {
        // Compute per-neuron importance (L1 norm of weights)
        let mut neuron_importance: HVec<i32, MAX_PRUNING_UNITS> = HVec::new();

        for out_idx in 0..output_dim.min(MAX_PRUNING_UNITS) {
            let mut l1_sum: i32 = 0;
            for in_idx in 0..input_dim {
                let w_idx = out_idx * input_dim + in_idx;
                if w_idx < weights.len() {
                    l1_sum += (weights[w_idx] as i32).abs();
                }
            }
            let _ = neuron_importance.push(l1_sum);
        }

        // Find threshold
        let target_pruned = (output_dim as f32 * self.config.target_sparsity) as usize;
        let mut sorted: HVec<i32, MAX_PRUNING_UNITS> = neuron_importance.clone();

        // Bubble sort, ascending (small fixed-size arrays).
        for i in 0..sorted.len() {
            for j in 0..sorted.len() - 1 - i {
                if sorted[j] > sorted[j + 1] {
                    sorted.swap(j, j + 1);
                }
            }
        }

        let threshold = sorted.get(target_pruned).copied().unwrap_or(0);

        // Mark neurons to prune
        let mut keep_mask: HVec<bool, MAX_PRUNING_UNITS> = HVec::new();

        for &importance in &neuron_importance {
            let _ = keep_mask.push(importance >= threshold);
        }

        // Zero out pruned neurons
        for out_idx in 0..output_dim.min(keep_mask.len()) {
            if !keep_mask[out_idx] {
                for in_idx in 0..input_dim {
                    let w_idx = out_idx * input_dim + in_idx;
                    if w_idx < weights.len() {
                        weights[w_idx] = 0;
                    }
                }
            }
        }

        keep_mask
    }

    /// Get statistics about pruning
    pub fn pruning_stats<const N: usize>(&self, mask: &PruningMask<N>) -> PruningStats {
        PruningStats {
            total_weights: mask.size,
            pruned_weights: mask.pruned_count,
            sparsity: mask.sparsity(),
            memory_saved: mask.pruned_count, // 1 byte per weight
        }
    }
}
|
||||
|
||||
/// Statistics about pruning results
#[derive(Debug, Clone)]
pub struct PruningStats {
    /// Total weight count
    pub total_weights: usize,
    /// Number of pruned weights
    pub pruned_weights: usize,
    /// Achieved sparsity
    pub sparsity: f32,
    /// Memory saved in bytes
    // NOTE(review): counts one byte per pruned INT8 weight; the saving is
    // only realized if storage is actually compacted — a mask alone does
    // not reclaim memory.
    pub memory_saved: usize,
}
|
||||
|
||||
/// MinCut-inspired importance scoring
/// Treats weight matrix as bipartite graph, finds min-cut to preserve information flow
// This is a heuristic approximation: "flow" is just per-row/column L1 mass,
// not an actual max-flow computation (see compute_edge_importance).
pub struct MinCutScorer {
    /// Flow values from source to each input neuron
    input_flow: HVec<i32, MAX_PRUNING_UNITS>,
    /// Flow values from each output neuron to sink
    output_flow: HVec<i32, MAX_PRUNING_UNITS>,
}
|
||||
|
||||
impl MinCutScorer {
    /// Create scorer
    pub fn new() -> Self {
        Self {
            input_flow: HVec::new(),
            output_flow: HVec::new(),
        }
    }

    /// Compute edge importance using simplified max-flow
    /// Edges in min-cut are most critical for information flow
    ///
    /// Treats `weights` as a row-major `[output_dim][input_dim]` matrix and
    /// scores each edge by `min(input L1 mass, output L1 mass) * |weight|`.
    // NOTE(review): the returned vector is capped at MAX_PRUNING_UNITS (64)
    // entries, so for matrices with more than 64 edges only the first edges
    // in row-major order receive scores — confirm callers account for this.
    pub fn compute_edge_importance(
        &mut self,
        weights: &[i8],
        input_dim: usize,
        output_dim: usize,
    ) -> HVec<i16, MAX_PRUNING_UNITS> {
        // Initialize flow (simplified: use column/row sums)
        self.input_flow.clear();
        self.output_flow.clear();

        // Input flow: sum of absolute weights per input
        for in_idx in 0..input_dim.min(MAX_PRUNING_UNITS) {
            let mut flow: i32 = 0;
            for out_idx in 0..output_dim {
                let w_idx = out_idx * input_dim + in_idx;
                if w_idx < weights.len() {
                    flow += (weights[w_idx] as i32).abs();
                }
            }
            let _ = self.input_flow.push(flow);
        }

        // Output flow: sum of absolute weights per output
        for out_idx in 0..output_dim.min(MAX_PRUNING_UNITS) {
            let mut flow: i32 = 0;
            for in_idx in 0..input_dim {
                let w_idx = out_idx * input_dim + in_idx;
                if w_idx < weights.len() {
                    flow += (weights[w_idx] as i32).abs();
                }
            }
            let _ = self.output_flow.push(flow);
        }

        // Edge importance = min(input_flow, output_flow) * |weight|
        // Edges on min-cut have bottleneck flow
        let mut importance: HVec<i16, MAX_PRUNING_UNITS> = HVec::new();

        for out_idx in 0..output_dim.min(self.output_flow.len()) {
            let out_flow = self.output_flow[out_idx];
            for in_idx in 0..input_dim.min(self.input_flow.len()) {
                let in_flow = self.input_flow[in_idx];
                let w_idx = out_idx * input_dim + in_idx;

                if w_idx < weights.len() {
                    let w = (weights[w_idx] as i32).abs();
                    let bottleneck = in_flow.min(out_flow);
                    // >> 10 scales the product down before the narrowing cast.
                    // NOTE(review): the shifted value can still exceed i16
                    // range, and `as i16` then wraps — confirm this is the
                    // intended saturation behavior.
                    let edge_importance = ((w * bottleneck) >> 10) as i16;

                    if importance.len() < MAX_PRUNING_UNITS {
                        let _ = importance.push(edge_importance);
                    }
                }
            }
        }

        importance
    }
}
|
||||
|
||||
impl Default for MinCutScorer {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
// Unit tests: mask bit bookkeeping, magnitude-based unstructured pruning,
// structured neuron pruning, and the min-cut edge scorer.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_pruning_mask() {
        let mut mask = PruningMask::<64>::new(50).unwrap();

        assert!(mask.is_kept(0));
        assert!(mask.is_kept(49));
        assert_eq!(mask.sparsity(), 0.0);

        mask.prune(10);
        mask.prune(20);

        assert!(!mask.is_kept(10));
        assert!(!mask.is_kept(20));
        assert!(mask.is_kept(15));
        assert_eq!(mask.pruned_count, 2);
    }

    #[test]
    fn test_magnitude_pruning() {
        let config = PruningConfig {
            target_sparsity: 0.5,
            ..Default::default()
        };

        let mut pruner = LayerPruner::new(config);

        // Weights with varying magnitudes
        let weights: [i8; 8] = [1, -2, 50, -60, 3, -4, 70, 5];
        pruner.compute_magnitude_importance(&weights);

        let mask = pruner.create_mask::<8>(8).unwrap();

        // Should prune ~50% (low magnitude weights)
        assert!(mask.sparsity() >= 0.25 && mask.sparsity() <= 0.75);

        // High magnitude weights should be kept
        assert!(mask.is_kept(2)); // 50
        assert!(mask.is_kept(3)); // -60
        assert!(mask.is_kept(6)); // 70
    }

    #[test]
    fn test_structured_pruning() {
        let config = PruningConfig {
            target_sparsity: 0.5,
            structured: true,
            ..Default::default()
        };

        let mut pruner = LayerPruner::new(config);

        // 4x4 weight matrix
        let mut weights: [i8; 16] = [
            10, 10, 10, 10, // High importance neuron
            1, 1, 1, 1, // Low importance
            20, 20, 20, 20, // High importance
            2, 2, 2, 2, // Low importance
        ];

        let keep_mask = pruner.prune_neurons(&mut weights, 4, 4);

        // Should keep high importance neurons
        assert!(keep_mask[0]); // First neuron kept
        assert!(keep_mask[2]); // Third neuron kept

        // Low importance neurons should be zeroed
        if !keep_mask[1] {
            assert_eq!(weights[4], 0);
            assert_eq!(weights[5], 0);
        }
    }

    #[test]
    fn test_mincut_scorer() {
        let mut scorer = MinCutScorer::new();

        let weights: [i8; 9] = [
            10, 20, 30,
            5, 10, 15,
            1, 2, 3,
        ];

        let importance = scorer.compute_edge_importance(&weights, 3, 3);

        // Should have computed importance for edges
        assert!(!importance.is_empty());
    }
}
|
||||
298
vendor/ruvector/examples/ruvLLM/esp32/src/optimizations/sparse_attention.rs
vendored
Normal file
298
vendor/ruvector/examples/ruvLLM/esp32/src/optimizations/sparse_attention.rs
vendored
Normal file
@@ -0,0 +1,298 @@
|
||||
//! Sparse Attention Patterns for ESP32
|
||||
//!
|
||||
//! Reduces attention complexity from O(n²) to O(n) using:
|
||||
//! - Sliding window attention
|
||||
//! - Strided patterns
|
||||
//! - Block-sparse attention
|
||||
|
||||
use heapless::Vec as HVec;
|
||||
|
||||
/// Maximum sequence length for sparse patterns
// Must stay <= 32: each attention-mask row is a single u32 bitmask, one bit
// per key position (see SparseAttention::build_mask).
pub const MAX_SPARSE_SEQ: usize = 32;
/// Maximum window size
// NOTE(review): not enforced anywhere in this file — window sizes larger
// than 8 are accepted by the pattern constructors; confirm intended use.
pub const MAX_WINDOW_SIZE: usize = 8;
|
||||
|
||||
/// Attention pattern types
///
/// Each variant trades key coverage for compute; all are applied causally
/// by the mask builder (a query never attends past its own position).
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum AttentionPattern {
    /// Full attention (O(n²)) - baseline
    Full,
    /// Sliding window attention (O(n * w))
    SlidingWindow { window_size: usize },
    /// Strided attention (O(n * n/s))
    Strided { stride: usize },
    /// Combined window + stride
    Longformer { window_size: usize, stride: usize },
    /// Block diagonal attention
    BlockDiagonal { block_size: usize },
    /// Local + global tokens
    BigBird { window_size: usize, global_tokens: usize },
}

impl Default for AttentionPattern {
    /// A 4-token sliding window — the cheapest useful pattern for tiny models.
    fn default() -> Self {
        AttentionPattern::SlidingWindow { window_size: 4 }
    }
}
|
||||
|
||||
/// Sparse attention implementation
///
/// The causal attention mask is precomputed at construction, so the hot
/// path is a single shift-and-test per (query, key) pair.
pub struct SparseAttention {
    /// Pattern type
    pattern: AttentionPattern,
    /// Attention mask (true = attend, false = skip)
    /// Stored as bitmask for memory efficiency
    // One u32 row per query position; bit j = "attend to key j".
    mask_data: HVec<u32, MAX_SPARSE_SEQ>,
    /// Sequence length
    seq_len: usize,
}
|
||||
|
||||
impl SparseAttention {
|
||||
/// Create sparse attention with given pattern
|
||||
pub fn new(pattern: AttentionPattern, seq_len: usize) -> crate::Result<Self> {
|
||||
if seq_len > MAX_SPARSE_SEQ {
|
||||
return Err(crate::Error::BufferOverflow);
|
||||
}
|
||||
|
||||
let mut sa = Self {
|
||||
pattern,
|
||||
mask_data: HVec::new(),
|
||||
seq_len,
|
||||
};
|
||||
|
||||
sa.build_mask()?;
|
||||
Ok(sa)
|
||||
}
|
||||
|
||||
/// Build attention mask based on pattern
|
||||
fn build_mask(&mut self) -> crate::Result<()> {
|
||||
self.mask_data.clear();
|
||||
|
||||
for i in 0..self.seq_len {
|
||||
let mut row_mask: u32 = 0;
|
||||
|
||||
for j in 0..self.seq_len {
|
||||
if j <= i && self.should_attend(i, j) {
|
||||
row_mask |= 1 << j;
|
||||
}
|
||||
}
|
||||
|
||||
self.mask_data.push(row_mask).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Check if position i should attend to position j
|
||||
fn should_attend(&self, i: usize, j: usize) -> bool {
|
||||
match self.pattern {
|
||||
AttentionPattern::Full => true,
|
||||
|
||||
AttentionPattern::SlidingWindow { window_size } => {
|
||||
i.saturating_sub(window_size) <= j
|
||||
}
|
||||
|
||||
AttentionPattern::Strided { stride } => {
|
||||
j % stride == 0 || i.saturating_sub(1) <= j
|
||||
}
|
||||
|
||||
AttentionPattern::Longformer { window_size, stride } => {
|
||||
// Local window OR strided global
|
||||
i.saturating_sub(window_size) <= j || j % stride == 0
|
||||
}
|
||||
|
||||
AttentionPattern::BlockDiagonal { block_size } => {
|
||||
// Same block
|
||||
i / block_size == j / block_size
|
||||
}
|
||||
|
||||
AttentionPattern::BigBird { window_size, global_tokens } => {
|
||||
// Local window OR global tokens (first N positions)
|
||||
i.saturating_sub(window_size) <= j || j < global_tokens
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Check if query position i should attend to key position j
|
||||
#[inline]
|
||||
pub fn should_attend_at(&self, i: usize, j: usize) -> bool {
|
||||
if i >= self.seq_len || j >= self.seq_len {
|
||||
return false;
|
||||
}
|
||||
(self.mask_data[i] >> j) & 1 == 1
|
||||
}
|
||||
|
||||
/// Get mask row for position i (for vectorized attention)
|
||||
#[inline]
|
||||
pub fn get_mask_row(&self, i: usize) -> u32 {
|
||||
self.mask_data.get(i).copied().unwrap_or(0)
|
||||
}
|
||||
|
||||
/// Apply sparse attention: scores = Q @ K^T, masked
|
||||
/// Only computes necessary positions
|
||||
pub fn sparse_qk(
|
||||
&self,
|
||||
query: &[i8], // [dim]
|
||||
keys: &[&[i8]], // [seq_len][dim]
|
||||
scores: &mut [i32], // [seq_len]
|
||||
query_pos: usize,
|
||||
) {
|
||||
let mask = self.get_mask_row(query_pos);
|
||||
|
||||
for (j, key) in keys.iter().enumerate() {
|
||||
if (mask >> j) & 1 == 1 {
|
||||
// Compute dot product
|
||||
let mut sum: i32 = 0;
|
||||
for (&q, &k) in query.iter().zip(key.iter()) {
|
||||
sum += q as i32 * k as i32;
|
||||
}
|
||||
scores[j] = sum;
|
||||
} else {
|
||||
scores[j] = i32::MIN; // Will be zeroed by softmax
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Count active attention positions
|
||||
pub fn active_positions(&self) -> usize {
|
||||
self.mask_data.iter().map(|m| m.count_ones() as usize).sum()
|
||||
}
|
||||
|
||||
/// Theoretical vs actual computation ratio
|
||||
pub fn sparsity_ratio(&self) -> f32 {
|
||||
let full = self.seq_len * (self.seq_len + 1) / 2; // Lower triangular
|
||||
let sparse = self.active_positions();
|
||||
sparse as f32 / full as f32
|
||||
}
|
||||
|
||||
/// Memory savings description
|
||||
pub fn memory_savings(&self) -> &'static str {
|
||||
match self.pattern {
|
||||
AttentionPattern::Full => "None (O(n²))",
|
||||
AttentionPattern::SlidingWindow { .. } => "O(n) - linear",
|
||||
AttentionPattern::Strided { .. } => "O(n) - linear",
|
||||
AttentionPattern::Longformer { .. } => "O(n) - linear",
|
||||
AttentionPattern::BlockDiagonal { .. } => "O(n) - block-linear",
|
||||
AttentionPattern::BigBird { .. } => "O(n) - linear",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Precomputed attention patterns for different sequence lengths
pub struct AttentionPatternCache {
    /// Cached patterns for common lengths
    // Slots cover seq_len buckets 1-8, 9-16, 17-24, 25-32.
    patterns: [Option<SparseAttention>; 4],
}

impl AttentionPatternCache {
    /// Create cache with sliding window patterns
    // Entries are None if SparseAttention::new fails for that length.
    pub fn new_sliding(window_size: usize) -> Self {
        let pattern = AttentionPattern::SlidingWindow { window_size };

        Self {
            patterns: [
                SparseAttention::new(pattern, 8).ok(),
                SparseAttention::new(pattern, 16).ok(),
                SparseAttention::new(pattern, 24).ok(),
                SparseAttention::new(pattern, 32).ok(),
            ],
        }
    }

    /// Get pattern for sequence length
    ///
    /// Returns None for `seq_len == 0` or `seq_len > 32`.
    // NOTE(review): the returned mask was built for the bucket's maximum
    // length (e.g. a seq_len of 10 gets the 16-length mask) — callers must
    // only index rows < their actual sequence length.
    pub fn get(&self, seq_len: usize) -> Option<&SparseAttention> {
        let idx = match seq_len {
            1..=8 => 0,
            9..=16 => 1,
            17..=24 => 2,
            25..=32 => 3,
            _ => return None,
        };
        self.patterns[idx].as_ref()
    }
}
|
||||
|
||||
// Unit tests: per-pattern mask membership and sparsity-ratio ordering.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_sliding_window() {
        let sa = SparseAttention::new(
            AttentionPattern::SlidingWindow { window_size: 2 },
            8,
        ).unwrap();

        // Position 0: should only attend to 0
        assert!(sa.should_attend_at(0, 0));
        assert!(!sa.should_attend_at(0, 1));

        // Position 4: should attend to 2, 3, 4
        assert!(!sa.should_attend_at(4, 1));
        assert!(sa.should_attend_at(4, 2));
        assert!(sa.should_attend_at(4, 3));
        assert!(sa.should_attend_at(4, 4));
        assert!(!sa.should_attend_at(4, 5)); // Future
    }

    #[test]
    fn test_strided() {
        let sa = SparseAttention::new(
            AttentionPattern::Strided { stride: 4 },
            16,
        ).unwrap();

        // Position 10: attends to 0, 4, 8, 9, 10
        assert!(sa.should_attend_at(10, 0)); // stride
        assert!(sa.should_attend_at(10, 4)); // stride
        assert!(sa.should_attend_at(10, 8)); // stride
        assert!(sa.should_attend_at(10, 9)); // local
        assert!(sa.should_attend_at(10, 10)); // self
        assert!(!sa.should_attend_at(10, 1)); // not stride, not local
    }

    #[test]
    fn test_sparsity() {
        let full = SparseAttention::new(AttentionPattern::Full, 16).unwrap();
        let sparse = SparseAttention::new(
            AttentionPattern::SlidingWindow { window_size: 4 },
            16,
        ).unwrap();

        // Full should have all positions
        assert!(full.sparsity_ratio() > 0.99);

        // Sparse should save computation
        assert!(sparse.sparsity_ratio() < full.sparsity_ratio());
    }

    #[test]
    fn test_block_diagonal() {
        let sa = SparseAttention::new(
            AttentionPattern::BlockDiagonal { block_size: 4 },
            16,
        ).unwrap();

        // Position 5 (block 1): attends to 4, 5 only
        assert!(!sa.should_attend_at(5, 3)); // Block 0
        assert!(sa.should_attend_at(5, 4)); // Block 1
        assert!(sa.should_attend_at(5, 5)); // Block 1, self
        assert!(!sa.should_attend_at(5, 6)); // Block 1, future
        assert!(!sa.should_attend_at(5, 8)); // Block 2
    }

    #[test]
    fn test_bigbird() {
        let sa = SparseAttention::new(
            AttentionPattern::BigBird { window_size: 2, global_tokens: 2 },
            16,
        ).unwrap();

        // Position 10: attends to 0, 1 (global), 8, 9, 10 (window)
        assert!(sa.should_attend_at(10, 0)); // global
        assert!(sa.should_attend_at(10, 1)); // global
        assert!(!sa.should_attend_at(10, 5)); // neither
        assert!(sa.should_attend_at(10, 8)); // window
        assert!(sa.should_attend_at(10, 10)); // self
    }
}
|
||||
418
vendor/ruvector/examples/ruvLLM/esp32/src/ota.rs
vendored
Normal file
418
vendor/ruvector/examples/ruvLLM/esp32/src/ota.rs
vendored
Normal file
@@ -0,0 +1,418 @@
|
||||
//! Over-the-Air (OTA) Update System for RuvLLM ESP32
|
||||
//!
|
||||
//! Enables wireless firmware updates via WiFi without physical access to the device.
|
||||
//!
|
||||
//! # Features
|
||||
//! - HTTPS firmware download with verification
|
||||
//! - SHA256 checksum validation
|
||||
//! - Rollback on failed update
|
||||
//! - Progress callbacks
|
||||
//! - Minimal RAM footprint (streaming update)
|
||||
|
||||
use core::fmt;
|
||||
|
||||
/// OTA update configuration
///
/// All strings are fixed-capacity `heapless` strings sized for a no_std
/// target; values longer than the capacity cannot be stored.
#[derive(Clone)]
pub struct OtaConfig {
    /// Firmware server URL
    pub server_url: heapless::String<128>,
    /// Current firmware version
    pub current_version: heapless::String<16>,
    /// WiFi SSID
    pub wifi_ssid: heapless::String<32>,
    /// WiFi password
    pub wifi_password: heapless::String<64>,
    /// Check interval in seconds (0 = manual only)
    pub check_interval_secs: u32,
    /// Enable automatic updates
    pub auto_update: bool,
}
|
||||
|
||||
impl Default for OtaConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
server_url: heapless::String::new(),
|
||||
current_version: heapless::String::try_from("0.2.1").unwrap_or_default(),
|
||||
wifi_ssid: heapless::String::new(),
|
||||
wifi_password: heapless::String::new(),
|
||||
check_interval_secs: 3600, // 1 hour
|
||||
auto_update: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// OTA update state
///
/// Each variant is a position in the update state machine; `Display`
/// renders a short human-readable label for logging/UI.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OtaState {
    /// Idle, waiting for update check
    Idle,
    /// Checking for updates
    Checking,
    /// Update available
    UpdateAvailable,
    /// Downloading firmware
    Downloading,
    /// Verifying firmware
    Verifying,
    /// Applying update
    Applying,
    /// Update complete, pending reboot
    Complete,
    /// Update failed
    Failed,
}

impl fmt::Display for OtaState {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let label = match self {
            OtaState::Idle => "Idle",
            OtaState::Checking => "Checking",
            OtaState::UpdateAvailable => "Update Available",
            OtaState::Downloading => "Downloading",
            OtaState::Verifying => "Verifying",
            OtaState::Applying => "Applying",
            OtaState::Complete => "Complete",
            OtaState::Failed => "Failed",
        };
        f.write_str(label)
    }
}
|
||||
|
||||
/// Update information
///
/// Metadata describing an available firmware image, as reported by the
/// update server. Fixed-capacity strings for no_std use.
#[derive(Clone)]
pub struct UpdateInfo {
    /// New version string
    pub version: heapless::String<16>,
    /// Firmware size in bytes
    pub size: u32,
    /// SHA256 checksum (hex string)
    // 64 hex chars = 32-byte SHA256 digest.
    pub checksum: heapless::String<64>,
    /// Release notes
    pub notes: heapless::String<256>,
    /// Download URL
    pub download_url: heapless::String<256>,
}
|
||||
|
||||
/// OTA update error
///
/// Failure categories for the OTA pipeline; `Display` yields a short
/// user-facing description for each.
#[derive(Debug, Clone, Copy)]
pub enum OtaError {
    /// WiFi connection failed
    WifiError,
    /// HTTP request failed
    HttpError,
    /// Invalid response from server
    InvalidResponse,
    /// Checksum mismatch
    ChecksumMismatch,
    /// Not enough storage space
    InsufficientSpace,
    /// Flash write failed
    FlashError,
    /// Update verification failed
    VerificationFailed,
    /// No update available
    NoUpdate,
    /// Already up to date
    AlreadyUpToDate,
}

impl fmt::Display for OtaError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let message = match self {
            OtaError::WifiError => "WiFi connection failed",
            OtaError::HttpError => "HTTP request failed",
            OtaError::InvalidResponse => "Invalid server response",
            OtaError::ChecksumMismatch => "Checksum verification failed",
            OtaError::InsufficientSpace => "Not enough storage space",
            OtaError::FlashError => "Flash write error",
            OtaError::VerificationFailed => "Update verification failed",
            OtaError::NoUpdate => "No update available",
            OtaError::AlreadyUpToDate => "Already up to date",
        };
        f.write_str(message)
    }
}
|
||||
|
||||
/// Progress callback type
///
/// Invoked with (bytes downloaded so far, total bytes); passed as an
/// `Option` to the download routine.
pub type ProgressCallback = fn(downloaded: u32, total: u32);
|
||||
|
||||
/// OTA Update Manager
///
/// Holds the configuration plus the live state-machine position, download
/// progress, most recent error, and any discovered update metadata.
pub struct OtaManager {
    // Connection and version settings (see OtaConfig).
    config: OtaConfig,
    // Current OtaState machine position.
    state: OtaState,
    // Download progress; the `progress()` accessor documents 0-100.
    progress: u32,
    // Most recent error, kept after the operation returns.
    last_error: Option<OtaError>,
    // Metadata for a discovered update, if any.
    update_info: Option<UpdateInfo>,
}
|
||||
|
||||
impl OtaManager {
|
||||
/// Create new OTA manager with config
|
||||
pub fn new(config: OtaConfig) -> Self {
|
||||
Self {
|
||||
config,
|
||||
state: OtaState::Idle,
|
||||
progress: 0,
|
||||
last_error: None,
|
||||
update_info: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Get current state
pub fn state(&self) -> OtaState {
    self.state
}

/// Get download progress (0-100)
pub fn progress(&self) -> u32 {
    self.progress
}

/// Get last error
// Copy of the most recent error; remains set after the failing call returns.
pub fn last_error(&self) -> Option<OtaError> {
    self.last_error
}

/// Get available update info
// Borrow of the metadata discovered by check_for_update, if any.
pub fn update_info(&self) -> Option<&UpdateInfo> {
    self.update_info.as_ref()
}
|
||||
|
||||
/// Check for updates (simulation for no_std)
///
/// In a real implementation, this would:
/// 1. Connect to WiFi
/// 2. Query the update server
/// 3. Parse the response
/// 4. Compare versions
///
/// Returns `Ok(true)` and populates `update_info` when a newer version is
/// found; `Ok(false)` when already up to date.
pub fn check_for_update(&mut self) -> Result<bool, OtaError> {
    self.state = OtaState::Checking;
    self.last_error = None;

    // Simulated version check
    // In real impl: HTTP GET to {server_url}/version.json
    // NOTE(review): hard-coded stand-in version; the Ok(true) branch below
    // always fires while current_version < 0.2.2.
    let server_version = "0.2.2"; // Would come from server

    if self.is_newer_version(server_version) {
        self.update_info = Some(UpdateInfo {
            version: heapless::String::try_from(server_version).unwrap_or_default(),
            size: 512 * 1024, // 512KB
            // Placeholder digest (SHA256 of the empty input).
            checksum: heapless::String::try_from(
                "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
            ).unwrap_or_default(),
            notes: heapless::String::try_from("Performance improvements and bug fixes").unwrap_or_default(),
            download_url: heapless::String::try_from(
                "https://github.com/ruvnet/ruvector/releases/latest/download/ruvllm-esp32"
            ).unwrap_or_default(),
        });
        self.state = OtaState::UpdateAvailable;
        Ok(true)
    } else {
        self.state = OtaState::Idle;
        // NOTE(review): records AlreadyUpToDate in last_error even though
        // the call returns Ok(false) — callers polling last_error() will
        // see an "error" after a successful no-op check.
        self.last_error = Some(OtaError::AlreadyUpToDate);
        Ok(false)
    }
}
|
||||
|
||||
/// Compare version strings (simple semver comparison)
|
||||
fn is_newer_version(&self, server_version: &str) -> bool {
|
||||
let current = self.parse_version(self.config.current_version.as_str());
|
||||
let server = self.parse_version(server_version);
|
||||
|
||||
server > current
|
||||
}
|
||||
|
||||
/// Parse version string to tuple
|
||||
fn parse_version(&self, version: &str) -> (u32, u32, u32) {
|
||||
let mut parts = version.split('.');
|
||||
let major = parts.next().and_then(|s| s.parse().ok()).unwrap_or(0);
|
||||
let minor = parts.next().and_then(|s| s.parse().ok()).unwrap_or(0);
|
||||
let patch = parts.next().and_then(|s| s.parse().ok()).unwrap_or(0);
|
||||
(major, minor, patch)
|
||||
}
|
||||
|
||||
/// Start firmware download
|
||||
///
|
||||
/// In real implementation:
|
||||
/// 1. Stream download to flash partition
|
||||
/// 2. Verify checksum incrementally
|
||||
/// 3. Call progress callback
|
||||
pub fn download_update(&mut self, _progress_cb: Option<ProgressCallback>) -> Result<(), OtaError> {
|
||||
if self.state != OtaState::UpdateAvailable {
|
||||
return Err(OtaError::NoUpdate);
|
||||
}
|
||||
|
||||
self.state = OtaState::Downloading;
|
||||
self.progress = 0;
|
||||
|
||||
// Simulated download
|
||||
// In real impl: HTTP GET with streaming to flash
|
||||
let total_size = self.update_info.as_ref().map(|i| i.size).unwrap_or(0);
|
||||
|
||||
// Simulate progress
|
||||
for i in 0..=100 {
|
||||
self.progress = i;
|
||||
if let Some(cb) = _progress_cb {
|
||||
cb(i * total_size / 100, total_size);
|
||||
}
|
||||
}
|
||||
|
||||
self.state = OtaState::Verifying;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Verify downloaded firmware
|
||||
pub fn verify_update(&mut self) -> Result<(), OtaError> {
|
||||
if self.state != OtaState::Verifying {
|
||||
return Err(OtaError::VerificationFailed);
|
||||
}
|
||||
|
||||
// In real impl: Calculate SHA256 of downloaded partition
|
||||
// Compare with expected checksum
|
||||
|
||||
// Simulated verification
|
||||
self.state = OtaState::Complete;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Apply update and reboot
|
||||
///
|
||||
/// In real implementation:
|
||||
/// 1. Set boot partition to new firmware
|
||||
/// 2. Reboot device
|
||||
pub fn apply_update(&mut self) -> Result<(), OtaError> {
|
||||
if self.state != OtaState::Complete {
|
||||
return Err(OtaError::VerificationFailed);
|
||||
}
|
||||
|
||||
self.state = OtaState::Applying;
|
||||
|
||||
// In real impl:
|
||||
// esp_ota_set_boot_partition(...)
|
||||
// esp_restart()
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Rollback to previous firmware
|
||||
pub fn rollback(&mut self) -> Result<(), OtaError> {
|
||||
// In real impl:
|
||||
// esp_ota_mark_app_invalid_rollback_and_reboot()
|
||||
self.state = OtaState::Idle;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Get human-readable status
|
||||
pub fn status_string(&self) -> &'static str {
|
||||
match self.state {
|
||||
OtaState::Idle => "Ready",
|
||||
OtaState::Checking => "Checking for updates...",
|
||||
OtaState::UpdateAvailable => "Update available!",
|
||||
OtaState::Downloading => "Downloading update...",
|
||||
OtaState::Verifying => "Verifying firmware...",
|
||||
OtaState::Applying => "Applying update...",
|
||||
OtaState::Complete => "Update complete! Reboot to apply.",
|
||||
OtaState::Failed => "Update failed",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// OTA serial command handler
|
||||
/// OTA serial command handler
///
/// Dispatches on the first whitespace-separated token of `command` and
/// returns a human-readable response (truncated to 256 bytes by `heapless`).
pub fn handle_ota_command(manager: &mut OtaManager, command: &str) -> heapless::String<256> {
    let mut response = heapless::String::new();

    // First token selects the sub-command; extra tokens are ignored.
    let parts: heapless::Vec<&str, 4> = command.split_whitespace().collect();
    let cmd = parts.first().copied().unwrap_or("");

    match cmd {
        "status" => {
            let _ = core::fmt::write(
                &mut response,
                format_args!("OTA Status: {} ({}%)", manager.status_string(), manager.progress()),
            );
        }
        "check" => match manager.check_for_update() {
            Ok(true) => {
                if let Some(info) = manager.update_info() {
                    let _ = core::fmt::write(
                        &mut response,
                        format_args!("Update available: v{} ({}KB)", info.version, info.size / 1024),
                    );
                }
            }
            Ok(false) => {
                let _ = response.push_str("Already up to date");
            }
            Err(e) => {
                let _ = core::fmt::write(&mut response, format_args!("Check failed: {}", e));
            }
        },
        "download" => match manager.download_update(None) {
            Ok(()) => {
                let _ = response.push_str("Download complete");
            }
            Err(e) => {
                let _ = core::fmt::write(&mut response, format_args!("Download failed: {}", e));
            }
        },
        "apply" => {
            // The verification result is intentionally discarded:
            // apply_update re-checks the state and fails cleanly if
            // verification did not complete.
            let _ = manager.verify_update();
            match manager.apply_update() {
                Ok(()) => {
                    let _ = response.push_str("Rebooting to apply update...");
                }
                Err(e) => {
                    let _ = core::fmt::write(&mut response, format_args!("Apply failed: {}", e));
                }
            }
        }
        "rollback" => match manager.rollback() {
            Ok(()) => {
                let _ = response.push_str("Rolling back to previous firmware...");
            }
            Err(e) => {
                let _ = core::fmt::write(&mut response, format_args!("Rollback failed: {}", e));
            }
        },
        _ => {
            let _ = response.push_str("OTA commands: status, check, download, apply, rollback");
        }
    }

    response
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_version_comparison() {
        let config = OtaConfig {
            current_version: heapless::String::try_from("0.2.1").unwrap(),
            ..Default::default()
        };
        let manager = OtaManager::new(config);

        // Strictly newer versions are detected...
        for newer in ["0.2.2", "0.3.0", "1.0.0"] {
            assert!(manager.is_newer_version(newer));
        }
        // ...while equal or older ones are not.
        for not_newer in ["0.2.1", "0.2.0", "0.1.0"] {
            assert!(!manager.is_newer_version(not_newer));
        }
    }

    #[test]
    fn test_state_transitions() {
        let mut manager = OtaManager::new(OtaConfig::default());
        assert_eq!(manager.state(), OtaState::Idle);

        // Either outcome of the (simulated) check is a legal transition.
        let _ = manager.check_for_update();
        assert!(matches!(manager.state(), OtaState::UpdateAvailable | OtaState::Idle));
    }
}
|
||||
316
vendor/ruvector/examples/ruvLLM/esp32/src/quantized.rs
vendored
Normal file
316
vendor/ruvector/examples/ruvLLM/esp32/src/quantized.rs
vendored
Normal file
@@ -0,0 +1,316 @@
|
||||
//! Quantized tensor operations for memory-efficient inference
|
||||
//!
|
||||
//! Supports INT8, INT4, and binary quantization for extreme memory savings.
|
||||
|
||||
use heapless::Vec as HVec;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// Maximum tensor size for stack allocation (16KB)
|
||||
pub const MAX_TENSOR_SIZE: usize = 16 * 1024;
|
||||
|
||||
/// Quantization type
///
/// Selects how f32 values are packed into bytes; smaller bit widths trade
/// precision for memory (see `compression_ratio`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum QuantizationType {
    /// 8-bit signed integer (-128 to 127)
    Int8,
    /// 4-bit signed integer (-8 to 7), packed 2 per byte
    Int4,
    /// Binary weights (-1 or +1), packed 8 per byte
    Binary,
    /// 16-bit fixed point (8.8 format)
    Fixed16,
}
|
||||
|
||||
impl QuantizationType {
|
||||
/// Bits per weight
|
||||
pub const fn bits(&self) -> usize {
|
||||
match self {
|
||||
Self::Int8 => 8,
|
||||
Self::Int4 => 4,
|
||||
Self::Binary => 1,
|
||||
Self::Fixed16 => 16,
|
||||
}
|
||||
}
|
||||
|
||||
/// Compression ratio vs FP32
|
||||
pub const fn compression_ratio(&self) -> usize {
|
||||
32 / self.bits()
|
||||
}
|
||||
}
|
||||
|
||||
/// Quantization parameters for dequantization
///
/// NOTE(review): the `scale` comment below states
/// `real_value = quantized_value * scale + zero_point`, but `quantize_data`
/// maps values as `(v - min_val) / scale` — confirm which affine convention
/// callers rely on before changing either side.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub struct QuantParams {
    /// Scale factor: real_value = quantized_value * scale + zero_point
    pub scale: f32,
    /// Zero point offset
    pub zero_point: f32,
    /// Min value in original tensor
    pub min_val: f32,
    /// Max value in original tensor
    pub max_val: f32,
}
|
||||
|
||||
impl Default for QuantParams {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
scale: 1.0 / 127.0,
|
||||
zero_point: 0.0,
|
||||
min_val: -1.0,
|
||||
max_val: 1.0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Quantized tensor stored in compact format
///
/// `N` is the byte capacity of the inline (stack) backing store; the packed
/// size depends on `quant_type` (see `QuantizationType::bits`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QuantizedTensor<const N: usize> {
    /// Quantized data
    pub data: HVec<u8, N>,
    /// Shape (max 4 dimensions for embedded)
    pub shape: [usize; 4],
    /// Number of dimensions used
    pub ndim: usize,
    /// Quantization type
    pub quant_type: QuantizationType,
    /// Quantization parameters
    pub params: QuantParams,
}
|
||||
|
||||
impl<const N: usize> QuantizedTensor<N> {
|
||||
/// Create a new quantized tensor from f32 data
|
||||
pub fn from_f32(data: &[f32], shape: &[usize], quant_type: QuantizationType) -> crate::Result<Self> {
|
||||
if data.is_empty() {
|
||||
return Err(crate::Error::QuantizationError("Empty data"));
|
||||
}
|
||||
|
||||
// Calculate min/max
|
||||
let mut min_val = f32::MAX;
|
||||
let mut max_val = f32::MIN;
|
||||
for &v in data {
|
||||
if v < min_val { min_val = v; }
|
||||
if v > max_val { max_val = v; }
|
||||
}
|
||||
|
||||
let params = match quant_type {
|
||||
QuantizationType::Int8 => {
|
||||
let scale = (max_val - min_val) / 255.0;
|
||||
let zero_point = -min_val / scale - 128.0;
|
||||
QuantParams { scale, zero_point, min_val, max_val }
|
||||
}
|
||||
QuantizationType::Int4 => {
|
||||
let scale = (max_val - min_val) / 15.0;
|
||||
let zero_point = -min_val / scale - 8.0;
|
||||
QuantParams { scale, zero_point, min_val, max_val }
|
||||
}
|
||||
QuantizationType::Binary => {
|
||||
QuantParams {
|
||||
scale: 1.0,
|
||||
zero_point: 0.0,
|
||||
min_val: -1.0,
|
||||
max_val: 1.0,
|
||||
}
|
||||
}
|
||||
QuantizationType::Fixed16 => {
|
||||
let scale = (max_val - min_val) / 65535.0;
|
||||
QuantParams { scale, zero_point: min_val, min_val, max_val }
|
||||
}
|
||||
};
|
||||
|
||||
let quantized_data = Self::quantize_data(data, quant_type, ¶ms)?;
|
||||
|
||||
let mut shape_arr = [0usize; 4];
|
||||
let ndim = shape.len().min(4);
|
||||
for (i, &s) in shape.iter().take(4).enumerate() {
|
||||
shape_arr[i] = s;
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
data: quantized_data,
|
||||
shape: shape_arr,
|
||||
ndim,
|
||||
quant_type,
|
||||
params,
|
||||
})
|
||||
}
|
||||
|
||||
fn quantize_data(data: &[f32], quant_type: QuantizationType, params: &QuantParams) -> crate::Result<HVec<u8, N>> {
|
||||
let mut result = HVec::new();
|
||||
|
||||
match quant_type {
|
||||
QuantizationType::Int8 => {
|
||||
for &v in data {
|
||||
let q = ((v - params.min_val) / params.scale).round() as i16;
|
||||
let q = q.clamp(-128, 127) as i8;
|
||||
result.push(q as u8).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
}
|
||||
}
|
||||
QuantizationType::Int4 => {
|
||||
// Pack 2 values per byte
|
||||
for chunk in data.chunks(2) {
|
||||
let v0 = ((chunk[0] - params.min_val) / params.scale).round() as i8;
|
||||
let v1 = if chunk.len() > 1 {
|
||||
((chunk[1] - params.min_val) / params.scale).round() as i8
|
||||
} else {
|
||||
0
|
||||
};
|
||||
let v0 = (v0.clamp(-8, 7) + 8) as u8;
|
||||
let v1 = (v1.clamp(-8, 7) + 8) as u8;
|
||||
let packed = (v0 & 0x0F) | ((v1 & 0x0F) << 4);
|
||||
result.push(packed).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
}
|
||||
}
|
||||
QuantizationType::Binary => {
|
||||
// Pack 8 values per byte
|
||||
for chunk in data.chunks(8) {
|
||||
let mut byte = 0u8;
|
||||
for (i, &v) in chunk.iter().enumerate() {
|
||||
if v >= 0.0 {
|
||||
byte |= 1 << i;
|
||||
}
|
||||
}
|
||||
result.push(byte).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
}
|
||||
}
|
||||
QuantizationType::Fixed16 => {
|
||||
for &v in data {
|
||||
let q = ((v - params.min_val) / params.scale).round() as u16;
|
||||
result.push((q >> 8) as u8).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
result.push((q & 0xFF) as u8).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Get total number of elements
|
||||
pub fn numel(&self) -> usize {
|
||||
self.shape[..self.ndim].iter().product()
|
||||
}
|
||||
|
||||
/// Get compressed size in bytes
|
||||
pub fn compressed_size(&self) -> usize {
|
||||
self.data.len()
|
||||
}
|
||||
|
||||
/// Memory savings compared to FP32
|
||||
pub fn memory_savings(&self) -> f32 {
|
||||
let fp32_size = self.numel() * 4;
|
||||
1.0 - (self.compressed_size() as f32 / fp32_size as f32)
|
||||
}
|
||||
}
|
||||
|
||||
/// INT8 matrix-vector multiplication (optimized for ESP32)
|
||||
///
|
||||
/// Computes: output = weights @ input
|
||||
/// Where weights is [out_dim, in_dim] and input is [in_dim]
|
||||
#[inline(never)] // Prevent inlining for better cache behavior
|
||||
pub fn matmul_int8(
|
||||
weights: &[i8],
|
||||
_weight_params: &QuantParams,
|
||||
input: &[i8],
|
||||
_input_params: &QuantParams,
|
||||
output: &mut [i32],
|
||||
out_dim: usize,
|
||||
in_dim: usize,
|
||||
) {
|
||||
debug_assert_eq!(weights.len(), out_dim * in_dim);
|
||||
debug_assert_eq!(input.len(), in_dim);
|
||||
debug_assert_eq!(output.len(), out_dim);
|
||||
|
||||
for i in 0..out_dim {
|
||||
let mut acc: i32 = 0;
|
||||
let row_start = i * in_dim;
|
||||
|
||||
// Process 4 elements at a time for better performance
|
||||
let chunks = in_dim / 4;
|
||||
for j in 0..chunks {
|
||||
let idx = j * 4;
|
||||
acc += weights[row_start + idx] as i32 * input[idx] as i32;
|
||||
acc += weights[row_start + idx + 1] as i32 * input[idx + 1] as i32;
|
||||
acc += weights[row_start + idx + 2] as i32 * input[idx + 2] as i32;
|
||||
acc += weights[row_start + idx + 3] as i32 * input[idx + 3] as i32;
|
||||
}
|
||||
|
||||
// Handle remainder
|
||||
for j in (chunks * 4)..in_dim {
|
||||
acc += weights[row_start + j] as i32 * input[j] as i32;
|
||||
}
|
||||
|
||||
output[i] = acc;
|
||||
}
|
||||
}
|
||||
|
||||
/// Dequantize INT32 accumulator to f32
|
||||
#[inline]
|
||||
pub fn dequantize_accumulator(
|
||||
acc: i32,
|
||||
weight_params: &QuantParams,
|
||||
input_params: &QuantParams,
|
||||
) -> f32 {
|
||||
acc as f32 * weight_params.scale * input_params.scale
|
||||
}
|
||||
|
||||
/// Binary XNOR-popcount for extreme efficiency
///
/// For binary neural networks: computes hamming similarity, returned as the
/// equivalent -1/+1 dot product (`matches * 2 - total_bits`).
#[inline]
pub fn binary_xnor_popcount(a: &[u8], b: &[u8]) -> i32 {
    debug_assert_eq!(a.len(), b.len());

    // XNOR leaves a 1 wherever the bits agree; counting set bits counts matches.
    let matching_bits: i32 = a
        .iter()
        .zip(b)
        .map(|(&x, &y)| (!(x ^ y)).count_ones() as i32)
        .sum();

    // Each matching bit contributes +1 and each differing bit -1 to the
    // -1/+1 dot product.
    let total_bits = (a.len() * 8) as i32;
    matching_bits * 2 - total_bits
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_int8_quantization() {
        let data = [-1.0f32, -0.5, 0.0, 0.5, 1.0];
        let tensor: QuantizedTensor<64> =
            QuantizedTensor::from_f32(&data, &[5], QuantizationType::Int8).unwrap();

        // One byte per element: 5 elements, 75% smaller than f32.
        assert_eq!(tensor.numel(), 5);
        assert_eq!(tensor.compressed_size(), 5);
        assert!(tensor.memory_savings() > 0.7);
    }

    #[test]
    fn test_binary_xnor() {
        // Identical bit patterns: all 16 bits match, so 16 * 2 - 16 = 16.
        let a = [0b11110000u8, 0b10101010];
        let b = [0b11110000u8, 0b10101010];
        assert_eq!(binary_xnor_popcount(&a, &b), 16);
    }

    #[test]
    fn test_int4_packing() {
        // Four values packed two-per-byte into exactly 2 bytes.
        let data = [0.0f32, 0.5, -0.5, 1.0];
        let tensor: QuantizedTensor<64> =
            QuantizedTensor::from_f32(&data, &[4], QuantizationType::Int4).unwrap();
        assert_eq!(tensor.compressed_size(), 2);
    }
}
|
||||
480
vendor/ruvector/examples/ruvLLM/esp32/src/ruvector/anomaly.rs
vendored
Normal file
480
vendor/ruvector/examples/ruvLLM/esp32/src/ruvector/anomaly.rs
vendored
Normal file
@@ -0,0 +1,480 @@
|
||||
//! Anomaly Detection - Intelligent Pattern Recognition for ESP32
|
||||
//!
|
||||
//! Uses vector embeddings to detect unusual patterns in sensor data,
|
||||
//! behavior, or any time-series data. Perfect for:
|
||||
//! - Industrial equipment monitoring
|
||||
//! - Security systems
|
||||
//! - Health monitoring
|
||||
//! - Environmental sensing
|
||||
//!
|
||||
//! # How It Works
|
||||
//!
|
||||
//! ```text
|
||||
//! Training Phase:
|
||||
//! ┌─────────────────────────────────────────────────────────┐
|
||||
//! │ Normal readings ──▶ Embed ──▶ Store in cluster │
|
||||
//! │ [temp=25, vibration=1.2, sound=40dB] │
|
||||
//! │ ▼ │
|
||||
//! │ [0.2, 0.1, 0.8, ...] ──▶ Centroid A │
|
||||
//! └─────────────────────────────────────────────────────────┘
|
||||
//!
|
||||
//! Detection Phase:
|
||||
//! ┌─────────────────────────────────────────────────────────┐
|
||||
//! │ New reading ──▶ Embed ──▶ Distance to clusters │
|
||||
//! │ [temp=85, vibration=15.0, sound=95dB] ◀── ANOMALY! │
|
||||
//! │ ▼ │
|
||||
//! │ [0.9, 0.8, 0.1, ...] ──▶ Distance: 0.95 │
|
||||
//! │ (threshold: 0.5) │
|
||||
//! └─────────────────────────────────────────────────────────┘
|
||||
//! ```
|
||||
|
||||
use heapless::Vec as HVec;
|
||||
use super::{MicroHNSW, HNSWConfig, MicroVector, DistanceMetric, euclidean_distance_i8};
|
||||
|
||||
/// Maximum normal patterns to learn
|
||||
pub const MAX_PATTERNS: usize = 128;
|
||||
/// Pattern embedding dimension
|
||||
pub const PATTERN_DIM: usize = 32;
|
||||
/// Maximum clusters
|
||||
pub const MAX_CLUSTERS: usize = 8;
|
||||
|
||||
/// Anomaly detection configuration
///
/// Tunables for `AnomalyDetector`; see `Default` for the chosen baseline.
#[derive(Debug, Clone)]
pub struct AnomalyConfig {
    /// Distance threshold for anomaly (0-1000 scale)
    /// Used directly only when `adaptive` is false (and by the collective-
    /// anomaly window check).
    pub threshold: i32,
    /// Minimum samples to establish baseline
    pub min_samples: usize,
    /// Enable adaptive threshold
    pub adaptive: bool,
    /// Smoothing factor for running average (0-100)
    /// NOTE(review): not read anywhere in the visible detector code —
    /// confirm it is consumed elsewhere or remove it.
    pub smoothing: u8,
    /// Number of clusters for pattern grouping
    pub num_clusters: usize,
}
|
||||
|
||||
impl Default for AnomalyConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
threshold: 500, // Distance threshold
|
||||
min_samples: 10, // Need 10 samples for baseline
|
||||
adaptive: true, // Adapt threshold over time
|
||||
smoothing: 80, // 80% weight to historical average
|
||||
num_clusters: 4, // Group into 4 clusters
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Anomaly detection result
#[derive(Debug, Clone)]
pub struct AnomalyResult {
    /// Is this an anomaly?
    pub is_anomaly: bool,
    /// Distance to nearest normal pattern
    pub distance: i32,
    /// Anomaly score (0-100, higher = more anomalous)
    /// Computed as distance relative to the active threshold, capped at 100.
    pub score: u8,
    /// Nearest cluster ID
    /// `None` only while the detector is still collecting its baseline.
    pub nearest_cluster: Option<u8>,
    /// Confidence level (0-100)
    /// Derived from the number of samples seen so far.
    pub confidence: u8,
    /// Suggested label for anomaly type
    pub anomaly_type: AnomalyType,
}
|
||||
|
||||
/// Types of anomalies
///
/// Assigned by `AnomalyDetector::classify_anomaly` using distance, the
/// recent-distance window, and the consecutive-anomaly streak.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum AnomalyType {
    /// Normal operation
    Normal,
    /// Point anomaly (single unusual reading)
    Point,
    /// Contextual anomaly (unusual for this context)
    Contextual,
    /// Collective anomaly (pattern of unusual readings)
    Collective,
    /// Drift (gradual change from baseline)
    Drift,
    /// Spike (sudden large change)
    Spike,
    /// Unknown pattern
    Unknown,
}
|
||||
|
||||
/// Cluster centroid
///
/// Internal accumulator for one group of learned normal patterns.
#[derive(Debug, Clone)]
struct Cluster {
    /// Centroid embedding (integer mean of member patterns)
    centroid: HVec<i32, PATTERN_DIM>,
    /// Number of samples in cluster
    count: u32,
    /// Sum for online averaging
    sum: HVec<i64, PATTERN_DIM>,
    /// Variance estimate
    /// NOTE(review): never updated in the visible code (stays 0) — confirm
    /// whether it is still needed.
    variance: i32,
}
|
||||
|
||||
impl Default for Cluster {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
centroid: HVec::new(),
|
||||
count: 0,
|
||||
sum: HVec::new(),
|
||||
variance: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Anomaly Detector
///
/// Learns a baseline of "normal" embeddings and flags readings whose
/// distance to the nearest learned pattern exceeds a (possibly adaptive)
/// threshold.
pub struct AnomalyDetector {
    /// Configuration
    config: AnomalyConfig,
    /// HNSW index for pattern matching
    index: MicroHNSW<PATTERN_DIM, MAX_PATTERNS>,
    /// Pattern storage
    patterns: HVec<HVec<i8, PATTERN_DIM>, MAX_PATTERNS>,
    /// Cluster centroids
    clusters: HVec<Cluster, MAX_CLUSTERS>,
    /// Running average distance
    avg_distance: i32,
    /// Running variance
    variance: i32,
    /// Sample count
    /// NOTE(review): incremented by both `learn` and `update_statistics`,
    /// so it conflates learned patterns with detection observations.
    sample_count: u32,
    /// Consecutive anomaly count
    anomaly_streak: u16,
    /// Last few readings for collective detection
    recent_window: HVec<i32, 16>,
}
|
||||
|
||||
impl AnomalyDetector {
    /// Create new anomaly detector
    pub fn new(config: AnomalyConfig) -> Self {
        // Small HNSW parameters chosen for MCU memory budgets.
        let hnsw_config = HNSWConfig {
            m: 4,
            m_max0: 8,
            ef_construction: 16,
            ef_search: 8,
            metric: DistanceMetric::Euclidean,
            binary_mode: false,
        };

        // Pre-create empty clusters up to the configured count.
        let mut clusters = HVec::new();
        for _ in 0..config.num_clusters {
            let _ = clusters.push(Cluster::default());
        }

        Self {
            config,
            index: MicroHNSW::new(hnsw_config),
            patterns: HVec::new(),
            clusters,
            avg_distance: 0,
            variance: 0,
            sample_count: 0,
            anomaly_streak: 0,
            recent_window: HVec::new(),
        }
    }

    /// Number of learned patterns
    pub fn pattern_count(&self) -> usize {
        self.patterns.len()
    }

    /// Has enough samples for reliable detection
    pub fn is_trained(&self) -> bool {
        self.sample_count >= self.config.min_samples as u32
    }

    /// Memory usage in bytes (index + stored patterns + cluster structs)
    pub fn memory_bytes(&self) -> usize {
        self.index.memory_bytes() +
        self.patterns.len() * PATTERN_DIM +
        self.clusters.len() * core::mem::size_of::<Cluster>()
    }

    /// Learn a normal pattern
    ///
    /// Truncates `embedding` to `PATTERN_DIM` values; when storage is full,
    /// the pattern at slot 0 is evicted first.
    pub fn learn(&mut self, embedding: &[i8]) -> Result<(), &'static str> {
        if self.patterns.len() >= MAX_PATTERNS {
            // Remove oldest pattern
            // NOTE(review): swap_remove moves the *last* pattern into slot 0,
            // so eviction is not strictly oldest-first — confirm intent.
            self.patterns.swap_remove(0);
        }

        // Store pattern
        let mut pattern = HVec::new();
        for &v in embedding.iter().take(PATTERN_DIM) {
            pattern.push(v).map_err(|_| "Pattern overflow")?;
        }

        // Add to index
        let vec = MicroVector {
            data: pattern.clone(),
            id: self.patterns.len() as u32,
        };
        self.index.insert(&vec)?;

        // Update clusters
        self.update_clusters(&pattern);

        self.patterns.push(pattern).map_err(|_| "Pattern storage full")?;
        self.sample_count += 1;

        Ok(())
    }

    /// Detect if embedding is anomalous
    ///
    /// While untrained, every reading is learned as normal and reported
    /// non-anomalous with zero confidence.
    pub fn detect(&mut self, embedding: &[i8]) -> AnomalyResult {
        // Not enough training data
        if !self.is_trained() {
            // Learn this as normal
            let _ = self.learn(embedding);
            return AnomalyResult {
                is_anomaly: false,
                distance: 0,
                score: 0,
                nearest_cluster: None,
                confidence: 0,
                anomaly_type: AnomalyType::Normal,
            };
        }

        // Find nearest pattern
        let results = self.index.search(embedding, 3);

        // NOTE(review): when the search returns nothing, distance becomes
        // i32::MAX, which can overflow in `distance * 100` below (panics in
        // debug builds) — confirm whether an empty result is reachable here.
        let distance = if results.is_empty() {
            i32::MAX
        } else {
            results[0].distance
        };

        // Find nearest cluster
        let (nearest_cluster, cluster_distance) = self.find_nearest_cluster(embedding);

        // Update running statistics
        // NOTE(review): update_statistics also bumps sample_count, and the
        // learn() call below may bump it again, so one detection can advance
        // the counter more than once — confirm whether sample_count is meant
        // to count observations or learned patterns.
        self.update_statistics(distance);

        // Calculate adaptive threshold
        let threshold = if self.config.adaptive {
            self.avg_distance + 2 * self.variance.max(100)
        } else {
            self.config.threshold
        };

        // Determine anomaly type
        let is_anomaly = distance > threshold;
        let anomaly_type = self.classify_anomaly(distance, is_anomaly);

        // Update streak
        if is_anomaly {
            self.anomaly_streak = self.anomaly_streak.saturating_add(1);
        } else {
            self.anomaly_streak = 0;
            // Optionally learn this as normal (only clearly-normal readings,
            // i.e. well under half the threshold)
            if distance < threshold / 2 {
                let _ = self.learn(embedding);
            }
        }

        // Calculate score (0-100)
        let score = if threshold > 0 {
            ((distance * 100) / threshold).min(100) as u8
        } else {
            0
        };

        // Confidence based on sample count (0-100 scale)
        let confidence = self.sample_count.min(100) as u8;

        AnomalyResult {
            is_anomaly,
            distance,
            score,
            nearest_cluster: Some(nearest_cluster),
            confidence,
            anomaly_type,
        }
    }

    /// Update running statistics
    fn update_statistics(&mut self, distance: i32) {
        // Online mean and variance (Welford's algorithm)
        self.sample_count += 1;
        let n = self.sample_count as i64;

        let delta = distance - self.avg_distance;
        self.avg_distance += (delta / n as i32);

        let delta2 = distance - self.avg_distance;
        self.variance = ((self.variance as i64 * (n - 1) + (delta as i64 * delta2 as i64)) / n) as i32;

        // Update recent window (fixed 16-entry FIFO)
        if self.recent_window.len() >= 16 {
            self.recent_window.remove(0);
        }
        let _ = self.recent_window.push(distance);
    }

    /// Update cluster centroids
    ///
    /// Assigns `pattern` to its nearest cluster and folds it into that
    /// cluster's running integer-mean centroid.
    fn update_clusters(&mut self, pattern: &[i8]) {
        // Find nearest cluster (all-empty clusters resolve to index 0)
        let (cluster_idx, _) = self.find_nearest_cluster(pattern);

        if let Some(cluster) = self.clusters.get_mut(cluster_idx as usize) {
            // Initialize if empty
            if cluster.count == 0 {
                for &v in pattern.iter().take(PATTERN_DIM) {
                    let _ = cluster.centroid.push(v as i32);
                    let _ = cluster.sum.push(v as i64);
                }
            } else {
                // Online centroid update
                for (i, &v) in pattern.iter().take(PATTERN_DIM).enumerate() {
                    if i < cluster.sum.len() {
                        cluster.sum[i] += v as i64;
                    }
                    if i < cluster.centroid.len() {
                        cluster.centroid[i] = (cluster.sum[i] / (cluster.count as i64 + 1)) as i32;
                    }
                }
            }
            cluster.count += 1;
        }
    }

    /// Find nearest cluster centroid
    ///
    /// Returns `(index, squared euclidean distance)`. Empty clusters are
    /// skipped; when every cluster is empty this returns `(0, i32::MAX)`.
    fn find_nearest_cluster(&self, pattern: &[i8]) -> (u8, i32) {
        let mut best_idx = 0u8;
        let mut best_dist = i32::MAX;

        for (i, cluster) in self.clusters.iter().enumerate() {
            if cluster.count == 0 {
                continue;
            }

            // Calculate distance to centroid
            let mut dist = 0i32;
            for (j, &v) in pattern.iter().take(PATTERN_DIM).enumerate() {
                if j < cluster.centroid.len() {
                    let diff = v as i32 - cluster.centroid[j];
                    dist += diff * diff;
                }
            }

            if dist < best_dist {
                best_dist = dist;
                best_idx = i as u8;
            }
        }

        (best_idx, best_dist)
    }

    /// Classify the type of anomaly
    ///
    /// Check order matters: spike > collective (window) > drift >
    /// collective (streak) > point.
    fn classify_anomaly(&self, distance: i32, is_anomaly: bool) -> AnomalyType {
        if !is_anomaly {
            return AnomalyType::Normal;
        }

        // Check for spike (sudden large deviation)
        if distance > self.avg_distance * 3 {
            return AnomalyType::Spike;
        }

        // Check for collective (multiple anomalies in window)
        let anomalies_in_window = self.recent_window.iter()
            .filter(|&&d| d > self.config.threshold)
            .count();

        if anomalies_in_window >= 3 {
            return AnomalyType::Collective;
        }

        // Check for drift (gradual increase across the first vs second half
        // of the oldest 8 window entries)
        if self.recent_window.len() >= 8 {
            let first_half_avg: i32 = self.recent_window[..4].iter().sum::<i32>() / 4;
            let second_half_avg: i32 = self.recent_window[4..8].iter().sum::<i32>() / 4;
            if second_half_avg > first_half_avg + self.variance {
                return AnomalyType::Drift;
            }
        }

        // Check for streak
        if self.anomaly_streak > 2 {
            return AnomalyType::Collective;
        }

        AnomalyType::Point
    }

    /// Get current threshold
    ///
    /// Adaptive: running mean + 2 * variance (variance floored at 100);
    /// otherwise the fixed configured threshold.
    pub fn current_threshold(&self) -> i32 {
        if self.config.adaptive {
            self.avg_distance + 2 * self.variance.max(100)
        } else {
            self.config.threshold
        }
    }

    /// Reset to untrained state
    ///
    /// Clears patterns, statistics, and cluster contents.
    /// NOTE(review): the HNSW `index` is not cleared here, so stale entries
    /// could still be returned by `search` after a reset — confirm intent.
    pub fn reset(&mut self) {
        self.patterns.clear();
        self.sample_count = 0;
        self.avg_distance = 0;
        self.variance = 0;
        self.anomaly_streak = 0;
        self.recent_window.clear();

        for cluster in self.clusters.iter_mut() {
            cluster.count = 0;
            cluster.centroid.clear();
            cluster.sum.clear();
        }
    }
}
|
||||
|
||||
impl Default for AnomalyDetector {
|
||||
fn default() -> Self {
|
||||
Self::new(AnomalyConfig::default())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_anomaly_detector() {
        let mut detector = AnomalyDetector::default();

        // Feed 20 shifted-but-similar patterns as the normal baseline.
        for i in 0..20 {
            let pattern: HVec<i8, PATTERN_DIM> =
                (0..PATTERN_DIM).map(|j| ((i + j) % 20) as i8).collect();
            detector.learn(&pattern).unwrap();
        }

        assert!(detector.is_trained());
        assert!(detector.pattern_count() >= 10);
    }

    #[test]
    fn test_detect_anomaly() {
        let mut detector = AnomalyDetector::default();

        // Baseline: a constant pattern of 10s.
        for _ in 0..20 {
            detector.learn(&[10i8; PATTERN_DIM]).unwrap();
        }

        // A nearby pattern should not look strongly anomalous.
        let result = detector.detect(&[11i8; PATTERN_DIM]);
        assert!(!result.is_anomaly || result.score < 50);

        // A far-away pattern should.
        let result = detector.detect(&[100i8; PATTERN_DIM]);
        assert!(result.is_anomaly || result.score > 50);
    }
}
|
||||
399
vendor/ruvector/examples/ruvLLM/esp32/src/ruvector/federated_search.rs
vendored
Normal file
399
vendor/ruvector/examples/ruvLLM/esp32/src/ruvector/federated_search.rs
vendored
Normal file
@@ -0,0 +1,399 @@
|
||||
//! Federated Vector Search - Distributed Similarity Search Across ESP32 Clusters
|
||||
//!
|
||||
//! Enables vector search across multiple ESP32 chips for:
|
||||
//! - Larger knowledge bases (1M+ vectors across cluster)
|
||||
//! - Faster search (parallel query execution)
|
||||
//! - Resilient systems (no single point of failure)
|
||||
//! - Distributed embeddings (each chip stores subset)
|
||||
//!
|
||||
//! # Architecture
|
||||
//!
|
||||
//! ```text
|
||||
//! ┌─────────────────────────────────────────────────────────────────────────────┐
|
||||
//! │ FEDERATED VECTOR SEARCH │
|
||||
//! ├─────────────────────────────────────────────────────────────────────────────┤
|
||||
//! │ │
|
||||
//! │ Query: "What is machine learning?" │
|
||||
//! │ │ │
|
||||
//! │ ▼ │
|
||||
//! │ ┌─────────────────┐ │
|
||||
//! │ │ Coordinator │ ──▶ Broadcast query to all shards │
|
||||
//! │ │ (Chip 0) │ │
|
||||
//! │ └─────────────────┘ │
|
||||
//! │ │ │ │ │ │
|
||||
//! │ ▼ ▼ ▼ ▼ │
|
||||
//! │ ┌────┐ ┌────┐ ┌────┐ ┌────┐ │
|
||||
//! │ │ S1 │ │ S2 │ │ S3 │ │ S4 │ ◀── Each shard searches locally │
|
||||
//! │ └────┘ └────┘ └────┘ └────┘ │
|
||||
//! │ │ │ │ │ │
|
||||
//! │ └──────┴──────┴──────┘ │
|
||||
//! │ │ │
|
||||
//! │ ▼ │
|
||||
//! │ ┌─────────────────┐ │
|
||||
//! │ │ Merge Results │ ──▶ Return top-k globally │
|
||||
//! │ └─────────────────┘ │
|
||||
//! │ │
|
||||
//! └─────────────────────────────────────────────────────────────────────────────┘
|
||||
//! ```
|
||||
|
||||
use heapless::Vec as HVec;
|
||||
use super::{MicroHNSW, HNSWConfig, SearchResult, MicroVector, DistanceMetric, MAX_VECTORS};
|
||||
|
||||
/// Maximum shards in federation
|
||||
pub const MAX_SHARDS: usize = 16;
|
||||
/// Local shard capacity
|
||||
pub const SHARD_CAPACITY: usize = 256;
|
||||
/// Shard embedding dimension
|
||||
pub const SHARD_DIM: usize = 32;
|
||||
|
||||
/// Shard configuration
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ShardConfig {
|
||||
/// Shard ID (0-indexed)
|
||||
pub shard_id: u8,
|
||||
/// Total shards in federation
|
||||
pub total_shards: u8,
|
||||
/// This chip's role
|
||||
pub role: ShardRole,
|
||||
/// Replication factor (1 = no replication)
|
||||
pub replication: u8,
|
||||
}
|
||||
|
||||
/// Role of this chip in the federation
|
||||
#[derive(Debug, Clone, Copy, PartialEq)]
|
||||
pub enum ShardRole {
|
||||
/// Coordinator: receives queries, distributes, merges
|
||||
Coordinator,
|
||||
/// Worker: stores vectors, processes local queries
|
||||
Worker,
|
||||
/// Hybrid: both coordinator and worker
|
||||
Hybrid,
|
||||
}
|
||||
|
||||
/// Query message between chips
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ShardQuery {
|
||||
/// Query ID for tracking
|
||||
pub query_id: u32,
|
||||
/// Query embedding
|
||||
pub embedding: HVec<i8, SHARD_DIM>,
|
||||
/// Number of results requested per shard
|
||||
pub k: u8,
|
||||
/// Source chip ID
|
||||
pub source: u8,
|
||||
}
|
||||
|
||||
/// Response from a shard
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ShardResponse {
|
||||
/// Query ID this responds to
|
||||
pub query_id: u32,
|
||||
/// Shard that processed the query
|
||||
pub shard_id: u8,
|
||||
/// Results from this shard
|
||||
pub results: HVec<ShardResult, 16>,
|
||||
/// Processing time in microseconds
|
||||
pub latency_us: u32,
|
||||
}
|
||||
|
||||
/// Single result from a shard
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct ShardResult {
|
||||
/// Vector ID
|
||||
pub id: u32,
|
||||
/// Distance
|
||||
pub distance: i32,
|
||||
/// Shard ID where vector lives
|
||||
pub shard_id: u8,
|
||||
}
|
||||
|
||||
/// Federated Index (local view)
|
||||
pub struct FederatedIndex {
|
||||
/// Configuration
|
||||
config: ShardConfig,
|
||||
/// Local HNSW index
|
||||
local_index: MicroHNSW<SHARD_DIM, SHARD_CAPACITY>,
|
||||
/// Pending queries (for coordinator)
|
||||
pending_queries: HVec<(u32, u8), 16>, // (query_id, responses_received)
|
||||
/// Collected results (for merging)
|
||||
collected_results: HVec<ShardResult, 64>,
|
||||
/// Next query ID
|
||||
next_query_id: u32,
|
||||
/// Statistics
|
||||
local_query_count: u32,
|
||||
federated_query_count: u32,
|
||||
}
|
||||
|
||||
impl FederatedIndex {
|
||||
/// Create new federated index
|
||||
pub fn new(config: ShardConfig) -> Self {
|
||||
let hnsw_config = HNSWConfig {
|
||||
m: 6,
|
||||
m_max0: 12,
|
||||
ef_construction: 24,
|
||||
ef_search: 16,
|
||||
metric: DistanceMetric::Euclidean,
|
||||
binary_mode: false,
|
||||
};
|
||||
|
||||
Self {
|
||||
config,
|
||||
local_index: MicroHNSW::new(hnsw_config),
|
||||
pending_queries: HVec::new(),
|
||||
collected_results: HVec::new(),
|
||||
next_query_id: 0,
|
||||
local_query_count: 0,
|
||||
federated_query_count: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Insert vector into local shard
|
||||
pub fn insert(&mut self, vector: &MicroVector<SHARD_DIM>) -> Result<usize, &'static str> {
|
||||
// Check if this vector belongs to this shard (hash-based sharding)
|
||||
let shard_for_id = (vector.id as usize) % (self.config.total_shards as usize);
|
||||
|
||||
if shard_for_id != self.config.shard_id as usize {
|
||||
return Err("Vector belongs to different shard");
|
||||
}
|
||||
|
||||
self.local_index.insert(vector)
|
||||
}
|
||||
|
||||
/// Insert vector regardless of sharding (for local-only mode)
|
||||
pub fn insert_local(&mut self, vector: &MicroVector<SHARD_DIM>) -> Result<usize, &'static str> {
|
||||
self.local_index.insert(vector)
|
||||
}
|
||||
|
||||
/// Number of vectors in local shard
|
||||
pub fn local_count(&self) -> usize {
|
||||
self.local_index.len()
|
||||
}
|
||||
|
||||
/// Estimated total vectors across federation
|
||||
pub fn estimated_total(&self) -> usize {
|
||||
self.local_index.len() * self.config.total_shards as usize
|
||||
}
|
||||
|
||||
/// Local search only
|
||||
pub fn search_local(&mut self, query: &[i8], k: usize) -> HVec<SearchResult, 32> {
|
||||
self.local_query_count += 1;
|
||||
self.local_index.search(query, k)
|
||||
}
|
||||
|
||||
/// Create a federated query (for coordinator)
|
||||
pub fn create_query(&mut self, embedding: &[i8], k: u8) -> ShardQuery {
|
||||
let query_id = self.next_query_id;
|
||||
self.next_query_id += 1;
|
||||
self.federated_query_count += 1;
|
||||
|
||||
// Track pending query
|
||||
let _ = self.pending_queries.push((query_id, 0));
|
||||
|
||||
let mut embed = HVec::new();
|
||||
for &v in embedding.iter().take(SHARD_DIM) {
|
||||
let _ = embed.push(v);
|
||||
}
|
||||
|
||||
ShardQuery {
|
||||
query_id,
|
||||
embedding: embed,
|
||||
k,
|
||||
source: self.config.shard_id,
|
||||
}
|
||||
}
|
||||
|
||||
/// Process incoming query (for workers)
|
||||
pub fn process_query(&mut self, query: &ShardQuery) -> ShardResponse {
|
||||
let start = 0u32; // Would use actual timer on ESP32
|
||||
|
||||
let local_results = self.local_index.search(&query.embedding, query.k as usize);
|
||||
|
||||
let mut results = HVec::new();
|
||||
for r in local_results.iter() {
|
||||
let _ = results.push(ShardResult {
|
||||
id: r.id,
|
||||
distance: r.distance,
|
||||
shard_id: self.config.shard_id,
|
||||
});
|
||||
}
|
||||
|
||||
let latency = 100u32; // Simulated
|
||||
|
||||
ShardResponse {
|
||||
query_id: query.query_id,
|
||||
shard_id: self.config.shard_id,
|
||||
results,
|
||||
latency_us: latency,
|
||||
}
|
||||
}
|
||||
|
||||
/// Collect response from shard (for coordinator)
|
||||
pub fn collect_response(&mut self, response: ShardResponse) {
|
||||
// Add results to collected
|
||||
for r in response.results.iter() {
|
||||
let _ = self.collected_results.push(*r);
|
||||
}
|
||||
|
||||
// Update pending query
|
||||
for (qid, count) in self.pending_queries.iter_mut() {
|
||||
if *qid == response.query_id {
|
||||
*count += 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Check if all responses received
|
||||
pub fn is_query_complete(&self, query_id: u32) -> bool {
|
||||
for (qid, count) in self.pending_queries.iter() {
|
||||
if *qid == query_id {
|
||||
return *count >= self.config.total_shards;
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
/// Merge and return final results
|
||||
pub fn merge_results(&mut self, query_id: u32, k: usize) -> HVec<ShardResult, 32> {
|
||||
// Sort by distance
|
||||
self.collected_results.sort_by_key(|r| r.distance);
|
||||
|
||||
// Take top k
|
||||
let mut final_results = HVec::new();
|
||||
for r in self.collected_results.iter().take(k) {
|
||||
let _ = final_results.push(*r);
|
||||
}
|
||||
|
||||
// Clean up
|
||||
self.collected_results.clear();
|
||||
self.pending_queries.retain(|(qid, _)| *qid != query_id);
|
||||
|
||||
final_results
|
||||
}
|
||||
|
||||
/// Get shard ID for a vector ID
|
||||
pub fn shard_for_id(vector_id: u32, total_shards: u8) -> u8 {
|
||||
(vector_id % total_shards as u32) as u8
|
||||
}
|
||||
|
||||
/// Get configuration
|
||||
pub fn config(&self) -> &ShardConfig {
|
||||
&self.config
|
||||
}
|
||||
|
||||
/// Get statistics
|
||||
pub fn stats(&self) -> (u32, u32) {
|
||||
(self.local_query_count, self.federated_query_count)
|
||||
}
|
||||
}
|
||||
|
||||
/// Swarm Vector Store - Shared vector memory across swarm
|
||||
pub struct SwarmVectorStore {
|
||||
/// Local shard
|
||||
shard: FederatedIndex,
|
||||
/// Peer chip IDs
|
||||
peers: HVec<u8, MAX_SHARDS>,
|
||||
/// Shared knowledge count per peer
|
||||
peer_counts: HVec<u32, MAX_SHARDS>,
|
||||
}
|
||||
|
||||
impl SwarmVectorStore {
|
||||
/// Create swarm vector store
|
||||
pub fn new(chip_id: u8, total_chips: u8) -> Self {
|
||||
let config = ShardConfig {
|
||||
shard_id: chip_id,
|
||||
total_shards: total_chips,
|
||||
role: if chip_id == 0 { ShardRole::Hybrid } else { ShardRole::Worker },
|
||||
replication: 1,
|
||||
};
|
||||
|
||||
let mut peers = HVec::new();
|
||||
let mut peer_counts = HVec::new();
|
||||
for i in 0..total_chips {
|
||||
if i != chip_id {
|
||||
let _ = peers.push(i);
|
||||
let _ = peer_counts.push(0);
|
||||
}
|
||||
}
|
||||
|
||||
Self {
|
||||
shard: FederatedIndex::new(config),
|
||||
peers,
|
||||
peer_counts,
|
||||
}
|
||||
}
|
||||
|
||||
/// Store shared knowledge
|
||||
pub fn share_knowledge(&mut self, embedding: &[i8], id: u32) -> Result<(), &'static str> {
|
||||
let mut vec_data = HVec::new();
|
||||
for &v in embedding.iter().take(SHARD_DIM) {
|
||||
vec_data.push(v).map_err(|_| "Overflow")?;
|
||||
}
|
||||
|
||||
let vec = MicroVector { data: vec_data, id };
|
||||
self.shard.insert_local(&vec)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Query swarm knowledge
|
||||
pub fn query_swarm(&mut self, embedding: &[i8], k: usize) -> HVec<SearchResult, 32> {
|
||||
// For now, just query local shard
|
||||
// In real implementation, would broadcast to peers
|
||||
self.shard.search_local(embedding, k)
|
||||
}
|
||||
|
||||
/// Sync with peer (called when communication received)
|
||||
pub fn sync_peer(&mut self, peer_id: u8, vectors: &[(u32, HVec<i8, SHARD_DIM>)]) {
|
||||
for (id, embedding) in vectors {
|
||||
let vec = MicroVector { data: embedding.clone(), id: *id };
|
||||
let _ = self.shard.insert_local(&vec);
|
||||
}
|
||||
|
||||
// Update peer count
|
||||
if let Some(pos) = self.peers.iter().position(|&p| p == peer_id) {
|
||||
if pos < self.peer_counts.len() {
|
||||
self.peer_counts[pos] += vectors.len() as u32;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Inserting only IDs that hash to shard 0 must succeed and populate
    /// the local index.
    #[test]
    fn test_federated_index() {
        let config = ShardConfig {
            shard_id: 0,
            total_shards: 4,
            role: ShardRole::Hybrid,
            replication: 1,
        };

        let mut index = FederatedIndex::new(config);

        // Insert vectors that hash to this shard
        for i in (0..20).step_by(4) { // IDs 0, 4, 8, 12, 16 belong to shard 0
            let data: HVec<i8, SHARD_DIM> = (0..SHARD_DIM).map(|j| ((i + j) % 100) as i8).collect();
            let vec = MicroVector { data, id: i as u32 };
            index.insert(&vec).unwrap();
        }

        assert!(index.local_count() > 0);
    }

    /// Sharing knowledge then querying locally should return matches.
    #[test]
    fn test_swarm_store() {
        let mut store = SwarmVectorStore::new(0, 4);

        for i in 0..10 {
            let embedding = [(i * 10) as i8; SHARD_DIM];
            store.share_knowledge(&embedding, i).unwrap();
        }

        let query = [25i8; SHARD_DIM];
        let results = store.query_swarm(&query, 3);
        assert!(!results.is_empty());
    }
}
|
||||
266
vendor/ruvector/examples/ruvLLM/esp32/src/ruvector/hyperbolic.rs
vendored
Normal file
266
vendor/ruvector/examples/ruvLLM/esp32/src/ruvector/hyperbolic.rs
vendored
Normal file
@@ -0,0 +1,266 @@
|
||||
//! Hyperbolic Embeddings for RuvLLM ESP32
|
||||
//!
|
||||
//! Implements hyperbolic geometry distance metrics optimized for microcontrollers.
|
||||
//! Hyperbolic spaces are ideal for hierarchical data (taxonomies, knowledge graphs)
|
||||
//! as they naturally represent tree-like structures with exponentially growing space.
|
||||
//!
|
||||
//! # Models
|
||||
//!
|
||||
//! ## Poincaré Ball Model
|
||||
//! - Points in unit ball: ||x|| < 1
|
||||
//! - Conformal (preserves angles)
|
||||
//! - Distance: d(x,y) = arcosh(1 + 2||x-y||² / ((1-||x||²)(1-||y||²)))
|
||||
//!
|
||||
//! ## Lorentz (Hyperboloid) Model
|
||||
//! - Points on hyperboloid: -x₀² + x₁² + ... + xₙ² = -1, x₀ > 0
|
||||
//! - More numerically stable
|
||||
//! - Distance: d(x,y) = arcosh(-⟨x,y⟩_L)
|
||||
|
||||
use heapless::Vec as HVec;
|
||||
use libm::{acoshf, sqrtf};
|
||||
|
||||
/// Scale factor for INT8 to float conversion
|
||||
const POINCARE_SCALE: f32 = 127.0 / 0.787;
|
||||
|
||||
/// Default curvature of hyperbolic space
|
||||
const DEFAULT_CURVATURE: f32 = -1.0;
|
||||
|
||||
/// Hyperbolic embedding configuration
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct HyperbolicConfig {
|
||||
/// Curvature of the hyperbolic space (negative value)
|
||||
pub curvature: f32,
|
||||
/// Dimension of the embedding
|
||||
pub dim: usize,
|
||||
/// Epsilon for numerical stability
|
||||
pub eps: f32,
|
||||
}
|
||||
|
||||
impl Default for HyperbolicConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
curvature: DEFAULT_CURVATURE,
|
||||
dim: 32,
|
||||
eps: 1e-5,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Poincaré distance between two INT8 vectors
|
||||
pub fn poincare_distance_i8(a: &[i8], b: &[i8]) -> i32 {
|
||||
let c = 1.0; // |curvature|
|
||||
let scale = 1.0 / POINCARE_SCALE;
|
||||
|
||||
let mut norm_a_sq: f32 = 0.0;
|
||||
let mut norm_b_sq: f32 = 0.0;
|
||||
let mut diff_sq: f32 = 0.0;
|
||||
|
||||
for (x, y) in a.iter().zip(b.iter()) {
|
||||
let xf = (*x as f32) * scale;
|
||||
let yf = (*y as f32) * scale;
|
||||
norm_a_sq += xf * xf;
|
||||
norm_b_sq += yf * yf;
|
||||
diff_sq += (xf - yf) * (xf - yf);
|
||||
}
|
||||
|
||||
// Clamp norms to stay inside ball
|
||||
let max_norm = 1.0 - 1e-5;
|
||||
norm_a_sq = norm_a_sq.min(max_norm * max_norm);
|
||||
norm_b_sq = norm_b_sq.min(max_norm * max_norm);
|
||||
|
||||
let numerator = 2.0 * c * diff_sq;
|
||||
let denom_a = 1.0 - c * norm_a_sq;
|
||||
let denom_b = 1.0 - c * norm_b_sq;
|
||||
let denominator = denom_a * denom_b;
|
||||
|
||||
if denominator < 1e-10 {
|
||||
return i32::MAX / 2;
|
||||
}
|
||||
|
||||
let arg = (1.0 + numerator / denominator).max(1.0);
|
||||
let dist = acoshf(arg);
|
||||
|
||||
(dist * 1000.0) as i32
|
||||
}
|
||||
|
||||
/// Lorentz distance from spatial coordinates
|
||||
pub fn lorentz_distance_spatial_i8(a: &[i8], b: &[i8]) -> i32 {
|
||||
let scale = 1.0 / POINCARE_SCALE;
|
||||
let k = 1.0; // 1/|c| for c = -1
|
||||
|
||||
let mut norm_a_sq: f32 = 0.0;
|
||||
let mut norm_b_sq: f32 = 0.0;
|
||||
let mut spatial_dot: f32 = 0.0;
|
||||
|
||||
for (x, y) in a.iter().zip(b.iter()) {
|
||||
let xf = (*x as f32) * scale;
|
||||
let yf = (*y as f32) * scale;
|
||||
norm_a_sq += xf * xf;
|
||||
norm_b_sq += yf * yf;
|
||||
spatial_dot += xf * yf;
|
||||
}
|
||||
|
||||
// Compute timelike components: x₀ = √(k + ||x||²)
|
||||
let t_a = sqrtf(k + norm_a_sq);
|
||||
let t_b = sqrtf(k + norm_b_sq);
|
||||
|
||||
// Lorentz inner product: -t_a*t_b + spatial_dot
|
||||
let inner = -t_a * t_b + spatial_dot;
|
||||
let arg = (-inner).max(1.0);
|
||||
let dist = acoshf(arg);
|
||||
|
||||
(dist * 1000.0) as i32
|
||||
}
|
||||
|
||||
/// Convert Euclidean INT8 vector to Poincaré ball
|
||||
pub fn to_poincare_i8(euclidean: &[i8]) -> HVec<i8, 64> {
|
||||
let mut result: HVec<i8, 64> = HVec::new();
|
||||
|
||||
let mut norm_sq: f32 = 0.0;
|
||||
for x in euclidean {
|
||||
let xf = *x as f32;
|
||||
norm_sq += xf * xf;
|
||||
}
|
||||
let norm = sqrtf(norm_sq);
|
||||
|
||||
if norm < 1e-6 {
|
||||
for _ in 0..euclidean.len() {
|
||||
let _ = result.push(0);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
let scale = (norm / (2.0 * POINCARE_SCALE)).tanh() * POINCARE_SCALE / norm;
|
||||
|
||||
for x in euclidean {
|
||||
let mapped = ((*x as f32) * scale).clamp(-127.0, 127.0) as i8;
|
||||
let _ = result.push(mapped);
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
/// Convert Euclidean INT8 vector to Lorentz hyperboloid
///
/// Prepends the timelike coordinate x₀ = √(1 + ||x||²) (computed in the
/// de-quantized float domain) and copies the spatial components through
/// unchanged, so the output has `spatial.len() + 1` entries (max 65).
pub fn to_lorentz_i8(spatial: &[i8]) -> HVec<i8, 65> {
    let mut result: HVec<i8, 65> = HVec::new();
    let scale = 1.0 / POINCARE_SCALE;

    // Squared Euclidean norm of the de-quantized spatial part.
    let mut norm_sq: f32 = 0.0;
    for x in spatial {
        let xf = (*x as f32) * scale;
        norm_sq += xf * xf;
    }

    // Timelike component on the hyperboloid: x₀ = √(k + ||x||²), k = 1.
    let t = sqrtf(1.0 + norm_sq);
    // NOTE(review): t >= 1 always, so t * 127 >= 127 and the clamp
    // saturates the stored timelike byte at 127 for every input — confirm
    // this lossy encoding is intended.
    let t_scaled = (t * 127.0).clamp(-127.0, 127.0) as i8;
    let _ = result.push(t_scaled);

    // Spatial components carry over in their original quantization.
    for x in spatial {
        let _ = result.push(*x);
    }

    result
}
|
||||
|
||||
/// Hyperbolic midpoint between two points (Poincaré ball)
|
||||
pub fn hyperbolic_midpoint(a: &[i8], b: &[i8]) -> HVec<i8, 64> {
|
||||
let scale = 1.0 / POINCARE_SCALE;
|
||||
let mut result: HVec<i8, 64> = HVec::new();
|
||||
|
||||
// Simple approximation: weighted average scaled back
|
||||
for (x, y) in a.iter().zip(b.iter()) {
|
||||
let xf = (*x as f32) * scale;
|
||||
let yf = (*y as f32) * scale;
|
||||
let mid = (xf + yf) * 0.5;
|
||||
let mapped = (mid * POINCARE_SCALE).clamp(-127.0, 127.0) as i8;
|
||||
let _ = result.push(mapped);
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_poincare_distance_zero() {
        let a = [0i8, 0, 0, 0];
        let b = [0i8, 0, 0, 0];
        let dist = poincare_distance_i8(&a, &b);
        assert!(dist < 10, "Distance at origin should be ~0, got {}", dist);
    }

    #[test]
    fn test_poincare_distance_symmetric() {
        let a = [10i8, 20, 30, 40];
        let b = [50i8, 60, 70, 80];
        let d1 = poincare_distance_i8(&a, &b);
        let d2 = poincare_distance_i8(&b, &a);
        assert_eq!(d1, d2, "Distance should be symmetric");
    }

    #[test]
    fn test_poincare_distance_triangle_inequality() {
        let a = [10i8, 0, 0, 0];
        let b = [0i8, 10, 0, 0];
        let c = [0i8, 0, 10, 0];
        let ab = poincare_distance_i8(&a, &b);
        let bc = poincare_distance_i8(&b, &c);
        let ac = poincare_distance_i8(&a, &c);
        // +1 slack absorbs fixed-point rounding of the ×1000 truncation.
        assert!(ac <= ab + bc + 1, "Triangle inequality violated");
    }

    #[test]
    fn test_lorentz_distance_spatial() {
        let a = [10i8, 20, 30];
        let b = [60i8, 70, 80];
        let dist = lorentz_distance_spatial_i8(&a, &b);
        assert!(dist >= 0, "Distance should be non-negative, got {}", dist);
        let zero_dist = lorentz_distance_spatial_i8(&a, &a);
        assert!(zero_dist < 10, "Same point distance should be ~0, got {}", zero_dist);
    }

    #[test]
    fn test_lorentz_distance_symmetric() {
        let a = [10i8, 20, 30];
        let b = [50i8, 60, 70];
        let d1 = lorentz_distance_spatial_i8(&a, &b);
        let d2 = lorentz_distance_spatial_i8(&b, &a);
        assert_eq!(d1, d2, "Lorentz distance should be symmetric");
    }

    #[test]
    fn test_to_poincare_origin() {
        let euclidean = [0i8, 0, 0, 0];
        let poincare = to_poincare_i8(&euclidean);
        for x in poincare.iter() {
            assert_eq!(*x, 0, "Origin should map to origin");
        }
    }

    #[test]
    fn test_to_lorentz() {
        let spatial = [50i8, 50, 50];
        let lorentz = to_lorentz_i8(&spatial);
        assert!(lorentz[0] > 0, "Timelike component should be positive");
        assert_eq!(lorentz.len(), spatial.len() + 1, "Should add timelike component");
    }

    #[test]
    fn test_hyperbolic_midpoint() {
        let a = [20i8, 0, 0, 0];
        let b = [-20i8, 0, 0, 0];
        let mid = hyperbolic_midpoint(&a, &b);
        let norm: i32 = mid.iter().map(|&x| (x as i32).abs()).sum();
        assert!(norm < 50, "Midpoint of symmetric points should be near origin");
    }

    #[test]
    fn test_boundary_behavior() {
        let center = [0i8, 0, 0, 0];
        let near_boundary = [120i8, 0, 0, 0];
        // BUGFIX: `&center` had been mojibake-corrupted to `¢er`
        // (an HTML `&cent;` entity), which does not compile.
        let dist = poincare_distance_i8(&center, &near_boundary);
        assert!(dist > 500, "Distance to boundary should be large");
    }
}
|
||||
446
vendor/ruvector/examples/ruvLLM/esp32/src/ruvector/micro_hnsw.rs
vendored
Normal file
446
vendor/ruvector/examples/ruvLLM/esp32/src/ruvector/micro_hnsw.rs
vendored
Normal file
@@ -0,0 +1,446 @@
|
||||
//! Micro HNSW - Approximate Nearest Neighbor for ESP32
|
||||
//!
|
||||
//! A minimal HNSW (Hierarchical Navigable Small World) implementation
|
||||
//! designed for ESP32's memory constraints.
|
||||
//!
|
||||
//! # Features
|
||||
//! - Fixed-size graph structure (no dynamic allocation)
|
||||
//! - INT8 quantized vectors
|
||||
//! - Binary quantization option (32x smaller)
|
||||
//! - O(log n) search complexity
|
||||
//!
|
||||
//! # Memory Usage
|
||||
//!
|
||||
//! For 64-dimensional INT8 vectors:
|
||||
//! - 100 vectors: ~8 KB
|
||||
//! - 500 vectors: ~40 KB
|
||||
//! - 1000 vectors (binary): ~10 KB
|
||||
|
||||
use heapless::Vec as HVec;
|
||||
use heapless::BinaryHeap;
|
||||
use heapless::binary_heap::Min;
|
||||
use super::{MicroVector, DistanceMetric, euclidean_distance_i8, MAX_NEIGHBORS};
|
||||
|
||||
/// Maximum vectors in the index
|
||||
pub const INDEX_CAPACITY: usize = 256;
|
||||
/// Maximum layers in HNSW
|
||||
pub const MAX_LAYERS: usize = 4;
|
||||
/// Default neighbors per layer
|
||||
pub const DEFAULT_M: usize = 8;
|
||||
/// Search expansion factor
|
||||
pub const EF_SEARCH: usize = 16;
|
||||
|
||||
/// HNSW Configuration
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct HNSWConfig {
|
||||
/// Max neighbors per node
|
||||
pub m: usize,
|
||||
/// Neighbors at layer 0 (usually 2*M)
|
||||
pub m_max0: usize,
|
||||
/// Construction expansion factor
|
||||
pub ef_construction: usize,
|
||||
/// Search expansion factor
|
||||
pub ef_search: usize,
|
||||
/// Distance metric
|
||||
pub metric: DistanceMetric,
|
||||
/// Enable binary quantization
|
||||
pub binary_mode: bool,
|
||||
}
|
||||
|
||||
impl Default for HNSWConfig {
    /// Balanced defaults for a few-hundred-vector index: M = 8 neighbors
    /// per node, doubled at layer 0, with modest build/search beam widths
    /// and plain Euclidean INT8 distance (no binary quantization).
    fn default() -> Self {
        Self {
            m: 8,
            m_max0: 16,
            ef_construction: 32,
            ef_search: 16,
            metric: DistanceMetric::Euclidean,
            binary_mode: false,
        }
    }
}
|
||||
|
||||
/// Search result
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct SearchResult {
|
||||
/// Vector ID
|
||||
pub id: u32,
|
||||
/// Distance to query
|
||||
pub distance: i32,
|
||||
/// Index in storage
|
||||
pub index: usize,
|
||||
}
|
||||
|
||||
// Equality and ordering for SearchResult consider ONLY `distance`, so
// results can be ranked in heaps and sorts; `id` and `index` are ignored
// on purpose (two results at the same distance compare equal).
impl PartialEq for SearchResult {
    fn eq(&self, other: &Self) -> bool {
        self.distance == other.distance
    }
}

impl Eq for SearchResult {}

impl PartialOrd for SearchResult {
    // Delegates to the total order below (distances are plain i32s).
    fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for SearchResult {
    // Ascending by distance: smaller distance = better match = sorts first.
    fn cmp(&self, other: &Self) -> core::cmp::Ordering {
        self.distance.cmp(&other.distance)
    }
}
|
||||
|
||||
/// Node in the HNSW graph
|
||||
#[derive(Debug, Clone)]
|
||||
struct HNSWNode<const DIM: usize> {
|
||||
/// Vector data
|
||||
vector: HVec<i8, DIM>,
|
||||
/// User ID
|
||||
id: u32,
|
||||
/// Neighbors per layer [layer][neighbor_indices]
|
||||
neighbors: [HVec<u16, MAX_NEIGHBORS>; MAX_LAYERS],
|
||||
/// Maximum layer this node exists on
|
||||
max_layer: u8,
|
||||
}
|
||||
|
||||
impl<const DIM: usize> Default for HNSWNode<DIM> {
    /// Empty placeholder node: no vector data, id 0, empty neighbor lists
    /// on every layer, and max_layer 0.
    fn default() -> Self {
        Self {
            vector: HVec::new(),
            id: 0,
            neighbors: Default::default(),
            max_layer: 0,
        }
    }
}
|
||||
|
||||
/// Micro HNSW Index
|
||||
pub struct MicroHNSW<const DIM: usize, const CAPACITY: usize> {
|
||||
/// Configuration
|
||||
config: HNSWConfig,
|
||||
/// Stored nodes
|
||||
nodes: HVec<HNSWNode<DIM>, CAPACITY>,
|
||||
/// Entry point (highest layer node)
|
||||
entry_point: Option<usize>,
|
||||
/// Current maximum layer
|
||||
max_layer: u8,
|
||||
/// Random seed for layer selection
|
||||
rng_state: u32,
|
||||
}
|
||||
|
||||
impl<const DIM: usize, const CAPACITY: usize> MicroHNSW<DIM, CAPACITY> {
|
||||
/// Create new HNSW index
|
||||
pub fn new(config: HNSWConfig) -> Self {
|
||||
Self {
|
||||
config,
|
||||
nodes: HVec::new(),
|
||||
entry_point: None,
|
||||
max_layer: 0,
|
||||
rng_state: 12345, // Default seed
|
||||
}
|
||||
}
|
||||
|
||||
/// Set random seed
|
||||
pub fn with_seed(mut self, seed: u32) -> Self {
|
||||
self.rng_state = seed;
|
||||
self
|
||||
}
|
||||
|
||||
/// Number of vectors in index
|
||||
pub fn len(&self) -> usize {
|
||||
self.nodes.len()
|
||||
}
|
||||
|
||||
/// Check if empty
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.nodes.is_empty()
|
||||
}
|
||||
|
||||
/// Memory usage in bytes
|
||||
pub fn memory_bytes(&self) -> usize {
|
||||
// Approximate: vectors + neighbor lists
|
||||
self.nodes.len() * (DIM + MAX_LAYERS * MAX_NEIGHBORS * 2 + 8)
|
||||
}
|
||||
|
||||
/// Insert a vector
|
||||
pub fn insert(&mut self, vector: &MicroVector<DIM>) -> Result<usize, &'static str> {
|
||||
if self.nodes.len() >= CAPACITY {
|
||||
return Err("Index full");
|
||||
}
|
||||
|
||||
let new_idx = self.nodes.len();
|
||||
let new_layer = self.random_layer();
|
||||
|
||||
// Create node
|
||||
let mut node = HNSWNode::<DIM>::default();
|
||||
node.vector = vector.data.clone();
|
||||
node.id = vector.id;
|
||||
node.max_layer = new_layer;
|
||||
|
||||
// First node is simple
|
||||
if self.entry_point.is_none() {
|
||||
self.nodes.push(node).map_err(|_| "Push failed")?;
|
||||
self.entry_point = Some(new_idx);
|
||||
self.max_layer = new_layer;
|
||||
return Ok(new_idx);
|
||||
}
|
||||
|
||||
let entry = self.entry_point.unwrap();
|
||||
|
||||
// Add node first so we can reference it
|
||||
self.nodes.push(node).map_err(|_| "Push failed")?;
|
||||
|
||||
// Search for neighbors from top layer down
|
||||
let mut current = entry;
|
||||
|
||||
// Traverse upper layers
|
||||
for layer in (new_layer as usize + 1..=self.max_layer as usize).rev() {
|
||||
current = self.greedy_search_layer(current, &vector.data, layer);
|
||||
}
|
||||
|
||||
// Insert at each layer
|
||||
for layer in (0..=(new_layer as usize).min(self.max_layer as usize)).rev() {
|
||||
let neighbors = self.search_layer(current, &vector.data, layer, self.config.ef_construction);
|
||||
|
||||
// Connect to best neighbors
|
||||
let max_neighbors = if layer == 0 { self.config.m_max0 } else { self.config.m };
|
||||
let mut added = 0;
|
||||
|
||||
for result in neighbors.iter().take(max_neighbors) {
|
||||
if added >= MAX_NEIGHBORS {
|
||||
break;
|
||||
}
|
||||
|
||||
// Add bidirectional connection
|
||||
if let Some(new_node) = self.nodes.get_mut(new_idx) {
|
||||
let _ = new_node.neighbors[layer].push(result.index as u16);
|
||||
}
|
||||
|
||||
if let Some(neighbor_node) = self.nodes.get_mut(result.index) {
|
||||
if neighbor_node.neighbors[layer].len() < MAX_NEIGHBORS {
|
||||
let _ = neighbor_node.neighbors[layer].push(new_idx as u16);
|
||||
}
|
||||
}
|
||||
|
||||
added += 1;
|
||||
}
|
||||
|
||||
if !neighbors.is_empty() {
|
||||
current = neighbors[0].index;
|
||||
}
|
||||
}
|
||||
|
||||
// Update entry point if new node has higher layer
|
||||
if new_layer > self.max_layer {
|
||||
self.entry_point = Some(new_idx);
|
||||
self.max_layer = new_layer;
|
||||
}
|
||||
|
||||
Ok(new_idx)
|
||||
}
|
||||
|
||||
/// Search for k nearest neighbors
|
||||
pub fn search(&self, query: &[i8], k: usize) -> HVec<SearchResult, 32> {
|
||||
let mut results = HVec::new();
|
||||
|
||||
if self.entry_point.is_none() || k == 0 {
|
||||
return results;
|
||||
}
|
||||
|
||||
let entry = self.entry_point.unwrap();
|
||||
|
||||
// Traverse from top layer
|
||||
let mut current = entry;
|
||||
for layer in (1..=self.max_layer as usize).rev() {
|
||||
current = self.greedy_search_layer(current, query, layer);
|
||||
}
|
||||
|
||||
// Search layer 0 with ef expansion
|
||||
let candidates = self.search_layer(current, query, 0, self.config.ef_search);
|
||||
|
||||
// Return top k
|
||||
for result in candidates.into_iter().take(k) {
|
||||
let _ = results.push(result);
|
||||
}
|
||||
|
||||
results
|
||||
}
|
||||
|
||||
/// Search specific layer
|
||||
fn search_layer(&self, entry: usize, query: &[i8], layer: usize, ef: usize) -> HVec<SearchResult, 64> {
|
||||
let mut visited = [false; CAPACITY];
|
||||
let mut candidates: BinaryHeap<SearchResult, Min, 64> = BinaryHeap::new();
|
||||
let mut results: HVec<SearchResult, 64> = HVec::new();
|
||||
|
||||
visited[entry] = true;
|
||||
let entry_dist = self.distance(query, entry);
|
||||
|
||||
let _ = candidates.push(SearchResult {
|
||||
id: self.nodes[entry].id,
|
||||
distance: entry_dist,
|
||||
index: entry,
|
||||
});
|
||||
let _ = results.push(SearchResult {
|
||||
id: self.nodes[entry].id,
|
||||
distance: entry_dist,
|
||||
index: entry,
|
||||
});
|
||||
|
||||
while let Some(current) = candidates.pop() {
|
||||
// Early termination
|
||||
if results.len() >= ef {
|
||||
if let Some(worst) = results.iter().max_by_key(|r| r.distance) {
|
||||
if current.distance > worst.distance {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Explore neighbors
|
||||
if let Some(node) = self.nodes.get(current.index) {
|
||||
if layer < node.neighbors.len() {
|
||||
for &neighbor_idx in node.neighbors[layer].iter() {
|
||||
let neighbor_idx = neighbor_idx as usize;
|
||||
if neighbor_idx < CAPACITY && !visited[neighbor_idx] {
|
||||
visited[neighbor_idx] = true;
|
||||
|
||||
let dist = self.distance(query, neighbor_idx);
|
||||
|
||||
// Add if better than worst in results
|
||||
let should_add = results.len() < ef ||
|
||||
results.iter().any(|r| dist < r.distance);
|
||||
|
||||
if should_add {
|
||||
let result = SearchResult {
|
||||
id: self.nodes[neighbor_idx].id,
|
||||
distance: dist,
|
||||
index: neighbor_idx,
|
||||
};
|
||||
let _ = candidates.push(result);
|
||||
let _ = results.push(result);
|
||||
|
||||
// Keep results bounded
|
||||
if results.len() > ef * 2 {
|
||||
results.sort_by_key(|r| r.distance);
|
||||
results.truncate(ef);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Sort and truncate
|
||||
results.sort_by_key(|r| r.distance);
|
||||
results
|
||||
}
|
||||
|
||||
/// Greedy search on a single layer
|
||||
fn greedy_search_layer(&self, entry: usize, query: &[i8], layer: usize) -> usize {
|
||||
let mut current = entry;
|
||||
let mut current_dist = self.distance(query, current);
|
||||
|
||||
loop {
|
||||
let mut improved = false;
|
||||
|
||||
if let Some(node) = self.nodes.get(current) {
|
||||
if layer < node.neighbors.len() {
|
||||
for &neighbor_idx in node.neighbors[layer].iter() {
|
||||
let neighbor_idx = neighbor_idx as usize;
|
||||
if neighbor_idx < self.nodes.len() {
|
||||
let dist = self.distance(query, neighbor_idx);
|
||||
if dist < current_dist {
|
||||
current = neighbor_idx;
|
||||
current_dist = dist;
|
||||
improved = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !improved {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
current
|
||||
}
|
||||
|
||||
/// Calculate distance between query and stored vector
|
||||
fn distance(&self, query: &[i8], idx: usize) -> i32 {
|
||||
if let Some(node) = self.nodes.get(idx) {
|
||||
self.config.metric.distance(query, &node.vector)
|
||||
} else {
|
||||
i32::MAX
|
||||
}
|
||||
}
|
||||
|
||||
/// Generate random layer (exponential distribution)
///
/// Draws from the classic `glibc`-style LCG (a = 1103515245, c = 12345)
/// and maps the draw to a layer via its leading-zero count: each extra
/// leading zero halves the probability, and dividing by 4 flattens the
/// geometric tail so high layers stay rare. The result is clamped so it
/// never exceeds `MAX_LAYERS - 1`.
fn random_layer(&mut self) -> u8 {
    // Simple LCG random; wrapping ops keep it overflow-safe.
    self.rng_state = self.rng_state.wrapping_mul(1103515245).wrapping_add(12345);
    let rand = self.rng_state;

    // Count leading zeros gives exponential distribution
    let layer = (rand.leading_zeros() / 4) as u8;
    layer.min(MAX_LAYERS as u8 - 1)
}
|
||||
|
||||
/// Get vector by index
|
||||
pub fn get(&self, idx: usize) -> Option<&[i8]> {
|
||||
self.nodes.get(idx).map(|n| n.vector.as_slice())
|
||||
}
|
||||
|
||||
/// Get ID by index
|
||||
pub fn get_id(&self, idx: usize) -> Option<u32> {
|
||||
self.nodes.get(idx).map(|n| n.id)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Inserting ten distinct vectors should grow the index to ten nodes.
    #[test]
    fn test_hnsw_basic() {
        let mut index: MicroHNSW<8, 100> = MicroHNSW::new(HNSWConfig::default());

        // Insert vectors with deterministic, pairwise-distinct components.
        for i in 0..10 {
            let data: HVec<i8, 8> = (0..8).map(|j| (i * 10 + j) as i8).collect();
            let vec = MicroVector { data, id: i as u32 };
            index.insert(&vec).unwrap();
        }

        assert_eq!(index.len(), 10);
    }

    /// A query identical to an indexed vector should rank that vector first.
    #[test]
    fn test_hnsw_search() {
        let mut index: MicroHNSW<4, 100> = MicroHNSW::new(HNSWConfig::default());

        // Insert specific vectors: three near-orthogonal ones plus a
        // near-duplicate of the first.
        let vectors = [
            [10i8, 0, 0, 0],
            [0i8, 10, 0, 0],
            [0i8, 0, 10, 0],
            [11i8, 1, 0, 0], // Close to first
        ];

        for (i, v) in vectors.iter().enumerate() {
            let data: HVec<i8, 4> = v.iter().copied().collect();
            let vec = MicroVector { data, id: i as u32 };
            index.insert(&vec).unwrap();
        }

        // Search for vector close to first
        let query = [10i8, 0, 0, 0];
        let results = index.search(&query, 2);

        assert!(!results.is_empty());
        assert_eq!(results[0].id, 0); // Exact match should be first
    }
}
|
||||
229
vendor/ruvector/examples/ruvLLM/esp32/src/ruvector/mod.rs
vendored
Normal file
229
vendor/ruvector/examples/ruvLLM/esp32/src/ruvector/mod.rs
vendored
Normal file
@@ -0,0 +1,229 @@
|
||||
//! RuVector Integration for ESP32
|
||||
//!
|
||||
//! Brings vector database capabilities to microcontrollers:
|
||||
//! - Micro HNSW index for similarity search
|
||||
//! - Semantic memory for context-aware AI
|
||||
//! - RAG (Retrieval-Augmented Generation)
|
||||
//! - Anomaly detection via embedding distance
|
||||
//! - Federated vector search across chip clusters
|
||||
//!
|
||||
//! # Memory Budget
|
||||
//!
|
||||
//! | Component | Size | Vectors |
|
||||
//! |-----------|------|---------|
|
||||
//! | Micro HNSW (64-dim, 100 vectors) | ~8 KB | 100 |
|
||||
//! | Binary HNSW (64-dim, 1000 vectors) | ~10 KB | 1000 |
|
||||
//! | Semantic Memory (50 memories) | ~4 KB | 50 |
|
||||
//! | RAG Context Cache (10 docs) | ~2 KB | 10 |
|
||||
//!
|
||||
//! # Capabilities from RuVector
|
||||
//!
|
||||
//! - HNSW approximate nearest neighbor (adapted for fixed memory)
|
||||
//! - Binary quantization (32x compression)
|
||||
//! - Product quantization (8-64x compression)
|
||||
//! - Cosine/Euclidean/Hamming distance
|
||||
//! - Self-learning pattern recognition
|
||||
|
||||
pub mod micro_hnsw;
|
||||
pub mod semantic_memory;
|
||||
pub mod rag;
|
||||
pub mod anomaly;
|
||||
pub mod federated_search;
|
||||
|
||||
// Re-exports
|
||||
pub use micro_hnsw::{MicroHNSW, HNSWConfig, SearchResult};
|
||||
pub use semantic_memory::{SemanticMemory, Memory, MemoryType};
|
||||
pub use rag::{MicroRAG, RAGConfig, RAGResult};
|
||||
pub use anomaly::{AnomalyDetector, AnomalyConfig, AnomalyResult};
|
||||
pub use federated_search::{FederatedIndex, ShardConfig};
|
||||
|
||||
use heapless::Vec as HVec;
|
||||
|
||||
/// Maximum dimensions for vectors on ESP32
|
||||
pub const MAX_DIMENSIONS: usize = 128;
|
||||
/// Maximum vectors in a single index
|
||||
pub const MAX_VECTORS: usize = 1000;
|
||||
/// Maximum neighbors per node in HNSW
|
||||
pub const MAX_NEIGHBORS: usize = 16;
|
||||
|
||||
/// Quantized vector type for ESP32
///
/// A fixed-capacity (`DIM`) INT8 vector plus a caller-supplied id used
/// to correlate index hits with external records. The populated length
/// may be shorter than `DIM` (see `dim()`).
#[derive(Debug, Clone)]
pub struct MicroVector<const DIM: usize> {
    /// INT8 quantized components
    pub data: HVec<i8, DIM>,
    /// Optional metadata ID
    pub id: u32,
}
|
||||
|
||||
impl<const DIM: usize> MicroVector<DIM> {
|
||||
/// Create from i8 slice
|
||||
pub fn from_i8(data: &[i8], id: u32) -> Option<Self> {
|
||||
if data.len() > DIM {
|
||||
return None;
|
||||
}
|
||||
let mut vec = HVec::new();
|
||||
for &v in data {
|
||||
vec.push(v).ok()?;
|
||||
}
|
||||
Some(Self { data: vec, id })
|
||||
}
|
||||
|
||||
/// Create from f32 slice (quantizes to INT8)
|
||||
pub fn from_f32(data: &[f32], id: u32) -> Option<Self> {
|
||||
if data.len() > DIM {
|
||||
return None;
|
||||
}
|
||||
let mut vec = HVec::new();
|
||||
for &v in data {
|
||||
let quantized = (v * 127.0).clamp(-128.0, 127.0) as i8;
|
||||
vec.push(quantized).ok()?;
|
||||
}
|
||||
Some(Self { data: vec, id })
|
||||
}
|
||||
|
||||
/// Dimension count
|
||||
pub fn dim(&self) -> usize {
|
||||
self.data.len()
|
||||
}
|
||||
}
|
||||
|
||||
/// Distance metrics
///
/// All variants are evaluated on INT8 slices and return an `i32` where
/// smaller means "closer" (DotProduct is negated by the dispatcher to
/// preserve that ordering).
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum DistanceMetric {
    /// Euclidean (L2) distance
    Euclidean,
    /// Cosine similarity (returned as 1 - cosine)
    Cosine,
    /// Manhattan (L1) distance
    Manhattan,
    /// Hamming distance (for binary vectors)
    Hamming,
    /// Dot product (for normalized vectors)
    DotProduct,
}
|
||||
|
||||
impl DistanceMetric {
    /// Calculate distance between two INT8 vectors
    ///
    /// Lower is always "closer": the dot product is negated so a larger
    /// similarity sorts first in min-ordered candidate lists. If the
    /// slices differ in length, only the overlapping prefix is compared
    /// (all underlying metric functions zip the inputs).
    pub fn distance(&self, a: &[i8], b: &[i8]) -> i32 {
        match self {
            Self::Euclidean => euclidean_distance_i8(a, b),
            Self::Cosine => cosine_distance_i8(a, b),
            Self::Manhattan => manhattan_distance_i8(a, b),
            Self::Hamming => hamming_distance_i8(a, b),
            Self::DotProduct => -dot_product_i8(a, b), // Negate for min-heap
        }
    }
}
|
||||
|
||||
/// Squared Euclidean (L2) distance between two INT8 vectors.
///
/// The square root is deliberately skipped: squared distance preserves
/// ordering, which is all nearest-neighbor ranking needs. Only the
/// overlapping prefix of `a` and `b` is compared.
pub fn euclidean_distance_i8(a: &[i8], b: &[i8]) -> i32 {
    a.iter()
        .zip(b.iter())
        .map(|(&x, &y)| {
            let d = x as i32 - y as i32;
            d * d
        })
        .sum()
}
|
||||
|
||||
/// INT8 Cosine distance (1 - similarity) scaled to i32
|
||||
pub fn cosine_distance_i8(a: &[i8], b: &[i8]) -> i32 {
|
||||
let mut dot: i32 = 0;
|
||||
let mut norm_a: i32 = 0;
|
||||
let mut norm_b: i32 = 0;
|
||||
|
||||
for (x, y) in a.iter().zip(b.iter()) {
|
||||
let xi = *x as i32;
|
||||
let yi = *y as i32;
|
||||
dot += xi * yi;
|
||||
norm_a += xi * xi;
|
||||
norm_b += yi * yi;
|
||||
}
|
||||
|
||||
// Avoid division by zero
|
||||
if norm_a == 0 || norm_b == 0 {
|
||||
return i32::MAX;
|
||||
}
|
||||
|
||||
// Return (1 - cosine) * 1000 for precision
|
||||
// cosine = dot / (sqrt(norm_a) * sqrt(norm_b))
|
||||
// Approximate with fixed-point: 1000 - (dot * 1000) / sqrt(norm_a * norm_b)
|
||||
let norm_product = ((norm_a as i64) * (norm_b as i64)).min(i64::MAX as i64);
|
||||
let norm_sqrt = isqrt(norm_product as u64) as i32;
|
||||
|
||||
if norm_sqrt == 0 {
|
||||
return i32::MAX;
|
||||
}
|
||||
|
||||
1000 - ((dot * 1000) / norm_sqrt)
|
||||
}
|
||||
|
||||
/// Manhattan (L1) distance between two INT8 vectors.
///
/// Sums absolute per-component differences over the overlapping prefix
/// of `a` and `b`.
pub fn manhattan_distance_i8(a: &[i8], b: &[i8]) -> i32 {
    a.iter()
        .zip(b.iter())
        .map(|(&x, &y)| (x as i32 - y as i32).abs())
        .sum()
}
|
||||
|
||||
/// Hamming distance: total number of differing bits across the
/// overlapping prefix of `a` and `b` (intended for binary-packed
/// vectors).
pub fn hamming_distance_i8(a: &[i8], b: &[i8]) -> i32 {
    a.iter()
        .zip(b.iter())
        .map(|(&x, &y)| (x ^ y).count_ones() as i32)
        .sum()
}
|
||||
|
||||
/// INT8 dot product over the overlapping prefix of `a` and `b`,
/// accumulated in i32.
pub fn dot_product_i8(a: &[i8], b: &[i8]) -> i32 {
    a.iter()
        .zip(b.iter())
        .map(|(&x, &y)| x as i32 * y as i32)
        .sum()
}
|
||||
|
||||
/// Integer square root via Newton's method (no floating point).
///
/// Returns `floor(sqrt(n))` for any `u64`.
///
/// The initial estimate is `n / 2 + 1` rather than `(n + 1) / 2`: the
/// latter overflows for `n == u64::MAX` (debug panic; in release it
/// wraps to 0 and the loop then divides by zero). `n / 2 + 1` is
/// always >= sqrt(n) for n > 0, which Newton's iteration needs to
/// converge monotonically down to the floor.
fn isqrt(n: u64) -> u64 {
    if n == 0 {
        return 0;
    }
    let mut estimate = n / 2 + 1;
    let mut next = (estimate + n / estimate) / 2;
    while next < estimate {
        estimate = next;
        next = (estimate + n / estimate) / 2;
    }
    estimate
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Squared L2 distance of vectors differing by 1 in each of 4 dims.
    #[test]
    fn test_euclidean_distance() {
        let a = [10i8, 20, 30, 40];
        let b = [11i8, 21, 31, 41];
        let dist = euclidean_distance_i8(&a, &b);
        assert_eq!(dist, 4); // 1 + 1 + 1 + 1 = 4
    }

    /// `dim()` reflects the populated length, not the const capacity.
    #[test]
    fn test_micro_vector() {
        let data = [1i8, 2, 3, 4, 5, 6, 7, 8];
        let vec: MicroVector<16> = MicroVector::from_i8(&data, 42).unwrap();
        assert_eq!(vec.dim(), 8);
        assert_eq!(vec.id, 42);
    }

    /// Collinear vectors must have near-zero cosine distance regardless
    /// of magnitude.
    #[test]
    fn test_cosine_distance() {
        // Same direction = 0 distance
        let a = [100i8, 0, 0, 0];
        let b = [50i8, 0, 0, 0];
        let dist = cosine_distance_i8(&a, &b);
        assert!(dist < 100); // Should be close to 0
    }
}
|
||||
409
vendor/ruvector/examples/ruvLLM/esp32/src/ruvector/rag.rs
vendored
Normal file
409
vendor/ruvector/examples/ruvLLM/esp32/src/ruvector/rag.rs
vendored
Normal file
@@ -0,0 +1,409 @@
|
||||
//! Micro RAG - Retrieval-Augmented Generation for ESP32
|
||||
//!
|
||||
//! Enables small language models to access external knowledge,
|
||||
//! dramatically improving accuracy without larger models.
|
||||
//!
|
||||
//! # How RAG Works
|
||||
//!
|
||||
//! ```text
|
||||
//! Question: "What's the capital of France?"
|
||||
//! │
|
||||
//! ▼
|
||||
//! ┌─────────────────────────────────────────────────────────────┐
|
||||
//! │ MICRO RAG PIPELINE │
|
||||
//! ├─────────────────────────────────────────────────────────────┤
|
||||
//! │ │
|
||||
//! │ 1. EMBED Question ──▶ [0.2, 0.1, 0.8, ...] │
|
||||
//! │ │ │
|
||||
//! │ 2. SEARCH ▼ │
|
||||
//! │ ┌────────────────┐ │
|
||||
//! │ │ Vector Index │ ──▶ Top 3 relevant docs │
|
||||
//! │ │ (HNSW) │ │
|
||||
//! │ └────────────────┘ │
|
||||
//! │ │ │
|
||||
//! │ 3. AUGMENT ▼ │
|
||||
//! │ Context: "France is a country in Europe. │
|
||||
//! │ Paris is the capital of France. │
|
||||
//! │ The Eiffel Tower is in Paris." │
|
||||
//! │ │ │
|
||||
//! │ 4. GENERATE ▼ │
|
||||
//! │ ┌────────────────┐ │
|
||||
//! │ │ Tiny LLM │ ──▶ "Paris" │
|
||||
//! │ └────────────────┘ │
|
||||
//! │ │
|
||||
//! └─────────────────────────────────────────────────────────────┘
|
||||
//! ```
|
||||
//!
|
||||
//! # Benefits
|
||||
//!
|
||||
//! - 50K model + RAG ≈ 1M model accuracy for factual questions
|
||||
//! - Knowledge can be updated without retraining
|
||||
//! - Explainable: you can see which documents were used
|
||||
|
||||
use heapless::Vec as HVec;
|
||||
use heapless::String as HString;
|
||||
use super::{MicroHNSW, HNSWConfig, SearchResult, MicroVector, DistanceMetric};
|
||||
|
||||
/// Maximum documents in RAG index
|
||||
pub const MAX_DOCUMENTS: usize = 256;
|
||||
/// Maximum chunks per document
|
||||
pub const MAX_CHUNKS: usize = 512;
|
||||
/// Chunk embedding dimension
|
||||
pub const CHUNK_DIM: usize = 32;
|
||||
/// Maximum text per chunk
|
||||
pub const MAX_CHUNK_TEXT: usize = 128;
|
||||
/// Maximum context size for generation
|
||||
pub const MAX_CONTEXT: usize = 256;
|
||||
|
||||
/// RAG Configuration
#[derive(Debug, Clone)]
pub struct RAGConfig {
    /// Number of documents to retrieve
    pub top_k: usize,
    /// Minimum similarity threshold (0-1000)
    // NOTE(review): despite the name, `retrieve` compares this against
    // a *distance* (larger = less similar) and skips chunks whose
    // distance exceeds it — effectively a maximum-distance cutoff.
    pub min_similarity: i32,
    /// Maximum context tokens
    // NOTE(review): not consulted anywhere in this file (retrieval is
    // bounded by MAX_CONTEXT bytes instead) — confirm intended use.
    pub max_context_tokens: usize,
    /// Include source attribution
    // NOTE(review): not consulted in this file — confirm intended use.
    pub include_sources: bool,
    /// Rerank retrieved documents
    // NOTE(review): not consulted in this file — confirm intended use.
    pub enable_reranking: bool,
}
|
||||
|
||||
impl Default for RAGConfig {
    /// Defaults tuned for tiny on-device indexes: 3 retrieved chunks,
    /// a loose distance cutoff, and no reranking.
    fn default() -> Self {
        Self {
            top_k: 3,
            min_similarity: 200, // Distance threshold
            max_context_tokens: 128,
            include_sources: true,
            enable_reranking: false,
        }
    }
}
|
||||
|
||||
/// A chunk of text with embedding
///
/// The unit of retrieval: a bounded text snippet, the document it came
/// from, and its INT8 embedding used for similarity search.
#[derive(Debug, Clone)]
pub struct Chunk {
    /// Unique chunk ID
    pub id: u32,
    /// Parent document ID
    pub doc_id: u16,
    /// Chunk index within document
    pub chunk_idx: u8,
    /// Text content (bounded to MAX_CHUNK_TEXT bytes)
    pub text: HString<MAX_CHUNK_TEXT>,
    /// Embedding (up to CHUNK_DIM INT8 components)
    pub embedding: HVec<i8, CHUNK_DIM>,
}
|
||||
|
||||
impl Chunk {
|
||||
/// Create new chunk
|
||||
pub fn new(id: u32, doc_id: u16, chunk_idx: u8, text: &str, embedding: &[i8]) -> Option<Self> {
|
||||
let mut text_str = HString::new();
|
||||
for c in text.chars().take(MAX_CHUNK_TEXT) {
|
||||
text_str.push(c).ok()?;
|
||||
}
|
||||
|
||||
let mut embed = HVec::new();
|
||||
for &v in embedding.iter().take(CHUNK_DIM) {
|
||||
embed.push(v).ok()?;
|
||||
}
|
||||
|
||||
Some(Self {
|
||||
id,
|
||||
doc_id,
|
||||
chunk_idx,
|
||||
text: text_str,
|
||||
embedding: embed,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// RAG Result
///
/// Output of `MicroRAG::retrieve`: the concatenated context string plus
/// parallel lists of the contributing chunk ids and their distances.
#[derive(Debug)]
pub struct RAGResult {
    /// Retrieved context (concatenated chunks, " | "-separated)
    pub context: HString<MAX_CONTEXT>,
    /// Source chunk IDs, in retrieval order
    pub source_ids: HVec<u32, 8>,
    /// Relevance scores (raw index distances; lower = more similar)
    pub scores: HVec<i32, 8>,
    /// Whether context is truncated (budget ran out mid-retrieval)
    pub truncated: bool,
}
|
||||
|
||||
/// Micro RAG Engine
///
/// Owns the HNSW index over chunk embeddings and the chunk texts
/// themselves; ids are handed out monotonically via `next_chunk_id`.
pub struct MicroRAG {
    /// Configuration
    config: RAGConfig,
    /// HNSW index for chunk retrieval (keyed by chunk id)
    index: MicroHNSW<CHUNK_DIM, MAX_CHUNKS>,
    /// Stored chunks (looked up by id with a linear scan)
    chunks: HVec<Chunk, MAX_CHUNKS>,
    /// Document count (also the next document id)
    doc_count: u16,
    /// Next chunk ID
    next_chunk_id: u32,
}
|
||||
|
||||
impl MicroRAG {
|
||||
/// Create new RAG engine
|
||||
pub fn new(config: RAGConfig) -> Self {
|
||||
let hnsw_config = HNSWConfig {
|
||||
m: 6,
|
||||
m_max0: 12,
|
||||
ef_construction: 24,
|
||||
ef_search: 16,
|
||||
metric: DistanceMetric::Euclidean,
|
||||
binary_mode: false,
|
||||
};
|
||||
|
||||
Self {
|
||||
config,
|
||||
index: MicroHNSW::new(hnsw_config),
|
||||
chunks: HVec::new(),
|
||||
doc_count: 0,
|
||||
next_chunk_id: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Number of indexed chunks
|
||||
pub fn chunk_count(&self) -> usize {
|
||||
self.chunks.len()
|
||||
}
|
||||
|
||||
/// Number of documents
|
||||
pub fn doc_count(&self) -> u16 {
|
||||
self.doc_count
|
||||
}
|
||||
|
||||
/// Memory usage in bytes
|
||||
pub fn memory_bytes(&self) -> usize {
|
||||
self.index.memory_bytes() + self.chunks.len() * core::mem::size_of::<Chunk>()
|
||||
}
|
||||
|
||||
/// Add a document (split into chunks)
|
||||
pub fn add_document(&mut self, chunks: &[(&str, &[i8])]) -> Result<u16, &'static str> {
|
||||
let doc_id = self.doc_count;
|
||||
self.doc_count += 1;
|
||||
|
||||
for (idx, (text, embedding)) in chunks.iter().enumerate() {
|
||||
if self.chunks.len() >= MAX_CHUNKS {
|
||||
return Err("Chunk limit reached");
|
||||
}
|
||||
|
||||
let chunk_id = self.next_chunk_id;
|
||||
self.next_chunk_id += 1;
|
||||
|
||||
let chunk = Chunk::new(chunk_id, doc_id, idx as u8, text, embedding)
|
||||
.ok_or("Failed to create chunk")?;
|
||||
|
||||
// Add to HNSW index
|
||||
let vec = MicroVector {
|
||||
data: chunk.embedding.clone(),
|
||||
id: chunk_id,
|
||||
};
|
||||
self.index.insert(&vec)?;
|
||||
|
||||
// Store chunk
|
||||
self.chunks.push(chunk).map_err(|_| "Chunk storage full")?;
|
||||
}
|
||||
|
||||
Ok(doc_id)
|
||||
}
|
||||
|
||||
/// Add a single pre-chunked piece of knowledge
|
||||
pub fn add_knowledge(&mut self, text: &str, embedding: &[i8]) -> Result<u32, &'static str> {
|
||||
if self.chunks.len() >= MAX_CHUNKS {
|
||||
return Err("Chunk limit reached");
|
||||
}
|
||||
|
||||
let chunk_id = self.next_chunk_id;
|
||||
self.next_chunk_id += 1;
|
||||
|
||||
let chunk = Chunk::new(chunk_id, self.doc_count, 0, text, embedding)
|
||||
.ok_or("Failed to create chunk")?;
|
||||
|
||||
let vec = MicroVector {
|
||||
data: chunk.embedding.clone(),
|
||||
id: chunk_id,
|
||||
};
|
||||
self.index.insert(&vec)?;
|
||||
self.chunks.push(chunk).map_err(|_| "Chunk storage full")?;
|
||||
|
||||
self.doc_count += 1;
|
||||
Ok(chunk_id)
|
||||
}
|
||||
|
||||
/// Retrieve relevant context for a query
|
||||
pub fn retrieve(&self, query_embedding: &[i8]) -> RAGResult {
|
||||
let search_results = self.index.search(query_embedding, self.config.top_k * 2);
|
||||
|
||||
let mut context = HString::new();
|
||||
let mut source_ids = HVec::new();
|
||||
let mut scores = HVec::new();
|
||||
let mut truncated = false;
|
||||
|
||||
let mut added = 0;
|
||||
for result in search_results.iter() {
|
||||
// Check similarity threshold
|
||||
if result.distance > self.config.min_similarity && added > 0 {
|
||||
continue;
|
||||
}
|
||||
|
||||
if let Some(chunk) = self.find_chunk_by_id(result.id) {
|
||||
// Check if we have room
|
||||
if context.len() + chunk.text.len() + 2 > MAX_CONTEXT {
|
||||
if added > 0 {
|
||||
truncated = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Add separator
|
||||
if !context.is_empty() {
|
||||
let _ = context.push_str(" | ");
|
||||
}
|
||||
|
||||
// Add chunk text
|
||||
for c in chunk.text.chars() {
|
||||
if context.push(c).is_err() {
|
||||
truncated = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let _ = source_ids.push(result.id);
|
||||
let _ = scores.push(result.distance);
|
||||
added += 1;
|
||||
|
||||
if added >= self.config.top_k {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
RAGResult {
|
||||
context,
|
||||
source_ids,
|
||||
scores,
|
||||
truncated,
|
||||
}
|
||||
}
|
||||
|
||||
/// Retrieve and format for LLM prompt
|
||||
pub fn retrieve_prompt(&self, query_embedding: &[i8], question: &str) -> HString<512> {
|
||||
let rag_result = self.retrieve(query_embedding);
|
||||
|
||||
let mut prompt = HString::new();
|
||||
|
||||
// Add context
|
||||
let _ = prompt.push_str("Context: ");
|
||||
for c in rag_result.context.chars() {
|
||||
let _ = prompt.push(c);
|
||||
}
|
||||
let _ = prompt.push_str("\n\nQuestion: ");
|
||||
for c in question.chars().take(128) {
|
||||
let _ = prompt.push(c);
|
||||
}
|
||||
let _ = prompt.push_str("\n\nAnswer: ");
|
||||
|
||||
prompt
|
||||
}
|
||||
|
||||
/// Find chunk by ID
|
||||
fn find_chunk_by_id(&self, id: u32) -> Option<&Chunk> {
|
||||
self.chunks.iter().find(|c| c.id == id)
|
||||
}
|
||||
|
||||
/// Get all chunks for a document
|
||||
pub fn get_document_chunks(&self, doc_id: u16) -> HVec<&Chunk, 16> {
|
||||
let mut result = HVec::new();
|
||||
for chunk in self.chunks.iter() {
|
||||
if chunk.doc_id == doc_id {
|
||||
let _ = result.push(chunk);
|
||||
}
|
||||
}
|
||||
result.sort_by_key(|c| c.chunk_idx);
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for MicroRAG {
|
||||
fn default() -> Self {
|
||||
Self::new(RAGConfig::default())
|
||||
}
|
||||
}
|
||||
|
||||
/// Helper: Simple text chunker for preprocessing
|
||||
pub fn chunk_text(text: &str, chunk_size: usize, overlap: usize) -> HVec<HString<MAX_CHUNK_TEXT>, 16> {
|
||||
let mut chunks = HVec::new();
|
||||
let chars: HVec<char, 1024> = text.chars().collect();
|
||||
|
||||
let mut start = 0;
|
||||
while start < chars.len() {
|
||||
let end = (start + chunk_size).min(chars.len());
|
||||
|
||||
let mut chunk = HString::new();
|
||||
for &c in chars[start..end].iter() {
|
||||
let _ = chunk.push(c);
|
||||
}
|
||||
|
||||
if !chunk.is_empty() {
|
||||
let _ = chunks.push(chunk);
|
||||
}
|
||||
|
||||
if end >= chars.len() {
|
||||
break;
|
||||
}
|
||||
|
||||
start = end.saturating_sub(overlap);
|
||||
}
|
||||
|
||||
chunks
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Two knowledge entries should yield two indexed chunks.
    #[test]
    fn test_rag_basic() {
        let mut rag = MicroRAG::default();

        // Add knowledge with two clearly separated constant embeddings.
        let embed1 = [10i8; CHUNK_DIM];
        let embed2 = [20i8; CHUNK_DIM];

        rag.add_knowledge("Paris is the capital of France", &embed1).unwrap();
        rag.add_knowledge("London is the capital of UK", &embed2).unwrap();

        assert_eq!(rag.chunk_count(), 2);
    }

    /// A query near one embedding should retrieve non-empty context
    /// with at least one attributed source.
    #[test]
    fn test_rag_retrieve() {
        let mut rag = MicroRAG::default();

        let embed1 = [10i8; CHUNK_DIM];
        let embed2 = [50i8; CHUNK_DIM];

        rag.add_knowledge("The sky is blue", &embed1).unwrap();
        rag.add_knowledge("Grass is green", &embed2).unwrap();

        // Query similar to first
        let query = [11i8; CHUNK_DIM];
        let result = rag.retrieve(&query);

        assert!(!result.context.is_empty());
        assert!(!result.source_ids.is_empty());
    }

    /// The chunker should always produce at least one chunk for
    /// non-empty input.
    #[test]
    fn test_chunk_text() {
        let text = "Hello world this is a test";
        let chunks = chunk_text(text, 10, 3);
        assert!(!chunks.is_empty());
    }
}
|
||||
374
vendor/ruvector/examples/ruvLLM/esp32/src/ruvector/semantic_memory.rs
vendored
Normal file
374
vendor/ruvector/examples/ruvLLM/esp32/src/ruvector/semantic_memory.rs
vendored
Normal file
@@ -0,0 +1,374 @@
|
||||
//! Semantic Memory - Context-Aware AI Memory for ESP32
|
||||
//!
|
||||
//! Enables AI to remember and recall information based on meaning,
|
||||
//! not just keywords. Perfect for:
|
||||
//! - Personal assistants that remember preferences
|
||||
//! - Robots that learn from experience
|
||||
//! - Smart home devices that understand context
|
||||
//!
|
||||
//! # How It Works
|
||||
//!
|
||||
//! ```text
|
||||
//! User: "I like my coffee at 7am"
|
||||
//! │
|
||||
//! ▼
|
||||
//! ┌─────────────────┐
|
||||
//! │ Embed to Vector │ ──▶ [0.2, 0.8, -0.1, ...]
|
||||
//! └─────────────────┘
|
||||
//! │
|
||||
//! ▼
|
||||
//! ┌─────────────────┐
|
||||
//! │ Store in Memory │ ──▶ ID: 42, Type: Preference
|
||||
//! └─────────────────┘
|
||||
//!
|
||||
//! Later: "What time do I like coffee?"
|
||||
//! │
|
||||
//! ▼
|
||||
//! ┌─────────────────┐
|
||||
//! │ Search Similar │ ──▶ Found: "I like my coffee at 7am"
|
||||
//! └─────────────────┘
|
||||
//! ```
|
||||
|
||||
use heapless::Vec as HVec;
|
||||
use heapless::String as HString;
|
||||
use super::{MicroHNSW, HNSWConfig, SearchResult, MicroVector, DistanceMetric};
|
||||
|
||||
/// Maximum memories
|
||||
pub const MAX_MEMORIES: usize = 128;
|
||||
/// Maximum text length per memory
|
||||
pub const MAX_TEXT_LEN: usize = 64;
|
||||
/// Embedding dimension
|
||||
pub const MEMORY_DIM: usize = 32;
|
||||
|
||||
/// Memory type classification
///
/// Each variant carries a retrieval priority (see `priority()`), used
/// to weight recall results alongside similarity and recency.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum MemoryType {
    /// User preference ("I like X")
    Preference,
    /// Factual knowledge ("X is Y")
    Fact,
    /// Event/experience ("Yesterday I did X")
    Event,
    /// Skill/procedure ("To do X, first Y")
    Procedure,
    /// Entity/person ("John is my friend")
    Entity,
    /// Emotional context ("I feel X about Y")
    Emotion,
    /// Conversation context
    Context,
    /// System/device state
    State,
}
|
||||
|
||||
impl MemoryType {
    /// Priority weight for retrieval
    ///
    /// Higher values make a memory type win ties during recall; the
    /// ordering reflects how actionable each kind of memory is, from
    /// live device state (100) down to static entities (30).
    pub fn priority(&self) -> i32 {
        match self {
            Self::State => 100, // Most recent state is critical
            Self::Context => 90, // Current conversation context
            Self::Preference => 80, // User preferences matter
            Self::Emotion => 70, // Emotional context
            Self::Procedure => 60, // How-to knowledge
            Self::Fact => 50, // General facts
            Self::Event => 40, // Past events
            Self::Entity => 30, // People/things
        }
    }
}
|
||||
|
||||
/// A single memory entry
///
/// Couples bounded text with its embedding and the metadata used for
/// relevance scoring (type, age, importance, access frequency).
#[derive(Debug, Clone)]
pub struct Memory {
    /// Unique ID
    pub id: u32,
    /// Memory type
    pub memory_type: MemoryType,
    /// Timestamp (seconds since boot or epoch)
    pub timestamp: u32,
    /// Text content (truncated to MAX_TEXT_LEN bytes)
    pub text: HString<MAX_TEXT_LEN>,
    /// Importance score (0-100; defaults to 50 in `new`)
    pub importance: u8,
    /// Access count (for recency weighting; saturates, never wraps)
    pub access_count: u16,
    /// Embedding vector (up to MEMORY_DIM INT8 components)
    pub embedding: HVec<i8, MEMORY_DIM>,
}
|
||||
|
||||
impl Memory {
|
||||
/// Create new memory
|
||||
pub fn new(
|
||||
id: u32,
|
||||
memory_type: MemoryType,
|
||||
text: &str,
|
||||
embedding: &[i8],
|
||||
timestamp: u32,
|
||||
) -> Option<Self> {
|
||||
let mut text_str = HString::new();
|
||||
for c in text.chars().take(MAX_TEXT_LEN) {
|
||||
text_str.push(c).ok()?;
|
||||
}
|
||||
|
||||
let mut embed_vec = HVec::new();
|
||||
for &v in embedding.iter().take(MEMORY_DIM) {
|
||||
embed_vec.push(v).ok()?;
|
||||
}
|
||||
|
||||
Some(Self {
|
||||
id,
|
||||
memory_type,
|
||||
timestamp,
|
||||
text: text_str,
|
||||
importance: 50,
|
||||
access_count: 0,
|
||||
embedding: embed_vec,
|
||||
})
|
||||
}
|
||||
|
||||
/// Calculate relevance score
|
||||
pub fn relevance_score(&self, distance: i32, current_time: u32) -> i32 {
|
||||
let type_weight = self.memory_type.priority();
|
||||
let importance_weight = self.importance as i32;
|
||||
|
||||
// Recency decay (newer = higher score)
|
||||
let age_seconds = current_time.saturating_sub(self.timestamp);
|
||||
let recency = 100 - (age_seconds / 3600).min(100) as i32; // Decay over hours
|
||||
|
||||
// Access frequency boost
|
||||
let frequency = (self.access_count as i32).min(50);
|
||||
|
||||
// Combined score (higher is better, distance is inverted)
|
||||
let distance_score = 1000 - distance.min(1000);
|
||||
|
||||
(distance_score * 3 + type_weight * 2 + importance_weight + recency + frequency) / 7
|
||||
}
|
||||
}
|
||||
|
||||
/// Semantic Memory System
///
/// Owns an HNSW index over memory embeddings plus the memory entries
/// themselves; ids are handed out monotonically via `next_id` and time
/// must be fed in externally with `set_time`.
pub struct SemanticMemory {
    /// HNSW index for fast similarity search (keyed by memory id)
    index: MicroHNSW<MEMORY_DIM, MAX_MEMORIES>,
    /// Memory entries (looked up by id with a linear scan)
    memories: HVec<Memory, MAX_MEMORIES>,
    /// Next memory ID
    next_id: u32,
    /// Current time (updated externally)
    current_time: u32,
}
|
||||
|
||||
impl SemanticMemory {
    /// Create new semantic memory
    ///
    /// Uses very small HNSW parameters to keep the index footprint in
    /// the KB range for MAX_MEMORIES entries.
    pub fn new() -> Self {
        let config = HNSWConfig {
            m: 4,
            m_max0: 8,
            ef_construction: 16,
            ef_search: 8,
            metric: DistanceMetric::Euclidean,
            binary_mode: false,
        };

        Self {
            index: MicroHNSW::new(config),
            memories: HVec::new(),
            next_id: 0,
            current_time: 0,
        }
    }

    /// Update current time
    ///
    /// Time drives recency decay in relevance scoring; the caller is
    /// responsible for feeding in a monotonic clock.
    pub fn set_time(&mut self, time: u32) {
        self.current_time = time;
    }

    /// Number of memories stored
    pub fn len(&self) -> usize {
        self.memories.len()
    }

    /// Check if empty
    pub fn is_empty(&self) -> bool {
        self.memories.is_empty()
    }

    /// Approximate memory usage in bytes (index plus entry storage)
    pub fn memory_bytes(&self) -> usize {
        self.index.memory_bytes() + self.memories.len() * core::mem::size_of::<Memory>()
    }

    /// Store a new memory
    ///
    /// When the store is full, the least relevant memory is evicted
    /// first. Returns the id of the new memory.
    pub fn remember(
        &mut self,
        memory_type: MemoryType,
        text: &str,
        embedding: &[i8],
    ) -> Result<u32, &'static str> {
        if self.memories.len() >= MAX_MEMORIES {
            // Evict least important memory
            self.evict_least_important()?;
        }

        let id = self.next_id;
        self.next_id += 1;

        let memory = Memory::new(id, memory_type, text, embedding, self.current_time)
            .ok_or("Failed to create memory")?;

        // Add to HNSW index
        let vec = MicroVector {
            data: memory.embedding.clone(),
            id,
        };
        self.index.insert(&vec)?;

        // Store memory
        self.memories.push(memory).map_err(|_| "Memory full")?;

        Ok(id)
    }

    /// Recall memories similar to query
    ///
    /// Searches for `k * 2` candidates, re-scores them with
    /// `relevance_score` (similarity + type + importance + recency +
    /// frequency), bumps each hit's access count, and returns the top
    /// `k` as (clone, score) pairs. At most 16 candidates fit in the
    /// result buffer; extra pushes are silently dropped.
    pub fn recall(&mut self, query_embedding: &[i8], k: usize) -> HVec<(Memory, i32), 16> {
        let mut results = HVec::new();

        let search_results = self.index.search(query_embedding, k * 2);

        for result in search_results.iter() {
            // Ids no longer present in `memories` (forgotten/evicted)
            // are silently skipped here.
            if let Some(memory) = self.find_memory_by_id(result.id) {
                let score = memory.relevance_score(result.distance, self.current_time);
                let _ = results.push((memory.clone(), score));
            }
        }

        // Sort by relevance score
        results.sort_by(|a, b| b.1.cmp(&a.1));

        // Update access counts
        for (mem, _) in results.iter() {
            self.increment_access(mem.id);
        }

        // Truncate to k
        while results.len() > k {
            results.pop();
        }

        results
    }

    /// Recall memories of specific type
    ///
    /// Over-fetches (`k * 3`) then filters by type, since the index
    /// itself is type-agnostic; may return fewer than `k` entries.
    pub fn recall_by_type(
        &mut self,
        query_embedding: &[i8],
        memory_type: MemoryType,
        k: usize,
    ) -> HVec<Memory, 16> {
        let all_results = self.recall(query_embedding, k * 3);

        let mut filtered = HVec::new();
        for (memory, _) in all_results {
            if memory.memory_type == memory_type && filtered.len() < k {
                let _ = filtered.push(memory);
            }
        }

        filtered
    }

    /// Get recent memories
    ///
    /// Returns up to `k` memories ordered newest-first by timestamp
    /// (capped at 16 by the result buffer).
    pub fn recent(&self, k: usize) -> HVec<&Memory, 16> {
        let mut sorted: HVec<&Memory, MAX_MEMORIES> = self.memories.iter().collect();
        sorted.sort_by(|a, b| b.timestamp.cmp(&a.timestamp));

        let mut result = HVec::new();
        for mem in sorted.iter().take(k) {
            let _ = result.push(*mem);
        }
        result
    }

    /// Forget (remove) a memory
    ///
    /// Returns true if a memory with `id` existed and was removed.
    /// NOTE(review): this removes only the entry from `memories`; the
    /// HNSW index still holds the id. Stale hits are filtered out by
    /// `find_memory_by_id` during recall, but index slots are never
    /// reclaimed — confirm whether MicroHNSW supports deletion.
    pub fn forget(&mut self, id: u32) -> bool {
        if let Some(pos) = self.memories.iter().position(|m| m.id == id) {
            self.memories.swap_remove(pos);
            true
        } else {
            false
        }
    }

    /// Find memory by ID (linear scan; the memory count is small)
    fn find_memory_by_id(&self, id: u32) -> Option<&Memory> {
        self.memories.iter().find(|m| m.id == id)
    }

    /// Increment access count (saturating, so it never wraps)
    fn increment_access(&mut self, id: u32) {
        if let Some(memory) = self.memories.iter_mut().find(|m| m.id == id) {
            memory.access_count = memory.access_count.saturating_add(1);
        }
    }

    /// Evict least important memory
    ///
    /// Scores every entry with distance 0 (pure type/importance/
    /// recency/frequency ranking) and drops the lowest scorer.
    /// NOTE(review): like `forget`, this does not remove the entry
    /// from the HNSW index — see the note on `forget`.
    fn evict_least_important(&mut self) -> Result<(), &'static str> {
        if self.memories.is_empty() {
            return Ok(());
        }

        // Find memory with lowest score
        let mut min_score = i32::MAX;
        let mut min_idx = 0;

        for (i, memory) in self.memories.iter().enumerate() {
            let score = memory.relevance_score(0, self.current_time);
            if score < min_score {
                min_score = score;
                min_idx = i;
            }
        }

        self.memories.swap_remove(min_idx);
        Ok(())
    }
}
|
||||
|
||||
impl Default for SemanticMemory {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly created memory keeps the id and type it was given.
    #[test]
    fn test_memory_creation() {
        let embedding = [10i8; MEMORY_DIM];
        let memory = Memory::new(1, MemoryType::Preference, "I like coffee", &embedding, 1000);
        assert!(memory.is_some());
        let m = memory.unwrap();
        assert_eq!(m.id, 1);
        assert_eq!(m.memory_type, MemoryType::Preference);
    }

    /// Stored memories are counted and recallable by a nearby query.
    #[test]
    fn test_semantic_memory() {
        let mut sm = SemanticMemory::new();
        sm.set_time(1000);

        let embed1 = [10i8; MEMORY_DIM];
        let embed2 = [20i8; MEMORY_DIM];

        sm.remember(MemoryType::Preference, "I like tea", &embed1).unwrap();
        sm.remember(MemoryType::Fact, "Water is wet", &embed2).unwrap();

        assert_eq!(sm.len(), 2);

        // Recall similar to embed1
        let query = [11i8; MEMORY_DIM];
        let results = sm.recall(&query, 1);
        assert!(!results.is_empty());
    }
}
|
||||
Reference in New Issue
Block a user