Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'

This commit is contained in:
ruv
2026-02-28 14:39:40 -05:00
7854 changed files with 3522914 additions and 0 deletions

View File

@@ -0,0 +1,130 @@
//! Binary Quantization - 32x Memory Compression
use heapless::Vec as HVec;
pub const MAX_BINARY_SIZE: usize = 64;
/// Binary quantized vector - 1 bit per dimension
///
/// Bits are packed LSB-first into `data`. `dim` is the logical dimension
/// count (the last byte may be only partially used), and `threshold`
/// records the i8 cut-off used during quantization.
#[derive(Debug, Clone)]
pub struct BinaryVector<const N: usize> {
    // Packed bits; heapless buffer with a capacity of N bytes.
    pub data: HVec<u8, N>,
    // Number of logical dimensions represented.
    pub dim: usize,
    // Source values >= threshold were mapped to bit 1.
    pub threshold: i8,
}
impl<const N: usize> BinaryVector<N> {
pub fn from_i8(values: &[i8], threshold: i8) -> crate::Result<Self> {
let dim = values.len();
let num_bytes = (dim + 7) / 8;
if num_bytes > N {
return Err(crate::Error::BufferOverflow);
}
let mut data = HVec::new();
for chunk_idx in 0..num_bytes {
let mut byte = 0u8;
for bit_idx in 0..8 {
let val_idx = chunk_idx * 8 + bit_idx;
if val_idx < dim && values[val_idx] >= threshold {
byte |= 1 << bit_idx;
}
}
data.push(byte).map_err(|_| crate::Error::BufferOverflow)?;
}
Ok(Self { data, dim, threshold })
}
pub fn num_bytes(&self) -> usize { self.data.len() }
pub fn compression_ratio(&self) -> f32 { self.dim as f32 / self.data.len() as f32 }
}
/// Binary embedding table (32x smaller than INT8)
///
/// NOTE(review): the `VOCAB` and `DIM_BYTES` const parameters are not used
/// to size the buffer — capacity is hard-coded to 32 KiB; confirm whether
/// they were intended to drive the allocation.
pub struct BinaryEmbedding<const VOCAB: usize, const DIM_BYTES: usize> {
    // Row-major packed bits, fixed 32 KiB capacity.
    data: HVec<u8, { 32 * 1024 }>,
    // Number of valid rows (token ids).
    vocab_size: usize,
    // Logical embedding dimension in bits.
    dim: usize,
    // Row stride in bytes: ceil(dim / 8).
    bytes_per_embed: usize,
}
impl<const VOCAB: usize, const DIM_BYTES: usize> BinaryEmbedding<VOCAB, DIM_BYTES> {
pub fn random(vocab_size: usize, dim: usize, seed: u32) -> crate::Result<Self> {
let bytes_per_embed = (dim + 7) / 8;
let total_bytes = vocab_size * bytes_per_embed;
let mut data = HVec::new();
let mut rng_state = seed;
for _ in 0..total_bytes {
rng_state = rng_state.wrapping_mul(1103515245).wrapping_add(12345);
let byte = ((rng_state >> 16) & 0xFF) as u8;
data.push(byte).map_err(|_| crate::Error::BufferOverflow)?;
}
Ok(Self { data, vocab_size, dim, bytes_per_embed })
}
pub fn lookup(&self, token_id: u16, output: &mut [u8]) -> crate::Result<()> {
let id = token_id as usize;
if id >= self.vocab_size {
return Err(crate::Error::InvalidModel("Token ID out of range"));
}
let start = id * self.bytes_per_embed;
let end = start + self.bytes_per_embed;
if output.len() < self.bytes_per_embed {
return Err(crate::Error::BufferOverflow);
}
output[..self.bytes_per_embed].copy_from_slice(&self.data[start..end]);
Ok(())
}
pub fn memory_size(&self) -> usize { self.data.len() }
}
/// Hamming distance between packed binary vectors.
///
/// XORs corresponding bytes (differing bits become 1) and counts the set
/// bits. Panics if `b` is shorter than `a`, matching the original indexing.
#[inline]
pub fn hamming_distance(a: &[u8], b: &[u8]) -> u32 {
    // `count_ones` lowers to a native popcount (or LLVM's table-free
    // fallback), replacing the hand-unrolled 4-wide lookup loop.
    (0..a.len()).map(|i| (a[i] ^ b[i]).count_ones()).sum()
}
/// Normalized similarity in [0, 1]: 1.0 means identical bit patterns.
///
/// Fixed: returns 1.0 for empty inputs (two empty vectors are trivially
/// identical); the original divided by a zero bit count and produced NaN.
#[inline]
pub fn hamming_similarity(a: &[u8], b: &[u8]) -> f32 {
    if a.is_empty() {
        return 1.0;
    }
    let total_bits = (a.len() * 8) as f32;
    1.0 - (hamming_distance(a, b) as f32 / total_bits)
}
/// Count the set bits in a byte.
#[inline]
pub fn popcount8(x: u8) -> u32 {
    // Equivalent to the original 256-entry lookup table, but lets the
    // compiler emit a hardware popcount where the target has one.
    x.count_ones()
}
/// XNOR-popcount for binary neural network inference.
///
/// XNOR marks matching bits; the result is `matching - mismatching`,
/// i.e. `2 * matching - total_bits`. `total_bits` is derived from `a`
/// while the pairwise walk is `zip`-truncated, as in the original.
#[inline]
pub fn xnor_popcount(a: &[u8], b: &[u8]) -> i32 {
    let total_bits = (a.len() * 8) as i32;
    let matching: i32 = a
        .iter()
        .zip(b)
        .map(|(&x, &y)| (!(x ^ y)).count_ones() as i32)
        .sum();
    2 * matching - total_bits
}

View File

@@ -0,0 +1,124 @@
//! Lookup Tables for Fast Fixed-Point Operations
/// Softmax lookup table
///
/// Compile-time-built fixed-point approximation of exp(x) for x in
/// [-255, 0], used to run softmax without floating point.
pub struct SoftmaxLUT {
    // exp_table[i] approximates exp for input (i - 255), clamped to [1, 255]
    // so every probability stays strictly positive.
    exp_table: [u8; 256],
    // Fixed-point input scale hint for callers — TODO confirm: nothing in
    // this file reads it.
    pub input_scale: i32,
}
impl SoftmaxLUT {
    /// Build the table at compile time.
    ///
    /// The entry for input x = i - 255 is the linear ramp 255 + x clamped to
    /// [1, 255] — a coarse piecewise-linear stand-in for a scaled exp; the
    /// floor of 1 keeps every output non-zero.
    pub const fn new() -> Self {
        let mut exp_table = [0u8; 256];
        let mut i = 0;
        while i < 256 {
            // x runs over [-255, 0] as i runs over [0, 255].
            let x_scaled = i as i32 - 255;
            let mut exp_approx = 255 + x_scaled;
            if exp_approx < 1 { exp_approx = 1; }
            if exp_approx > 255 { exp_approx = 255; }
            exp_table[i] = exp_approx as u8;
            i += 1;
        }
        Self { exp_table, input_scale: 32 }
    }
    /// Table lookup for the exp approximation; input clamped to [-255, 0].
    #[inline]
    pub fn exp(&self, x: i32) -> u8 {
        let x_clamped = x.max(-255).min(0);
        self.exp_table[(x_clamped + 255) as usize]
    }
    /// Fixed-point softmax into `output` (values scaled so they sum to ~256).
    ///
    /// Subtracts the max logit first for numerical stability. If `output`
    /// and `logits` lengths differ, the zip silently truncates; the final
    /// normalization pass covers all of `output` — presumably callers pass
    /// equal lengths; TODO confirm.
    pub fn softmax(&self, logits: &[i32], output: &mut [u16]) {
        if logits.is_empty() { return; }
        let max_logit = logits.iter().cloned().max().unwrap_or(0);
        let mut sum: u32 = 0;
        for (&logit, out) in logits.iter().zip(output.iter_mut()) {
            let exp_val = self.exp(logit - max_logit) as u16;
            *out = exp_val;
            sum += exp_val as u32;
        }
        // Normalize to 8-bit fixed point: outputs sum to roughly 256.
        if sum > 0 {
            for out in output.iter_mut() {
                *out = ((*out as u32 * 256) / sum) as u16;
            }
        }
    }
    /// In-place variant: replaces logits with ~256-scaled probabilities.
    pub fn softmax_inplace(&self, logits: &mut [i32]) {
        if logits.is_empty() { return; }
        let max = logits.iter().cloned().max().unwrap_or(0);
        let mut sum: i32 = 0;
        for logit in logits.iter_mut() {
            // Clamp below at -255 so the table index stays in range.
            let x = (*logit - max).max(-255);
            *logit = self.exp_table[(x + 255) as usize] as i32;
            sum += *logit;
        }
        if sum > 0 {
            for logit in logits.iter_mut() {
                // << 8 then divide: same 256-scale normalization as `softmax`.
                *logit = (*logit << 8) / sum;
            }
        }
    }
}
impl Default for SoftmaxLUT {
fn default() -> Self { Self::new() }
}
/// Exponential lookup table
///
/// 256-entry table of a quadratic fixed-point exp approximation, built at
/// compile time (see `new` for the exact polynomial).
pub struct ExpLUT {
    // table[i] = 256 + 4*i + quadratic term, clamped to u16 (see `new`).
    table: [u16; 256],
}
impl ExpLUT {
    /// Build the table at compile time from a quadratic polynomial in fixed
    /// point: value = 256 + x_scaled + (x_scaled^2 >> 9) / 2, x_scaled = 4*i.
    ///
    /// NOTE(review): the shape suggests a Taylor expansion of exp with input
    /// scale 1/64 and output scale 256, but the quadratic coefficient is
    /// half of what 256*exp(i/64) would need — confirm the intended scale
    /// against callers.
    pub const fn new() -> Self {
        let mut table = [0u16; 256];
        let mut i = 0;
        while i < 256 {
            let x = i as i32;
            let x_scaled = x * 256 / 64;
            let x2 = (x_scaled * x_scaled) >> 9;
            let mut exp_val = 256 + x_scaled + (x2 >> 1);
            // Defensive clamp; with i <= 255 the value peaks at 2292.
            if exp_val > 65535 { exp_val = 65535; }
            table[i] = exp_val as u16;
            i += 1;
        }
        Self { table }
    }
    /// Direct table lookup.
    #[inline]
    pub fn exp(&self, x: u8) -> u16 { self.table[x as usize] }
}
/// Distance lookup table for L2 distance
///
/// NOTE(review): the `SIZE` parameter is unused — the table is fixed at 512
/// entries, covering every possible i8 difference — confirm whether it can
/// be dropped.
pub struct DistanceLUT<const SIZE: usize> {
    // sq_diff_table[d + 256] = min(d^2, 65535) for d in [-256, 255].
    sq_diff_table: [u16; 512],
}
impl<const SIZE: usize> DistanceLUT<SIZE> {
    /// Precompute min(diff^2, 65535) for every difference in [-256, 255].
    pub const fn new() -> Self {
        let mut table = [0u16; 512];
        let mut idx = 0usize;
        while idx < 512 {
            let delta = idx as i32 - 256;
            let squared = delta * delta;
            table[idx as usize] = if squared > 65535 { 65535 } else { squared as u16 };
            idx += 1;
        }
        Self { sq_diff_table: table }
    }
    /// Table-driven (a - b)^2 for two i8 values.
    #[inline]
    pub fn squared_diff(&self, a: i8, b: i8) -> u16 {
        // Difference lies in [-255, 255], so the +256 offset is in [1, 511].
        let offset = (a as i32 - b as i32 + 256) as usize;
        self.sq_diff_table[offset]
    }
    /// Sum of per-element squared differences over the zipped slices.
    pub fn l2_squared(&self, a: &[i8], b: &[i8]) -> u32 {
        let mut acc = 0u32;
        for (&x, &y) in a.iter().zip(b.iter()) {
            acc += self.squared_diff(x, y) as u32;
        }
        acc
    }
}
// Shared, compile-time-initialized tables; on embedded targets statics like
// these typically live in read-only memory rather than RAM — TODO confirm
// for the actual link script.
pub static SOFTMAX_LUT: SoftmaxLUT = SoftmaxLUT::new();
pub static EXP_LUT: ExpLUT = ExpLUT::new();
pub static DISTANCE_LUT: DistanceLUT<256> = DistanceLUT::new();

View File

@@ -0,0 +1,113 @@
//! MicroLoRA - Tiny Low-Rank Adaptation for ESP32
use heapless::Vec as HVec;
use crate::QuantParams;
pub const MAX_LORA_RANK: usize = 2;
pub const MAX_LORA_DIM: usize = 64;
/// Configuration for a single low-rank adapter.
#[derive(Debug, Clone, Copy)]
pub struct LoRAConfig {
    // Low-rank dimension r (at most MAX_LORA_RANK).
    pub rank: usize,
    // Feature dimension the adapter acts on (at most MAX_LORA_DIM).
    pub dim: usize,
    // Fixed-point output scale used by `MicroLoRA::apply` (result >> 8).
    pub scale: i8,
    // Presumably marks the adapter non-trainable — TODO confirm: nothing in
    // this file reads the flag.
    pub frozen: bool,
}
impl Default for LoRAConfig {
fn default() -> Self {
Self { rank: 1, dim: 32, scale: 8, frozen: true }
}
}
/// Low-rank adapter: delta(x) = B(Ax) in i8 fixed point, with an i32
/// scratch buffer reused across calls (no per-call allocation).
pub struct MicroLoRA {
    // A matrix, dim x rank, indexed a_weights[d * rank + r].
    a_weights: HVec<i8, { MAX_LORA_DIM * MAX_LORA_RANK }>,
    // B matrix, rank x dim, indexed b_weights[r * dim + d].
    b_weights: HVec<i8, { MAX_LORA_RANK * MAX_LORA_DIM }>,
    config: LoRAConfig,
    // Scratch for the down-projected vector computed in `apply`.
    intermediate: [i32; MAX_LORA_RANK],
}
impl MicroLoRA {
pub fn new(config: LoRAConfig, seed: u32) -> crate::Result<Self> {
if config.rank > MAX_LORA_RANK || config.dim > MAX_LORA_DIM {
return Err(crate::Error::InvalidModel("LoRA dimensions too large"));
}
let mut a_weights = HVec::new();
let mut b_weights = HVec::new();
let mut rng = seed;
for _ in 0..(config.dim * config.rank) {
rng = rng.wrapping_mul(1103515245).wrapping_add(12345);
a_weights.push((((rng >> 16) & 0x3F) as i16 - 32) as i8)
.map_err(|_| crate::Error::BufferOverflow)?;
}
for _ in 0..(config.rank * config.dim) {
b_weights.push(0).map_err(|_| crate::Error::BufferOverflow)?;
}
Ok(Self { a_weights, b_weights, config, intermediate: [0; MAX_LORA_RANK] })
}
pub fn from_weights(config: LoRAConfig, a: &[i8], b: &[i8]) -> crate::Result<Self> {
let mut a_vec = HVec::new();
let mut b_vec = HVec::new();
for &w in a { a_vec.push(w).map_err(|_| crate::Error::BufferOverflow)?; }
for &w in b { b_vec.push(w).map_err(|_| crate::Error::BufferOverflow)?; }
Ok(Self { a_weights: a_vec, b_weights: b_vec, config, intermediate: [0; MAX_LORA_RANK] })
}
#[inline]
pub fn apply(&mut self, input: &[i8], output: &mut [i32]) {
let (dim, rank, scale) = (self.config.dim, self.config.rank, self.config.scale as i32);
for r in 0..rank {
let mut sum: i32 = 0;
for d in 0..dim {
sum += input[d] as i32 * self.a_weights[d * rank + r] as i32;
}
self.intermediate[r] = sum >> 4;
}
for d in 0..dim {
let mut sum: i32 = 0;
for r in 0..rank {
sum += self.intermediate[r] * self.b_weights[r * dim + d] as i32;
}
output[d] += (sum * scale) >> 8;
}
}
pub fn memory_size(&self) -> usize { self.a_weights.len() + self.b_weights.len() }
}
/// Per-layer collection of optional adapters.
pub struct LoRAStack<const NUM_LAYERS: usize> {
    // One optional adapter slot per layer.
    adapters: [Option<MicroLoRA>; NUM_LAYERS],
    // Number of occupied slots.
    active_count: usize,
}
impl<const NUM_LAYERS: usize> LoRAStack<NUM_LAYERS> {
    /// Empty stack with no adapters attached.
    pub fn new() -> Self {
        Self { adapters: core::array::from_fn(|_| None), active_count: 0 }
    }
    /// Attach (or replace) the adapter for `layer`.
    ///
    /// Fixed: `active_count` now only grows when the slot was previously
    /// empty; the original incremented on every call, so replacing an
    /// existing adapter inflated the count.
    pub fn add_adapter(&mut self, layer: usize, adapter: MicroLoRA) -> crate::Result<()> {
        if layer >= NUM_LAYERS { return Err(crate::Error::InvalidModel("Layer out of range")); }
        if self.adapters[layer].replace(adapter).is_none() {
            self.active_count += 1;
        }
        Ok(())
    }
    /// Mutable access to the adapter for `layer`, if present.
    pub fn get(&mut self, layer: usize) -> Option<&mut MicroLoRA> {
        self.adapters.get_mut(layer).and_then(|a| a.as_mut())
    }
    /// Sum of `memory_size()` over all attached adapters.
    pub fn total_memory(&self) -> usize {
        self.adapters.iter().filter_map(|a| a.as_ref()).map(|a| a.memory_size()).sum()
    }
}
impl<const N: usize> Default for LoRAStack<N> {
fn default() -> Self { Self::new() }
}

View File

@@ -0,0 +1,22 @@
//! Advanced Optimizations for ESP32
//!
//! - Binary quantization (32x compression)
//! - Product quantization (8-32x compression)
//! - Lookup tables (fixed-point softmax)
//! - MicroLoRA (on-device adaptation)
//! - Sparse attention patterns
//! - MinCut-inspired pruning
// One submodule per optimization technique.
pub mod binary_quant;
pub mod product_quant;
pub mod lookup_tables;
pub mod micro_lora;
pub mod sparse_attention;
pub mod pruning;
// Flat re-exports so callers don't have to name the submodules.
pub use binary_quant::{BinaryVector, BinaryEmbedding, hamming_distance, hamming_similarity, popcount8};
pub use product_quant::{ProductQuantizer, PQCode, PQConfig, PQDistanceTable};
pub use lookup_tables::{SoftmaxLUT, ExpLUT, DistanceLUT, SOFTMAX_LUT, EXP_LUT, DISTANCE_LUT};
pub use micro_lora::{MicroLoRA, LoRAConfig, LoRAStack};
pub use sparse_attention::{SparseAttention, AttentionPattern, AttentionPatternCache};
pub use pruning::{LayerPruner, PruningConfig, PruningMask, PruningStats, MinCutScorer};

View File

@@ -0,0 +1,149 @@
//! Product Quantization - 8-32x Memory Compression
use heapless::Vec as HVec;
pub const MAX_SUBQUANTIZERS: usize = 8;
pub const MAX_CODEBOOK_SIZE: usize = 16;
/// Product-quantization geometry: `dim` is split into `num_subquantizers`
/// contiguous sub-vectors of `subvec_dim` elements, each encoded against a
/// codebook of `codebook_size` centroids.
#[derive(Debug, Clone, Copy, Default)]
pub struct PQConfig {
    pub num_subquantizers: usize,
    pub codebook_size: usize,
    // Invariant (unchecked): subvec_dim * num_subquantizers == dim.
    pub subvec_dim: usize,
    pub dim: usize,
}
impl PQConfig {
    /// Convenience constructor: 16-entry codebooks (4-bit codes) with the
    /// dimension split evenly across subquantizers.
    ///
    /// NOTE(review): `dim / num_sub` panics when `num_sub == 0` and silently
    /// truncates when `dim` is not divisible by `num_sub` — presumably
    /// callers always pass an exact divisor; confirm upstream.
    pub fn new(dim: usize, num_sub: usize) -> Self {
        let subvec_dim = dim / num_sub;
        Self {
            dim,
            num_subquantizers: num_sub,
            codebook_size: 16,
            subvec_dim,
        }
    }
}
/// Encoded vector: one centroid index (0..codebook_size) per subquantizer.
#[derive(Debug, Clone)]
pub struct PQCode<const M: usize> {
    pub codes: HVec<u8, M>,
}
impl<const M: usize> PQCode<M> {
pub fn from_codes(codes: &[u8]) -> crate::Result<Self> {
let mut code_vec = HVec::new();
for &c in codes {
code_vec.push(c).map_err(|_| crate::Error::BufferOverflow)?;
}
Ok(Self { codes: code_vec })
}
#[inline]
pub fn get_code(&self, i: usize) -> u8 {
self.codes.get(i).copied().unwrap_or(0)
}
}
/// Codebook storage for product quantization.
///
/// NOTE(review): capacity is hard-coded to 8*16*8 = 1024 i8 entries; the
/// M/K/D const parameters do not size the buffer — confirm they are kept
/// consistent with `config` by callers.
pub struct ProductQuantizer<const M: usize, const K: usize, const D: usize> {
    // Layout: codebooks[(m * codebook_size + k) * subvec_dim + d],
    // see `get_centroid`.
    codebooks: HVec<i8, { 8 * 16 * 8 }>,
    config: PQConfig,
}
impl<const M: usize, const K: usize, const D: usize> ProductQuantizer<M, K, D> {
    /// Seeded-random codebooks (LCG), centroid values spanning the full i8
    /// range.
    ///
    /// NOTE(review): `config` is only validated indirectly — an oversized
    /// total fails with `BufferOverflow` on push; no check that it matches
    /// the M/K/D const parameters.
    pub fn random(config: PQConfig, seed: u32) -> crate::Result<Self> {
        let total = config.num_subquantizers * config.codebook_size * config.subvec_dim;
        let mut codebooks = HVec::new();
        let mut rng = seed;
        for _ in 0..total {
            rng = rng.wrapping_mul(1103515245).wrapping_add(12345);
            // Map the 0..=255 byte to the full i8 range -128..=127.
            let val = (((rng >> 16) & 0xFF) as i16 - 128) as i8;
            codebooks.push(val).map_err(|_| crate::Error::BufferOverflow)?;
        }
        Ok(Self { codebooks, config })
    }
    /// Slice of centroid `k` of subquantizer `m` (`subvec_dim` values).
    /// Panics if the indices exceed the stored codebooks.
    #[inline]
    fn get_centroid(&self, m: usize, k: usize) -> &[i8] {
        let d = self.config.subvec_dim;
        let kk = self.config.codebook_size;
        let start = m * kk * d + k * d;
        &self.codebooks[start..start + d]
    }
    /// Encode `vector` as the nearest centroid (L2) per subquantizer.
    pub fn encode(&self, vector: &[i8]) -> crate::Result<PQCode<M>> {
        if vector.len() != self.config.dim {
            return Err(crate::Error::InvalidModel("Dimension mismatch"));
        }
        let mut codes = HVec::new();
        let d = self.config.subvec_dim;
        for m in 0..self.config.num_subquantizers {
            let subvec = &vector[m * d..(m + 1) * d];
            // Linear nearest-centroid search over the (small) codebook.
            let mut best_code = 0u8;
            let mut best_dist = i32::MAX;
            for k in 0..self.config.codebook_size {
                let dist = Self::l2_squared(subvec, self.get_centroid(m, k));
                if dist < best_dist {
                    best_dist = dist;
                    best_code = k as u8;
                }
            }
            codes.push(best_code).map_err(|_| crate::Error::BufferOverflow)?;
        }
        Ok(PQCode { codes })
    }
    /// L2 distance between a raw query and an encoded vector, measured
    /// against the code's centroids (asymmetric distance computation).
    ///
    /// NOTE(review): unlike `encode`, the query length is not validated —
    /// a short query panics on slicing; confirm callers guarantee length.
    pub fn asymmetric_distance(&self, query: &[i8], code: &PQCode<M>) -> i32 {
        let d = self.config.subvec_dim;
        let mut total: i32 = 0;
        for m in 0..self.config.num_subquantizers {
            let query_sub = &query[m * d..(m + 1) * d];
            let k = code.get_code(m) as usize;
            total += Self::l2_squared(query_sub, self.get_centroid(m, k));
        }
        total
    }
    /// Precompute all query-to-centroid distances for fast repeated scans.
    ///
    /// NOTE(review): `PQDistanceTable` holds at most 128 entries indexed by
    /// m * K + k — panics if the config/K combination exceeds that.
    pub fn build_distance_table(&self, query: &[i8]) -> PQDistanceTable<M, K> {
        let mut table = PQDistanceTable::new();
        let d = self.config.subvec_dim;
        for m in 0..self.config.num_subquantizers {
            let query_sub = &query[m * d..(m + 1) * d];
            for k in 0..self.config.codebook_size {
                let dist = Self::l2_squared(query_sub, self.get_centroid(m, k));
                table.set(m, k, dist);
            }
        }
        table
    }
    /// Integer squared L2 distance over the zipped elements.
    #[inline]
    fn l2_squared(a: &[i8], b: &[i8]) -> i32 {
        a.iter().zip(b.iter()).map(|(&x, &y)| {
            let diff = x as i32 - y as i32;
            diff * diff
        }).sum()
    }
    /// Input i8 elements per stored code byte.
    pub fn compression_ratio(&self) -> f32 {
        self.config.dim as f32 / self.config.num_subquantizers as f32
    }
}
/// Precomputed query-to-centroid distances: M subquantizers x K centroids.
///
/// NOTE(review): backing storage is fixed at 128 entries regardless of M
/// and K; `get`/`set` index with m * K + k and panic if M * K > 128.
pub struct PQDistanceTable<const M: usize, const K: usize> {
    distances: [i32; 128],
}
impl<const M: usize, const K: usize> PQDistanceTable<M, K> {
    /// Zero-filled table.
    pub fn new() -> Self { Self { distances: [0; 128] } }
    /// Distance for subquantizer `m`, centroid `k`; panics if m * K + k >= 128.
    #[inline]
    pub fn get(&self, m: usize, k: usize) -> i32 { self.distances[m * K + k] }
    /// Store the distance for subquantizer `m`, centroid `k`.
    #[inline]
    pub fn set(&mut self, m: usize, k: usize, dist: i32) { self.distances[m * K + k] = dist; }
}
impl<const M: usize, const K: usize> Default for PQDistanceTable<M, K> {
fn default() -> Self { Self::new() }
}

View File

@@ -0,0 +1,167 @@
//! MinCut-Inspired Layer Pruning
use heapless::Vec as HVec;
pub const MAX_PRUNING_UNITS: usize = 64;
pub const MAX_MASK_WORDS: usize = 64;
/// Parameters controlling how aggressively a layer is pruned.
#[derive(Debug, Clone, Copy)]
pub struct PruningConfig {
    // Fraction of units to remove, in [0, 1].
    pub target_sparsity: f32,
    // Presumably a minimum importance floor — TODO confirm: not read
    // anywhere in this file.
    pub importance_threshold: i8,
    // Presumably selects structured (whole-unit) pruning — TODO confirm:
    // not read anywhere in this file.
    pub structured: bool,
}
impl Default for PruningConfig {
fn default() -> Self {
Self { target_sparsity: 0.5, importance_threshold: 8, structured: true }
}
}
/// Keep/prune bitmask: a set bit means the unit is kept.
///
/// NOTE(review): the `N` parameter is unused — capacity comes from
/// MAX_MASK_WORDS; confirm whether N was meant to size the mask.
#[derive(Debug, Clone)]
pub struct PruningMask<const N: usize> {
    // One bit per unit, packed LSB-first into u32 words.
    pub mask: HVec<u32, MAX_MASK_WORDS>,
    // Number of valid bits.
    pub size: usize,
    // How many bits have been cleared via `prune`.
    pub pruned_count: usize,
}
impl<const N: usize> PruningMask<N> {
    /// All-kept mask for `size` units; unused high bits in the last word
    /// stay 0 so popcounts over the words remain accurate.
    pub fn new(size: usize) -> crate::Result<Self> {
        let num_words = (size + 31) / 32;
        let mut mask = HVec::new();
        for i in 0..num_words {
            // Last partial word: set only the valid low bits.
            let bits = if i == num_words - 1 && size % 32 != 0 {
                (1u32 << (size % 32)) - 1
            } else {
                u32::MAX
            };
            mask.push(bits).map_err(|_| crate::Error::BufferOverflow)?;
        }
        Ok(Self { mask, size, pruned_count: 0 })
    }
    /// True if the unit's bit is still set (out-of-range reads as pruned).
    #[inline]
    pub fn is_kept(&self, idx: usize) -> bool {
        let word = idx / 32;
        let bit = idx % 32;
        (self.mask.get(word).copied().unwrap_or(0) >> bit) & 1 == 1
    }
    /// Clear a unit's bit. Double-prunes and out-of-range indices are
    /// no-ops, so `pruned_count` stays accurate.
    pub fn prune(&mut self, idx: usize) {
        if idx < self.size && self.is_kept(idx) {
            let word = idx / 32;
            let bit = idx % 32;
            if let Some(w) = self.mask.get_mut(word) {
                *w &= !(1 << bit);
                self.pruned_count += 1;
            }
        }
    }
    /// Fraction of units pruned so far.
    ///
    /// Fixed: returns 0.0 for an empty mask; the original divided by zero
    /// and produced NaN when `size == 0`.
    pub fn sparsity(&self) -> f32 {
        if self.size == 0 {
            return 0.0;
        }
        self.pruned_count as f32 / self.size as f32
    }
}
/// Magnitude-based pruner for a single layer.
pub struct LayerPruner {
    config: PruningConfig,
    // |weight| per unit, capped at MAX_PRUNING_UNITS entries.
    importance_scores: HVec<i16, MAX_PRUNING_UNITS>,
}
impl LayerPruner {
    /// Pruner with no importance scores computed yet.
    pub fn new(config: PruningConfig) -> Self {
        Self { config, importance_scores: HVec::new() }
    }
    /// Importance = |weight| (magnitude pruning), computed over at most the
    /// first MAX_PRUNING_UNITS weights.
    pub fn compute_magnitude_importance(&mut self, weights: &[i8]) {
        self.importance_scores.clear();
        for &w in weights.iter().take(MAX_PRUNING_UNITS) {
            let _ = self.importance_scores.push((w as i16).abs());
        }
    }
    /// Build a mask with the lowest-importance units pruned so as to
    /// approach the configured target sparsity.
    pub fn create_mask<const N: usize>(&self, size: usize) -> crate::Result<PruningMask<N>> {
        let mut mask = PruningMask::new(size)?;
        let threshold = self.compute_threshold(size);
        for (idx, &score) in self.importance_scores.iter().enumerate() {
            if score < threshold { mask.prune(idx); }
        }
        Ok(mask)
    }
    /// Threshold = the k-th smallest importance score, where k is the
    /// target number of pruned units.
    fn compute_threshold(&self, size: usize) -> i16 {
        let target = (size as f32 * self.config.target_sparsity) as usize;
        if target == 0 || self.importance_scores.is_empty() { return 0; }
        let mut sorted: HVec<i16, MAX_PRUNING_UNITS> = self.importance_scores.clone();
        // Fixed: core's in-place slice sort (no allocator needed, still
        // no_std-friendly) replaces the original O(n^2) bubble sort; the
        // resulting order — and hence the selected k-th value — is identical.
        sorted.sort_unstable();
        sorted.get(target.min(sorted.len() - 1)).copied().unwrap_or(0)
    }
    /// Zero out every weight whose mask bit was cleared.
    pub fn apply_mask<const N: usize>(&self, weights: &mut [i8], mask: &PruningMask<N>) {
        for (idx, weight) in weights.iter_mut().enumerate() {
            if !mask.is_kept(idx) { *weight = 0; }
        }
    }
}
/// Summary of a pruning pass.
#[derive(Debug, Clone)]
pub struct PruningStats {
    pub total_weights: usize,
    pub pruned_weights: usize,
    // pruned_weights / total_weights.
    pub sparsity: f32,
    // Presumably bytes saved — TODO confirm units; nothing in this file
    // fills this struct in.
    pub memory_saved: usize,
}
/// Scores edge importance with a max-flow/min-cut intuition: an edge only
/// matters up to the weaker (bottleneck) of its endpoints' total flows.
pub struct MinCutScorer {
    // Per input unit: sum of |w| over its outgoing edges.
    input_flow: HVec<i32, MAX_PRUNING_UNITS>,
    // Per output unit: sum of |w| over its incoming edges.
    output_flow: HVec<i32, MAX_PRUNING_UNITS>,
}
impl MinCutScorer {
    /// Scorer with empty flow accumulators.
    pub fn new() -> Self {
        Self { input_flow: HVec::new(), output_flow: HVec::new() }
    }
    /// Edge importance for a row-major `output_dim x input_dim` weight
    /// matrix: importance(edge) = (|w| * min(input_flow, output_flow)) >> 10.
    ///
    /// NOTE(review): the result holds at most MAX_PRUNING_UNITS (64)
    /// entries, so for matrices with more edges only the first edges in
    /// row-major order get scored — confirm this truncation is intended.
    pub fn compute_edge_importance(&mut self, weights: &[i8], input_dim: usize, output_dim: usize)
        -> HVec<i16, MAX_PRUNING_UNITS>
    {
        self.input_flow.clear();
        self.output_flow.clear();
        // Column sums: total absolute weight leaving each input unit.
        for in_idx in 0..input_dim.min(MAX_PRUNING_UNITS) {
            let flow: i32 = (0..output_dim).map(|out_idx| {
                let w_idx = out_idx * input_dim + in_idx;
                if w_idx < weights.len() { (weights[w_idx] as i32).abs() } else { 0 }
            }).sum();
            let _ = self.input_flow.push(flow);
        }
        // Row sums: total absolute weight entering each output unit.
        for out_idx in 0..output_dim.min(MAX_PRUNING_UNITS) {
            let flow: i32 = (0..input_dim).map(|in_idx| {
                let w_idx = out_idx * input_dim + in_idx;
                if w_idx < weights.len() { (weights[w_idx] as i32).abs() } else { 0 }
            }).sum();
            let _ = self.output_flow.push(flow);
        }
        // Score each edge by its magnitude times the bottleneck flow.
        let mut importance: HVec<i16, MAX_PRUNING_UNITS> = HVec::new();
        for out_idx in 0..output_dim.min(self.output_flow.len()) {
            for in_idx in 0..input_dim.min(self.input_flow.len()) {
                let w_idx = out_idx * input_dim + in_idx;
                if w_idx < weights.len() && importance.len() < MAX_PRUNING_UNITS {
                    let w = (weights[w_idx] as i32).abs();
                    let bottleneck = self.input_flow[in_idx].min(self.output_flow[out_idx]);
                    // >> 10 rescales the i32 product back into i16 range.
                    let _ = importance.push(((w * bottleneck) >> 10) as i16);
                }
            }
        }
        importance
    }
}
impl Default for MinCutScorer {
fn default() -> Self { Self::new() }
}

View File

@@ -0,0 +1,120 @@
//! Sparse Attention Patterns for ESP32
use heapless::Vec as HVec;
pub const MAX_SPARSE_SEQ: usize = 32;
/// Supported sparse-attention layouts. All are applied on top of a causal
/// (j <= i) constraint by `SparseAttention::build_mask`.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum AttentionPattern {
    // Every causal pair attends.
    Full,
    // Attend to the last `window_size` positions.
    SlidingWindow { window_size: usize },
    // Attend to every stride-th position plus the immediately previous token.
    Strided { stride: usize },
    // Local window plus strided global positions.
    Longformer { window_size: usize, stride: usize },
    // Attend only within the same fixed-size block.
    BlockDiagonal { block_size: usize },
    // Local window plus a prefix of always-visible global tokens.
    BigBird { window_size: usize, global_tokens: usize },
}
impl Default for AttentionPattern {
fn default() -> Self { Self::SlidingWindow { window_size: 4 } }
}
/// Precomputed causal sparse-attention mask for up to MAX_SPARSE_SEQ tokens.
pub struct SparseAttention {
    pattern: AttentionPattern,
    // Row i is a bitmask of key positions j that query i may attend to.
    mask_data: HVec<u32, MAX_SPARSE_SEQ>,
    seq_len: usize,
}
impl SparseAttention {
pub fn new(pattern: AttentionPattern, seq_len: usize) -> crate::Result<Self> {
if seq_len > MAX_SPARSE_SEQ { return Err(crate::Error::BufferOverflow); }
let mut sa = Self { pattern, mask_data: HVec::new(), seq_len };
sa.build_mask()?;
Ok(sa)
}
fn build_mask(&mut self) -> crate::Result<()> {
self.mask_data.clear();
for i in 0..self.seq_len {
let mut row_mask: u32 = 0;
for j in 0..self.seq_len {
if j <= i && self.should_attend(i, j) {
row_mask |= 1 << j;
}
}
self.mask_data.push(row_mask).map_err(|_| crate::Error::BufferOverflow)?;
}
Ok(())
}
fn should_attend(&self, i: usize, j: usize) -> bool {
match self.pattern {
AttentionPattern::Full => true,
AttentionPattern::SlidingWindow { window_size } => i.saturating_sub(window_size) <= j,
AttentionPattern::Strided { stride } => j % stride == 0 || i.saturating_sub(1) <= j,
AttentionPattern::Longformer { window_size, stride } =>
i.saturating_sub(window_size) <= j || j % stride == 0,
AttentionPattern::BlockDiagonal { block_size } => i / block_size == j / block_size,
AttentionPattern::BigBird { window_size, global_tokens } =>
i.saturating_sub(window_size) <= j || j < global_tokens,
}
}
#[inline]
pub fn should_attend_at(&self, i: usize, j: usize) -> bool {
if i >= self.seq_len || j >= self.seq_len { return false; }
(self.mask_data[i] >> j) & 1 == 1
}
#[inline]
pub fn get_mask_row(&self, i: usize) -> u32 {
self.mask_data.get(i).copied().unwrap_or(0)
}
pub fn sparse_qk(&self, query: &[i8], keys: &[&[i8]], scores: &mut [i32], query_pos: usize) {
let mask = self.get_mask_row(query_pos);
for (j, key) in keys.iter().enumerate() {
if (mask >> j) & 1 == 1 {
scores[j] = query.iter().zip(key.iter()).map(|(&q, &k)| q as i32 * k as i32).sum();
} else {
scores[j] = i32::MIN;
}
}
}
pub fn active_positions(&self) -> usize {
self.mask_data.iter().map(|m| m.count_ones() as usize).sum()
}
pub fn sparsity_ratio(&self) -> f32 {
let full = self.seq_len * (self.seq_len + 1) / 2;
self.active_positions() as f32 / full as f32
}
}
/// Masks precomputed for the sequence-length buckets 8/16/24/32.
pub struct AttentionPatternCache {
    // Index b covers sequence lengths (8*b, 8*(b+1)]; None if construction failed.
    patterns: [Option<SparseAttention>; 4],
}
impl AttentionPatternCache {
    /// Precompute sliding-window masks for the four supported bucket sizes
    /// (8, 16, 24, 32); a failed construction leaves that slot `None`.
    pub fn new_sliding(window: usize) -> Self {
        let pattern = AttentionPattern::SlidingWindow { window_size: window };
        let mut patterns: [Option<SparseAttention>; 4] = [None, None, None, None];
        for (slot, &len) in patterns.iter_mut().zip([8usize, 16, 24, 32].iter()) {
            *slot = SparseAttention::new(pattern, len).ok();
        }
        Self { patterns }
    }
    /// Smallest cached mask that covers `seq_len`; `None` when `seq_len` is
    /// 0 or larger than 32.
    pub fn get(&self, seq_len: usize) -> Option<&SparseAttention> {
        if seq_len == 0 || seq_len > 32 {
            return None;
        }
        // Bucket 0 serves 1..=8, bucket 1 serves 9..=16, and so on.
        let bucket = (seq_len - 1) / 8;
        self.patterns[bucket].as_ref()
    }
}