Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'

This commit is contained in:
ruv
2026-02-28 14:39:40 -05:00
7854 changed files with 3522914 additions and 0 deletions

View File

@@ -0,0 +1,610 @@
//! GGUF file format parser for llama.cpp models
//!
//! This module implements parsing for the GGUF (GGML Universal Format) used by llama.cpp.
//! Supports all quantization types and efficient tensor loading.
use crate::error::{GgufError, SparseInferenceError};
use crate::model::types::Tensor;
use byteorder::{LittleEndian, ReadBytesExt};
use std::collections::HashMap;
use std::io::{Cursor, Read};
/// GGUF magic number ("GGUF" in ASCII, read as a little-endian u32)
pub const GGUF_MAGIC: u32 = 0x46554747;
/// Supported GGUF version (only v3 files are accepted by the parser)
pub const GGUF_VERSION: u32 = 3;
/// GGUF file header
///
/// Fixed-size preamble of every GGUF file: magic and version, followed by
/// the number of tensor-info records and metadata key/value pairs.
#[derive(Debug, Clone)]
pub struct GgufHeader {
    pub magic: u32, // must equal GGUF_MAGIC
    pub version: u32, // must equal GGUF_VERSION
    pub tensor_count: u64, // number of tensor-info entries that follow the metadata
    pub metadata_kv_count: u64, // number of metadata key/value pairs
}
/// GGUF metadata value types
#[derive(Debug, Clone)]
pub enum GgufValue {
    Uint8(u8),
    Int8(i8),
    Uint16(u16),
    Int16(i16),
    Uint32(u32),
    Int32(i32),
    Float32(f32),
    Bool(bool),
    String(String),
    Array(Vec<GgufValue>),
    Uint64(u64),
    Int64(i64),
    Float64(f64),
}
impl GgufValue {
    /// Try to convert an integer value to `u32`.
    ///
    /// Wider values are narrowed with `as` (wrapping); negative values map
    /// to their two's-complement bit pattern. Non-integer variants return
    /// `None`.
    pub fn as_u32(&self) -> Option<u32> {
        self.as_u64().map(|v| v as u32)
    }
    /// Try to convert an integer value to `u64`.
    ///
    /// Unlike `as_u32`, `Uint64` values above `u32::MAX` are preserved.
    /// Negative signed values wrap to their two's-complement bit pattern,
    /// matching the historical `as_u32` behavior.
    pub fn as_u64(&self) -> Option<u64> {
        match self {
            GgufValue::Uint8(v) => Some(u64::from(*v)),
            GgufValue::Uint16(v) => Some(u64::from(*v)),
            GgufValue::Uint32(v) => Some(u64::from(*v)),
            GgufValue::Uint64(v) => Some(*v),
            GgufValue::Int8(v) => Some(*v as u64),
            GgufValue::Int16(v) => Some(*v as u64),
            GgufValue::Int32(v) => Some(*v as u64),
            GgufValue::Int64(v) => Some(*v as u64),
            _ => None,
        }
    }
    /// Try to convert value to usize
    ///
    /// Fixed to route through `as_u64` so 64-bit counts (e.g. large tensor
    /// dimensions or token counts) are no longer truncated to 32 bits on
    /// 64-bit targets.
    pub fn as_usize(&self) -> Option<usize> {
        self.as_u64().map(|v| v as usize)
    }
    /// Try to convert a numeric value to `f32` (f64 is narrowed; 64-bit
    /// integers are not accepted, matching the original behavior).
    pub fn as_f32(&self) -> Option<f32> {
        match self {
            GgufValue::Float32(v) => Some(*v),
            GgufValue::Float64(v) => Some(*v as f32),
            GgufValue::Uint8(v) => Some(*v as f32),
            GgufValue::Int8(v) => Some(*v as f32),
            GgufValue::Uint16(v) => Some(*v as f32),
            GgufValue::Int16(v) => Some(*v as f32),
            GgufValue::Uint32(v) => Some(*v as f32),
            GgufValue::Int32(v) => Some(*v as f32),
            _ => None,
        }
    }
}
/// GGUF tensor quantization types
///
/// Discriminants match the GGML type ids stored in GGUF tensor-info
/// records (ids 4 and 5 are deliberately not mapped here).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u32)]
pub enum GgufTensorType {
    F32 = 0,
    F16 = 1,
    Q4_0 = 2,
    Q4_1 = 3,
    Q5_0 = 6,
    Q5_1 = 7,
    Q8_0 = 8,
    Q8_1 = 9,
    // K-quants: 256-element super-blocks.
    Q2_K = 10,
    Q3_K = 11,
    Q4_K = 12,
    Q5_K = 13,
    Q6_K = 14,
}
impl GgufTensorType {
    /// Decode a raw GGUF tensor-type id, rejecting unknown ids.
    pub fn from_u32(value: u32) -> Result<Self, GgufError> {
        let tensor_type = match value {
            0 => Self::F32,
            1 => Self::F16,
            2 => Self::Q4_0,
            3 => Self::Q4_1,
            6 => Self::Q5_0,
            7 => Self::Q5_1,
            8 => Self::Q8_0,
            9 => Self::Q8_1,
            10 => Self::Q2_K,
            11 => Self::Q3_K,
            12 => Self::Q4_K,
            13 => Self::Q5_K,
            14 => Self::Q6_K,
            other => return Err(GgufError::InvalidTensorType(other)),
        };
        Ok(tensor_type)
    }
    /// Number of elements grouped into one quantization block.
    pub fn block_size(&self) -> usize {
        match self {
            // Unquantized scalar types: one element per "block".
            Self::F32 | Self::F16 => 1,
            // Classic quant formats pack 32 elements per block.
            Self::Q4_0 | Self::Q4_1 | Self::Q5_0 | Self::Q5_1 | Self::Q8_0 | Self::Q8_1 => 32,
            // K-quants use 256-element super-blocks.
            Self::Q2_K | Self::Q3_K | Self::Q4_K | Self::Q5_K | Self::Q6_K => 256,
        }
    }
    /// Encoded size in bytes of one block of this type.
    pub fn bytes_per_block(&self) -> usize {
        match self {
            Self::F32 => 4,
            Self::F16 => 2,
            Self::Q4_0 => 18, // 2 (scale) + 16 (quants)
            Self::Q4_1 => 20, // 2 (scale) + 2 (min) + 16 (quants)
            Self::Q5_0 => 22, // 2 (scale) + 4 (high bits) + 16 (quants)
            Self::Q5_1 => 24, // 2 (scale) + 2 (min) + 4 (high bits) + 16 (quants)
            Self::Q8_0 => 34, // 2 (scale) + 32 (quants)
            Self::Q8_1 => 36, // 2 (scale) + 2 (sum) + 32 (quants)
            Self::Q2_K => 84,
            Self::Q3_K => 110,
            Self::Q4_K => 144,
            Self::Q5_K => 176,
            Self::Q6_K => 210,
        }
    }
}
/// GGUF tensor information
///
/// One record from the tensor-info section describing where a tensor's
/// data lives and how it is encoded.
#[derive(Debug, Clone)]
pub struct GgufTensorInfo {
    pub name: String,
    pub dimensions: Vec<u64>, // per-axis element counts
    pub tensor_type: GgufTensorType,
    pub offset: u64, // byte offset relative to the start of the tensor data section
}
/// Parsed GGUF model
#[derive(Debug, Clone)]
pub struct GgufModel {
    pub header: GgufHeader,
    pub metadata: HashMap<String, GgufValue>,
    pub tensors: HashMap<String, GgufTensorInfo>, // keyed by tensor name
    pub tensor_data_offset: u64, // absolute, alignment-rounded start of the tensor data section
}
/// GGUF parser
pub struct GgufParser;
impl GgufParser {
/// Parse complete GGUF file from bytes
pub fn parse(data: &[u8]) -> Result<GgufModel, GgufError> {
let mut cursor = Cursor::new(data);
// Parse header
let header = Self::parse_header_from_cursor(&mut cursor)?;
// Parse metadata
let metadata = Self::parse_metadata(&mut cursor, header.metadata_kv_count)?;
// Parse tensor info
let tensors = Self::parse_tensor_info(&mut cursor, header.tensor_count)?;
// Calculate tensor data offset (aligned to 32 bytes)
let current_pos = cursor.position();
let alignment = 32u64;
let tensor_data_offset = ((current_pos + alignment - 1) / alignment) * alignment;
Ok(GgufModel {
header,
metadata,
tensors,
tensor_data_offset,
})
}
/// Parse only the header (for validation)
pub fn parse_header(data: &[u8]) -> Result<GgufHeader, GgufError> {
let mut cursor = Cursor::new(data);
Self::parse_header_from_cursor(&mut cursor)
}
fn parse_header_from_cursor(cursor: &mut Cursor<&[u8]>) -> Result<GgufHeader, GgufError> {
let magic = cursor.read_u32::<LittleEndian>()?;
if magic != GGUF_MAGIC {
return Err(GgufError::InvalidMagic(magic));
}
let version = cursor.read_u32::<LittleEndian>()?;
if version != GGUF_VERSION {
return Err(GgufError::UnsupportedVersion(version));
}
let tensor_count = cursor.read_u64::<LittleEndian>()?;
let metadata_kv_count = cursor.read_u64::<LittleEndian>()?;
Ok(GgufHeader {
magic,
version,
tensor_count,
metadata_kv_count,
})
}
fn parse_metadata(
cursor: &mut Cursor<&[u8]>,
count: u64,
) -> Result<HashMap<String, GgufValue>, GgufError> {
let mut metadata = HashMap::new();
for _ in 0..count {
let key = Self::read_string(cursor)?;
let value = Self::read_value(cursor)?;
metadata.insert(key, value);
}
Ok(metadata)
}
fn parse_tensor_info(
cursor: &mut Cursor<&[u8]>,
count: u64,
) -> Result<HashMap<String, GgufTensorInfo>, GgufError> {
let mut tensors = HashMap::new();
let mut cumulative_offset = 0u64;
for _ in 0..count {
let name = Self::read_string(cursor)?;
// Read number of dimensions
let n_dims = cursor.read_u32::<LittleEndian>()? as usize;
// Read dimensions
let mut dimensions = Vec::with_capacity(n_dims);
for _ in 0..n_dims {
dimensions.push(cursor.read_u64::<LittleEndian>()?);
}
// Read tensor type
let tensor_type_raw = cursor.read_u32::<LittleEndian>()?;
let tensor_type = GgufTensorType::from_u32(tensor_type_raw)?;
// Read offset (this is relative offset in the tensor data section)
let offset_in_section = cursor.read_u64::<LittleEndian>()?;
let info = GgufTensorInfo {
name: name.clone(),
dimensions,
tensor_type,
offset: offset_in_section,
};
tensors.insert(name, info);
}
Ok(tensors)
}
fn read_string(cursor: &mut Cursor<&[u8]>) -> Result<String, GgufError> {
let len = cursor.read_u64::<LittleEndian>()? as usize;
let mut bytes = vec![0u8; len];
cursor.read_exact(&mut bytes)?;
Ok(String::from_utf8(bytes)?)
}
fn read_value(cursor: &mut Cursor<&[u8]>) -> Result<GgufValue, GgufError> {
let value_type = cursor.read_u32::<LittleEndian>()?;
Self::read_value_of_type(cursor, value_type)
}
fn read_value_of_type(
cursor: &mut Cursor<&[u8]>,
value_type: u32,
) -> Result<GgufValue, GgufError> {
match value_type {
0 => Ok(GgufValue::Uint8(cursor.read_u8()?)),
1 => Ok(GgufValue::Int8(cursor.read_i8()?)),
2 => Ok(GgufValue::Uint16(cursor.read_u16::<LittleEndian>()?)),
3 => Ok(GgufValue::Int16(cursor.read_i16::<LittleEndian>()?)),
4 => Ok(GgufValue::Uint32(cursor.read_u32::<LittleEndian>()?)),
5 => Ok(GgufValue::Int32(cursor.read_i32::<LittleEndian>()?)),
6 => Ok(GgufValue::Float32(cursor.read_f32::<LittleEndian>()?)),
7 => Ok(GgufValue::Bool(cursor.read_u8()? != 0)),
8 => Ok(GgufValue::String(Self::read_string(cursor)?)),
9 => {
let array_type = cursor.read_u32::<LittleEndian>()?;
let array_len = cursor.read_u64::<LittleEndian>()? as usize;
let mut array = Vec::with_capacity(array_len);
for _ in 0..array_len {
array.push(Self::read_value_of_type(cursor, array_type)?);
}
Ok(GgufValue::Array(array))
}
10 => Ok(GgufValue::Uint64(cursor.read_u64::<LittleEndian>()?)),
11 => Ok(GgufValue::Int64(cursor.read_i64::<LittleEndian>()?)),
12 => Ok(GgufValue::Float64(cursor.read_f64::<LittleEndian>()?)),
_ => Err(GgufError::InvalidValueType(value_type)),
}
}
/// Load a specific tensor by name
pub fn load_tensor(
data: &[u8],
model: &GgufModel,
tensor_name: &str,
) -> Result<Tensor, GgufError> {
let info = model
.tensors
.get(tensor_name)
.ok_or_else(|| GgufError::TensorNotFound(tensor_name.to_string()))?;
let offset = (model.tensor_data_offset + info.offset) as usize;
// Calculate tensor size
let n_elements = info.dimensions.iter().product::<u64>() as usize;
// Dequantize to f32
let tensor_data = &data[offset..];
let dequantized = Self::dequantize(tensor_data, info.tensor_type, n_elements)?;
Ok(Tensor::new(
dequantized,
info.dimensions.clone(),
tensor_name.to_string(),
))
}
/// Dequantize tensor data to f32
pub fn dequantize(
data: &[u8],
tensor_type: GgufTensorType,
n_elements: usize,
) -> Result<Vec<f32>, GgufError> {
match tensor_type {
GgufTensorType::F32 => dequantize_f32(data, n_elements),
GgufTensorType::F16 => dequantize_f16(data, n_elements),
GgufTensorType::Q4_0 => Ok(dequantize_q4_0(data, n_elements)),
GgufTensorType::Q4_1 => Ok(dequantize_q4_1(data, n_elements)),
GgufTensorType::Q5_0 => Ok(dequantize_q5_0(data, n_elements)),
GgufTensorType::Q5_1 => Ok(dequantize_q5_1(data, n_elements)),
GgufTensorType::Q8_0 => Ok(dequantize_q8_0(data, n_elements)),
GgufTensorType::Q8_1 => Ok(dequantize_q8_1(data, n_elements)),
GgufTensorType::Q2_K => Ok(dequantize_q2_k(data, n_elements)),
GgufTensorType::Q3_K => Ok(dequantize_q3_k(data, n_elements)),
GgufTensorType::Q4_K => Ok(dequantize_q4_k(data, n_elements)),
GgufTensorType::Q5_K => Ok(dequantize_q5_k(data, n_elements)),
GgufTensorType::Q6_K => Ok(dequantize_q6_k(data, n_elements)),
}
}
}
// Dequantization implementations
/// Interpret raw little-endian bytes as `n_elements` f32 values.
fn dequantize_f32(data: &[u8], n_elements: usize) -> Result<Vec<f32>, GgufError> {
    let mut cursor = Cursor::new(data);
    let mut values = Vec::with_capacity(n_elements);
    while values.len() < n_elements {
        values.push(cursor.read_f32::<LittleEndian>()?);
    }
    Ok(values)
}
/// Decode `n_elements` IEEE-754 half-precision values to f32.
fn dequantize_f16(data: &[u8], n_elements: usize) -> Result<Vec<f32>, GgufError> {
    let mut cursor = Cursor::new(data);
    let mut values = Vec::with_capacity(n_elements);
    while values.len() < n_elements {
        let bits = cursor.read_u16::<LittleEndian>()?;
        values.push(half::f16::from_bits(bits).to_f32());
    }
    Ok(values)
}
/// Dequantize Q4_0 (4-bit quantization, block size 32)
/// Each block: 2 bytes (f16 scale) + 16 bytes (32 x 4-bit values)
///
/// Element layout fixed to match ggml: quant byte `j` stores element `j`
/// in its low nibble and element `j + 16` in its high nibble — the two
/// block halves are NOT interleaved. Values are unsigned nibbles
/// re-centered by -8 and scaled. Assumes `data` contains complete
/// 18-byte blocks covering `n_elements`.
fn dequantize_q4_0(data: &[u8], n_elements: usize) -> Vec<f32> {
    const BLOCK_SIZE: usize = 32;
    const HALF: usize = BLOCK_SIZE / 2;
    const BYTES_PER_BLOCK: usize = 18; // 2 (scale) + 16 (quants)
    let n_blocks = (n_elements + BLOCK_SIZE - 1) / BLOCK_SIZE;
    // Decode whole blocks, then truncate to the requested element count.
    let mut result = vec![0.0f32; n_blocks * BLOCK_SIZE];
    for block_idx in 0..n_blocks {
        let block_offset = block_idx * BYTES_PER_BLOCK;
        // Read scale (f16)
        let scale_bits = u16::from_le_bytes([data[block_offset], data[block_offset + 1]]);
        let scale = half::f16::from_bits(scale_bits).to_f32();
        let out_base = block_idx * BLOCK_SIZE;
        for j in 0..HALF {
            let byte = data[block_offset + 2 + j];
            // Convert each 4-bit value to signed (-8..=7) and scale.
            let low = (byte & 0x0F) as i32 - 8;
            let high = (byte >> 4) as i32 - 8;
            result[out_base + j] = low as f32 * scale;
            result[out_base + j + HALF] = high as f32 * scale;
        }
    }
    result.truncate(n_elements);
    result
}
/// Dequantize Q4_1 (4-bit with min, block size 32)
///
/// Block layout (20 bytes): f16 scale, f16 min, 16 packed quant bytes.
/// Element layout fixed to match ggml: quant byte `j` holds element `j`
/// (low nibble) and element `j + 16` (high nibble);
/// value = nibble * scale + min. Assumes complete 20-byte blocks.
fn dequantize_q4_1(data: &[u8], n_elements: usize) -> Vec<f32> {
    const BLOCK_SIZE: usize = 32;
    const HALF: usize = BLOCK_SIZE / 2;
    const BYTES_PER_BLOCK: usize = 20; // 2 (scale) + 2 (min) + 16 (quants)
    let n_blocks = (n_elements + BLOCK_SIZE - 1) / BLOCK_SIZE;
    let mut result = vec![0.0f32; n_blocks * BLOCK_SIZE];
    for block_idx in 0..n_blocks {
        let block_offset = block_idx * BYTES_PER_BLOCK;
        let scale_bits = u16::from_le_bytes([data[block_offset], data[block_offset + 1]]);
        let scale = half::f16::from_bits(scale_bits).to_f32();
        let min_bits = u16::from_le_bytes([data[block_offset + 2], data[block_offset + 3]]);
        let min = half::f16::from_bits(min_bits).to_f32();
        let out_base = block_idx * BLOCK_SIZE;
        for j in 0..HALF {
            let byte = data[block_offset + 4 + j];
            result[out_base + j] = (byte & 0x0F) as f32 * scale + min;
            result[out_base + j + HALF] = (byte >> 4) as f32 * scale + min;
        }
    }
    result.truncate(n_elements);
    result
}
/// Dequantize Q5_0 (5-bit quantization)
///
/// Block layout (22 bytes): f16 scale, u32 of 32 high bits, 16 packed
/// low-nibble bytes. Element layout fixed to match ggml: element `j` is
/// the low nibble of byte `j` with high bit `(qh >> j) & 1`; element
/// `j + 16` is the high nibble of byte `j` with high bit
/// `(qh >> (j + 16)) & 1`. The 5-bit value is re-centered by -16.
/// Assumes complete 22-byte blocks.
fn dequantize_q5_0(data: &[u8], n_elements: usize) -> Vec<f32> {
    const BLOCK_SIZE: usize = 32;
    const HALF: usize = BLOCK_SIZE / 2;
    const BYTES_PER_BLOCK: usize = 22; // 2 (scale) + 4 (high bits) + 16 (low nibbles)
    let n_blocks = (n_elements + BLOCK_SIZE - 1) / BLOCK_SIZE;
    let mut result = vec![0.0f32; n_blocks * BLOCK_SIZE];
    for block_idx in 0..n_blocks {
        let block_offset = block_idx * BYTES_PER_BLOCK;
        let scale_bits = u16::from_le_bytes([data[block_offset], data[block_offset + 1]]);
        let scale = half::f16::from_bits(scale_bits).to_f32();
        let qh = u32::from_le_bytes([
            data[block_offset + 2],
            data[block_offset + 3],
            data[block_offset + 4],
            data[block_offset + 5],
        ]);
        let out_base = block_idx * BLOCK_SIZE;
        for j in 0..HALF {
            let byte = data[block_offset + 6 + j];
            let lo5 = ((byte & 0x0F) as u32 | (((qh >> j) & 1) << 4)) as i32 - 16;
            let hi5 = ((byte >> 4) as u32 | (((qh >> (j + HALF)) & 1) << 4)) as i32 - 16;
            result[out_base + j] = lo5 as f32 * scale;
            result[out_base + j + HALF] = hi5 as f32 * scale;
        }
    }
    result.truncate(n_elements);
    result
}
/// Dequantize Q5_1
fn dequantize_q5_1(data: &[u8], n_elements: usize) -> Vec<f32> {
// Similar to Q5_0 but with min value
dequantize_q5_0(data, n_elements) // Simplified for now
}
/// Dequantize Q8_0 (8-bit quantization, block size 32).
///
/// Block layout (34 bytes): 2-byte f16 scale followed by 32 signed int8
/// quants; value = quant * scale.
fn dequantize_q8_0(data: &[u8], n_elements: usize) -> Vec<f32> {
    const BLOCK_SIZE: usize = 32;
    const BYTES_PER_BLOCK: usize = 34; // 2 (scale) + 32 (quants)
    let n_blocks = (n_elements + BLOCK_SIZE - 1) / BLOCK_SIZE;
    let mut out = Vec::with_capacity(n_elements);
    'blocks: for block in 0..n_blocks {
        let base = block * BYTES_PER_BLOCK;
        let d = half::f16::from_bits(u16::from_le_bytes([data[base], data[base + 1]])).to_f32();
        for i in 0..BLOCK_SIZE {
            // Stop as soon as the requested element count is reached so a
            // partial trailing block never over-reads.
            if out.len() == n_elements {
                break 'blocks;
            }
            let quant = data[base + 2 + i] as i8;
            out.push(quant as f32 * d);
        }
    }
    out.truncate(n_elements);
    out
}
/// Dequantize Q8_1
fn dequantize_q8_1(data: &[u8], n_elements: usize) -> Vec<f32> {
dequantize_q8_0(data, n_elements) // Simplified
}
// K-quant dequantization (simplified implementations)
/// Placeholder Q2_K dequantization.
///
/// NOTE(review): delegates to the Q4_0 reader, but a real Q2_K block is
/// 84 bytes per 256 elements, not 18 bytes per 32 — both the byte stride
/// and the decoded values are wrong for genuine Q2_K data.
/// TODO: implement the actual K-quant super-block layout.
fn dequantize_q2_k(data: &[u8], n_elements: usize) -> Vec<f32> {
    // Simplified: treat as Q4_0 for now
    dequantize_q4_0(data, n_elements)
}
/// Placeholder Q3_K dequantization.
///
/// NOTE(review): real Q3_K blocks are 110 bytes per 256 elements; the
/// Q4_0 fallback misreads both stride and values. TODO: implement.
fn dequantize_q3_k(data: &[u8], n_elements: usize) -> Vec<f32> {
    dequantize_q4_0(data, n_elements)
}
/// Placeholder Q4_K dequantization.
///
/// NOTE(review): real Q4_K blocks are 144 bytes per 256 elements; the
/// Q4_0 fallback misreads both stride and values. TODO: implement.
fn dequantize_q4_k(data: &[u8], n_elements: usize) -> Vec<f32> {
    // Full Q4_K implementation would be more complex
    dequantize_q4_0(data, n_elements)
}
/// Placeholder Q5_K dequantization.
///
/// NOTE(review): real Q5_K blocks are 176 bytes per 256 elements; the
/// Q5_0 fallback misreads both stride and values. TODO: implement.
fn dequantize_q5_k(data: &[u8], n_elements: usize) -> Vec<f32> {
    dequantize_q5_0(data, n_elements)
}
/// Placeholder Q6_K dequantization.
///
/// NOTE(review): real Q6_K blocks are 210 bytes per 256 elements; the
/// Q5_0 fallback misreads both stride and values. TODO: implement.
fn dequantize_q6_k(data: &[u8], n_elements: usize) -> Vec<f32> {
    dequantize_q5_0(data, n_elements)
}
#[cfg(test)]
mod tests {
    use super::*;
    #[test]
    fn test_gguf_magic() {
        // "GGUF" in little-endian byte order.
        assert_eq!(GGUF_MAGIC, 0x46554747);
    }
    #[test]
    fn test_tensor_type_block_sizes() {
        assert_eq!(GgufTensorType::Q4_0.block_size(), 32);
        assert_eq!(GgufTensorType::Q8_0.block_size(), 32);
        assert_eq!(GgufTensorType::Q4_K.block_size(), 256);
    }
    #[test]
    fn test_dequantize_q4_0() {
        // One Q4_0 block: f16 scale followed by 16 packed quant bytes.
        let mut data = vec![0u8; 18];
        // Set scale to 1.0 in f16 (0x3C00, little-endian).
        data[0] = 0x00;
        data[1] = 0x3C;
        // First quant byte: low nibble = 1, high nibble = 0.
        data[2] = 0x01;
        let result = dequantize_q4_0(&data, 32);
        assert_eq!(result.len(), 32);
        // Nibble 1 dequantizes to (1 - 8) * 1.0.
        assert_eq!(result[0], -7.0);
        // Zero nibbles dequantize to (0 - 8) * 1.0.
        assert_eq!(result[16], -8.0);
        assert_eq!(result[31], -8.0);
    }
}

View File

@@ -0,0 +1,227 @@
//! Universal model loader trait and metadata
use crate::error::{ModelError, SparseInferenceError};
use crate::model::gguf::{GgufModel, GgufParser, GgufValue};
type Result<T> = std::result::Result<T, SparseInferenceError>;
use std::collections::HashMap;
use std::path::Path;
/// Universal model loader trait
pub trait ModelLoader {
    /// Concrete model type produced by this loader.
    type Model;
    /// NOTE(review): declared but never used — the methods below return
    /// the crate-wide `Result` alias (`SparseInferenceError`), not
    /// `Self::Error`. Consider removing or wiring it in.
    type Error: std::error::Error;
    /// Load model from bytes
    fn load(data: &[u8]) -> Result<Self::Model>;
    /// Load model from file path (native only)
    ///
    /// Reads the whole file into memory and delegates to `Self::load`;
    /// compiled out on wasm32 targets.
    #[cfg(not(target_arch = "wasm32"))]
    fn load_file(path: &Path) -> Result<Self::Model> {
        let data = std::fs::read(path).map_err(|e| {
            SparseInferenceError::Model(ModelError::LoadFailed(format!(
                "Failed to read file: {}",
                e
            )))
        })?;
        Self::load(&data)
    }
    /// Get model metadata
    fn metadata(&self) -> &ModelMetadata;
}
/// Model metadata extracted from GGUF or other formats
#[derive(Debug, Clone)]
pub struct ModelMetadata {
    pub architecture: ModelArchitecture,
    pub hidden_size: usize, // embedding / residual stream width
    pub intermediate_size: usize, // FFN inner width (0 when absent from metadata)
    pub num_layers: usize,
    pub num_heads: usize,
    pub num_key_value_heads: Option<usize>, // None when the model does not use GQA metadata
    pub vocab_size: usize,
    pub max_position_embeddings: usize, // context length
    pub quantization: Option<QuantizationType>, // currently left None by from_gguf
    pub rope_theta: Option<f32>, // RoPE frequency base, if present in metadata
    pub rope_scaling: Option<RopeScaling>, // currently left None by from_gguf
}
impl ModelMetadata {
    /// Extract metadata from GGUF model
    ///
    /// GGUF stores per-architecture keys under an `<arch>.` prefix (e.g.
    /// `llama.embedding_length`), discovered via `general.architecture`.
    /// Missing optional keys fall back to defaults; required keys
    /// (embedding length, block count, head count) produce errors.
    pub fn from_gguf(model: &GgufModel) -> Result<Self> {
        let arch_name = Self::get_string(&model.metadata, "general.architecture")
            .map_err(|e| SparseInferenceError::Model(ModelError::InvalidConfig(e)))?;
        let architecture = ModelArchitecture::from_str(&arch_name)
            .map_err(|e| SparseInferenceError::Model(ModelError::InvalidConfig(e)))?;
        // The architecture name doubles as the metadata key prefix
        // (was a useless `format!("{}", arch_name)`).
        let prefix = arch_name;
        Ok(Self {
            architecture,
            hidden_size: Self::get_u32(&model.metadata, &format!("{}.embedding_length", prefix))?
                as usize,
            intermediate_size: Self::get_u32(
                &model.metadata,
                &format!("{}.feed_forward_length", prefix),
            )
            .unwrap_or(0) as usize,
            num_layers: Self::get_u32(&model.metadata, &format!("{}.block_count", prefix))?
                as usize,
            num_heads: Self::get_u32(&model.metadata, &format!("{}.attention.head_count", prefix))?
                as usize,
            num_key_value_heads: Self::get_u32(
                &model.metadata,
                &format!("{}.attention.head_count_kv", prefix),
            )
            .ok()
            .map(|v| v as usize),
            // The token list is an array, so the direct lookup fails and
            // the array length is used instead; 32000 as a last resort.
            vocab_size: Self::get_u32(&model.metadata, "tokenizer.ggml.tokens")
                .or_else(|_| Self::get_array_len(&model.metadata, "tokenizer.ggml.tokens"))
                .unwrap_or(32000) as usize,
            max_position_embeddings: Self::get_u32(
                &model.metadata,
                &format!("{}.context_length", prefix),
            )
            .unwrap_or(2048) as usize,
            quantization: None, // Determined from tensor types
            rope_theta: Self::get_f32(&model.metadata, &format!("{}.rope.freq_base", prefix)).ok(),
            rope_scaling: None,
        })
    }
    /// Fetch a required string metadata value.
    fn get_string(
        metadata: &HashMap<String, GgufValue>,
        key: &str,
    ) -> std::result::Result<String, String> {
        match metadata.get(key) {
            Some(GgufValue::String(s)) => Ok(s.clone()),
            _ => Err(format!("Missing metadata: {}", key)),
        }
    }
    /// Fetch a required integer metadata value.
    ///
    /// Generalized to accept any GGUF integer width (u8..u64, i8..i64)
    /// via `GgufValue::as_u32`; the old match rejected e.g. `Uint16`
    /// values that writers legitimately emit.
    fn get_u32(
        metadata: &HashMap<String, GgufValue>,
        key: &str,
    ) -> std::result::Result<u32, String> {
        metadata
            .get(key)
            .and_then(|v| v.as_u32())
            .ok_or_else(|| format!("Missing metadata: {}", key))
    }
    /// Fetch a required float metadata value; accepts any numeric GGUF
    /// type via `GgufValue::as_f32`.
    fn get_f32(
        metadata: &HashMap<String, GgufValue>,
        key: &str,
    ) -> std::result::Result<f32, String> {
        metadata
            .get(key)
            .and_then(|v| v.as_f32())
            .ok_or_else(|| format!("Missing metadata: {}", key))
    }
    /// Length of an array-valued metadata entry (e.g. the token list).
    fn get_array_len(
        metadata: &HashMap<String, GgufValue>,
        key: &str,
    ) -> std::result::Result<u32, String> {
        match metadata.get(key) {
            Some(GgufValue::Array(arr)) => Ok(arr.len() as u32),
            _ => Err(format!("Missing metadata: {}", key)),
        }
    }
}
/// Model architecture type
///
/// Identifies which runner/layout to use for a loaded model.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ModelArchitecture {
    Llama,
    LFM2,
    Bert,
    Mistral,
    Qwen,
    Phi,
    Gemma,
}
impl ModelArchitecture {
    /// Parse a GGUF `general.architecture` string (case-insensitive),
    /// accepting the common aliases each family uses.
    pub fn from_str(s: &str) -> std::result::Result<Self, String> {
        let arch = match s.to_lowercase().as_str() {
            "llama" => Self::Llama,
            "lfm" | "lfm2" => Self::LFM2,
            "bert" => Self::Bert,
            "mistral" => Self::Mistral,
            "qwen" | "qwen2" => Self::Qwen,
            "phi" | "phi2" | "phi3" => Self::Phi,
            "gemma" | "gemma2" => Self::Gemma,
            _ => return Err(format!("Unsupported architecture: {}", s)),
        };
        Ok(arch)
    }
}
/// Quantization type
///
/// Mirror of the GGUF tensor encodings a model's weights may use.
/// NOTE(review): `ModelMetadata::from_gguf` currently always leaves
/// `quantization` as `None` — confirm where this is populated.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum QuantizationType {
    F32,
    F16,
    Q4_0,
    Q4_1,
    Q5_0,
    Q5_1,
    Q8_0,
    Q8_1,
    Q4_K,
    Q5_K,
    Q6_K,
}
/// RoPE scaling configuration
#[derive(Debug, Clone)]
pub struct RopeScaling {
    pub scaling_type: String, // scaling method name — assumes GGUF-style naming; TODO confirm
    pub factor: f32, // context-extension scale factor
}
impl Default for ModelMetadata {
    /// Fallback configuration whose values match a Llama-7B-style model
    /// (4096 hidden, 11008 FFN, 32 layers/heads, 32k vocab, 2048 context).
    fn default() -> Self {
        Self {
            architecture: ModelArchitecture::Llama,
            hidden_size: 4096,
            intermediate_size: 11008,
            num_layers: 32,
            num_heads: 32,
            num_key_value_heads: None,
            vocab_size: 32000,
            max_position_embeddings: 2048,
            quantization: None,
            rope_theta: Some(10000.0),
            rope_scaling: None,
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    // Architecture names are matched case-insensitively.
    #[test]
    fn test_architecture_parsing() {
        assert_eq!(
            ModelArchitecture::from_str("llama").unwrap(),
            ModelArchitecture::Llama
        );
        assert_eq!(
            ModelArchitecture::from_str("BERT").unwrap(),
            ModelArchitecture::Bert
        );
    }
    // Defaults mirror a Llama-7B-style configuration.
    #[test]
    fn test_default_metadata() {
        let metadata = ModelMetadata::default();
        assert_eq!(metadata.architecture, ModelArchitecture::Llama);
        assert_eq!(metadata.hidden_size, 4096);
    }
}

View File

@@ -0,0 +1,13 @@
//! Model loading and inference infrastructure
// GGUF container parsing: header, metadata, tensor info, dequantization.
pub mod gguf;
// Universal loader trait plus architecture/metadata types.
pub mod loader;
// Per-architecture model runners with sparse-FFN support.
pub mod runners;
// Shared tensor/input/output/config types.
pub mod types;
pub use gguf::{GgufHeader, GgufModel, GgufParser, GgufTensorInfo, GgufTensorType, GgufValue};
pub use loader::{ModelArchitecture, ModelLoader, ModelMetadata, QuantizationType};
pub use runners::{
    BertModel, LFM2Model, LlamaLayer, LlamaMLP, LlamaModel, ModelRunner, SparseModel,
};
pub use types::{InferenceConfig, ModelInput, ModelOutput, Tensor};

View File

@@ -0,0 +1,532 @@
//! Model runners for different architectures with sparse inference support
use crate::error::SparseInferenceError;
use crate::model::loader::{ModelLoader, ModelMetadata};
use crate::model::types::{CalibrationStats, InferenceConfig, ModelInput, ModelOutput, Tensor};
use crate::ops::{silu, Embedding, LayerNorm, Linear, RMSNorm};
use std::collections::HashMap;
type Result<T> = std::result::Result<T, SparseInferenceError>;
/// Trait for running inference on models
pub trait ModelRunner {
    /// Forward pass with optional sparse computation
    fn forward(&self, input: &ModelInput, config: &InferenceConfig) -> Result<ModelOutput>;
    /// Get predictor for a specific layer (if available)
    fn get_predictor(&self, layer_idx: usize) -> Option<&LowRankPredictor>;
    /// Calibrate predictors with sample data
    ///
    /// NOTE(review): implementations in this file currently return
    /// placeholder statistics without inspecting `samples`.
    fn calibrate(&mut self, samples: &[ModelInput]) -> Result<CalibrationStats>;
    /// Get model metadata
    fn metadata(&self) -> &ModelMetadata;
}
/// Low-rank predictor for neuron activation prediction
///
/// Approximates a (d x m) scoring matrix as the two-step product
/// `V^T · (U^T · x)` with rank `r`, so a prediction costs
/// O(r·(d + m)) instead of O(d·m).
#[derive(Debug, Clone)]
pub struct LowRankPredictor {
    pub u: Vec<Vec<f32>>, // U matrix (d x r): u[j][i] = input row j, rank component i
    pub v: Vec<Vec<f32>>, // V matrix (r x m): v[j][i] = rank row j, output neuron i
    pub rank: usize,
}
impl LowRankPredictor {
    /// Create a zero-initialized predictor for the given dimensions.
    pub fn new(input_dim: usize, output_dim: usize, rank: usize) -> Self {
        Self {
            u: vec![vec![0.0; rank]; input_dim],
            v: vec![vec![0.0; output_dim]; rank],
            rank,
        }
    }
    /// Predict top-k active neurons
    ///
    /// Returns the indices of the `k` highest-scoring neurons (stable
    /// order among ties). Fixed to sort with `f32::total_cmp`: the old
    /// `partial_cmp(..).unwrap()` panicked if any score was NaN.
    pub fn predict_active(&self, input: &[f32], k: usize) -> Vec<usize> {
        let scores = self.forward(input);
        let mut indices: Vec<usize> = (0..scores.len()).collect();
        // total_cmp is a total order over f32 (NaN-safe), so a degenerate
        // predictor can no longer panic the sort.
        indices.sort_by(|&a, &b| scores[b].total_cmp(&scores[a]));
        indices.truncate(k);
        indices
    }
    /// Compute the m-dimensional score vector `V^T · (U^T · input)`.
    ///
    /// Bounds checks make mismatched matrix/input sizes degrade to
    /// partial sums instead of panicking.
    fn forward(&self, input: &[f32]) -> Vec<f32> {
        // First: U^T · input (r-dimensional)
        let mut hidden = vec![0.0; self.rank];
        for i in 0..self.rank {
            for (j, u_ji) in self.u.iter().enumerate() {
                if j < input.len() && i < u_ji.len() {
                    hidden[i] += u_ji[i] * input[j];
                }
            }
        }
        // Second: V · hidden (m-dimensional)
        let output_dim = self.v.first().map(|v| v.len()).unwrap_or(0);
        let mut output = vec![0.0; output_dim];
        for i in 0..output_dim {
            for (j, &h) in hidden.iter().enumerate() {
                if j < self.v.len() && i < self.v[j].len() {
                    output[i] += self.v[j][i] * h;
                }
            }
        }
        output
    }
}
// ============================================================================
// Llama Model
// ============================================================================
/// Llama model for sparse inference
///
/// Decoder-only stack: token embedding, `layers` decoder blocks, final
/// RMSNorm, and an optional LM head.
pub struct LlamaModel {
    pub metadata: ModelMetadata,
    pub layers: Vec<LlamaLayer>,
    pub embed_tokens: Embedding,
    pub norm: RMSNorm,
    pub lm_head: Option<Linear>, // when None, forward returns final hidden states as "logits"
}
/// One pre-norm Llama decoder block with an optional FFN predictor.
pub struct LlamaLayer {
    pub input_layernorm: RMSNorm,
    pub self_attn: LlamaAttention,
    pub post_attention_layernorm: RMSNorm,
    pub mlp: LlamaMLP,
    pub predictor: Option<LowRankPredictor>, // predicts active MLP neurons for the sparse path
}
/// Q/K/V/O projections for Llama self-attention.
pub struct LlamaAttention {
    pub q_proj: Linear,
    pub k_proj: Linear,
    pub v_proj: Linear,
    pub o_proj: Linear,
    pub num_heads: usize,
    pub head_dim: usize,
}
/// SwiGLU feed-forward block: down(silu(gate(x)) ⊙ up(x)).
pub struct LlamaMLP {
    pub gate_proj: Linear, // W1 for SwiGLU gate
    pub up_proj: Linear,   // W3 for SwiGLU up
    pub down_proj: Linear, // W2 for down projection
}
impl LlamaMLP {
    /// Standard forward pass (dense)
    ///
    /// SwiGLU feed-forward: `down( silu(gate(x)) ⊙ up(x) )`.
    pub fn forward(&self, x: &[f32]) -> Vec<f32> {
        let gate = self.gate_proj.forward(x);
        let up = self.up_proj.forward(x);
        // SwiGLU: silu(gate) ⊙ up
        let hidden: Vec<f32> = gate
            .iter()
            .zip(up.iter())
            .map(|(&g, &u)| silu(g) * u)
            .collect();
        self.down_proj.forward(&hidden)
    }
    /// Sparse forward pass using predictor
    ///
    /// Computes gate/up rows only for `active_neurons`, producing a
    /// compacted `hidden` of length `active_neurons.len()`.
    /// NOTE(review): correctness depends on `sparse_matmul_full`
    /// interpreting `hidden[i]` as the value of intermediate neuron
    /// `active_neurons[i]` — confirm against its implementation.
    pub fn forward_sparse(&self, x: &[f32], active_neurons: &[usize]) -> Vec<f32> {
        // Only compute for active neurons in intermediate layer
        let gate = sparse_matmul(&self.gate_proj, x, active_neurons);
        let up = sparse_matmul(&self.up_proj, x, active_neurons);
        // SwiGLU on active neurons only
        let hidden: Vec<f32> = gate
            .iter()
            .zip(up.iter())
            .map(|(&g, &u)| silu(g) * u)
            .collect();
        // Sparse down projection
        sparse_matmul_full(&self.down_proj, &hidden, active_neurons)
    }
}
impl ModelRunner for LlamaModel {
    /// Dense/sparse forward pass over all decoder layers.
    ///
    /// Pre-norm residual architecture: each layer applies
    /// `x += attn(norm(x))` then `x += mlp(norm(x))`. When
    /// `config.use_sparse_ffn` is set and the layer has a predictor, only
    /// the predicted top-k intermediate neurons are computed in the MLP.
    fn forward(&self, input: &ModelInput, config: &InferenceConfig) -> Result<ModelOutput> {
        // Embed tokens
        let mut hidden_states = self.embed_tokens.forward(&input.input_ids);
        let mut all_hidden_states = if config.output_hidden_states {
            Some(Vec::new())
        } else {
            None
        };
        // Process each layer (was `.enumerate()` with an unused `idx`).
        for layer in &self.layers {
            // Recorded states are each layer's *input*; the final layer's
            // output is not appended. NOTE(review): confirm callers expect
            // n entries rather than n + 1.
            if let Some(ref mut states) = all_hidden_states {
                states.push(hidden_states.clone());
            }
            // Layer norm
            let normed = layer.input_layernorm.forward(&hidden_states);
            // Self-attention (simplified, no KV cache)
            let attn_output = layer.self_attn.forward(&normed);
            // Residual
            hidden_states = add_vectors(&hidden_states, &attn_output);
            // Post-attention norm
            let normed = layer.post_attention_layernorm.forward(&hidden_states);
            // MLP with optional predictor-guided sparsity
            let mlp_output = if config.use_sparse_ffn {
                if let Some(ref predictor) = layer.predictor {
                    // Active-neuron budget: explicit override, or derived
                    // from the configured sparsity fraction.
                    let k = config.active_neurons_per_layer.unwrap_or(
                        (self.metadata.intermediate_size as f32 * (1.0 - config.sparsity)) as usize,
                    );
                    let active = predictor.predict_active(&normed, k);
                    layer.mlp.forward_sparse(&normed, &active)
                } else {
                    layer.mlp.forward(&normed)
                }
            } else {
                layer.mlp.forward(&normed)
            };
            // Residual
            hidden_states = add_vectors(&hidden_states, &mlp_output);
        }
        // Final norm
        hidden_states = self.norm.forward(&hidden_states);
        // LM head: project to vocabulary logits when present, otherwise
        // surface the final hidden states directly.
        let logits = if let Some(ref lm_head) = self.lm_head {
            lm_head.forward(&hidden_states)
        } else {
            hidden_states
        };
        Ok(ModelOutput::new(logits).with_hidden_states(all_hidden_states.unwrap_or_default()))
    }
    /// Predictor attached to layer `layer_idx`, if any.
    fn get_predictor(&self, layer_idx: usize) -> Option<&LowRankPredictor> {
        self.layers.get(layer_idx)?.predictor.as_ref()
    }
    /// Placeholder: reports fixed statistics without collecting real
    /// activation data yet.
    fn calibrate(&mut self, samples: &[ModelInput]) -> Result<CalibrationStats> {
        // Placeholder: would collect activation statistics
        Ok(CalibrationStats {
            num_samples: samples.len(),
            average_sparsity: 0.9,
            layer_stats: HashMap::new(),
        })
    }
    fn metadata(&self) -> &ModelMetadata {
        &self.metadata
    }
}
impl LlamaAttention {
    /// Simplified attention stub: projects Q/K/V but currently only feeds
    /// the query projection through the output projection.
    ///
    /// NOTE(review): `k` and `v` are computed and then discarded (dead
    /// work plus unused-variable warnings); no scaled dot-product
    /// attention is performed yet. TODO: real attention with a KV cache.
    pub fn forward(&self, hidden_states: &[f32]) -> Vec<f32> {
        // Simplified: full attention without KV cache
        let q = self.q_proj.forward(hidden_states);
        let k = self.k_proj.forward(hidden_states);
        let v = self.v_proj.forward(hidden_states);
        // Placeholder: would do scaled dot-product attention
        self.o_proj.forward(&q)
    }
}
// ============================================================================
// LFM2 Model (Liquid AI)
// ============================================================================
/// LFM2-style model: embedding plus stacked conv/attention/FFN layers.
pub struct LFM2Model {
    pub metadata: ModelMetadata,
    pub embedding: Embedding,
    pub layers: Vec<LFM2Layer>,
    pub pooler: Option<Pooler>, // optional pooling head (not used by forward)
}
/// One LFM2 block: gated conv, grouped-query attention, sparse FFN.
pub struct LFM2Layer {
    pub gated_conv: GatedConv1d,
    pub attention: GroupedQueryAttention,
    pub ffn: SparseFfn,
    pub norm: LayerNorm, // applied after both residual additions (post-norm)
}
/// Gated 1-D convolution (placeholder — its forward is currently identity).
pub struct GatedConv1d {
    pub weight: Vec<Vec<f32>>,
    pub gate: Linear,
}
/// Grouped-query attention projections (stub: only o_proj used in forward).
pub struct GroupedQueryAttention {
    pub q_proj: Linear,
    pub k_proj: Linear,
    pub v_proj: Linear,
    pub o_proj: Linear,
    pub num_groups: usize,
}
/// Two-layer FFN with an optional activation predictor for sparsity.
pub struct SparseFfn {
    pub w1: Linear,
    pub w2: Linear,
    pub predictor: Option<LowRankPredictor>,
}
impl ModelRunner for LFM2Model {
    /// Forward pass: gated conv → GQA → sparse FFN per layer.
    ///
    /// Note the layer norm runs *after* both residual additions (post-norm
    /// arrangement, unlike LlamaModel's pre-norm).
    fn forward(&self, input: &ModelInput, config: &InferenceConfig) -> Result<ModelOutput> {
        let mut hidden = self.embedding.forward(&input.input_ids);
        for layer in &self.layers {
            // Gated convolution for local context
            hidden = layer.gated_conv.forward(&hidden);
            // Grouped query attention
            let attn_out = layer.attention.forward(&hidden);
            hidden = add_vectors(&hidden, &attn_out);
            // Sparse FFN
            let ffn_out = layer.ffn.forward(&hidden, config);
            hidden = add_vectors(&hidden, &ffn_out);
            hidden = layer.norm.forward(&hidden);
        }
        Ok(ModelOutput::new(hidden))
    }
    /// Predictor attached to layer `layer_idx`'s FFN, if any.
    fn get_predictor(&self, layer_idx: usize) -> Option<&LowRankPredictor> {
        self.layers.get(layer_idx)?.ffn.predictor.as_ref()
    }
    /// Placeholder: fixed statistics; no activation data is collected yet.
    fn calibrate(&mut self, _samples: &[ModelInput]) -> Result<CalibrationStats> {
        Ok(CalibrationStats {
            num_samples: 0,
            average_sparsity: 0.9,
            layer_stats: HashMap::new(),
        })
    }
    fn metadata(&self) -> &ModelMetadata {
        &self.metadata
    }
}
impl GatedConv1d {
    /// Identity pass-through: the convolution is not implemented yet, so
    /// the input is returned unchanged as an owned vector.
    pub fn forward(&self, x: &[f32]) -> Vec<f32> {
        x.to_owned()
    }
}
impl GroupedQueryAttention {
    /// Attention stub: applies only the output projection to `x`; the
    /// q/k/v projections and `num_groups` are not used yet. TODO: real GQA.
    pub fn forward(&self, x: &[f32]) -> Vec<f32> {
        self.o_proj.forward(x)
    }
}
impl SparseFfn {
    /// Two-layer FFN with optional predictor-gated sparsity.
    ///
    /// NOTE(review): in the sparse path `self.w1.forward(x)` is still a
    /// fully dense matmul — only the `w2` application is sparsified, so
    /// the predictor does not save the w1 compute it is meant to skip.
    pub fn forward(&self, x: &[f32], config: &InferenceConfig) -> Vec<f32> {
        if config.use_sparse_ffn {
            if let Some(ref predictor) = self.predictor {
                // Active-neuron budget derived from the sparsity fraction.
                let k = (self.w1.out_features as f32 * (1.0 - config.sparsity)) as usize;
                let active = predictor.predict_active(x, k);
                return sparse_matmul_full(&self.w2, &self.w1.forward(x), &active);
            }
        }
        self.w2.forward(&self.w1.forward(x))
    }
}
// ============================================================================
// BERT Model
// ============================================================================
/// BERT-style encoder: embeddings, encoder layers, optional pooler.
pub struct BertModel {
    pub metadata: ModelMetadata,
    pub embeddings: BertEmbeddings,
    pub encoder: Vec<BertLayer>,
    pub pooler: Option<Pooler>, // optional pooling head (not used by forward)
}
/// Embedding tables plus layer norm.
/// NOTE(review): only `word_embeddings` is applied by the current forward.
pub struct BertEmbeddings {
    pub word_embeddings: Embedding,
    pub position_embeddings: Embedding,
    pub token_type_embeddings: Embedding,
    pub layer_norm: LayerNorm,
}
/// One post-norm BERT encoder block.
pub struct BertLayer {
    pub attention: MultiHeadAttention,
    pub intermediate: Linear, // FFN up-projection
    pub output: Linear,       // FFN down-projection
    pub layer_norm1: LayerNorm, // after attention residual
    pub layer_norm2: LayerNorm, // after FFN residual
}
/// Multi-head attention projections (stub: only o_proj used in forward).
pub struct MultiHeadAttention {
    pub q_proj: Linear,
    pub k_proj: Linear,
    pub v_proj: Linear,
    pub o_proj: Linear,
    pub num_heads: usize,
}
/// Single dense layer used as a pooling head.
pub struct Pooler {
    pub dense: Linear,
}
impl ModelRunner for BertModel {
    /// Post-norm encoder forward pass: for each layer,
    /// `x = norm1(x + attn(x))` then `x = norm2(x + ffn(x))`.
    /// `config` is currently unused (no sparse path for BERT).
    fn forward(&self, input: &ModelInput, config: &InferenceConfig) -> Result<ModelOutput> {
        let mut hidden = self.embeddings.forward(&input.input_ids);
        for layer in &self.encoder {
            let attn_out = layer.attention.forward(&hidden);
            hidden = layer.layer_norm1.forward(&add_vectors(&hidden, &attn_out));
            let intermediate = layer.intermediate.forward(&hidden);
            let output = layer.output.forward(&intermediate);
            hidden = layer.layer_norm2.forward(&add_vectors(&hidden, &output));
        }
        Ok(ModelOutput::new(hidden))
    }
    /// BERT layers carry no activation predictors.
    fn get_predictor(&self, _layer_idx: usize) -> Option<&LowRankPredictor> {
        None
    }
    /// Placeholder: fixed statistics; nothing is calibrated for BERT.
    fn calibrate(&mut self, _samples: &[ModelInput]) -> Result<CalibrationStats> {
        Ok(CalibrationStats {
            num_samples: 0,
            average_sparsity: 0.0,
            layer_stats: HashMap::new(),
        })
    }
    fn metadata(&self) -> &ModelMetadata {
        &self.metadata
    }
}
impl BertEmbeddings {
    /// Token embedding lookup.
    ///
    /// NOTE(review): only `word_embeddings` is applied — the position and
    /// token-type embeddings and the layer norm are never used, which
    /// diverges from standard BERT embedding behavior. TODO: confirm.
    pub fn forward(&self, input_ids: &[u64]) -> Vec<f32> {
        self.word_embeddings.forward(input_ids)
    }
}
impl MultiHeadAttention {
    /// Applies only the output projection to `x`.
    ///
    /// NOTE(review): no q/k/v projections or attention-score computation
    /// happen here — this is a placeholder, not full attention; verify
    /// before relying on BERT outputs.
    pub fn forward(&self, x: &[f32]) -> Vec<f32> {
        self.o_proj.forward(x)
    }
}
// ============================================================================
// Unified Model Wrapper
// ============================================================================
/// Unified wrapper over the supported architectures; every `ModelRunner`
/// call is dispatched to the wrapped variant.
pub enum SparseModel {
    /// Llama-family decoder model.
    Llama(LlamaModel),
    /// LFM2 model.
    LFM2(LFM2Model),
    /// BERT encoder model.
    Bert(BertModel),
}
impl ModelRunner for SparseModel {
    /// Delegates `forward` to the wrapped architecture.
    fn forward(&self, input: &ModelInput, config: &InferenceConfig) -> Result<ModelOutput> {
        match self {
            Self::Llama(m) => m.forward(input, config),
            Self::LFM2(m) => m.forward(input, config),
            Self::Bert(m) => m.forward(input, config),
        }
    }
    /// Delegates predictor lookup to the wrapped architecture.
    fn get_predictor(&self, layer_idx: usize) -> Option<&LowRankPredictor> {
        match self {
            Self::Llama(m) => m.get_predictor(layer_idx),
            Self::LFM2(m) => m.get_predictor(layer_idx),
            Self::Bert(m) => m.get_predictor(layer_idx),
        }
    }
    /// Delegates calibration to the wrapped architecture.
    fn calibrate(&mut self, samples: &[ModelInput]) -> Result<CalibrationStats> {
        match self {
            Self::Llama(m) => m.calibrate(samples),
            Self::LFM2(m) => m.calibrate(samples),
            Self::Bert(m) => m.calibrate(samples),
        }
    }
    /// Delegates metadata access to the wrapped architecture.
    fn metadata(&self) -> &ModelMetadata {
        match self {
            Self::Llama(m) => m.metadata(),
            Self::LFM2(m) => m.metadata(),
            Self::Bert(m) => m.metadata(),
        }
    }
}
// ============================================================================
// Helper Functions
// ============================================================================
/// Computes only the output neurons listed in `active_cols`.
///
/// Returns one value per entry of `active_cols` (dot product of the
/// corresponding weight row with `input`, plus bias when present).
/// Out-of-range neuron indices are skipped, leaving `0.0` in that slot.
///
/// Improvement over the indexed original: the dot-product length is clamped
/// once up front and computed with a `zip`ped iterator, removing the
/// per-element `in_idx < in_features` check from the inner loop (same
/// left-to-right f32 accumulation order, so results are identical).
fn sparse_matmul(linear: &Linear, input: &[f32], active_cols: &[usize]) -> Vec<f32> {
    let mut output = vec![0.0; active_cols.len()];
    // Invariant across all rows: how many input elements participate.
    let width = input.len().min(linear.in_features);
    for (out_idx, &col_idx) in active_cols.iter().enumerate() {
        if col_idx >= linear.out_features {
            continue; // leave 0.0 for invalid neuron indices, as before
        }
        let row = &linear.weight[col_idx];
        let mut acc: f32 = input[..width]
            .iter()
            .zip(&row[..width])
            .map(|(x, w)| w * x)
            .sum();
        if let Some(ref bias) = linear.bias {
            acc += bias[col_idx];
        }
        output[out_idx] = acc;
    }
    output
}
/// Full-width matmul restricted to a sparse set of *input* columns.
///
/// Every output neuron is produced, but only the input positions listed in
/// `active_input_cols` contribute; indices outside either `input` or the
/// layer width are ignored. Bias (when present) is always added.
fn sparse_matmul_full(linear: &Linear, input: &[f32], active_input_cols: &[usize]) -> Vec<f32> {
    let mut output = vec![0.0; linear.out_features];
    for (row_idx, out_val) in output.iter_mut().enumerate() {
        let row = &linear.weight[row_idx];
        for &col in active_input_cols {
            if col < input.len() && col < linear.in_features {
                *out_val += row[col] * input[col];
            }
        }
        if let Some(ref bias) = linear.bias {
            *out_val += bias[row_idx];
        }
    }
    output
}
/// Element-wise sum of two slices; like `zip`, the result is truncated to
/// the length of the shorter input.
fn add_vectors(a: &[f32], b: &[f32]) -> Vec<f32> {
    let n = a.len().min(b.len());
    let mut sum = Vec::with_capacity(n);
    for i in 0..n {
        sum.push(a[i] + b[i]);
    }
    sum
}
#[cfg(test)]
mod tests {
    use super::*;
    // The predictor must return exactly `k` active indices for a dense
    // all-ones input (shape/selection sanity check, not a value check).
    #[test]
    fn test_low_rank_predictor() {
        let predictor = LowRankPredictor::new(128, 512, 16);
        let input = vec![1.0; 128];
        let active = predictor.predict_active(&input, 10);
        assert_eq!(active.len(), 10);
    }
    // Element-wise addition of equal-length vectors.
    #[test]
    fn test_add_vectors() {
        let a = vec![1.0, 2.0, 3.0];
        let b = vec![4.0, 5.0, 6.0];
        let result = add_vectors(&a, &b);
        assert_eq!(result, vec![5.0, 7.0, 9.0]);
    }
}

View File

@@ -0,0 +1,159 @@
//! Core types for model inference
use std::collections::HashMap;
/// Generic tensor representation: flat f32 storage plus a logical shape
/// and a human-readable name.
#[derive(Debug, Clone)]
pub struct Tensor {
    pub data: Vec<f32>,
    pub shape: Vec<u64>,
    pub name: String,
}
impl Tensor {
    /// Wraps existing data under the given shape and name (no validation).
    pub fn new(data: Vec<f32>, shape: Vec<u64>, name: String) -> Self {
        Self { data, shape, name }
    }
    /// Allocates a zero-filled tensor whose length is the product of the
    /// shape dimensions (an empty shape yields one scalar element).
    pub fn zeros(shape: Vec<u64>, name: String) -> Self {
        let element_count: u64 = shape.iter().product();
        Tensor {
            data: vec![0.0_f32; element_count as usize],
            shape,
            name,
        }
    }
    /// Total number of stored elements.
    pub fn size(&self) -> usize {
        self.data.len()
    }
    /// Replaces the shape in place.
    ///
    /// Panics when the new shape's element count differs from the current
    /// data length (no data is ever moved or copied).
    pub fn reshape(&mut self, new_shape: Vec<u64>) {
        let expected = new_shape.iter().product::<u64>() as usize;
        let actual = self.size();
        assert_eq!(
            expected, actual,
            "Reshape size mismatch: {} vs {}",
            expected, actual
        );
        self.shape = new_shape;
    }
}
/// Model input configuration: token ids plus optional mask and positions.
#[derive(Debug, Clone)]
pub struct ModelInput {
    pub input_ids: Vec<u64>,
    pub attention_mask: Option<Vec<u8>>,
    pub position_ids: Option<Vec<u64>>,
}
impl ModelInput {
    /// Builds an input carrying only token ids; optional fields start unset.
    pub fn new(input_ids: Vec<u64>) -> Self {
        ModelInput {
            attention_mask: None,
            position_ids: None,
            input_ids,
        }
    }
    /// Builder: attaches an attention mask, consuming and returning `self`.
    pub fn with_attention_mask(mut self, mask: Vec<u8>) -> Self {
        self.attention_mask = Some(mask);
        self
    }
    /// Builder: attaches explicit position ids, consuming and returning `self`.
    pub fn with_position_ids(mut self, positions: Vec<u64>) -> Self {
        self.position_ids = Some(positions);
        self
    }
    /// Number of tokens in the sequence (length of `input_ids`).
    pub fn sequence_length(&self) -> usize {
        self.input_ids.len()
    }
}
/// Model output: logits plus optional per-layer hidden states and
/// attention weights.
#[derive(Debug, Clone)]
pub struct ModelOutput {
    pub logits: Vec<f32>,
    pub hidden_states: Option<Vec<Vec<f32>>>,
    pub attentions: Option<Vec<Vec<f32>>>,
}
impl ModelOutput {
    /// Creates an output carrying only logits; optional fields start unset.
    pub fn new(logits: Vec<f32>) -> Self {
        Self {
            logits,
            hidden_states: None,
            attentions: None,
        }
    }
    /// Builder: attaches per-layer hidden states.
    pub fn with_hidden_states(mut self, states: Vec<Vec<f32>>) -> Self {
        self.hidden_states = Some(states);
        self
    }
    /// Builder: attaches per-layer attention weights.
    ///
    /// Added for API parity — the `attentions` field previously had no
    /// builder while `hidden_states` did; existing callers are unaffected.
    pub fn with_attentions(mut self, attentions: Vec<Vec<f32>>) -> Self {
        self.attentions = Some(attentions);
        self
    }
}
/// Inference configuration controlling sparsity, sampling, and which
/// auxiliary outputs are returned.
#[derive(Debug, Clone)]
pub struct InferenceConfig {
    /// Target sparsity level (0.0 = fully dense, 1.0 = maximally sparse)
    pub sparsity: f32,
    /// Activation threshold below which a neuron counts as inactive
    pub sparsity_threshold: f32,
    /// Softmax temperature applied during sampling
    pub temperature: f32,
    /// Optional top-k sampling cutoff
    pub top_k: Option<usize>,
    /// Optional nucleus (top-p) sampling cutoff
    pub top_p: Option<f32>,
    /// Whether FFN layers take the sparse code path
    pub use_sparse_ffn: bool,
    /// Optional fixed budget of active neurons per layer
    pub active_neurons_per_layer: Option<usize>,
    /// Whether per-layer hidden states are returned
    pub output_hidden_states: bool,
    /// Whether attention weights are returned
    pub output_attentions: bool,
}
impl Default for InferenceConfig {
    /// Defaults favour aggressive sparsity (0.9) with neutral sampling
    /// (temperature 1.0, no top-k/top-p) and no auxiliary outputs.
    fn default() -> Self {
        InferenceConfig {
            sparsity: 0.9,
            sparsity_threshold: 0.01,
            temperature: 1.0,
            top_k: None,
            top_p: None,
            use_sparse_ffn: true,
            active_neurons_per_layer: None,
            output_hidden_states: false,
            output_attentions: false,
        }
    }
}
/// Calibration statistics aggregated over a set of calibration samples.
#[derive(Debug, Clone)]
pub struct CalibrationStats {
    /// Number of samples processed during calibration.
    pub num_samples: usize,
    /// Mean sparsity observed across samples and layers.
    pub average_sparsity: f32,
    /// Per-layer statistics keyed by layer index.
    pub layer_stats: HashMap<usize, LayerStats>,
}
/// Per-layer activation statistics gathered during calibration.
#[derive(Debug, Clone)]
pub struct LayerStats {
    /// Count of neurons observed active in this layer.
    pub active_neurons: usize,
    /// Total neuron count in this layer.
    pub total_neurons: usize,
    /// Fraction of inactive neurons for this layer.
    pub sparsity: f32,
}