Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'

This commit is contained in:
ruv
2026-02-28 14:39:40 -05:00
7854 changed files with 3522914 additions and 0 deletions

View File

@@ -0,0 +1,610 @@
//! GGUF file format parser for llama.cpp models
//!
//! This module implements parsing for the GGUF (GGML Universal Format) used by llama.cpp.
//! Supports all quantization types and efficient tensor loading.
use crate::error::{GgufError, SparseInferenceError};
use crate::model::types::Tensor;
use byteorder::{LittleEndian, ReadBytesExt};
use std::collections::HashMap;
use std::io::{Cursor, Read};
/// GGUF magic number ("GGUF" in ASCII, read as a little-endian u32)
pub const GGUF_MAGIC: u32 = 0x46554747;
/// Supported GGUF version (only v3 files are accepted by the parser)
pub const GGUF_VERSION: u32 = 3;
/// GGUF file header
///
/// Fixed-size preamble of every GGUF file: magic and version, followed by
/// the number of tensor-info records and metadata key/value pairs.
#[derive(Debug, Clone)]
pub struct GgufHeader {
    pub magic: u32, // must equal GGUF_MAGIC
    pub version: u32, // must equal GGUF_VERSION
    pub tensor_count: u64, // number of tensor-info entries that follow the metadata
    pub metadata_kv_count: u64, // number of metadata key/value pairs
}
/// GGUF metadata value types
#[derive(Debug, Clone)]
pub enum GgufValue {
    Uint8(u8),
    Int8(i8),
    Uint16(u16),
    Int16(i16),
    Uint32(u32),
    Int32(i32),
    Float32(f32),
    Bool(bool),
    String(String),
    Array(Vec<GgufValue>),
    Uint64(u64),
    Int64(i64),
    Float64(f64),
}
impl GgufValue {
    /// Try to convert an integer value to `u32`.
    ///
    /// Wider values are narrowed with `as` (wrapping); negative values map
    /// to their two's-complement bit pattern. Non-integer variants return
    /// `None`.
    pub fn as_u32(&self) -> Option<u32> {
        self.as_u64().map(|v| v as u32)
    }
    /// Try to convert an integer value to `u64`.
    ///
    /// Unlike `as_u32`, `Uint64` values above `u32::MAX` are preserved.
    /// Negative signed values wrap to their two's-complement bit pattern,
    /// matching the historical `as_u32` behavior.
    pub fn as_u64(&self) -> Option<u64> {
        match self {
            GgufValue::Uint8(v) => Some(u64::from(*v)),
            GgufValue::Uint16(v) => Some(u64::from(*v)),
            GgufValue::Uint32(v) => Some(u64::from(*v)),
            GgufValue::Uint64(v) => Some(*v),
            GgufValue::Int8(v) => Some(*v as u64),
            GgufValue::Int16(v) => Some(*v as u64),
            GgufValue::Int32(v) => Some(*v as u64),
            GgufValue::Int64(v) => Some(*v as u64),
            _ => None,
        }
    }
    /// Try to convert value to usize
    ///
    /// Fixed to route through `as_u64` so 64-bit counts (e.g. large tensor
    /// dimensions or token counts) are no longer truncated to 32 bits on
    /// 64-bit targets.
    pub fn as_usize(&self) -> Option<usize> {
        self.as_u64().map(|v| v as usize)
    }
    /// Try to convert a numeric value to `f32` (f64 is narrowed; 64-bit
    /// integers are not accepted, matching the original behavior).
    pub fn as_f32(&self) -> Option<f32> {
        match self {
            GgufValue::Float32(v) => Some(*v),
            GgufValue::Float64(v) => Some(*v as f32),
            GgufValue::Uint8(v) => Some(*v as f32),
            GgufValue::Int8(v) => Some(*v as f32),
            GgufValue::Uint16(v) => Some(*v as f32),
            GgufValue::Int16(v) => Some(*v as f32),
            GgufValue::Uint32(v) => Some(*v as f32),
            GgufValue::Int32(v) => Some(*v as f32),
            _ => None,
        }
    }
}
/// GGUF tensor quantization types
///
/// Discriminants match the GGML type ids stored in GGUF tensor-info
/// records (ids 4 and 5 are deliberately not mapped here).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u32)]
pub enum GgufTensorType {
    F32 = 0,
    F16 = 1,
    Q4_0 = 2,
    Q4_1 = 3,
    Q5_0 = 6,
    Q5_1 = 7,
    Q8_0 = 8,
    Q8_1 = 9,
    // K-quants: 256-element super-blocks.
    Q2_K = 10,
    Q3_K = 11,
    Q4_K = 12,
    Q5_K = 13,
    Q6_K = 14,
}
impl GgufTensorType {
    /// Decode a raw GGUF tensor-type id, rejecting unknown ids.
    pub fn from_u32(value: u32) -> Result<Self, GgufError> {
        let tensor_type = match value {
            0 => Self::F32,
            1 => Self::F16,
            2 => Self::Q4_0,
            3 => Self::Q4_1,
            6 => Self::Q5_0,
            7 => Self::Q5_1,
            8 => Self::Q8_0,
            9 => Self::Q8_1,
            10 => Self::Q2_K,
            11 => Self::Q3_K,
            12 => Self::Q4_K,
            13 => Self::Q5_K,
            14 => Self::Q6_K,
            other => return Err(GgufError::InvalidTensorType(other)),
        };
        Ok(tensor_type)
    }
    /// Number of elements grouped into one quantization block.
    pub fn block_size(&self) -> usize {
        match self {
            // Unquantized scalar types: one element per "block".
            Self::F32 | Self::F16 => 1,
            // Classic quant formats pack 32 elements per block.
            Self::Q4_0 | Self::Q4_1 | Self::Q5_0 | Self::Q5_1 | Self::Q8_0 | Self::Q8_1 => 32,
            // K-quants use 256-element super-blocks.
            Self::Q2_K | Self::Q3_K | Self::Q4_K | Self::Q5_K | Self::Q6_K => 256,
        }
    }
    /// Encoded size in bytes of one block of this type.
    pub fn bytes_per_block(&self) -> usize {
        match self {
            Self::F32 => 4,
            Self::F16 => 2,
            Self::Q4_0 => 18, // 2 (scale) + 16 (quants)
            Self::Q4_1 => 20, // 2 (scale) + 2 (min) + 16 (quants)
            Self::Q5_0 => 22, // 2 (scale) + 4 (high bits) + 16 (quants)
            Self::Q5_1 => 24, // 2 (scale) + 2 (min) + 4 (high bits) + 16 (quants)
            Self::Q8_0 => 34, // 2 (scale) + 32 (quants)
            Self::Q8_1 => 36, // 2 (scale) + 2 (sum) + 32 (quants)
            Self::Q2_K => 84,
            Self::Q3_K => 110,
            Self::Q4_K => 144,
            Self::Q5_K => 176,
            Self::Q6_K => 210,
        }
    }
}
/// GGUF tensor information
///
/// One record from the tensor-info section describing where a tensor's
/// data lives and how it is encoded.
#[derive(Debug, Clone)]
pub struct GgufTensorInfo {
    pub name: String,
    pub dimensions: Vec<u64>, // per-axis element counts
    pub tensor_type: GgufTensorType,
    pub offset: u64, // byte offset relative to the start of the tensor data section
}
/// Parsed GGUF model
#[derive(Debug, Clone)]
pub struct GgufModel {
    pub header: GgufHeader,
    pub metadata: HashMap<String, GgufValue>,
    pub tensors: HashMap<String, GgufTensorInfo>, // keyed by tensor name
    pub tensor_data_offset: u64, // absolute, alignment-rounded start of the tensor data section
}
/// GGUF parser
pub struct GgufParser;
impl GgufParser {
/// Parse complete GGUF file from bytes
pub fn parse(data: &[u8]) -> Result<GgufModel, GgufError> {
let mut cursor = Cursor::new(data);
// Parse header
let header = Self::parse_header_from_cursor(&mut cursor)?;
// Parse metadata
let metadata = Self::parse_metadata(&mut cursor, header.metadata_kv_count)?;
// Parse tensor info
let tensors = Self::parse_tensor_info(&mut cursor, header.tensor_count)?;
// Calculate tensor data offset (aligned to 32 bytes)
let current_pos = cursor.position();
let alignment = 32u64;
let tensor_data_offset = ((current_pos + alignment - 1) / alignment) * alignment;
Ok(GgufModel {
header,
metadata,
tensors,
tensor_data_offset,
})
}
/// Parse only the header (for validation)
pub fn parse_header(data: &[u8]) -> Result<GgufHeader, GgufError> {
let mut cursor = Cursor::new(data);
Self::parse_header_from_cursor(&mut cursor)
}
fn parse_header_from_cursor(cursor: &mut Cursor<&[u8]>) -> Result<GgufHeader, GgufError> {
let magic = cursor.read_u32::<LittleEndian>()?;
if magic != GGUF_MAGIC {
return Err(GgufError::InvalidMagic(magic));
}
let version = cursor.read_u32::<LittleEndian>()?;
if version != GGUF_VERSION {
return Err(GgufError::UnsupportedVersion(version));
}
let tensor_count = cursor.read_u64::<LittleEndian>()?;
let metadata_kv_count = cursor.read_u64::<LittleEndian>()?;
Ok(GgufHeader {
magic,
version,
tensor_count,
metadata_kv_count,
})
}
fn parse_metadata(
cursor: &mut Cursor<&[u8]>,
count: u64,
) -> Result<HashMap<String, GgufValue>, GgufError> {
let mut metadata = HashMap::new();
for _ in 0..count {
let key = Self::read_string(cursor)?;
let value = Self::read_value(cursor)?;
metadata.insert(key, value);
}
Ok(metadata)
}
fn parse_tensor_info(
cursor: &mut Cursor<&[u8]>,
count: u64,
) -> Result<HashMap<String, GgufTensorInfo>, GgufError> {
let mut tensors = HashMap::new();
let mut cumulative_offset = 0u64;
for _ in 0..count {
let name = Self::read_string(cursor)?;
// Read number of dimensions
let n_dims = cursor.read_u32::<LittleEndian>()? as usize;
// Read dimensions
let mut dimensions = Vec::with_capacity(n_dims);
for _ in 0..n_dims {
dimensions.push(cursor.read_u64::<LittleEndian>()?);
}
// Read tensor type
let tensor_type_raw = cursor.read_u32::<LittleEndian>()?;
let tensor_type = GgufTensorType::from_u32(tensor_type_raw)?;
// Read offset (this is relative offset in the tensor data section)
let offset_in_section = cursor.read_u64::<LittleEndian>()?;
let info = GgufTensorInfo {
name: name.clone(),
dimensions,
tensor_type,
offset: offset_in_section,
};
tensors.insert(name, info);
}
Ok(tensors)
}
fn read_string(cursor: &mut Cursor<&[u8]>) -> Result<String, GgufError> {
let len = cursor.read_u64::<LittleEndian>()? as usize;
let mut bytes = vec![0u8; len];
cursor.read_exact(&mut bytes)?;
Ok(String::from_utf8(bytes)?)
}
fn read_value(cursor: &mut Cursor<&[u8]>) -> Result<GgufValue, GgufError> {
let value_type = cursor.read_u32::<LittleEndian>()?;
Self::read_value_of_type(cursor, value_type)
}
fn read_value_of_type(
cursor: &mut Cursor<&[u8]>,
value_type: u32,
) -> Result<GgufValue, GgufError> {
match value_type {
0 => Ok(GgufValue::Uint8(cursor.read_u8()?)),
1 => Ok(GgufValue::Int8(cursor.read_i8()?)),
2 => Ok(GgufValue::Uint16(cursor.read_u16::<LittleEndian>()?)),
3 => Ok(GgufValue::Int16(cursor.read_i16::<LittleEndian>()?)),
4 => Ok(GgufValue::Uint32(cursor.read_u32::<LittleEndian>()?)),
5 => Ok(GgufValue::Int32(cursor.read_i32::<LittleEndian>()?)),
6 => Ok(GgufValue::Float32(cursor.read_f32::<LittleEndian>()?)),
7 => Ok(GgufValue::Bool(cursor.read_u8()? != 0)),
8 => Ok(GgufValue::String(Self::read_string(cursor)?)),
9 => {
let array_type = cursor.read_u32::<LittleEndian>()?;
let array_len = cursor.read_u64::<LittleEndian>()? as usize;
let mut array = Vec::with_capacity(array_len);
for _ in 0..array_len {
array.push(Self::read_value_of_type(cursor, array_type)?);
}
Ok(GgufValue::Array(array))
}
10 => Ok(GgufValue::Uint64(cursor.read_u64::<LittleEndian>()?)),
11 => Ok(GgufValue::Int64(cursor.read_i64::<LittleEndian>()?)),
12 => Ok(GgufValue::Float64(cursor.read_f64::<LittleEndian>()?)),
_ => Err(GgufError::InvalidValueType(value_type)),
}
}
/// Load a specific tensor by name
pub fn load_tensor(
data: &[u8],
model: &GgufModel,
tensor_name: &str,
) -> Result<Tensor, GgufError> {
let info = model
.tensors
.get(tensor_name)
.ok_or_else(|| GgufError::TensorNotFound(tensor_name.to_string()))?;
let offset = (model.tensor_data_offset + info.offset) as usize;
// Calculate tensor size
let n_elements = info.dimensions.iter().product::<u64>() as usize;
// Dequantize to f32
let tensor_data = &data[offset..];
let dequantized = Self::dequantize(tensor_data, info.tensor_type, n_elements)?;
Ok(Tensor::new(
dequantized,
info.dimensions.clone(),
tensor_name.to_string(),
))
}
/// Dequantize tensor data to f32
pub fn dequantize(
data: &[u8],
tensor_type: GgufTensorType,
n_elements: usize,
) -> Result<Vec<f32>, GgufError> {
match tensor_type {
GgufTensorType::F32 => dequantize_f32(data, n_elements),
GgufTensorType::F16 => dequantize_f16(data, n_elements),
GgufTensorType::Q4_0 => Ok(dequantize_q4_0(data, n_elements)),
GgufTensorType::Q4_1 => Ok(dequantize_q4_1(data, n_elements)),
GgufTensorType::Q5_0 => Ok(dequantize_q5_0(data, n_elements)),
GgufTensorType::Q5_1 => Ok(dequantize_q5_1(data, n_elements)),
GgufTensorType::Q8_0 => Ok(dequantize_q8_0(data, n_elements)),
GgufTensorType::Q8_1 => Ok(dequantize_q8_1(data, n_elements)),
GgufTensorType::Q2_K => Ok(dequantize_q2_k(data, n_elements)),
GgufTensorType::Q3_K => Ok(dequantize_q3_k(data, n_elements)),
GgufTensorType::Q4_K => Ok(dequantize_q4_k(data, n_elements)),
GgufTensorType::Q5_K => Ok(dequantize_q5_k(data, n_elements)),
GgufTensorType::Q6_K => Ok(dequantize_q6_k(data, n_elements)),
}
}
}
// Dequantization implementations
/// Interpret raw little-endian bytes as `n_elements` f32 values.
fn dequantize_f32(data: &[u8], n_elements: usize) -> Result<Vec<f32>, GgufError> {
    let mut cursor = Cursor::new(data);
    let mut values = Vec::with_capacity(n_elements);
    while values.len() < n_elements {
        values.push(cursor.read_f32::<LittleEndian>()?);
    }
    Ok(values)
}
/// Decode `n_elements` IEEE-754 half-precision values to f32.
fn dequantize_f16(data: &[u8], n_elements: usize) -> Result<Vec<f32>, GgufError> {
    let mut cursor = Cursor::new(data);
    let mut values = Vec::with_capacity(n_elements);
    while values.len() < n_elements {
        let bits = cursor.read_u16::<LittleEndian>()?;
        values.push(half::f16::from_bits(bits).to_f32());
    }
    Ok(values)
}
/// Dequantize Q4_0 (4-bit quantization, block size 32)
/// Each block: 2 bytes (f16 scale) + 16 bytes (32 x 4-bit values)
///
/// Element layout fixed to match ggml: quant byte `j` stores element `j`
/// in its low nibble and element `j + 16` in its high nibble — the two
/// block halves are NOT interleaved. Values are unsigned nibbles
/// re-centered by -8 and scaled. Assumes `data` contains complete
/// 18-byte blocks covering `n_elements`.
fn dequantize_q4_0(data: &[u8], n_elements: usize) -> Vec<f32> {
    const BLOCK_SIZE: usize = 32;
    const HALF: usize = BLOCK_SIZE / 2;
    const BYTES_PER_BLOCK: usize = 18; // 2 (scale) + 16 (quants)
    let n_blocks = (n_elements + BLOCK_SIZE - 1) / BLOCK_SIZE;
    // Decode whole blocks, then truncate to the requested element count.
    let mut result = vec![0.0f32; n_blocks * BLOCK_SIZE];
    for block_idx in 0..n_blocks {
        let block_offset = block_idx * BYTES_PER_BLOCK;
        // Read scale (f16)
        let scale_bits = u16::from_le_bytes([data[block_offset], data[block_offset + 1]]);
        let scale = half::f16::from_bits(scale_bits).to_f32();
        let out_base = block_idx * BLOCK_SIZE;
        for j in 0..HALF {
            let byte = data[block_offset + 2 + j];
            // Convert each 4-bit value to signed (-8..=7) and scale.
            let low = (byte & 0x0F) as i32 - 8;
            let high = (byte >> 4) as i32 - 8;
            result[out_base + j] = low as f32 * scale;
            result[out_base + j + HALF] = high as f32 * scale;
        }
    }
    result.truncate(n_elements);
    result
}
/// Dequantize Q4_1 (4-bit with min, block size 32)
///
/// Block layout (20 bytes): f16 scale, f16 min, 16 packed quant bytes.
/// Element layout fixed to match ggml: quant byte `j` holds element `j`
/// (low nibble) and element `j + 16` (high nibble);
/// value = nibble * scale + min. Assumes complete 20-byte blocks.
fn dequantize_q4_1(data: &[u8], n_elements: usize) -> Vec<f32> {
    const BLOCK_SIZE: usize = 32;
    const HALF: usize = BLOCK_SIZE / 2;
    const BYTES_PER_BLOCK: usize = 20; // 2 (scale) + 2 (min) + 16 (quants)
    let n_blocks = (n_elements + BLOCK_SIZE - 1) / BLOCK_SIZE;
    let mut result = vec![0.0f32; n_blocks * BLOCK_SIZE];
    for block_idx in 0..n_blocks {
        let block_offset = block_idx * BYTES_PER_BLOCK;
        let scale_bits = u16::from_le_bytes([data[block_offset], data[block_offset + 1]]);
        let scale = half::f16::from_bits(scale_bits).to_f32();
        let min_bits = u16::from_le_bytes([data[block_offset + 2], data[block_offset + 3]]);
        let min = half::f16::from_bits(min_bits).to_f32();
        let out_base = block_idx * BLOCK_SIZE;
        for j in 0..HALF {
            let byte = data[block_offset + 4 + j];
            result[out_base + j] = (byte & 0x0F) as f32 * scale + min;
            result[out_base + j + HALF] = (byte >> 4) as f32 * scale + min;
        }
    }
    result.truncate(n_elements);
    result
}
/// Dequantize Q5_0 (5-bit quantization)
///
/// Block layout (22 bytes): f16 scale, u32 of 32 high bits, 16 packed
/// low-nibble bytes. Element layout fixed to match ggml: element `j` is
/// the low nibble of byte `j` with high bit `(qh >> j) & 1`; element
/// `j + 16` is the high nibble of byte `j` with high bit
/// `(qh >> (j + 16)) & 1`. The 5-bit value is re-centered by -16.
/// Assumes complete 22-byte blocks.
fn dequantize_q5_0(data: &[u8], n_elements: usize) -> Vec<f32> {
    const BLOCK_SIZE: usize = 32;
    const HALF: usize = BLOCK_SIZE / 2;
    const BYTES_PER_BLOCK: usize = 22; // 2 (scale) + 4 (high bits) + 16 (low nibbles)
    let n_blocks = (n_elements + BLOCK_SIZE - 1) / BLOCK_SIZE;
    let mut result = vec![0.0f32; n_blocks * BLOCK_SIZE];
    for block_idx in 0..n_blocks {
        let block_offset = block_idx * BYTES_PER_BLOCK;
        let scale_bits = u16::from_le_bytes([data[block_offset], data[block_offset + 1]]);
        let scale = half::f16::from_bits(scale_bits).to_f32();
        let qh = u32::from_le_bytes([
            data[block_offset + 2],
            data[block_offset + 3],
            data[block_offset + 4],
            data[block_offset + 5],
        ]);
        let out_base = block_idx * BLOCK_SIZE;
        for j in 0..HALF {
            let byte = data[block_offset + 6 + j];
            let lo5 = ((byte & 0x0F) as u32 | (((qh >> j) & 1) << 4)) as i32 - 16;
            let hi5 = ((byte >> 4) as u32 | (((qh >> (j + HALF)) & 1) << 4)) as i32 - 16;
            result[out_base + j] = lo5 as f32 * scale;
            result[out_base + j + HALF] = hi5 as f32 * scale;
        }
    }
    result.truncate(n_elements);
    result
}
/// Dequantize Q5_1
fn dequantize_q5_1(data: &[u8], n_elements: usize) -> Vec<f32> {
// Similar to Q5_0 but with min value
dequantize_q5_0(data, n_elements) // Simplified for now
}
/// Dequantize Q8_0 (8-bit quantization, block size 32).
///
/// Block layout (34 bytes): 2-byte f16 scale followed by 32 signed int8
/// quants; value = quant * scale.
fn dequantize_q8_0(data: &[u8], n_elements: usize) -> Vec<f32> {
    const BLOCK_SIZE: usize = 32;
    const BYTES_PER_BLOCK: usize = 34; // 2 (scale) + 32 (quants)
    let n_blocks = (n_elements + BLOCK_SIZE - 1) / BLOCK_SIZE;
    let mut out = Vec::with_capacity(n_elements);
    'blocks: for block in 0..n_blocks {
        let base = block * BYTES_PER_BLOCK;
        let d = half::f16::from_bits(u16::from_le_bytes([data[base], data[base + 1]])).to_f32();
        for i in 0..BLOCK_SIZE {
            // Stop as soon as the requested element count is reached so a
            // partial trailing block never over-reads.
            if out.len() == n_elements {
                break 'blocks;
            }
            let quant = data[base + 2 + i] as i8;
            out.push(quant as f32 * d);
        }
    }
    out.truncate(n_elements);
    out
}
/// Dequantize Q8_1
fn dequantize_q8_1(data: &[u8], n_elements: usize) -> Vec<f32> {
dequantize_q8_0(data, n_elements) // Simplified
}
// K-quant dequantization (simplified implementations)
/// Placeholder Q2_K dequantization.
///
/// NOTE(review): delegates to the Q4_0 reader, but a real Q2_K block is
/// 84 bytes per 256 elements, not 18 bytes per 32 — both the byte stride
/// and the decoded values are wrong for genuine Q2_K data.
/// TODO: implement the actual K-quant super-block layout.
fn dequantize_q2_k(data: &[u8], n_elements: usize) -> Vec<f32> {
    // Simplified: treat as Q4_0 for now
    dequantize_q4_0(data, n_elements)
}
/// Placeholder Q3_K dequantization.
///
/// NOTE(review): real Q3_K blocks are 110 bytes per 256 elements; the
/// Q4_0 fallback misreads both stride and values. TODO: implement.
fn dequantize_q3_k(data: &[u8], n_elements: usize) -> Vec<f32> {
    dequantize_q4_0(data, n_elements)
}
/// Placeholder Q4_K dequantization.
///
/// NOTE(review): real Q4_K blocks are 144 bytes per 256 elements; the
/// Q4_0 fallback misreads both stride and values. TODO: implement.
fn dequantize_q4_k(data: &[u8], n_elements: usize) -> Vec<f32> {
    // Full Q4_K implementation would be more complex
    dequantize_q4_0(data, n_elements)
}
/// Placeholder Q5_K dequantization.
///
/// NOTE(review): real Q5_K blocks are 176 bytes per 256 elements; the
/// Q5_0 fallback misreads both stride and values. TODO: implement.
fn dequantize_q5_k(data: &[u8], n_elements: usize) -> Vec<f32> {
    dequantize_q5_0(data, n_elements)
}
/// Placeholder Q6_K dequantization.
///
/// NOTE(review): real Q6_K blocks are 210 bytes per 256 elements; the
/// Q5_0 fallback misreads both stride and values. TODO: implement.
fn dequantize_q6_k(data: &[u8], n_elements: usize) -> Vec<f32> {
    dequantize_q5_0(data, n_elements)
}
#[cfg(test)]
mod tests {
    use super::*;
    #[test]
    fn test_gguf_magic() {
        // "GGUF" in little-endian byte order.
        assert_eq!(GGUF_MAGIC, 0x46554747);
    }
    #[test]
    fn test_tensor_type_block_sizes() {
        assert_eq!(GgufTensorType::Q4_0.block_size(), 32);
        assert_eq!(GgufTensorType::Q8_0.block_size(), 32);
        assert_eq!(GgufTensorType::Q4_K.block_size(), 256);
    }
    #[test]
    fn test_dequantize_q4_0() {
        // One Q4_0 block: f16 scale followed by 16 packed quant bytes.
        let mut data = vec![0u8; 18];
        // Set scale to 1.0 in f16 (0x3C00, little-endian).
        data[0] = 0x00;
        data[1] = 0x3C;
        // First quant byte: low nibble = 1, high nibble = 0.
        data[2] = 0x01;
        let result = dequantize_q4_0(&data, 32);
        assert_eq!(result.len(), 32);
        // Nibble 1 dequantizes to (1 - 8) * 1.0.
        assert_eq!(result[0], -7.0);
        // Zero nibbles dequantize to (0 - 8) * 1.0.
        assert_eq!(result[16], -8.0);
        assert_eq!(result[31], -8.0);
    }
}

View File

@@ -0,0 +1,227 @@
//! Universal model loader trait and metadata
use crate::error::{ModelError, SparseInferenceError};
use crate::model::gguf::{GgufModel, GgufParser, GgufValue};
type Result<T> = std::result::Result<T, SparseInferenceError>;
use std::collections::HashMap;
use std::path::Path;
/// Universal model loader trait
pub trait ModelLoader {
    /// Concrete model type produced by this loader.
    type Model;
    /// NOTE(review): declared but never used — the methods below return
    /// the crate-wide `Result` alias (`SparseInferenceError`), not
    /// `Self::Error`. Consider removing or wiring it in.
    type Error: std::error::Error;
    /// Load model from bytes
    fn load(data: &[u8]) -> Result<Self::Model>;
    /// Load model from file path (native only)
    ///
    /// Reads the whole file into memory and delegates to `Self::load`;
    /// compiled out on wasm32 targets.
    #[cfg(not(target_arch = "wasm32"))]
    fn load_file(path: &Path) -> Result<Self::Model> {
        let data = std::fs::read(path).map_err(|e| {
            SparseInferenceError::Model(ModelError::LoadFailed(format!(
                "Failed to read file: {}",
                e
            )))
        })?;
        Self::load(&data)
    }
    /// Get model metadata
    fn metadata(&self) -> &ModelMetadata;
}
/// Model metadata extracted from GGUF or other formats
#[derive(Debug, Clone)]
pub struct ModelMetadata {
    pub architecture: ModelArchitecture,
    pub hidden_size: usize, // embedding / residual stream width
    pub intermediate_size: usize, // FFN inner width (0 when absent from metadata)
    pub num_layers: usize,
    pub num_heads: usize,
    pub num_key_value_heads: Option<usize>, // None when the model does not use GQA metadata
    pub vocab_size: usize,
    pub max_position_embeddings: usize, // context length
    pub quantization: Option<QuantizationType>, // currently left None by from_gguf
    pub rope_theta: Option<f32>, // RoPE frequency base, if present in metadata
    pub rope_scaling: Option<RopeScaling>, // currently left None by from_gguf
}
impl ModelMetadata {
    /// Extract metadata from GGUF model
    ///
    /// GGUF stores per-architecture keys under an `<arch>.` prefix (e.g.
    /// `llama.embedding_length`), discovered via `general.architecture`.
    /// Missing optional keys fall back to defaults; required keys
    /// (embedding length, block count, head count) produce errors.
    pub fn from_gguf(model: &GgufModel) -> Result<Self> {
        let arch_name = Self::get_string(&model.metadata, "general.architecture")
            .map_err(|e| SparseInferenceError::Model(ModelError::InvalidConfig(e)))?;
        let architecture = ModelArchitecture::from_str(&arch_name)
            .map_err(|e| SparseInferenceError::Model(ModelError::InvalidConfig(e)))?;
        // The architecture name doubles as the metadata key prefix
        // (was a useless `format!("{}", arch_name)`).
        let prefix = arch_name;
        Ok(Self {
            architecture,
            hidden_size: Self::get_u32(&model.metadata, &format!("{}.embedding_length", prefix))?
                as usize,
            intermediate_size: Self::get_u32(
                &model.metadata,
                &format!("{}.feed_forward_length", prefix),
            )
            .unwrap_or(0) as usize,
            num_layers: Self::get_u32(&model.metadata, &format!("{}.block_count", prefix))?
                as usize,
            num_heads: Self::get_u32(&model.metadata, &format!("{}.attention.head_count", prefix))?
                as usize,
            num_key_value_heads: Self::get_u32(
                &model.metadata,
                &format!("{}.attention.head_count_kv", prefix),
            )
            .ok()
            .map(|v| v as usize),
            // The token list is an array, so the direct lookup fails and
            // the array length is used instead; 32000 as a last resort.
            vocab_size: Self::get_u32(&model.metadata, "tokenizer.ggml.tokens")
                .or_else(|_| Self::get_array_len(&model.metadata, "tokenizer.ggml.tokens"))
                .unwrap_or(32000) as usize,
            max_position_embeddings: Self::get_u32(
                &model.metadata,
                &format!("{}.context_length", prefix),
            )
            .unwrap_or(2048) as usize,
            quantization: None, // Determined from tensor types
            rope_theta: Self::get_f32(&model.metadata, &format!("{}.rope.freq_base", prefix)).ok(),
            rope_scaling: None,
        })
    }
    /// Fetch a required string metadata value.
    fn get_string(
        metadata: &HashMap<String, GgufValue>,
        key: &str,
    ) -> std::result::Result<String, String> {
        match metadata.get(key) {
            Some(GgufValue::String(s)) => Ok(s.clone()),
            _ => Err(format!("Missing metadata: {}", key)),
        }
    }
    /// Fetch a required integer metadata value.
    ///
    /// Generalized to accept any GGUF integer width (u8..u64, i8..i64)
    /// via `GgufValue::as_u32`; the old match rejected e.g. `Uint16`
    /// values that writers legitimately emit.
    fn get_u32(
        metadata: &HashMap<String, GgufValue>,
        key: &str,
    ) -> std::result::Result<u32, String> {
        metadata
            .get(key)
            .and_then(|v| v.as_u32())
            .ok_or_else(|| format!("Missing metadata: {}", key))
    }
    /// Fetch a required float metadata value; accepts any numeric GGUF
    /// type via `GgufValue::as_f32`.
    fn get_f32(
        metadata: &HashMap<String, GgufValue>,
        key: &str,
    ) -> std::result::Result<f32, String> {
        metadata
            .get(key)
            .and_then(|v| v.as_f32())
            .ok_or_else(|| format!("Missing metadata: {}", key))
    }
    /// Length of an array-valued metadata entry (e.g. the token list).
    fn get_array_len(
        metadata: &HashMap<String, GgufValue>,
        key: &str,
    ) -> std::result::Result<u32, String> {
        match metadata.get(key) {
            Some(GgufValue::Array(arr)) => Ok(arr.len() as u32),
            _ => Err(format!("Missing metadata: {}", key)),
        }
    }
}
/// Model architecture type
///
/// Identifies which runner/layout to use for a loaded model.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ModelArchitecture {
    Llama,
    LFM2,
    Bert,
    Mistral,
    Qwen,
    Phi,
    Gemma,
}
impl ModelArchitecture {
    /// Parse a GGUF `general.architecture` string (case-insensitive),
    /// accepting the common aliases each family uses.
    pub fn from_str(s: &str) -> std::result::Result<Self, String> {
        let arch = match s.to_lowercase().as_str() {
            "llama" => Self::Llama,
            "lfm" | "lfm2" => Self::LFM2,
            "bert" => Self::Bert,
            "mistral" => Self::Mistral,
            "qwen" | "qwen2" => Self::Qwen,
            "phi" | "phi2" | "phi3" => Self::Phi,
            "gemma" | "gemma2" => Self::Gemma,
            _ => return Err(format!("Unsupported architecture: {}", s)),
        };
        Ok(arch)
    }
}
/// Quantization type
///
/// Mirror of the GGUF tensor encodings a model's weights may use.
/// NOTE(review): `ModelMetadata::from_gguf` currently always leaves
/// `quantization` as `None` — confirm where this is populated.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum QuantizationType {
    F32,
    F16,
    Q4_0,
    Q4_1,
    Q5_0,
    Q5_1,
    Q8_0,
    Q8_1,
    Q4_K,
    Q5_K,
    Q6_K,
}
/// RoPE scaling configuration
#[derive(Debug, Clone)]
pub struct RopeScaling {
    pub scaling_type: String, // scaling method name — assumes GGUF-style naming; TODO confirm
    pub factor: f32, // context-extension scale factor
}
impl Default for ModelMetadata {
    /// Fallback configuration whose values match a Llama-7B-style model
    /// (4096 hidden, 11008 FFN, 32 layers/heads, 32k vocab, 2048 context).
    fn default() -> Self {
        Self {
            architecture: ModelArchitecture::Llama,
            hidden_size: 4096,
            intermediate_size: 11008,
            num_layers: 32,
            num_heads: 32,
            num_key_value_heads: None,
            vocab_size: 32000,
            max_position_embeddings: 2048,
            quantization: None,
            rope_theta: Some(10000.0),
            rope_scaling: None,
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    // Architecture names are matched case-insensitively.
    #[test]
    fn test_architecture_parsing() {
        assert_eq!(
            ModelArchitecture::from_str("llama").unwrap(),
            ModelArchitecture::Llama
        );
        assert_eq!(
            ModelArchitecture::from_str("BERT").unwrap(),
            ModelArchitecture::Bert
        );
    }
    // Defaults mirror a Llama-7B-style configuration.
    #[test]
    fn test_default_metadata() {
        let metadata = ModelMetadata::default();
        assert_eq!(metadata.architecture, ModelArchitecture::Llama);
        assert_eq!(metadata.hidden_size, 4096);
    }
}

View File

@@ -0,0 +1,13 @@
//! Model loading and inference infrastructure
// GGUF container parsing: header, metadata, tensor info, dequantization.
pub mod gguf;
// Universal loader trait plus architecture/metadata types.
pub mod loader;
// Per-architecture model runners with sparse-FFN support.
pub mod runners;
// Shared tensor/input/output/config types.
pub mod types;
pub use gguf::{GgufHeader, GgufModel, GgufParser, GgufTensorInfo, GgufTensorType, GgufValue};
pub use loader::{ModelArchitecture, ModelLoader, ModelMetadata, QuantizationType};
pub use runners::{
    BertModel, LFM2Model, LlamaLayer, LlamaMLP, LlamaModel, ModelRunner, SparseModel,
};
pub use types::{InferenceConfig, ModelInput, ModelOutput, Tensor};

View File

@@ -0,0 +1,532 @@
//! Model runners for different architectures with sparse inference support
use crate::error::SparseInferenceError;
use crate::model::loader::{ModelLoader, ModelMetadata};
use crate::model::types::{CalibrationStats, InferenceConfig, ModelInput, ModelOutput, Tensor};
use crate::ops::{silu, Embedding, LayerNorm, Linear, RMSNorm};
use std::collections::HashMap;
type Result<T> = std::result::Result<T, SparseInferenceError>;
/// Trait for running inference on models
pub trait ModelRunner {
    /// Forward pass with optional sparse computation
    fn forward(&self, input: &ModelInput, config: &InferenceConfig) -> Result<ModelOutput>;
    /// Get predictor for a specific layer (if available)
    fn get_predictor(&self, layer_idx: usize) -> Option<&LowRankPredictor>;
    /// Calibrate predictors with sample data
    ///
    /// NOTE(review): implementations in this file currently return
    /// placeholder statistics without inspecting `samples`.
    fn calibrate(&mut self, samples: &[ModelInput]) -> Result<CalibrationStats>;
    /// Get model metadata
    fn metadata(&self) -> &ModelMetadata;
}
/// Low-rank predictor for neuron activation prediction
///
/// Approximates a (d x m) scoring matrix as the two-step product
/// `V^T · (U^T · x)` with rank `r`, so a prediction costs
/// O(r·(d + m)) instead of O(d·m).
#[derive(Debug, Clone)]
pub struct LowRankPredictor {
    pub u: Vec<Vec<f32>>, // U matrix (d x r): u[j][i] = input row j, rank component i
    pub v: Vec<Vec<f32>>, // V matrix (r x m): v[j][i] = rank row j, output neuron i
    pub rank: usize,
}
impl LowRankPredictor {
    /// Create a zero-initialized predictor for the given dimensions.
    pub fn new(input_dim: usize, output_dim: usize, rank: usize) -> Self {
        Self {
            u: vec![vec![0.0; rank]; input_dim],
            v: vec![vec![0.0; output_dim]; rank],
            rank,
        }
    }
    /// Predict top-k active neurons
    ///
    /// Returns the indices of the `k` highest-scoring neurons (stable
    /// order among ties). Fixed to sort with `f32::total_cmp`: the old
    /// `partial_cmp(..).unwrap()` panicked if any score was NaN.
    pub fn predict_active(&self, input: &[f32], k: usize) -> Vec<usize> {
        let scores = self.forward(input);
        let mut indices: Vec<usize> = (0..scores.len()).collect();
        // total_cmp is a total order over f32 (NaN-safe), so a degenerate
        // predictor can no longer panic the sort.
        indices.sort_by(|&a, &b| scores[b].total_cmp(&scores[a]));
        indices.truncate(k);
        indices
    }
    /// Compute the m-dimensional score vector `V^T · (U^T · input)`.
    ///
    /// Bounds checks make mismatched matrix/input sizes degrade to
    /// partial sums instead of panicking.
    fn forward(&self, input: &[f32]) -> Vec<f32> {
        // First: U^T · input (r-dimensional)
        let mut hidden = vec![0.0; self.rank];
        for i in 0..self.rank {
            for (j, u_ji) in self.u.iter().enumerate() {
                if j < input.len() && i < u_ji.len() {
                    hidden[i] += u_ji[i] * input[j];
                }
            }
        }
        // Second: V · hidden (m-dimensional)
        let output_dim = self.v.first().map(|v| v.len()).unwrap_or(0);
        let mut output = vec![0.0; output_dim];
        for i in 0..output_dim {
            for (j, &h) in hidden.iter().enumerate() {
                if j < self.v.len() && i < self.v[j].len() {
                    output[i] += self.v[j][i] * h;
                }
            }
        }
        output
    }
}
// ============================================================================
// Llama Model
// ============================================================================
/// Llama model for sparse inference
///
/// Decoder-only stack: token embedding, `layers` decoder blocks, final
/// RMSNorm, and an optional LM head.
pub struct LlamaModel {
    pub metadata: ModelMetadata,
    pub layers: Vec<LlamaLayer>,
    pub embed_tokens: Embedding,
    pub norm: RMSNorm,
    pub lm_head: Option<Linear>, // when None, forward returns final hidden states as "logits"
}
/// One pre-norm Llama decoder block with an optional FFN predictor.
pub struct LlamaLayer {
    pub input_layernorm: RMSNorm,
    pub self_attn: LlamaAttention,
    pub post_attention_layernorm: RMSNorm,
    pub mlp: LlamaMLP,
    pub predictor: Option<LowRankPredictor>, // predicts active MLP neurons for the sparse path
}
/// Q/K/V/O projections for Llama self-attention.
pub struct LlamaAttention {
    pub q_proj: Linear,
    pub k_proj: Linear,
    pub v_proj: Linear,
    pub o_proj: Linear,
    pub num_heads: usize,
    pub head_dim: usize,
}
/// SwiGLU feed-forward block: down(silu(gate(x)) ⊙ up(x)).
pub struct LlamaMLP {
    pub gate_proj: Linear, // W1 for SwiGLU gate
    pub up_proj: Linear,   // W3 for SwiGLU up
    pub down_proj: Linear, // W2 for down projection
}
impl LlamaMLP {
    /// Standard forward pass (dense)
    ///
    /// SwiGLU feed-forward: `down( silu(gate(x)) ⊙ up(x) )`.
    pub fn forward(&self, x: &[f32]) -> Vec<f32> {
        let gate = self.gate_proj.forward(x);
        let up = self.up_proj.forward(x);
        // SwiGLU: silu(gate) ⊙ up
        let hidden: Vec<f32> = gate
            .iter()
            .zip(up.iter())
            .map(|(&g, &u)| silu(g) * u)
            .collect();
        self.down_proj.forward(&hidden)
    }
    /// Sparse forward pass using predictor
    ///
    /// Computes gate/up rows only for `active_neurons`, producing a
    /// compacted `hidden` of length `active_neurons.len()`.
    /// NOTE(review): correctness depends on `sparse_matmul_full`
    /// interpreting `hidden[i]` as the value of intermediate neuron
    /// `active_neurons[i]` — confirm against its implementation.
    pub fn forward_sparse(&self, x: &[f32], active_neurons: &[usize]) -> Vec<f32> {
        // Only compute for active neurons in intermediate layer
        let gate = sparse_matmul(&self.gate_proj, x, active_neurons);
        let up = sparse_matmul(&self.up_proj, x, active_neurons);
        // SwiGLU on active neurons only
        let hidden: Vec<f32> = gate
            .iter()
            .zip(up.iter())
            .map(|(&g, &u)| silu(g) * u)
            .collect();
        // Sparse down projection
        sparse_matmul_full(&self.down_proj, &hidden, active_neurons)
    }
}
impl ModelRunner for LlamaModel {
    /// Dense/sparse forward pass over all decoder layers.
    ///
    /// Pre-norm residual architecture: each layer applies
    /// `x += attn(norm(x))` then `x += mlp(norm(x))`. When
    /// `config.use_sparse_ffn` is set and the layer has a predictor, only
    /// the predicted top-k intermediate neurons are computed in the MLP.
    fn forward(&self, input: &ModelInput, config: &InferenceConfig) -> Result<ModelOutput> {
        // Embed tokens
        let mut hidden_states = self.embed_tokens.forward(&input.input_ids);
        let mut all_hidden_states = if config.output_hidden_states {
            Some(Vec::new())
        } else {
            None
        };
        // Process each layer (was `.enumerate()` with an unused `idx`).
        for layer in &self.layers {
            // Recorded states are each layer's *input*; the final layer's
            // output is not appended. NOTE(review): confirm callers expect
            // n entries rather than n + 1.
            if let Some(ref mut states) = all_hidden_states {
                states.push(hidden_states.clone());
            }
            // Layer norm
            let normed = layer.input_layernorm.forward(&hidden_states);
            // Self-attention (simplified, no KV cache)
            let attn_output = layer.self_attn.forward(&normed);
            // Residual
            hidden_states = add_vectors(&hidden_states, &attn_output);
            // Post-attention norm
            let normed = layer.post_attention_layernorm.forward(&hidden_states);
            // MLP with optional predictor-guided sparsity
            let mlp_output = if config.use_sparse_ffn {
                if let Some(ref predictor) = layer.predictor {
                    // Active-neuron budget: explicit override, or derived
                    // from the configured sparsity fraction.
                    let k = config.active_neurons_per_layer.unwrap_or(
                        (self.metadata.intermediate_size as f32 * (1.0 - config.sparsity)) as usize,
                    );
                    let active = predictor.predict_active(&normed, k);
                    layer.mlp.forward_sparse(&normed, &active)
                } else {
                    layer.mlp.forward(&normed)
                }
            } else {
                layer.mlp.forward(&normed)
            };
            // Residual
            hidden_states = add_vectors(&hidden_states, &mlp_output);
        }
        // Final norm
        hidden_states = self.norm.forward(&hidden_states);
        // LM head: project to vocabulary logits when present, otherwise
        // surface the final hidden states directly.
        let logits = if let Some(ref lm_head) = self.lm_head {
            lm_head.forward(&hidden_states)
        } else {
            hidden_states
        };
        Ok(ModelOutput::new(logits).with_hidden_states(all_hidden_states.unwrap_or_default()))
    }
    /// Predictor attached to layer `layer_idx`, if any.
    fn get_predictor(&self, layer_idx: usize) -> Option<&LowRankPredictor> {
        self.layers.get(layer_idx)?.predictor.as_ref()
    }
    /// Placeholder: reports fixed statistics without collecting real
    /// activation data yet.
    fn calibrate(&mut self, samples: &[ModelInput]) -> Result<CalibrationStats> {
        // Placeholder: would collect activation statistics
        Ok(CalibrationStats {
            num_samples: samples.len(),
            average_sparsity: 0.9,
            layer_stats: HashMap::new(),
        })
    }
    fn metadata(&self) -> &ModelMetadata {
        &self.metadata
    }
}
impl LlamaAttention {
    /// Simplified attention stub: projects Q/K/V but currently only feeds
    /// the query projection through the output projection.
    ///
    /// NOTE(review): `k` and `v` are computed and then discarded (dead
    /// work plus unused-variable warnings); no scaled dot-product
    /// attention is performed yet. TODO: real attention with a KV cache.
    pub fn forward(&self, hidden_states: &[f32]) -> Vec<f32> {
        // Simplified: full attention without KV cache
        let q = self.q_proj.forward(hidden_states);
        let k = self.k_proj.forward(hidden_states);
        let v = self.v_proj.forward(hidden_states);
        // Placeholder: would do scaled dot-product attention
        self.o_proj.forward(&q)
    }
}
// ============================================================================
// LFM2 Model (Liquid AI)
// ============================================================================
/// LFM2-style model: embedding plus stacked conv/attention/FFN layers.
pub struct LFM2Model {
    pub metadata: ModelMetadata,
    pub embedding: Embedding,
    pub layers: Vec<LFM2Layer>,
    pub pooler: Option<Pooler>, // optional pooling head (not used by forward)
}
/// One LFM2 block: gated conv, grouped-query attention, sparse FFN.
pub struct LFM2Layer {
    pub gated_conv: GatedConv1d,
    pub attention: GroupedQueryAttention,
    pub ffn: SparseFfn,
    pub norm: LayerNorm, // applied after both residual additions (post-norm)
}
/// Gated 1-D convolution (placeholder — its forward is currently identity).
pub struct GatedConv1d {
    pub weight: Vec<Vec<f32>>,
    pub gate: Linear,
}
/// Grouped-query attention projections (stub: only o_proj used in forward).
pub struct GroupedQueryAttention {
    pub q_proj: Linear,
    pub k_proj: Linear,
    pub v_proj: Linear,
    pub o_proj: Linear,
    pub num_groups: usize,
}
/// Two-layer FFN with an optional activation predictor for sparsity.
pub struct SparseFfn {
    pub w1: Linear,
    pub w2: Linear,
    pub predictor: Option<LowRankPredictor>,
}
impl ModelRunner for LFM2Model {
    /// Forward pass: gated conv → GQA → sparse FFN per layer.
    ///
    /// Note the layer norm runs *after* both residual additions (post-norm
    /// arrangement, unlike LlamaModel's pre-norm).
    fn forward(&self, input: &ModelInput, config: &InferenceConfig) -> Result<ModelOutput> {
        let mut hidden = self.embedding.forward(&input.input_ids);
        for layer in &self.layers {
            // Gated convolution for local context
            hidden = layer.gated_conv.forward(&hidden);
            // Grouped query attention
            let attn_out = layer.attention.forward(&hidden);
            hidden = add_vectors(&hidden, &attn_out);
            // Sparse FFN
            let ffn_out = layer.ffn.forward(&hidden, config);
            hidden = add_vectors(&hidden, &ffn_out);
            hidden = layer.norm.forward(&hidden);
        }
        Ok(ModelOutput::new(hidden))
    }
    /// Predictor attached to layer `layer_idx`'s FFN, if any.
    fn get_predictor(&self, layer_idx: usize) -> Option<&LowRankPredictor> {
        self.layers.get(layer_idx)?.ffn.predictor.as_ref()
    }
    /// Placeholder: fixed statistics; no activation data is collected yet.
    fn calibrate(&mut self, _samples: &[ModelInput]) -> Result<CalibrationStats> {
        Ok(CalibrationStats {
            num_samples: 0,
            average_sparsity: 0.9,
            layer_stats: HashMap::new(),
        })
    }
    fn metadata(&self) -> &ModelMetadata {
        &self.metadata
    }
}
impl GatedConv1d {
    /// Identity pass-through: the convolution is not implemented yet, so
    /// the input is returned unchanged as an owned vector.
    pub fn forward(&self, x: &[f32]) -> Vec<f32> {
        x.to_owned()
    }
}
impl GroupedQueryAttention {
    /// Attention stub: applies only the output projection to `x`; the
    /// q/k/v projections and `num_groups` are not used yet. TODO: real GQA.
    pub fn forward(&self, x: &[f32]) -> Vec<f32> {
        self.o_proj.forward(x)
    }
}
impl SparseFfn {
    /// Two-layer FFN with optional predictor-gated sparsity.
    ///
    /// NOTE(review): in the sparse path `self.w1.forward(x)` is still a
    /// fully dense matmul — only the `w2` application is sparsified, so
    /// the predictor does not save the w1 compute it is meant to skip.
    pub fn forward(&self, x: &[f32], config: &InferenceConfig) -> Vec<f32> {
        if config.use_sparse_ffn {
            if let Some(ref predictor) = self.predictor {
                // Active-neuron budget derived from the sparsity fraction.
                let k = (self.w1.out_features as f32 * (1.0 - config.sparsity)) as usize;
                let active = predictor.predict_active(x, k);
                return sparse_matmul_full(&self.w2, &self.w1.forward(x), &active);
            }
        }
        self.w2.forward(&self.w1.forward(x))
    }
}
// ============================================================================
// BERT Model
// ============================================================================
/// BERT-style encoder: embeddings, encoder layers, optional pooler.
pub struct BertModel {
    pub metadata: ModelMetadata,
    pub embeddings: BertEmbeddings,
    pub encoder: Vec<BertLayer>,
    pub pooler: Option<Pooler>, // optional pooling head (not used by forward)
}
/// Embedding tables plus layer norm.
/// NOTE(review): only `word_embeddings` is applied by the current forward.
pub struct BertEmbeddings {
    pub word_embeddings: Embedding,
    pub position_embeddings: Embedding,
    pub token_type_embeddings: Embedding,
    pub layer_norm: LayerNorm,
}
/// One post-norm BERT encoder block.
pub struct BertLayer {
    pub attention: MultiHeadAttention,
    pub intermediate: Linear, // FFN up-projection
    pub output: Linear,       // FFN down-projection
    pub layer_norm1: LayerNorm, // after attention residual
    pub layer_norm2: LayerNorm, // after FFN residual
}
/// Multi-head attention projections (stub: only o_proj used in forward).
pub struct MultiHeadAttention {
    pub q_proj: Linear,
    pub k_proj: Linear,
    pub v_proj: Linear,
    pub o_proj: Linear,
    pub num_heads: usize,
}
/// Single dense layer used as a pooling head.
pub struct Pooler {
    pub dense: Linear,
}
impl ModelRunner for BertModel {
    /// Post-norm encoder forward pass: for each layer,
    /// `x = norm1(x + attn(x))` then `x = norm2(x + ffn(x))`.
    /// `config` is currently unused (no sparse path for BERT).
    fn forward(&self, input: &ModelInput, config: &InferenceConfig) -> Result<ModelOutput> {
        let mut hidden = self.embeddings.forward(&input.input_ids);
        for layer in &self.encoder {
            let attn_out = layer.attention.forward(&hidden);
            hidden = layer.layer_norm1.forward(&add_vectors(&hidden, &attn_out));
            let intermediate = layer.intermediate.forward(&hidden);
            let output = layer.output.forward(&intermediate);
            hidden = layer.layer_norm2.forward(&add_vectors(&hidden, &output));
        }
        Ok(ModelOutput::new(hidden))
    }
    /// BERT layers carry no activation predictors.
    fn get_predictor(&self, _layer_idx: usize) -> Option<&LowRankPredictor> {
        None
    }
    /// Placeholder: fixed statistics; nothing is calibrated for BERT.
    fn calibrate(&mut self, _samples: &[ModelInput]) -> Result<CalibrationStats> {
        Ok(CalibrationStats {
            num_samples: 0,
            average_sparsity: 0.0,
            layer_stats: HashMap::new(),
        })
    }
    fn metadata(&self) -> &ModelMetadata {
        &self.metadata
    }
}
impl BertEmbeddings {
    /// Token embedding lookup.
    ///
    /// NOTE(review): only `word_embeddings` is applied — the position and
    /// token-type embeddings and the layer norm are never used, which
    /// diverges from standard BERT embedding behavior. TODO: confirm.
    pub fn forward(&self, input_ids: &[u64]) -> Vec<f32> {
        self.word_embeddings.forward(input_ids)
    }
}
impl MultiHeadAttention {
    /// Applies only the output projection to `x`.
    ///
    /// NOTE(review): no q/k/v projections or attention-score computation
    /// happen here — this is a placeholder, not full attention; verify
    /// before relying on BERT outputs.
    pub fn forward(&self, x: &[f32]) -> Vec<f32> {
        self.o_proj.forward(x)
    }
}
// ============================================================================
// Unified Model Wrapper
// ============================================================================
/// Unified wrapper over the supported architectures; every `ModelRunner`
/// call is dispatched to the wrapped variant.
pub enum SparseModel {
    /// Llama-family decoder model.
    Llama(LlamaModel),
    /// LFM2 model.
    LFM2(LFM2Model),
    /// BERT encoder model.
    Bert(BertModel),
}
impl ModelRunner for SparseModel {
    /// Delegates `forward` to the wrapped architecture.
    fn forward(&self, input: &ModelInput, config: &InferenceConfig) -> Result<ModelOutput> {
        match self {
            Self::Llama(m) => m.forward(input, config),
            Self::LFM2(m) => m.forward(input, config),
            Self::Bert(m) => m.forward(input, config),
        }
    }
    /// Delegates predictor lookup to the wrapped architecture.
    fn get_predictor(&self, layer_idx: usize) -> Option<&LowRankPredictor> {
        match self {
            Self::Llama(m) => m.get_predictor(layer_idx),
            Self::LFM2(m) => m.get_predictor(layer_idx),
            Self::Bert(m) => m.get_predictor(layer_idx),
        }
    }
    /// Delegates calibration to the wrapped architecture.
    fn calibrate(&mut self, samples: &[ModelInput]) -> Result<CalibrationStats> {
        match self {
            Self::Llama(m) => m.calibrate(samples),
            Self::LFM2(m) => m.calibrate(samples),
            Self::Bert(m) => m.calibrate(samples),
        }
    }
    /// Delegates metadata access to the wrapped architecture.
    fn metadata(&self) -> &ModelMetadata {
        match self {
            Self::Llama(m) => m.metadata(),
            Self::LFM2(m) => m.metadata(),
            Self::Bert(m) => m.metadata(),
        }
    }
}
// ============================================================================
// Helper Functions
// ============================================================================
/// Computes only the output neurons listed in `active_cols`.
///
/// Returns one value per entry of `active_cols` (dot product of the
/// corresponding weight row with `input`, plus bias when present).
/// Out-of-range neuron indices are skipped, leaving `0.0` in that slot.
///
/// Improvement over the indexed original: the dot-product length is clamped
/// once up front and computed with a `zip`ped iterator, removing the
/// per-element `in_idx < in_features` check from the inner loop (same
/// left-to-right f32 accumulation order, so results are identical).
fn sparse_matmul(linear: &Linear, input: &[f32], active_cols: &[usize]) -> Vec<f32> {
    let mut output = vec![0.0; active_cols.len()];
    // Invariant across all rows: how many input elements participate.
    let width = input.len().min(linear.in_features);
    for (out_idx, &col_idx) in active_cols.iter().enumerate() {
        if col_idx >= linear.out_features {
            continue; // leave 0.0 for invalid neuron indices, as before
        }
        let row = &linear.weight[col_idx];
        let mut acc: f32 = input[..width]
            .iter()
            .zip(&row[..width])
            .map(|(x, w)| w * x)
            .sum();
        if let Some(ref bias) = linear.bias {
            acc += bias[col_idx];
        }
        output[out_idx] = acc;
    }
    output
}
/// Full-width matmul restricted to a sparse set of *input* columns.
///
/// Every output neuron is produced, but only the input positions listed in
/// `active_input_cols` contribute; indices outside either `input` or the
/// layer width are ignored. Bias (when present) is always added.
fn sparse_matmul_full(linear: &Linear, input: &[f32], active_input_cols: &[usize]) -> Vec<f32> {
    let mut output = vec![0.0; linear.out_features];
    for (row_idx, out_val) in output.iter_mut().enumerate() {
        let row = &linear.weight[row_idx];
        for &col in active_input_cols {
            if col < input.len() && col < linear.in_features {
                *out_val += row[col] * input[col];
            }
        }
        if let Some(ref bias) = linear.bias {
            *out_val += bias[row_idx];
        }
    }
    output
}
/// Element-wise sum of two slices; like `zip`, the result is truncated to
/// the length of the shorter input.
fn add_vectors(a: &[f32], b: &[f32]) -> Vec<f32> {
    let n = a.len().min(b.len());
    let mut sum = Vec::with_capacity(n);
    for i in 0..n {
        sum.push(a[i] + b[i]);
    }
    sum
}
#[cfg(test)]
mod tests {
    use super::*;
    // The predictor must return exactly `k` active indices for a dense
    // all-ones input (shape/selection sanity check, not a value check).
    #[test]
    fn test_low_rank_predictor() {
        let predictor = LowRankPredictor::new(128, 512, 16);
        let input = vec![1.0; 128];
        let active = predictor.predict_active(&input, 10);
        assert_eq!(active.len(), 10);
    }
    // Element-wise addition of equal-length vectors.
    #[test]
    fn test_add_vectors() {
        let a = vec![1.0, 2.0, 3.0];
        let b = vec![4.0, 5.0, 6.0];
        let result = add_vectors(&a, &b);
        assert_eq!(result, vec![5.0, 7.0, 9.0]);
    }
}

View File

@@ -0,0 +1,159 @@
//! Core types for model inference
use std::collections::HashMap;
/// Generic tensor representation: flat f32 storage plus a logical shape
/// and a human-readable name.
#[derive(Debug, Clone)]
pub struct Tensor {
    pub data: Vec<f32>,
    pub shape: Vec<u64>,
    pub name: String,
}
impl Tensor {
    /// Wraps existing data under the given shape and name (no validation).
    pub fn new(data: Vec<f32>, shape: Vec<u64>, name: String) -> Self {
        Self { data, shape, name }
    }
    /// Allocates a zero-filled tensor whose length is the product of the
    /// shape dimensions (an empty shape yields one scalar element).
    pub fn zeros(shape: Vec<u64>, name: String) -> Self {
        let element_count: u64 = shape.iter().product();
        Tensor {
            data: vec![0.0_f32; element_count as usize],
            shape,
            name,
        }
    }
    /// Total number of stored elements.
    pub fn size(&self) -> usize {
        self.data.len()
    }
    /// Replaces the shape in place.
    ///
    /// Panics when the new shape's element count differs from the current
    /// data length (no data is ever moved or copied).
    pub fn reshape(&mut self, new_shape: Vec<u64>) {
        let expected = new_shape.iter().product::<u64>() as usize;
        let actual = self.size();
        assert_eq!(
            expected, actual,
            "Reshape size mismatch: {} vs {}",
            expected, actual
        );
        self.shape = new_shape;
    }
}
/// Model input configuration: token ids plus optional mask and positions.
#[derive(Debug, Clone)]
pub struct ModelInput {
    pub input_ids: Vec<u64>,
    pub attention_mask: Option<Vec<u8>>,
    pub position_ids: Option<Vec<u64>>,
}
impl ModelInput {
    /// Builds an input carrying only token ids; optional fields start unset.
    pub fn new(input_ids: Vec<u64>) -> Self {
        ModelInput {
            attention_mask: None,
            position_ids: None,
            input_ids,
        }
    }
    /// Builder: attaches an attention mask, consuming and returning `self`.
    pub fn with_attention_mask(mut self, mask: Vec<u8>) -> Self {
        self.attention_mask = Some(mask);
        self
    }
    /// Builder: attaches explicit position ids, consuming and returning `self`.
    pub fn with_position_ids(mut self, positions: Vec<u64>) -> Self {
        self.position_ids = Some(positions);
        self
    }
    /// Number of tokens in the sequence (length of `input_ids`).
    pub fn sequence_length(&self) -> usize {
        self.input_ids.len()
    }
}
/// Model output: logits plus optional per-layer hidden states and
/// attention weights.
#[derive(Debug, Clone)]
pub struct ModelOutput {
    pub logits: Vec<f32>,
    pub hidden_states: Option<Vec<Vec<f32>>>,
    pub attentions: Option<Vec<Vec<f32>>>,
}
impl ModelOutput {
    /// Creates an output carrying only logits; optional fields start unset.
    pub fn new(logits: Vec<f32>) -> Self {
        Self {
            logits,
            hidden_states: None,
            attentions: None,
        }
    }
    /// Builder: attaches per-layer hidden states.
    pub fn with_hidden_states(mut self, states: Vec<Vec<f32>>) -> Self {
        self.hidden_states = Some(states);
        self
    }
    /// Builder: attaches per-layer attention weights.
    ///
    /// Added for API parity — the `attentions` field previously had no
    /// builder while `hidden_states` did; existing callers are unaffected.
    pub fn with_attentions(mut self, attentions: Vec<Vec<f32>>) -> Self {
        self.attentions = Some(attentions);
        self
    }
}
/// Inference configuration controlling sparsity, sampling, and which
/// auxiliary outputs are returned.
#[derive(Debug, Clone)]
pub struct InferenceConfig {
    /// Target sparsity level (0.0 = fully dense, 1.0 = maximally sparse)
    pub sparsity: f32,
    /// Activation threshold below which a neuron counts as inactive
    pub sparsity_threshold: f32,
    /// Softmax temperature applied during sampling
    pub temperature: f32,
    /// Optional top-k sampling cutoff
    pub top_k: Option<usize>,
    /// Optional nucleus (top-p) sampling cutoff
    pub top_p: Option<f32>,
    /// Whether FFN layers take the sparse code path
    pub use_sparse_ffn: bool,
    /// Optional fixed budget of active neurons per layer
    pub active_neurons_per_layer: Option<usize>,
    /// Whether per-layer hidden states are returned
    pub output_hidden_states: bool,
    /// Whether attention weights are returned
    pub output_attentions: bool,
}
impl Default for InferenceConfig {
    /// Defaults favour aggressive sparsity (0.9) with neutral sampling
    /// (temperature 1.0, no top-k/top-p) and no auxiliary outputs.
    fn default() -> Self {
        InferenceConfig {
            sparsity: 0.9,
            sparsity_threshold: 0.01,
            temperature: 1.0,
            top_k: None,
            top_p: None,
            use_sparse_ffn: true,
            active_neurons_per_layer: None,
            output_hidden_states: false,
            output_attentions: false,
        }
    }
}
/// Calibration statistics aggregated over a set of calibration samples.
#[derive(Debug, Clone)]
pub struct CalibrationStats {
    /// Number of samples processed during calibration.
    pub num_samples: usize,
    /// Mean sparsity observed across samples and layers.
    pub average_sparsity: f32,
    /// Per-layer statistics keyed by layer index.
    pub layer_stats: HashMap<usize, LayerStats>,
}
/// Per-layer activation statistics gathered during calibration.
#[derive(Debug, Clone)]
pub struct LayerStats {
    /// Count of neurons observed active in this layer.
    pub active_neurons: usize,
    /// Total neuron count in this layer.
    pub total_neurons: usize,
    /// Fraction of inactive neurons for this layer.
    pub sparsity: f32,
}