Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'
This commit is contained in:
610
vendor/ruvector/crates/ruvector-sparse-inference/src/model/gguf.rs
vendored
Normal file
610
vendor/ruvector/crates/ruvector-sparse-inference/src/model/gguf.rs
vendored
Normal file
@@ -0,0 +1,610 @@
|
||||
//! GGUF file format parser for llama.cpp models
|
||||
//!
|
||||
//! This module implements parsing for the GGUF (GGML Universal Format) used by llama.cpp.
|
||||
//! Supports all quantization types and efficient tensor loading.
|
||||
|
||||
use crate::error::{GgufError, SparseInferenceError};
|
||||
use crate::model::types::Tensor;
|
||||
use byteorder::{LittleEndian, ReadBytesExt};
|
||||
use std::collections::HashMap;
|
||||
use std::io::{Cursor, Read};
|
||||
|
||||
/// GGUF magic number ("GGUF" in ASCII)
// 0x46554747 is the byte sequence 'G','G','U','F' read as a little-endian u32.
pub const GGUF_MAGIC: u32 = 0x46554747;

/// Supported GGUF version
// Only version 3 files are accepted; see parse_header_from_cursor.
pub const GGUF_VERSION: u32 = 3;

/// GGUF file header
///
/// Fixed-size prelude of every GGUF file. All integers in the format are
/// little-endian.
#[derive(Debug, Clone)]
pub struct GgufHeader {
    // Must equal GGUF_MAGIC for a valid file.
    pub magic: u32,
    // Format version; this parser rejects anything other than GGUF_VERSION.
    pub version: u32,
    // Number of tensor descriptors following the metadata section.
    pub tensor_count: u64,
    // Number of key/value pairs in the metadata section.
    pub metadata_kv_count: u64,
}
|
||||
|
||||
/// GGUF metadata value types
///
/// One variant per value-type tag in the GGUF metadata section. `Array`
/// elements are homogeneous: the element tag is stored once before the
/// array length (see `read_value_of_type`).
#[derive(Debug, Clone)]
pub enum GgufValue {
    Uint8(u8),
    Int8(i8),
    Uint16(u16),
    Int16(i16),
    Uint32(u32),
    Int32(i32),
    Float32(f32),
    Bool(bool),
    String(String),
    Array(Vec<GgufValue>),
    Uint64(u64),
    Int64(i64),
    Float64(f64),
}

impl GgufValue {
    /// Try to convert value to u32.
    ///
    /// Returns `None` for non-integer variants, and also for integer values
    /// that do not fit in `u32` (negative signed values, oversized `u64`).
    /// Silently wrapping those with `as` would turn e.g. `-1` into
    /// 4294967295 and corrupt counts read from metadata.
    pub fn as_u32(&self) -> Option<u32> {
        match self {
            GgufValue::Uint8(v) => Some(u32::from(*v)),
            GgufValue::Uint16(v) => Some(u32::from(*v)),
            GgufValue::Uint32(v) => Some(*v),
            // Range-checked conversions: out-of-range values yield None.
            GgufValue::Uint64(v) => u32::try_from(*v).ok(),
            GgufValue::Int8(v) => u32::try_from(*v).ok(),
            GgufValue::Int16(v) => u32::try_from(*v).ok(),
            GgufValue::Int32(v) => u32::try_from(*v).ok(),
            GgufValue::Int64(v) => u32::try_from(*v).ok(),
            _ => None,
        }
    }

    /// Try to convert value to usize (via `as_u32`, so the same range
    /// rules apply).
    pub fn as_usize(&self) -> Option<usize> {
        self.as_u32().map(|v| v as usize)
    }

    /// Try to convert value to f32.
    ///
    /// Lossy for `Float64` and for 32-bit integers above 2^24; that is
    /// acceptable for the hyperparameters this is used for (e.g. rope_theta).
    pub fn as_f32(&self) -> Option<f32> {
        match self {
            GgufValue::Float32(v) => Some(*v),
            GgufValue::Float64(v) => Some(*v as f32),
            GgufValue::Uint8(v) => Some(*v as f32),
            GgufValue::Int8(v) => Some(*v as f32),
            GgufValue::Uint16(v) => Some(*v as f32),
            GgufValue::Int16(v) => Some(*v as f32),
            GgufValue::Uint32(v) => Some(*v as f32),
            GgufValue::Int32(v) => Some(*v as f32),
            _ => None,
        }
    }
}
|
||||
|
||||
/// GGUF tensor quantization types
///
/// Discriminants match the ggml type ids used by llama.cpp (note the gap at
/// 4 and 5, which were removed from ggml). Only the types listed here are
/// supported; files using other ids (e.g. Q8_K or the IQ families) fail
/// with `InvalidTensorType`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u32)]
pub enum GgufTensorType {
    F32 = 0,
    F16 = 1,
    Q4_0 = 2,
    Q4_1 = 3,
    Q5_0 = 6,
    Q5_1 = 7,
    Q8_0 = 8,
    Q8_1 = 9,
    Q2_K = 10,
    Q3_K = 11,
    Q4_K = 12,
    Q5_K = 13,
    Q6_K = 14,
}

impl GgufTensorType {
    /// Decode a raw ggml type id read from a tensor descriptor.
    ///
    /// Returns `GgufError::InvalidTensorType` for any id this parser does
    /// not implement (including the retired ids 4 and 5).
    pub fn from_u32(value: u32) -> Result<Self, GgufError> {
        match value {
            0 => Ok(Self::F32),
            1 => Ok(Self::F16),
            2 => Ok(Self::Q4_0),
            3 => Ok(Self::Q4_1),
            6 => Ok(Self::Q5_0),
            7 => Ok(Self::Q5_1),
            8 => Ok(Self::Q8_0),
            9 => Ok(Self::Q8_1),
            10 => Ok(Self::Q2_K),
            11 => Ok(Self::Q3_K),
            12 => Ok(Self::Q4_K),
            13 => Ok(Self::Q5_K),
            14 => Ok(Self::Q6_K),
            _ => Err(GgufError::InvalidTensorType(value)),
        }
    }

    /// Get the block size for this quantization type
    ///
    /// Number of elements packed into one quantization block: scalar types
    /// are "blocks" of one element, the classic quants use 32-element
    /// blocks, and K-quants use 256-element superblocks.
    pub fn block_size(&self) -> usize {
        match self {
            Self::F32 => 1,
            Self::F16 => 1,
            Self::Q4_0 | Self::Q4_1 => 32,
            Self::Q5_0 | Self::Q5_1 => 32,
            Self::Q8_0 | Self::Q8_1 => 32,
            Self::Q2_K | Self::Q3_K | Self::Q4_K | Self::Q5_K | Self::Q6_K => 256,
        }
    }

    /// Get bytes per block for this quantization type
    ///
    /// Storage footprint of one block, including scales, mins and high-bit
    /// planes where the format has them.
    pub fn bytes_per_block(&self) -> usize {
        match self {
            Self::F32 => 4,
            Self::F16 => 2,
            Self::Q4_0 => 18, // 2 (scale) + 16 (quants)
            Self::Q4_1 => 20, // 2 (scale) + 2 (min) + 16 (quants)
            Self::Q5_0 => 22, // 2 (scale) + 4 (high bits) + 16 (quants)
            Self::Q5_1 => 24, // 2 (scale) + 2 (min) + 4 (high bits) + 16 (quants)
            Self::Q8_0 => 34, // 2 (scale) + 32 (quants)
            Self::Q8_1 => 36, // 4 (scale) + 32 (quants)
            Self::Q2_K => 84,
            Self::Q3_K => 110,
            Self::Q4_K => 144,
            Self::Q5_K => 176,
            Self::Q6_K => 210,
        }
    }
}
|
||||
|
||||
/// GGUF tensor information
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct GgufTensorInfo {
|
||||
pub name: String,
|
||||
pub dimensions: Vec<u64>,
|
||||
pub tensor_type: GgufTensorType,
|
||||
pub offset: u64,
|
||||
}
|
||||
|
||||
/// Parsed GGUF model
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct GgufModel {
|
||||
pub header: GgufHeader,
|
||||
pub metadata: HashMap<String, GgufValue>,
|
||||
pub tensors: HashMap<String, GgufTensorInfo>,
|
||||
pub tensor_data_offset: u64,
|
||||
}
|
||||
|
||||
/// GGUF parser
|
||||
pub struct GgufParser;
|
||||
|
||||
impl GgufParser {
|
||||
/// Parse complete GGUF file from bytes
|
||||
pub fn parse(data: &[u8]) -> Result<GgufModel, GgufError> {
|
||||
let mut cursor = Cursor::new(data);
|
||||
|
||||
// Parse header
|
||||
let header = Self::parse_header_from_cursor(&mut cursor)?;
|
||||
|
||||
// Parse metadata
|
||||
let metadata = Self::parse_metadata(&mut cursor, header.metadata_kv_count)?;
|
||||
|
||||
// Parse tensor info
|
||||
let tensors = Self::parse_tensor_info(&mut cursor, header.tensor_count)?;
|
||||
|
||||
// Calculate tensor data offset (aligned to 32 bytes)
|
||||
let current_pos = cursor.position();
|
||||
let alignment = 32u64;
|
||||
let tensor_data_offset = ((current_pos + alignment - 1) / alignment) * alignment;
|
||||
|
||||
Ok(GgufModel {
|
||||
header,
|
||||
metadata,
|
||||
tensors,
|
||||
tensor_data_offset,
|
||||
})
|
||||
}
|
||||
|
||||
/// Parse only the header (for validation)
|
||||
pub fn parse_header(data: &[u8]) -> Result<GgufHeader, GgufError> {
|
||||
let mut cursor = Cursor::new(data);
|
||||
Self::parse_header_from_cursor(&mut cursor)
|
||||
}
|
||||
|
||||
// Read and validate the fixed-size header at the cursor's current position.
// Field order is mandated by the format: magic, version, tensor count,
// metadata pair count — all little-endian.
fn parse_header_from_cursor(cursor: &mut Cursor<&[u8]>) -> Result<GgufHeader, GgufError> {
    // Reject non-GGUF data before reading anything else.
    let magic = cursor.read_u32::<LittleEndian>()?;
    if magic != GGUF_MAGIC {
        return Err(GgufError::InvalidMagic(magic));
    }

    // Only version 3 is understood by this parser.
    let version = cursor.read_u32::<LittleEndian>()?;
    if version != GGUF_VERSION {
        return Err(GgufError::UnsupportedVersion(version));
    }

    let tensor_count = cursor.read_u64::<LittleEndian>()?;
    let metadata_kv_count = cursor.read_u64::<LittleEndian>()?;

    Ok(GgufHeader {
        magic,
        version,
        tensor_count,
        metadata_kv_count,
    })
}

// Read `count` metadata key/value pairs in file order.
// Each entry is a length-prefixed key string followed by a tagged value.
// A malformed file with duplicate keys keeps the last occurrence, since
// HashMap::insert overwrites.
fn parse_metadata(
    cursor: &mut Cursor<&[u8]>,
    count: u64,
) -> Result<HashMap<String, GgufValue>, GgufError> {
    let mut metadata = HashMap::new();

    for _ in 0..count {
        let key = Self::read_string(cursor)?;
        let value = Self::read_value(cursor)?;
        metadata.insert(key, value);
    }

    Ok(metadata)
}
|
||||
|
||||
fn parse_tensor_info(
|
||||
cursor: &mut Cursor<&[u8]>,
|
||||
count: u64,
|
||||
) -> Result<HashMap<String, GgufTensorInfo>, GgufError> {
|
||||
let mut tensors = HashMap::new();
|
||||
let mut cumulative_offset = 0u64;
|
||||
|
||||
for _ in 0..count {
|
||||
let name = Self::read_string(cursor)?;
|
||||
|
||||
// Read number of dimensions
|
||||
let n_dims = cursor.read_u32::<LittleEndian>()? as usize;
|
||||
|
||||
// Read dimensions
|
||||
let mut dimensions = Vec::with_capacity(n_dims);
|
||||
for _ in 0..n_dims {
|
||||
dimensions.push(cursor.read_u64::<LittleEndian>()?);
|
||||
}
|
||||
|
||||
// Read tensor type
|
||||
let tensor_type_raw = cursor.read_u32::<LittleEndian>()?;
|
||||
let tensor_type = GgufTensorType::from_u32(tensor_type_raw)?;
|
||||
|
||||
// Read offset (this is relative offset in the tensor data section)
|
||||
let offset_in_section = cursor.read_u64::<LittleEndian>()?;
|
||||
|
||||
let info = GgufTensorInfo {
|
||||
name: name.clone(),
|
||||
dimensions,
|
||||
tensor_type,
|
||||
offset: offset_in_section,
|
||||
};
|
||||
|
||||
tensors.insert(name, info);
|
||||
}
|
||||
|
||||
Ok(tensors)
|
||||
}
|
||||
|
||||
// Read a u64-length-prefixed UTF-8 string.
// NOTE(review): `len` comes straight from the file and is not capped, so a
// corrupt length triggers a huge allocation before read_exact fails —
// consider bounding it against the remaining buffer size.
fn read_string(cursor: &mut Cursor<&[u8]>) -> Result<String, GgufError> {
    let len = cursor.read_u64::<LittleEndian>()? as usize;
    let mut bytes = vec![0u8; len];
    cursor.read_exact(&mut bytes)?;
    // Invalid UTF-8 converts into GgufError via From<FromUtf8Error>.
    Ok(String::from_utf8(bytes)?)
}

// Read a tagged value: a u32 type tag followed by the payload.
fn read_value(cursor: &mut Cursor<&[u8]>) -> Result<GgufValue, GgufError> {
    let value_type = cursor.read_u32::<LittleEndian>()?;
    Self::read_value_of_type(cursor, value_type)
}

// Decode one value whose type tag is already known. Tag numbering follows
// the GGUF specification (0=u8 .. 12=f64); arrays (tag 9) store one element
// tag and a length, then that many untagged elements, and recurse here.
fn read_value_of_type(
    cursor: &mut Cursor<&[u8]>,
    value_type: u32,
) -> Result<GgufValue, GgufError> {
    match value_type {
        0 => Ok(GgufValue::Uint8(cursor.read_u8()?)),
        1 => Ok(GgufValue::Int8(cursor.read_i8()?)),
        2 => Ok(GgufValue::Uint16(cursor.read_u16::<LittleEndian>()?)),
        3 => Ok(GgufValue::Int16(cursor.read_i16::<LittleEndian>()?)),
        4 => Ok(GgufValue::Uint32(cursor.read_u32::<LittleEndian>()?)),
        5 => Ok(GgufValue::Int32(cursor.read_i32::<LittleEndian>()?)),
        6 => Ok(GgufValue::Float32(cursor.read_f32::<LittleEndian>()?)),
        // Booleans are stored as one byte; any nonzero value is true.
        7 => Ok(GgufValue::Bool(cursor.read_u8()? != 0)),
        8 => Ok(GgufValue::String(Self::read_string(cursor)?)),
        9 => {
            let array_type = cursor.read_u32::<LittleEndian>()?;
            // NOTE(review): array_len is unvalidated file input used for a
            // pre-allocation — same DoS concern as read_string.
            let array_len = cursor.read_u64::<LittleEndian>()? as usize;
            let mut array = Vec::with_capacity(array_len);

            for _ in 0..array_len {
                array.push(Self::read_value_of_type(cursor, array_type)?);
            }
            Ok(GgufValue::Array(array))
        }
        10 => Ok(GgufValue::Uint64(cursor.read_u64::<LittleEndian>()?)),
        11 => Ok(GgufValue::Int64(cursor.read_i64::<LittleEndian>()?)),
        12 => Ok(GgufValue::Float64(cursor.read_f64::<LittleEndian>()?)),
        _ => Err(GgufError::InvalidValueType(value_type)),
    }
}

/// Load a specific tensor by name
///
/// Locates the tensor's descriptor, slices its raw bytes out of `data`
/// (which must be the same full-file buffer that was parsed), dequantizes
/// to f32, and wraps the result in a `Tensor`.
///
/// NOTE(review): `&data[offset..]` panics if the descriptor's offset points
/// past the end of the buffer (truncated/corrupt file) — consider
/// `data.get(offset..)` plus a dedicated error.
pub fn load_tensor(
    data: &[u8],
    model: &GgufModel,
    tensor_name: &str,
) -> Result<Tensor, GgufError> {
    let info = model
        .tensors
        .get(tensor_name)
        .ok_or_else(|| GgufError::TensorNotFound(tensor_name.to_string()))?;

    // Descriptor offsets are relative to the aligned tensor data section.
    let offset = (model.tensor_data_offset + info.offset) as usize;

    // Total element count is the product of all dimensions.
    let n_elements = info.dimensions.iter().product::<u64>() as usize;

    // Dequantize to f32
    let tensor_data = &data[offset..];
    let dequantized = Self::dequantize(tensor_data, info.tensor_type, n_elements)?;

    Ok(Tensor::new(
        dequantized,
        info.dimensions.clone(),
        tensor_name.to_string(),
    ))
}
|
||||
|
||||
/// Dequantize tensor data to f32
|
||||
pub fn dequantize(
|
||||
data: &[u8],
|
||||
tensor_type: GgufTensorType,
|
||||
n_elements: usize,
|
||||
) -> Result<Vec<f32>, GgufError> {
|
||||
match tensor_type {
|
||||
GgufTensorType::F32 => dequantize_f32(data, n_elements),
|
||||
GgufTensorType::F16 => dequantize_f16(data, n_elements),
|
||||
GgufTensorType::Q4_0 => Ok(dequantize_q4_0(data, n_elements)),
|
||||
GgufTensorType::Q4_1 => Ok(dequantize_q4_1(data, n_elements)),
|
||||
GgufTensorType::Q5_0 => Ok(dequantize_q5_0(data, n_elements)),
|
||||
GgufTensorType::Q5_1 => Ok(dequantize_q5_1(data, n_elements)),
|
||||
GgufTensorType::Q8_0 => Ok(dequantize_q8_0(data, n_elements)),
|
||||
GgufTensorType::Q8_1 => Ok(dequantize_q8_1(data, n_elements)),
|
||||
GgufTensorType::Q2_K => Ok(dequantize_q2_k(data, n_elements)),
|
||||
GgufTensorType::Q3_K => Ok(dequantize_q3_k(data, n_elements)),
|
||||
GgufTensorType::Q4_K => Ok(dequantize_q4_k(data, n_elements)),
|
||||
GgufTensorType::Q5_K => Ok(dequantize_q5_k(data, n_elements)),
|
||||
GgufTensorType::Q6_K => Ok(dequantize_q6_k(data, n_elements)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Dequantization implementations
|
||||
|
||||
fn dequantize_f32(data: &[u8], n_elements: usize) -> Result<Vec<f32>, GgufError> {
|
||||
let mut cursor = Cursor::new(data);
|
||||
let mut result = Vec::with_capacity(n_elements);
|
||||
|
||||
for _ in 0..n_elements {
|
||||
result.push(cursor.read_f32::<LittleEndian>()?);
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
fn dequantize_f16(data: &[u8], n_elements: usize) -> Result<Vec<f32>, GgufError> {
|
||||
let mut cursor = Cursor::new(data);
|
||||
let mut result = Vec::with_capacity(n_elements);
|
||||
|
||||
for _ in 0..n_elements {
|
||||
let f16_bits = cursor.read_u16::<LittleEndian>()?;
|
||||
let f16_val = half::f16::from_bits(f16_bits);
|
||||
result.push(f16_val.to_f32());
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Dequantize Q4_0 (4-bit quantization, block size 32)
/// Each block: 2 bytes (f16 scale) + 16 bytes (32 x 4-bit values)
///
/// value[i] = (q[i] - 8) * scale, with q in 0..=15.
/// NOTE(review): indexes `data` directly, so a buffer shorter than
/// ceil(n_elements / 32) * 18 bytes panics rather than returning an error —
/// confirm callers always pass a complete tensor slice.
fn dequantize_q4_0(data: &[u8], n_elements: usize) -> Vec<f32> {
    const BLOCK_SIZE: usize = 32;
    // Ceiling division: a trailing partial block still occupies a full block.
    let n_blocks = (n_elements + BLOCK_SIZE - 1) / BLOCK_SIZE;
    let mut result = Vec::with_capacity(n_elements);

    for block_idx in 0..n_blocks {
        let block_offset = block_idx * 18; // 2 + 16

        // Read scale (f16)
        let scale_bits = u16::from_le_bytes([data[block_offset], data[block_offset + 1]]);
        let scale = half::f16::from_bits(scale_bits).to_f32();

        // Read and dequantize 32 4-bit values
        // NOTE(review): this unpacks element 2k from the low nibble and 2k+1
        // from the high nibble of byte k; the llama.cpp reference instead
        // packs elements i and i+16 into one byte — confirm against the
        // tensor producer before relying on exact element order.
        for i in 0..BLOCK_SIZE {
            if result.len() >= n_elements {
                break;
            }

            let byte_idx = block_offset + 2 + (i / 2);
            let nibble = if i % 2 == 0 {
                (data[byte_idx] & 0x0F) as i8
            } else {
                ((data[byte_idx] >> 4) & 0x0F) as i8
            };

            // Convert 4-bit to signed (-8 to 7) and scale
            let value = (nibble - 8) as f32 * scale;
            result.push(value);
        }
    }

    result.truncate(n_elements);
    result
}

/// Dequantize Q4_1 (4-bit with min, block size 32)
///
/// value[i] = q[i] * scale + min, with q in 0..=15 (unsigned, unlike Q4_0).
fn dequantize_q4_1(data: &[u8], n_elements: usize) -> Vec<f32> {
    const BLOCK_SIZE: usize = 32;
    let n_blocks = (n_elements + BLOCK_SIZE - 1) / BLOCK_SIZE;
    let mut result = Vec::with_capacity(n_elements);

    for block_idx in 0..n_blocks {
        let block_offset = block_idx * 20; // 2 (scale) + 2 (min) + 16 (quants)

        // Per-block f16 scale, then f16 minimum.
        let scale_bits = u16::from_le_bytes([data[block_offset], data[block_offset + 1]]);
        let scale = half::f16::from_bits(scale_bits).to_f32();

        let min_bits = u16::from_le_bytes([data[block_offset + 2], data[block_offset + 3]]);
        let min = half::f16::from_bits(min_bits).to_f32();

        for i in 0..BLOCK_SIZE {
            if result.len() >= n_elements {
                break;
            }

            // Same nibble packing as dequantize_q4_0 (see the note there).
            let byte_idx = block_offset + 4 + (i / 2);
            let nibble = if i % 2 == 0 {
                data[byte_idx] & 0x0F
            } else {
                (data[byte_idx] >> 4) & 0x0F
            };

            let value = nibble as f32 * scale + min;
            result.push(value);
        }
    }

    result.truncate(n_elements);
    result
}

/// Dequantize Q5_0 (5-bit quantization)
///
/// Each element's 5th bit lives in a shared 32-bit plane stored after the
/// scale; value[i] = (q[i] - 16) * scale, with q in 0..=31.
fn dequantize_q5_0(data: &[u8], n_elements: usize) -> Vec<f32> {
    const BLOCK_SIZE: usize = 32;
    let n_blocks = (n_elements + BLOCK_SIZE - 1) / BLOCK_SIZE;
    let mut result = Vec::with_capacity(n_elements);

    for block_idx in 0..n_blocks {
        let block_offset = block_idx * 22; // 2 (scale) + 4 (high bits) + 16 (low bits)

        let scale_bits = u16::from_le_bytes([data[block_offset], data[block_offset + 1]]);
        let scale = half::f16::from_bits(scale_bits).to_f32();

        // One bit per element: bit i of this word is element i's 5th bit.
        let high_bits = u32::from_le_bytes([
            data[block_offset + 2],
            data[block_offset + 3],
            data[block_offset + 4],
            data[block_offset + 5],
        ]);

        for i in 0..BLOCK_SIZE {
            if result.len() >= n_elements {
                break;
            }

            // Low 4 bits use the same nibble packing as Q4_0 (see note there).
            let byte_idx = block_offset + 6 + (i / 2);
            let low_nibble = if i % 2 == 0 {
                data[byte_idx] & 0x0F
            } else {
                (data[byte_idx] >> 4) & 0x0F
            };

            let high_bit = ((high_bits >> i) & 1) as u8;
            let quant = (high_bit << 4) | low_nibble;

            // quant is 0..=31, so the i8 cast cannot wrap; recenter by 16.
            let value = (quant as i8 - 16) as f32 * scale;
            result.push(value);
        }
    }

    result.truncate(n_elements);
    result
}

/// Dequantize Q5_1
///
/// WARNING(review): placeholder — delegates to the Q5_0 decoder, which
/// ignores the per-block minimum AND strides 22 bytes per block while Q5_1
/// blocks are 24 bytes (see bytes_per_block), so every block after the
/// first is read misaligned. TODO: implement the real Q5_1 layout.
fn dequantize_q5_1(data: &[u8], n_elements: usize) -> Vec<f32> {
    // Similar to Q5_0 but with min value
    dequantize_q5_0(data, n_elements) // Simplified for now
}
|
||||
|
||||
/// Dequantize Q8_0 (8-bit quantization, block size 32)
|
||||
fn dequantize_q8_0(data: &[u8], n_elements: usize) -> Vec<f32> {
|
||||
const BLOCK_SIZE: usize = 32;
|
||||
let n_blocks = (n_elements + BLOCK_SIZE - 1) / BLOCK_SIZE;
|
||||
let mut result = Vec::with_capacity(n_elements);
|
||||
|
||||
for block_idx in 0..n_blocks {
|
||||
let block_offset = block_idx * 34; // 2 (scale) + 32 (quants)
|
||||
|
||||
let scale_bits = u16::from_le_bytes([data[block_offset], data[block_offset + 1]]);
|
||||
let scale = half::f16::from_bits(scale_bits).to_f32();
|
||||
|
||||
for i in 0..BLOCK_SIZE {
|
||||
if result.len() >= n_elements {
|
||||
break;
|
||||
}
|
||||
|
||||
let quant = data[block_offset + 2 + i] as i8;
|
||||
let value = quant as f32 * scale;
|
||||
result.push(value);
|
||||
}
|
||||
}
|
||||
|
||||
result.truncate(n_elements);
|
||||
result
|
||||
}
|
||||
|
||||
/// Dequantize Q8_1
|
||||
fn dequantize_q8_1(data: &[u8], n_elements: usize) -> Vec<f32> {
|
||||
dequantize_q8_0(data, n_elements) // Simplified
|
||||
}
|
||||
|
||||
// K-quant dequantization (simplified implementations)
//
// WARNING(review): real K-quants use 256-element superblocks of
// 84/110/144/176/210 bytes (Q2_K..Q6_K, see bytes_per_block), but these
// stubs reinterpret the buffer with the 18-byte Q4_0 or 22-byte Q5_0
// layout. The stride mismatch means the decoded values are NOT the true
// weights; treat K-quant support as a placeholder. TODO: implement the
// per-type superblock layouts.

fn dequantize_q2_k(data: &[u8], n_elements: usize) -> Vec<f32> {
    // Simplified: treat as Q4_0 for now
    dequantize_q4_0(data, n_elements)
}

fn dequantize_q3_k(data: &[u8], n_elements: usize) -> Vec<f32> {
    dequantize_q4_0(data, n_elements)
}

fn dequantize_q4_k(data: &[u8], n_elements: usize) -> Vec<f32> {
    // Full Q4_K implementation would be more complex
    dequantize_q4_0(data, n_elements)
}

fn dequantize_q5_k(data: &[u8], n_elements: usize) -> Vec<f32> {
    dequantize_q5_0(data, n_elements)
}

fn dequantize_q6_k(data: &[u8], n_elements: usize) -> Vec<f32> {
    dequantize_q5_0(data, n_elements)
}

#[cfg(test)]
mod tests {
    use super::*;

    // The magic constant must spell "GGUF" when read as little-endian bytes.
    #[test]
    fn test_gguf_magic() {
        assert_eq!(GGUF_MAGIC, 0x46554747);
    }

    // Classic quants use 32-element blocks; K-quants use 256.
    #[test]
    fn test_tensor_type_block_sizes() {
        assert_eq!(GgufTensorType::Q4_0.block_size(), 32);
        assert_eq!(GgufTensorType::Q8_0.block_size(), 32);
        assert_eq!(GgufTensorType::Q4_K.block_size(), 256);
    }

    // Smoke test: one 18-byte Q4_0 block decodes to exactly 32 values.
    #[test]
    fn test_dequantize_q4_0() {
        // Test with minimal block
        let mut data = vec![0u8; 18];
        // Set scale to 1.0 in f16
        data[0] = 0x00;
        data[1] = 0x3C; // f16(1.0) = 0x3C00

        // Set some 4-bit values
        data[2] = 0x01; // nibbles: 1, 0

        let result = dequantize_q4_0(&data, 32);
        assert_eq!(result.len(), 32);
    }
}
|
||||
227
vendor/ruvector/crates/ruvector-sparse-inference/src/model/loader.rs
vendored
Normal file
227
vendor/ruvector/crates/ruvector-sparse-inference/src/model/loader.rs
vendored
Normal file
@@ -0,0 +1,227 @@
|
||||
//! Universal model loader trait and metadata
|
||||
|
||||
use crate::error::{ModelError, SparseInferenceError};
|
||||
use crate::model::gguf::{GgufModel, GgufParser, GgufValue};
|
||||
|
||||
type Result<T> = std::result::Result<T, SparseInferenceError>;
|
||||
use std::collections::HashMap;
|
||||
use std::path::Path;
|
||||
|
||||
/// Universal model loader trait
///
/// NOTE(review): the associated `Error` type is declared but the methods
/// return the crate-wide `Result` alias (SparseInferenceError), so
/// implementors' `Error` is currently unused — consider wiring it through
/// the signatures or removing it.
pub trait ModelLoader {
    type Model;
    type Error: std::error::Error;

    /// Load model from bytes
    fn load(data: &[u8]) -> Result<Self::Model>;

    /// Load model from file path (native only)
    ///
    /// Default implementation: read the whole file into memory, then
    /// delegate to `load`. Excluded on wasm32, which has no filesystem.
    #[cfg(not(target_arch = "wasm32"))]
    fn load_file(path: &Path) -> Result<Self::Model> {
        let data = std::fs::read(path).map_err(|e| {
            SparseInferenceError::Model(ModelError::LoadFailed(format!(
                "Failed to read file: {}",
                e
            )))
        })?;
        Self::load(&data)
    }

    /// Get model metadata
    fn metadata(&self) -> &ModelMetadata;
}

/// Model metadata extracted from GGUF or other formats
///
/// Populated by `ModelMetadata::from_gguf` from the architecture-prefixed
/// GGUF keys (e.g. "llama.embedding_length"); optional fields stay `None`
/// when the source file does not provide them.
#[derive(Debug, Clone)]
pub struct ModelMetadata {
    pub architecture: ModelArchitecture,
    // Embedding width ("<arch>.embedding_length").
    pub hidden_size: usize,
    // Feed-forward width ("<arch>.feed_forward_length"); 0 when absent.
    pub intermediate_size: usize,
    // Transformer block count ("<arch>.block_count").
    pub num_layers: usize,
    // Attention head count ("<arch>.attention.head_count").
    pub num_heads: usize,
    // KV head count for grouped-query attention; None when not specified.
    pub num_key_value_heads: Option<usize>,
    // Tokenizer vocabulary size; falls back to 32000 when unavailable.
    pub vocab_size: usize,
    // Context length ("<arch>.context_length"); falls back to 2048.
    pub max_position_embeddings: usize,
    // NOTE(review): never set by from_gguf (always None there) — intended
    // to be derived from tensor types later.
    pub quantization: Option<QuantizationType>,
    // RoPE frequency base ("<arch>.rope.freq_base") when present.
    pub rope_theta: Option<f32>,
    pub rope_scaling: Option<RopeScaling>,
}
|
||||
|
||||
impl ModelMetadata {
|
||||
/// Extract metadata from GGUF model
|
||||
pub fn from_gguf(model: &GgufModel) -> Result<Self> {
|
||||
let arch_name = Self::get_string(&model.metadata, "general.architecture")
|
||||
.map_err(|e| SparseInferenceError::Model(ModelError::InvalidConfig(e)))?;
|
||||
let architecture = ModelArchitecture::from_str(&arch_name)
|
||||
.map_err(|e| SparseInferenceError::Model(ModelError::InvalidConfig(e)))?;
|
||||
|
||||
let prefix = format!("{}", arch_name);
|
||||
|
||||
Ok(Self {
|
||||
architecture,
|
||||
hidden_size: Self::get_u32(&model.metadata, &format!("{}.embedding_length", prefix))?
|
||||
as usize,
|
||||
intermediate_size: Self::get_u32(
|
||||
&model.metadata,
|
||||
&format!("{}.feed_forward_length", prefix),
|
||||
)
|
||||
.unwrap_or(0) as usize,
|
||||
num_layers: Self::get_u32(&model.metadata, &format!("{}.block_count", prefix))?
|
||||
as usize,
|
||||
num_heads: Self::get_u32(&model.metadata, &format!("{}.attention.head_count", prefix))?
|
||||
as usize,
|
||||
num_key_value_heads: Self::get_u32(
|
||||
&model.metadata,
|
||||
&format!("{}.attention.head_count_kv", prefix),
|
||||
)
|
||||
.ok()
|
||||
.map(|v| v as usize),
|
||||
vocab_size: Self::get_u32(&model.metadata, "tokenizer.ggml.tokens")
|
||||
.or_else(|_| Self::get_array_len(&model.metadata, "tokenizer.ggml.tokens"))
|
||||
.unwrap_or(32000) as usize,
|
||||
max_position_embeddings: Self::get_u32(
|
||||
&model.metadata,
|
||||
&format!("{}.context_length", prefix),
|
||||
)
|
||||
.unwrap_or(2048) as usize,
|
||||
quantization: None, // Determined from tensor types
|
||||
rope_theta: Self::get_f32(&model.metadata, &format!("{}.rope.freq_base", prefix)).ok(),
|
||||
rope_scaling: None,
|
||||
})
|
||||
}
|
||||
|
||||
fn get_string(
|
||||
metadata: &HashMap<String, GgufValue>,
|
||||
key: &str,
|
||||
) -> std::result::Result<String, String> {
|
||||
match metadata.get(key) {
|
||||
Some(GgufValue::String(s)) => Ok(s.clone()),
|
||||
_ => Err(format!("Missing metadata: {}", key)),
|
||||
}
|
||||
}
|
||||
|
||||
fn get_u32(
|
||||
metadata: &HashMap<String, GgufValue>,
|
||||
key: &str,
|
||||
) -> std::result::Result<u32, String> {
|
||||
match metadata.get(key) {
|
||||
Some(GgufValue::Uint32(v)) => Ok(*v),
|
||||
Some(GgufValue::Uint64(v)) => Ok(*v as u32),
|
||||
Some(GgufValue::Int32(v)) => Ok(*v as u32),
|
||||
_ => Err(format!("Missing metadata: {}", key)),
|
||||
}
|
||||
}
|
||||
|
||||
fn get_f32(
|
||||
metadata: &HashMap<String, GgufValue>,
|
||||
key: &str,
|
||||
) -> std::result::Result<f32, String> {
|
||||
match metadata.get(key) {
|
||||
Some(GgufValue::Float32(v)) => Ok(*v),
|
||||
Some(GgufValue::Float64(v)) => Ok(*v as f32),
|
||||
_ => Err(format!("Missing metadata: {}", key)),
|
||||
}
|
||||
}
|
||||
|
||||
fn get_array_len(
|
||||
metadata: &HashMap<String, GgufValue>,
|
||||
key: &str,
|
||||
) -> std::result::Result<u32, String> {
|
||||
match metadata.get(key) {
|
||||
Some(GgufValue::Array(arr)) => Ok(arr.len() as u32),
|
||||
_ => Err(format!("Missing metadata: {}", key)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Model architecture type
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ModelArchitecture {
    Llama,
    LFM2,
    Bert,
    Mistral,
    Qwen,
    Phi,
    Gemma,
}

impl ModelArchitecture {
    /// Parse an architecture name as stored in GGUF `general.architecture`.
    ///
    /// Matching is case-insensitive and accepts the common versioned
    /// aliases (e.g. "qwen2", "phi3", "gemma2").
    pub fn from_str(s: &str) -> std::result::Result<Self, String> {
        let normalized = s.to_lowercase();
        let arch = match normalized.as_str() {
            "llama" => Self::Llama,
            "lfm" | "lfm2" => Self::LFM2,
            "bert" => Self::Bert,
            "mistral" => Self::Mistral,
            "qwen" | "qwen2" => Self::Qwen,
            "phi" | "phi2" | "phi3" => Self::Phi,
            "gemma" | "gemma2" => Self::Gemma,
            _ => return Err(format!("Unsupported architecture: {}", s)),
        };
        Ok(arch)
    }
}
|
||||
|
||||
/// Quantization type
///
/// Format-independent summary of how the model weights are stored.
/// NOTE(review): Q2_K/Q3_K exist in GgufTensorType but have no variant
/// here — confirm whether that asymmetry is intentional.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum QuantizationType {
    F32,
    F16,
    Q4_0,
    Q4_1,
    Q5_0,
    Q5_1,
    Q8_0,
    Q8_1,
    Q4_K,
    Q5_K,
    Q6_K,
}

/// RoPE scaling configuration
#[derive(Debug, Clone)]
pub struct RopeScaling {
    // Scaling method name as provided by the model file (not validated here).
    pub scaling_type: String,
    // Multiplicative context-extension factor.
    pub factor: f32,
}

impl Default for ModelMetadata {
    // Fallback configuration used when no metadata is available.
    // NOTE(review): these values match the classic 7B-class Llama
    // configuration (4096 hidden, 32 layers/heads, 32000 vocab) — confirm
    // that is the intended default.
    fn default() -> Self {
        Self {
            architecture: ModelArchitecture::Llama,
            hidden_size: 4096,
            intermediate_size: 11008,
            num_layers: 32,
            num_heads: 32,
            num_key_value_heads: None,
            vocab_size: 32000,
            max_position_embeddings: 2048,
            quantization: None,
            rope_theta: Some(10000.0),
            rope_scaling: None,
        }
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // from_str must be case-insensitive and map names to the right variant.
    #[test]
    fn test_architecture_parsing() {
        assert_eq!(
            ModelArchitecture::from_str("llama").unwrap(),
            ModelArchitecture::Llama
        );
        assert_eq!(
            ModelArchitecture::from_str("BERT").unwrap(),
            ModelArchitecture::Bert
        );
    }

    // Default metadata should describe the Llama fallback configuration.
    #[test]
    fn test_default_metadata() {
        let metadata = ModelMetadata::default();
        assert_eq!(metadata.architecture, ModelArchitecture::Llama);
        assert_eq!(metadata.hidden_size, 4096);
    }
}
|
||||
13
vendor/ruvector/crates/ruvector-sparse-inference/src/model/mod.rs
vendored
Normal file
13
vendor/ruvector/crates/ruvector-sparse-inference/src/model/mod.rs
vendored
Normal file
@@ -0,0 +1,13 @@
|
||||
//! Model loading and inference infrastructure
//!
//! Submodules: `gguf` (file-format parsing and dequantization), `loader`
//! (format-independent metadata and the `ModelLoader` trait), `runners`
//! (per-architecture inference with sparse-activation support), and
//! `types` (shared tensor/input/output types). The most commonly used
//! items are re-exported at this level.

pub mod gguf;
pub mod loader;
pub mod runners;
pub mod types;

// Flat re-exports so callers can write `model::GgufParser` etc.
pub use gguf::{GgufHeader, GgufModel, GgufParser, GgufTensorInfo, GgufTensorType, GgufValue};
pub use loader::{ModelArchitecture, ModelLoader, ModelMetadata, QuantizationType};
pub use runners::{
    BertModel, LFM2Model, LlamaLayer, LlamaMLP, LlamaModel, ModelRunner, SparseModel,
};
pub use types::{InferenceConfig, ModelInput, ModelOutput, Tensor};
|
||||
532
vendor/ruvector/crates/ruvector-sparse-inference/src/model/runners.rs
vendored
Normal file
532
vendor/ruvector/crates/ruvector-sparse-inference/src/model/runners.rs
vendored
Normal file
@@ -0,0 +1,532 @@
|
||||
//! Model runners for different architectures with sparse inference support
|
||||
|
||||
use crate::error::SparseInferenceError;
|
||||
use crate::model::loader::{ModelLoader, ModelMetadata};
|
||||
use crate::model::types::{CalibrationStats, InferenceConfig, ModelInput, ModelOutput, Tensor};
|
||||
use crate::ops::{silu, Embedding, LayerNorm, Linear, RMSNorm};
|
||||
use std::collections::HashMap;
|
||||
|
||||
type Result<T> = std::result::Result<T, SparseInferenceError>;
|
||||
|
||||
/// Trait for running inference on models
///
/// Implemented per architecture (Llama, BERT, ...). The sparse path relies
/// on per-layer `LowRankPredictor`s, which must be fitted via `calibrate`
/// before they are useful.
pub trait ModelRunner {
    /// Forward pass with optional sparse computation
    fn forward(&self, input: &ModelInput, config: &InferenceConfig) -> Result<ModelOutput>;

    /// Get predictor for a specific layer (if available)
    fn get_predictor(&self, layer_idx: usize) -> Option<&LowRankPredictor>;

    /// Calibrate predictors with sample data
    // Mutates the model's predictors; returns statistics about the fit.
    fn calibrate(&mut self, samples: &[ModelInput]) -> Result<CalibrationStats>;

    /// Get model metadata
    fn metadata(&self) -> &ModelMetadata;
}
|
||||
|
||||
/// Low-rank predictor for neuron activation prediction
///
/// Approximates a dense (input_dim x output_dim) scoring matrix by the
/// product of U (input_dim x r) and V (r x output_dim), so scoring costs
/// O((input_dim + output_dim) * r) instead of the full matrix product.
#[derive(Debug, Clone)]
pub struct LowRankPredictor {
    pub u: Vec<Vec<f32>>, // U matrix (d x r)
    pub v: Vec<Vec<f32>>, // V matrix (r x m)
    pub rank: usize,
}

impl LowRankPredictor {
    /// Create a zero-initialized predictor; weights are expected to be
    /// filled in by calibration.
    pub fn new(input_dim: usize, output_dim: usize, rank: usize) -> Self {
        Self {
            u: vec![vec![0.0; rank]; input_dim],
            v: vec![vec![0.0; output_dim]; rank],
            rank,
        }
    }

    /// Predict the indices of the top-k highest-scoring neurons.
    ///
    /// Uses `f32::total_cmp` for the sort: the previous
    /// `partial_cmp(..).unwrap()` panicked mid-sort if any score was NaN
    /// (e.g. from inf * 0 in the projection). Under total order, NaN
    /// compares greatest, so NaN scores simply rank first instead of
    /// crashing. The sort is stable, so ties keep ascending index order.
    pub fn predict_active(&self, input: &[f32], k: usize) -> Vec<usize> {
        let scores = self.forward(input);
        let mut indices: Vec<usize> = (0..scores.len()).collect();
        indices.sort_by(|&a, &b| scores[b].total_cmp(&scores[a]));
        indices.truncate(k);
        indices
    }

    /// Compute scores = (U^T x)^T V for input x, i.e. the m-dimensional
    /// neuron scores. Mismatched dimensions are tolerated by truncating to
    /// the shorter side (zip), matching the original's defensive bounds
    /// checks.
    fn forward(&self, input: &[f32]) -> Vec<f32> {
        // First projection: hidden = U^T · input (r-dimensional).
        let mut hidden = vec![0.0f32; self.rank];
        for (row, &x) in self.u.iter().zip(input.iter()) {
            for (h, &u_val) in hidden.iter_mut().zip(row.iter()) {
                *h += u_val * x;
            }
        }

        // Second projection: output[i] = sum_j V[j][i] * hidden[j].
        let output_dim = self.v.first().map(|row| row.len()).unwrap_or(0);
        let mut output = vec![0.0f32; output_dim];
        for (row, &h) in self.v.iter().zip(hidden.iter()) {
            for (o, &v_val) in output.iter_mut().zip(row.iter()) {
                *o += v_val * h;
            }
        }

        output
    }
}
|
||||
|
||||
// ============================================================================
|
||||
// Llama Model
|
||||
// ============================================================================
|
||||
|
||||
/// Llama model for sparse inference
pub struct LlamaModel {
    pub metadata: ModelMetadata,
    // One entry per transformer block, in depth order.
    pub layers: Vec<LlamaLayer>,
    // Token-id -> hidden-state embedding table.
    pub embed_tokens: Embedding,
    // Final normalization applied after the last layer.
    pub norm: RMSNorm,
    // Output projection to vocabulary logits; None when the model ties it
    // to the embedding weights or omits it.
    pub lm_head: Option<Linear>,
}

// One transformer block: pre-norm attention, pre-norm MLP, each with a
// residual connection (see ModelRunner::forward for LlamaModel).
pub struct LlamaLayer {
    pub input_layernorm: RMSNorm,
    pub self_attn: LlamaAttention,
    pub post_attention_layernorm: RMSNorm,
    pub mlp: LlamaMLP,
    // Optional low-rank predictor for this layer's MLP neuron activity;
    // None means the layer always runs the dense path.
    pub predictor: Option<LowRankPredictor>,
}

// Multi-head self-attention projections for one layer.
pub struct LlamaAttention {
    pub q_proj: Linear,
    pub k_proj: Linear,
    pub v_proj: Linear,
    pub o_proj: Linear,
    pub num_heads: usize,
    pub head_dim: usize,
}

// SwiGLU feed-forward block: down(silu(gate(x)) * up(x)).
pub struct LlamaMLP {
    pub gate_proj: Linear, // W1 for SwiGLU gate
    pub up_proj: Linear,   // W3 for SwiGLU up
    pub down_proj: Linear, // W2 for down projection
}
|
||||
|
||||
impl LlamaMLP {
|
||||
/// Standard forward pass (dense)
|
||||
pub fn forward(&self, x: &[f32]) -> Vec<f32> {
|
||||
let gate = self.gate_proj.forward(x);
|
||||
let up = self.up_proj.forward(x);
|
||||
|
||||
// SwiGLU: silu(gate) ⊙ up
|
||||
let hidden: Vec<f32> = gate
|
||||
.iter()
|
||||
.zip(up.iter())
|
||||
.map(|(&g, &u)| silu(g) * u)
|
||||
.collect();
|
||||
|
||||
self.down_proj.forward(&hidden)
|
||||
}
|
||||
|
||||
/// Sparse forward pass using predictor
|
||||
pub fn forward_sparse(&self, x: &[f32], active_neurons: &[usize]) -> Vec<f32> {
|
||||
// Only compute for active neurons in intermediate layer
|
||||
let gate = sparse_matmul(&self.gate_proj, x, active_neurons);
|
||||
let up = sparse_matmul(&self.up_proj, x, active_neurons);
|
||||
|
||||
// SwiGLU on active neurons only
|
||||
let hidden: Vec<f32> = gate
|
||||
.iter()
|
||||
.zip(up.iter())
|
||||
.map(|(&g, &u)| silu(g) * u)
|
||||
.collect();
|
||||
|
||||
// Sparse down projection
|
||||
sparse_matmul_full(&self.down_proj, &hidden, active_neurons)
|
||||
}
|
||||
}
|
||||
|
||||
impl ModelRunner for LlamaModel {
|
||||
fn forward(&self, input: &ModelInput, config: &InferenceConfig) -> Result<ModelOutput> {
|
||||
// Embed tokens
|
||||
let mut hidden_states = self.embed_tokens.forward(&input.input_ids);
|
||||
|
||||
let mut all_hidden_states = if config.output_hidden_states {
|
||||
Some(Vec::new())
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// Process each layer
|
||||
for (idx, layer) in self.layers.iter().enumerate() {
|
||||
if let Some(ref mut states) = all_hidden_states {
|
||||
states.push(hidden_states.clone());
|
||||
}
|
||||
|
||||
// Layer norm
|
||||
let normed = layer.input_layernorm.forward(&hidden_states);
|
||||
|
||||
// Self-attention (simplified, no KV cache)
|
||||
let attn_output = layer.self_attn.forward(&normed);
|
||||
|
||||
// Residual
|
||||
hidden_states = add_vectors(&hidden_states, &attn_output);
|
||||
|
||||
// Post-attention norm
|
||||
let normed = layer.post_attention_layernorm.forward(&hidden_states);
|
||||
|
||||
// MLP with optional sparsity
|
||||
let mlp_output = if config.use_sparse_ffn {
|
||||
if let Some(ref predictor) = layer.predictor {
|
||||
let k = config.active_neurons_per_layer.unwrap_or(
|
||||
(self.metadata.intermediate_size as f32 * (1.0 - config.sparsity)) as usize,
|
||||
);
|
||||
let active = predictor.predict_active(&normed, k);
|
||||
layer.mlp.forward_sparse(&normed, &active)
|
||||
} else {
|
||||
layer.mlp.forward(&normed)
|
||||
}
|
||||
} else {
|
||||
layer.mlp.forward(&normed)
|
||||
};
|
||||
|
||||
// Residual
|
||||
hidden_states = add_vectors(&hidden_states, &mlp_output);
|
||||
}
|
||||
|
||||
// Final norm
|
||||
hidden_states = self.norm.forward(&hidden_states);
|
||||
|
||||
// LM head
|
||||
let logits = if let Some(ref lm_head) = self.lm_head {
|
||||
lm_head.forward(&hidden_states)
|
||||
} else {
|
||||
hidden_states
|
||||
};
|
||||
|
||||
Ok(ModelOutput::new(logits).with_hidden_states(all_hidden_states.unwrap_or_default()))
|
||||
}
|
||||
|
||||
fn get_predictor(&self, layer_idx: usize) -> Option<&LowRankPredictor> {
|
||||
self.layers.get(layer_idx)?.predictor.as_ref()
|
||||
}
|
||||
|
||||
fn calibrate(&mut self, samples: &[ModelInput]) -> Result<CalibrationStats> {
|
||||
// Placeholder: would collect activation statistics
|
||||
Ok(CalibrationStats {
|
||||
num_samples: samples.len(),
|
||||
average_sparsity: 0.9,
|
||||
layer_stats: HashMap::new(),
|
||||
})
|
||||
}
|
||||
|
||||
fn metadata(&self) -> &ModelMetadata {
|
||||
&self.metadata
|
||||
}
|
||||
}
|
||||
|
||||
impl LlamaAttention {
|
||||
pub fn forward(&self, hidden_states: &[f32]) -> Vec<f32> {
|
||||
// Simplified: full attention without KV cache
|
||||
let q = self.q_proj.forward(hidden_states);
|
||||
let k = self.k_proj.forward(hidden_states);
|
||||
let v = self.v_proj.forward(hidden_states);
|
||||
|
||||
// Placeholder: would do scaled dot-product attention
|
||||
self.o_proj.forward(&q)
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// LFM2 Model (Liquid AI)
|
||||
// ============================================================================
|
||||
|
||||
/// LFM2 (Liquid AI) model: embedding, hybrid conv/attention layer stack,
/// and an optional pooler.
pub struct LFM2Model {
    pub metadata: ModelMetadata,
    pub embedding: Embedding,
    pub layers: Vec<LFM2Layer>,
    /// NOTE(review): not applied by `forward` — confirm whether pooling is
    /// intended to happen elsewhere.
    pub pooler: Option<Pooler>,
}
|
||||
|
||||
/// One LFM2 block: gated 1-D convolution, grouped-query attention,
/// sparse FFN, and a trailing layer norm.
pub struct LFM2Layer {
    pub gated_conv: GatedConv1d,
    pub attention: GroupedQueryAttention,
    pub ffn: SparseFfn,
    /// Applied after both residual additions (see LFM2Model::forward).
    pub norm: LayerNorm,
}
|
||||
|
||||
/// Gated 1-D convolution.
///
/// NOTE(review): `forward` is currently a pass-through placeholder; both
/// fields are unused until the convolution is implemented.
pub struct GatedConv1d {
    /// Convolution kernel weights.
    pub weight: Vec<Vec<f32>>,
    /// Gating projection.
    pub gate: Linear,
}
|
||||
|
||||
/// Grouped-query attention projections.
///
/// NOTE(review): `forward` is a placeholder applying only `o_proj`;
/// `num_groups` is not consulted yet.
pub struct GroupedQueryAttention {
    pub q_proj: Linear,
    pub k_proj: Linear,
    pub v_proj: Linear,
    pub o_proj: Linear,
    pub num_groups: usize,
}
|
||||
|
||||
/// Two-layer FFN (w2 · w1 · x) with an optional activation predictor that
/// enables sparse evaluation of the intermediate neurons.
pub struct SparseFfn {
    /// Expansion projection (model dim -> intermediate dim).
    pub w1: Linear,
    /// Contraction projection (intermediate dim -> model dim).
    pub w2: Linear,
    /// When present and sparse mode is on, selects the active w1 rows.
    pub predictor: Option<LowRankPredictor>,
}
|
||||
|
||||
impl ModelRunner for LFM2Model {
|
||||
fn forward(&self, input: &ModelInput, config: &InferenceConfig) -> Result<ModelOutput> {
|
||||
let mut hidden = self.embedding.forward(&input.input_ids);
|
||||
|
||||
for layer in &self.layers {
|
||||
// Gated convolution for local context
|
||||
hidden = layer.gated_conv.forward(&hidden);
|
||||
|
||||
// Grouped query attention
|
||||
let attn_out = layer.attention.forward(&hidden);
|
||||
hidden = add_vectors(&hidden, &attn_out);
|
||||
|
||||
// Sparse FFN
|
||||
let ffn_out = layer.ffn.forward(&hidden, config);
|
||||
hidden = add_vectors(&hidden, &ffn_out);
|
||||
|
||||
hidden = layer.norm.forward(&hidden);
|
||||
}
|
||||
|
||||
Ok(ModelOutput::new(hidden))
|
||||
}
|
||||
|
||||
fn get_predictor(&self, layer_idx: usize) -> Option<&LowRankPredictor> {
|
||||
self.layers.get(layer_idx)?.ffn.predictor.as_ref()
|
||||
}
|
||||
|
||||
fn calibrate(&mut self, _samples: &[ModelInput]) -> Result<CalibrationStats> {
|
||||
Ok(CalibrationStats {
|
||||
num_samples: 0,
|
||||
average_sparsity: 0.9,
|
||||
layer_stats: HashMap::new(),
|
||||
})
|
||||
}
|
||||
|
||||
fn metadata(&self) -> &ModelMetadata {
|
||||
&self.metadata
|
||||
}
|
||||
}
|
||||
|
||||
impl GatedConv1d {
|
||||
pub fn forward(&self, x: &[f32]) -> Vec<f32> {
|
||||
// Simplified convolution
|
||||
x.to_vec()
|
||||
}
|
||||
}
|
||||
|
||||
impl GroupedQueryAttention {
|
||||
pub fn forward(&self, x: &[f32]) -> Vec<f32> {
|
||||
self.o_proj.forward(x)
|
||||
}
|
||||
}
|
||||
|
||||
impl SparseFfn {
|
||||
pub fn forward(&self, x: &[f32], config: &InferenceConfig) -> Vec<f32> {
|
||||
if config.use_sparse_ffn {
|
||||
if let Some(ref predictor) = self.predictor {
|
||||
let k = (self.w1.out_features as f32 * (1.0 - config.sparsity)) as usize;
|
||||
let active = predictor.predict_active(x, k);
|
||||
return sparse_matmul_full(&self.w2, &self.w1.forward(x), &active);
|
||||
}
|
||||
}
|
||||
self.w2.forward(&self.w1.forward(x))
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// BERT Model
|
||||
// ============================================================================
|
||||
|
||||
/// BERT encoder model.
pub struct BertModel {
    pub metadata: ModelMetadata,
    pub embeddings: BertEmbeddings,
    /// Encoder layer stack, applied in order.
    pub encoder: Vec<BertLayer>,
    /// NOTE(review): not applied by `forward` — confirm whether pooling is
    /// intended to happen elsewhere.
    pub pooler: Option<Pooler>,
}
|
||||
|
||||
/// BERT input embeddings (word + position + token-type, plus layer norm).
///
/// NOTE(review): the current `forward` applies only `word_embeddings`;
/// the other fields are unused placeholders for now.
pub struct BertEmbeddings {
    pub word_embeddings: Embedding,
    pub position_embeddings: Embedding,
    pub token_type_embeddings: Embedding,
    pub layer_norm: LayerNorm,
}
|
||||
|
||||
/// One BERT encoder layer (post-norm: each norm follows its residual add).
pub struct BertLayer {
    pub attention: MultiHeadAttention,
    /// FFN expansion projection.
    pub intermediate: Linear,
    /// FFN contraction projection.
    pub output: Linear,
    /// Norm after the attention residual.
    pub layer_norm1: LayerNorm,
    /// Norm after the FFN residual.
    pub layer_norm2: LayerNorm,
}
|
||||
|
||||
/// Standard multi-head attention projections.
///
/// NOTE(review): `forward` is a placeholder applying only `o_proj`;
/// `num_heads` is not consulted yet.
pub struct MultiHeadAttention {
    pub q_proj: Linear,
    pub k_proj: Linear,
    pub v_proj: Linear,
    pub o_proj: Linear,
    pub num_heads: usize,
}
|
||||
|
||||
/// Sequence pooler: a dense projection over a sequence summary.
pub struct Pooler {
    pub dense: Linear,
}
|
||||
|
||||
impl ModelRunner for BertModel {
|
||||
fn forward(&self, input: &ModelInput, config: &InferenceConfig) -> Result<ModelOutput> {
|
||||
let mut hidden = self.embeddings.forward(&input.input_ids);
|
||||
|
||||
for layer in &self.encoder {
|
||||
let attn_out = layer.attention.forward(&hidden);
|
||||
hidden = layer.layer_norm1.forward(&add_vectors(&hidden, &attn_out));
|
||||
|
||||
let intermediate = layer.intermediate.forward(&hidden);
|
||||
let output = layer.output.forward(&intermediate);
|
||||
hidden = layer.layer_norm2.forward(&add_vectors(&hidden, &output));
|
||||
}
|
||||
|
||||
Ok(ModelOutput::new(hidden))
|
||||
}
|
||||
|
||||
fn get_predictor(&self, _layer_idx: usize) -> Option<&LowRankPredictor> {
|
||||
None
|
||||
}
|
||||
|
||||
fn calibrate(&mut self, _samples: &[ModelInput]) -> Result<CalibrationStats> {
|
||||
Ok(CalibrationStats {
|
||||
num_samples: 0,
|
||||
average_sparsity: 0.0,
|
||||
layer_stats: HashMap::new(),
|
||||
})
|
||||
}
|
||||
|
||||
fn metadata(&self) -> &ModelMetadata {
|
||||
&self.metadata
|
||||
}
|
||||
}
|
||||
|
||||
impl BertEmbeddings {
    /// Embeds token ids.
    ///
    /// NOTE(review): only `word_embeddings` is applied; position embeddings,
    /// token-type embeddings, and the layer norm are ignored — presumably a
    /// placeholder. Confirm before relying on BERT outputs.
    pub fn forward(&self, input_ids: &[u64]) -> Vec<f32> {
        self.word_embeddings.forward(input_ids)
    }
}
|
||||
|
||||
impl MultiHeadAttention {
|
||||
pub fn forward(&self, x: &[f32]) -> Vec<f32> {
|
||||
self.o_proj.forward(x)
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Unified Model Wrapper
|
||||
// ============================================================================
|
||||
|
||||
/// Unified wrapper over the supported architectures; `ModelRunner` calls
/// are dispatched to the concrete variant.
pub enum SparseModel {
    Llama(LlamaModel),
    LFM2(LFM2Model),
    Bert(BertModel),
}
|
||||
|
||||
// Pure delegation: every trait method fans out to the wrapped variant via an
// exhaustive match, so adding a new variant is a compile-time checklist.
impl ModelRunner for SparseModel {
    fn forward(&self, input: &ModelInput, config: &InferenceConfig) -> Result<ModelOutput> {
        match self {
            Self::Llama(m) => m.forward(input, config),
            Self::LFM2(m) => m.forward(input, config),
            Self::Bert(m) => m.forward(input, config),
        }
    }

    fn get_predictor(&self, layer_idx: usize) -> Option<&LowRankPredictor> {
        match self {
            Self::Llama(m) => m.get_predictor(layer_idx),
            Self::LFM2(m) => m.get_predictor(layer_idx),
            Self::Bert(m) => m.get_predictor(layer_idx),
        }
    }

    fn calibrate(&mut self, samples: &[ModelInput]) -> Result<CalibrationStats> {
        match self {
            Self::Llama(m) => m.calibrate(samples),
            Self::LFM2(m) => m.calibrate(samples),
            Self::Bert(m) => m.calibrate(samples),
        }
    }

    fn metadata(&self) -> &ModelMetadata {
        match self {
            Self::Llama(m) => m.metadata(),
            Self::LFM2(m) => m.metadata(),
            Self::Bert(m) => m.metadata(),
        }
    }
}
|
||||
|
||||
// ============================================================================
|
||||
// Helper Functions
|
||||
// ============================================================================
|
||||
|
||||
fn sparse_matmul(linear: &Linear, input: &[f32], active_cols: &[usize]) -> Vec<f32> {
|
||||
let mut output = vec![0.0; active_cols.len()];
|
||||
|
||||
for (out_idx, &col_idx) in active_cols.iter().enumerate() {
|
||||
if col_idx < linear.out_features {
|
||||
for (in_idx, &x) in input.iter().enumerate() {
|
||||
if in_idx < linear.in_features {
|
||||
output[out_idx] += linear.weight[col_idx][in_idx] * x;
|
||||
}
|
||||
}
|
||||
if let Some(ref bias) = linear.bias {
|
||||
output[out_idx] += bias[col_idx];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
output
|
||||
}
|
||||
|
||||
fn sparse_matmul_full(linear: &Linear, input: &[f32], active_input_cols: &[usize]) -> Vec<f32> {
|
||||
let mut output = vec![0.0; linear.out_features];
|
||||
|
||||
for out_idx in 0..linear.out_features {
|
||||
for &in_idx in active_input_cols {
|
||||
if in_idx < input.len() && in_idx < linear.in_features {
|
||||
output[out_idx] += linear.weight[out_idx][in_idx] * input[in_idx];
|
||||
}
|
||||
}
|
||||
if let Some(ref bias) = linear.bias {
|
||||
output[out_idx] += bias[out_idx];
|
||||
}
|
||||
}
|
||||
|
||||
output
|
||||
}
|
||||
|
||||
/// Element-wise sum of two vectors (residual-connection helper).
///
/// The result has the length of the shorter operand; a mismatch would
/// silently truncate, so debug builds now flag it.
fn add_vectors(a: &[f32], b: &[f32]) -> Vec<f32> {
    debug_assert_eq!(a.len(), b.len(), "residual add on mismatched lengths");
    a.iter().zip(b.iter()).map(|(x, y)| x + y).collect()
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // The predictor must return exactly `k` neuron indices when the input
    // matches its declared input dimension.
    #[test]
    fn test_low_rank_predictor() {
        let predictor = LowRankPredictor::new(128, 512, 16);
        let input = vec![1.0; 128];
        let active = predictor.predict_active(&input, 10);
        assert_eq!(active.len(), 10);
    }

    // Element-wise residual addition of equal-length vectors.
    #[test]
    fn test_add_vectors() {
        let a = vec![1.0, 2.0, 3.0];
        let b = vec![4.0, 5.0, 6.0];
        let result = add_vectors(&a, &b);
        assert_eq!(result, vec![5.0, 7.0, 9.0]);
    }
}
|
||||
159
vendor/ruvector/crates/ruvector-sparse-inference/src/model/types.rs
vendored
Normal file
159
vendor/ruvector/crates/ruvector-sparse-inference/src/model/types.rs
vendored
Normal file
@@ -0,0 +1,159 @@
|
||||
//! Core types for model inference
|
||||
|
||||
use std::collections::HashMap;
|
||||
|
||||
/// Generic tensor representation
///
/// A flat row-major `f32` buffer plus its logical shape and a debug name.
#[derive(Debug, Clone)]
pub struct Tensor {
    /// Flattened element storage.
    pub data: Vec<f32>,
    /// Logical dimensions; the product must equal `data.len()`.
    pub shape: Vec<u64>,
    /// Identifier (e.g. the tensor's name in the model file).
    pub name: String,
}

impl Tensor {
    /// Wraps existing data with a shape and name.
    pub fn new(data: Vec<f32>, shape: Vec<u64>, name: String) -> Self {
        Self { data, shape, name }
    }

    /// Allocates a zero-filled tensor of the given shape.
    pub fn zeros(shape: Vec<u64>, name: String) -> Self {
        let element_count: u64 = shape.iter().product();
        let data = vec![0.0; element_count as usize];
        Self { data, shape, name }
    }

    /// Total number of elements.
    pub fn size(&self) -> usize {
        self.data.len()
    }

    /// Replaces the shape in place; panics if the element count changes.
    pub fn reshape(&mut self, new_shape: Vec<u64>) {
        let requested = new_shape.iter().product::<u64>() as usize;
        assert_eq!(
            requested,
            self.size(),
            "Reshape size mismatch: {} vs {}",
            requested,
            self.size()
        );
        self.shape = new_shape;
    }
}
|
||||
|
||||
/// Model input configuration
///
/// Token ids plus optional attention mask and explicit position ids,
/// assembled builder-style.
#[derive(Debug, Clone)]
pub struct ModelInput {
    pub input_ids: Vec<u64>,
    pub attention_mask: Option<Vec<u8>>,
    pub position_ids: Option<Vec<u64>>,
}

impl ModelInput {
    /// Builds an input holding only token ids; mask and positions start unset.
    pub fn new(input_ids: Vec<u64>) -> Self {
        Self { input_ids, attention_mask: None, position_ids: None }
    }

    /// Builder-style setter for the attention mask.
    pub fn with_attention_mask(mut self, mask: Vec<u8>) -> Self {
        self.attention_mask = Some(mask);
        self
    }

    /// Builder-style setter for explicit position ids.
    pub fn with_position_ids(mut self, positions: Vec<u64>) -> Self {
        self.position_ids = Some(positions);
        self
    }

    /// Number of tokens in the sequence.
    pub fn sequence_length(&self) -> usize {
        self.input_ids.len()
    }
}
|
||||
|
||||
/// Model output
///
/// Logits (or final hidden states when no LM head exists) plus optional
/// per-layer hidden states and attention maps.
#[derive(Debug, Clone)]
pub struct ModelOutput {
    pub logits: Vec<f32>,
    pub hidden_states: Option<Vec<Vec<f32>>>,
    pub attentions: Option<Vec<Vec<f32>>>,
}

impl ModelOutput {
    /// Wraps logits with no auxiliary outputs attached.
    pub fn new(logits: Vec<f32>) -> Self {
        Self { logits, hidden_states: None, attentions: None }
    }

    /// Builder-style attachment of per-layer hidden states.
    pub fn with_hidden_states(mut self, states: Vec<Vec<f32>>) -> Self {
        self.hidden_states = Some(states);
        self
    }
}
|
||||
|
||||
/// Inference configuration
#[derive(Debug, Clone)]
pub struct InferenceConfig {
    /// Target sparsity ratio: 0.0 = fully dense, 1.0 = maximally sparse.
    pub sparsity: f32,

    /// Activation magnitude below which a neuron counts as inactive.
    pub sparsity_threshold: f32,

    /// Softmax temperature used when sampling.
    pub temperature: f32,

    /// Restrict sampling to the k most likely tokens, when set.
    pub top_k: Option<usize>,

    /// Nucleus (top-p) sampling cutoff, when set.
    pub top_p: Option<f32>,

    /// Whether to route FFN evaluation through the sparse path.
    pub use_sparse_ffn: bool,

    /// Explicit per-layer active-neuron budget; overrides the ratio-derived
    /// count when set.
    pub active_neurons_per_layer: Option<usize>,

    /// Whether to collect and return per-layer hidden states.
    pub output_hidden_states: bool,

    /// Whether to collect and return attention weights.
    pub output_attentions: bool,
}

impl Default for InferenceConfig {
    /// Defaults: 90% sparsity with the sparse FFN path enabled, plain
    /// temperature-1.0 sampling, and no auxiliary outputs.
    fn default() -> Self {
        Self {
            use_sparse_ffn: true,
            sparsity: 0.9,
            sparsity_threshold: 0.01,
            active_neurons_per_layer: None,
            temperature: 1.0,
            top_k: None,
            top_p: None,
            output_hidden_states: false,
            output_attentions: false,
        }
    }
}
|
||||
|
||||
/// Calibration statistics
///
/// Aggregate activation statistics produced by `ModelRunner::calibrate`.
#[derive(Debug, Clone)]
pub struct CalibrationStats {
    /// Number of calibration samples processed.
    pub num_samples: usize,
    /// Average sparsity observed across layers (fraction of inactive
    /// neurons; higher = sparser).
    pub average_sparsity: f32,
    /// Per-layer statistics keyed by layer index.
    pub layer_stats: HashMap<usize, LayerStats>,
}

/// Per-layer activation statistics.
#[derive(Debug, Clone)]
pub struct LayerStats {
    /// Neurons observed active in this layer.
    pub active_neurons: usize,
    /// Total neurons in this layer's intermediate dimension.
    pub total_neurons: usize,
    /// Observed sparsity for this layer (fraction of inactive neurons).
    pub sparsity: f32,
}
|
||||
Reference in New Issue
Block a user