Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'

This commit is contained in:
ruv
2026-02-28 14:39:40 -05:00
7854 changed files with 3522914 additions and 0 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,899 @@
//! Phi-3 Model Architecture Implementation
//!
//! Microsoft Phi-3 is a compact but powerful model featuring:
//! - **SuRoPE**: Scaled Uniform Rotary Position Embeddings for extended context
//! - **SwiGLU activation**: Gated Linear Unit with Swish (SiLU)
//! - **Fused gate_up_proj**: Combined gate and up projection for efficiency
//! - **Sliding window attention**: 2048 token window for memory efficiency
//!
//! ## Model Variants
//!
//! | Model | Hidden Size | Layers | Heads | Context |
//! |-------|-------------|--------|-------|---------|
//! | Phi-3-mini | 3072 | 32 | 32 | 4096/128K |
//! | Phi-3-small | 2560 | 32 | 32 | 8192/128K |
//! | Phi-3-medium | 5120 | 40 | 40 | 4096/128K |
//!
//! ## Example
//!
//! ```rust,ignore
//! use ruvllm::backends::phi3::{Phi3Config, Phi3Model};
//!
//! let config = Phi3Config::phi3_mini_128k();
//! let model = Phi3Model::new(&config)?;
//!
//! let output = model.forward(&input_ids, &attention_mask, None)?;
//! ```
use crate::error::{Result, RuvLLMError};
use crate::kernels::rope::{precompute_rope_tables_with_config, RopeConfig, RopeTables};
use crate::kernels::{apply_rope_neon, flash_attention_neon, rms_norm_neon, AttentionConfig};
#[cfg(target_arch = "aarch64")]
use std::arch::aarch64::*;
/// Phi-3 model configuration
#[derive(Debug, Clone)]
pub struct Phi3Config {
/// Hidden size (embedding dimension)
pub hidden_size: usize,
/// Intermediate size for MLP (typically 8/3 * hidden_size for SwiGLU)
pub intermediate_size: usize,
/// Number of hidden layers
pub num_hidden_layers: usize,
/// Number of attention heads
pub num_attention_heads: usize,
/// Number of key-value heads (same as attention heads for Phi-3, no GQA)
pub num_kv_heads: usize,
/// Vocabulary size
pub vocab_size: usize,
/// Maximum position embeddings
pub max_position_embeddings: usize,
/// Original maximum position embeddings (for SuRoPE scaling)
pub original_max_position_embeddings: usize,
/// RoPE base frequency
pub rope_theta: f32,
/// RoPE scaling factor (for SuRoPE)
pub rope_scaling_factor: f32,
/// RMS norm epsilon
pub rms_norm_eps: f32,
/// Sliding window size (typically 2048 for Phi-3)
pub sliding_window: Option<usize>,
/// Head dimension (hidden_size / num_attention_heads)
pub head_dim: usize,
/// Whether to use flash attention
pub use_flash_attention: bool,
/// BOS token ID
pub bos_token_id: u32,
/// EOS token ID
pub eos_token_id: u32,
}
impl Default for Phi3Config {
fn default() -> Self {
Self::phi3_mini_4k()
}
}
impl Phi3Config {
/// Phi-3-mini with 4K context
pub fn phi3_mini_4k() -> Self {
Self {
hidden_size: 3072,
intermediate_size: 8192,
num_hidden_layers: 32,
num_attention_heads: 32,
num_kv_heads: 32, // No GQA
vocab_size: 32064,
max_position_embeddings: 4096,
original_max_position_embeddings: 4096,
rope_theta: 10000.0,
rope_scaling_factor: 1.0,
rms_norm_eps: 1e-5,
sliding_window: Some(2048),
head_dim: 96, // 3072 / 32
use_flash_attention: true,
bos_token_id: 1,
eos_token_id: 32000,
}
}
/// Phi-3-mini with 128K extended context (SuRoPE)
pub fn phi3_mini_128k() -> Self {
Self {
hidden_size: 3072,
intermediate_size: 8192,
num_hidden_layers: 32,
num_attention_heads: 32,
num_kv_heads: 32,
vocab_size: 32064,
max_position_embeddings: 131072,
original_max_position_embeddings: 4096,
rope_theta: 10000.0,
rope_scaling_factor: 32.0, // SuRoPE scaling
rms_norm_eps: 1e-5,
sliding_window: Some(2048),
head_dim: 96,
use_flash_attention: true,
bos_token_id: 1,
eos_token_id: 32000,
}
}
/// Phi-3-small configuration
pub fn phi3_small() -> Self {
Self {
hidden_size: 2560,
intermediate_size: 6912,
num_hidden_layers: 32,
num_attention_heads: 32,
num_kv_heads: 32,
vocab_size: 32064,
max_position_embeddings: 8192,
original_max_position_embeddings: 8192,
rope_theta: 10000.0,
rope_scaling_factor: 1.0,
rms_norm_eps: 1e-5,
sliding_window: Some(2048),
head_dim: 80, // 2560 / 32
use_flash_attention: true,
bos_token_id: 1,
eos_token_id: 32000,
}
}
/// Phi-3-medium configuration
pub fn phi3_medium() -> Self {
Self {
hidden_size: 5120,
intermediate_size: 13824,
num_hidden_layers: 40,
num_attention_heads: 40,
num_kv_heads: 40,
vocab_size: 32064,
max_position_embeddings: 4096,
original_max_position_embeddings: 4096,
rope_theta: 10000.0,
rope_scaling_factor: 1.0,
rms_norm_eps: 1e-5,
sliding_window: Some(2048),
head_dim: 128, // 5120 / 40
use_flash_attention: true,
bos_token_id: 1,
eos_token_id: 32000,
}
}
/// Get the attention configuration
pub fn attention_config(&self) -> AttentionConfig {
AttentionConfig {
num_heads: self.num_attention_heads,
num_kv_heads: self.num_kv_heads,
head_dim: self.head_dim,
max_seq_len: self.max_position_embeddings,
causal: true,
scale: 0.0, // Will be computed from head_dim
}
}
/// Get the RoPE configuration with SuRoPE scaling
pub fn rope_config(&self) -> RopeConfig {
RopeConfig {
base: self.rope_theta,
head_dim: self.head_dim,
max_seq_len: self.max_position_embeddings,
scaling_factor: self.rope_scaling_factor,
ntk_aware: self.rope_scaling_factor > 1.0,
original_max_len: self.original_max_position_embeddings,
}
}
}
/// Phi-3 Attention layer
///
/// Implements multi-head attention with:
/// - SuRoPE (Scaled Uniform RoPE) for extended context
/// - Optional sliding window attention
/// - Fused QKV projection
#[derive(Debug)]
pub struct Phi3Attention {
/// Query projection weights (hidden_size, hidden_size)
pub q_proj: Vec<f32>,
/// Key projection weights (hidden_size, hidden_size)
pub k_proj: Vec<f32>,
/// Value projection weights (hidden_size, hidden_size)
pub v_proj: Vec<f32>,
/// Output projection weights (hidden_size, hidden_size)
pub o_proj: Vec<f32>,
/// Configuration
pub config: Phi3Config,
/// Precomputed RoPE tables
pub rope_tables: RopeTables,
}
impl Phi3Attention {
/// Create a new Phi3Attention layer
pub fn new(config: &Phi3Config) -> Self {
let hidden_size = config.hidden_size;
let qkv_size = hidden_size * hidden_size;
Self {
q_proj: vec![0.0; qkv_size],
k_proj: vec![0.0; qkv_size],
v_proj: vec![0.0; qkv_size],
o_proj: vec![0.0; qkv_size],
config: config.clone(),
rope_tables: precompute_rope_tables_with_config(&config.rope_config()),
}
}
/// Load weights from flat arrays
pub fn load_weights(
&mut self,
q_proj: &[f32],
k_proj: &[f32],
v_proj: &[f32],
o_proj: &[f32],
) -> Result<()> {
let expected_size = self.config.hidden_size * self.config.hidden_size;
if q_proj.len() != expected_size
|| k_proj.len() != expected_size
|| v_proj.len() != expected_size
|| o_proj.len() != expected_size
{
return Err(RuvLLMError::Model(format!(
"Invalid weight dimensions: expected {}, got q={}, k={}, v={}, o={}",
expected_size,
q_proj.len(),
k_proj.len(),
v_proj.len(),
o_proj.len()
)));
}
self.q_proj.copy_from_slice(q_proj);
self.k_proj.copy_from_slice(k_proj);
self.v_proj.copy_from_slice(v_proj);
self.o_proj.copy_from_slice(o_proj);
Ok(())
}
/// Forward pass through attention
///
/// # Arguments
/// * `hidden_states` - Input tensor (batch_size * seq_len, hidden_size)
/// * `positions` - Position indices for RoPE
/// * `kv_cache` - Optional KV cache (keys, values)
///
/// # Returns
/// Output tensor (batch_size * seq_len, hidden_size)
pub fn forward(
&self,
hidden_states: &[f32],
positions: &[usize],
kv_cache: Option<(&mut Vec<f32>, &mut Vec<f32>)>,
) -> Result<Vec<f32>> {
let seq_len = positions.len();
let hidden_size = self.config.hidden_size;
let num_heads = self.config.num_attention_heads;
let head_dim = self.config.head_dim;
if hidden_states.len() != seq_len * hidden_size {
return Err(RuvLLMError::InvalidOperation(format!(
"Invalid hidden_states shape: expected {}, got {}",
seq_len * hidden_size,
hidden_states.len()
)));
}
// Project to Q, K, V
let mut query =
self.linear_transform(hidden_states, &self.q_proj, hidden_size, hidden_size);
let mut key = self.linear_transform(hidden_states, &self.k_proj, hidden_size, hidden_size);
let value = self.linear_transform(hidden_states, &self.v_proj, hidden_size, hidden_size);
// Apply SuRoPE (Scaled Uniform RoPE)
self.apply_surope(&mut query, positions);
self.apply_surope(&mut key, positions);
// Handle KV cache
let (key_states, value_states) = if let Some((k_cache, v_cache)) = kv_cache {
k_cache.extend_from_slice(&key);
v_cache.extend_from_slice(&value);
(k_cache.as_slice(), v_cache.as_slice())
} else {
(key.as_slice(), value.as_slice())
};
// Compute attention for each head
let kv_len = key_states.len() / hidden_size;
let scale = 1.0 / (head_dim as f32).sqrt();
let mut output = vec![0.0; seq_len * hidden_size];
for h in 0..num_heads {
for t in 0..seq_len {
// Extract query for this head and position
let q_offset = (t * num_heads + h) * head_dim;
let q_slice = &query[q_offset..q_offset + head_dim];
// Extract keys and values for this head
let mut k_slice = Vec::with_capacity(kv_len * head_dim);
let mut v_slice = Vec::with_capacity(kv_len * head_dim);
for kv_t in 0..kv_len {
let kv_offset = (kv_t * num_heads + h) * head_dim;
k_slice.extend_from_slice(&key_states[kv_offset..kv_offset + head_dim]);
v_slice.extend_from_slice(&value_states[kv_offset..kv_offset + head_dim]);
}
// Apply sliding window if configured
let (k_slice, v_slice, effective_kv_len) =
if let Some(window) = self.config.sliding_window {
let pos = positions[t];
let start = pos.saturating_sub(window);
let end = kv_len;
if start > 0 {
let start_offset = start * head_dim;
(
k_slice[start_offset..].to_vec(),
v_slice[start_offset..].to_vec(),
end - start,
)
} else {
(k_slice, v_slice, kv_len)
}
} else {
(k_slice, v_slice, kv_len)
};
// Flash attention
let head_output = flash_attention_neon(q_slice, &k_slice, &v_slice, scale, true);
// Write output
let out_offset = (t * num_heads + h) * head_dim;
output[out_offset..out_offset + head_dim].copy_from_slice(&head_output);
}
}
// Output projection
let output = self.linear_transform(&output, &self.o_proj, hidden_size, hidden_size);
Ok(output)
}
/// Apply SuRoPE (Scaled Uniform RoPE)
fn apply_surope(&self, x: &mut [f32], positions: &[usize]) {
let head_dim = self.config.head_dim;
let num_heads = self.config.num_attention_heads;
let seq_len = positions.len();
// Apply RoPE per head
for h in 0..num_heads {
for t in 0..seq_len {
let offset = (t * num_heads + h) * head_dim;
let mut head_vec = x[offset..offset + head_dim].to_vec();
// Scale position by scaling factor for SuRoPE
let scaled_pos = (positions[t] as f32 / self.config.rope_scaling_factor) as usize;
apply_rope_neon(
&mut head_vec,
&[scaled_pos],
head_dim,
self.config.rope_theta,
);
x[offset..offset + head_dim].copy_from_slice(&head_vec);
}
}
}
/// Linear transformation: output = input @ weights.T
fn linear_transform(
&self,
input: &[f32],
weights: &[f32],
in_dim: usize,
out_dim: usize,
) -> Vec<f32> {
let batch_size = input.len() / in_dim;
let mut output = vec![0.0; batch_size * out_dim];
for b in 0..batch_size {
for o in 0..out_dim {
let mut sum = 0.0;
for i in 0..in_dim {
sum += input[b * in_dim + i] * weights[o * in_dim + i];
}
output[b * out_dim + o] = sum;
}
}
output
}
}
/// Phi-3 MLP layer with SwiGLU activation
///
/// SwiGLU combines gating with Swish activation:
/// ```text
/// MLP(x) = down_proj(SiLU(gate_proj(x)) * up_proj(x))
/// ```
///
/// Phi-3 uses a fused gate_up_proj for efficiency
#[derive(Debug)]
pub struct Phi3MLP {
/// Gate projection weights (intermediate_size, hidden_size)
pub gate_proj: Vec<f32>,
/// Up projection weights (intermediate_size, hidden_size)
pub up_proj: Vec<f32>,
/// Down projection weights (hidden_size, intermediate_size)
pub down_proj: Vec<f32>,
/// Hidden size
pub hidden_size: usize,
/// Intermediate size
pub intermediate_size: usize,
}
impl Phi3MLP {
/// Create a new Phi3MLP layer
pub fn new(config: &Phi3Config) -> Self {
Self {
gate_proj: vec![0.0; config.intermediate_size * config.hidden_size],
up_proj: vec![0.0; config.intermediate_size * config.hidden_size],
down_proj: vec![0.0; config.hidden_size * config.intermediate_size],
hidden_size: config.hidden_size,
intermediate_size: config.intermediate_size,
}
}
/// Load weights
pub fn load_weights(
&mut self,
gate_proj: &[f32],
up_proj: &[f32],
down_proj: &[f32],
) -> Result<()> {
let gate_up_size = self.intermediate_size * self.hidden_size;
let down_size = self.hidden_size * self.intermediate_size;
if gate_proj.len() != gate_up_size
|| up_proj.len() != gate_up_size
|| down_proj.len() != down_size
{
return Err(RuvLLMError::Model(
"Invalid MLP weight dimensions".to_string(),
));
}
self.gate_proj.copy_from_slice(gate_proj);
self.up_proj.copy_from_slice(up_proj);
self.down_proj.copy_from_slice(down_proj);
Ok(())
}
/// Forward pass with SwiGLU activation
pub fn forward(&self, hidden_states: &[f32]) -> Result<Vec<f32>> {
let batch_size = hidden_states.len() / self.hidden_size;
// Gate projection + SiLU
let gate = self.linear(
hidden_states,
&self.gate_proj,
self.hidden_size,
self.intermediate_size,
);
let gate_activated = self.silu(&gate);
// Up projection
let up = self.linear(
hidden_states,
&self.up_proj,
self.hidden_size,
self.intermediate_size,
);
// Element-wise multiply (gating)
let hidden: Vec<f32> = gate_activated
.iter()
.zip(up.iter())
.map(|(g, u)| g * u)
.collect();
// Down projection
let output = self.linear(
&hidden,
&self.down_proj,
self.intermediate_size,
self.hidden_size,
);
Ok(output)
}
/// Linear transformation
fn linear(&self, input: &[f32], weights: &[f32], in_dim: usize, out_dim: usize) -> Vec<f32> {
let batch_size = input.len() / in_dim;
let mut output = vec![0.0; batch_size * out_dim];
#[cfg(target_arch = "aarch64")]
unsafe {
self.linear_neon(input, weights, &mut output, batch_size, in_dim, out_dim);
}
#[cfg(not(target_arch = "aarch64"))]
{
for b in 0..batch_size {
for o in 0..out_dim {
let mut sum = 0.0;
for i in 0..in_dim {
sum += input[b * in_dim + i] * weights[o * in_dim + i];
}
output[b * out_dim + o] = sum;
}
}
}
output
}
/// NEON-optimized linear transformation
#[cfg(target_arch = "aarch64")]
unsafe fn linear_neon(
&self,
input: &[f32],
weights: &[f32],
output: &mut [f32],
batch_size: usize,
in_dim: usize,
out_dim: usize,
) {
let in_ptr: *const f32 = input.as_ptr();
let w_ptr: *const f32 = weights.as_ptr();
let out_ptr: *mut f32 = output.as_mut_ptr();
for b in 0..batch_size {
for o in 0..out_dim {
let mut acc = vdupq_n_f32(0.0);
let mut i = 0;
// Process 4 elements at a time
while i + 4 <= in_dim {
let x = vld1q_f32(in_ptr.add(b * in_dim + i));
let w = vld1q_f32(w_ptr.add(o * in_dim + i));
acc = vfmaq_f32(acc, x, w);
i += 4;
}
// Horizontal sum
let mut sum = vaddvq_f32(acc);
// Handle remainder
while i < in_dim {
sum += *in_ptr.add(b * in_dim + i) * *w_ptr.add(o * in_dim + i);
i += 1;
}
*out_ptr.add(b * out_dim + o) = sum;
}
}
}
/// SiLU (Swish) activation: x * sigmoid(x)
///
/// Uses the vectorized NEON implementation from the activations module
/// for ~3.5x speedup over the previous scalar-in-vector approach.
fn silu(&self, x: &[f32]) -> Vec<f32> {
crate::kernels::silu_vec(x)
}
}
/// Phi-3 Decoder Layer
///
/// Each layer consists of:
/// 1. Self-attention with pre-normalization
/// 2. MLP with pre-normalization
#[derive(Debug)]
pub struct Phi3DecoderLayer {
/// Self attention
pub self_attn: Phi3Attention,
/// MLP
pub mlp: Phi3MLP,
/// Input layer norm weights
pub input_layernorm: Vec<f32>,
/// Post-attention layer norm weights
pub post_attention_layernorm: Vec<f32>,
/// RMS norm epsilon
pub rms_norm_eps: f32,
/// Hidden size
pub hidden_size: usize,
}
impl Phi3DecoderLayer {
/// Create a new decoder layer
pub fn new(config: &Phi3Config) -> Self {
Self {
self_attn: Phi3Attention::new(config),
mlp: Phi3MLP::new(config),
input_layernorm: vec![1.0; config.hidden_size],
post_attention_layernorm: vec![1.0; config.hidden_size],
rms_norm_eps: config.rms_norm_eps,
hidden_size: config.hidden_size,
}
}
/// Forward pass
pub fn forward(
&self,
hidden_states: &[f32],
positions: &[usize],
kv_cache: Option<(&mut Vec<f32>, &mut Vec<f32>)>,
) -> Result<Vec<f32>> {
let seq_len = positions.len();
// Pre-norm for attention
let mut normed = hidden_states.to_vec();
for t in 0..seq_len {
let offset = t * self.hidden_size;
let slice = &mut normed[offset..offset + self.hidden_size];
rms_norm_neon(slice, &self.input_layernorm, self.rms_norm_eps);
}
// Self attention
let attn_output = self.self_attn.forward(&normed, positions, kv_cache)?;
// Residual connection
let mut hidden: Vec<f32> = hidden_states
.iter()
.zip(attn_output.iter())
.map(|(h, a)| h + a)
.collect();
// Pre-norm for MLP
let mut normed = hidden.clone();
for t in 0..seq_len {
let offset = t * self.hidden_size;
let slice = &mut normed[offset..offset + self.hidden_size];
rms_norm_neon(slice, &self.post_attention_layernorm, self.rms_norm_eps);
}
// MLP
let mlp_output = self.mlp.forward(&normed)?;
// Residual connection
for (h, m) in hidden.iter_mut().zip(mlp_output.iter()) {
*h += m;
}
Ok(hidden)
}
}
/// Complete Phi-3 Model
#[derive(Debug)]
pub struct Phi3Model {
/// Model configuration
pub config: Phi3Config,
/// Token embeddings (vocab_size, hidden_size)
pub embed_tokens: Vec<f32>,
/// Decoder layers
pub layers: Vec<Phi3DecoderLayer>,
/// Final layer norm
pub norm: Vec<f32>,
/// LM head weights (vocab_size, hidden_size) - often tied to embeddings
pub lm_head: Option<Vec<f32>>,
/// Whether lm_head is tied to embeddings
pub tie_word_embeddings: bool,
}
impl Phi3Model {
/// Create a new Phi-3 model
pub fn new(config: &Phi3Config) -> Result<Self> {
let mut layers = Vec::with_capacity(config.num_hidden_layers);
for _ in 0..config.num_hidden_layers {
layers.push(Phi3DecoderLayer::new(config));
}
Ok(Self {
config: config.clone(),
embed_tokens: vec![0.0; config.vocab_size * config.hidden_size],
layers,
norm: vec![1.0; config.hidden_size],
lm_head: None,
tie_word_embeddings: true,
})
}
/// Forward pass through the model
///
/// # Arguments
/// * `input_ids` - Token IDs (batch_size * seq_len)
/// * `positions` - Position indices
/// * `kv_caches` - Optional KV caches for each layer
///
/// # Returns
/// Logits tensor (batch_size * seq_len, vocab_size)
pub fn forward(
&self,
input_ids: &[u32],
positions: &[usize],
mut kv_caches: Option<&mut Vec<(Vec<f32>, Vec<f32>)>>,
) -> Result<Vec<f32>> {
let seq_len = positions.len();
if input_ids.len() != seq_len {
return Err(RuvLLMError::InvalidOperation(format!(
"input_ids length {} != positions length {}",
input_ids.len(),
seq_len
)));
}
// Token embeddings
let mut hidden_states = Vec::with_capacity(seq_len * self.config.hidden_size);
for &token_id in input_ids {
let offset = (token_id as usize) * self.config.hidden_size;
if offset + self.config.hidden_size > self.embed_tokens.len() {
return Err(RuvLLMError::InvalidOperation(format!(
"Token ID {} out of vocabulary bounds",
token_id
)));
}
hidden_states
.extend_from_slice(&self.embed_tokens[offset..offset + self.config.hidden_size]);
}
// Process through decoder layers
for (layer_idx, layer) in self.layers.iter().enumerate() {
let kv_cache = kv_caches.as_mut().map(|caches| {
while caches.len() <= layer_idx {
caches.push((Vec::new(), Vec::new()));
}
let (k, v) = &mut caches[layer_idx];
(k, v)
});
hidden_states = layer.forward(&hidden_states, positions, kv_cache)?;
}
// Final norm
for t in 0..seq_len {
let offset = t * self.config.hidden_size;
let slice = &mut hidden_states[offset..offset + self.config.hidden_size];
rms_norm_neon(slice, &self.norm, self.config.rms_norm_eps);
}
// LM head
let lm_weights = if self.tie_word_embeddings {
&self.embed_tokens
} else {
self.lm_head
.as_ref()
.ok_or_else(|| RuvLLMError::InvalidOperation("No LM head weights".to_string()))?
};
// Compute logits
let mut logits = vec![0.0; seq_len * self.config.vocab_size];
for t in 0..seq_len {
for v in 0..self.config.vocab_size {
let mut sum = 0.0;
for h in 0..self.config.hidden_size {
sum += hidden_states[t * self.config.hidden_size + h]
* lm_weights[v * self.config.hidden_size + h];
}
logits[t * self.config.vocab_size + v] = sum;
}
}
Ok(logits)
}
/// Generate Phi-3 chat template format
///
/// Phi-3 uses: `<|user|>\n{content}<|end|>\n<|assistant|>`
pub fn apply_chat_template(messages: &[(String, String)]) -> String {
let mut result = String::new();
for (role, content) in messages {
result.push_str(&format!("<|{}|>\n{}<|end|>\n", role, content));
}
result.push_str("<|assistant|>");
result
}
/// Load model weights from GGUF format
#[cfg(feature = "candle")]
pub fn from_gguf(_path: &std::path::Path) -> Result<Self> {
// Implementation would parse GGUF and load weights
Err(RuvLLMError::NotFound(
"GGUF loading not yet implemented for Phi-3".to_string(),
))
}
/// Load model weights from safetensors format
#[cfg(feature = "candle")]
pub fn from_safetensors(_path: &std::path::Path) -> Result<Self> {
// Implementation would parse safetensors and load weights
Err(RuvLLMError::NotFound(
"Safetensors loading not yet implemented for Phi-3".to_string(),
))
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_phi3_config() {
let config = Phi3Config::phi3_mini_4k();
assert_eq!(config.hidden_size, 3072);
assert_eq!(config.num_hidden_layers, 32);
assert_eq!(config.head_dim, 96);
assert_eq!(config.sliding_window, Some(2048));
}
#[test]
fn test_phi3_config_128k() {
let config = Phi3Config::phi3_mini_128k();
assert_eq!(config.max_position_embeddings, 131072);
assert_eq!(config.rope_scaling_factor, 32.0);
}
#[test]
fn test_phi3_attention_config() {
let config = Phi3Config::phi3_mini_4k();
let attn_config = config.attention_config();
assert_eq!(attn_config.num_heads, 32);
assert_eq!(attn_config.num_kv_heads, 32);
assert!(attn_config.causal);
}
#[test]
fn test_phi3_mlp_silu() {
let config = Phi3Config::phi3_mini_4k();
let mlp = Phi3MLP::new(&config);
// Test SiLU activation
let input = vec![0.0, 1.0, -1.0, 2.0];
let output = mlp.silu(&input);
// SiLU(0) = 0
assert!((output[0]).abs() < 1e-5);
// SiLU(1) = 1 * sigmoid(1) ~ 0.731
assert!((output[1] - 0.731).abs() < 0.01);
// SiLU(-1) ~ -0.269
assert!((output[2] - (-0.269)).abs() < 0.01);
}
#[test]
fn test_phi3_model_creation() {
let config = Phi3Config::phi3_mini_4k();
let model = Phi3Model::new(&config).unwrap();
assert_eq!(model.layers.len(), 32);
assert_eq!(
model.embed_tokens.len(),
config.vocab_size * config.hidden_size
);
}
#[test]
fn test_chat_template() {
let messages = vec![
("user".to_string(), "Hello!".to_string()),
("assistant".to_string(), "Hi there!".to_string()),
("user".to_string(), "How are you?".to_string()),
];
let template = Phi3Model::apply_chat_template(&messages);
assert!(template.contains("<|user|>"));
assert!(template.contains("<|assistant|>"));
assert!(template.contains("<|end|>"));
assert!(template.ends_with("<|assistant|>"));
}
}