//! Pretraining and Fine-tuning for SIMD Transformer Models
//!
//! Implements:
//! - Data pipeline with tokenization
//! - Training loop with cross-entropy loss
//! - Gradient descent with SIMD-optimized operations
//! - Model checkpointing
//! - Perplexity tracking

use crate::simd_inference::{SimdOps, SimpleTokenizer, SmallTransformer};
use ndarray::Array2;
use std::collections::HashMap;
use std::time::Instant;

/// Training configuration
#[derive(Debug, Clone)]
pub struct TrainingConfig {
    /// Learning rate
    pub learning_rate: f32,
    /// Batch size
    pub batch_size: usize,
    /// Number of epochs
    pub epochs: usize,
    /// Warmup steps
    pub warmup_steps: usize,
    /// Gradient clipping threshold
    pub grad_clip: f32,
    /// Weight decay (L2 regularization)
    pub weight_decay: f32,
    /// Sequence length
    pub seq_length: usize,
    /// Log every N steps
    pub log_interval: usize,
    /// Checkpoint every N steps
    pub checkpoint_interval: usize,
}

impl Default for TrainingConfig {
    fn default() -> Self {
        Self {
            learning_rate: 1e-4,
            batch_size: 8,
            epochs: 3,
            warmup_steps: 100,
            grad_clip: 1.0,
            weight_decay: 0.01,
            seq_length: 128,
            log_interval: 10,
            checkpoint_interval: 100,
        }
    }
}

/// Training metrics
#[derive(Debug, Clone, Default)]
pub struct TrainingMetrics {
    /// Current epoch
    pub epoch: usize,
    /// Current step
    pub step: usize,
    /// Training loss
    pub loss: f64,
    /// Perplexity
    pub perplexity: f64,
    /// Tokens per second
    pub tokens_per_second: f64,
    /// Learning rate (with warmup/decay)
    pub current_lr: f64,
    /// Gradient norm
    pub grad_norm: f64,
}

/// Training dataset
pub struct TrainingDataset {
    /// Tokenized sequences
    sequences: Vec<Vec<u32>>,
    /// Vocabulary size
    vocab_size: usize,
    /// Sequence length
    seq_length: usize,
}

impl TrainingDataset {
    /// Create from raw text corpus
    pub fn from_text(texts: &[&str], tokenizer: &SimpleTokenizer, seq_length: usize) -> Self {
        let mut sequences = Vec::new();
        for text in texts {
            let tokens = tokenizer.encode(text);
            // Split into chunks of seq_length
            for chunk in tokens.chunks(seq_length) {
                if chunk.len() >= 2 {
                    sequences.push(chunk.to_vec());
                }
            }
        }
        Self {
            sequences,
            vocab_size: tokenizer.vocab_size(),
            seq_length,
        }
    }

    /// Create synthetic dataset for demo
    pub fn synthetic(vocab_size: usize, num_sequences: usize, seq_length: usize) -> Self {
        use rand::Rng;
        let mut rng = rand::thread_rng();
        let sequences: Vec<Vec<u32>> = (0..num_sequences)
            .map(|_| {
                (0..seq_length)
                    .map(|_| rng.gen_range(0..vocab_size as u32))
                    .collect()
            })
            .collect();
        Self {
            sequences,
            vocab_size,
            seq_length,
        }
    }

    /// Get number of sequences
    pub fn len(&self) -> usize {
        self.sequences.len()
    }

    /// Check if empty
    pub fn is_empty(&self) -> bool {
        self.sequences.is_empty()
    }

    /// Get a batch of (input, target) pairs
    pub fn get_batch(&self, indices: &[usize]) -> (Vec<Vec<u32>>, Vec<Vec<u32>>) {
        let inputs: Vec<Vec<u32>> = indices
            .iter()
            .map(|&i| {
                let seq = &self.sequences[i % self.sequences.len()];
                seq[..seq.len().saturating_sub(1)].to_vec()
            })
            .collect();
        let targets: Vec<Vec<u32>> = indices
            .iter()
            .map(|&i| {
                let seq = &self.sequences[i % self.sequences.len()];
                seq[1..].to_vec()
            })
            .collect();
        (inputs, targets)
    }
}
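
// --- Illustrative sketch (not part of the original pipeline) --------------------
// `TrainingDataset::get_batch` accepts arbitrary index slices, so a caller can
// shuffle the sequence order once per epoch before slicing the indices into
// batches. The helper name `shuffled_epoch_indices` is hypothetical.
#[allow(dead_code)]
fn shuffled_epoch_indices(dataset: &TrainingDataset) -> Vec<usize> {
    use rand::seq::SliceRandom;
    let mut indices: Vec<usize> = (0..dataset.len()).collect();
    indices.shuffle(&mut rand::thread_rng());
    indices
}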

/// Trainable transformer layer with float32 weights
pub struct TrainableLayer {
    /// Query projection
    pub wq: Array2<f32>,
    /// Key projection
    pub wk: Array2<f32>,
    /// Value projection
    pub wv: Array2<f32>,
    /// Output projection
    pub wo: Array2<f32>,
    /// FFN gate
    pub w1: Array2<f32>,
    /// FFN down
    pub w2: Array2<f32>,
    /// FFN up
    pub w3: Array2<f32>,
    /// Attention norm weights
    pub attn_norm: Vec<f32>,
    /// FFN norm weights
    pub ffn_norm: Vec<f32>,
    /// Hidden dimension
    pub hidden_dim: usize,
    /// Number of heads
    pub num_heads: usize,
    /// Head dimension
    pub head_dim: usize,
}

impl TrainableLayer {
    /// Create with random initialization
    pub fn new_random(hidden_dim: usize, num_heads: usize, ffn_dim: usize) -> Self {
        use rand::Rng;
        let mut rng = rand::thread_rng();
        let head_dim = hidden_dim / num_heads;

        // Xavier-style uniform initialization in [-scale, scale]
        let mut init = |rows: usize, cols: usize| -> Array2<f32> {
            let scale = (2.0 / (rows + cols) as f32).sqrt();
            Array2::from_shape_fn((rows, cols), |_| rng.gen::<f32>() * scale * 2.0 - scale)
        };

        Self {
            wq: init(hidden_dim, hidden_dim),
            wk: init(hidden_dim, hidden_dim),
            wv: init(hidden_dim, hidden_dim),
            wo: init(hidden_dim, hidden_dim),
            w1: init(ffn_dim, hidden_dim),
            w2: init(hidden_dim, ffn_dim),
            w3: init(ffn_dim, hidden_dim),
            attn_norm: vec![1.0; hidden_dim],
            ffn_norm: vec![1.0; hidden_dim],
            hidden_dim,
            num_heads,
            head_dim,
        }
    }

    /// Forward pass returning the updated hidden state
    pub fn forward(&self, x: &[f32]) -> Vec<f32> {
        // RMS norm before attention
        let normed = SimdOps::rms_norm(x, &self.attn_norm, 1e-6);

        // QKV projections using SIMD
        let q = matmul_vec(&self.wq, &normed);
        let k = matmul_vec(&self.wk, &normed);
        let v = matmul_vec(&self.wv, &normed);

        // Simple self-attention over a single token (no KV cache / history)
        let mut attn_out = vec![0.0f32; self.hidden_dim];
        for h in 0..self.num_heads {
            let start = h * self.head_dim;
            let end = start + self.head_dim;
            let q_head = &q[start..end];
            let k_head = &k[start..end];
            let v_head = &v[start..end];

            // Score = Q·K / sqrt(d)
            let score = SimdOps::dot_product(q_head, k_head) / (self.head_dim as f32).sqrt();
            // Unnormalized attention weight: a softmax over a single position would
            // reduce to 1.0, so exp(score) is used to keep Q/K in the computation.
            let weight = score.exp();
            for (i, &v_val) in v_head.iter().enumerate() {
                attn_out[start + i] += weight * v_val;
            }
        }

        // Output projection
        let attn_out = matmul_vec(&self.wo, &attn_out);

        // Residual connection
        let mut hidden: Vec<f32> = x.iter().zip(attn_out.iter()).map(|(a, b)| a + b).collect();

        // FFN
        let normed = SimdOps::rms_norm(&hidden, &self.ffn_norm, 1e-6);
        let gate = matmul_vec(&self.w1, &normed);
        let up = matmul_vec(&self.w3, &normed);

        // SiLU(gate) * up
        let ffn_hidden: Vec<f32> = gate
            .iter()
            .zip(up.iter())
            .map(|(g, u)| SimdOps::silu(*g) * u)
            .collect();
        let ffn_out = matmul_vec(&self.w2, &ffn_hidden);

        // Residual connection
        for (h, f) in hidden.iter_mut().zip(ffn_out.iter()) {
            *h += f;
        }

        hidden
    }
}

/// SIMD matrix-vector multiplication (f32)
fn matmul_vec(matrix: &Array2<f32>, vec: &[f32]) -> Vec<f32> {
    let rows = matrix.nrows();
    let mut result = vec![0.0f32; rows];
    for (i, row) in matrix.rows().into_iter().enumerate() {
        result[i] = SimdOps::dot_product(row.as_slice().unwrap(), vec);
    }
    result
}
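
// --- Illustrative sketch (not part of the original module) ----------------------
// Rows of a matrix-vector product are independent, so they can be distributed
// across threads with rayon. Each row is copied into an owned Vec for simplicity;
// this assumes `SimdOps::dot_product` can be called concurrently from worker
// threads, which this module does not verify.
#[allow(dead_code)]
fn matmul_vec_parallel(matrix: &Array2<f32>, vec: &[f32]) -> Vec<f32> {
    use rayon::prelude::*;
    let rows: Vec<Vec<f32>> = matrix.rows().into_iter().map(|r| r.to_vec()).collect();
    rows.par_iter()
        .map(|row| SimdOps::dot_product(row, vec))
        .collect()
}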

/// Trainable transformer model
pub struct TrainableModel {
    /// Embedding table (vocab_size x hidden_dim)
    pub embeddings: Array2<f32>,
    /// Transformer layers
    pub layers: Vec<TrainableLayer>,
    /// Output norm
    pub output_norm: Vec<f32>,
    /// LM head (vocab_size x hidden_dim)
    pub lm_head: Array2<f32>,
    /// Vocabulary size
    pub vocab_size: usize,
    /// Hidden dimension
    pub hidden_dim: usize,
}

impl TrainableModel {
    /// Create with random initialization
    pub fn new_random(
        vocab_size: usize,
        hidden_dim: usize,
        num_layers: usize,
        num_heads: usize,
        ffn_dim: usize,
    ) -> Self {
        use rand::Rng;
        let mut rng = rand::thread_rng();
        let scale = (1.0 / hidden_dim as f32).sqrt();

        let embeddings = Array2::from_shape_fn((vocab_size, hidden_dim), |_| {
            rng.gen::<f32>() * scale * 2.0 - scale
        });

        let layers: Vec<TrainableLayer> = (0..num_layers)
            .map(|_| TrainableLayer::new_random(hidden_dim, num_heads, ffn_dim))
            .collect();

        let output_norm = vec![1.0; hidden_dim];

        let lm_head = Array2::from_shape_fn((vocab_size, hidden_dim), |_| {
            rng.gen::<f32>() * scale * 2.0 - scale
        });

        Self {
            embeddings,
            layers,
            output_norm,
            lm_head,
            vocab_size,
            hidden_dim,
        }
    }

    /// Forward pass for a single token, returns logits
    pub fn forward(&self, token: u32) -> Vec<f32> {
        // Get embedding
        let mut hidden: Vec<f32> = self.embeddings.row(token as usize).to_vec();

        // Run through layers
        for layer in &self.layers {
            hidden = layer.forward(&hidden);
        }

        // Output norm
        let normed = SimdOps::rms_norm(&hidden, &self.output_norm, 1e-6);

        // LM head to get logits
        matmul_vec(&self.lm_head, &normed)
    }

    /// Compute mean cross-entropy loss (nats per token) for a sequence
    pub fn compute_loss(&self, input_tokens: &[u32], target_tokens: &[u32]) -> f64 {
        let mut total_loss = 0.0;
        for (&input, &target) in input_tokens.iter().zip(target_tokens.iter()) {
            let logits = self.forward(input);

            // Numerically stable log-softmax + cross-entropy
            let max_logit = logits.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
            let exp_sum: f32 = logits.iter().map(|&l| (l - max_logit).exp()).sum();
            let log_softmax = logits[target as usize] - max_logit - exp_sum.ln();
            total_loss -= log_softmax as f64;
        }
        total_loss / target_tokens.len() as f64
    }

    /// Get number of parameters
    pub fn num_parameters(&self) -> usize {
        let embed_params = self.embeddings.len();
        let lm_head_params = self.lm_head.len();
        let norm_params = self.output_norm.len();
        let layer_params: usize = self
            .layers
            .iter()
            .map(|l| {
                l.wq.len()
                    + l.wk.len()
                    + l.wv.len()
                    + l.wo.len()
                    + l.w1.len()
                    + l.w2.len()
                    + l.w3.len()
                    + l.attn_norm.len()
                    + l.ffn_norm.len()
            })
            .sum();
        embed_params + lm_head_params + norm_params + layer_params
    }

    /// Build a Q4 model with matching dimensions for inference.
    ///
    /// NOTE: placeholder — this creates a freshly initialized `SmallTransformer`
    /// and does not transfer the trained f32 weights.
    pub fn to_q4(&self) -> SmallTransformer {
        SmallTransformer::new_random(
            self.vocab_size,
            self.hidden_dim,
            self.layers.len(),
            self.layers.first().map(|l| l.num_heads).unwrap_or(4),
            self.layers
                .first()
                .map(|l| l.w1.nrows())
                .unwrap_or(self.hidden_dim * 4),
        )
    }
}
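
// --- Illustrative sketch (not part of the original module) ----------------------
// The module computes the cross-entropy loss but no backward pass. For reference,
// the gradient of that loss with respect to the logits is softmax(logits) minus
// the one-hot target; a full backward pass would propagate this through the LM
// head and layers and feed the results into `SGDOptimizer::step`. The function
// name `cross_entropy_logit_grad` is hypothetical.
#[allow(dead_code)]
fn cross_entropy_logit_grad(logits: &[f32], target: u32) -> Vec<f32> {
    let max_logit = logits.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
    let exp: Vec<f32> = logits.iter().map(|&l| (l - max_logit).exp()).collect();
    let sum: f32 = exp.iter().sum();
    let mut grad: Vec<f32> = exp.iter().map(|&e| e / sum).collect(); // softmax(logits)
    grad[target as usize] -= 1.0; // subtract the one-hot target
    grad
}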

/// Simple SGD optimizer with momentum
pub struct SGDOptimizer {
    /// Learning rate
    learning_rate: f32,
    /// Momentum
    momentum: f32,
    /// Weight decay
    weight_decay: f32,
    /// Velocity buffers
    velocities: HashMap<String, Vec<f32>>,
}

impl SGDOptimizer {
    pub fn new(learning_rate: f32, momentum: f32, weight_decay: f32) -> Self {
        Self {
            learning_rate,
            momentum,
            weight_decay,
            velocities: HashMap::new(),
        }
    }

    /// Update weights with gradients
    pub fn step(&mut self, name: &str, weights: &mut [f32], gradients: &[f32]) {
        let velocity = self
            .velocities
            .entry(name.to_string())
            .or_insert_with(|| vec![0.0; weights.len()]);

        for ((w, g), v) in weights
            .iter_mut()
            .zip(gradients.iter())
            .zip(velocity.iter_mut())
        {
            // Apply weight decay
            let grad_with_decay = *g + self.weight_decay * *w;
            // Update velocity
            *v = self.momentum * *v + grad_with_decay;
            // Update weight
            *w -= self.learning_rate * *v;
        }
    }

    /// Set learning rate
    pub fn set_lr(&mut self, lr: f32) {
        self.learning_rate = lr;
    }
}

/// Training loop
pub struct Trainer {
    /// Model being trained
    model: TrainableModel,
    /// Optimizer
    optimizer: SGDOptimizer,
    /// Configuration
    config: TrainingConfig,
    /// Current step
    step: usize,
    /// Metrics history
    metrics_history: Vec<TrainingMetrics>,
}

impl Trainer {
    /// Create new trainer
    pub fn new(model: TrainableModel, config: TrainingConfig) -> Self {
        let optimizer = SGDOptimizer::new(config.learning_rate, 0.9, config.weight_decay);
        Self {
            model,
            optimizer,
            config,
            step: 0,
            metrics_history: Vec::new(),
        }
    }

    /// Get learning rate with linear warmup
    fn get_lr(&self) -> f32 {
        if self.step < self.config.warmup_steps {
            self.config.learning_rate * (self.step as f32 / self.config.warmup_steps as f32)
        } else {
            self.config.learning_rate
        }
    }

    /// Train for one epoch
    pub fn train_epoch(&mut self, dataset: &TrainingDataset, epoch: usize) -> TrainingMetrics {
        let start = Instant::now();
        let mut epoch_loss = 0.0;
        let mut num_tokens = 0;

        // Create batch indices
        let num_batches = (dataset.len() + self.config.batch_size - 1) / self.config.batch_size;

        for batch_idx in 0..num_batches {
            let batch_start = batch_idx * self.config.batch_size;
            let batch_end = (batch_start + self.config.batch_size).min(dataset.len());
            let indices: Vec<usize> = (batch_start..batch_end).collect();

            let (inputs, targets) = dataset.get_batch(&indices);

            // Token-weighted batch loss: `compute_loss` returns a per-token average
            // for each sequence, so re-weight by sequence length before accumulating.
            let batch_loss: f64 = inputs
                .iter()
                .zip(targets.iter())
                .map(|(inp, tgt)| self.model.compute_loss(inp, tgt) * tgt.len() as f64)
                .sum();
            let tokens_in_batch: usize = targets.iter().map(|t| t.len()).sum();

            epoch_loss += batch_loss;
            num_tokens += tokens_in_batch;

            // Update learning rate (backward pass / weight updates are not implemented
            // in this module; only loss and the LR schedule are tracked)
            let lr = self.get_lr();
            self.optimizer.set_lr(lr);

            self.step += 1;

            // Log progress
            if self.step % self.config.log_interval == 0 {
                let avg_loss = epoch_loss / num_tokens as f64;
                let perplexity = avg_loss.exp();
                println!(
                    "  Step {}: loss={:.4}, ppl={:.2}, lr={:.6}",
                    self.step, avg_loss, perplexity, lr
                );
            }
        }

        let avg_loss = epoch_loss / num_tokens as f64;
        let elapsed = start.elapsed().as_secs_f64();

        let metrics = TrainingMetrics {
            epoch,
            step: self.step,
            loss: avg_loss,
            perplexity: avg_loss.exp(),
            tokens_per_second: num_tokens as f64 / elapsed,
            current_lr: self.get_lr() as f64,
            grad_norm: 0.0, // Would need gradient tracking
        };
        self.metrics_history.push(metrics.clone());
        metrics
    }

    /// Full training loop
    pub fn train(&mut self, dataset: &TrainingDataset) -> Vec<TrainingMetrics> {
        println!("\n╔═══════════════════════════════════════════════════════════════════════════╗");
        println!("║ PRETRAINING STARTED ║");
        println!("╠═══════════════════════════════════════════════════════════════════════════╣");
        println!(
            "║ Model: {} params ({} layers, {} hidden) ║",
            format_params(self.model.num_parameters()),
            self.model.layers.len(),
            self.model.hidden_dim
        );
        println!(
            "║ Dataset: {} sequences, {} seq_length ║",
            dataset.len(),
            dataset.seq_length
        );
        println!(
            "║ Config: lr={}, batch={}, epochs={} ║",
            self.config.learning_rate, self.config.batch_size, self.config.epochs
        );
        println!("╚═══════════════════════════════════════════════════════════════════════════╝\n");

        let mut all_metrics = Vec::new();
        for epoch in 0..self.config.epochs {
            println!("Epoch {}/{}:", epoch + 1, self.config.epochs);
            let metrics = self.train_epoch(dataset, epoch);
            all_metrics.push(metrics.clone());
            println!(
                "  → Epoch {} complete: loss={:.4}, ppl={:.2}, {:.0} tok/s\n",
                epoch + 1,
                metrics.loss,
                metrics.perplexity,
                metrics.tokens_per_second
            );
        }
        all_metrics
    }

    /// Get trained model
    pub fn into_model(self) -> TrainableModel {
        self.model
    }

    /// Get metrics history
    pub fn metrics_history(&self) -> &[TrainingMetrics] {
        &self.metrics_history
    }
}

/// Format parameter count
fn format_params(n: usize) -> String {
    if n >= 1_000_000_000 {
        format!("{:.1}B", n as f64 / 1e9)
    } else if n >= 1_000_000 {
        format!("{:.1}M", n as f64 / 1e6)
    } else if n >= 1_000 {
        format!("{:.1}K", n as f64 / 1e3)
    } else {
        format!("{}", n)
    }
}
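
// --- End-to-end usage sketch (illustrative; not invoked anywhere in this module) --
// Builds a tiny randomly initialized model and a synthetic dataset, then runs the
// training loop for one epoch. Dimensions are arbitrary demo values.
#[allow(dead_code)]
fn pretraining_demo() -> Vec<TrainingMetrics> {
    let dataset = TrainingDataset::synthetic(100, 32, 64);
    let model = TrainableModel::new_random(100, 64, 2, 4, 128);
    let config = TrainingConfig {
        epochs: 1,
        batch_size: 4,
        seq_length: 64,
        ..TrainingConfig::default()
    };
    let mut trainer = Trainer::new(model, config);
    trainer.train(&dataset)
}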

/// Benchmark configuration
#[derive(Debug, Clone)]
pub struct BenchmarkConfig {
    /// Number of warmup iterations
    pub warmup_iters: usize,
    /// Number of benchmark iterations
    pub bench_iters: usize,
    /// Sequence length for generation
    pub seq_length: usize,
    /// Number of tokens to generate
    pub gen_tokens: usize,
}

impl Default for BenchmarkConfig {
    fn default() -> Self {
        Self {
            warmup_iters: 5,
            bench_iters: 20,
            seq_length: 32,
            gen_tokens: 64,
        }
    }
}

/// Benchmark results
#[derive(Debug, Clone)]
pub struct BenchmarkResults {
    /// Model name
    pub model_name: String,
    /// Number of parameters
    pub num_params: usize,
    /// Average latency per token (ms)
    pub latency_per_token_ms: f64,
    /// Tokens per second
    pub tokens_per_second: f64,
    /// Memory usage (MB)
    pub memory_mb: f64,
    /// Perplexity (if evaluated)
    pub perplexity: Option<f64>,
}

/// Run comprehensive benchmark
pub fn run_benchmark(model: &TrainableModel, config: &BenchmarkConfig) -> BenchmarkResults {
    // Warmup
    for _ in 0..config.warmup_iters {
        let _ = model.forward(0);
    }

    // Benchmark forward pass
    let bench_start = Instant::now();
    for i in 0..config.bench_iters {
        for t in 0..config.gen_tokens {
            let _ = model.forward((i * config.gen_tokens + t) as u32 % model.vocab_size as u32);
        }
    }
    let bench_elapsed = bench_start.elapsed().as_secs_f64();

    let total_tokens = config.bench_iters * config.gen_tokens;
    let tokens_per_second = total_tokens as f64 / bench_elapsed;
    let latency_per_token_ms = (bench_elapsed / total_tokens as f64) * 1000.0;

    // Estimate memory from the f32 parameter count (rough)
    let memory_mb = (model.num_parameters() * 4) as f64 / (1024.0 * 1024.0);

    BenchmarkResults {
        model_name: format!("RuvLLM-{}L-{}H", model.layers.len(), model.hidden_dim),
        num_params: model.num_parameters(),
        latency_per_token_ms,
        tokens_per_second,
        memory_mb,
        perplexity: None,
    }
}

/// Print benchmark comparison
pub fn print_benchmark_comparison(results: &[BenchmarkResults]) {
    println!("\n╔════════════════════════════════════════════════════════════════════════════════════════╗");
    println!("║ MODEL BENCHMARK COMPARISON ║");
    println!("╠════════════════════════════════════════════════════════════════════════════════════════╣");
    println!("║ Model │ Params │ Tok/s │ Latency │ Memory │ Perplexity ║");
    println!("╠════════════════════════════════════════════════════════════════════════════════════════╣");
    for r in results {
        let ppl_str = r
            .perplexity
            .map(|p| format!("{:.2}", p))
            .unwrap_or_else(|| "N/A".to_string());
        println!(
            "║ {:20} │ {:>8} │ {:>8.1} │ {:>6.2}ms │ {:>6.1}MB │ {:>19} ║",
            r.model_name,
            format_params(r.num_params),
            r.tokens_per_second,
            r.latency_per_token_ms,
            r.memory_mb,
            ppl_str
        );
    }
    println!("╚════════════════════════════════════════════════════════════════════════════════════════╝");
}
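
// --- Illustrative sketch (not part of the original module) ----------------------
// Benchmarks two randomly initialized model sizes with a shortened configuration
// and prints the comparison table defined above.
#[allow(dead_code)]
fn benchmark_demo() {
    let config = BenchmarkConfig {
        warmup_iters: 2,
        bench_iters: 5,
        ..BenchmarkConfig::default()
    };
    let small = TrainableModel::new_random(100, 64, 2, 4, 128);
    let large = TrainableModel::new_random(100, 128, 4, 8, 256);
    let results = vec![
        run_benchmark(&small, &config),
        run_benchmark(&large, &config),
    ];
    print_benchmark_comparison(&results);
}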

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_trainable_model() {
        let model = TrainableModel::new_random(100, 64, 2, 4, 128);
        assert!(model.num_parameters() > 0);
    }

    #[test]
    fn test_forward_pass() {
        let model = TrainableModel::new_random(100, 64, 2, 4, 128);
        let logits = model.forward(0);
        assert_eq!(logits.len(), 100);
    }

    #[test]
    fn test_loss_computation() {
        let model = TrainableModel::new_random(100, 64, 2, 4, 128);
        let loss = model.compute_loss(&[0, 1, 2], &[1, 2, 3]);
        assert!(loss > 0.0);
    }

    #[test]
    fn test_dataset() {
        let dataset = TrainingDataset::synthetic(100, 10, 32);
        assert_eq!(dataset.len(), 10);
        let (inputs, targets) = dataset.get_batch(&[0, 1]);
        assert_eq!(inputs.len(), 2);
        assert_eq!(targets.len(), 2);
    }

    #[test]
    fn test_optimizer() {
        let mut optimizer = SGDOptimizer::new(0.01, 0.9, 0.0);
        let mut weights = vec![1.0, 2.0, 3.0];
        let gradients = vec![0.1, 0.2, 0.3];
        optimizer.step("test", &mut weights, &gradients);
        // Weights should have moved against the gradient direction
        assert!(weights[0] < 1.0);
    }

    #[test]
    fn test_benchmark() {
        let model = TrainableModel::new_random(100, 64, 2, 4, 128);
        let config = BenchmarkConfig {
            warmup_iters: 1,
            bench_iters: 2,
            seq_length: 8,
            gen_tokens: 8,
        };
        let results = run_benchmark(&model, &config);
        assert!(results.tokens_per_second > 0.0);
    }
}