Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'

ruv · 2026-02-28 14:39:40 -05:00
7854 changed files with 3522914 additions and 0 deletions


@@ -0,0 +1,341 @@
//! Sparse Feed-Forward Network implementation.
use ndarray::{Array1, Array2};
use serde::{Deserialize, Serialize};
use tracing::trace;
use crate::backend::{get_backend, Backend};
use crate::config::ActivationType;
use crate::error::{InferenceError, Result};
/// Sparse Feed-Forward Network computation.
///
/// This implements a two-layer FFN that can compute using only a subset of neurons:
/// - W1: [hidden_dim, input_dim] - first projection (row-major for neuron access)
/// - W2_T: [hidden_dim, output_dim] - second projection TRANSPOSED (row-major for contiguous access)
/// - Activation function applied between layers
///
/// The sparse forward pass:
/// 1. Sparse first layer: only compute active neurons
/// 2. Apply activation function
/// 3. Sparse second layer: accumulate only active neuron contributions (now contiguous!)
///
/// # Performance Optimization
///
/// W2 is stored transposed so that accessing columns (by neuron index) becomes row access,
/// which is contiguous in memory. This provides a 15-25% speedup in the sparse accumulation step.
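///
/// # Example
///
/// A minimal usage sketch (marked `ignore` since it depends on crate-local
/// setup; the dimensions are illustrative):
///
/// ```ignore
/// let ffn = SparseFfn::new(128, 512, 128, ActivationType::Gelu)?;
/// let input = vec![0.1_f32; 128];
/// // Suppose a predictor selected these hidden neurons as active:
/// let active: Vec<usize> = (0..64).collect();
/// let sparse_out = ffn.forward_sparse(&input, &active)?;
/// assert_eq!(sparse_out.len(), 128);
/// ```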
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SparseFfn {
/// W1: [hidden_dim, input_dim] - first projection.
/// Row-major layout for efficient neuron access.
w1: Array2<f32>,
/// W2_T: [hidden_dim, output_dim] - second projection TRANSPOSED.
/// Row-major layout for contiguous neuron weight access.
/// Original W2 shape was [output_dim, hidden_dim].
#[serde(with = "w2_serde")]
w2_t: Array2<f32>,
/// Bias for first layer.
b1: Array1<f32>,
/// Bias for second layer.
b2: Array1<f32>,
/// Activation function type.
activation: ActivationType,
/// Output dimension (cached for efficiency).
output_dim: usize,
}
// Custom serialization for w2_t - stores as original W2 for compatibility
mod w2_serde {
use super::*;
use ndarray::Array2;
pub fn serialize<S>(w2_t: &Array2<f32>, serializer: S) -> std::result::Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
// Transpose back to original W2 shape for serialization compatibility
let w2 = w2_t.t().to_owned();
w2.serialize(serializer)
}
pub fn deserialize<'de, D>(deserializer: D) -> std::result::Result<Array2<f32>, D::Error>
where
D: serde::Deserializer<'de>,
{
// Load as original W2 and transpose for optimized storage
let w2 = Array2::<f32>::deserialize(deserializer)?;
Ok(w2.t().to_owned())
}
}
impl SparseFfn {
/// Create a new sparse FFN with given dimensions.
pub fn new(
input_dim: usize,
hidden_dim: usize,
output_dim: usize,
activation: ActivationType,
) -> Result<Self> {
use rand::Rng;
let mut rng = rand::thread_rng();
// Initialize with small random values
let w1 = Array2::from_shape_fn((hidden_dim, input_dim), |_| rng.gen::<f32>() * 0.01);
// Store W2 transposed: [hidden_dim, output_dim] instead of [output_dim, hidden_dim]
// This allows contiguous row access when iterating by neuron index
let w2_t = Array2::from_shape_fn((hidden_dim, output_dim), |_| rng.gen::<f32>() * 0.01);
let b1 = Array1::zeros(hidden_dim);
let b2 = Array1::zeros(output_dim);
Ok(Self {
w1,
w2_t,
b1,
b2,
activation,
output_dim,
})
}
/// Create from existing weights.
///
/// `w2` is expected in its original `[output_dim, hidden_dim]` layout;
/// it is transposed internally into `w2_t` for contiguous row access.
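///
/// # Example
///
/// A hedged sketch of loading pre-existing weights (the zero-filled arrays
/// and dimensions here are purely illustrative):
///
/// ```ignore
/// use ndarray::{Array1, Array2};
/// let w1 = Array2::<f32>::zeros((512, 128)); // [hidden_dim, input_dim]
/// let w2 = Array2::<f32>::zeros((128, 512)); // [output_dim, hidden_dim]
/// let b1 = Array1::<f32>::zeros(512);
/// let b2 = Array1::<f32>::zeros(128);
/// let ffn = SparseFfn::from_weights(w1, w2, b1, b2, ActivationType::Relu)?;
/// ```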
pub fn from_weights(
w1: Array2<f32>,
w2: Array2<f32>,
b1: Array1<f32>,
b2: Array1<f32>,
activation: ActivationType,
) -> Result<Self> {
let (hidden_dim, _input_dim) = w1.dim();
let (output_dim, w2_hidden) = w2.dim();
if hidden_dim != w2_hidden {
return Err(InferenceError::Failed(format!(
"Hidden dimension mismatch: W1 has {}, W2 has {}",
hidden_dim, w2_hidden
))
.into());
}
if b1.len() != hidden_dim {
return Err(InferenceError::Failed(format!(
"b1 dimension mismatch: expected {}, got {}",
hidden_dim,
b1.len()
))
.into());
}
if b2.len() != output_dim {
return Err(InferenceError::Failed(format!(
"b2 dimension mismatch: expected {}, got {}",
output_dim,
b2.len()
))
.into());
}
// Transpose W2 for optimized storage
let w2_t = w2.t().to_owned();
Ok(Self {
w1,
w2_t,
b1,
b2,
activation,
output_dim,
})
}
/// Get input dimension.
pub fn input_dim(&self) -> usize {
self.w1.ncols()
}
/// Get hidden dimension.
pub fn hidden_dim(&self) -> usize {
self.w1.nrows()
}
/// Get output dimension.
pub fn output_dim(&self) -> usize {
self.output_dim
}
/// Compute FFN using only active neurons (sparse computation).
///
/// This is the main optimization: only compute activations for predicted neurons.
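///
/// # Example
///
/// A minimal sketch (`ignore`d): in practice the indices would come from an
/// activation predictor, which is outside this snippet.
///
/// ```ignore
/// let active: Vec<usize> = vec![3, 17, 42, 99];
/// let out = ffn.forward_sparse(&input, &active)?;
/// assert_eq!(out.len(), ffn.output_dim());
/// ```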
pub fn forward_sparse(&self, input: &[f32], active_neurons: &[usize]) -> Result<Vec<f32>> {
if input.len() != self.input_dim() {
return Err(InferenceError::InputDimensionMismatch {
expected: self.input_dim(),
actual: input.len(),
}
.into());
}
if active_neurons.is_empty() {
return Err(InferenceError::NoActiveNeurons.into());
}
trace!(
"Sparse forward: {} active neurons ({:.1}% sparsity)",
active_neurons.len(),
100.0 * (1.0 - active_neurons.len() as f32 / self.hidden_dim() as f32)
);
let backend = get_backend();
// 1. Sparse first layer: only compute active neurons
let mut hidden = Vec::with_capacity(active_neurons.len());
for &neuron_idx in active_neurons {
if neuron_idx >= self.hidden_dim() {
return Err(InferenceError::Failed(format!(
"Invalid neuron index: {} (hidden_dim is {})",
neuron_idx,
self.hidden_dim()
))
.into());
}
let row = self.w1.row(neuron_idx);
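// Rows of a standard-layout (row-major) Array2 are contiguous, so as_slice() returns Some.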
let dot = backend.dot_product(row.as_slice().unwrap(), input);
hidden.push(dot + self.b1[neuron_idx]);
}
// 2. Apply activation function
backend.activation(&mut hidden, self.activation);
// 3. Sparse second layer: accumulate only active neuron contributions
// W2_T is [hidden_dim, output_dim], so row access by neuron_idx is CONTIGUOUS
let mut output = self.b2.to_vec();
for (i, &neuron_idx) in active_neurons.iter().enumerate() {
// Row access is contiguous in memory - major optimization!
let weights = self.w2_t.row(neuron_idx);
let h_val = hidden[i];
// Use SIMD-optimized axpy: output += h_val * weights
backend.axpy(&mut output, weights.as_slice().unwrap(), h_val);
}
Ok(output)
}
/// Compute FFN using all neurons (dense computation).
///
/// This is the baseline for comparison and correctness checking.
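///
/// # Example
///
/// A sketch (`ignore`d) showing the dense path as a correctness baseline,
/// with `ffn` and `input` assumed constructed as above:
///
/// ```ignore
/// let dense_out = ffn.forward_dense(&input)?;
/// // With every hidden neuron marked active, the sparse path should agree closely:
/// let all: Vec<usize> = (0..ffn.hidden_dim()).collect();
/// let sparse_out = ffn.forward_sparse(&input, &all)?;
/// ```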
pub fn forward_dense(&self, input: &[f32]) -> Result<Vec<f32>> {
if input.len() != self.input_dim() {
return Err(InferenceError::InputDimensionMismatch {
expected: self.input_dim(),
actual: input.len(),
}
.into());
}
let backend = get_backend();
let input_arr = Array1::from_vec(input.to_vec());
// 1. First layer: hidden = activation(W1 · input + b1)
let mut hidden = self.w1.dot(&input_arr) + &self.b1;
backend.activation(hidden.as_slice_mut().unwrap(), self.activation);
// 2. Second layer: output = W2 · hidden + b2
// W2_T is [hidden_dim, output_dim], so W2 = W2_T.t() and the
// product is computed as W2_T.t().dot(&hidden).
let output = self.w2_t.t().dot(&hidden) + &self.b2;
Ok(output.to_vec())
}
/// Compute both sparse and dense, returning the difference for validation.
#[cfg(test)]
pub fn validate_sparse(&self, input: &[f32], active_neurons: &[usize]) -> Result<f32> {
let sparse_output = self.forward_sparse(input, active_neurons)?;
let dense_output = self.forward_dense(input)?;
// Compute mean absolute error
let mae: f32 = sparse_output
.iter()
.zip(dense_output.iter())
.map(|(s, d)| (s - d).abs())
.sum::<f32>()
/ sparse_output.len() as f32;
Ok(mae)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_ffn_creation() {
let ffn = SparseFfn::new(128, 512, 128, ActivationType::Gelu).unwrap();
assert_eq!(ffn.input_dim(), 128);
assert_eq!(ffn.hidden_dim(), 512);
assert_eq!(ffn.output_dim(), 128);
}
#[test]
fn test_dense_forward() {
let ffn = SparseFfn::new(64, 256, 64, ActivationType::Relu).unwrap();
let input = vec![0.1; 64];
let output = ffn.forward_dense(&input).unwrap();
assert_eq!(output.len(), 64);
}
#[test]
fn test_sparse_forward() {
let ffn = SparseFfn::new(64, 256, 64, ActivationType::Relu).unwrap();
let input = vec![0.1; 64];
let active_neurons: Vec<usize> = (0..64).collect(); // First 64 neurons
let output = ffn.forward_sparse(&input, &active_neurons).unwrap();
assert_eq!(output.len(), 64);
}
#[test]
fn test_sparse_vs_dense() {
let ffn = SparseFfn::new(32, 128, 32, ActivationType::Relu).unwrap();
let input = vec![0.5; 32];
// Use all neurons - should match dense computation
let all_neurons: Vec<usize> = (0..128).collect();
let mae = ffn.validate_sparse(&input, &all_neurons).unwrap();
// Should be very close (allowing for floating point precision)
assert!(mae < 1e-5, "MAE too large: {}", mae);
}
#[test]
fn test_empty_active_neurons() {
let ffn = SparseFfn::new(32, 128, 32, ActivationType::Relu).unwrap();
let input = vec![0.1; 32];
let empty: Vec<usize> = vec![];
let result = ffn.forward_sparse(&input, &empty);
assert!(result.is_err());
}
#[test]
fn test_invalid_neuron_index() {
let ffn = SparseFfn::new(32, 128, 32, ActivationType::Relu).unwrap();
let input = vec![0.1; 32];
let invalid = vec![200]; // Out of bounds
let result = ffn.forward_sparse(&input, &invalid);
assert!(result.is_err());
}
}


@@ -0,0 +1,59 @@
//! Sparse computation module.
//!
//! This module provides sparse implementations of neural network layers.
mod ffn;
pub use crate::config::ActivationType;
pub use ffn::SparseFfn;
/// Trait for feed-forward network layers.
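///
/// # Example
///
/// A minimal sketch of trait-object dispatch (assumes a constructed `SparseFfn`
/// named `ffn` and a matching `input` slice):
///
/// ```ignore
/// fn run(layer: &dyn FeedForward, input: &[f32]) -> crate::error::Result<Vec<f32>> {
///     layer.forward_dense(input)
/// }
/// let out = run(&ffn, &input)?;
/// ```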
pub trait FeedForward: Send + Sync {
/// Sparse forward pass using only active neurons.
fn forward_sparse(
&self,
input: &[f32],
active_neurons: &[usize],
) -> crate::error::Result<Vec<f32>>;
/// Dense forward pass using all neurons.
fn forward_dense(&self, input: &[f32]) -> crate::error::Result<Vec<f32>>;
}
impl FeedForward for SparseFfn {
fn forward_sparse(
&self,
input: &[f32],
active_neurons: &[usize],
) -> crate::error::Result<Vec<f32>> {
SparseFfn::forward_sparse(self, input, active_neurons)
}
fn forward_dense(&self, input: &[f32]) -> crate::error::Result<Vec<f32>> {
SparseFfn::forward_dense(self, input)
}
}
/// SwiGLU FFN (placeholder for future implementation).
pub struct SwiGLUFfn;
impl SwiGLUFfn {
/// Create a new SwiGLU FFN.
pub fn new(_input_dim: usize, _hidden_dim: usize) -> Self {
Self
}
}
impl FeedForward for SwiGLUFfn {
fn forward_sparse(
&self,
_input: &[f32],
_active_neurons: &[usize],
) -> crate::error::Result<Vec<f32>> {
unimplemented!("SwiGLUFfn not yet implemented")
}
fn forward_dense(&self, _input: &[f32]) -> crate::error::Result<Vec<f32>> {
unimplemented!("SwiGLUFfn not yet implemented")
}
}