Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'
This commit is contained in:
401
vendor/ruvector/examples/ruvLLM/esp32/src/federation/coordinator.rs
vendored
Normal file
401
vendor/ruvector/examples/ruvLLM/esp32/src/federation/coordinator.rs
vendored
Normal file
@@ -0,0 +1,401 @@
|
||||
//! Federation Coordinator - Cluster Management
|
||||
//!
|
||||
//! Manages the multi-chip cluster with self-learning optimization.
|
||||
//! Integrates MicroLoRA for distributed fine-tuning.
|
||||
|
||||
use super::protocol::{ChipId, FederationMessage, MessageType, CommStats};
|
||||
use super::{FederationConfig, FederationMode, FederationSpeedup, estimate_speedup};
|
||||
use crate::optimizations::micro_lora::{MicroLoRA, LoRAConfig, LoRAStack};
|
||||
|
||||
/// Upper bound on the number of chips a single cluster may contain.
pub const MAX_CLUSTER_SIZE: usize = 8;

/// Interconnect layout used by the cluster.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum ClusterTopology {
    /// Linear pipeline: 0 -> 1 -> 2 -> 3 -> 4
    Linear,
    /// Ring: 0 -> 1 -> 2 -> 3 -> 4 -> 0
    Ring,
    /// Star: 0 <-> all others
    Star,
    /// Mesh: all-to-all
    Mesh,
}
|
||||
|
||||
/// Chip status in cluster
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ChipStatus {
|
||||
/// Chip ID
|
||||
pub id: ChipId,
|
||||
/// Is chip active
|
||||
pub active: bool,
|
||||
/// Last heartbeat time (in ticks)
|
||||
pub last_heartbeat: u32,
|
||||
/// Current load (0-255)
|
||||
pub load: u8,
|
||||
/// Memory used (KB)
|
||||
pub memory_used_kb: u16,
|
||||
/// Tokens processed
|
||||
pub tokens_processed: u32,
|
||||
}
|
||||
|
||||
/// Mutable state driving the on-device self-learning loop.
#[derive(Debug, Clone)]
pub struct SelfLearningState {
    /// Step size applied to LoRA weight updates (small fixed-point value).
    pub learning_rate: i8,
    /// Number of gradient steps accumulated so far.
    pub gradient_steps: u32,
    /// Exponential moving average of the loss (fixed-point; i32::MAX means unset).
    pub avg_loss: i32,
    /// Lowest loss observed so far (i32::MAX means unset).
    pub best_loss: i32,
    /// Whether adaptation is currently enabled.
    pub enabled: bool,
}

impl Default for SelfLearningState {
    fn default() -> Self {
        Self {
            learning_rate: 4,
            gradient_steps: 0,
            avg_loss: i32::MAX,
            best_loss: i32::MAX,
            enabled: false,
        }
    }
}
|
||||
|
||||
/// Federation coordinator
|
||||
pub struct FederationCoordinator {
|
||||
/// This coordinator's chip ID
|
||||
chip_id: ChipId,
|
||||
/// Is this the master coordinator
|
||||
is_master: bool,
|
||||
/// Cluster configuration
|
||||
config: FederationConfig,
|
||||
/// Topology
|
||||
topology: ClusterTopology,
|
||||
/// Status of all chips
|
||||
chip_status: [Option<ChipStatus>; MAX_CLUSTER_SIZE],
|
||||
/// Communication stats
|
||||
comm_stats: CommStats,
|
||||
/// Self-learning state
|
||||
learning: SelfLearningState,
|
||||
/// Distributed LoRA adapters (one per layer shard)
|
||||
lora_stack: Option<LoRAStack<4>>,
|
||||
/// Current tick (for timeouts)
|
||||
current_tick: u32,
|
||||
/// Sequence counter
|
||||
seq_counter: u16,
|
||||
}
|
||||
|
||||
impl FederationCoordinator {
|
||||
/// Create new coordinator
|
||||
pub fn new(config: FederationConfig, is_master: bool) -> Self {
|
||||
let chip_status = core::array::from_fn(|i| {
|
||||
if i < config.num_chips {
|
||||
Some(ChipStatus {
|
||||
id: ChipId(i as u8),
|
||||
active: i == config.chip_id.0 as usize,
|
||||
last_heartbeat: 0,
|
||||
load: 0,
|
||||
memory_used_kb: 0,
|
||||
tokens_processed: 0,
|
||||
})
|
||||
} else {
|
||||
None
|
||||
}
|
||||
});
|
||||
|
||||
Self {
|
||||
chip_id: config.chip_id,
|
||||
is_master,
|
||||
topology: Self::optimal_topology(&config),
|
||||
config,
|
||||
chip_status,
|
||||
comm_stats: CommStats::default(),
|
||||
learning: SelfLearningState::default(),
|
||||
lora_stack: None,
|
||||
current_tick: 0,
|
||||
seq_counter: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Determine optimal topology for config
|
||||
fn optimal_topology(config: &FederationConfig) -> ClusterTopology {
|
||||
match config.mode {
|
||||
FederationMode::Pipeline => ClusterTopology::Linear,
|
||||
FederationMode::TensorParallel => ClusterTopology::Star,
|
||||
FederationMode::Speculative => ClusterTopology::Star,
|
||||
FederationMode::MixtureOfExperts => ClusterTopology::Mesh,
|
||||
_ => ClusterTopology::Linear,
|
||||
}
|
||||
}
|
||||
|
||||
/// Initialize distributed LoRA for self-learning
|
||||
pub fn init_distributed_lora(&mut self, dim: usize, seed: u32) -> crate::Result<()> {
|
||||
let lora_config = LoRAConfig {
|
||||
rank: 1, // Minimal rank for distributed
|
||||
dim,
|
||||
scale: 8,
|
||||
frozen: false,
|
||||
};
|
||||
|
||||
let mut stack = LoRAStack::new();
|
||||
|
||||
// Each chip gets LoRA for its assigned layers
|
||||
let layers_per_chip = self.config.layers_per_chip;
|
||||
for i in 0..layers_per_chip.min(4) {
|
||||
let layer_seed = seed.wrapping_add(i as u32 * 1000);
|
||||
let adapter = MicroLoRA::new(lora_config, layer_seed)?;
|
||||
stack.add_adapter(i, adapter)?;
|
||||
}
|
||||
|
||||
self.lora_stack = Some(stack);
|
||||
self.learning.enabled = true;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Process tick (call regularly)
|
||||
pub fn tick(&mut self) {
|
||||
self.current_tick += 1;
|
||||
|
||||
// Check for timeouts
|
||||
for status in self.chip_status.iter_mut().flatten() {
|
||||
if self.current_tick - status.last_heartbeat > 1000 {
|
||||
status.active = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Handle received message
|
||||
pub fn handle_message(&mut self, msg: &FederationMessage) -> Option<FederationMessage> {
|
||||
self.comm_stats.messages_received += 1;
|
||||
self.comm_stats.bytes_received += msg.payload.len() as u32;
|
||||
|
||||
let msg_type = MessageType::from(msg.header.msg_type);
|
||||
|
||||
match msg_type {
|
||||
MessageType::Heartbeat => {
|
||||
// Update chip status
|
||||
let src = msg.header.src as usize;
|
||||
if let Some(status) = self.chip_status.get_mut(src).and_then(|s| s.as_mut()) {
|
||||
status.active = true;
|
||||
status.last_heartbeat = self.current_tick;
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
MessageType::Discovery => {
|
||||
// Respond with our status
|
||||
Some(self.create_heartbeat())
|
||||
}
|
||||
|
||||
MessageType::Barrier => {
|
||||
// Acknowledge barrier
|
||||
Some(FederationMessage::new(
|
||||
MessageType::Ack,
|
||||
self.chip_id,
|
||||
ChipId(msg.header.src),
|
||||
msg.header.seq,
|
||||
))
|
||||
}
|
||||
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create heartbeat message
|
||||
pub fn create_heartbeat(&mut self) -> FederationMessage {
|
||||
self.seq_counter += 1;
|
||||
let mut msg = FederationMessage::new(
|
||||
MessageType::Heartbeat,
|
||||
self.chip_id,
|
||||
ChipId::BROADCAST,
|
||||
self.seq_counter,
|
||||
);
|
||||
|
||||
// Add load info to payload
|
||||
if let Some(status) = &self.chip_status[self.chip_id.0 as usize] {
|
||||
let _ = msg.payload.push(status.load);
|
||||
let _ = msg.payload.push((status.memory_used_kb & 0xFF) as u8);
|
||||
let _ = msg.payload.push((status.memory_used_kb >> 8) as u8);
|
||||
}
|
||||
msg.header.payload_len = msg.payload.len() as u16;
|
||||
msg.update_checksum();
|
||||
|
||||
self.comm_stats.messages_sent += 1;
|
||||
msg
|
||||
}
|
||||
|
||||
/// Get number of active chips
|
||||
pub fn active_chip_count(&self) -> usize {
|
||||
self.chip_status.iter().filter(|s| s.as_ref().is_some_and(|s| s.active)).count()
|
||||
}
|
||||
|
||||
/// Estimate current speedup based on active chips
|
||||
pub fn current_speedup(&self) -> FederationSpeedup {
|
||||
let active = self.active_chip_count();
|
||||
let mut effective_config = self.config.clone();
|
||||
effective_config.num_chips = active;
|
||||
estimate_speedup(&effective_config)
|
||||
}
|
||||
|
||||
/// Update learning state with loss
|
||||
pub fn update_learning(&mut self, loss: i32) {
|
||||
if !self.learning.enabled {
|
||||
return;
|
||||
}
|
||||
|
||||
self.learning.gradient_steps += 1;
|
||||
|
||||
// Exponential moving average of loss
|
||||
if self.learning.avg_loss == i32::MAX {
|
||||
self.learning.avg_loss = loss;
|
||||
} else {
|
||||
self.learning.avg_loss = (self.learning.avg_loss * 15 + loss) / 16;
|
||||
}
|
||||
|
||||
// Track best
|
||||
if loss < self.learning.best_loss {
|
||||
self.learning.best_loss = loss;
|
||||
}
|
||||
|
||||
// Adaptive learning rate
|
||||
if self.learning.gradient_steps % 100 == 0 {
|
||||
if self.learning.avg_loss < self.learning.best_loss * 11 / 10 {
|
||||
// Good progress, increase LR
|
||||
self.learning.learning_rate = (self.learning.learning_rate + 1).min(16);
|
||||
} else {
|
||||
// Slow progress, decrease LR
|
||||
self.learning.learning_rate = (self.learning.learning_rate - 1).max(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Apply distributed LoRA update
|
||||
#[cfg(not(feature = "frozen"))]
|
||||
pub fn apply_lora_gradient(
|
||||
&mut self,
|
||||
layer_idx: usize,
|
||||
input: &[i8],
|
||||
grad_output: &[i32],
|
||||
) {
|
||||
if let Some(ref mut stack) = self.lora_stack {
|
||||
if let Some(lora) = stack.get(layer_idx) {
|
||||
lora.update(input, grad_output, self.learning.learning_rate);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Get LoRA adapter for a layer
|
||||
pub fn get_lora(&mut self, layer_idx: usize) -> Option<&mut MicroLoRA> {
|
||||
self.lora_stack.as_mut()?.get(layer_idx)
|
||||
}
|
||||
|
||||
/// Get cluster statistics
|
||||
pub fn stats(&self) -> ClusterStats {
|
||||
let total_tokens: u32 = self.chip_status.iter()
|
||||
.filter_map(|s| s.as_ref())
|
||||
.map(|s| s.tokens_processed)
|
||||
.sum();
|
||||
|
||||
let total_memory: u32 = self.chip_status.iter()
|
||||
.filter_map(|s| s.as_ref())
|
||||
.map(|s| s.memory_used_kb as u32)
|
||||
.sum();
|
||||
|
||||
ClusterStats {
|
||||
active_chips: self.active_chip_count(),
|
||||
total_chips: self.config.num_chips,
|
||||
total_tokens_processed: total_tokens,
|
||||
total_memory_kb: total_memory,
|
||||
messages_sent: self.comm_stats.messages_sent,
|
||||
messages_received: self.comm_stats.messages_received,
|
||||
current_speedup: self.current_speedup(),
|
||||
learning_enabled: self.learning.enabled,
|
||||
learning_rate: self.learning.learning_rate,
|
||||
avg_loss: self.learning.avg_loss,
|
||||
}
|
||||
}
|
||||
|
||||
/// Update chip's token count
|
||||
pub fn record_tokens(&mut self, count: u32) {
|
||||
if let Some(status) = self.chip_status.get_mut(self.chip_id.0 as usize).and_then(|s| s.as_mut()) {
|
||||
status.tokens_processed += count;
|
||||
}
|
||||
}
|
||||
|
||||
/// Update chip's memory usage
|
||||
pub fn update_memory_usage(&mut self, kb: u16) {
|
||||
if let Some(status) = self.chip_status.get_mut(self.chip_id.0 as usize).and_then(|s| s.as_mut()) {
|
||||
status.memory_used_kb = kb;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Cluster statistics
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ClusterStats {
|
||||
/// Active chips
|
||||
pub active_chips: usize,
|
||||
/// Total chips configured
|
||||
pub total_chips: usize,
|
||||
/// Total tokens processed
|
||||
pub total_tokens_processed: u32,
|
||||
/// Total memory used (KB)
|
||||
pub total_memory_kb: u32,
|
||||
/// Messages sent
|
||||
pub messages_sent: u32,
|
||||
/// Messages received
|
||||
pub messages_received: u32,
|
||||
/// Current speedup estimate
|
||||
pub current_speedup: FederationSpeedup,
|
||||
/// Self-learning enabled
|
||||
pub learning_enabled: bool,
|
||||
/// Current learning rate
|
||||
pub learning_rate: i8,
|
||||
/// Average loss
|
||||
pub avg_loss: i32,
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_coordinator_creation() {
        let coord = FederationCoordinator::new(FederationConfig::default(), true);
        // Only the local chip should report as active before any heartbeats.
        assert_eq!(coord.active_chip_count(), 1);
    }

    #[test]
    fn test_distributed_lora() {
        let mut coord = FederationCoordinator::new(FederationConfig::default(), true);
        coord.init_distributed_lora(32, 42).unwrap();

        assert!(coord.learning.enabled);
        assert!(coord.get_lora(0).is_some());
    }

    #[test]
    fn test_learning_update() {
        let mut coord = FederationCoordinator::new(FederationConfig::default(), true);
        coord.learning.enabled = true;

        for loss in [1000, 900, 800] {
            coord.update_learning(loss);
        }

        assert!(coord.learning.avg_loss < 1000);
        assert_eq!(coord.learning.best_loss, 800);
    }
}
|
||||
344
vendor/ruvector/examples/ruvLLM/esp32/src/federation/fastgrnn_router.rs
vendored
Normal file
344
vendor/ruvector/examples/ruvLLM/esp32/src/federation/fastgrnn_router.rs
vendored
Normal file
@@ -0,0 +1,344 @@
|
||||
//! FastGRNN-Inspired Micro Router for ESP32
|
||||
//!
|
||||
//! Lightweight gated routing for dynamic chip selection.
|
||||
//! Adapted from ruvector's FastGRNN for minimal compute overhead.
|
||||
//!
|
||||
//! Key differences from full FastGRNN:
|
||||
//! - INT8 weights instead of FP32
|
||||
//! - Fixed-point gate computation
|
||||
//! - Minimal hidden dimension (4-8)
|
||||
|
||||
use heapless::Vec as HVec;
|
||||
use super::protocol::ChipId;
|
||||
|
||||
/// Upper bound on the router's hidden-state dimension.
pub const MAX_ROUTER_HIDDEN: usize = 8;
/// Upper bound on the number of input features.
pub const MAX_ROUTER_INPUT: usize = 16;

/// Configuration for the micro FastGRNN routing cell.
#[derive(Debug, Clone, Copy)]
pub struct MicroGRNNConfig {
    /// Number of input features per step.
    pub input_dim: usize,
    /// Size of the recurrent hidden state.
    pub hidden_dim: usize,
    /// Number of routing targets (chips).
    pub num_chips: usize,
    /// Gate scaling factor (fixed-point, divided by 16 when applied).
    pub zeta: i8,
    /// Update scaling factor (fixed-point, divided by 16 when applied).
    pub nu: i8,
}

impl Default for MicroGRNNConfig {
    fn default() -> Self {
        Self {
            input_dim: 8,
            hidden_dim: 4,
            num_chips: 5,
            zeta: 16,
            nu: 16,
        }
    }
}
|
||||
|
||||
/// Micro FastGRNN cell for routing decisions
|
||||
pub struct MicroFastGRNN {
|
||||
config: MicroGRNNConfig,
|
||||
/// Gate weights: W_g [input_dim * hidden_dim] + U_g [hidden_dim * hidden_dim]
|
||||
w_gate: HVec<i8, 128>,
|
||||
u_gate: HVec<i8, 64>,
|
||||
/// Update weights: W_u, U_u
|
||||
w_update: HVec<i8, 128>,
|
||||
u_update: HVec<i8, 64>,
|
||||
/// Biases
|
||||
bias_gate: HVec<i8, MAX_ROUTER_HIDDEN>,
|
||||
bias_update: HVec<i8, MAX_ROUTER_HIDDEN>,
|
||||
/// Output projection to chips
|
||||
w_output: HVec<i8, 64>,
|
||||
/// Hidden state
|
||||
hidden: HVec<i32, MAX_ROUTER_HIDDEN>,
|
||||
}
|
||||
|
||||
impl MicroFastGRNN {
|
||||
/// Create new micro FastGRNN
|
||||
pub fn new(config: MicroGRNNConfig, seed: u32) -> crate::Result<Self> {
|
||||
let mut rng_state = seed;
|
||||
let mut next_rand = || {
|
||||
rng_state = rng_state.wrapping_mul(1103515245).wrapping_add(12345);
|
||||
(((rng_state >> 16) & 0x3F) as i16 - 32) as i8
|
||||
};
|
||||
|
||||
// Initialize weights
|
||||
let gate_size = config.input_dim * config.hidden_dim;
|
||||
let hidden_size = config.hidden_dim * config.hidden_dim;
|
||||
let output_size = config.hidden_dim * config.num_chips;
|
||||
|
||||
let mut w_gate = HVec::new();
|
||||
let mut u_gate = HVec::new();
|
||||
let mut w_update = HVec::new();
|
||||
let mut u_update = HVec::new();
|
||||
let mut w_output = HVec::new();
|
||||
let mut bias_gate = HVec::new();
|
||||
let mut bias_update = HVec::new();
|
||||
let mut hidden = HVec::new();
|
||||
|
||||
for _ in 0..gate_size {
|
||||
w_gate.push(next_rand()).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
w_update.push(next_rand()).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
}
|
||||
for _ in 0..hidden_size {
|
||||
u_gate.push(next_rand()).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
u_update.push(next_rand()).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
}
|
||||
for _ in 0..output_size {
|
||||
w_output.push(next_rand()).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
}
|
||||
for _ in 0..config.hidden_dim {
|
||||
bias_gate.push(0).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
bias_update.push(0).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
hidden.push(0).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
config,
|
||||
w_gate,
|
||||
u_gate,
|
||||
w_update,
|
||||
u_update,
|
||||
bias_gate,
|
||||
bias_update,
|
||||
w_output,
|
||||
hidden,
|
||||
})
|
||||
}
|
||||
|
||||
/// Reset hidden state
|
||||
pub fn reset(&mut self) {
|
||||
for h in self.hidden.iter_mut() {
|
||||
*h = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/// Fixed-point sigmoid approximation
|
||||
#[inline]
|
||||
fn sigmoid_fp(x: i32) -> i32 {
|
||||
// Piecewise linear sigmoid: clamp to [0, 256] representing [0, 1]
|
||||
if x < -512 { 0 }
|
||||
else if x > 512 { 256 }
|
||||
else { (x + 512) >> 2 }
|
||||
}
|
||||
|
||||
/// Fixed-point tanh approximation
|
||||
#[inline]
|
||||
fn tanh_fp(x: i32) -> i32 {
|
||||
// Piecewise linear tanh: clamp to [-256, 256] representing [-1, 1]
|
||||
if x < -512 { -256 }
|
||||
else if x > 512 { 256 }
|
||||
else { x >> 1 }
|
||||
}
|
||||
|
||||
/// Matrix-vector multiply (INT8 weights, INT32 accumulator)
|
||||
fn matmul(&self, weights: &[i8], input: &[i32], rows: usize, cols: usize) -> HVec<i32, MAX_ROUTER_HIDDEN> {
|
||||
let mut output = HVec::new();
|
||||
|
||||
for r in 0..rows {
|
||||
let mut sum: i32 = 0;
|
||||
for c in 0..cols {
|
||||
if c < input.len() {
|
||||
sum += weights[r * cols + c] as i32 * input[c];
|
||||
}
|
||||
}
|
||||
let _ = output.push(sum >> 8); // Scale down
|
||||
}
|
||||
|
||||
output
|
||||
}
|
||||
|
||||
/// One step of FastGRNN computation
|
||||
///
|
||||
/// h_new = (1 - z) ⊙ h + z ⊙ tanh(W_u*x + U_u*h + b_u)
|
||||
/// where z = sigmoid(W_g*x + U_g*h + b_g)
|
||||
pub fn step(&mut self, input: &[i8]) -> crate::Result<()> {
|
||||
// Convert input to i32
|
||||
let input_i32: HVec<i32, MAX_ROUTER_INPUT> = input.iter()
|
||||
.take(self.config.input_dim)
|
||||
.map(|&x| x as i32 * 16) // Scale up
|
||||
.collect();
|
||||
|
||||
// Compute gate: z = sigmoid(W_g * x + U_g * h + b_g)
|
||||
let wx_gate = self.matmul(&self.w_gate, &input_i32, self.config.hidden_dim, self.config.input_dim);
|
||||
let uh_gate = self.matmul(&self.u_gate, &self.hidden, self.config.hidden_dim, self.config.hidden_dim);
|
||||
|
||||
let mut gate = HVec::<i32, MAX_ROUTER_HIDDEN>::new();
|
||||
for i in 0..self.config.hidden_dim {
|
||||
let wx = wx_gate.get(i).copied().unwrap_or(0);
|
||||
let uh = uh_gate.get(i).copied().unwrap_or(0);
|
||||
let b = self.bias_gate.get(i).copied().unwrap_or(0) as i32 * 16;
|
||||
let z = Self::sigmoid_fp((wx + uh + b) * self.config.zeta as i32 / 16);
|
||||
let _ = gate.push(z);
|
||||
}
|
||||
|
||||
// Compute update: u = tanh(W_u * x + U_u * h + b_u)
|
||||
let wx_update = self.matmul(&self.w_update, &input_i32, self.config.hidden_dim, self.config.input_dim);
|
||||
let uh_update = self.matmul(&self.u_update, &self.hidden, self.config.hidden_dim, self.config.hidden_dim);
|
||||
|
||||
// Update hidden state: h = (1 - z) * h + z * u
|
||||
for i in 0..self.config.hidden_dim {
|
||||
let wx = wx_update.get(i).copied().unwrap_or(0);
|
||||
let uh = uh_update.get(i).copied().unwrap_or(0);
|
||||
let b = self.bias_update.get(i).copied().unwrap_or(0) as i32 * 16;
|
||||
let u = Self::tanh_fp((wx + uh + b) * self.config.nu as i32 / 16);
|
||||
|
||||
let z = gate.get(i).copied().unwrap_or(128);
|
||||
let h = self.hidden.get(i).copied().unwrap_or(0);
|
||||
|
||||
// h_new = (256 - z) * h / 256 + z * u / 256
|
||||
let h_new = ((256 - z) * h + z * u) >> 8;
|
||||
self.hidden[i] = h_new;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Get routing decision (which chip to use)
|
||||
pub fn route(&self) -> ChipId {
|
||||
// Output projection: scores = W_o * hidden
|
||||
let mut scores = [0i32; 8];
|
||||
|
||||
for chip in 0..self.config.num_chips {
|
||||
let mut sum: i32 = 0;
|
||||
for h in 0..self.config.hidden_dim {
|
||||
let w_idx = chip * self.config.hidden_dim + h;
|
||||
let w = self.w_output.get(w_idx).copied().unwrap_or(0) as i32;
|
||||
let hidden = self.hidden.get(h).copied().unwrap_or(0);
|
||||
sum += w * hidden;
|
||||
}
|
||||
scores[chip] = sum;
|
||||
}
|
||||
|
||||
// Find argmax
|
||||
let mut best_chip = 0;
|
||||
let mut best_score = scores[0];
|
||||
for (i, &score) in scores[..self.config.num_chips].iter().enumerate() {
|
||||
if score > best_score {
|
||||
best_score = score;
|
||||
best_chip = i;
|
||||
}
|
||||
}
|
||||
|
||||
ChipId(best_chip as u8)
|
||||
}
|
||||
|
||||
/// Get routing probabilities (softmax-like)
|
||||
pub fn route_probs(&self) -> HVec<u8, 8> {
|
||||
let mut probs = HVec::new();
|
||||
let mut scores = [0i32; 8];
|
||||
let mut max_score = i32::MIN;
|
||||
|
||||
// Compute scores
|
||||
for chip in 0..self.config.num_chips {
|
||||
let mut sum: i32 = 0;
|
||||
for h in 0..self.config.hidden_dim {
|
||||
let w_idx = chip * self.config.hidden_dim + h;
|
||||
let w = self.w_output.get(w_idx).copied().unwrap_or(0) as i32;
|
||||
let hidden = self.hidden.get(h).copied().unwrap_or(0);
|
||||
sum += w * hidden;
|
||||
}
|
||||
scores[chip] = sum;
|
||||
if sum > max_score {
|
||||
max_score = sum;
|
||||
}
|
||||
}
|
||||
|
||||
// Simple softmax approximation
|
||||
let mut total: i32 = 0;
|
||||
for chip in 0..self.config.num_chips {
|
||||
let exp_score = (scores[chip] - max_score + 256).max(1);
|
||||
scores[chip] = exp_score;
|
||||
total += exp_score;
|
||||
}
|
||||
|
||||
for chip in 0..self.config.num_chips {
|
||||
let prob = (scores[chip] * 255 / total.max(1)) as u8;
|
||||
let _ = probs.push(prob);
|
||||
}
|
||||
|
||||
probs
|
||||
}
|
||||
|
||||
/// Memory size
|
||||
pub fn memory_size(&self) -> usize {
|
||||
self.w_gate.len() + self.u_gate.len() +
|
||||
self.w_update.len() + self.u_update.len() +
|
||||
self.w_output.len() +
|
||||
self.bias_gate.len() + self.bias_update.len() +
|
||||
self.hidden.len() * 4
|
||||
}
|
||||
}
|
||||
|
||||
/// Features fed into the routing network for one decision.
pub struct RoutingFeatures {
    /// Mean of the token embedding.
    pub embed_mean: i8,
    /// Variance proxy for the token embedding.
    pub embed_var: i8,
    /// Normalized position within the sequence.
    pub position: i8,
    /// Current load on each of the five chips (0-127).
    pub chip_loads: [i8; 5],
}

impl RoutingFeatures {
    /// Flatten the features into the 8-wide router input vector:
    /// [mean, var, position, load_0 .. load_4].
    pub fn to_input(&self) -> [i8; 8] {
        let mut input = [0i8; 8];
        input[0] = self.embed_mean;
        input[1] = self.embed_var;
        input[2] = self.position;
        input[3..8].copy_from_slice(&self.chip_loads);
        input
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_micro_fastgrnn() {
        let mut router = MicroFastGRNN::new(MicroGRNNConfig::default(), 42).unwrap();

        // One step over a ramp input must succeed...
        let input = [10i8, 20, 30, 40, 50, 60, 70, 80];
        router.step(&input).unwrap();

        // ...and yield a chip id inside the configured range.
        let chip = router.route();
        assert!(chip.0 < 5);

        println!("Memory: {} bytes", router.memory_size());
    }

    #[test]
    fn test_routing_probs() {
        let mut router = MicroFastGRNN::new(MicroGRNNConfig::default(), 42).unwrap();
        router.step(&[10i8; 8]).unwrap();

        let probs = router.route_probs();
        assert_eq!(probs.len(), 5);

        // Normalized probabilities should sum to roughly 255.
        let sum: i32 = probs.iter().map(|&p| p as i32).sum();
        assert!(sum > 200 && sum < 280);
    }
}
|
||||
705
vendor/ruvector/examples/ruvLLM/esp32/src/federation/massive_scale.rs
vendored
Normal file
705
vendor/ruvector/examples/ruvLLM/esp32/src/federation/massive_scale.rs
vendored
Normal file
@@ -0,0 +1,705 @@
|
||||
//! Massive Scale Federation - 100s to Millions of Chips
|
||||
//!
|
||||
//! Hierarchical coordination for extreme-scale distributed inference.
|
||||
//!
|
||||
//! # Topology Options
|
||||
//!
|
||||
//! ```text
|
||||
//! Flat (≤16 chips): Hierarchical Tree (≤10K): Hypercube (≤1M):
|
||||
//! ○─○─○─○─○ ┌───[Root]───┐ ○═══○
|
||||
//! │ │ │ │ │ │ │ │ ╱│ │╲
|
||||
//! └─┴─┴─┴─┘ [L1] [L1] [L1] ○─┼───┼─○
|
||||
//! │││ │││ │││ │ ○═══○ │
|
||||
//! chips chips chips ○═══════○
|
||||
//! ```
|
||||
//!
|
||||
//! # Scaling Laws
|
||||
//!
|
||||
//! - **Pipeline**: O(n) throughput, O(1) latency per stage
|
||||
//! - **Tree**: O(log n) coordination, O(n) compute
|
||||
//! - **Hypercube**: O(log n) hops, O(n) total bandwidth
|
||||
//! - **Torus**: O(√n) diameter, excellent locality
|
||||
|
||||
use heapless::Vec as HVec;
|
||||
use super::protocol::ChipId;
|
||||
|
||||
/// Deepest supported hierarchical topology (2^20 ≈ 1M chips).
pub const MAX_TREE_DEPTH: usize = 20;
/// Most children any tree node may fan out to.
pub const MAX_CHILDREN: usize = 16;
/// Most nodes tracked at any single tree level.
pub const MAX_LEVEL_NODES: usize = 64;
|
||||
|
||||
/// Interconnect topologies for extreme-scale clusters.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum MassiveTopology {
    /// Flat mesh - up to ~16 chips
    FlatMesh { size: usize },
    /// Binary tree - scales to millions
    BinaryTree { depth: usize },
    /// K-ary tree with configurable fanout
    KaryTree { depth: usize, fanout: usize },
    /// Hypercube - O(log n) diameter
    Hypercube { dimensions: usize },
    /// 2D Torus - good for spatial locality
    Torus2D { width: usize, height: usize },
    /// 3D Torus - even better scaling
    Torus3D { x: usize, y: usize, z: usize },
    /// Butterfly network - FFT-like communication
    Butterfly { stages: usize },
    /// Hierarchical pipeline - practical for real deployments
    HierarchicalPipeline {
        /// Number of clusters
        clusters: usize,
        /// Chips within each cluster
        chips_per_cluster: usize,
    },
}

impl MassiveTopology {
    /// Total number of chips in the topology.
    pub fn total_chips(&self) -> usize {
        match *self {
            Self::FlatMesh { size } => size,
            Self::BinaryTree { depth } => (1 << depth) - 1,
            Self::KaryTree { depth, fanout } => {
                // Geometric series: (k^(d+1) - 1) / (k - 1)
                if fanout == 1 {
                    depth + 1
                } else {
                    (fanout.pow(depth as u32 + 1) - 1) / (fanout - 1)
                }
            }
            Self::Hypercube { dimensions } => 1 << dimensions,
            Self::Torus2D { width, height } => width * height,
            Self::Torus3D { x, y, z } => x * y * z,
            Self::Butterfly { stages } => stages * (1 << stages),
            Self::HierarchicalPipeline { clusters, chips_per_cluster } => {
                clusters * chips_per_cluster
            }
        }
    }

    /// Network diameter (max hops between any two nodes).
    ///
    /// Uses saturating arithmetic so a degenerate zero-sized mesh returns 0
    /// instead of underflow-panicking on `size - 1`.
    pub fn diameter(&self) -> usize {
        match *self {
            Self::FlatMesh { size } => size.saturating_sub(1),
            Self::BinaryTree { depth } => 2 * depth,
            Self::KaryTree { depth, .. } => 2 * depth,
            Self::Hypercube { dimensions } => dimensions,
            Self::Torus2D { width, height } => width / 2 + height / 2,
            Self::Torus3D { x, y, z } => x / 2 + y / 2 + z / 2,
            Self::Butterfly { stages } => stages,
            Self::HierarchicalPipeline { chips_per_cluster, .. } => {
                chips_per_cluster + 2 // Within cluster + up + down
            }
        }
    }

    /// Bisection bandwidth (edges crossing a middle cut).
    ///
    /// `saturating_sub` keeps zero-dimension hypercubes / zero-stage
    /// butterflies from underflowing the shift amount.
    pub fn bisection_bandwidth(&self) -> usize {
        match *self {
            Self::FlatMesh { .. } => 1,
            Self::BinaryTree { .. } => 1, // Root is the bottleneck
            Self::KaryTree { fanout, .. } => fanout,
            Self::Hypercube { dimensions } => 1 << dimensions.saturating_sub(1),
            Self::Torus2D { width, height } => 2 * width.min(height),
            Self::Torus3D { x, y, z } => 2 * x.min(y).min(z) * x.min(y).min(z),
            Self::Butterfly { stages } => 1 << stages.saturating_sub(1),
            Self::HierarchicalPipeline { clusters, .. } => clusters,
        }
    }

    /// Recommended topology for a given chip count.
    pub fn recommended(chip_count: usize) -> Self {
        match chip_count {
            0..=16 => Self::FlatMesh { size: chip_count },
            17..=256 => {
                // Square-ish cluster grid for small deployments.
                let side = (chip_count as f64).sqrt().ceil() as usize;
                Self::HierarchicalPipeline {
                    clusters: side,
                    chips_per_cluster: side,
                }
            }
            257..=10_000 => {
                // Hierarchical pipeline for medium scale; round chips per
                // cluster up so every chip gets a slot.
                let clusters = (chip_count as f64).sqrt().ceil() as usize;
                let per_cluster = (chip_count + clusters - 1) / clusters;
                Self::HierarchicalPipeline {
                    clusters,
                    chips_per_cluster: per_cluster,
                }
            }
            10_001..=1_000_000 => {
                // Hypercube for large scale
                let dims = (chip_count as f64).log2().ceil() as usize;
                Self::Hypercube { dimensions: dims }
            }
            _ => {
                // Millions+: 3D torus
                let side = (chip_count as f64).cbrt().ceil() as usize;
                Self::Torus3D { x: side, y: side, z: side }
            }
        }
    }
}
|
||||
|
||||
/// Scaling configuration for massive clusters
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct MassiveScaleConfig {
|
||||
/// Topology type
|
||||
pub topology: MassiveTopology,
|
||||
/// Layers of model
|
||||
pub total_layers: usize,
|
||||
/// Embedding dimension
|
||||
pub embed_dim: usize,
|
||||
/// Communication latency per hop (microseconds)
|
||||
pub hop_latency_us: usize,
|
||||
/// Bandwidth per link (bytes/sec)
|
||||
pub link_bandwidth: usize,
|
||||
/// Computation time per layer (microseconds)
|
||||
pub layer_compute_us: usize,
|
||||
/// Enable speculative execution
|
||||
pub speculative: bool,
|
||||
/// Speculation depth (tokens to draft)
|
||||
pub spec_depth: usize,
|
||||
/// Enable gradient checkpointing for memory
|
||||
pub gradient_checkpointing: bool,
|
||||
/// Fault tolerance level (0=none, 1=retry, 2=redundancy)
|
||||
pub fault_tolerance: u8,
|
||||
}
|
||||
|
||||
impl Default for MassiveScaleConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
topology: MassiveTopology::HierarchicalPipeline {
|
||||
clusters: 10,
|
||||
chips_per_cluster: 10,
|
||||
},
|
||||
total_layers: 32,
|
||||
embed_dim: 64,
|
||||
hop_latency_us: 10, // SPI latency
|
||||
link_bandwidth: 10_000_000, // 10 MB/s
|
||||
layer_compute_us: 4000, // 4ms per layer on ESP32
|
||||
speculative: true,
|
||||
spec_depth: 4,
|
||||
gradient_checkpointing: false,
|
||||
fault_tolerance: 1,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Projected performance figures for a massive-scale configuration.
#[derive(Debug, Clone)]
pub struct ScaleProjection {
    /// Total chips in the projected cluster.
    pub total_chips: usize,
    /// Projected throughput in tokens/sec.
    pub throughput_tokens_sec: f64,
    /// Projected per-token latency in milliseconds.
    pub latency_ms: f64,
    /// Projected memory footprint per chip in KB.
    pub memory_per_chip_kb: f64,
    /// Largest model (parameter count) the cluster can host.
    pub max_parameters: usize,
    /// Efficiency relative to ideal linear scaling.
    pub efficiency: f64,
    /// Share of time spent on communication, in percent.
    pub comm_overhead_pct: f64,
    /// Estimated power draw in watts.
    pub power_watts: f64,
    /// Estimated hardware cost in USD.
    pub cost_usd: f64,
}
|
||||
|
||||
/// Massive scale simulator
|
||||
pub struct MassiveScaleSimulator {
|
||||
config: MassiveScaleConfig,
|
||||
}
|
||||
|
||||
impl MassiveScaleSimulator {
|
||||
pub fn new(config: MassiveScaleConfig) -> Self {
|
||||
Self { config }
|
||||
}
|
||||
|
||||
/// Project performance for current configuration
|
||||
pub fn project(&self) -> ScaleProjection {
|
||||
let chips = self.config.topology.total_chips();
|
||||
let diameter = self.config.topology.diameter();
|
||||
let bisection = self.config.topology.bisection_bandwidth();
|
||||
|
||||
// Compute distribution
|
||||
let layers_per_chip = (self.config.total_layers as f64 / chips as f64).max(0.1);
|
||||
let compute_per_chip_us = layers_per_chip * self.config.layer_compute_us as f64;
|
||||
|
||||
// Communication cost
|
||||
let activation_size = self.config.embed_dim * 4; // INT8 with some overhead
|
||||
let comm_time_us = (activation_size as f64 / self.config.link_bandwidth as f64)
|
||||
* 1_000_000.0
|
||||
* diameter as f64;
|
||||
|
||||
// Pipeline efficiency
|
||||
let pipeline_stages = chips.min(self.config.total_layers);
|
||||
let bubble_overhead = (pipeline_stages - 1) as f64 / pipeline_stages as f64;
|
||||
|
||||
// Speculative multiplier
|
||||
let spec_multiplier = if self.config.speculative {
|
||||
1.0 + (self.config.spec_depth as f64 - 1.0) * 0.7 // 70% acceptance
|
||||
} else {
|
||||
1.0
|
||||
};
|
||||
|
||||
// Throughput calculation
|
||||
let base_throughput = 1_000_000.0 / compute_per_chip_us.max(1.0);
|
||||
let comm_factor = 1.0 / (1.0 + comm_time_us / compute_per_chip_us.max(1.0));
|
||||
let efficiency = (1.0 - bubble_overhead * 0.15) * comm_factor;
|
||||
let throughput = base_throughput * pipeline_stages as f64 * efficiency * spec_multiplier;
|
||||
|
||||
// Latency
|
||||
let latency_us = compute_per_chip_us * pipeline_stages as f64 + comm_time_us;
|
||||
let latency_ms = latency_us / 1000.0;
|
||||
|
||||
// Memory
|
||||
let base_memory_kb = 119.0; // Single chip baseline
|
||||
let memory_per_chip = base_memory_kb / (chips as f64).sqrt().max(1.0);
|
||||
|
||||
// Max parameters
|
||||
let params_per_chip = (memory_per_chip * 1024.0 * 0.7) as usize; // 70% for weights
|
||||
let max_parameters = params_per_chip * chips;
|
||||
|
||||
// Communication overhead
|
||||
let comm_overhead = comm_time_us / (compute_per_chip_us + comm_time_us) * 100.0;
|
||||
|
||||
// Power and cost estimates
|
||||
let power_per_chip = 0.5; // 500mW per ESP32
|
||||
let cost_per_chip = 4.0; // $4 per ESP32
|
||||
|
||||
ScaleProjection {
|
||||
total_chips: chips,
|
||||
throughput_tokens_sec: throughput,
|
||||
latency_ms,
|
||||
memory_per_chip_kb: memory_per_chip,
|
||||
max_parameters,
|
||||
efficiency,
|
||||
comm_overhead_pct: comm_overhead,
|
||||
power_watts: power_per_chip * chips as f64,
|
||||
cost_usd: cost_per_chip * chips as f64,
|
||||
}
|
||||
}
|
||||
|
||||
/// Run scaling study across multiple configurations
|
||||
pub fn scaling_study(&self, chip_counts: &[usize]) -> HVec<ScaleProjection, 32> {
|
||||
let mut results = HVec::new();
|
||||
|
||||
for &count in chip_counts {
|
||||
let topology = MassiveTopology::recommended(count);
|
||||
let config = MassiveScaleConfig {
|
||||
topology,
|
||||
..self.config.clone()
|
||||
};
|
||||
let sim = MassiveScaleSimulator::new(config);
|
||||
let _ = results.push(sim.project());
|
||||
}
|
||||
|
||||
results
|
||||
}
|
||||
|
||||
/// Find optimal configuration for target throughput
|
||||
pub fn optimize_for_throughput(&self, target_tokens_sec: f64) -> MassiveScaleConfig {
|
||||
let mut best_config = self.config.clone();
|
||||
let mut best_efficiency = 0.0;
|
||||
|
||||
// Try different chip counts
|
||||
for power in 2..=20 {
|
||||
let chips = 1 << power;
|
||||
|
||||
for &topology in &[
|
||||
MassiveTopology::KaryTree { depth: power, fanout: 4 },
|
||||
MassiveTopology::Hypercube { dimensions: power },
|
||||
MassiveTopology::HierarchicalPipeline {
|
||||
clusters: 1 << (power / 2),
|
||||
chips_per_cluster: 1 << (power - power / 2),
|
||||
},
|
||||
] {
|
||||
if topology.total_chips() < 4 { continue; }
|
||||
|
||||
let config = MassiveScaleConfig {
|
||||
topology,
|
||||
..self.config.clone()
|
||||
};
|
||||
let sim = MassiveScaleSimulator::new(config.clone());
|
||||
let proj = sim.project();
|
||||
|
||||
if proj.throughput_tokens_sec >= target_tokens_sec {
|
||||
let efficiency = proj.throughput_tokens_sec / (proj.total_chips as f64);
|
||||
if efficiency > best_efficiency {
|
||||
best_efficiency = efficiency;
|
||||
best_config = config;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
best_config
|
||||
}
|
||||
}
|
||||
|
||||
/// Distributed coordinator for massive scale.
///
/// Each chip runs one of these; it knows only its local neighborhood
/// (parent/children/siblings) as derived from the cluster topology.
pub struct DistributedCoordinator {
    /// This node's ID
    node_id: u32,
    /// Parent node (None if root)
    parent: Option<u32>,
    /// Child nodes
    children: HVec<u32, MAX_CHILDREN>,
    /// Sibling nodes (same level)
    siblings: HVec<u32, MAX_CHILDREN>,
    /// Current level in hierarchy (0 = root for tree topologies)
    level: u8,
    /// Total levels
    total_levels: u8,
    /// Local state
    local_state: NodeState,
}
|
||||
|
||||
/// State of a node in the distributed system.
///
/// Small POD snapshot exchanged via gossip and aggregated up the tree.
#[derive(Debug, Clone, Default)]
pub struct NodeState {
    /// Tokens processed
    pub tokens_processed: u64,
    /// Current load (0-255)
    pub load: u8,
    /// Last heartbeat (ticks)
    pub last_heartbeat: u32,
    /// Active flag
    pub active: bool,
    /// Current sequence position being processed
    pub seq_position: u32,
    /// Error count
    pub errors: u16,
}
|
||||
|
||||
impl DistributedCoordinator {
    /// Create coordinator for position in tree.
    ///
    /// Derives this node's parent/children/siblings from `topology` and
    /// marks the node active.
    pub fn new(node_id: u32, total_nodes: usize, topology: MassiveTopology) -> Self {
        let (parent, children, siblings, level, total_levels) =
            Self::compute_neighbors(node_id, total_nodes, topology);

        Self {
            node_id,
            parent,
            children,
            siblings,
            level,
            total_levels,
            local_state: NodeState { active: true, ..Default::default() },
        }
    }

    /// Map (node_id, topology) to local neighborhood:
    /// (parent, children, siblings, level, total_levels).
    ///
    /// NOTE(review): children/siblings silently truncate at MAX_CHILDREN
    /// (push result discarded) — confirm MAX_CHILDREN covers the largest
    /// fanout used.
    fn compute_neighbors(
        node_id: u32,
        total_nodes: usize,
        topology: MassiveTopology
    ) -> (Option<u32>, HVec<u32, MAX_CHILDREN>, HVec<u32, MAX_CHILDREN>, u8, u8) {
        let mut children = HVec::new();
        let mut siblings = HVec::new();

        match topology {
            // Binary tree (and 2-ary KaryTree): standard implicit-heap
            // numbering — parent (i-1)/2, children 2i+1 / 2i+2.
            MassiveTopology::BinaryTree { depth } |
            MassiveTopology::KaryTree { depth, fanout: 2 } => {
                // Level = floor(log2(node_id + 1)); root (id 0) is level 0.
                let level = (node_id + 1).ilog2() as u8;
                let parent = if node_id == 0 { None } else { Some((node_id - 1) / 2) };

                let left = 2 * node_id + 1;
                let right = 2 * node_id + 2;
                if (left as usize) < total_nodes {
                    let _ = children.push(left);
                }
                if (right as usize) < total_nodes {
                    let _ = children.push(right);
                }

                // Sibling: the other child of the same parent (odd ids are
                // left children, so sibling is id+1; even ids get id-1).
                if node_id > 0 {
                    let sib = if node_id % 2 == 1 { node_id + 1 } else { node_id - 1 };
                    if (sib as usize) < total_nodes {
                        let _ = siblings.push(sib);
                    }
                }

                (parent, children, siblings, level, depth as u8)
            }
            MassiveTopology::Hypercube { dimensions } => {
                // In hypercube, neighbors differ by one bit. All neighbors
                // are recorded as siblings; no parent is designated
                // (hypercube has no root).
                let level = node_id.count_ones() as u8;
                for d in 0..dimensions {
                    let neighbor = node_id ^ (1 << d);
                    if (neighbor as usize) < total_nodes {
                        if neighbor < node_id {
                            // Could be parent — intentionally not assigned.
                        }
                        let _ = siblings.push(neighbor);
                    }
                }
                (None, children, siblings, level, dimensions as u8)
            }
            MassiveTopology::HierarchicalPipeline { clusters, chips_per_cluster } => {
                // Nodes are numbered cluster-major; each cluster is a linear
                // pipeline, and clusters are chained end-to-end.
                let cluster_id = node_id as usize / chips_per_cluster;
                let local_id = node_id as usize % chips_per_cluster;
                let level = local_id as u8;

                // Parent is previous in pipeline.
                let parent = if local_id > 0 {
                    Some(node_id - 1)
                } else if cluster_id > 0 {
                    // Cross-cluster: last node of previous cluster.
                    Some((cluster_id * chips_per_cluster - 1) as u32)
                } else {
                    None
                };

                // Child is next in pipeline.
                if local_id + 1 < chips_per_cluster {
                    let _ = children.push(node_id + 1);
                } else if cluster_id + 1 < clusters {
                    // Cross-cluster: first node of next cluster.
                    let _ = children.push(((cluster_id + 1) * chips_per_cluster) as u32);
                }

                (parent, children, siblings, level, chips_per_cluster as u8)
            }
            _ => {
                // Default: linear chain 0 -> 1 -> ... -> n-1.
                let parent = if node_id > 0 { Some(node_id - 1) } else { None };
                if ((node_id + 1) as usize) < total_nodes {
                    let _ = children.push(node_id + 1);
                }
                (parent, children, siblings, node_id as u8, total_nodes as u8)
            }
        }
    }

    /// Check if this node is root (has no parent).
    pub fn is_root(&self) -> bool {
        self.parent.is_none()
    }

    /// Check if this node is leaf (has no children).
    pub fn is_leaf(&self) -> bool {
        self.children.is_empty()
    }

    /// Get nodes to send to for broadcast (downward fan-out).
    pub fn broadcast_targets(&self) -> &[u32] {
        &self.children
    }

    /// Get node to send to for aggregation (reduce, upward).
    pub fn reduce_target(&self) -> Option<u32> {
        self.parent
    }

    /// Update local state. Heartbeat is a wrapping tick counter, not a
    /// wall-clock timestamp.
    pub fn update_state(&mut self, tokens: u64, load: u8) {
        self.local_state.tokens_processed = tokens;
        self.local_state.load = load;
        self.local_state.last_heartbeat = self.local_state.last_heartbeat.wrapping_add(1);
    }

    /// Get aggregate statistics (for root to report).
    ///
    /// Sums tokens and errors over children and folds in an averaged load.
    /// NOTE(review): `child_stats.len() as u8` truncates above 255 children,
    /// and `agg.errors += child.errors` can overflow u16 — acceptable only
    /// if fan-in stays small; confirm.
    pub fn aggregate_stats(&self, child_stats: &[NodeState]) -> NodeState {
        let mut agg = self.local_state.clone();
        for child in child_stats {
            agg.tokens_processed += child.tokens_processed;
            agg.load = agg.load.saturating_add(child.load / (child_stats.len() as u8).max(1));
            agg.errors += child.errors;
        }
        agg
    }
}
|
||||
|
||||
/// Gossip protocol for state synchronization at massive scale.
///
/// Keeps a bounded sample (up to 64 entries) of peer states rather than
/// full cluster state, so memory stays constant regardless of cluster size.
pub struct GossipProtocol {
    /// Known node states (sampled, capacity 64)
    known_states: HVec<(u32, NodeState), 64>,
    /// Fanout for gossip (peers contacted per round)
    fanout: usize,
    /// Round number
    // NOTE(review): `round` is never advanced anywhere in this chunk —
    // confirm whether callers increment it or it is vestigial.
    round: u32,
}
|
||||
|
||||
impl GossipProtocol {
|
||||
pub fn new(fanout: usize) -> Self {
|
||||
Self {
|
||||
known_states: HVec::new(),
|
||||
fanout,
|
||||
round: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Select random nodes for gossip
|
||||
pub fn select_gossip_targets(&self, my_id: u32, total_nodes: usize, seed: u32) -> HVec<u32, 8> {
|
||||
let mut targets = HVec::new();
|
||||
let mut rng = seed.wrapping_mul(1103515245).wrapping_add(my_id);
|
||||
|
||||
for _ in 0..self.fanout.min(8) {
|
||||
rng = rng.wrapping_mul(1103515245).wrapping_add(12345);
|
||||
let target = (rng % total_nodes as u32) as u32;
|
||||
if target != my_id && !targets.contains(&target) {
|
||||
let _ = targets.push(target);
|
||||
}
|
||||
}
|
||||
|
||||
targets
|
||||
}
|
||||
|
||||
/// Merge received state
|
||||
pub fn merge_state(&mut self, node_id: u32, state: NodeState) {
|
||||
// Update or insert
|
||||
for (id, s) in self.known_states.iter_mut() {
|
||||
if *id == node_id {
|
||||
*s = state;
|
||||
return;
|
||||
}
|
||||
}
|
||||
// Insert new
|
||||
if self.known_states.len() < 64 {
|
||||
let _ = self.known_states.push((node_id, state));
|
||||
} else {
|
||||
// Replace oldest (simple LRU)
|
||||
self.known_states[0] = (node_id, state);
|
||||
}
|
||||
}
|
||||
|
||||
/// Get estimated cluster health
|
||||
pub fn cluster_health(&self) -> f32 {
|
||||
if self.known_states.is_empty() {
|
||||
return 1.0;
|
||||
}
|
||||
let active = self.known_states.iter().filter(|(_, s)| s.active).count();
|
||||
active as f32 / self.known_states.len() as f32
|
||||
}
|
||||
}
|
||||
|
||||
/// Fault tolerance manager.
///
/// Tracks failed nodes and static primary->backup assignments.
pub struct FaultTolerance {
    /// Redundancy level (1 = no redundancy, 2 = pairs, 3 = triples)
    redundancy: u8,
    /// Failed node IDs (capacity 64)
    failed_nodes: HVec<u32, 64>,
    /// Backup assignments (primary -> backup), capacity 32
    // NOTE(review): capacity (32) is smaller than failed_nodes (64) —
    // clusters larger than 32 nodes get only partial backup coverage.
    backups: HVec<(u32, u32), 32>,
}
|
||||
|
||||
impl FaultTolerance {
|
||||
pub fn new(redundancy: u8) -> Self {
|
||||
Self {
|
||||
redundancy: redundancy.max(1),
|
||||
failed_nodes: HVec::new(),
|
||||
backups: HVec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Mark node as failed
|
||||
pub fn mark_failed(&mut self, node_id: u32) {
|
||||
if !self.failed_nodes.contains(&node_id) {
|
||||
let _ = self.failed_nodes.push(node_id);
|
||||
}
|
||||
}
|
||||
|
||||
/// Get backup for failed node
|
||||
pub fn get_backup(&self, failed_id: u32) -> Option<u32> {
|
||||
self.backups.iter()
|
||||
.find(|(primary, _)| *primary == failed_id)
|
||||
.map(|(_, backup)| *backup)
|
||||
}
|
||||
|
||||
/// Assign backups for nodes
|
||||
pub fn assign_backups(&mut self, total_nodes: usize) {
|
||||
if self.redundancy < 2 { return; }
|
||||
|
||||
for i in 0..total_nodes {
|
||||
let backup = (i + total_nodes / 2) % total_nodes;
|
||||
if self.backups.len() < 32 {
|
||||
let _ = self.backups.push((i as u32, backup as u32));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Check if node is available (not failed)
|
||||
pub fn is_available(&self, node_id: u32) -> bool {
|
||||
!self.failed_nodes.contains(&node_id)
|
||||
}
|
||||
|
||||
/// Get failure rate
|
||||
pub fn failure_rate(&self, total_nodes: usize) -> f32 {
|
||||
self.failed_nodes.len() as f32 / total_nodes as f32
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // Sanity-check chip counts for the three closed-form topologies.
    #[test]
    fn test_topology_sizing() {
        assert_eq!(MassiveTopology::BinaryTree { depth: 10 }.total_chips(), 1023);
        assert_eq!(MassiveTopology::Hypercube { dimensions: 10 }.total_chips(), 1024);
        assert_eq!(MassiveTopology::Torus2D { width: 100, height: 100 }.total_chips(), 10_000);
    }

    // Projection for the default 100-chip hierarchical pipeline should show
    // meaningful speedup and > 50% efficiency.
    #[test]
    fn test_scaling_projection() {
        let config = MassiveScaleConfig {
            topology: MassiveTopology::HierarchicalPipeline {
                clusters: 10,
                chips_per_cluster: 10,
            },
            ..Default::default()
        };

        let sim = MassiveScaleSimulator::new(config);
        let proj = sim.project();

        assert_eq!(proj.total_chips, 100);
        assert!(proj.throughput_tokens_sec > 1000.0);
        assert!(proj.efficiency > 0.5);

        println!("100 chips: {:.0} tok/s, {:.1}% efficiency",
            proj.throughput_tokens_sec, proj.efficiency * 100.0);
    }

    // Smoke test across six orders of magnitude; only prints, no asserts.
    #[test]
    fn test_massive_scale() {
        let chip_counts = [5, 100, 1000, 10_000, 100_000, 1_000_000];

        for &count in &chip_counts {
            let topology = MassiveTopology::recommended(count);
            let config = MassiveScaleConfig {
                topology,
                ..Default::default()
            };
            let sim = MassiveScaleSimulator::new(config);
            let proj = sim.project();

            println!("{:>10} chips: {:>12.0} tok/s, {:>6.1}% eff, ${:.0}",
                count, proj.throughput_tokens_sec, proj.efficiency * 100.0, proj.cost_usd);
        }
    }

    // Node 5 of a 100-node binary tree is an interior node: has a parent.
    #[test]
    fn test_distributed_coordinator() {
        let coord = DistributedCoordinator::new(
            5,
            100,
            MassiveTopology::BinaryTree { depth: 7 }
        );

        assert!(!coord.is_root());
        println!("Node 5: parent={:?}, children={:?}", coord.parent, coord.children);
    }

    #[test]
    fn test_gossip_protocol() {
        let mut gossip = GossipProtocol::new(3);

        let targets = gossip.select_gossip_targets(5, 1000, 42);
        assert!(!targets.is_empty());
        assert!(!targets.contains(&5)); // Shouldn't include self

        gossip.merge_state(10, NodeState { active: true, ..Default::default() });
        assert_eq!(gossip.cluster_health(), 1.0);
    }
}
|
||||
420
vendor/ruvector/examples/ruvLLM/esp32/src/federation/medium_scale.rs
vendored
Normal file
420
vendor/ruvector/examples/ruvLLM/esp32/src/federation/medium_scale.rs
vendored
Normal file
@@ -0,0 +1,420 @@
|
||||
//! Medium Scale Federation - 100 to 500 Chip Clusters
|
||||
//!
|
||||
//! This is the "sweet spot" for ESP32 federation:
|
||||
//! - High efficiency (40-70%)
|
||||
//! - Practical throughput (50K-100K tokens/sec)
|
||||
//! - Manageable communication overhead
|
||||
//! - Affordable cost ($400-$2,000)
|
||||
//!
|
||||
//! # Why 100-500 Chips?
|
||||
//!
|
||||
//! ```text
|
||||
//! Performance vs Chip Count:
|
||||
//!
|
||||
//! 100K ┤ ┌─────────────────────── Communication-bound
|
||||
//! │ ____/│ Sweet Spot
|
||||
//! 80K ┤ / │ 100-500 chips
|
||||
//! │ / │
|
||||
//! 60K ┤ / │ • 40-70% efficiency
|
||||
//! │ │ │ • Low communication overhead
|
||||
//! 40K ┤ │ │ • Best $/performance
|
||||
//! ││ └─────────────────────────────────
|
||||
//! 20K ┤│
|
||||
//! │
|
||||
//! 0 ┼──────────────────────────────────────────────────
|
||||
//! 5 50 100 200 500 1K 5K 10K 100K 1M
|
||||
//! ▲ ▲
|
||||
//! │ │
|
||||
//! Good start Best value
|
||||
//! ```
|
||||
//!
|
||||
//! # Topology Recommendations
|
||||
//!
|
||||
//! | Chips | Best Topology | Clusters × Chips | Efficiency |
|
||||
//! |-------|---------------|------------------|------------|
|
||||
//! | 100 | 10×10 Grid | 10 × 10 | ~70% |
|
||||
//! | 144 | 12×12 Grid | 12 × 12 | ~65% |
|
||||
//! | 256 | 16×16 Grid | 16 × 16 | ~55% |
|
||||
//! | 400 | 20×20 Grid | 20 × 20 | ~45% |
|
||||
//! | 500 | 25×20 Grid | 25 × 20 | ~40% |
|
||||
|
||||
use super::massive_scale::{MassiveTopology, MassiveScaleConfig, MassiveScaleSimulator, ScaleProjection};
|
||||
use heapless::Vec as HVec;
|
||||
|
||||
/// Medium-scale cluster sizes (sweet spot).
// Lower bound of the "medium" band; below this, small-cluster code paths apply.
pub const MEDIUM_SCALE_MIN: usize = 100;
// Upper bound of the "medium" band; beyond this, massive-scale topologies apply.
pub const MEDIUM_SCALE_MAX: usize = 500;
pub const MEDIUM_SCALE_OPTIMAL: usize = 256; // Best efficiency/throughput balance
|
||||
|
||||
/// Pre-optimized cluster configurations.
///
/// Snapshot of a simulated medium-scale layout; produced by
/// [`MediumClusterConfig::optimal_for`]. All "expected" fields come from the
/// analytical simulator, not measurements.
#[derive(Debug, Clone, Copy)]
pub struct MediumClusterConfig {
    /// Total chips in cluster (clusters * chips_per_cluster)
    pub total_chips: usize,
    /// Number of clusters (groups)
    pub clusters: usize,
    /// Chips per cluster
    pub chips_per_cluster: usize,
    /// Expected throughput (tokens/sec)
    pub expected_throughput: f64,
    /// Expected efficiency (1.0 = ideal linear scaling)
    pub expected_efficiency: f64,
    /// Estimated cost USD (chips only)
    pub cost_usd: f64,
    /// Power consumption watts
    pub power_watts: f64,
    /// Max model parameters supportable
    pub max_params: usize,
}
|
||||
|
||||
impl MediumClusterConfig {
    /// Get optimal configuration for given chip count.
    ///
    /// Clamps the request into [MEDIUM_SCALE_MIN, MEDIUM_SCALE_MAX], picks
    /// the nearest square-ish clusters x chips layout, and fills in the
    /// performance fields from the analytical simulator.
    ///
    /// NOTE(review): `total_chips` may slightly exceed the requested count
    /// (clusters * per_cluster rounds up) — confirm callers expect that.
    pub fn optimal_for(chip_count: usize) -> Self {
        let chips = chip_count.clamp(MEDIUM_SCALE_MIN, MEDIUM_SCALE_MAX);

        // Find best square-ish layout: clusters ~ sqrt(chips),
        // per_cluster = ceil(chips / clusters).
        let sqrt = (chips as f64).sqrt();
        let clusters = sqrt.ceil() as usize;
        let per_cluster = (chips + clusters - 1) / clusters;
        let actual_chips = clusters * per_cluster;

        // Simulate to get accurate projections (same baseline parameters
        // as MassiveScaleConfig::default()).
        let config = MassiveScaleConfig {
            topology: MassiveTopology::HierarchicalPipeline {
                clusters,
                chips_per_cluster: per_cluster,
            },
            total_layers: 32,
            embed_dim: 64,
            hop_latency_us: 10,
            link_bandwidth: 10_000_000,
            layer_compute_us: 4000,
            speculative: true,
            spec_depth: 4,
            gradient_checkpointing: false,
            fault_tolerance: 1,
        };

        let sim = MassiveScaleSimulator::new(config);
        let proj = sim.project();

        Self {
            total_chips: actual_chips,
            clusters,
            chips_per_cluster: per_cluster,
            expected_throughput: proj.throughput_tokens_sec,
            expected_efficiency: proj.efficiency,
            cost_usd: proj.cost_usd,
            power_watts: proj.power_watts,
            max_params: proj.max_parameters,
        }
    }

    /// Get all standard configurations (the five documented reference sizes).
    pub fn standard_configs() -> [Self; 5] {
        [
            Self::optimal_for(100),
            Self::optimal_for(144),
            Self::optimal_for(256),
            Self::optimal_for(400),
            Self::optimal_for(500),
        ]
    }
}
|
||||
|
||||
/// Comparison with smaller clusters.
///
/// Bundles projections for 1-chip, 5-chip, and a medium cluster plus
/// derived ratios; built by [`ScaleComparison::analyze`].
#[derive(Debug, Clone)]
pub struct ScaleComparison {
    /// Single chip baseline
    pub single_chip: ScaleProjection,
    /// 5-chip small cluster
    pub small_cluster: ScaleProjection,
    /// Medium cluster (specified)
    pub medium_cluster: ScaleProjection,
    /// Throughput multiplier vs single
    pub throughput_multiplier: f64,
    /// Throughput multiplier vs 5-chip
    pub vs_small_multiplier: f64,
    /// Cost (USD) per 1K tokens/sec of throughput
    pub cost_per_1k_tokens: f64,
}
|
||||
|
||||
impl ScaleComparison {
    /// Compare medium cluster against baselines.
    ///
    /// Runs the analytical simulator three times (1 chip, 5 chips,
    /// `chip_count` chips) with identical model/link parameters so only
    /// the topology varies.
    pub fn analyze(chip_count: usize) -> Self {
        // Shared parameters; topology is overridden per simulation below.
        let base_config = MassiveScaleConfig {
            total_layers: 32,
            embed_dim: 64,
            hop_latency_us: 10,
            link_bandwidth: 10_000_000,
            layer_compute_us: 4000,
            speculative: true,
            spec_depth: 4,
            ..Default::default()
        };

        // Single chip
        let single_sim = MassiveScaleSimulator::new(MassiveScaleConfig {
            topology: MassiveTopology::FlatMesh { size: 1 },
            ..base_config.clone()
        });
        let single = single_sim.project();

        // 5-chip small cluster
        let small_sim = MassiveScaleSimulator::new(MassiveScaleConfig {
            topology: MassiveTopology::FlatMesh { size: 5 },
            ..base_config.clone()
        });
        let small = small_sim.project();

        // Medium cluster, using the recommended topology for the count
        let medium_sim = MassiveScaleSimulator::new(MassiveScaleConfig {
            topology: MassiveTopology::recommended(chip_count),
            ..base_config.clone()
        });
        let medium = medium_sim.project();

        Self {
            throughput_multiplier: medium.throughput_tokens_sec / single.throughput_tokens_sec,
            vs_small_multiplier: medium.throughput_tokens_sec / small.throughput_tokens_sec,
            cost_per_1k_tokens: medium.cost_usd / (medium.throughput_tokens_sec / 1000.0),
            single_chip: single,
            small_cluster: small,
            medium_cluster: medium,
        }
    }
}
|
||||
|
||||
/// Model categories that can run at different scales.
///
/// Buckets models by parameter count; see [`ModelCategory::param_range`]
/// for the exact boundaries and [`ModelCategory::min_chips`] for the
/// cluster size each bucket needs.
#[derive(Debug, Clone, Copy)]
pub enum ModelCategory {
    /// 50K-500K params, minimal memory
    Nano,
    /// 500K-5M params, basic tasks
    Micro,
    /// 5M-20M params, good general use
    Small,
    /// 20M-100M params, high quality
    Base,
    /// 100M-500M params, needs large clusters
    Large,
}
|
||||
|
||||
impl ModelCategory {
|
||||
/// Minimum chips required for this model category
|
||||
pub fn min_chips(&self) -> usize {
|
||||
match self {
|
||||
Self::Nano => 1,
|
||||
Self::Micro => 5,
|
||||
Self::Small => 50,
|
||||
Self::Base => 200,
|
||||
Self::Large => 500,
|
||||
}
|
||||
}
|
||||
|
||||
/// Parameter range
|
||||
pub fn param_range(&self) -> (usize, usize) {
|
||||
match self {
|
||||
Self::Nano => (50_000, 500_000),
|
||||
Self::Micro => (500_000, 5_000_000),
|
||||
Self::Small => (5_000_000, 20_000_000),
|
||||
Self::Base => (20_000_000, 100_000_000),
|
||||
Self::Large => (100_000_000, 500_000_000),
|
||||
}
|
||||
}
|
||||
|
||||
/// Example models
|
||||
pub fn examples(&self) -> &'static str {
|
||||
match self {
|
||||
Self::Nano => "TinyBERT-nano, Custom embeddings",
|
||||
Self::Micro => "DistilBERT-tiny, MiniLM",
|
||||
Self::Small => "TinyLlama, Phi-nano",
|
||||
Self::Base => "Phi-1, GPT-2-Small",
|
||||
Self::Large => "Phi-2, LLaMA-7B (quantized)",
|
||||
}
|
||||
}
|
||||
|
||||
/// What's possible with given chip count
|
||||
pub fn for_chip_count(chips: usize) -> Self {
|
||||
match chips {
|
||||
0..=4 => Self::Nano,
|
||||
5..=49 => Self::Micro,
|
||||
50..=199 => Self::Small,
|
||||
200..=499 => Self::Base,
|
||||
_ => Self::Large,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Hardware configuration for physical deployment.
///
/// Describes how a cluster is physically packaged: boards, bus, power,
/// and enclosure. Built via [`HardwareConfig::for_cluster`].
#[derive(Debug, Clone)]
pub struct HardwareConfig {
    /// Chips per PCB (physical board)
    pub chips_per_board: usize,
    /// Number of PCBs
    pub num_boards: usize,
    /// Communication bus
    pub bus_type: BusType,
    /// Power supply requirement (watts), including per-tier overhead
    pub power_supply_watts: f64,
    /// Recommended form factor
    pub form_factor: &'static str,
}
|
||||
|
||||
/// Inter-chip communication bus options, ordered roughly by bandwidth.
#[derive(Debug, Clone, Copy)]
pub enum BusType {
    /// SPI - up to 40MHz, simple
    Spi,
    /// I2C - 400kHz standard, lower bandwidth
    I2c,
    /// UART mesh - flexible, medium speed
    Uart,
    /// Custom high-speed interconnect
    HighSpeed,
}
|
||||
|
||||
impl BusType {
|
||||
pub fn bandwidth_bytes_sec(&self) -> usize {
|
||||
match self {
|
||||
Self::Spi => 5_000_000, // 5 MB/s typical
|
||||
Self::I2c => 50_000, // 50 KB/s
|
||||
Self::Uart => 1_000_000, // 1 MB/s at 10Mbaud
|
||||
Self::HighSpeed => 50_000_000, // Custom FPGA/ASIC
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl HardwareConfig {
|
||||
/// Recommended hardware for chip count
|
||||
pub fn for_cluster(chip_count: usize) -> Self {
|
||||
match chip_count {
|
||||
0..=25 => Self {
|
||||
chips_per_board: chip_count.min(10),
|
||||
num_boards: (chip_count + 9) / 10,
|
||||
bus_type: BusType::Spi,
|
||||
power_supply_watts: chip_count as f64 * 0.5 + 10.0,
|
||||
form_factor: "Single PCB or small rack",
|
||||
},
|
||||
26..=100 => Self {
|
||||
chips_per_board: 10,
|
||||
num_boards: (chip_count + 9) / 10,
|
||||
bus_type: BusType::Spi,
|
||||
power_supply_watts: chip_count as f64 * 0.5 + 25.0,
|
||||
form_factor: "1U rack mount (10 boards)",
|
||||
},
|
||||
101..=256 => Self {
|
||||
chips_per_board: 16,
|
||||
num_boards: (chip_count + 15) / 16,
|
||||
bus_type: BusType::Uart,
|
||||
power_supply_watts: chip_count as f64 * 0.5 + 50.0,
|
||||
form_factor: "2U-4U rack mount",
|
||||
},
|
||||
257..=500 => Self {
|
||||
chips_per_board: 20,
|
||||
num_boards: (chip_count + 19) / 20,
|
||||
bus_type: BusType::Uart,
|
||||
power_supply_watts: chip_count as f64 * 0.5 + 75.0,
|
||||
form_factor: "Full rack unit",
|
||||
},
|
||||
_ => Self {
|
||||
chips_per_board: 25,
|
||||
num_boards: (chip_count + 24) / 25,
|
||||
bus_type: BusType::HighSpeed,
|
||||
power_supply_watts: chip_count as f64 * 0.5 + 100.0,
|
||||
form_factor: "Multi-rack datacenter",
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Run complete analysis for 100-500 chip clusters.
// Stateless namespace type: all functionality lives in associated functions.
pub struct MediumScaleAnalyzer;
|
||||
|
||||
impl MediumScaleAnalyzer {
    /// Compare all standard medium-scale configurations.
    ///
    /// Evaluates the perfect-square-ish sizes 100..500 and pairs each
    /// optimal layout with its baseline comparison.
    pub fn full_analysis() -> HVec<(MediumClusterConfig, ScaleComparison), 8> {
        let mut results = HVec::new();

        for chips in [100, 144, 196, 256, 324, 400, 484, 500] {
            if chips <= MEDIUM_SCALE_MAX {
                let config = MediumClusterConfig::optimal_for(chips);
                let comparison = ScaleComparison::analyze(chips);
                let _ = results.push((config, comparison));
            }
        }

        results
    }

    /// Find smallest medium-scale configuration meeting a target throughput.
    ///
    /// Returns None when even MEDIUM_SCALE_MAX chips fall short.
    /// NOTE(review): the binary search assumes projected throughput is
    /// monotonically non-decreasing in chip count — confirm the simulator
    /// guarantees this across the medium range.
    pub fn optimize_for_throughput(target_tokens_sec: f64) -> Option<MediumClusterConfig> {
        // Binary search in medium scale range
        let mut low = MEDIUM_SCALE_MIN;
        let mut high = MEDIUM_SCALE_MAX;
        let mut best: Option<MediumClusterConfig> = None;

        while low <= high {
            let mid = (low + high) / 2;
            let config = MediumClusterConfig::optimal_for(mid);

            if config.expected_throughput >= target_tokens_sec {
                // Meets target: remember it and look for something smaller.
                best = Some(config);
                high = mid.saturating_sub(1);
            } else {
                low = mid + 1;
            }
        }

        best
    }

    /// Find optimal configuration for target cost.
    ///
    /// Budget is converted to a chip count at $4/chip, then clamped into
    /// the medium-scale band (so tiny budgets still yield MEDIUM_SCALE_MIN).
    pub fn optimize_for_budget(budget_usd: f64) -> MediumClusterConfig {
        let max_chips = (budget_usd / 4.0) as usize; // $4 per chip
        let clamped = max_chips.clamp(MEDIUM_SCALE_MIN, MEDIUM_SCALE_MAX);
        MediumClusterConfig::optimal_for(clamped)
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // 100 chips should land on the documented 10x10 layout.
    #[test]
    fn test_optimal_config_100() {
        let config = MediumClusterConfig::optimal_for(100);
        assert_eq!(config.clusters, 10);
        assert_eq!(config.chips_per_cluster, 10);
        assert!(config.expected_throughput > 40000.0); // 40K+ tok/s
        assert!(config.expected_efficiency > 0.5); // 50%+ efficiency
    }

    // 256 chips should land on the documented 16x16 layout.
    #[test]
    fn test_optimal_config_256() {
        let config = MediumClusterConfig::optimal_for(256);
        assert_eq!(config.clusters, 16);
        assert_eq!(config.chips_per_cluster, 16);
        assert!(config.expected_throughput > 60000.0); // 60K+ tok/s
    }

    #[test]
    fn test_scale_comparison() {
        let comparison = ScaleComparison::analyze(256);
        assert!(comparison.throughput_multiplier > 50.0); // 50x+ vs single chip
        assert!(comparison.vs_small_multiplier > 10.0); // 10x+ vs 5 chips
    }

    // Category boundaries: 50 chips is exactly Small; 256 chips is Base.
    #[test]
    fn test_model_categories() {
        assert_eq!(ModelCategory::for_chip_count(50).min_chips(), 50);
        assert_eq!(ModelCategory::for_chip_count(256).min_chips(), 200);
    }

    // 256 chips -> 16 boards of 16 chips in the 101..=256 tier.
    #[test]
    fn test_hardware_config() {
        let hw = HardwareConfig::for_cluster(256);
        assert_eq!(hw.chips_per_board, 16);
        assert_eq!(hw.num_boards, 16);
        assert!(hw.power_supply_watts > 100.0);
    }
}
|
||||
280
vendor/ruvector/examples/ruvLLM/esp32/src/federation/mod.rs
vendored
Normal file
280
vendor/ruvector/examples/ruvLLM/esp32/src/federation/mod.rs
vendored
Normal file
@@ -0,0 +1,280 @@
|
||||
//! Federation Module for Multi-ESP32 Distributed Inference
|
||||
//!
|
||||
//! Enables running larger models across multiple ESP32 chips:
|
||||
//! - Pipeline parallelism: Each chip handles different layers
|
||||
//! - Tensor parallelism: Split attention heads across chips
|
||||
//! - Model sharding: Distribute embeddings/weights
|
||||
//! - Speculative decoding: Draft on one chip, verify on others
|
||||
//!
|
||||
//! # Architecture Options
|
||||
//!
|
||||
//! ```text
|
||||
//! 5-Chip Pipeline (recommended for latency):
|
||||
//! ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐
|
||||
//! │ ESP32-0 │───▶│ ESP32-1 │───▶│ ESP32-2 │───▶│ ESP32-3 │───▶│ ESP32-4 │
|
||||
//! │ Embed + │ │ Layer 1 │ │ Layer 2 │ │ Layer 3 │ │ Layer 4 │
|
||||
//! │ Layer 0 │ │ │ │ │ │ │ │ + Head │
|
||||
//! └─────────┘ └─────────┘ └─────────┘ └─────────┘ └─────────┘
|
||||
//!
|
||||
//! 5-Chip Tensor Parallel (for throughput):
|
||||
//! ┌─────────┐
|
||||
//! │ ESP32-0 │ ◀──┐
|
||||
//! │ Head 0 │ │
|
||||
//! └─────────┘ │
|
||||
//! ┌─────────┐ │ ┌─────────┐
|
||||
//! │ ESP32-1 │ ◀──┼────│ ESP32-4 │
|
||||
//! │ Head 1 │ │ │ Coord │
|
||||
//! └─────────┘ │ └─────────┘
|
||||
//! ┌─────────┐ │
|
||||
//! │ ESP32-2 │ ◀──┤
|
||||
//! │ Head 2 │ │
|
||||
//! └─────────┘ │
|
||||
//! ┌─────────┐ │
|
||||
//! │ ESP32-3 │ ◀──┘
|
||||
//! │ Head 3 │
|
||||
//! └─────────┘
|
||||
//! ```
|
||||
|
||||
pub mod pipeline;
|
||||
pub mod tensor_parallel;
|
||||
pub mod sharding;
|
||||
pub mod speculative;
|
||||
pub mod protocol;
|
||||
pub mod coordinator;
|
||||
pub mod fastgrnn_router;
|
||||
pub mod massive_scale;
|
||||
pub mod medium_scale;
|
||||
|
||||
// Re-exports
|
||||
pub use pipeline::{PipelineNode, PipelineConfig, PipelineRole};
|
||||
pub use tensor_parallel::{TensorParallelNode, TPConfig};
|
||||
pub use sharding::{ShardedEmbedding, ShardConfig};
|
||||
pub use speculative::{SpeculativeDecoder, DraftVerifyConfig};
|
||||
pub use protocol::{FederationMessage, MessageType, ChipId};
|
||||
pub use coordinator::{FederationCoordinator, ClusterTopology};
|
||||
pub use fastgrnn_router::{MicroFastGRNN, MicroGRNNConfig, RoutingFeatures};
|
||||
pub use massive_scale::{
|
||||
MassiveTopology, MassiveScaleConfig, MassiveScaleSimulator, ScaleProjection,
|
||||
DistributedCoordinator, GossipProtocol, FaultTolerance,
|
||||
};
|
||||
pub use medium_scale::{
|
||||
MediumClusterConfig, ScaleComparison, MediumScaleAnalyzer,
|
||||
ModelCategory, HardwareConfig, BusType,
|
||||
MEDIUM_SCALE_MIN, MEDIUM_SCALE_MAX, MEDIUM_SCALE_OPTIMAL,
|
||||
};
|
||||
|
||||
/// Maximum chips in small federation
|
||||
pub const MAX_FEDERATION_SIZE: usize = 8;
|
||||
/// Maximum chips in massive scale (theoretical)
|
||||
pub const MAX_MASSIVE_SCALE: usize = 1_000_000;
|
||||
|
||||
/// Federation mode
|
||||
#[derive(Debug, Clone, Copy, PartialEq)]
|
||||
pub enum FederationMode {
|
||||
/// Single chip (no federation)
|
||||
Standalone,
|
||||
/// Pipeline parallelism - each chip handles different layers
|
||||
Pipeline,
|
||||
/// Tensor parallelism - split heads across chips
|
||||
TensorParallel,
|
||||
/// Hybrid: pipeline + tensor parallel
|
||||
Hybrid,
|
||||
/// Speculative decoding with draft/verify
|
||||
Speculative,
|
||||
/// Mixture of Experts - each chip is an expert
|
||||
MixtureOfExperts,
|
||||
}
|
||||
|
||||
/// Federation cluster configuration
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct FederationConfig {
|
||||
/// Number of chips in cluster
|
||||
pub num_chips: usize,
|
||||
/// This chip's ID (0-indexed)
|
||||
pub chip_id: ChipId,
|
||||
/// Federation mode
|
||||
pub mode: FederationMode,
|
||||
/// Communication bus type
|
||||
pub bus: CommunicationBus,
|
||||
/// Layers per chip (for pipeline mode)
|
||||
pub layers_per_chip: usize,
|
||||
/// Heads per chip (for tensor parallel mode)
|
||||
pub heads_per_chip: usize,
|
||||
/// Enable pipelining (process next token while current finishes)
|
||||
pub enable_pipelining: bool,
|
||||
}
|
||||
|
||||
impl Default for FederationConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
num_chips: 5,
|
||||
chip_id: ChipId(0),
|
||||
mode: FederationMode::Pipeline,
|
||||
bus: CommunicationBus::Spi,
|
||||
layers_per_chip: 2,
|
||||
heads_per_chip: 1,
|
||||
enable_pipelining: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Communication bus between chips
|
||||
#[derive(Debug, Clone, Copy, PartialEq)]
|
||||
pub enum CommunicationBus {
|
||||
/// SPI bus (fastest, 10-80 MHz)
|
||||
Spi,
|
||||
/// I2C bus (slower, 400 kHz - 1 MHz)
|
||||
I2c,
|
||||
/// UART (flexible, up to 5 Mbps)
|
||||
Uart,
|
||||
/// ESP-NOW (wireless, ~1 Mbps)
|
||||
EspNow,
|
||||
/// Custom parallel bus
|
||||
Parallel,
|
||||
}
|
||||
|
||||
impl CommunicationBus {
|
||||
/// Estimated bandwidth in bytes/second
|
||||
pub const fn bandwidth_bytes_per_sec(&self) -> usize {
|
||||
match self {
|
||||
Self::Spi => 10_000_000, // 10 MB/s at 80 MHz
|
||||
Self::I2c => 100_000, // 100 KB/s at 1 MHz
|
||||
Self::Uart => 500_000, // 500 KB/s at 5 Mbps
|
||||
Self::EspNow => 125_000, // ~1 Mbps
|
||||
Self::Parallel => 20_000_000, // Custom 8-bit parallel
|
||||
}
|
||||
}
|
||||
|
||||
/// Latency overhead in microseconds
|
||||
pub const fn latency_us(&self) -> usize {
|
||||
match self {
|
||||
Self::Spi => 10,
|
||||
Self::I2c => 50,
|
||||
Self::Uart => 20,
|
||||
Self::EspNow => 500, // Wireless overhead
|
||||
Self::Parallel => 5,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Calculate optimal federation configuration for given model
|
||||
pub fn calculate_optimal_config(
|
||||
model_size_bytes: usize,
|
||||
num_layers: usize,
|
||||
num_heads: usize,
|
||||
num_chips: usize,
|
||||
per_chip_ram: usize,
|
||||
) -> FederationConfig {
|
||||
let model_per_chip = model_size_bytes / num_chips;
|
||||
|
||||
// Check if model fits with pipeline parallelism
|
||||
if model_per_chip <= per_chip_ram {
|
||||
let layers_per_chip = (num_layers + num_chips - 1) / num_chips;
|
||||
return FederationConfig {
|
||||
num_chips,
|
||||
chip_id: ChipId(0),
|
||||
mode: FederationMode::Pipeline,
|
||||
bus: CommunicationBus::Spi,
|
||||
layers_per_chip,
|
||||
heads_per_chip: num_heads,
|
||||
enable_pipelining: true,
|
||||
};
|
||||
}
|
||||
|
||||
// Try tensor parallelism
|
||||
let heads_per_chip = (num_heads + num_chips - 1) / num_chips;
|
||||
FederationConfig {
|
||||
num_chips,
|
||||
chip_id: ChipId(0),
|
||||
mode: FederationMode::TensorParallel,
|
||||
bus: CommunicationBus::Spi,
|
||||
layers_per_chip: num_layers,
|
||||
heads_per_chip,
|
||||
enable_pipelining: false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Estimate performance improvement from federation
|
||||
pub fn estimate_speedup(config: &FederationConfig) -> FederationSpeedup {
|
||||
let n = config.num_chips as f32;
|
||||
|
||||
match config.mode {
|
||||
FederationMode::Standalone => FederationSpeedup {
|
||||
throughput_multiplier: 1.0,
|
||||
latency_reduction: 1.0,
|
||||
memory_per_chip_reduction: 1.0,
|
||||
},
|
||||
FederationMode::Pipeline => FederationSpeedup {
|
||||
// Pipeline: n-way throughput, slightly higher latency
|
||||
throughput_multiplier: n * 0.85, // 85% efficiency due to bubble
|
||||
latency_reduction: 1.0 / (1.0 + 0.1 * (n - 1.0)), // Slight increase
|
||||
memory_per_chip_reduction: n,
|
||||
},
|
||||
FederationMode::TensorParallel => FederationSpeedup {
|
||||
// TP: near-linear speedup on attention
|
||||
throughput_multiplier: n * 0.7, // Communication overhead
|
||||
latency_reduction: n * 0.7,
|
||||
memory_per_chip_reduction: n * 0.8, // Some duplication
|
||||
},
|
||||
FederationMode::Hybrid => FederationSpeedup {
|
||||
throughput_multiplier: n * 0.75,
|
||||
latency_reduction: (n / 2.0) * 0.8,
|
||||
memory_per_chip_reduction: n * 0.9,
|
||||
},
|
||||
FederationMode::Speculative => FederationSpeedup {
|
||||
// Speculative: 2-4x speedup typical
|
||||
throughput_multiplier: 2.5,
|
||||
latency_reduction: 2.0,
|
||||
memory_per_chip_reduction: 1.0, // Full model on draft chip
|
||||
},
|
||||
FederationMode::MixtureOfExperts => FederationSpeedup {
|
||||
throughput_multiplier: n * 0.9, // Excellent scaling
|
||||
latency_reduction: 1.5,
|
||||
memory_per_chip_reduction: n,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
/// Performance improvement estimates
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct FederationSpeedup {
|
||||
/// Throughput improvement (tokens/sec multiplier)
|
||||
pub throughput_multiplier: f32,
|
||||
/// Latency reduction (time per token)
|
||||
pub latency_reduction: f32,
|
||||
/// Memory reduction per chip
|
||||
pub memory_per_chip_reduction: f32,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_optimal_config() {
|
||||
let config = calculate_optimal_config(
|
||||
500 * 1024, // 500 KB model
|
||||
10, // 10 layers
|
||||
4, // 4 heads
|
||||
5, // 5 chips
|
||||
120 * 1024, // 120 KB per chip
|
||||
);
|
||||
|
||||
assert_eq!(config.mode, FederationMode::Pipeline);
|
||||
assert_eq!(config.layers_per_chip, 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_speedup_estimate() {
|
||||
let config = FederationConfig {
|
||||
num_chips: 5,
|
||||
mode: FederationMode::Pipeline,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let speedup = estimate_speedup(&config);
|
||||
|
||||
assert!(speedup.throughput_multiplier > 4.0);
|
||||
assert!(speedup.memory_per_chip_reduction >= 5.0);
|
||||
}
|
||||
}
|
||||
387
vendor/ruvector/examples/ruvLLM/esp32/src/federation/pipeline.rs
vendored
Normal file
387
vendor/ruvector/examples/ruvLLM/esp32/src/federation/pipeline.rs
vendored
Normal file
@@ -0,0 +1,387 @@
|
||||
//! Pipeline Parallelism for Multi-ESP32 Inference
|
||||
//!
|
||||
//! Distributes layers across chips for linear scaling with model size.
|
||||
//! Each chip processes its assigned layers and passes activations to the next.
|
||||
//!
|
||||
//! # 5-Chip Pipeline Example
|
||||
//!
|
||||
//! ```text
|
||||
//! Token 0: [C0:embed+L0] → [C1:L1-2] → [C2:L3-4] → [C3:L5-6] → [C4:L7+head]
|
||||
//! Token 1: idle [C0:embed] [C1:L1-2] [C2:L3-4] [C3:L5-6]
|
||||
//! Token 2: idle idle [C0:embed] [C1:L1-2] [C2:L3-4]
|
||||
//! ...
|
||||
//! ```
|
||||
|
||||
use heapless::Vec as HVec;
|
||||
use super::protocol::{ChipId, FederationMessage};
|
||||
|
||||
/// Maximum layers per chip
|
||||
pub const MAX_LAYERS_PER_CHIP: usize = 4;
|
||||
/// Pipeline depth (tokens in flight)
|
||||
pub const MAX_PIPELINE_DEPTH: usize = 8;
|
||||
|
||||
/// Role in the pipeline
|
||||
#[derive(Debug, Clone, Copy, PartialEq)]
|
||||
pub enum PipelineRole {
|
||||
/// First chip: handles embedding + first layers
|
||||
Head,
|
||||
/// Middle chip: processes middle layers
|
||||
Middle,
|
||||
/// Last chip: final layers + output head
|
||||
Tail,
|
||||
/// Single chip mode (no pipeline)
|
||||
Standalone,
|
||||
}
|
||||
|
||||
/// Pipeline configuration
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct PipelineConfig {
|
||||
/// Total chips in pipeline
|
||||
pub num_chips: usize,
|
||||
/// This chip's position (0 = head)
|
||||
pub position: usize,
|
||||
/// Layers assigned to this chip
|
||||
pub layer_start: usize,
|
||||
/// Number of layers on this chip
|
||||
pub layer_count: usize,
|
||||
/// Total layers in model
|
||||
pub total_layers: usize,
|
||||
/// Embedding dimension
|
||||
pub embed_dim: usize,
|
||||
/// Enable micro-batching
|
||||
pub micro_batch_size: usize,
|
||||
}
|
||||
|
||||
impl PipelineConfig {
|
||||
/// Create config for a specific chip in the pipeline
|
||||
pub fn for_chip(
|
||||
chip_pos: usize,
|
||||
num_chips: usize,
|
||||
total_layers: usize,
|
||||
embed_dim: usize,
|
||||
) -> Self {
|
||||
let layers_per_chip = (total_layers + num_chips - 1) / num_chips;
|
||||
let layer_start = chip_pos * layers_per_chip;
|
||||
let layer_count = layers_per_chip.min(total_layers - layer_start);
|
||||
|
||||
Self {
|
||||
num_chips,
|
||||
position: chip_pos,
|
||||
layer_start,
|
||||
layer_count,
|
||||
total_layers,
|
||||
embed_dim,
|
||||
micro_batch_size: 1,
|
||||
}
|
||||
}
|
||||
|
||||
/// Get role of this chip
|
||||
pub fn role(&self) -> PipelineRole {
|
||||
if self.num_chips == 1 {
|
||||
PipelineRole::Standalone
|
||||
} else if self.position == 0 {
|
||||
PipelineRole::Head
|
||||
} else if self.position == self.num_chips - 1 {
|
||||
PipelineRole::Tail
|
||||
} else {
|
||||
PipelineRole::Middle
|
||||
}
|
||||
}
|
||||
|
||||
/// Previous chip in pipeline (if any)
|
||||
pub fn prev_chip(&self) -> Option<ChipId> {
|
||||
if self.position > 0 {
|
||||
Some(ChipId((self.position - 1) as u8))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Next chip in pipeline (if any)
|
||||
pub fn next_chip(&self) -> Option<ChipId> {
|
||||
if self.position + 1 < self.num_chips {
|
||||
Some(ChipId((self.position + 1) as u8))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Pipeline state for a chip
|
||||
#[derive(Debug, Clone, Copy, PartialEq)]
|
||||
pub enum PipelineState {
|
||||
/// Waiting for input from previous chip
|
||||
WaitingInput,
|
||||
/// Processing layers
|
||||
Processing,
|
||||
/// Waiting to send output
|
||||
WaitingSend,
|
||||
/// Idle (pipeline bubble)
|
||||
Idle,
|
||||
}
|
||||
|
||||
/// In-flight token tracking
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct InFlightToken {
|
||||
/// Sequence position
|
||||
pub seq_pos: u16,
|
||||
/// Token ID
|
||||
pub token_id: u16,
|
||||
/// Current layer being processed
|
||||
pub current_layer: u8,
|
||||
/// Activation data (INT8)
|
||||
pub activation: HVec<i8, 128>,
|
||||
}
|
||||
|
||||
/// Pipeline node managing this chip's portion
|
||||
pub struct PipelineNode {
|
||||
/// Configuration
|
||||
config: PipelineConfig,
|
||||
/// Current state
|
||||
state: PipelineState,
|
||||
/// Chip ID
|
||||
chip_id: ChipId,
|
||||
/// Sequence counter
|
||||
seq_counter: u16,
|
||||
/// Tokens in flight in the pipeline
|
||||
in_flight: HVec<InFlightToken, MAX_PIPELINE_DEPTH>,
|
||||
/// Completed tokens waiting to send
|
||||
output_queue: HVec<InFlightToken, MAX_PIPELINE_DEPTH>,
|
||||
/// Input buffer for receiving activations
|
||||
input_buffer: HVec<i8, 256>,
|
||||
/// Barrier counter for synchronization
|
||||
barrier_counter: u16,
|
||||
}
|
||||
|
||||
impl PipelineNode {
|
||||
/// Create new pipeline node
|
||||
pub fn new(config: PipelineConfig) -> Self {
|
||||
Self {
|
||||
chip_id: ChipId(config.position as u8),
|
||||
config,
|
||||
state: PipelineState::Idle,
|
||||
seq_counter: 0,
|
||||
in_flight: HVec::new(),
|
||||
output_queue: HVec::new(),
|
||||
input_buffer: HVec::new(),
|
||||
barrier_counter: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Get current pipeline state
|
||||
pub fn state(&self) -> PipelineState {
|
||||
self.state
|
||||
}
|
||||
|
||||
/// Check if this chip should handle embedding
|
||||
pub fn handles_embedding(&self) -> bool {
|
||||
self.config.role() == PipelineRole::Head ||
|
||||
self.config.role() == PipelineRole::Standalone
|
||||
}
|
||||
|
||||
/// Check if this chip should handle output head
|
||||
pub fn handles_output(&self) -> bool {
|
||||
self.config.role() == PipelineRole::Tail ||
|
||||
self.config.role() == PipelineRole::Standalone
|
||||
}
|
||||
|
||||
/// Start processing a new token (head chip only)
|
||||
pub fn start_token(&mut self, token_id: u16) -> crate::Result<()> {
|
||||
if !self.handles_embedding() {
|
||||
return Err(crate::Error::UnsupportedFeature("Not head chip"));
|
||||
}
|
||||
|
||||
if self.in_flight.len() >= MAX_PIPELINE_DEPTH {
|
||||
return Err(crate::Error::BufferOverflow);
|
||||
}
|
||||
|
||||
let token = InFlightToken {
|
||||
seq_pos: self.seq_counter,
|
||||
token_id,
|
||||
current_layer: 0,
|
||||
activation: HVec::new(),
|
||||
};
|
||||
|
||||
self.in_flight.push(token).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
self.seq_counter += 1;
|
||||
self.state = PipelineState::Processing;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Receive activation from previous chip
|
||||
pub fn receive_activation(&mut self, msg: &FederationMessage) -> crate::Result<()> {
|
||||
let (layer_idx, position, data) = msg.get_activation_data()
|
||||
.ok_or(crate::Error::InvalidModel("Invalid activation message"))?;
|
||||
|
||||
// Create in-flight token from received data
|
||||
let mut activation = HVec::new();
|
||||
for &d in data {
|
||||
activation.push(d as i8).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
}
|
||||
|
||||
let token = InFlightToken {
|
||||
seq_pos: position,
|
||||
token_id: 0, // Not needed for middle/tail chips
|
||||
current_layer: layer_idx,
|
||||
activation,
|
||||
};
|
||||
|
||||
self.in_flight.push(token).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
self.state = PipelineState::Processing;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Process one step (one layer for one token)
|
||||
/// Returns true if there's work to do
|
||||
pub fn process_step<F>(&mut self, mut layer_fn: F) -> crate::Result<bool>
|
||||
where
|
||||
F: FnMut(usize, &mut [i8]) -> crate::Result<()>,
|
||||
{
|
||||
if self.in_flight.is_empty() {
|
||||
self.state = PipelineState::WaitingInput;
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
// Process first token in queue
|
||||
let token = &mut self.in_flight[0];
|
||||
|
||||
// Determine which layer to process
|
||||
let relative_layer = token.current_layer as usize - self.config.layer_start;
|
||||
|
||||
if relative_layer < self.config.layer_count {
|
||||
// Process this layer
|
||||
let layer_idx = self.config.layer_start + relative_layer;
|
||||
layer_fn(layer_idx, &mut token.activation)?;
|
||||
token.current_layer += 1;
|
||||
}
|
||||
|
||||
// Check if done with this chip's layers
|
||||
let next_layer = token.current_layer as usize;
|
||||
if next_layer >= self.config.layer_start + self.config.layer_count {
|
||||
// Move to output queue
|
||||
if let Some(completed) = self.in_flight.pop() {
|
||||
self.output_queue.push(completed).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
}
|
||||
self.state = PipelineState::WaitingSend;
|
||||
}
|
||||
|
||||
Ok(true)
|
||||
}
|
||||
|
||||
/// Get activation to send to next chip
|
||||
pub fn get_output(&mut self) -> Option<FederationMessage> {
|
||||
if self.output_queue.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let token = self.output_queue.pop()?;
|
||||
let next_chip = self.config.next_chip()?;
|
||||
|
||||
// Convert activation to bytes
|
||||
let data: Vec<i8> = token.activation.iter().cloned().collect();
|
||||
|
||||
FederationMessage::activation(
|
||||
self.chip_id,
|
||||
next_chip,
|
||||
token.seq_pos,
|
||||
token.current_layer,
|
||||
token.seq_pos,
|
||||
&data,
|
||||
).ok()
|
||||
}
|
||||
|
||||
/// Check if output is available (for tail chip)
|
||||
pub fn has_final_output(&self) -> bool {
|
||||
self.handles_output() && !self.output_queue.is_empty()
|
||||
}
|
||||
|
||||
/// Get final output logits (tail chip only)
|
||||
pub fn get_final_output(&mut self) -> Option<HVec<i8, 128>> {
|
||||
if !self.handles_output() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let token = self.output_queue.pop()?;
|
||||
Some(token.activation)
|
||||
}
|
||||
|
||||
/// Get pipeline statistics
|
||||
pub fn stats(&self) -> PipelineStats {
|
||||
PipelineStats {
|
||||
in_flight_count: self.in_flight.len(),
|
||||
output_queue_len: self.output_queue.len(),
|
||||
tokens_processed: self.seq_counter as usize,
|
||||
current_state: self.state,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create synchronization barrier
|
||||
pub fn create_barrier(&mut self) -> FederationMessage {
|
||||
self.barrier_counter += 1;
|
||||
FederationMessage::barrier(self.chip_id, self.barrier_counter)
|
||||
}
|
||||
}
|
||||
|
||||
/// Pipeline statistics
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct PipelineStats {
|
||||
/// Tokens currently in pipeline
|
||||
pub in_flight_count: usize,
|
||||
/// Tokens waiting to send
|
||||
pub output_queue_len: usize,
|
||||
/// Total tokens processed
|
||||
pub tokens_processed: usize,
|
||||
/// Current state
|
||||
pub current_state: PipelineState,
|
||||
}
|
||||
|
||||
/// Calculate pipeline efficiency
|
||||
pub fn calculate_pipeline_efficiency(
|
||||
num_chips: usize,
|
||||
tokens_generated: usize,
|
||||
) -> f32 {
|
||||
// Pipeline efficiency = useful work / total work
|
||||
// With N chips, first N-1 tokens have bubble overhead
|
||||
if tokens_generated <= num_chips {
|
||||
tokens_generated as f32 / (num_chips as f32 * tokens_generated as f32)
|
||||
} else {
|
||||
// After warmup, efficiency approaches 100%
|
||||
let warmup_overhead = (num_chips - 1) as f32;
|
||||
let useful_work = tokens_generated as f32;
|
||||
useful_work / (useful_work + warmup_overhead)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_pipeline_config() {
|
||||
// 5 chips, 10 layers
|
||||
let config = PipelineConfig::for_chip(0, 5, 10, 64);
|
||||
assert_eq!(config.role(), PipelineRole::Head);
|
||||
assert_eq!(config.layer_start, 0);
|
||||
assert_eq!(config.layer_count, 2);
|
||||
|
||||
let config = PipelineConfig::for_chip(2, 5, 10, 64);
|
||||
assert_eq!(config.role(), PipelineRole::Middle);
|
||||
assert_eq!(config.layer_start, 4);
|
||||
|
||||
let config = PipelineConfig::for_chip(4, 5, 10, 64);
|
||||
assert_eq!(config.role(), PipelineRole::Tail);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_pipeline_efficiency() {
|
||||
// After 100 tokens, efficiency should be high
|
||||
let eff = calculate_pipeline_efficiency(5, 100);
|
||||
assert!(eff > 0.95);
|
||||
|
||||
// During warmup, efficiency is lower
|
||||
let eff_warmup = calculate_pipeline_efficiency(5, 5);
|
||||
assert!(eff_warmup < 0.5);
|
||||
}
|
||||
}
|
||||
414
vendor/ruvector/examples/ruvLLM/esp32/src/federation/protocol.rs
vendored
Normal file
414
vendor/ruvector/examples/ruvLLM/esp32/src/federation/protocol.rs
vendored
Normal file
@@ -0,0 +1,414 @@
|
||||
//! Inter-Chip Communication Protocol
|
||||
//!
|
||||
//! Defines the message format for ESP32-to-ESP32 communication.
|
||||
//! Designed for low overhead on SPI/I2C/UART buses.
|
||||
|
||||
use heapless::Vec as HVec;
|
||||
|
||||
/// Maximum activation size that can be sent in one message
|
||||
pub const MAX_ACTIVATION_SIZE: usize = 256;
|
||||
/// Maximum message payload
|
||||
pub const MAX_PAYLOAD_SIZE: usize = 512;
|
||||
/// Protocol version
|
||||
pub const PROTOCOL_VERSION: u8 = 1;
|
||||
|
||||
/// Chip identifier in the federation
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
|
||||
pub struct ChipId(pub u8);
|
||||
|
||||
impl ChipId {
|
||||
pub const BROADCAST: ChipId = ChipId(0xFF);
|
||||
|
||||
pub fn is_broadcast(&self) -> bool {
|
||||
self.0 == 0xFF
|
||||
}
|
||||
}
|
||||
|
||||
/// Message types for federation protocol
|
||||
#[derive(Debug, Clone, Copy, PartialEq)]
|
||||
#[repr(u8)]
|
||||
pub enum MessageType {
|
||||
/// Heartbeat / keep-alive
|
||||
Heartbeat = 0x00,
|
||||
/// Cluster discovery
|
||||
Discovery = 0x01,
|
||||
/// Ready signal
|
||||
Ready = 0x02,
|
||||
|
||||
/// Forward pass activation data
|
||||
Activation = 0x10,
|
||||
/// Attention K/V cache update
|
||||
KVCache = 0x11,
|
||||
/// Gradient (for future training)
|
||||
Gradient = 0x12,
|
||||
|
||||
/// Token embedding request
|
||||
EmbedRequest = 0x20,
|
||||
/// Token embedding response
|
||||
EmbedResponse = 0x21,
|
||||
/// Output logits
|
||||
Logits = 0x22,
|
||||
/// Sampled token
|
||||
Token = 0x23,
|
||||
|
||||
/// Speculative draft tokens
|
||||
DraftTokens = 0x30,
|
||||
/// Verification result
|
||||
VerifyResult = 0x31,
|
||||
|
||||
/// Synchronization barrier
|
||||
Barrier = 0x40,
|
||||
/// Acknowledgment
|
||||
Ack = 0x41,
|
||||
/// Error
|
||||
Error = 0xFF,
|
||||
}
|
||||
|
||||
impl From<u8> for MessageType {
|
||||
fn from(v: u8) -> Self {
|
||||
match v {
|
||||
0x00 => Self::Heartbeat,
|
||||
0x01 => Self::Discovery,
|
||||
0x02 => Self::Ready,
|
||||
0x10 => Self::Activation,
|
||||
0x11 => Self::KVCache,
|
||||
0x12 => Self::Gradient,
|
||||
0x20 => Self::EmbedRequest,
|
||||
0x21 => Self::EmbedResponse,
|
||||
0x22 => Self::Logits,
|
||||
0x23 => Self::Token,
|
||||
0x30 => Self::DraftTokens,
|
||||
0x31 => Self::VerifyResult,
|
||||
0x40 => Self::Barrier,
|
||||
0x41 => Self::Ack,
|
||||
_ => Self::Error,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Message header (8 bytes)
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
#[repr(C, packed)]
|
||||
pub struct MessageHeader {
|
||||
/// Protocol version
|
||||
pub version: u8,
|
||||
/// Message type
|
||||
pub msg_type: u8,
|
||||
/// Source chip ID
|
||||
pub src: u8,
|
||||
/// Destination chip ID
|
||||
pub dst: u8,
|
||||
/// Sequence number (for ordering)
|
||||
pub seq: u16,
|
||||
/// Payload length
|
||||
pub payload_len: u16,
|
||||
}
|
||||
|
||||
impl MessageHeader {
|
||||
pub const SIZE: usize = 8;
|
||||
|
||||
pub fn new(msg_type: MessageType, src: ChipId, dst: ChipId, seq: u16, payload_len: u16) -> Self {
|
||||
Self {
|
||||
version: PROTOCOL_VERSION,
|
||||
msg_type: msg_type as u8,
|
||||
src: src.0,
|
||||
dst: dst.0,
|
||||
seq,
|
||||
payload_len,
|
||||
}
|
||||
}
|
||||
|
||||
/// Serialize to bytes
|
||||
pub fn to_bytes(&self) -> [u8; 8] {
|
||||
[
|
||||
self.version,
|
||||
self.msg_type,
|
||||
self.src,
|
||||
self.dst,
|
||||
(self.seq & 0xFF) as u8,
|
||||
(self.seq >> 8) as u8,
|
||||
(self.payload_len & 0xFF) as u8,
|
||||
(self.payload_len >> 8) as u8,
|
||||
]
|
||||
}
|
||||
|
||||
/// Deserialize from bytes
|
||||
pub fn from_bytes(bytes: &[u8]) -> Option<Self> {
|
||||
if bytes.len() < 8 {
|
||||
return None;
|
||||
}
|
||||
Some(Self {
|
||||
version: bytes[0],
|
||||
msg_type: bytes[1],
|
||||
src: bytes[2],
|
||||
dst: bytes[3],
|
||||
seq: (bytes[4] as u16) | ((bytes[5] as u16) << 8),
|
||||
payload_len: (bytes[6] as u16) | ((bytes[7] as u16) << 8),
|
||||
})
|
||||
}
|
||||
|
||||
/// Calculate simple checksum
|
||||
pub fn checksum(&self) -> u8 {
|
||||
let bytes = self.to_bytes();
|
||||
bytes.iter().fold(0u8, |acc, &b| acc.wrapping_add(b))
|
||||
}
|
||||
}
|
||||
|
||||
/// Complete federation message
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct FederationMessage {
|
||||
/// Message header
|
||||
pub header: MessageHeader,
|
||||
/// Payload data
|
||||
pub payload: HVec<u8, MAX_PAYLOAD_SIZE>,
|
||||
/// Checksum
|
||||
pub checksum: u8,
|
||||
}
|
||||
|
||||
impl FederationMessage {
|
||||
/// Create new message
|
||||
pub fn new(msg_type: MessageType, src: ChipId, dst: ChipId, seq: u16) -> Self {
|
||||
Self {
|
||||
header: MessageHeader::new(msg_type, src, dst, seq, 0),
|
||||
payload: HVec::new(),
|
||||
checksum: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create activation message with INT8 data
|
||||
pub fn activation(
|
||||
src: ChipId,
|
||||
dst: ChipId,
|
||||
seq: u16,
|
||||
layer_idx: u8,
|
||||
position: u16,
|
||||
data: &[i8],
|
||||
) -> crate::Result<Self> {
|
||||
let mut msg = Self::new(MessageType::Activation, src, dst, seq);
|
||||
|
||||
// Payload format: [layer_idx:1][position:2][data:N]
|
||||
msg.payload.push(layer_idx).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
msg.payload.push((position & 0xFF) as u8).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
msg.payload.push((position >> 8) as u8).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
|
||||
for &d in data {
|
||||
msg.payload.push(d as u8).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
}
|
||||
|
||||
msg.header.payload_len = msg.payload.len() as u16;
|
||||
msg.update_checksum();
|
||||
Ok(msg)
|
||||
}
|
||||
|
||||
/// Create token message
|
||||
pub fn token(src: ChipId, dst: ChipId, seq: u16, token_id: u16) -> Self {
|
||||
let mut msg = Self::new(MessageType::Token, src, dst, seq);
|
||||
let _ = msg.payload.push((token_id & 0xFF) as u8);
|
||||
let _ = msg.payload.push((token_id >> 8) as u8);
|
||||
msg.header.payload_len = 2;
|
||||
msg.update_checksum();
|
||||
msg
|
||||
}
|
||||
|
||||
/// Create draft tokens message for speculative decoding
|
||||
pub fn draft_tokens(src: ChipId, dst: ChipId, seq: u16, tokens: &[u16]) -> crate::Result<Self> {
|
||||
let mut msg = Self::new(MessageType::DraftTokens, src, dst, seq);
|
||||
|
||||
msg.payload.push(tokens.len() as u8).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
|
||||
for &t in tokens {
|
||||
msg.payload.push((t & 0xFF) as u8).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
msg.payload.push((t >> 8) as u8).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
}
|
||||
|
||||
msg.header.payload_len = msg.payload.len() as u16;
|
||||
msg.update_checksum();
|
||||
Ok(msg)
|
||||
}
|
||||
|
||||
/// Create barrier synchronization message
|
||||
pub fn barrier(src: ChipId, barrier_id: u16) -> Self {
|
||||
let mut msg = Self::new(MessageType::Barrier, src, ChipId::BROADCAST, 0);
|
||||
let _ = msg.payload.push((barrier_id & 0xFF) as u8);
|
||||
let _ = msg.payload.push((barrier_id >> 8) as u8);
|
||||
msg.header.payload_len = 2;
|
||||
msg.update_checksum();
|
||||
msg
|
||||
}
|
||||
|
||||
/// Update checksum
|
||||
pub fn update_checksum(&mut self) {
|
||||
let mut sum = self.header.checksum();
|
||||
for &b in &self.payload {
|
||||
sum = sum.wrapping_add(b);
|
||||
}
|
||||
self.checksum = sum;
|
||||
}
|
||||
|
||||
/// Verify checksum
|
||||
pub fn verify_checksum(&self) -> bool {
|
||||
let mut sum = self.header.checksum();
|
||||
for &b in &self.payload {
|
||||
sum = sum.wrapping_add(b);
|
||||
}
|
||||
sum == self.checksum
|
||||
}
|
||||
|
||||
/// Serialize to bytes
|
||||
pub fn to_bytes(&self) -> HVec<u8, { MAX_PAYLOAD_SIZE + 16 }> {
|
||||
let mut bytes = HVec::new();
|
||||
|
||||
// Header
|
||||
for b in self.header.to_bytes() {
|
||||
let _ = bytes.push(b);
|
||||
}
|
||||
|
||||
// Payload
|
||||
for &b in &self.payload {
|
||||
let _ = bytes.push(b);
|
||||
}
|
||||
|
||||
// Checksum
|
||||
let _ = bytes.push(self.checksum);
|
||||
|
||||
bytes
|
||||
}
|
||||
|
||||
/// Deserialize from bytes
|
||||
pub fn from_bytes(bytes: &[u8]) -> crate::Result<Self> {
|
||||
if bytes.len() < MessageHeader::SIZE + 1 {
|
||||
return Err(crate::Error::InvalidModel("Message too short"));
|
||||
}
|
||||
|
||||
let header = MessageHeader::from_bytes(bytes)
|
||||
.ok_or(crate::Error::InvalidModel("Invalid header"))?;
|
||||
|
||||
let payload_end = MessageHeader::SIZE + header.payload_len as usize;
|
||||
if bytes.len() < payload_end + 1 {
|
||||
return Err(crate::Error::InvalidModel("Payload incomplete"));
|
||||
}
|
||||
|
||||
let mut payload = HVec::new();
|
||||
for &b in &bytes[MessageHeader::SIZE..payload_end] {
|
||||
payload.push(b).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
}
|
||||
|
||||
let checksum = bytes[payload_end];
|
||||
|
||||
let msg = Self {
|
||||
header,
|
||||
payload,
|
||||
checksum,
|
||||
};
|
||||
|
||||
if !msg.verify_checksum() {
|
||||
return Err(crate::Error::InvalidModel("Checksum mismatch"));
|
||||
}
|
||||
|
||||
Ok(msg)
|
||||
}
|
||||
|
||||
/// Extract activation data from payload
|
||||
pub fn get_activation_data(&self) -> Option<(u8, u16, &[u8])> {
|
||||
if self.header.msg_type != MessageType::Activation as u8 {
|
||||
return None;
|
||||
}
|
||||
if self.payload.len() < 3 {
|
||||
return None;
|
||||
}
|
||||
|
||||
let layer_idx = self.payload[0];
|
||||
let position = (self.payload[1] as u16) | ((self.payload[2] as u16) << 8);
|
||||
let data = &self.payload[3..];
|
||||
|
||||
Some((layer_idx, position, data))
|
||||
}
|
||||
|
||||
/// Extract token from payload
|
||||
pub fn get_token(&self) -> Option<u16> {
|
||||
if self.header.msg_type != MessageType::Token as u8 {
|
||||
return None;
|
||||
}
|
||||
if self.payload.len() < 2 {
|
||||
return None;
|
||||
}
|
||||
|
||||
Some((self.payload[0] as u16) | ((self.payload[1] as u16) << 8))
|
||||
}
|
||||
}
|
||||
|
||||
/// Communication statistics
|
||||
#[derive(Debug, Default, Clone)]
|
||||
pub struct CommStats {
|
||||
/// Messages sent
|
||||
pub messages_sent: u32,
|
||||
/// Messages received
|
||||
pub messages_received: u32,
|
||||
/// Bytes sent
|
||||
pub bytes_sent: u32,
|
||||
/// Bytes received
|
||||
pub bytes_received: u32,
|
||||
/// Checksum errors
|
||||
pub checksum_errors: u32,
|
||||
/// Timeouts
|
||||
pub timeouts: u32,
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_message_header() {
        let original = MessageHeader::new(
            MessageType::Activation,
            ChipId(0),
            ChipId(1),
            42,
            100,
        );

        let wire = original.to_bytes();
        let parsed = MessageHeader::from_bytes(&wire).unwrap();

        assert_eq!(parsed.msg_type, MessageType::Activation as u8);
        assert_eq!(parsed.src, 0);
        assert_eq!(parsed.dst, 1);
        // Copy packed fields to locals first: taking a reference to a
        // field of a packed struct is undefined behavior.
        let seq = parsed.seq;
        let payload_len = parsed.payload_len;
        assert_eq!(seq, 42);
        assert_eq!(payload_len, 100);
    }

    #[test]
    fn test_activation_message() {
        let activations: [i8; 8] = [1, 2, 3, 4, 5, 6, 7, 8];
        let msg = FederationMessage::activation(
            ChipId(0),
            ChipId(1),
            1,
            0,
            10,
            &activations,
        ).unwrap();

        // Round-trip through the wire format and re-extract the payload.
        let wire = msg.to_bytes();
        let parsed = FederationMessage::from_bytes(&wire).unwrap();

        let (layer, pos, body) = parsed.get_activation_data().unwrap();
        assert_eq!(layer, 0);
        assert_eq!(pos, 10);
        assert_eq!(body.len(), 8);
    }

    #[test]
    fn test_token_message() {
        let msg = FederationMessage::token(ChipId(4), ChipId(0), 100, 12345);

        let wire = msg.to_bytes();
        let parsed = FederationMessage::from_bytes(&wire).unwrap();

        assert_eq!(parsed.get_token(), Some(12345));
    }
}
|
||||
143
vendor/ruvector/examples/ruvLLM/esp32/src/federation/sharding.rs
vendored
Normal file
143
vendor/ruvector/examples/ruvLLM/esp32/src/federation/sharding.rs
vendored
Normal file
@@ -0,0 +1,143 @@
|
||||
//! Embedding Sharding - Distribute Vocabulary Across Chips
|
||||
//!
|
||||
//! For large vocabularies, shard embeddings across chips.
|
||||
//! Each chip holds a portion of the embedding table.
|
||||
|
||||
use heapless::Vec as HVec;
|
||||
use super::protocol::ChipId;
|
||||
|
||||
/// Sharding configuration
///
/// Describes which contiguous slice of the vocabulary a given chip owns.
#[derive(Debug, Clone)]
pub struct ShardConfig {
    /// Total vocabulary size
    pub vocab_size: usize,
    /// Number of shards (chips)
    pub num_shards: usize,
    /// This chip's shard ID
    pub shard_id: usize,
    /// Embedding dimension
    pub embed_dim: usize,
    /// First token ID owned by this shard (inclusive)
    pub vocab_start: usize,
    /// One past the last token ID owned by this shard (exclusive)
    pub vocab_end: usize,
}

impl ShardConfig {
    /// Create config for a specific shard
    ///
    /// Splits the vocabulary into `num_shards` near-equal contiguous
    /// ranges; the final shard may be shorter when the sizes don't
    /// divide evenly.
    pub fn for_shard(
        shard_id: usize,
        num_shards: usize,
        vocab_size: usize,
        embed_dim: usize,
    ) -> Self {
        // Ceiling division so every token is covered.
        let per_shard = (vocab_size + num_shards - 1) / num_shards;
        let vocab_start = shard_id * per_shard;

        Self {
            vocab_size,
            num_shards,
            shard_id,
            embed_dim,
            vocab_start,
            vocab_end: usize::min(vocab_start + per_shard, vocab_size),
        }
    }

    /// Check if this shard handles a token
    pub fn handles_token(&self, token_id: u16) -> bool {
        (self.vocab_start..self.vocab_end).contains(&(token_id as usize))
    }

    /// Get shard that handles a token
    pub fn shard_for_token(token_id: u16, num_shards: usize, vocab_size: usize) -> usize {
        let per_shard = (vocab_size + num_shards - 1) / num_shards;
        token_id as usize / per_shard
    }

    /// Vocab size for this shard
    pub fn shard_vocab_size(&self) -> usize {
        self.vocab_end - self.vocab_start
    }
}
|
||||
|
||||
/// Sharded embedding table
|
||||
pub struct ShardedEmbedding<const MAX_VOCAB: usize, const DIM: usize> {
|
||||
config: ShardConfig,
|
||||
/// Local embedding weights (only our shard)
|
||||
weights: HVec<i8, 8192>, // Max 8KB per shard
|
||||
}
|
||||
|
||||
impl<const MAX_VOCAB: usize, const DIM: usize> ShardedEmbedding<MAX_VOCAB, DIM> {
|
||||
/// Create sharded embedding
|
||||
pub fn new(config: ShardConfig, seed: u32) -> crate::Result<Self> {
|
||||
let shard_size = config.shard_vocab_size() * config.embed_dim;
|
||||
|
||||
let mut weights = HVec::new();
|
||||
let mut rng_state = seed.wrapping_add(config.shard_id as u32 * 12345);
|
||||
|
||||
for _ in 0..shard_size {
|
||||
rng_state = rng_state.wrapping_mul(1103515245).wrapping_add(12345);
|
||||
let val = (((rng_state >> 16) & 0xFF) as i16 - 128) as i8;
|
||||
weights.push(val).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
}
|
||||
|
||||
Ok(Self { config, weights })
|
||||
}
|
||||
|
||||
/// Lookup embedding (only works if we have the token)
|
||||
pub fn lookup(&self, token_id: u16, output: &mut [i8]) -> crate::Result<bool> {
|
||||
if !self.config.handles_token(token_id) {
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
let local_idx = token_id as usize - self.config.vocab_start;
|
||||
let start = local_idx * self.config.embed_dim;
|
||||
let end = start + self.config.embed_dim;
|
||||
|
||||
if end > self.weights.len() || output.len() < self.config.embed_dim {
|
||||
return Err(crate::Error::BufferOverflow);
|
||||
}
|
||||
|
||||
output[..self.config.embed_dim].copy_from_slice(&self.weights[start..end]);
|
||||
Ok(true)
|
||||
}
|
||||
|
||||
/// Memory per shard vs full embedding
|
||||
pub fn memory_saved(&self) -> f32 {
|
||||
self.config.num_shards as f32
|
||||
}
|
||||
|
||||
/// Get responsible chip for a token
|
||||
pub fn responsible_chip(&self, token_id: u16) -> ChipId {
|
||||
let shard = ShardConfig::shard_for_token(
|
||||
token_id,
|
||||
self.config.num_shards,
|
||||
self.config.vocab_size,
|
||||
);
|
||||
ChipId(shard as u8)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_sharding() {
        // 1000-token vocabulary over 5 shards; shard 2 owns [400, 600).
        let cfg = ShardConfig::for_shard(2, 5, 1000, 32);

        assert_eq!(cfg.vocab_start, 400);
        assert_eq!(cfg.vocab_end, 600);
        assert!(cfg.handles_token(450));
        assert!(!cfg.handles_token(300));
    }

    #[test]
    fn test_shard_lookup() {
        assert_eq!(ShardConfig::shard_for_token(450, 5, 1000), 2);
    }
}
|
||||
294
vendor/ruvector/examples/ruvLLM/esp32/src/federation/speculative.rs
vendored
Normal file
294
vendor/ruvector/examples/ruvLLM/esp32/src/federation/speculative.rs
vendored
Normal file
@@ -0,0 +1,294 @@
|
||||
//! Speculative Decoding - Draft and Verify
|
||||
//!
|
||||
//! Use a smaller/faster model to draft tokens, verify with larger model.
|
||||
//! Perfect for federated setup: one chip drafts, others verify in parallel.
|
||||
//!
|
||||
//! # Benefits
|
||||
//! - 2-4x speedup for autoregressive generation
|
||||
//! - Maintains exact output quality
|
||||
//! - Natural fit for multi-chip setup
|
||||
|
||||
use heapless::Vec as HVec;
|
||||
use super::protocol::{ChipId, FederationMessage};
|
||||
|
||||
/// Maximum draft tokens per batch
///
/// Upper bound for the fixed-capacity token/probability buffers used by
/// `DraftResult` and `VerifyResult`.
pub const MAX_DRAFT_TOKENS: usize = 8;
|
||||
|
||||
/// Speculative decoding configuration
///
/// One chip (`draft_chip`) proposes candidate tokens; the chips listed
/// in `verify_chips` check them against the full model.
#[derive(Debug, Clone)]
pub struct DraftVerifyConfig {
    /// Number of draft tokens to generate
    pub draft_length: usize,
    /// Acceptance threshold (0.0-1.0); a draft token is kept when the
    /// verifier's probability reaches draft_prob * threshold
    pub acceptance_threshold: f32,
    /// Draft chip ID (usually chip 0)
    pub draft_chip: ChipId,
    /// Verify chips (all others)
    pub verify_chips: HVec<ChipId, 4>,
    /// Enable adaptive draft length (grows/shrinks with acceptance rate)
    pub adaptive: bool,
}
|
||||
|
||||
impl Default for DraftVerifyConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
draft_length: 4,
|
||||
acceptance_threshold: 0.9,
|
||||
draft_chip: ChipId(0),
|
||||
verify_chips: HVec::new(),
|
||||
adaptive: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl DraftVerifyConfig {
|
||||
/// Create config for 5-chip setup
|
||||
pub fn for_five_chips() -> Self {
|
||||
let mut verify_chips = HVec::new();
|
||||
for i in 1..5 {
|
||||
let _ = verify_chips.push(ChipId(i));
|
||||
}
|
||||
|
||||
Self {
|
||||
draft_length: 4,
|
||||
acceptance_threshold: 0.9,
|
||||
draft_chip: ChipId(0),
|
||||
verify_chips,
|
||||
adaptive: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Draft result from drafting chip
///
/// A batch of candidate tokens, together with the draft model's
/// fixed-point confidence in each, starting at `start_pos`.
#[derive(Debug, Clone)]
pub struct DraftResult {
    /// Draft token IDs
    pub tokens: HVec<u16, MAX_DRAFT_TOKENS>,
    /// Draft token probabilities (fixed-point, 0-255); indexed in step
    /// with `tokens`
    pub probs: HVec<u8, MAX_DRAFT_TOKENS>,
    /// Starting position
    pub start_pos: u16,
}
|
||||
|
||||
/// Verification result from verifying chip
///
/// The first `accepted_count` draft tokens are confirmed; `correction`
/// replaces the first rejected token, if there was a rejection.
#[derive(Debug, Clone)]
pub struct VerifyResult {
    /// Number of accepted tokens
    pub accepted_count: usize,
    /// Correct token for first rejection (if any)
    pub correction: Option<u16>,
    /// Verification probabilities (one per draft token examined)
    pub verify_probs: HVec<u8, MAX_DRAFT_TOKENS>,
}
|
||||
|
||||
/// Speculative decoder
///
/// Per-chip draft/verify state machine. The same type runs on both the
/// drafting chip and the verifying chips; `is_draft_chip` selects the role.
pub struct SpeculativeDecoder {
    config: DraftVerifyConfig,
    /// Is this the draft chip?
    is_draft_chip: bool,
    /// Current acceptance rate (for adaptive) — an exponential moving
    /// average updated after each verified batch
    acceptance_rate: f32,
    /// Draft tokens waiting for verification
    pending_draft: Option<DraftResult>,
    /// Statistics
    stats: SpecStats,
}
|
||||
|
||||
impl SpeculativeDecoder {
|
||||
/// Create for a specific chip
|
||||
pub fn new(config: DraftVerifyConfig, chip_id: ChipId) -> Self {
|
||||
let is_draft_chip = chip_id == config.draft_chip;
|
||||
|
||||
Self {
|
||||
config,
|
||||
is_draft_chip,
|
||||
acceptance_rate: 0.9,
|
||||
pending_draft: None,
|
||||
stats: SpecStats::default(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Check if this is the drafting chip
|
||||
pub fn is_drafter(&self) -> bool {
|
||||
self.is_draft_chip
|
||||
}
|
||||
|
||||
/// Submit draft tokens (drafter only)
|
||||
pub fn submit_draft(&mut self, draft: DraftResult) -> crate::Result<FederationMessage> {
|
||||
if !self.is_draft_chip {
|
||||
return Err(crate::Error::UnsupportedFeature("Not draft chip"));
|
||||
}
|
||||
|
||||
// Create message to broadcast to verify chips
|
||||
let tokens: Vec<u16> = draft.tokens.iter().cloned().collect();
|
||||
let msg = FederationMessage::draft_tokens(
|
||||
self.config.draft_chip,
|
||||
ChipId::BROADCAST,
|
||||
draft.start_pos,
|
||||
&tokens,
|
||||
)?;
|
||||
|
||||
self.pending_draft = Some(draft);
|
||||
self.stats.drafts_sent += 1;
|
||||
|
||||
Ok(msg)
|
||||
}
|
||||
|
||||
/// Verify draft tokens (verifier only)
|
||||
pub fn verify_draft<F>(
|
||||
&mut self,
|
||||
draft: &DraftResult,
|
||||
mut get_prob: F,
|
||||
) -> VerifyResult
|
||||
where
|
||||
F: FnMut(u16, u16) -> u8, // (position, token) -> probability
|
||||
{
|
||||
let mut accepted_count = 0;
|
||||
let mut correction = None;
|
||||
let mut verify_probs = HVec::new();
|
||||
|
||||
for (i, &token) in draft.tokens.iter().enumerate() {
|
||||
let pos = draft.start_pos + i as u16;
|
||||
let verify_prob = get_prob(pos, token);
|
||||
let _ = verify_probs.push(verify_prob);
|
||||
|
||||
let draft_prob = draft.probs.get(i).copied().unwrap_or(128);
|
||||
|
||||
// Acceptance criterion: verify_prob >= draft_prob * threshold
|
||||
let threshold = (draft_prob as f32 * self.config.acceptance_threshold) as u8;
|
||||
|
||||
if verify_prob >= threshold {
|
||||
accepted_count += 1;
|
||||
} else {
|
||||
// Rejection - sample correct token
|
||||
// In real impl, would sample from verify distribution
|
||||
correction = Some(token.wrapping_add(1)); // Placeholder
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
VerifyResult {
|
||||
accepted_count,
|
||||
correction,
|
||||
verify_probs,
|
||||
}
|
||||
}
|
||||
|
||||
/// Process verification result (drafter)
|
||||
pub fn process_verification(&mut self, result: &VerifyResult) -> HVec<u16, MAX_DRAFT_TOKENS> {
|
||||
let mut accepted_tokens = HVec::new();
|
||||
|
||||
if let Some(ref draft) = self.pending_draft {
|
||||
// Accept tokens up to rejection point
|
||||
for i in 0..result.accepted_count {
|
||||
if let Some(&token) = draft.tokens.get(i) {
|
||||
let _ = accepted_tokens.push(token);
|
||||
}
|
||||
}
|
||||
|
||||
// Add correction if any
|
||||
if let Some(correct_token) = result.correction {
|
||||
let _ = accepted_tokens.push(correct_token);
|
||||
}
|
||||
|
||||
self.stats.tokens_accepted += result.accepted_count;
|
||||
self.stats.tokens_rejected += draft.tokens.len() - result.accepted_count;
|
||||
|
||||
// Update acceptance rate
|
||||
let batch_rate = result.accepted_count as f32 / draft.tokens.len() as f32;
|
||||
self.acceptance_rate = 0.9 * self.acceptance_rate + 0.1 * batch_rate;
|
||||
}
|
||||
|
||||
self.pending_draft = None;
|
||||
accepted_tokens
|
||||
}
|
||||
|
||||
/// Get adaptive draft length based on acceptance rate
|
||||
pub fn adaptive_draft_length(&self) -> usize {
|
||||
if !self.config.adaptive {
|
||||
return self.config.draft_length;
|
||||
}
|
||||
|
||||
// Higher acceptance -> longer drafts
|
||||
if self.acceptance_rate > 0.95 {
|
||||
(self.config.draft_length + 2).min(MAX_DRAFT_TOKENS)
|
||||
} else if self.acceptance_rate > 0.8 {
|
||||
self.config.draft_length
|
||||
} else if self.acceptance_rate > 0.5 {
|
||||
(self.config.draft_length - 1).max(1)
|
||||
} else {
|
||||
1 // Fall back to no speculation
|
||||
}
|
||||
}
|
||||
|
||||
/// Get speedup estimate
|
||||
pub fn estimated_speedup(&self) -> f32 {
|
||||
// Speedup = accepted_tokens / (1 + verify_overhead)
|
||||
let avg_accepted = self.acceptance_rate * self.adaptive_draft_length() as f32;
|
||||
let verify_overhead = 0.2; // Verification overhead
|
||||
avg_accepted / (1.0 + verify_overhead)
|
||||
}
|
||||
|
||||
/// Get statistics
|
||||
pub fn stats(&self) -> &SpecStats {
|
||||
&self.stats
|
||||
}
|
||||
}
|
||||
|
||||
/// Speculative decoding statistics
///
/// Lifetime counters for draft batches and per-token verdicts.
#[derive(Debug, Default, Clone)]
pub struct SpecStats {
    /// Total draft batches sent
    pub drafts_sent: usize,
    /// Total tokens accepted
    pub tokens_accepted: usize,
    /// Total tokens rejected
    pub tokens_rejected: usize,
}

impl SpecStats {
    /// Overall acceptance rate
    ///
    /// Fraction of drafted tokens that verifiers accepted; 0.0 before any
    /// tokens have been processed.
    pub fn acceptance_rate(&self) -> f32 {
        match self.tokens_accepted + self.tokens_rejected {
            0 => 0.0,
            total => self.tokens_accepted as f32 / total as f32,
        }
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_speculative_config() {
        let cfg = DraftVerifyConfig::for_five_chips();

        assert_eq!(cfg.draft_chip, ChipId(0));
        assert_eq!(cfg.verify_chips.len(), 4);
    }

    #[test]
    fn test_verify_draft() {
        let mut decoder = SpeculativeDecoder::new(DraftVerifyConfig::default(), ChipId(1));

        let mut draft = DraftResult {
            tokens: HVec::new(),
            probs: HVec::new(),
            start_pos: 0,
        };
        for (tok, prob) in [(100u16, 200u8), (101, 200)] {
            let _ = draft.tokens.push(tok);
            let _ = draft.probs.push(prob);
        }

        let result = decoder.verify_draft(&draft, |_pos, _token| 190);

        // Both drafts pass: verify prob 190 >= 200 * 0.9 = 180.
        assert_eq!(result.accepted_count, 2);
        assert!(result.correction.is_none());
    }
}
|
||||
144
vendor/ruvector/examples/ruvLLM/esp32/src/federation/tensor_parallel.rs
vendored
Normal file
144
vendor/ruvector/examples/ruvLLM/esp32/src/federation/tensor_parallel.rs
vendored
Normal file
@@ -0,0 +1,144 @@
|
||||
//! Tensor Parallelism - Distributed Attention Heads
|
||||
//!
|
||||
//! Splits attention heads across chips for parallel computation.
|
||||
//! Each chip handles a subset of heads, then results are combined.
|
||||
|
||||
use heapless::Vec as HVec;
|
||||
use super::protocol::{ChipId, FederationMessage};
|
||||
|
||||
/// Maximum heads per chip
///
/// Capacity of the per-chip head-assignment and partial-output buffers.
pub const MAX_HEADS_PER_CHIP: usize = 4;
|
||||
|
||||
/// Tensor parallel configuration
///
/// Records which attention heads this chip owns under the round-robin
/// assignment produced by `distribute_heads`.
#[derive(Debug, Clone)]
pub struct TPConfig {
    /// Number of chips
    pub num_chips: usize,
    /// This chip's ID
    pub chip_id: ChipId,
    /// Total attention heads
    pub total_heads: usize,
    /// Heads handled by this chip (at most MAX_HEADS_PER_CHIP)
    pub my_heads: HVec<usize, MAX_HEADS_PER_CHIP>,
    /// Embedding dimension per head
    pub head_dim: usize,
}
|
||||
|
||||
impl TPConfig {
|
||||
/// Create config distributing heads across chips
|
||||
pub fn distribute_heads(
|
||||
chip_id: usize,
|
||||
num_chips: usize,
|
||||
total_heads: usize,
|
||||
head_dim: usize,
|
||||
) -> Self {
|
||||
let mut my_heads = HVec::new();
|
||||
|
||||
// Assign heads round-robin style
|
||||
for h in 0..total_heads {
|
||||
if h % num_chips == chip_id {
|
||||
let _ = my_heads.push(h);
|
||||
}
|
||||
}
|
||||
|
||||
Self {
|
||||
num_chips,
|
||||
chip_id: ChipId(chip_id as u8),
|
||||
total_heads,
|
||||
my_heads,
|
||||
head_dim,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Tensor parallel attention node
///
/// Buffers per-head partial attention outputs before they are shipped to
/// the combining chip.
pub struct TensorParallelNode {
    config: TPConfig,
    /// Partial attention outputs from each head
    partial_outputs: HVec<HVec<i32, 64>, MAX_HEADS_PER_CHIP>,
    /// Combined output buffer
    // NOTE(review): not written by any method visible in this file —
    // presumably reserved for the result-combining step; confirm usage.
    output_buffer: HVec<i32, 256>,
}
|
||||
|
||||
impl TensorParallelNode {
|
||||
pub fn new(config: TPConfig) -> Self {
|
||||
Self {
|
||||
config,
|
||||
partial_outputs: HVec::new(),
|
||||
output_buffer: HVec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get heads this chip handles
|
||||
pub fn my_heads(&self) -> &[usize] {
|
||||
&self.config.my_heads
|
||||
}
|
||||
|
||||
/// Compute partial attention for assigned heads
|
||||
pub fn compute_partial_attention(
|
||||
&mut self,
|
||||
query: &[i8],
|
||||
keys: &[&[i8]],
|
||||
values: &[&[i8]],
|
||||
) -> crate::Result<()> {
|
||||
self.partial_outputs.clear();
|
||||
|
||||
for &head_idx in &self.config.my_heads {
|
||||
let mut head_output = HVec::new();
|
||||
|
||||
// Compute Q @ K^T for this head
|
||||
let head_start = head_idx * self.config.head_dim;
|
||||
let head_end = head_start + self.config.head_dim;
|
||||
|
||||
// Simplified attention: just dot product for now
|
||||
for &val in &values[0][head_start..head_end.min(values[0].len())] {
|
||||
head_output.push(val as i32).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
}
|
||||
|
||||
self.partial_outputs.push(head_output).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Create message with partial results
|
||||
pub fn create_partial_result_message(&self, dst: ChipId, seq: u16) -> crate::Result<FederationMessage> {
|
||||
let mut data: Vec<i8> = Vec::new();
|
||||
|
||||
for partial in &self.partial_outputs {
|
||||
for &val in partial {
|
||||
data.push((val >> 8) as i8); // Scale down
|
||||
}
|
||||
}
|
||||
|
||||
FederationMessage::activation(
|
||||
self.config.chip_id,
|
||||
dst,
|
||||
seq,
|
||||
0, // Not layer-based
|
||||
0,
|
||||
&data,
|
||||
)
|
||||
}
|
||||
|
||||
/// Memory saved vs single-chip
|
||||
pub fn memory_reduction(&self) -> f32 {
|
||||
self.config.num_chips as f32
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_head_distribution() {
        // Four heads spread round-robin over five chips: each of the
        // first four chips gets exactly one head.
        let chip0 = TPConfig::distribute_heads(0, 5, 4, 16);
        let chip1 = TPConfig::distribute_heads(1, 5, 4, 16);

        assert_eq!(chip0.my_heads.as_slice(), &[0]);
        assert_eq!(chip1.my_heads.as_slice(), &[1]);
    }
}
|
||||
Reference in New Issue
Block a user