Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'

This commit is contained in:
ruv
2026-02-28 14:39:40 -05:00
7854 changed files with 3522914 additions and 0 deletions

View File

@@ -0,0 +1,176 @@
//! Federation Module for Multi-Chip Distributed Inference
//!
//! Supports:
//! - Pipeline parallelism (layers across chips)
//! - Tensor parallelism (attention heads across chips)
//! - Speculative decoding (draft/verify)
//! - SPI/I2C/UART/ESP-NOW communication
pub mod protocol;
pub mod pipeline;
pub mod speculative;
pub use protocol::{
ChipId, MessageType, MessageHeader, FederationMessage, CommStats,
MAX_ACTIVATION_SIZE, MAX_PAYLOAD_SIZE,
};
pub use pipeline::{
PipelineNode, PipelineConfig, PipelineRole, PipelineState, PipelineStats,
InFlightToken, calculate_pipeline_efficiency,
MAX_LAYERS_PER_CHIP, MAX_PIPELINE_DEPTH,
};
pub use speculative::{
SpeculativeDecoder, DraftVerifyConfig, DraftResult, VerifyResult, SpecStats,
MAX_DRAFT_TOKENS,
};
/// Maximum chips in federation
pub const MAX_FEDERATION_SIZE: usize = 8;
/// Federation mode
///
/// Strategy used to distribute inference work across the chips in the
/// federation (see the module docs for the parallelism schemes).
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum FederationMode {
    /// Single chip, no inter-chip communication.
    Standalone,
    /// Layers split across chips; activations flow chip-to-chip.
    Pipeline,
    /// Attention heads split across chips.
    TensorParallel,
    /// Combined pipeline + tensor parallelism (see `estimate_speedup`).
    Hybrid,
    /// Draft/verify speculative decoding.
    Speculative,
    /// Mixture-of-experts routing across chips.
    MixtureOfExperts,
}
/// Communication bus type
///
/// Physical transport connecting the chips; each variant carries rough
/// bandwidth/latency figures used by the planning heuristics.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum CommunicationBus {
    Spi,
    I2c,
    Uart,
    EspNow,
    Parallel,
}

impl CommunicationBus {
    /// Nominal sustained bandwidth of the link, in bytes per second.
    pub const fn bandwidth_bytes_per_sec(&self) -> usize {
        match *self {
            CommunicationBus::Parallel => 20_000_000,
            CommunicationBus::Spi => 10_000_000,
            CommunicationBus::Uart => 500_000,
            CommunicationBus::EspNow => 125_000,
            CommunicationBus::I2c => 100_000,
        }
    }

    /// Typical one-way message latency, in microseconds.
    pub const fn latency_us(&self) -> usize {
        match *self {
            CommunicationBus::Parallel => 5,
            CommunicationBus::Spi => 10,
            CommunicationBus::Uart => 20,
            CommunicationBus::I2c => 50,
            CommunicationBus::EspNow => 500,
        }
    }
}
/// Federation configuration
///
/// Describes this chip's place in the cluster and how work is split.
#[derive(Debug, Clone)]
pub struct FederationConfig {
    /// Total number of chips participating in the federation.
    pub num_chips: usize,
    /// This chip's identity on the bus.
    pub chip_id: ChipId,
    /// Parallelism strategy in use.
    pub mode: FederationMode,
    /// Physical transport connecting the chips.
    pub bus: CommunicationBus,
    /// Model layers assigned to this chip (pipeline mode).
    pub layers_per_chip: usize,
    /// Attention heads assigned to this chip (tensor-parallel mode).
    pub heads_per_chip: usize,
    /// Allow multiple tokens in flight through the pipeline.
    pub enable_pipelining: bool,
}
impl Default for FederationConfig {
    /// Defaults for a 5-chip SPI pipeline: this chip is id 0, two layers
    /// per chip, one head, pipelining enabled.
    fn default() -> Self {
        Self {
            num_chips: 5,
            chip_id: ChipId(0),
            mode: FederationMode::Pipeline,
            bus: CommunicationBus::Spi,
            layers_per_chip: 2,
            heads_per_chip: 1,
            enable_pipelining: true,
        }
    }
}
/// Calculate optimal federation config
///
/// Chooses pipeline parallelism when an even split of `model_size` fits in
/// `per_chip_ram`, otherwise falls back to tensor parallelism (splitting
/// attention heads instead of layers).
///
/// `num_chips == 0` is clamped to 1 instead of panicking on the
/// divide-by-zero the original code performed.
pub fn calculate_optimal_config(
    model_size: usize,
    num_layers: usize,
    num_heads: usize,
    num_chips: usize,
    per_chip_ram: usize,
) -> FederationConfig {
    // Defensive clamp: a zero-chip federation makes no sense and would
    // otherwise panic on the divisions below.
    let num_chips = num_chips.max(1);
    let model_per_chip = model_size / num_chips;
    if model_per_chip <= per_chip_ram {
        // Pipeline mode: spread layers across chips (ceiling division so
        // every layer is assigned); each chip keeps all heads.
        let layers_per_chip = (num_layers + num_chips - 1) / num_chips;
        FederationConfig {
            num_chips,
            chip_id: ChipId(0),
            mode: FederationMode::Pipeline,
            bus: CommunicationBus::Spi,
            layers_per_chip,
            heads_per_chip: num_heads,
            enable_pipelining: true,
        }
    } else {
        // Tensor-parallel mode: spread heads across chips; every chip holds
        // all layers, so pipelining is disabled.
        let heads_per_chip = (num_heads + num_chips - 1) / num_chips;
        FederationConfig {
            num_chips,
            chip_id: ChipId(0),
            mode: FederationMode::TensorParallel,
            bus: CommunicationBus::Spi,
            layers_per_chip: num_layers,
            heads_per_chip,
            enable_pipelining: false,
        }
    }
}
/// Federation speedup estimates
///
/// All factors are relative to a single standalone chip (1.0 = unchanged).
#[derive(Debug, Clone)]
pub struct FederationSpeedup {
    /// Tokens-per-second multiplier.
    pub throughput_multiplier: f32,
    /// Per-token latency improvement factor.
    pub latency_reduction: f32,
    /// Factor by which per-chip memory use shrinks.
    pub memory_per_chip_reduction: f32,
}
/// Estimate the speedup of `config` relative to a single standalone chip.
///
/// The efficiency factors (0.85 pipeline, 0.7 tensor-parallel, etc.) are
/// fixed heuristics, not measurements.
pub fn estimate_speedup(config: &FederationConfig) -> FederationSpeedup {
    let n = config.num_chips as f32;
    match config.mode {
        // Baseline: one chip, nothing changes.
        FederationMode::Standalone => FederationSpeedup {
            throughput_multiplier: 1.0,
            latency_reduction: 1.0,
            memory_per_chip_reduction: 1.0,
        },
        // Near-linear throughput; per-token latency slightly worse due to
        // inter-stage hops; memory divides evenly across stages.
        FederationMode::Pipeline => FederationSpeedup {
            throughput_multiplier: n * 0.85,
            latency_reduction: 1.0 / (1.0 + 0.1 * (n - 1.0)),
            memory_per_chip_reduction: n,
        },
        // Heads computed concurrently: both throughput and latency improve,
        // at higher communication cost (0.7 factor).
        FederationMode::TensorParallel => FederationSpeedup {
            throughput_multiplier: n * 0.7,
            latency_reduction: n * 0.7,
            memory_per_chip_reduction: n * 0.8,
        },
        // Mix of the two schemes above.
        FederationMode::Hybrid => FederationSpeedup {
            throughput_multiplier: n * 0.75,
            latency_reduction: (n / 2.0) * 0.8,
            memory_per_chip_reduction: n * 0.9,
        },
        // Draft/verify gives a chip-count-independent constant gain and no
        // memory savings (the model is replicated).
        FederationMode::Speculative => FederationSpeedup {
            throughput_multiplier: 2.5,
            latency_reduction: 2.0,
            memory_per_chip_reduction: 1.0,
        },
        // One expert per chip: high throughput, experts split memory.
        FederationMode::MixtureOfExperts => FederationSpeedup {
            throughput_multiplier: n * 0.9,
            latency_reduction: 1.5,
            memory_per_chip_reduction: n,
        },
    }
}

View File

@@ -0,0 +1,180 @@
//! Pipeline Parallelism for Multi-ESP32 Inference
use heapless::Vec as HVec;
use super::protocol::{ChipId, FederationMessage};
pub const MAX_LAYERS_PER_CHIP: usize = 4;
pub const MAX_PIPELINE_DEPTH: usize = 8;
/// Position of a chip within the pipeline chain.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum PipelineRole { Head, Middle, Tail, Standalone }
/// Static description of one chip's slice of the layer pipeline.
#[derive(Debug, Clone)]
pub struct PipelineConfig {
    /// Total chips in the pipeline.
    pub num_chips: usize,
    /// This chip's 0-based position in the chain.
    pub position: usize,
    /// First (absolute) layer index handled by this chip.
    pub layer_start: usize,
    /// Number of consecutive layers handled by this chip.
    pub layer_count: usize,
    /// Layer count of the whole model.
    pub total_layers: usize,
    /// Activation vector width.
    pub embed_dim: usize,
    /// Tokens per micro-batch (`for_chip` always sets 1).
    pub micro_batch_size: usize,
}
impl PipelineConfig {
    /// Build the configuration for the chip at `chip_pos` in a
    /// `num_chips`-stage pipeline over `total_layers` layers.
    ///
    /// Layers are distributed with ceiling division, so earlier chips may
    /// carry one more layer than later ones. If `chip_pos`'s slot starts
    /// past the last layer (more chips than layer slots), `layer_count` is
    /// 0 — the original `total_layers - layer_start` underflowed and
    /// panicked in debug builds in that case.
    pub fn for_chip(chip_pos: usize, num_chips: usize, total_layers: usize, embed_dim: usize) -> Self {
        let layers_per_chip = (total_layers + num_chips - 1) / num_chips;
        let layer_start = chip_pos * layers_per_chip;
        // saturating_sub: layer_start can exceed total_layers when
        // num_chips * layers_per_chip > total_layers; such chips get 0 layers.
        let layer_count = layers_per_chip.min(total_layers.saturating_sub(layer_start));
        Self { num_chips, position: chip_pos, layer_start, layer_count, total_layers, embed_dim, micro_batch_size: 1 }
    }

    /// Role of this chip inside the pipeline topology.
    pub fn role(&self) -> PipelineRole {
        if self.num_chips == 1 { PipelineRole::Standalone }
        else if self.position == 0 { PipelineRole::Head }
        else if self.position == self.num_chips - 1 { PipelineRole::Tail }
        else { PipelineRole::Middle }
    }

    /// Upstream neighbour, `None` for the head chip.
    pub fn prev_chip(&self) -> Option<ChipId> {
        if self.position > 0 { Some(ChipId((self.position - 1) as u8)) } else { None }
    }

    /// Downstream neighbour, `None` for the tail chip.
    pub fn next_chip(&self) -> Option<ChipId> {
        if self.position + 1 < self.num_chips { Some(ChipId((self.position + 1) as u8)) } else { None }
    }
}
/// Scheduling state of a pipeline node.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum PipelineState { WaitingInput, Processing, WaitingSend, Idle }
/// A token currently travelling through this chip's layer range.
#[derive(Debug, Clone)]
pub struct InFlightToken {
    /// Sequence position of the token.
    pub seq_pos: u16,
    /// Token id (0 for activations received mid-pipeline, where the id is unknown).
    pub token_id: u16,
    /// Next absolute layer index to run for this token.
    pub current_layer: u8,
    /// Quantized (i8) activation vector.
    pub activation: HVec<i8, 128>,
}
/// One chip's runtime state in the layer pipeline.
pub struct PipelineNode {
    /// Static layer-range assignment for this chip.
    config: PipelineConfig,
    /// Current scheduling state.
    state: PipelineState,
    /// Bus identity, derived from `config.position`.
    chip_id: ChipId,
    /// Monotonic counter assigning sequence positions to new tokens.
    seq_counter: u16,
    /// Tokens currently being processed on this chip.
    in_flight: HVec<InFlightToken, MAX_PIPELINE_DEPTH>,
    /// Tokens that finished this chip's layers, awaiting send/consumption.
    output_queue: HVec<InFlightToken, MAX_PIPELINE_DEPTH>,
    /// Id counter for barrier messages.
    barrier_counter: u16,
}
impl PipelineNode {
    /// Create a node for the given pipeline slot; the chip id is derived
    /// from the config's position.
    pub fn new(config: PipelineConfig) -> Self {
        Self {
            chip_id: ChipId(config.position as u8),
            config,
            state: PipelineState::Idle,
            seq_counter: 0,
            in_flight: HVec::new(),
            output_queue: HVec::new(),
            barrier_counter: 0,
        }
    }

    /// Current scheduling state of this node.
    pub fn state(&self) -> PipelineState { self.state }

    /// True if this chip runs the embedding stage (head of the pipeline).
    pub fn handles_embedding(&self) -> bool { matches!(self.config.role(), PipelineRole::Head | PipelineRole::Standalone) }

    /// True if this chip produces the final output (tail of the pipeline).
    pub fn handles_output(&self) -> bool { matches!(self.config.role(), PipelineRole::Tail | PipelineRole::Standalone) }

    /// Inject a fresh token at the head of the pipeline.
    ///
    /// # Errors
    /// `UnsupportedFeature` on non-head chips; `BufferOverflow` when the
    /// in-flight queue is full.
    pub fn start_token(&mut self, token_id: u16) -> crate::Result<()> {
        if !self.handles_embedding() { return Err(crate::Error::UnsupportedFeature("Not head chip")); }
        if self.in_flight.len() >= MAX_PIPELINE_DEPTH { return Err(crate::Error::BufferOverflow); }
        let token = InFlightToken { seq_pos: self.seq_counter, token_id, current_layer: 0, activation: HVec::new() };
        self.in_flight.push(token).map_err(|_| crate::Error::BufferOverflow)?;
        self.seq_counter += 1;
        self.state = PipelineState::Processing;
        Ok(())
    }

    /// Enqueue an activation received from the upstream chip.
    pub fn receive_activation(&mut self, msg: &FederationMessage) -> crate::Result<()> {
        let (layer_idx, position, data) = msg.get_activation_data()
            .ok_or(crate::Error::InvalidModel("Invalid activation"))?;
        let mut activation = HVec::new();
        for &d in data { activation.push(d as i8).map_err(|_| crate::Error::BufferOverflow)?; }
        let token = InFlightToken { seq_pos: position, token_id: 0, current_layer: layer_idx, activation };
        self.in_flight.push(token).map_err(|_| crate::Error::BufferOverflow)?;
        self.state = PipelineState::Processing;
        Ok(())
    }

    /// Run one layer of the oldest in-flight token.
    ///
    /// Returns `Ok(false)` when there is nothing to process. When the token
    /// finishes this chip's layer range it is moved (FIFO) to the output
    /// queue and the state becomes `WaitingSend`.
    pub fn process_step<F>(&mut self, mut layer_fn: F) -> crate::Result<bool>
    where F: FnMut(usize, &mut [i8]) -> crate::Result<()>
    {
        if self.in_flight.is_empty() {
            self.state = PipelineState::WaitingInput;
            return Ok(false);
        }
        let token = &mut self.in_flight[0];
        // saturating_sub guards against a message whose layer index is below
        // this chip's range: the original subtraction underflowed (panic in
        // debug, wrap in release); such a token is clamped to our first layer.
        let relative_layer = (token.current_layer as usize).saturating_sub(self.config.layer_start);
        if relative_layer < self.config.layer_count {
            let layer_idx = self.config.layer_start + relative_layer;
            layer_fn(layer_idx, &mut token.activation)?;
            token.current_layer += 1;
        }
        let next = token.current_layer as usize;
        if next >= self.config.layer_start + self.config.layer_count {
            // BUGFIX: the original `pop()`ed the *back* of `in_flight`,
            // completing the newest token although layers were run on the
            // oldest (index 0). Remove the front token we actually processed.
            let completed = self.in_flight.remove(0);
            self.output_queue.push(completed).map_err(|_| crate::Error::BufferOverflow)?;
            self.state = PipelineState::WaitingSend;
        }
        Ok(true)
    }

    /// Take the oldest completed token and wrap it in an activation message
    /// for the downstream chip.
    ///
    /// Returns `None` when the queue is empty or when there is no next chip
    /// (tail — use `get_final_output` there). BUGFIX: the original popped
    /// the token *before* checking for a next chip, silently dropping
    /// completed tokens on tail chips, and it emitted tokens in LIFO order.
    pub fn get_output(&mut self) -> Option<FederationMessage> {
        let next_chip = self.config.next_chip()?;
        if self.output_queue.is_empty() { return None; }
        let token = self.output_queue.remove(0);
        // Pass the activation slice directly; the original made a redundant
        // element-by-element copy into a second heapless Vec first.
        FederationMessage::activation(self.chip_id, next_chip, token.seq_pos, token.current_layer, token.seq_pos, &token.activation).ok()
    }

    /// Whether a finished token is waiting on the tail chip.
    pub fn has_final_output(&self) -> bool { self.handles_output() && !self.output_queue.is_empty() }

    /// Take the oldest finished activation on the tail chip (FIFO; the
    /// original popped LIFO, reordering outputs).
    pub fn get_final_output(&mut self) -> Option<HVec<i8, 128>> {
        if !self.handles_output() { return None; }
        if self.output_queue.is_empty() { return None; }
        Some(self.output_queue.remove(0).activation)
    }

    /// Snapshot of queue depths and progress counters.
    pub fn stats(&self) -> PipelineStats {
        PipelineStats {
            in_flight_count: self.in_flight.len(),
            output_queue_len: self.output_queue.len(),
            tokens_processed: self.seq_counter as usize,
            current_state: self.state,
        }
    }

    /// Build the next broadcast barrier message (increments the barrier id).
    pub fn create_barrier(&mut self) -> FederationMessage {
        self.barrier_counter += 1;
        FederationMessage::barrier(self.chip_id, self.barrier_counter)
    }
}
/// Snapshot of a node's queue depths and progress counters.
#[derive(Debug, Clone)]
pub struct PipelineStats {
    /// Tokens currently being processed on this chip.
    pub in_flight_count: usize,
    /// Completed tokens waiting to be sent or consumed.
    pub output_queue_len: usize,
    /// Tokens started on this chip so far (only head chips increment this).
    pub tokens_processed: usize,
    /// Node scheduling state at snapshot time.
    pub current_state: PipelineState,
}
/// Pipeline utilization in `[0, 1]`.
///
/// During ramp-up (`tokens <= num_chips`) only `1/num_chips` of the stages
/// are busy on average; for longer streams the `num_chips - 1` fill/drain
/// bubble amortizes away. Returns `0.0` for zero tokens — the original
/// computed `0.0 / 0.0` and returned NaN.
pub fn calculate_pipeline_efficiency(num_chips: usize, tokens: usize) -> f32 {
    if tokens == 0 {
        return 0.0;
    }
    if tokens <= num_chips {
        // Equivalent to 1 / num_chips; kept in ratio form for clarity.
        tokens as f32 / (num_chips as f32 * tokens as f32)
    } else {
        tokens as f32 / (tokens as f32 + (num_chips - 1) as f32)
    }
}

View File

@@ -0,0 +1,187 @@
//! Inter-Chip Communication Protocol
use heapless::Vec as HVec;
pub const MAX_ACTIVATION_SIZE: usize = 256;
pub const MAX_PAYLOAD_SIZE: usize = 512;
pub const PROTOCOL_VERSION: u8 = 1;
/// Logical identifier of a chip on the federation bus.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub struct ChipId(pub u8);

impl ChipId {
    /// Reserved id addressing every chip at once.
    pub const BROADCAST: ChipId = ChipId(0xFF);

    /// Whether this id is the broadcast address.
    pub fn is_broadcast(&self) -> bool { *self == Self::BROADCAST }
}
/// Wire-level message discriminator.
///
/// Ranges: 0x0x control, 0x1x tensor data, 0x2x embedding/logits/tokens,
/// 0x3x speculative decoding, 0x4x synchronization, 0xFF error.
#[derive(Debug, Clone, Copy, PartialEq)]
#[repr(u8)]
pub enum MessageType {
    Heartbeat = 0x00,
    Discovery = 0x01,
    Ready = 0x02,
    Activation = 0x10,
    KVCache = 0x11,
    Gradient = 0x12,
    EmbedRequest = 0x20,
    EmbedResponse = 0x21,
    Logits = 0x22,
    Token = 0x23,
    DraftTokens = 0x30,
    VerifyResult = 0x31,
    Barrier = 0x40,
    Ack = 0x41,
    Error = 0xFF,
}

impl From<u8> for MessageType {
    /// Decode a raw discriminator byte; unknown values map to `Error`.
    fn from(v: u8) -> Self {
        match v {
            0x00 => Self::Heartbeat,
            0x01 => Self::Discovery,
            0x02 => Self::Ready,
            0x10 => Self::Activation,
            0x11 => Self::KVCache,
            0x12 => Self::Gradient,
            0x20 => Self::EmbedRequest,
            0x21 => Self::EmbedResponse,
            0x22 => Self::Logits,
            0x23 => Self::Token,
            0x30 => Self::DraftTokens,
            0x31 => Self::VerifyResult,
            0x40 => Self::Barrier,
            0x41 => Self::Ack,
            _ => Self::Error,
        }
    }
}
/// Fixed 8-byte wire header preceding every message payload.
///
/// `repr(C, packed)` keeps the in-memory size at exactly 8 bytes;
/// multi-byte fields are serialized little-endian by `to_bytes`.
#[derive(Debug, Clone, Copy)]
#[repr(C, packed)]
pub struct MessageHeader {
    /// Protocol version (`PROTOCOL_VERSION`).
    pub version: u8,
    /// Raw `MessageType` discriminator.
    pub msg_type: u8,
    /// Sender chip id.
    pub src: u8,
    /// Destination chip id (0xFF = broadcast).
    pub dst: u8,
    /// Message sequence number.
    pub seq: u16,
    /// Payload length in bytes.
    pub payload_len: u16,
}
impl MessageHeader {
    /// Serialized size of the header in bytes.
    pub const SIZE: usize = 8;

    /// Assemble a header for an outgoing message.
    pub fn new(msg_type: MessageType, src: ChipId, dst: ChipId, seq: u16, payload_len: u16) -> Self {
        Self {
            version: PROTOCOL_VERSION,
            msg_type: msg_type as u8,
            src: src.0,
            dst: dst.0,
            seq,
            payload_len,
        }
    }

    /// Serialize to 8 bytes; `seq` and `payload_len` are little-endian.
    pub fn to_bytes(&self) -> [u8; 8] {
        let seq = self.seq.to_le_bytes();
        let len = self.payload_len.to_le_bytes();
        [self.version, self.msg_type, self.src, self.dst, seq[0], seq[1], len[0], len[1]]
    }

    /// Parse a header from the first 8 bytes of `b`; `None` if too short.
    pub fn from_bytes(b: &[u8]) -> Option<Self> {
        if b.len() < Self::SIZE { return None; }
        Some(Self {
            version: b[0],
            msg_type: b[1],
            src: b[2],
            dst: b[3],
            seq: u16::from_le_bytes([b[4], b[5]]),
            payload_len: u16::from_le_bytes([b[6], b[7]]),
        })
    }

    /// Wrapping-additive checksum over the serialized header bytes.
    pub fn checksum(&self) -> u8 {
        self.to_bytes().iter().fold(0u8, |acc, &b| acc.wrapping_add(b))
    }
}
/// A complete wire message: header, variable payload, trailing checksum.
#[derive(Debug, Clone)]
pub struct FederationMessage {
    /// Fixed 8-byte header.
    pub header: MessageHeader,
    /// Message-type-specific payload bytes.
    pub payload: HVec<u8, MAX_PAYLOAD_SIZE>,
    /// Wrapping-additive checksum over header bytes and payload.
    pub checksum: u8,
}
impl FederationMessage {
pub fn new(msg_type: MessageType, src: ChipId, dst: ChipId, seq: u16) -> Self {
Self {
header: MessageHeader::new(msg_type, src, dst, seq, 0),
payload: HVec::new(),
checksum: 0,
}
}
pub fn activation(src: ChipId, dst: ChipId, seq: u16, layer: u8, pos: u16, data: &[i8]) -> crate::Result<Self> {
let mut msg = Self::new(MessageType::Activation, src, dst, seq);
msg.payload.push(layer).map_err(|_| crate::Error::BufferOverflow)?;
msg.payload.push((pos & 0xFF) as u8).map_err(|_| crate::Error::BufferOverflow)?;
msg.payload.push((pos >> 8) as u8).map_err(|_| crate::Error::BufferOverflow)?;
for &d in data {
msg.payload.push(d as u8).map_err(|_| crate::Error::BufferOverflow)?;
}
msg.header.payload_len = msg.payload.len() as u16;
msg.update_checksum();
Ok(msg)
}
pub fn token(src: ChipId, dst: ChipId, seq: u16, token_id: u16) -> Self {
let mut msg = Self::new(MessageType::Token, src, dst, seq);
let _ = msg.payload.push((token_id & 0xFF) as u8);
let _ = msg.payload.push((token_id >> 8) as u8);
msg.header.payload_len = 2;
msg.update_checksum();
msg
}
pub fn draft_tokens(src: ChipId, dst: ChipId, seq: u16, tokens: &[u16]) -> crate::Result<Self> {
let mut msg = Self::new(MessageType::DraftTokens, src, dst, seq);
msg.payload.push(tokens.len() as u8).map_err(|_| crate::Error::BufferOverflow)?;
for &t in tokens {
msg.payload.push((t & 0xFF) as u8).map_err(|_| crate::Error::BufferOverflow)?;
msg.payload.push((t >> 8) as u8).map_err(|_| crate::Error::BufferOverflow)?;
}
msg.header.payload_len = msg.payload.len() as u16;
msg.update_checksum();
Ok(msg)
}
pub fn barrier(src: ChipId, barrier_id: u16) -> Self {
let mut msg = Self::new(MessageType::Barrier, src, ChipId::BROADCAST, 0);
let _ = msg.payload.push((barrier_id & 0xFF) as u8);
let _ = msg.payload.push((barrier_id >> 8) as u8);
msg.header.payload_len = 2;
msg.update_checksum();
msg
}
pub fn update_checksum(&mut self) {
let mut sum = self.header.checksum();
for &b in &self.payload { sum = sum.wrapping_add(b); }
self.checksum = sum;
}
pub fn verify_checksum(&self) -> bool {
let mut sum = self.header.checksum();
for &b in &self.payload { sum = sum.wrapping_add(b); }
sum == self.checksum
}
pub fn to_bytes(&self) -> HVec<u8, { MAX_PAYLOAD_SIZE + 16 }> {
let mut bytes = HVec::new();
for b in self.header.to_bytes() { let _ = bytes.push(b); }
for &b in &self.payload { let _ = bytes.push(b); }
let _ = bytes.push(self.checksum);
bytes
}
pub fn get_activation_data(&self) -> Option<(u8, u16, &[u8])> {
if self.header.msg_type != MessageType::Activation as u8 || self.payload.len() < 3 { return None; }
Some((self.payload[0], (self.payload[1] as u16) | ((self.payload[2] as u16) << 8), &self.payload[3..]))
}
pub fn get_token(&self) -> Option<u16> {
if self.header.msg_type != MessageType::Token as u8 || self.payload.len() < 2 { return None; }
Some((self.payload[0] as u16) | ((self.payload[1] as u16) << 8))
}
}
/// Link-level traffic and error counters.
#[derive(Debug, Default, Clone)]
pub struct CommStats {
    /// Messages transmitted.
    pub messages_sent: u32,
    /// Messages received.
    pub messages_received: u32,
    /// Bytes transmitted.
    pub bytes_sent: u32,
    /// Bytes received.
    pub bytes_received: u32,
    /// Checksum verification failures on receive.
    pub checksum_errors: u32,
    /// Receive operations that timed out.
    pub timeouts: u32,
}

View File

@@ -0,0 +1,146 @@
//! Speculative Decoding - Draft and Verify
use heapless::Vec as HVec;
use super::protocol::{ChipId, FederationMessage};
pub const MAX_DRAFT_TOKENS: usize = 8;
/// Configuration for draft/verify speculative decoding.
#[derive(Debug, Clone)]
pub struct DraftVerifyConfig {
    /// Number of tokens drafted per round.
    pub draft_length: usize,
    /// Accept a token when its verify prob >= draft prob * this factor.
    pub acceptance_threshold: f32,
    /// Chip that produces the drafts.
    pub draft_chip: ChipId,
    /// Chips that verify the drafted tokens.
    pub verify_chips: HVec<ChipId, 4>,
    /// Adapt `draft_length` to the observed acceptance rate.
    pub adaptive: bool,
}
impl Default for DraftVerifyConfig {
    /// Defaults: 4-token drafts, 0.9 acceptance factor, chip 0 drafts,
    /// no verify chips registered, adaptive length enabled.
    fn default() -> Self {
        Self { draft_length: 4, acceptance_threshold: 0.9, draft_chip: ChipId(0), verify_chips: HVec::new(), adaptive: true }
    }
}
impl DraftVerifyConfig {
    /// Preset for a 5-chip cluster: chip 0 drafts, chips 1-4 verify.
    pub fn for_five_chips() -> Self {
        let mut verify_chips = HVec::new();
        for id in 1u8..5 {
            // Capacity is 4 and exactly 4 ids are pushed, so this cannot fail.
            let _ = verify_chips.push(ChipId(id));
        }
        // All remaining fields match the `Default` values.
        Self { verify_chips, ..Self::default() }
    }
}
/// Tokens proposed by the draft chip for one speculation round.
#[derive(Debug, Clone)]
pub struct DraftResult {
    /// Drafted token ids, in sequence order.
    pub tokens: HVec<u16, MAX_DRAFT_TOKENS>,
    /// Draft-model probability per token, fixed-point 0-255.
    pub probs: HVec<u8, MAX_DRAFT_TOKENS>,
    /// Sequence position of the first drafted token.
    pub start_pos: u16,
}
/// Outcome of verifying one draft.
#[derive(Debug, Clone)]
pub struct VerifyResult {
    /// Length of the accepted prefix of the draft.
    pub accepted_count: usize,
    /// Replacement token for the first rejected position, if any.
    pub correction: Option<u16>,
    /// Verifier probability (0-255) for each draft token examined.
    pub verify_probs: HVec<u8, MAX_DRAFT_TOKENS>,
}
/// Per-chip state machine for draft/verify speculative decoding.
pub struct SpeculativeDecoder {
    /// Static draft/verify settings.
    config: DraftVerifyConfig,
    /// True when this chip is the configured draft chip.
    is_draft_chip: bool,
    /// Exponential moving average of the per-round acceptance fraction.
    acceptance_rate: f32,
    /// Draft awaiting its verification result.
    pending_draft: Option<DraftResult>,
    /// Cumulative draft/verify counters.
    stats: SpecStats,
}
impl SpeculativeDecoder {
    /// Create a decoder for `chip_id`; the chip matching
    /// `config.draft_chip` takes the drafting role.
    pub fn new(config: DraftVerifyConfig, chip_id: ChipId) -> Self {
        let is_draft = chip_id == config.draft_chip;
        Self {
            config,
            is_draft_chip: is_draft,
            // Optimistic prior; refined by an EMA as verifications arrive.
            acceptance_rate: 0.9,
            pending_draft: None,
            stats: SpecStats::default(),
        }
    }

    /// Whether this chip is the drafting chip.
    pub fn is_drafter(&self) -> bool { self.is_draft_chip }

    /// Broadcast a draft to the verify chips and remember it until the
    /// verification result arrives.
    ///
    /// # Errors
    /// `UnsupportedFeature` on non-draft chips; propagates `BufferOverflow`
    /// from message construction.
    pub fn submit_draft(&mut self, draft: DraftResult) -> crate::Result<FederationMessage> {
        if !self.is_draft_chip { return Err(crate::Error::UnsupportedFeature("Not draft chip")); }
        // Pass the token slice directly; the original cloned it into a
        // second heapless Vec for no benefit.
        let msg = FederationMessage::draft_tokens(self.config.draft_chip, ChipId::BROADCAST, draft.start_pos, &draft.tokens)?;
        self.pending_draft = Some(draft);
        self.stats.drafts_sent += 1;
        Ok(msg)
    }

    /// Verify drafted tokens against this chip's model.
    ///
    /// `get_prob(pos, token)` returns the verifier's probability (0-255)
    /// for `token` at position `pos`. Tokens are accepted in order until
    /// one falls below `acceptance_threshold` times its draft probability;
    /// the first rejection produces a correction token and stops.
    pub fn verify_draft<F>(&mut self, draft: &DraftResult, mut get_prob: F) -> VerifyResult
    where F: FnMut(u16, u16) -> u8
    {
        let mut accepted = 0;
        let mut correction = None;
        let mut verify_probs = HVec::new();
        for (i, &token) in draft.tokens.iter().enumerate() {
            let pos = draft.start_pos + i as u16;
            let verify_prob = get_prob(pos, token);
            let _ = verify_probs.push(verify_prob);
            // Missing draft prob defaults to 128 (~0.5 in 0-255 fixed point).
            let draft_prob = draft.probs.get(i).copied().unwrap_or(128);
            let threshold = (draft_prob as f32 * self.config.acceptance_threshold) as u8;
            if verify_prob >= threshold {
                accepted += 1;
            } else {
                // NOTE(review): placeholder correction — proper resampling
                // from the verifier distribution should replace this.
                correction = Some(token.wrapping_add(1));
                break;
            }
        }
        VerifyResult { accepted_count: accepted, correction, verify_probs }
    }

    /// Apply a verification result to the pending draft, returning the
    /// tokens to commit (accepted prefix plus optional correction) and
    /// updating the acceptance-rate EMA.
    pub fn process_verification(&mut self, result: &VerifyResult) -> HVec<u16, MAX_DRAFT_TOKENS> {
        let mut accepted_tokens = HVec::new();
        if let Some(ref draft) = self.pending_draft {
            for &token in draft.tokens.iter().take(result.accepted_count) {
                let _ = accepted_tokens.push(token);
            }
            if let Some(correct) = result.correction {
                let _ = accepted_tokens.push(correct);
            }
            self.stats.tokens_accepted += result.accepted_count;
            // saturating_sub: guard against a result claiming more accepted
            // tokens than were drafted (would underflow-panic in debug).
            self.stats.tokens_rejected += draft.tokens.len().saturating_sub(result.accepted_count);
            // BUGFIX: an empty draft previously divided by zero, turning the
            // acceptance EMA into NaN permanently. Skip the update instead.
            if !draft.tokens.is_empty() {
                let rate = result.accepted_count as f32 / draft.tokens.len() as f32;
                self.acceptance_rate = 0.9 * self.acceptance_rate + 0.1 * rate;
            }
        }
        self.pending_draft = None;
        accepted_tokens
    }

    /// Draft length to use next round; grows when acceptance is excellent,
    /// shrinks when it is poor.
    pub fn adaptive_draft_length(&self) -> usize {
        if !self.config.adaptive { return self.config.draft_length; }
        if self.acceptance_rate > 0.95 { (self.config.draft_length + 2).min(MAX_DRAFT_TOKENS) }
        else if self.acceptance_rate > 0.8 { self.config.draft_length }
        // saturating_sub: a configured draft_length of 0 would underflow in
        // the original `draft_length - 1`.
        else if self.acceptance_rate > 0.5 { self.config.draft_length.saturating_sub(1).max(1) }
        else { 1 }
    }

    /// Rough tokens-per-step speedup estimate (1.2 = verify-pass overhead).
    pub fn estimated_speedup(&self) -> f32 {
        let avg = self.acceptance_rate * self.adaptive_draft_length() as f32;
        avg / 1.2
    }

    /// Cumulative draft/verify counters.
    pub fn stats(&self) -> &SpecStats { &self.stats }
}
/// Cumulative speculative-decoding counters.
#[derive(Debug, Default, Clone)]
pub struct SpecStats {
    /// Draft batches broadcast so far.
    pub drafts_sent: usize,
    /// Drafted tokens the verifiers accepted.
    pub tokens_accepted: usize,
    /// Drafted tokens the verifiers rejected.
    pub tokens_rejected: usize,
}

impl SpecStats {
    /// Fraction of drafted tokens that were accepted; `0.0` before any
    /// verification has happened.
    pub fn acceptance_rate(&self) -> f32 {
        match self.tokens_accepted + self.tokens_rejected {
            0 => 0.0,
            total => self.tokens_accepted as f32 / total as f32,
        }
    }
}