Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'
This commit is contained in:
176
vendor/ruvector/examples/ruvLLM/esp32-flash/src/federation/mod.rs
vendored
Normal file
176
vendor/ruvector/examples/ruvLLM/esp32-flash/src/federation/mod.rs
vendored
Normal file
@@ -0,0 +1,176 @@
|
||||
//! Federation Module for Multi-Chip Distributed Inference
|
||||
//!
|
||||
//! Supports:
|
||||
//! - Pipeline parallelism (layers across chips)
|
||||
//! - Tensor parallelism (attention heads across chips)
|
||||
//! - Speculative decoding (draft/verify)
|
||||
//! - SPI/I2C/UART/ESP-NOW communication
|
||||
|
||||
pub mod protocol;
|
||||
pub mod pipeline;
|
||||
pub mod speculative;
|
||||
|
||||
pub use protocol::{
|
||||
ChipId, MessageType, MessageHeader, FederationMessage, CommStats,
|
||||
MAX_ACTIVATION_SIZE, MAX_PAYLOAD_SIZE,
|
||||
};
|
||||
pub use pipeline::{
|
||||
PipelineNode, PipelineConfig, PipelineRole, PipelineState, PipelineStats,
|
||||
InFlightToken, calculate_pipeline_efficiency,
|
||||
MAX_LAYERS_PER_CHIP, MAX_PIPELINE_DEPTH,
|
||||
};
|
||||
pub use speculative::{
|
||||
SpeculativeDecoder, DraftVerifyConfig, DraftResult, VerifyResult, SpecStats,
|
||||
MAX_DRAFT_TOKENS,
|
||||
};
|
||||
|
||||
/// Maximum number of chips in a federation.
|
||||
pub const MAX_FEDERATION_SIZE: usize = 8;
|
||||
|
||||
/// Strategy used to spread inference work across the chips of a federation.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum FederationMode {
    /// No federation; `estimate_speedup` reports 1.0 for every factor.
    Standalone,
    /// Pipeline parallelism: layers are split across chips.
    Pipeline,
    /// Tensor parallelism: attention heads are split across chips.
    TensorParallel,
    /// NOTE(review): presumably a mix of pipeline and tensor parallelism — confirm.
    Hybrid,
    /// Speculative decoding (draft/verify).
    Speculative,
    /// NOTE(review): presumably experts are distributed across chips — confirm.
    MixtureOfExperts,
}
|
||||
|
||||
/// Physical link used for chip-to-chip traffic.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum CommunicationBus {
    Spi,
    I2c,
    Uart,
    EspNow,
    Parallel,
}

impl CommunicationBus {
    /// Nominal bandwidth assumed for this bus, in bytes per second.
    pub const fn bandwidth_bytes_per_sec(&self) -> usize {
        match *self {
            CommunicationBus::Parallel => 20_000_000,
            CommunicationBus::Spi => 10_000_000,
            CommunicationBus::Uart => 500_000,
            CommunicationBus::EspNow => 125_000,
            CommunicationBus::I2c => 100_000,
        }
    }

    /// Nominal latency assumed for this bus, in microseconds.
    pub const fn latency_us(&self) -> usize {
        match *self {
            CommunicationBus::Parallel => 5,
            CommunicationBus::Spi => 10,
            CommunicationBus::Uart => 20,
            CommunicationBus::I2c => 50,
            CommunicationBus::EspNow => 500,
        }
    }
}
|
||||
|
||||
/// Federation configuration
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct FederationConfig {
|
||||
pub num_chips: usize,
|
||||
pub chip_id: ChipId,
|
||||
pub mode: FederationMode,
|
||||
pub bus: CommunicationBus,
|
||||
pub layers_per_chip: usize,
|
||||
pub heads_per_chip: usize,
|
||||
pub enable_pipelining: bool,
|
||||
}
|
||||
|
||||
impl Default for FederationConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
num_chips: 5,
|
||||
chip_id: ChipId(0),
|
||||
mode: FederationMode::Pipeline,
|
||||
bus: CommunicationBus::Spi,
|
||||
layers_per_chip: 2,
|
||||
heads_per_chip: 1,
|
||||
enable_pipelining: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Calculate optimal federation config
|
||||
pub fn calculate_optimal_config(
|
||||
model_size: usize,
|
||||
num_layers: usize,
|
||||
num_heads: usize,
|
||||
num_chips: usize,
|
||||
per_chip_ram: usize,
|
||||
) -> FederationConfig {
|
||||
let model_per_chip = model_size / num_chips;
|
||||
|
||||
if model_per_chip <= per_chip_ram {
|
||||
let layers_per_chip = (num_layers + num_chips - 1) / num_chips;
|
||||
FederationConfig {
|
||||
num_chips,
|
||||
chip_id: ChipId(0),
|
||||
mode: FederationMode::Pipeline,
|
||||
bus: CommunicationBus::Spi,
|
||||
layers_per_chip,
|
||||
heads_per_chip: num_heads,
|
||||
enable_pipelining: true,
|
||||
}
|
||||
} else {
|
||||
let heads_per_chip = (num_heads + num_chips - 1) / num_chips;
|
||||
FederationConfig {
|
||||
num_chips,
|
||||
chip_id: ChipId(0),
|
||||
mode: FederationMode::TensorParallel,
|
||||
bus: CommunicationBus::Spi,
|
||||
layers_per_chip: num_layers,
|
||||
heads_per_chip,
|
||||
enable_pipelining: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Heuristic factors produced by `estimate_speedup`.
///
/// Standalone mode yields 1.0 for every field.
#[derive(Debug, Clone)]
pub struct FederationSpeedup {
    /// Estimated throughput factor.
    pub throughput_multiplier: f32,
    /// Estimated latency factor.
    /// NOTE(review): pipeline mode yields values below 1.0 here — confirm
    /// whether larger means better.
    pub latency_reduction: f32,
    /// Estimated factor by which per-chip memory shrinks.
    pub memory_per_chip_reduction: f32,
}
|
||||
|
||||
pub fn estimate_speedup(config: &FederationConfig) -> FederationSpeedup {
|
||||
let n = config.num_chips as f32;
|
||||
match config.mode {
|
||||
FederationMode::Standalone => FederationSpeedup {
|
||||
throughput_multiplier: 1.0,
|
||||
latency_reduction: 1.0,
|
||||
memory_per_chip_reduction: 1.0,
|
||||
},
|
||||
FederationMode::Pipeline => FederationSpeedup {
|
||||
throughput_multiplier: n * 0.85,
|
||||
latency_reduction: 1.0 / (1.0 + 0.1 * (n - 1.0)),
|
||||
memory_per_chip_reduction: n,
|
||||
},
|
||||
FederationMode::TensorParallel => FederationSpeedup {
|
||||
throughput_multiplier: n * 0.7,
|
||||
latency_reduction: n * 0.7,
|
||||
memory_per_chip_reduction: n * 0.8,
|
||||
},
|
||||
FederationMode::Hybrid => FederationSpeedup {
|
||||
throughput_multiplier: n * 0.75,
|
||||
latency_reduction: (n / 2.0) * 0.8,
|
||||
memory_per_chip_reduction: n * 0.9,
|
||||
},
|
||||
FederationMode::Speculative => FederationSpeedup {
|
||||
throughput_multiplier: 2.5,
|
||||
latency_reduction: 2.0,
|
||||
memory_per_chip_reduction: 1.0,
|
||||
},
|
||||
FederationMode::MixtureOfExperts => FederationSpeedup {
|
||||
throughput_multiplier: n * 0.9,
|
||||
latency_reduction: 1.5,
|
||||
memory_per_chip_reduction: n,
|
||||
},
|
||||
}
|
||||
}
|
||||
180
vendor/ruvector/examples/ruvLLM/esp32-flash/src/federation/pipeline.rs
vendored
Normal file
180
vendor/ruvector/examples/ruvLLM/esp32-flash/src/federation/pipeline.rs
vendored
Normal file
@@ -0,0 +1,180 @@
|
||||
//! Pipeline Parallelism for Multi-ESP32 Inference
|
||||
|
||||
use heapless::Vec as HVec;
|
||||
use super::protocol::{ChipId, FederationMessage};
|
||||
|
||||
/// Presumably the largest number of layers one chip may host.
/// NOTE(review): nothing in this file references the constant — confirm
/// where the limit is enforced.
pub const MAX_LAYERS_PER_CHIP: usize = 4;
|
||||
/// Capacity of each `PipelineNode` token queue (in-flight and output).
pub const MAX_PIPELINE_DEPTH: usize = 8;
|
||||
|
||||
/// Where a chip sits in the pipeline, as reported by `PipelineConfig::role`.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum PipelineRole {
    /// First chip (position 0) of a multi-chip pipeline.
    Head,
    /// A chip that is neither first nor last.
    Middle,
    /// Last chip of a multi-chip pipeline.
    Tail,
    /// The only chip of a one-chip pipeline.
    Standalone,
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct PipelineConfig {
|
||||
pub num_chips: usize,
|
||||
pub position: usize,
|
||||
pub layer_start: usize,
|
||||
pub layer_count: usize,
|
||||
pub total_layers: usize,
|
||||
pub embed_dim: usize,
|
||||
pub micro_batch_size: usize,
|
||||
}
|
||||
|
||||
impl PipelineConfig {
|
||||
pub fn for_chip(chip_pos: usize, num_chips: usize, total_layers: usize, embed_dim: usize) -> Self {
|
||||
let layers_per_chip = (total_layers + num_chips - 1) / num_chips;
|
||||
let layer_start = chip_pos * layers_per_chip;
|
||||
let layer_count = layers_per_chip.min(total_layers - layer_start);
|
||||
Self { num_chips, position: chip_pos, layer_start, layer_count, total_layers, embed_dim, micro_batch_size: 1 }
|
||||
}
|
||||
|
||||
pub fn role(&self) -> PipelineRole {
|
||||
if self.num_chips == 1 { PipelineRole::Standalone }
|
||||
else if self.position == 0 { PipelineRole::Head }
|
||||
else if self.position == self.num_chips - 1 { PipelineRole::Tail }
|
||||
else { PipelineRole::Middle }
|
||||
}
|
||||
|
||||
pub fn prev_chip(&self) -> Option<ChipId> {
|
||||
if self.position > 0 { Some(ChipId((self.position - 1) as u8)) } else { None }
|
||||
}
|
||||
|
||||
pub fn next_chip(&self) -> Option<ChipId> {
|
||||
if self.position + 1 < self.num_chips { Some(ChipId((self.position + 1) as u8)) } else { None }
|
||||
}
|
||||
}
|
||||
|
||||
/// Activity state tracked by a `PipelineNode`.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum PipelineState {
    /// `process_step` found nothing in flight.
    WaitingInput,
    /// A token was started or an activation was received.
    Processing,
    /// A token finished this chip's layers and sits in the output queue.
    WaitingSend,
    /// Initial state of a freshly created node.
    Idle,
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct InFlightToken {
|
||||
pub seq_pos: u16,
|
||||
pub token_id: u16,
|
||||
pub current_layer: u8,
|
||||
pub activation: HVec<i8, 128>,
|
||||
}
|
||||
|
||||
pub struct PipelineNode {
|
||||
config: PipelineConfig,
|
||||
state: PipelineState,
|
||||
chip_id: ChipId,
|
||||
seq_counter: u16,
|
||||
in_flight: HVec<InFlightToken, MAX_PIPELINE_DEPTH>,
|
||||
output_queue: HVec<InFlightToken, MAX_PIPELINE_DEPTH>,
|
||||
barrier_counter: u16,
|
||||
}
|
||||
|
||||
impl PipelineNode {
|
||||
pub fn new(config: PipelineConfig) -> Self {
|
||||
Self {
|
||||
chip_id: ChipId(config.position as u8),
|
||||
config,
|
||||
state: PipelineState::Idle,
|
||||
seq_counter: 0,
|
||||
in_flight: HVec::new(),
|
||||
output_queue: HVec::new(),
|
||||
barrier_counter: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn state(&self) -> PipelineState { self.state }
|
||||
pub fn handles_embedding(&self) -> bool { matches!(self.config.role(), PipelineRole::Head | PipelineRole::Standalone) }
|
||||
pub fn handles_output(&self) -> bool { matches!(self.config.role(), PipelineRole::Tail | PipelineRole::Standalone) }
|
||||
|
||||
pub fn start_token(&mut self, token_id: u16) -> crate::Result<()> {
|
||||
if !self.handles_embedding() { return Err(crate::Error::UnsupportedFeature("Not head chip")); }
|
||||
if self.in_flight.len() >= MAX_PIPELINE_DEPTH { return Err(crate::Error::BufferOverflow); }
|
||||
|
||||
let token = InFlightToken { seq_pos: self.seq_counter, token_id, current_layer: 0, activation: HVec::new() };
|
||||
self.in_flight.push(token).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
self.seq_counter += 1;
|
||||
self.state = PipelineState::Processing;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn receive_activation(&mut self, msg: &FederationMessage) -> crate::Result<()> {
|
||||
let (layer_idx, position, data) = msg.get_activation_data()
|
||||
.ok_or(crate::Error::InvalidModel("Invalid activation"))?;
|
||||
|
||||
let mut activation = HVec::new();
|
||||
for &d in data { activation.push(d as i8).map_err(|_| crate::Error::BufferOverflow)?; }
|
||||
|
||||
let token = InFlightToken { seq_pos: position, token_id: 0, current_layer: layer_idx, activation };
|
||||
self.in_flight.push(token).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
self.state = PipelineState::Processing;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn process_step<F>(&mut self, mut layer_fn: F) -> crate::Result<bool>
|
||||
where F: FnMut(usize, &mut [i8]) -> crate::Result<()>
|
||||
{
|
||||
if self.in_flight.is_empty() {
|
||||
self.state = PipelineState::WaitingInput;
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
let token = &mut self.in_flight[0];
|
||||
let relative_layer = token.current_layer as usize - self.config.layer_start;
|
||||
|
||||
if relative_layer < self.config.layer_count {
|
||||
let layer_idx = self.config.layer_start + relative_layer;
|
||||
layer_fn(layer_idx, &mut token.activation)?;
|
||||
token.current_layer += 1;
|
||||
}
|
||||
|
||||
let next = token.current_layer as usize;
|
||||
if next >= self.config.layer_start + self.config.layer_count {
|
||||
if let Some(completed) = self.in_flight.pop() {
|
||||
self.output_queue.push(completed).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
}
|
||||
self.state = PipelineState::WaitingSend;
|
||||
}
|
||||
Ok(true)
|
||||
}
|
||||
|
||||
pub fn get_output(&mut self) -> Option<FederationMessage> {
|
||||
if self.output_queue.is_empty() { return None; }
|
||||
let token = self.output_queue.pop()?;
|
||||
let next_chip = self.config.next_chip()?;
|
||||
let data: heapless::Vec<i8, 128> = token.activation.iter().cloned().collect();
|
||||
FederationMessage::activation(self.chip_id, next_chip, token.seq_pos, token.current_layer, token.seq_pos, &data).ok()
|
||||
}
|
||||
|
||||
pub fn has_final_output(&self) -> bool { self.handles_output() && !self.output_queue.is_empty() }
|
||||
|
||||
pub fn get_final_output(&mut self) -> Option<HVec<i8, 128>> {
|
||||
if !self.handles_output() { return None; }
|
||||
self.output_queue.pop().map(|t| t.activation)
|
||||
}
|
||||
|
||||
pub fn stats(&self) -> PipelineStats {
|
||||
PipelineStats {
|
||||
in_flight_count: self.in_flight.len(),
|
||||
output_queue_len: self.output_queue.len(),
|
||||
tokens_processed: self.seq_counter as usize,
|
||||
current_state: self.state,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn create_barrier(&mut self) -> FederationMessage {
|
||||
self.barrier_counter += 1;
|
||||
FederationMessage::barrier(self.chip_id, self.barrier_counter)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct PipelineStats {
|
||||
pub in_flight_count: usize,
|
||||
pub output_queue_len: usize,
|
||||
pub tokens_processed: usize,
|
||||
pub current_state: PipelineState,
|
||||
}
|
||||
|
||||
/// Estimate the efficiency of a `num_chips`-deep pipeline over `tokens` tokens.
///
/// Returns 0.0 when either argument is zero; these inputs previously gave
/// NaN or an integer underflow. With at most as many tokens as chips the
/// result is `tokens / (num_chips * tokens)`; otherwise it is
/// `tokens / (tokens + num_chips - 1)`.
pub fn calculate_pipeline_efficiency(num_chips: usize, tokens: usize) -> f32 {
    if num_chips == 0 || tokens == 0 {
        return 0.0;
    }

    let token_count = tokens as f32;
    if tokens <= num_chips {
        token_count / (num_chips as f32 * token_count)
    } else {
        token_count / (token_count + (num_chips - 1) as f32)
    }
}
|
||||
187
vendor/ruvector/examples/ruvLLM/esp32-flash/src/federation/protocol.rs
vendored
Normal file
187
vendor/ruvector/examples/ruvLLM/esp32-flash/src/federation/protocol.rs
vendored
Normal file
@@ -0,0 +1,187 @@
|
||||
//! Inter-Chip Communication Protocol
|
||||
|
||||
use heapless::Vec as HVec;
|
||||
|
||||
/// Presumably the largest number of activation values in one message.
/// NOTE(review): nothing in this file references the constant, and pipeline
/// tokens hold at most 128 values (`HVec<i8, 128>`) — confirm intended use.
pub const MAX_ACTIVATION_SIZE: usize = 256;
|
||||
/// Capacity in bytes of a `FederationMessage` payload.
pub const MAX_PAYLOAD_SIZE: usize = 512;
|
||||
/// Version byte that `MessageHeader::new` writes into every header.
pub const PROTOCOL_VERSION: u8 = 1;
|
||||
|
||||
/// One-byte identifier of a chip in the federation.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub struct ChipId(pub u8);

impl ChipId {
    /// Reserved id used as the destination of broadcast messages.
    pub const BROADCAST: ChipId = ChipId(0xFF);

    /// Returns `true` when this id is the broadcast id.
    pub fn is_broadcast(&self) -> bool {
        *self == Self::BROADCAST
    }
}
|
||||
|
||||
/// Message kind, stored as one byte in the message header.
#[derive(Debug, Clone, Copy, PartialEq)]
#[repr(u8)]
pub enum MessageType {
    Heartbeat = 0x00,
    Discovery = 0x01,
    Ready = 0x02,

    /// Payload: layer index, position, activation values.
    Activation = 0x10,
    KVCache = 0x11,
    Gradient = 0x12,

    EmbedRequest = 0x20,
    EmbedResponse = 0x21,
    Logits = 0x22,
    /// Payload: a single token id.
    Token = 0x23,

    /// Payload: token count followed by the drafted token ids.
    DraftTokens = 0x30,
    VerifyResult = 0x31,

    /// Payload: a barrier id.
    Barrier = 0x40,
    Ack = 0x41,

    /// Also the fallback for any byte that maps to no other variant.
    Error = 0xFF,
}

impl From<u8> for MessageType {
    /// Decode a header byte; unknown values become [`MessageType::Error`].
    fn from(v: u8) -> Self {
        match v {
            0x00 => MessageType::Heartbeat,
            0x01 => MessageType::Discovery,
            0x02 => MessageType::Ready,
            0x10 => MessageType::Activation,
            0x11 => MessageType::KVCache,
            0x12 => MessageType::Gradient,
            0x20 => MessageType::EmbedRequest,
            0x21 => MessageType::EmbedResponse,
            0x22 => MessageType::Logits,
            0x23 => MessageType::Token,
            0x30 => MessageType::DraftTokens,
            0x31 => MessageType::VerifyResult,
            0x40 => MessageType::Barrier,
            0x41 => MessageType::Ack,
            _ => MessageType::Error,
        }
    }
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
#[repr(C, packed)]
|
||||
pub struct MessageHeader {
|
||||
pub version: u8,
|
||||
pub msg_type: u8,
|
||||
pub src: u8,
|
||||
pub dst: u8,
|
||||
pub seq: u16,
|
||||
pub payload_len: u16,
|
||||
}
|
||||
|
||||
impl MessageHeader {
|
||||
pub const SIZE: usize = 8;
|
||||
|
||||
pub fn new(msg_type: MessageType, src: ChipId, dst: ChipId, seq: u16, payload_len: u16) -> Self {
|
||||
Self { version: PROTOCOL_VERSION, msg_type: msg_type as u8, src: src.0, dst: dst.0, seq, payload_len }
|
||||
}
|
||||
|
||||
pub fn to_bytes(&self) -> [u8; 8] {
|
||||
[self.version, self.msg_type, self.src, self.dst,
|
||||
(self.seq & 0xFF) as u8, (self.seq >> 8) as u8,
|
||||
(self.payload_len & 0xFF) as u8, (self.payload_len >> 8) as u8]
|
||||
}
|
||||
|
||||
pub fn from_bytes(b: &[u8]) -> Option<Self> {
|
||||
if b.len() < 8 { return None; }
|
||||
Some(Self {
|
||||
version: b[0], msg_type: b[1], src: b[2], dst: b[3],
|
||||
seq: (b[4] as u16) | ((b[5] as u16) << 8),
|
||||
payload_len: (b[6] as u16) | ((b[7] as u16) << 8),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn checksum(&self) -> u8 {
|
||||
self.to_bytes().iter().fold(0u8, |acc, &b| acc.wrapping_add(b))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct FederationMessage {
|
||||
pub header: MessageHeader,
|
||||
pub payload: HVec<u8, MAX_PAYLOAD_SIZE>,
|
||||
pub checksum: u8,
|
||||
}
|
||||
|
||||
impl FederationMessage {
|
||||
pub fn new(msg_type: MessageType, src: ChipId, dst: ChipId, seq: u16) -> Self {
|
||||
Self {
|
||||
header: MessageHeader::new(msg_type, src, dst, seq, 0),
|
||||
payload: HVec::new(),
|
||||
checksum: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn activation(src: ChipId, dst: ChipId, seq: u16, layer: u8, pos: u16, data: &[i8]) -> crate::Result<Self> {
|
||||
let mut msg = Self::new(MessageType::Activation, src, dst, seq);
|
||||
msg.payload.push(layer).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
msg.payload.push((pos & 0xFF) as u8).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
msg.payload.push((pos >> 8) as u8).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
for &d in data {
|
||||
msg.payload.push(d as u8).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
}
|
||||
msg.header.payload_len = msg.payload.len() as u16;
|
||||
msg.update_checksum();
|
||||
Ok(msg)
|
||||
}
|
||||
|
||||
pub fn token(src: ChipId, dst: ChipId, seq: u16, token_id: u16) -> Self {
|
||||
let mut msg = Self::new(MessageType::Token, src, dst, seq);
|
||||
let _ = msg.payload.push((token_id & 0xFF) as u8);
|
||||
let _ = msg.payload.push((token_id >> 8) as u8);
|
||||
msg.header.payload_len = 2;
|
||||
msg.update_checksum();
|
||||
msg
|
||||
}
|
||||
|
||||
pub fn draft_tokens(src: ChipId, dst: ChipId, seq: u16, tokens: &[u16]) -> crate::Result<Self> {
|
||||
let mut msg = Self::new(MessageType::DraftTokens, src, dst, seq);
|
||||
msg.payload.push(tokens.len() as u8).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
for &t in tokens {
|
||||
msg.payload.push((t & 0xFF) as u8).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
msg.payload.push((t >> 8) as u8).map_err(|_| crate::Error::BufferOverflow)?;
|
||||
}
|
||||
msg.header.payload_len = msg.payload.len() as u16;
|
||||
msg.update_checksum();
|
||||
Ok(msg)
|
||||
}
|
||||
|
||||
pub fn barrier(src: ChipId, barrier_id: u16) -> Self {
|
||||
let mut msg = Self::new(MessageType::Barrier, src, ChipId::BROADCAST, 0);
|
||||
let _ = msg.payload.push((barrier_id & 0xFF) as u8);
|
||||
let _ = msg.payload.push((barrier_id >> 8) as u8);
|
||||
msg.header.payload_len = 2;
|
||||
msg.update_checksum();
|
||||
msg
|
||||
}
|
||||
|
||||
pub fn update_checksum(&mut self) {
|
||||
let mut sum = self.header.checksum();
|
||||
for &b in &self.payload { sum = sum.wrapping_add(b); }
|
||||
self.checksum = sum;
|
||||
}
|
||||
|
||||
pub fn verify_checksum(&self) -> bool {
|
||||
let mut sum = self.header.checksum();
|
||||
for &b in &self.payload { sum = sum.wrapping_add(b); }
|
||||
sum == self.checksum
|
||||
}
|
||||
|
||||
pub fn to_bytes(&self) -> HVec<u8, { MAX_PAYLOAD_SIZE + 16 }> {
|
||||
let mut bytes = HVec::new();
|
||||
for b in self.header.to_bytes() { let _ = bytes.push(b); }
|
||||
for &b in &self.payload { let _ = bytes.push(b); }
|
||||
let _ = bytes.push(self.checksum);
|
||||
bytes
|
||||
}
|
||||
|
||||
pub fn get_activation_data(&self) -> Option<(u8, u16, &[u8])> {
|
||||
if self.header.msg_type != MessageType::Activation as u8 || self.payload.len() < 3 { return None; }
|
||||
Some((self.payload[0], (self.payload[1] as u16) | ((self.payload[2] as u16) << 8), &self.payload[3..]))
|
||||
}
|
||||
|
||||
pub fn get_token(&self) -> Option<u16> {
|
||||
if self.header.msg_type != MessageType::Token as u8 || self.payload.len() < 2 { return None; }
|
||||
Some((self.payload[0] as u16) | ((self.payload[1] as u16) << 8))
|
||||
}
|
||||
}
|
||||
|
||||
/// Communication counters; every field starts at zero.
#[derive(Debug, Default, Clone)]
pub struct CommStats {
    /// Messages sent.
    pub messages_sent: u32,
    /// Messages received.
    pub messages_received: u32,
    /// Bytes sent.
    pub bytes_sent: u32,
    /// Bytes received.
    pub bytes_received: u32,
    /// Messages whose checksum did not verify.
    pub checksum_errors: u32,
    /// Operations that timed out.
    pub timeouts: u32,
}
|
||||
146
vendor/ruvector/examples/ruvLLM/esp32-flash/src/federation/speculative.rs
vendored
Normal file
146
vendor/ruvector/examples/ruvLLM/esp32-flash/src/federation/speculative.rs
vendored
Normal file
@@ -0,0 +1,146 @@
|
||||
//! Speculative Decoding - Draft and Verify
|
||||
|
||||
use heapless::Vec as HVec;
|
||||
use super::protocol::{ChipId, FederationMessage};
|
||||
|
||||
/// Capacity of the token and probability vectors of one draft.
pub const MAX_DRAFT_TOKENS: usize = 8;
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct DraftVerifyConfig {
|
||||
pub draft_length: usize,
|
||||
pub acceptance_threshold: f32,
|
||||
pub draft_chip: ChipId,
|
||||
pub verify_chips: HVec<ChipId, 4>,
|
||||
pub adaptive: bool,
|
||||
}
|
||||
|
||||
impl Default for DraftVerifyConfig {
|
||||
fn default() -> Self {
|
||||
Self { draft_length: 4, acceptance_threshold: 0.9, draft_chip: ChipId(0), verify_chips: HVec::new(), adaptive: true }
|
||||
}
|
||||
}
|
||||
|
||||
impl DraftVerifyConfig {
|
||||
pub fn for_five_chips() -> Self {
|
||||
let mut verify_chips = HVec::new();
|
||||
for i in 1..5 { let _ = verify_chips.push(ChipId(i)); }
|
||||
Self { draft_length: 4, acceptance_threshold: 0.9, draft_chip: ChipId(0), verify_chips, adaptive: true }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct DraftResult {
|
||||
pub tokens: HVec<u16, MAX_DRAFT_TOKENS>,
|
||||
pub probs: HVec<u8, MAX_DRAFT_TOKENS>,
|
||||
pub start_pos: u16,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct VerifyResult {
|
||||
pub accepted_count: usize,
|
||||
pub correction: Option<u16>,
|
||||
pub verify_probs: HVec<u8, MAX_DRAFT_TOKENS>,
|
||||
}
|
||||
|
||||
pub struct SpeculativeDecoder {
|
||||
config: DraftVerifyConfig,
|
||||
is_draft_chip: bool,
|
||||
acceptance_rate: f32,
|
||||
pending_draft: Option<DraftResult>,
|
||||
stats: SpecStats,
|
||||
}
|
||||
|
||||
impl SpeculativeDecoder {
|
||||
pub fn new(config: DraftVerifyConfig, chip_id: ChipId) -> Self {
|
||||
let is_draft = chip_id == config.draft_chip;
|
||||
Self { config, is_draft_chip: is_draft, acceptance_rate: 0.9, pending_draft: None, stats: SpecStats::default() }
|
||||
}
|
||||
|
||||
pub fn is_drafter(&self) -> bool { self.is_draft_chip }
|
||||
|
||||
pub fn submit_draft(&mut self, draft: DraftResult) -> crate::Result<FederationMessage> {
|
||||
if !self.is_draft_chip { return Err(crate::Error::UnsupportedFeature("Not draft chip")); }
|
||||
let tokens: heapless::Vec<u16, MAX_DRAFT_TOKENS> = draft.tokens.iter().cloned().collect();
|
||||
let msg = FederationMessage::draft_tokens(self.config.draft_chip, ChipId::BROADCAST, draft.start_pos, &tokens)?;
|
||||
self.pending_draft = Some(draft);
|
||||
self.stats.drafts_sent += 1;
|
||||
Ok(msg)
|
||||
}
|
||||
|
||||
pub fn verify_draft<F>(&mut self, draft: &DraftResult, mut get_prob: F) -> VerifyResult
|
||||
where F: FnMut(u16, u16) -> u8
|
||||
{
|
||||
let mut accepted = 0;
|
||||
let mut correction = None;
|
||||
let mut verify_probs = HVec::new();
|
||||
|
||||
for (i, &token) in draft.tokens.iter().enumerate() {
|
||||
let pos = draft.start_pos + i as u16;
|
||||
let verify_prob = get_prob(pos, token);
|
||||
let _ = verify_probs.push(verify_prob);
|
||||
let draft_prob = draft.probs.get(i).copied().unwrap_or(128);
|
||||
let threshold = (draft_prob as f32 * self.config.acceptance_threshold) as u8;
|
||||
|
||||
if verify_prob >= threshold {
|
||||
accepted += 1;
|
||||
} else {
|
||||
correction = Some(token.wrapping_add(1));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
VerifyResult { accepted_count: accepted, correction, verify_probs }
|
||||
}
|
||||
|
||||
pub fn process_verification(&mut self, result: &VerifyResult) -> HVec<u16, MAX_DRAFT_TOKENS> {
|
||||
let mut accepted_tokens = HVec::new();
|
||||
|
||||
if let Some(ref draft) = self.pending_draft {
|
||||
for i in 0..result.accepted_count {
|
||||
if let Some(&token) = draft.tokens.get(i) {
|
||||
let _ = accepted_tokens.push(token);
|
||||
}
|
||||
}
|
||||
if let Some(correct) = result.correction {
|
||||
let _ = accepted_tokens.push(correct);
|
||||
}
|
||||
|
||||
self.stats.tokens_accepted += result.accepted_count;
|
||||
self.stats.tokens_rejected += draft.tokens.len() - result.accepted_count;
|
||||
let rate = result.accepted_count as f32 / draft.tokens.len() as f32;
|
||||
self.acceptance_rate = 0.9 * self.acceptance_rate + 0.1 * rate;
|
||||
}
|
||||
|
||||
self.pending_draft = None;
|
||||
accepted_tokens
|
||||
}
|
||||
|
||||
pub fn adaptive_draft_length(&self) -> usize {
|
||||
if !self.config.adaptive { return self.config.draft_length; }
|
||||
if self.acceptance_rate > 0.95 { (self.config.draft_length + 2).min(MAX_DRAFT_TOKENS) }
|
||||
else if self.acceptance_rate > 0.8 { self.config.draft_length }
|
||||
else if self.acceptance_rate > 0.5 { (self.config.draft_length - 1).max(1) }
|
||||
else { 1 }
|
||||
}
|
||||
|
||||
pub fn estimated_speedup(&self) -> f32 {
|
||||
let avg = self.acceptance_rate * self.adaptive_draft_length() as f32;
|
||||
avg / 1.2
|
||||
}
|
||||
|
||||
pub fn stats(&self) -> &SpecStats { &self.stats }
|
||||
}
|
||||
|
||||
/// Running counters for speculative decoding.
#[derive(Debug, Default, Clone)]
pub struct SpecStats {
    /// Number of drafts submitted.
    pub drafts_sent: usize,
    /// Number of drafted tokens that were accepted.
    pub tokens_accepted: usize,
    /// Number of drafted tokens that were rejected.
    pub tokens_rejected: usize,
}

impl SpecStats {
    /// Fraction of drafted tokens that were accepted, or 0.0 before any
    /// token has been counted.
    pub fn acceptance_rate(&self) -> f32 {
        match self.tokens_accepted + self.tokens_rejected {
            0 => 0.0,
            total => self.tokens_accepted as f32 / total as f32,
        }
    }
}
|
||||
Reference in New Issue
Block a user