Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'

This commit is contained in:
ruv
2026-02-28 14:39:40 -05:00
7854 changed files with 3522914 additions and 0 deletions

View File

@@ -0,0 +1,360 @@
//! RuvLLM ESP32 Demo Application
//!
//! Demonstrates tiny LLM inference on ESP32 microcontrollers.
#![cfg_attr(feature = "no_std", no_std)]
#![cfg_attr(feature = "no_std", no_main)]
#[cfg(feature = "esp32-std")]
use esp_idf_svc::hal::prelude::*;
#[cfg(feature = "no_std")]
extern crate alloc;
// For host testing, import from crate
#[cfg(feature = "host-test")]
use ruvllm_esp32::prelude::*;
#[cfg(feature = "host-test")]
use ruvllm_esp32::model::ModelConfig;
#[cfg(feature = "host-test")]
use ruvllm_esp32::embedding::SimpleTokenizer;
// For ESP32 builds
#[cfg(feature = "esp32-std")]
use ruvllm_esp32::prelude::*;
#[cfg(feature = "esp32-std")]
use ruvllm_esp32::model::ModelConfig;
#[cfg(feature = "esp32-std")]
use ruvllm_esp32::embedding::SimpleTokenizer;
/// Firmware entry point for `esp32-std` builds.
///
/// Initializes ESP-IDF and its logger, builds a model configured for the
/// selected chip variant, logs the configuration and memory footprint, then
/// runs the benchmark followed by the canned-prompt demo.
///
/// # Errors
///
/// Propagates failures from model or engine construction and from either
/// demo stage.
#[cfg(feature = "esp32-std")]
fn main() -> anyhow::Result<()> {
    // ESP-IDF setup happens before the first log call.
    esp_idf_svc::sys::link_patches();
    esp_idf_svc::log::EspLogger::initialize_default();

    log::info!("=== RuvLLM ESP32 Demo ===");
    log::info!("Initializing...");

    // The chip variant drives the model configuration below.
    let chip = detect_variant();
    log::info!("Detected variant: {:?}", chip);
    log::info!("Available RAM: {} KB", chip.sram_bytes() / 1024);
    log::info!("Max model RAM: {} KB", chip.max_model_ram() / 1024);

    // Report the configuration chosen for this chip.
    let model_cfg = ModelConfig::for_variant(chip);
    log::info!("Model config:");
    log::info!(" Vocab size: {}", model_cfg.vocab_size);
    log::info!(" Embed dim: {}", model_cfg.embed_dim);
    log::info!(" Hidden dim: {}", model_cfg.hidden_dim);
    log::info!(" Layers: {}", model_cfg.num_layers);
    log::info!(" Heads: {}", model_cfg.num_heads);
    log::info!(
        " Estimated size: {} KB",
        model_cfg.estimate_size() / 1024
    );

    // Build the model, then hand it over to the inference engine.
    log::info!("Creating model...");
    let tiny_model = TinyModel::new(model_cfg)?;
    log::info!(
        "Model created, actual size: {} KB",
        tiny_model.memory_size() / 1024
    );

    log::info!("Creating inference engine...");
    let mut engine = MicroEngine::new(tiny_model)?;

    let mem = engine.memory_usage();
    log::info!("Memory usage breakdown:");
    log::info!(" Model weights: {} KB", mem.model_weights / 1024);
    log::info!(" Activation buffers: {} KB", mem.activation_buffers / 1024);
    log::info!(" KV cache: {} KB", mem.kv_cache / 1024);
    log::info!(" Total: {} KB", mem.total / 1024);

    // Timed generation runs first.
    log::info!("Running inference benchmark...");
    run_benchmark(&mut engine)?;

    // Then the fixed demo prompts; its result is the result of `main`.
    log::info!("Starting interactive demo...");
    run_interactive(&mut engine)
}
/// Host-simulation entry point (`host-test` feature).
///
/// Follows the same flow as the device build but prints to stdout and uses a
/// fixed `Esp32Variant::Esp32` instead of probing hardware: build the model
/// and engine, print their memory footprint, run the host benchmark, then the
/// canned-prompt demo.
///
/// # Errors
///
/// Propagates failures from model or engine construction and from either
/// demo stage.
#[cfg(feature = "host-test")]
fn main() -> anyhow::Result<()> {
    println!("=== RuvLLM ESP32 Demo (Host Simulation) ===");
    println!("Initializing...");

    // The variant is hard-coded for the simulation.
    let simulated = Esp32Variant::Esp32;
    println!("Simulating variant: {:?}", simulated);
    let sram_kb = simulated.sram_bytes() / 1024;
    println!("Available RAM: {} KB", sram_kb);
    let model_budget_kb = simulated.max_model_ram() / 1024;
    println!("Max model RAM: {} KB", model_budget_kb);

    // Report the configuration chosen for the simulated chip.
    let model_cfg = ModelConfig::for_variant(simulated);
    println!("Model config:");
    println!(" Vocab size: {}", model_cfg.vocab_size);
    println!(" Embed dim: {}", model_cfg.embed_dim);
    println!(" Hidden dim: {}", model_cfg.hidden_dim);
    println!(" Layers: {}", model_cfg.num_layers);
    println!(" Heads: {}", model_cfg.num_heads);
    let estimated_kb = model_cfg.estimate_size() / 1024;
    println!(" Estimated size: {} KB", estimated_kb);

    // Build the model, then hand it over to the inference engine.
    println!("Creating model...");
    let tiny_model = TinyModel::new(model_cfg)?;
    let actual_kb = tiny_model.memory_size() / 1024;
    println!("Model created, actual size: {} KB", actual_kb);

    println!("Creating inference engine...");
    let mut engine = MicroEngine::new(tiny_model)?;

    let mem = engine.memory_usage();
    println!("Memory usage breakdown:");
    println!(" Model weights: {} KB", mem.model_weights / 1024);
    println!(" Activation buffers: {} KB", mem.activation_buffers / 1024);
    println!(" KV cache: {} KB", mem.kv_cache / 1024);
    println!(" Total: {} KB", mem.total / 1024);

    // Timed generation runs first.
    println!("\nRunning inference benchmark...");
    run_benchmark_host(&mut engine)?;

    // Then the fixed demo prompts; its result is the result of `main`.
    println!("\nStarting interactive demo...");
    run_interactive_host(&mut engine)
}
/// Benchmark token generation on the host and print timing statistics.
///
/// Performs one untimed warmup generation, then `NUM_RUNS` timed generations
/// of up to 10 greedy tokens from a fixed five-token prompt, resetting the
/// engine after every generation. Prints per-run figures, aggregate results,
/// a rough ESP32 projection and the engine's performance counters.
///
/// Throughput and per-token latency are computed from the run totals rather
/// than from the integer-truncated averages, so runs that yield differing
/// token counts do not skew the figures. Throughput is reported as `0.0` if
/// no time was measured, and the latency divisor is clamped to one token.
///
/// # Errors
///
/// Propagates any error returned by `engine.generate`.
#[cfg(feature = "host-test")]
fn run_benchmark_host(engine: &mut MicroEngine) -> anyhow::Result<()> {
    use std::time::Instant;

    const NUM_RUNS: usize = 10;
    // Rough host-to-ESP32 slowdown factor used for the projection below.
    const ESP32_SLOWDOWN: u64 = 15;

    let config = InferenceConfig {
        max_tokens: 10,
        greedy: true,
        ..Default::default()
    };
    let prompt = [1u16, 2, 3, 4, 5];

    // Warmup: one generation whose output and timing are discarded.
    println!("Warmup run...");
    let _ = engine.generate(&prompt, &config)?;
    engine.reset();

    // Timed runs.
    let mut total_time_us = 0u64;
    let mut total_tokens = 0usize;
    println!("Running {} benchmark iterations...", NUM_RUNS);
    for i in 0..NUM_RUNS {
        let start = Instant::now();
        let result = engine.generate(&prompt, &config)?;
        let elapsed = start.elapsed();
        let produced = result.tokens.len();

        // u128 -> u64: cannot truncate for any realistic benchmark duration.
        total_time_us += elapsed.as_micros() as u64;
        total_tokens += produced;
        println!(
            " Run {}: {} tokens in {} us ({:.1} tok/s)",
            i + 1,
            produced,
            elapsed.as_micros(),
            produced as f32 / elapsed.as_secs_f32()
        );
        engine.reset();
    }

    // Integer averages are kept for display only.
    let avg_time_us = total_time_us / NUM_RUNS as u64;
    let avg_tokens = total_tokens / NUM_RUNS;

    // Rates come from the totals to avoid compounding truncation error.
    let tokens_per_sec = if total_time_us == 0 {
        0.0
    } else {
        total_tokens as f32 * 1_000_000.0 / total_time_us as f32
    };
    let us_per_token = total_time_us as f32 / total_tokens.max(1) as f32;

    println!("=== Benchmark Results ===");
    println!("Average time: {} us", avg_time_us);
    println!("Average tokens: {}", avg_tokens);
    println!("Throughput: {:.1} tokens/sec", tokens_per_sec);
    println!("Latency per token: {:.1} us", us_per_token);

    // Estimate ESP32 performance by scaling the host numbers.
    let esp32_time_us = avg_time_us * ESP32_SLOWDOWN;
    let esp32_tokens_per_sec = tokens_per_sec / ESP32_SLOWDOWN as f32;
    println!("\nEstimated ESP32 performance:");
    println!(
        " Time: {} us ({:.2} ms)",
        esp32_time_us,
        esp32_time_us as f32 / 1000.0
    );
    println!(" Throughput: {:.1} tokens/sec", esp32_tokens_per_sec);

    // Performance counters
    let counters = engine.perf_counters();
    println!("\nPerformance counters:");
    println!(" Embeddings: {}", counters.embeddings);
    println!(" Attention ops: {}", counters.attention_ops);
    println!(" FFN ops: {}", counters.ffn_ops);
    Ok(())
}
/// Run the canned-prompt generation demo on the host, printing to stdout.
///
/// Each fixed prompt is encoded with the ASCII tokenizer, fed to a freshly
/// reset engine for up to 20 greedy tokens, and the decoded output is printed
/// together with the raw token ids. Output that is not valid UTF-8 is shown
/// as `<invalid>`.
///
/// # Errors
///
/// Propagates any error returned by `engine.generate`.
#[cfg(feature = "host-test")]
fn run_interactive_host(engine: &mut MicroEngine) -> anyhow::Result<()> {
    let tokenizer = SimpleTokenizer::ascii();
    let gen_cfg = InferenceConfig {
        max_tokens: 20,
        greedy: true,
        ..Default::default()
    };

    // Fixed prompts stand in for real user input.
    let demo_prompts = ["Hello", "The quick brown", "1 + 1 ="];

    for text in demo_prompts.iter() {
        println!("Prompt: '{}'", text);

        // Copy the encoded ids into a fixed-capacity buffer for the engine.
        let encoded = tokenizer.encode(text);
        let prompt_ids: heapless::Vec<u16, 64> = encoded.iter().copied().collect();

        // Start every prompt from a clean engine state.
        engine.reset();
        let generated = engine.generate(&prompt_ids, &gen_cfg)?;

        let bytes = tokenizer.decode(&generated.tokens);
        let rendered = match core::str::from_utf8(&bytes) {
            Ok(valid) => valid,
            Err(_) => "<invalid>",
        };
        println!("Generated: '{}'", rendered);
        println!("Tokens: {:?}", generated.tokens.as_slice());
        println!("---");
    }

    Ok(())
}
/// Bare-metal entry point, compiled when neither `host-test` nor `esp32-std`
/// is enabled.
///
/// Placeholder only: it performs no initialization and never returns.
#[cfg(not(any(feature = "host-test", feature = "esp32-std")))]
#[no_mangle]
pub extern "C" fn main() -> ! {
    // TODO: initialize heap, etc.
    loop {
        // Hint to the CPU that this is a busy-wait rather than an empty loop.
        core::hint::spin_loop();
    }
}
/// Select the ESP32 variant the demo should target.
///
/// Despite the name, nothing is probed at runtime yet: the variant is chosen
/// at compile time from the `esp32s3-simd` feature (`Esp32S3` when enabled,
/// plain `Esp32` otherwise). Reading the real chip ID is still to be done.
///
/// Gated on `esp32-std` because the ESP32 `main` is the only caller, and the
/// imports used elsewhere in this file are themselves feature-gated; left
/// ungated, the function would reference an unresolved `Esp32Variant` in
/// bare-metal builds and be dead code in host builds.
#[cfg(feature = "esp32-std")]
fn detect_variant() -> Esp32Variant {
    // In real code, this would check chip ID.
    #[cfg(feature = "esp32s3-simd")]
    let variant = Esp32Variant::Esp32S3;
    #[cfg(not(feature = "esp32s3-simd"))]
    let variant = Esp32Variant::Esp32;

    variant
}
/// Benchmark token generation on the device and log timing statistics.
///
/// Performs one untimed warmup generation, then `NUM_RUNS` timed generations
/// of up to 10 greedy tokens from a fixed five-token prompt, resetting the
/// engine after every generation. Logs per-run figures, aggregate results and
/// the engine's performance counters.
///
/// Throughput and per-token latency are computed from the run totals rather
/// than from the integer-truncated averages. Throughput is reported as `0.0`
/// if no time was measured, and the latency divisor is clamped to one token
/// so a run that generates nothing does not log `NaN`/`inf`.
///
/// Gated on `esp32-std` to match the `main` that calls it and the imports it
/// relies on.
///
/// # Errors
///
/// Propagates any error returned by `engine.generate`.
#[cfg(feature = "esp32-std")]
fn run_benchmark(engine: &mut MicroEngine) -> anyhow::Result<()> {
    use std::time::Instant;

    const NUM_RUNS: usize = 10;

    let config = InferenceConfig {
        max_tokens: 10,
        greedy: true,
        ..Default::default()
    };
    let prompt = [1u16, 2, 3, 4, 5];

    // Warmup: one generation whose output and timing are discarded.
    log::info!("Warmup run...");
    let _ = engine.generate(&prompt, &config)?;
    engine.reset();

    // Timed runs.
    let mut total_time_us = 0u64;
    let mut total_tokens = 0usize;
    log::info!("Running {} benchmark iterations...", NUM_RUNS);
    for i in 0..NUM_RUNS {
        let start = Instant::now();
        let result = engine.generate(&prompt, &config)?;
        let elapsed = start.elapsed();
        let produced = result.tokens.len();

        // u128 -> u64: cannot truncate for any realistic benchmark duration.
        total_time_us += elapsed.as_micros() as u64;
        total_tokens += produced;
        log::info!(
            " Run {}: {} tokens in {} us ({:.1} tok/s)",
            i + 1,
            produced,
            elapsed.as_micros(),
            produced as f32 / elapsed.as_secs_f32()
        );
        engine.reset();
    }

    // Integer averages are kept for display only.
    let avg_time_us = total_time_us / NUM_RUNS as u64;
    let avg_tokens = total_tokens / NUM_RUNS;

    // Rates come from the totals to avoid compounding truncation error.
    let tokens_per_sec = if total_time_us == 0 {
        0.0
    } else {
        total_tokens as f32 * 1_000_000.0 / total_time_us as f32
    };
    let us_per_token = total_time_us as f32 / total_tokens.max(1) as f32;

    log::info!("=== Benchmark Results ===");
    log::info!("Average time: {} us", avg_time_us);
    log::info!("Average tokens: {}", avg_tokens);
    log::info!("Throughput: {:.1} tokens/sec", tokens_per_sec);
    log::info!("Latency per token: {:.1} us", us_per_token);

    // Performance counters
    let counters = engine.perf_counters();
    log::info!("Performance counters:");
    log::info!(" Embeddings: {}", counters.embeddings);
    log::info!(" Attention ops: {}", counters.attention_ops);
    log::info!(" FFN ops: {}", counters.ffn_ops);
    Ok(())
}
/// Run the canned-prompt text generation demo on the device.
///
/// Each fixed prompt is encoded with the ASCII tokenizer, fed to a freshly
/// reset engine for up to 20 greedy tokens, and the decoded output is logged
/// together with the raw token ids. Output that is not valid UTF-8 is shown
/// as `<invalid>`.
///
/// Gated on `esp32-std` to match the `main` that calls it and the imports it
/// relies on.
///
/// # Errors
///
/// Propagates any error returned by `engine.generate`.
#[cfg(feature = "esp32-std")]
fn run_interactive(engine: &mut MicroEngine) -> anyhow::Result<()> {
    let tokenizer = SimpleTokenizer::ascii();
    let config = InferenceConfig {
        max_tokens: 20,
        greedy: true,
        ..Default::default()
    };

    // Simple demo prompts standing in for real user input.
    let prompts = ["Hello", "The quick brown", "1 + 1 ="];

    for prompt in &prompts {
        log::info!("Prompt: '{}'", prompt);

        // Copy the encoded ids into a fixed-capacity (64) buffer.
        // NOTE(review): heapless `collect` is expected to panic past capacity;
        // the prompts above are far shorter, but confirm before adding more.
        let tokens = tokenizer.encode(prompt);
        let prompt_ids: heapless::Vec<u16, 64> = tokens.iter().copied().collect();

        // Start every prompt from a clean engine state.
        engine.reset();
        let result = engine.generate(&prompt_ids, &config)?;

        let output = tokenizer.decode(&result.tokens);
        let output_str = core::str::from_utf8(&output).unwrap_or("<invalid>");
        log::info!("Generated: '{}'", output_str);
        log::info!("Tokens: {:?}", result.tokens.as_slice());
        log::info!("---");
    }

    Ok(())
}
/// Panic handler for `no_std` builds (excluded from test builds).
///
/// The panic information is ignored; the handler simply parks the core in an
/// endless loop and never returns.
#[cfg(all(feature = "no_std", not(test)))]
#[panic_handler]
fn panic(_info: &core::panic::PanicInfo) -> ! {
    loop {
        // Hint to the CPU that this is a busy-wait rather than an empty loop.
        core::hint::spin_loop();
    }
}