wifi-densepose/examples/ruvLLM/esp32/examples/medium_scale_demo.rs

//! Medium Scale Federation Demo - 100 to 500 Chip Clusters
//!
//! Shows the "sweet spot" for ESP32 federation where you get:
//! - High efficiency (40-70%)
//! - Great throughput (50K-100K tokens/sec)
//! - Practical costs ($400-$2,000)
//! - Real model capabilities (Small to Base models)

use ruvllm_esp32::federation::{
    MediumClusterConfig, ScaleComparison, MediumScaleAnalyzer,
    ModelCategory, HardwareConfig, BusType,
    MEDIUM_SCALE_MIN, MEDIUM_SCALE_MAX, MEDIUM_SCALE_OPTIMAL,
};

/// Entry point: prints the medium-scale (100-500 chip) federation report.
///
/// The report is a fixed sequence of console sections, each rendered by its
/// own helper below. Nothing is read from the command line or environment;
/// all figures come from the `ruvllm_esp32::federation` API or are literal
/// text in this file.
fn main() {
    print_banner();
    print_rationale();
    print_standard_configs();
    print_scale_comparison();
    print_model_capabilities();
    print_hardware_requirements();
    print_bus_options();
    print_optimization_guide();
    print_summary();
}

/// Prints the title box at the top of the report.
fn print_banner() {
    println!("╔═══════════════════════════════════════════════════════════════════════╗");
    println!("║     RuvLLM ESP32 - Medium Scale Federation (100-500 Chips)            ║");
    println!("║     The Sweet Spot for Practical Distributed Inference                ║");
    println!("╚═══════════════════════════════════════════════════════════════════════╝\n");
}

/// Section 1: prints the static bullet list explaining why 100-500 chips is targeted.
fn print_rationale() {
    println!("═══ Why 100-500 Chips? ═══\n");

    println!("  The 100-500 chip range is optimal because:");
    println!("  • High efficiency (40-70%) - minimal wasted compute");
    println!("  • Communication overhead stays low (<50%)");
    println!("  • Cost-effective ($400-$2,000 total)");
    println!("  • Can run meaningful models (5M-100M parameters)");
    println!("  • Practical hardware: fits in 1-2 rack units");
    println!();
}

/// Section 2: prints one table row per entry of `MediumClusterConfig::standard_configs()`.
fn print_standard_configs() {
    println!("═══ Standard Medium-Scale Configurations ═══\n");

    println!("┌─────────┬───────────────┬────────────────┬────────────┬──────────┬──────────┐");
    println!("│  Chips  │   Topology    │   Throughput   │ Efficiency │   Cost   │  Power   │");
    println!("│         │  (clusters)   │   (tok/sec)    │            │   ($)    │   (W)    │");
    println!("├─────────┼───────────────┼────────────────┼────────────┼──────────┼──────────┤");

    for config in MediumClusterConfig::standard_configs() {
        println!("│ {:>7} │ {:>5} × {:>5} │ {:>14.0} │ {:>9.1}% │ {:>8.0} │ {:>8.1} │",
            config.total_chips,
            config.clusters,
            config.chips_per_cluster,
            config.expected_throughput,
            // The efficiency field is scaled by 100 to print it as a percentage.
            config.expected_efficiency * 100.0,
            config.cost_usd,
            config.power_watts,
        );
    }

    println!("└─────────┴───────────────┴────────────────┴────────────┴──────────┴──────────┘\n");
}

/// Section 3: for each key cluster size, prints a small table comparing its
/// throughput with the 1-chip and 5-chip baselines from `ScaleComparison::analyze`.
fn print_scale_comparison() {
    println!("═══ Performance Comparison: Small vs Medium Clusters ═══\n");

    let key_sizes = [100, 256, 500];

    for chips in key_sizes {
        let comparison = ScaleComparison::analyze(chips);
        // Pad the whole label rather than just the number so the first column
        // keeps its width for any chip count.
        let medium_label = format!("{} chips", chips);

        println!("  {} Chips vs Baselines:", chips);
        println!("  ┌───────────────┬─────────────────┬────────────────┐");
        println!("  │ Configuration │ Throughput      │ Improvement    │");
        println!("  ├───────────────┼─────────────────┼────────────────┤");
        // Row widths are 13 / 15 / 14 content characters, matching the header
        // and border above (each cell adds one space of padding per side).
        println!("  │ 1 chip        │ {:>15.0} │ (baseline)     │",
            comparison.single_chip.throughput_tokens_sec);
        println!("  │ 5 chips       │ {:>15.0} │ {:>11.1}x   │",
            comparison.small_cluster.throughput_tokens_sec,
            comparison.small_cluster.throughput_tokens_sec / comparison.single_chip.throughput_tokens_sec);
        println!("  │ {:<13} │ {:>15.0} │ {:>11.1}x   │",
            medium_label,
            comparison.medium_cluster.throughput_tokens_sec,
            comparison.throughput_multiplier);
        println!("  └───────────────┴─────────────────┴────────────────┘");
        println!("    Cost per 1K tok/s: ${:.2}\n", comparison.cost_per_1k_tokens);
    }
}

/// Section 4: prints the parameter range and example models that
/// `ModelCategory::for_chip_count` reports for a handful of cluster sizes.
fn print_model_capabilities() {
    println!("═══ What Models Can You Run? ═══\n");

    println!("┌─────────┬───────────────┬────────────────────────────────────────────────┐");
    println!("│  Chips  │  Model Size   │  Example Models                                │");
    println!("├─────────┼───────────────┼────────────────────────────────────────────────┤");

    for chips in [100, 150, 200, 256, 300, 400, 500] {
        let category = ModelCategory::for_chip_count(chips);
        let (min_params, max_params) = category.param_range();
        // Build the "min-max" text first and centre it as one 13-character
        // cell so the row matches the 15-wide "Model Size" column.
        let size_range = format!("{}-{}", format_params(min_params), format_params(max_params));
        println!("│ {:>7} │ {:^13} │ {:46} │",
            chips,
            size_range,
            category.examples(),
        );
    }

    println!("└─────────┴───────────────┴────────────────────────────────────────────────┘\n");
}

/// Section 5a: prints board count, chips per board, power supply and form
/// factor as returned by `HardwareConfig::for_cluster` for several cluster sizes.
fn print_hardware_requirements() {
    println!("═══ Hardware Requirements for Deployment ═══\n");

    println!("┌─────────┬────────────┬──────────┬─────────────┬───────────────────────────┐");
    println!("│  Chips  │ PCBs Req'd │ Chip/PCB │ Power (W)   │ Form Factor               │");
    println!("├─────────┼────────────┼──────────┼─────────────┼───────────────────────────┤");

    for chips in [100, 144, 256, 400, 500] {
        let hw = HardwareConfig::for_cluster(chips);
        println!("│ {:>7} │ {:>10} │ {:>8} │ {:>11.0} │ {:25} │",
            chips,
            hw.num_boards,
            hw.chips_per_board,
            hw.power_supply_watts,
            hw.form_factor,
        );
    }

    println!("└─────────┴────────────┴──────────┴─────────────┴───────────────────────────┘\n");
}

/// Section 5b: prints the bandwidth of each `BusType` variant next to a short usage hint.
fn print_bus_options() {
    println!("  Communication Bus Options:");
    println!("  ┌──────────────┬───────────────┬────────────────────────────────────────┐");
    println!("  │ Bus Type     │ Bandwidth     │ Best For                               │");
    println!("  ├──────────────┼───────────────┼────────────────────────────────────────┤");
    // The bandwidth cell is 13 characters wide to match the 15-wide header column.
    println!("  │ SPI          │ {:>13} │ Small clusters, simple wiring          │",
        format_bandwidth(BusType::Spi.bandwidth_bytes_sec()));
    println!("  │ I2C          │ {:>13} │ Slow but many devices                  │",
        format_bandwidth(BusType::I2c.bandwidth_bytes_sec()));
    println!("  │ UART Mesh    │ {:>13} │ Medium clusters, flexible              │",
        format_bandwidth(BusType::Uart.bandwidth_bytes_sec()));
    println!("  │ High-Speed   │ {:>13} │ Large clusters, custom hardware        │",
        format_bandwidth(BusType::HighSpeed.bandwidth_bytes_sec()));
    println!("  └──────────────┴───────────────┴────────────────────────────────────────┘\n");
}

/// Section 6: prints the configurations `MediumScaleAnalyzer` recommends for a
/// set of throughput targets and for a set of budgets.
fn print_optimization_guide() {
    println!("═══ Find Your Optimal Configuration ═══\n");

    // By throughput target
    println!("  Target Throughput → Recommended Chips:");
    println!("  ┌─────────────────────┬─────────┬────────────────┬──────────┐");
    println!("  │ Target (tok/sec)    │  Chips  │ Actual Output  │   Cost   │");
    println!("  ├─────────────────────┼─────────┼────────────────┼──────────┤");

    for target in [50_000.0, 60_000.0, 70_000.0, 80_000.0] {
        // Targets for which the analyzer returns `None` are left out of the table.
        if let Some(config) = MediumScaleAnalyzer::optimize_for_throughput(target) {
            println!("  │ {:>19.0} │ {:>7} │ {:>14.0} │ ${:>7.0} │",
                target,
                config.total_chips,
                config.expected_throughput,
                config.cost_usd,
            );
        }
    }
    println!("  └─────────────────────┴─────────┴────────────────┴──────────┘\n");

    // By budget
    println!("  Budget → Maximum Configuration:");
    println!("  ┌─────────────────────┬─────────┬────────────────┬────────────┐");
    println!("  │ Budget ($)          │  Chips  │   Throughput   │ Efficiency │");
    println!("  ├─────────────────────┼─────────┼────────────────┼────────────┤");

    for budget in [500.0, 1000.0, 1500.0, 2000.0] {
        let config = MediumScaleAnalyzer::optimize_for_budget(budget);
        println!("  │ ${:>18.0} │ {:>7} │ {:>14.0} │ {:>9.1}% │",
            budget,
            config.total_chips,
            config.expected_throughput,
            config.expected_efficiency * 100.0,
        );
    }
    println!("  └─────────────────────┴─────────┴────────────────┴────────────┘\n");
}

/// Section 7: prints the closing summary box.
///
/// NOTE(review): every figure in this box is literal text, not computed from
/// the analyzer calls used in the earlier sections — keep them in sync by hand.
fn print_summary() {
    println!("╔═══════════════════════════════════════════════════════════════════════╗");
    println!("║                    MEDIUM SCALE SUMMARY                               ║");
    println!("╠═══════════════════════════════════════════════════════════════════════╣");
    println!("║                                                                       ║");
    println!("║  The 100-500 chip range is ideal for:                                 ║");
    println!("║                                                                       ║");
    println!("║  ✓ HOME/OFFICE: 100 chips ($400) = 53K tok/s, 70% efficient           ║");
    println!("║    - Runs Small models (5-20M params)                                 ║");
    println!("║    - Fits in single rack unit                                         ║");
    println!("║    - 50W power consumption                                            ║");
    println!("║                                                                       ║");
    println!("║  ✓ WORKSTATION: 256 chips ($1,024) = 88K tok/s, 55% efficient         ║");
    println!("║    - Runs Base models (20-100M params)                                ║");
    println!("║    - 2U rack mount                                                    ║");
    println!("║    - 130W power consumption                                           ║");
    println!("║                                                                       ║");
    println!("║  ✓ SERVER: 500 chips ($2,000) = 106K tok/s, 40% efficient             ║");
    println!("║    - Runs Large models (100M+ params)                                 ║");
    println!("║    - Full rack unit                                                   ║");
    println!("║    - 250W power consumption                                           ║");
    println!("║                                                                       ║");
    println!("║  KEY INSIGHT: Beyond 500 chips, efficiency drops significantly.       ║");
    println!("║  For larger models, use multiple 256-500 chip clusters in parallel.   ║");
    println!("║                                                                       ║");
    println!("╚═══════════════════════════════════════════════════════════════════════╝");
}

/// Formats a parameter count as a short human-readable string.
///
/// Counts below 1,000 are printed as-is. Larger counts are scaled to the
/// largest fitting unit (`K` = 10^3, `M` = 10^6, `B` = 10^9) and printed with
/// no decimal places, e.g. `5_000_000` -> `"5M"`.
///
/// If rounding pushes the scaled value up to 1000 (e.g. `999_999` would
/// otherwise print as `"1000K"`), the result is promoted to the next unit
/// (`"1M"`). There is no unit above `B`, so very large counts may still print
/// four or more digits before the `B`.
fn format_params(n: usize) -> String {
    // Ordered smallest to largest so "next unit up" is simply `idx + 1`.
    const UNITS: [(usize, &str); 3] = [
        (1_000, "K"),
        (1_000_000, "M"),
        (1_000_000_000, "B"),
    ];

    // Largest unit that does not exceed `n`; none means `n` is below 1,000.
    let mut idx = match UNITS.iter().rposition(|&(scale, _)| n >= scale) {
        Some(i) => i,
        None => return n.to_string(),
    };

    let mut digits = format!("{:.0}", n as f64 / UNITS[idx].0 as f64);

    // Rounding can carry into a fourth digit; express that as 1 of the next unit.
    if digits == "1000" && idx + 1 < UNITS.len() {
        idx += 1;
        digits = String::from("1");
    }

    format!("{}{}", digits, UNITS[idx].1)
}

/// Renders a byte-per-second rate as a whole number followed by a unit.
///
/// Rates of at least 1,000,000 are shown in `MB/s`, rates of at least 1,000
/// in `KB/s`, and anything smaller in `B/s`. Units are decimal, and the
/// scaled value is truncated by integer division rather than rounded.
fn format_bandwidth(bps: usize) -> String {
    const KILO: usize = 1_000;
    const MEGA: usize = 1_000_000;

    match bps {
        rate if rate >= MEGA => format!("{} MB/s", rate / MEGA),
        rate if rate >= KILO => format!("{} KB/s", rate / KILO),
        rate => format!("{} B/s", rate),
    }
}