//! RuvLLM CLI - Model Management and Inference for Apple Silicon
//!
//! A command-line interface for downloading, managing, and running LLM models
//! optimized for Mac M4 Pro and other Apple Silicon devices.
//!
//! ## Commands
//!
//! - `ruvllm download <model>` - Download model from HuggingFace Hub
//! - `ruvllm list` - List available/downloaded models
//! - `ruvllm info <model>` - Show model information
//! - `ruvllm serve <model>` - Start inference server
//! - `ruvllm chat <model>` - Interactive chat mode
//! - `ruvllm benchmark <model>` - Run performance benchmarks
//! - `ruvllm quantize <model>` - Quantize model to GGUF format

use clap::{Parser, Subcommand};
use colored::Colorize;
use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt};

mod commands;
mod models;

use commands::{benchmark, chat, download, info, list, quantize, serve};

/// RuvLLM - High-performance LLM inference for Apple Silicon
#[derive(Parser)]
#[command(name = "ruvllm")]
#[command(author, version, about, long_about = None)]
#[command(propagate_version = true)]
struct Cli {
    /// Enable verbose logging
    #[arg(short, long, global = true)]
    verbose: bool,

    /// Disable colored output
    #[arg(long, global = true)]
    no_color: bool,

    /// Custom cache directory for models
    #[arg(long, global = true, env = "RUVLLM_CACHE_DIR")]
    cache_dir: Option<String>,

    #[command(subcommand)]
    command: Commands,
}

#[derive(Subcommand)]
enum Commands {
    /// Download a model from HuggingFace Hub
    #[command(alias = "dl")]
    Download {
        /// Model identifier (HuggingFace model ID or alias)
        ///
        /// Aliases: qwen, mistral, phi, llama
        model: String,

        /// Quantization format (q4k, q8, f16, none)
        #[arg(short, long, default_value = "q4k")]
        quantization: String,

        /// Force re-download even if model exists
        #[arg(short, long)]
        force: bool,

        /// Specific revision/branch to download
        #[arg(long)]
        revision: Option<String>,
    },

    /// List available and downloaded models
    #[command(alias = "ls")]
    List {
        /// Show only downloaded models
        #[arg(short, long)]
        downloaded: bool,

        /// Show detailed information
        #[arg(short, long)]
        long: bool,
    },

    /// Show detailed model information
    Info {
        /// Model identifier or alias
        model: String,
    },

    /// Start an OpenAI-compatible inference server
    Serve {
        /// Model to serve
        model: String,

        /// Host to bind to
        #[arg(long, default_value = "127.0.0.1")]
        host: String,

        /// Port to bind to
        #[arg(short, long, default_value = "8080")]
        port: u16,

        /// Maximum concurrent requests
        #[arg(long, default_value = "4")]
        max_concurrent: usize,

        /// Maximum context length
        #[arg(long, default_value = "4096")]
        max_context: usize,

        /// Quantization format
        #[arg(short, long, default_value = "q4k")]
        quantization: String,
    },

    /// Interactive chat mode
    Chat {
        /// Model to use for chat
        model: String,

        /// System prompt
        #[arg(short, long)]
        system: Option<String>,

        /// Maximum tokens to generate per response
        #[arg(long, default_value = "512")]
        max_tokens: usize,

        /// Temperature for sampling (0.0 = deterministic)
        #[arg(short, long, default_value = "0.7")]
        temperature: f32,

        /// Quantization format
        #[arg(short, long, default_value = "q4k")]
        quantization: String,

        /// Enable speculative decoding with a draft model
        ///
        /// Provide the draft model path/ID. Recommended pairings:
        /// - Qwen2.5-14B + Qwen2.5-0.5B
        /// - Mistral-7B + TinyLlama-1.1B
        /// - Llama-3.2-3B + Llama-3.2-1B
        #[arg(long)]
        speculative: Option<String>,

        /// Number of speculative tokens to generate ahead (2-8)
        #[arg(long, default_value = "4")]
        speculative_lookahead: usize,
    },

    /// Run performance benchmarks
    #[command(alias = "bench")]
    Benchmark {
        /// Model to benchmark
        model: String,

        /// Number of warmup iterations
        #[arg(long, default_value = "3")]
        warmup: usize,

        /// Number of benchmark iterations
        #[arg(short, long, default_value = "10")]
        iterations: usize,

        /// Prompt length for benchmarking
        #[arg(long, default_value = "128")]
        prompt_length: usize,

        /// Generation length for benchmarking
        #[arg(long, default_value = "64")]
        gen_length: usize,

        /// Quantization format
        #[arg(short, long, default_value = "q4k")]
        quantization: String,

        /// Output format (text, json, csv)
        #[arg(long, default_value = "text")]
        format: String,
    },

    /// Quantize a model to GGUF format
    ///
    /// Supports Q4_K_M (4-bit), Q5_K_M (5-bit), and Q8_0 (8-bit) quantization.
    /// Optimized for Apple Neural Engine (ANE) inference on M4 Pro.
    ///
    /// Examples:
    ///   ruvllm quantize --model qwen-0.5b --output ruvltra-small-q4.gguf --quant q4_k_m
    ///   ruvllm quantize --model ./model.safetensors --quant q8_0 --ane-optimize
    #[command(alias = "quant")]
    Quantize {
        /// Model to quantize (path or HuggingFace ID)
        #[arg(short, long)]
        model: String,

        /// Output file path (default: <model>-<quant>.gguf)
        #[arg(short, long, default_value = "")]
        output: String,

        /// Quantization format: q4_k_m, q5_k_m, q8_0, f16
        ///
        /// Memory estimates for 0.5B model:
        /// - q4_k_m: ~300 MB (best quality/size tradeoff)
        /// - q5_k_m: ~375 MB (higher quality)
        /// - q8_0: ~500 MB (near-lossless)
        #[arg(short, long, default_value = "q4_k_m")]
        quant: String,

        /// Enable ANE-optimized weight layouts (16-byte aligned, tiled)
        #[arg(long, default_value = "true")]
        ane_optimize: bool,

        /// Keep embedding layer in FP16 (recommended for quality)
        #[arg(long, default_value = "true")]
        keep_embed_fp16: bool,

        /// Keep output/LM head layer in FP16 (recommended for quality)
        #[arg(long, default_value = "true")]
        keep_output_fp16: bool,

        /// Show detailed progress and statistics
        #[arg(long)]
        verbose: bool,
    },
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let cli = Cli::parse();

    // Initialize logging
    let log_level = if cli.verbose { "debug" } else { "info" };
    tracing_subscriber::registry()
        .with(
            tracing_subscriber::EnvFilter::try_from_default_env()
                .unwrap_or_else(|_| log_level.into()),
        )
        .with(tracing_subscriber::fmt::layer().with_target(false))
        .init();

    // Set up colored output
    if cli.no_color {
        colored::control::set_override(false);
    }

    // Get cache directory
    let cache_dir = cli.cache_dir.unwrap_or_else(|| {
        dirs::cache_dir()
            .unwrap_or_else(|| std::path::PathBuf::from("."))
            .join("ruvllm")
            .to_string_lossy()
            .to_string()
    });

    // Execute command
    let result = match cli.command {
        Commands::Download {
            model,
            quantization,
            force,
            revision,
        } => {
            download::run(
                &model,
                &quantization,
                force,
                revision.as_deref(),
                &cache_dir,
            )
            .await
        }
        Commands::List { downloaded, long } => list::run(downloaded, long, &cache_dir).await,
        Commands::Info { model } => info::run(&model, &cache_dir).await,
        Commands::Serve {
            model,
            host,
            port,
            max_concurrent,
            max_context,
            quantization,
        } => {
            serve::run(
                &model,
                &host,
                port,
                max_concurrent,
                max_context,
                &quantization,
                &cache_dir,
            )
            .await
        }
        Commands::Chat {
            model,
            system,
            max_tokens,
            temperature,
            quantization,
            speculative,
            speculative_lookahead,
        } => {
            chat::run(
                &model,
                system.as_deref(),
                max_tokens,
                temperature,
                &quantization,
                &cache_dir,
                speculative.as_deref(),
                speculative_lookahead,
            )
            .await
        }
        Commands::Benchmark {
            model,
            warmup,
            iterations,
            prompt_length,
            gen_length,
            quantization,
            format,
        } => {
            benchmark::run(
                &model,
                warmup,
                iterations,
                prompt_length,
                gen_length,
                &quantization,
                &format,
                &cache_dir,
            )
            .await
        }
        Commands::Quantize {
            model,
            output,
            quant,
            ane_optimize,
            keep_embed_fp16,
            keep_output_fp16,
            verbose,
        } => {
            quantize::run(
                &model,
                &output,
                &quant,
                ane_optimize,
                keep_embed_fp16,
                keep_output_fp16,
                verbose,
                &cache_dir,
            )
            .await
        }
    };

    if let Err(e) = result {
        eprintln!("{} {}", "Error:".red().bold(), e);
        std::process::exit(1);
    }

    Ok(())
}
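
// A minimal sanity-check sketch for the CLI surface defined above, using clap's
// built-in `CommandFactory::debug_assert` consistency check plus one parse test.
// The module and test names are illustrative; the tests assume only `clap` with
// the `derive` feature, which this file already requires.
#[cfg(test)]
mod tests {
    use super::*;
    use clap::CommandFactory;

    /// Verify the derived `Cli` definition is internally consistent
    /// (conflicting short flags, unparsable default values, etc.).
    #[test]
    fn cli_definition_is_consistent() {
        Cli::command().debug_assert();
    }

    /// Verify the `dl` alias resolves to the `Download` subcommand.
    #[test]
    fn download_alias_parses() {
        let cli = Cli::try_parse_from(["ruvllm", "dl", "qwen"]).expect("parse should succeed");
        assert!(matches!(cli.command, Commands::Download { .. }));
    }
}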