//! RuvLLM CLI - Model Management and Inference for Apple Silicon
//!
//! A command-line interface for downloading, managing, and running LLM models
//! optimized for Mac M4 Pro and other Apple Silicon devices.
//!
//! ## Commands
//!
//! - `ruvllm download <model>` - Download model from HuggingFace Hub
//! - `ruvllm list` - List available/downloaded models
//! - `ruvllm info <model>` - Show model information
//! - `ruvllm serve <model>` - Start inference server
//! - `ruvllm chat <model>` - Interactive chat mode
//! - `ruvllm benchmark <model>` - Run performance benchmarks
//! - `ruvllm quantize <model>` - Quantize model to GGUF format
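//!
//! ## Example
//!
//! A typical session, using the `qwen` alias and flags defined below:
//!
//! ```text
//! ruvllm download qwen --quantization q4k
//! ruvllm chat qwen --temperature 0.7
//! ```
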
use clap::{Parser, Subcommand};
use colored::Colorize;
use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt};

mod commands;
mod models;

use commands::{benchmark, chat, download, info, list, quantize, serve};

/// RuvLLM - High-performance LLM inference for Apple Silicon
#[derive(Parser)]
#[command(name = "ruvllm")]
#[command(author, version, about, long_about = None)]
#[command(propagate_version = true)]
struct Cli {
    /// Enable verbose logging
    #[arg(short, long, global = true)]
    verbose: bool,

    /// Disable colored output
    #[arg(long, global = true)]
    no_color: bool,

    /// Custom cache directory for models
    #[arg(long, global = true, env = "RUVLLM_CACHE_DIR")]
    cache_dir: Option<String>,

    #[command(subcommand)]
    command: Commands,
}

#[derive(Subcommand)]
enum Commands {
    /// Download a model from HuggingFace Hub
    #[command(alias = "dl")]
    Download {
        /// Model identifier (HuggingFace model ID or alias)
        ///
        /// Aliases: qwen, mistral, phi, llama
        model: String,

        /// Quantization format (q4k, q8, f16, none)
        #[arg(short, long, default_value = "q4k")]
        quantization: String,

        /// Force re-download even if model exists
        #[arg(short, long)]
        force: bool,

        /// Specific revision/branch to download
        #[arg(long)]
        revision: Option<String>,
    },

    /// List available and downloaded models
    #[command(alias = "ls")]
    List {
        /// Show only downloaded models
        #[arg(short, long)]
        downloaded: bool,

        /// Show detailed information
        #[arg(short, long)]
        long: bool,
    },

    /// Show detailed model information
    Info {
        /// Model identifier or alias
        model: String,
    },

    /// Start an OpenAI-compatible inference server
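    ///
    /// Clients that speak the OpenAI HTTP API can point at the server once it
    /// is running, e.g. (endpoint path assumed from the OpenAI spec, not
    /// verified against the serve module):
    ///
    ///   curl http://127.0.0.1:8080/v1/chat/completions \
    ///     -H 'Content-Type: application/json' \
    ///     -d '{"messages":[{"role":"user","content":"hi"}]}'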
    Serve {
        /// Model to serve
        model: String,

        /// Host to bind to
        #[arg(long, default_value = "127.0.0.1")]
        host: String,

        /// Port to bind to
        #[arg(short, long, default_value = "8080")]
        port: u16,

        /// Maximum concurrent requests
        #[arg(long, default_value = "4")]
        max_concurrent: usize,

        /// Maximum context length
        #[arg(long, default_value = "4096")]
        max_context: usize,

        /// Quantization format
        #[arg(short, long, default_value = "q4k")]
        quantization: String,
    },

    /// Interactive chat mode
    Chat {
        /// Model to use for chat
        model: String,

        /// System prompt
        #[arg(short, long)]
        system: Option<String>,

        /// Maximum tokens to generate per response
        #[arg(long, default_value = "512")]
        max_tokens: usize,

        /// Temperature for sampling (0.0 = deterministic)
        #[arg(short, long, default_value = "0.7")]
        temperature: f32,

        /// Quantization format
        #[arg(short, long, default_value = "q4k")]
        quantization: String,

        /// Enable speculative decoding with a draft model
        ///
        /// Provide the draft model path/ID. Recommended pairings:
        /// - Qwen2.5-14B + Qwen2.5-0.5B
        /// - Mistral-7B + TinyLlama-1.1B
        /// - Llama-3.2-3B + Llama-3.2-1B
        #[arg(long)]
        speculative: Option<String>,

        /// Number of speculative tokens to generate ahead (2-8)
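        ///
        /// Higher values amortize more verification per target-model pass but
        /// waste draft work whenever a drafted token is rejected.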
        #[arg(long, default_value = "4")]
        speculative_lookahead: usize,
    },

    /// Run performance benchmarks
    #[command(alias = "bench")]
    Benchmark {
        /// Model to benchmark
        model: String,

        /// Number of warmup iterations
        #[arg(long, default_value = "3")]
        warmup: usize,

        /// Number of benchmark iterations
        #[arg(short, long, default_value = "10")]
        iterations: usize,

        /// Prompt length for benchmarking
        #[arg(long, default_value = "128")]
        prompt_length: usize,

        /// Generation length for benchmarking
        #[arg(long, default_value = "64")]
        gen_length: usize,

        /// Quantization format
        #[arg(short, long, default_value = "q4k")]
        quantization: String,

        /// Output format (text, json, csv)
        #[arg(long, default_value = "text")]
        format: String,
    },

    /// Quantize a model to GGUF format
    ///
    /// Supports Q4_K_M (4-bit), Q5_K_M (5-bit), and Q8_0 (8-bit) quantization.
    /// Optimized for Apple Neural Engine (ANE) inference on M4 Pro.
    ///
    /// Examples:
    ///   ruvllm quantize --model qwen-0.5b --output ruvltra-small-q4.gguf --quant q4_k_m
    ///   ruvllm quantize --model ./model.safetensors --quant q8_0 --ane-optimize
#[command(alias = "quant")]
Quantize {
/// Model to quantize (path or HuggingFace ID)
#[arg(short, long)]
model: String,
/// Output file path (default: <model>-<quant>.gguf)
#[arg(short, long, default_value = "")]
output: String,
/// Quantization format: q4_k_m, q5_k_m, q8_0, f16
///
/// Memory estimates for 0.5B model:
/// - q4_k_m: ~300 MB (best quality/size tradeoff)
/// - q5_k_m: ~375 MB (higher quality)
/// - q8_0: ~500 MB (near-lossless)
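        // Sanity check for the estimates above, assuming llama.cpp-style
        // effective bits per weight: Q4_K_M at ~4.85 bpw gives
        // 0.5e9 * 4.85 / 8 ≈ 303 MB; Q8_0 at 8.5 bpw (8-bit quants plus one
        // f16 scale per 32 weights) gives 0.5e9 * 8.5 / 8 ≈ 531 MB.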
        #[arg(short, long, default_value = "q4_k_m")]
        quant: String,

        /// Enable ANE-optimized weight layouts (16-byte aligned, tiled)
        ///
        /// Use `--ane-optimize false` to disable.
        #[arg(long, num_args = 0..=1, default_missing_value = "true",
              default_value_t = true, action = clap::ArgAction::Set)]
        ane_optimize: bool,

        /// Keep embedding layer in FP16 (recommended for quality)
        ///
        /// Use `--keep-embed-fp16 false` to disable.
        #[arg(long, num_args = 0..=1, default_missing_value = "true",
              default_value_t = true, action = clap::ArgAction::Set)]
        keep_embed_fp16: bool,

        /// Keep output/LM head layer in FP16 (recommended for quality)
        ///
        /// Use `--keep-output-fp16 false` to disable.
        #[arg(long, num_args = 0..=1, default_missing_value = "true",
              default_value_t = true, action = clap::ArgAction::Set)]
        keep_output_fp16: bool,

        /// Show detailed progress and statistics
        // Renamed from `verbose`: a subcommand cannot redefine the id of the
        // global `--verbose` flag on `Cli` (clap rejects duplicate arg ids).
        #[arg(long)]
        show_stats: bool,
    },
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let cli = Cli::parse();

    // Initialize logging
    let log_level = if cli.verbose { "debug" } else { "info" };
    tracing_subscriber::registry()
        .with(
            tracing_subscriber::EnvFilter::try_from_default_env()
                .unwrap_or_else(|_| log_level.into()),
        )
        .with(tracing_subscriber::fmt::layer().with_target(false))
        .init();
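
    // When RUST_LOG is set, EnvFilter::try_from_default_env() takes precedence
    // and the --verbose level above is only the fallback filter.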

    // Set up colored output
    if cli.no_color {
        colored::control::set_override(false);
    }

    // Get cache directory
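    // Precedence: --cache-dir flag, then RUVLLM_CACHE_DIR (via clap's `env`
    // attribute), then the platform cache dir, e.g. ~/Library/Caches/ruvllm
    // on macOS or ~/.cache/ruvllm on Linux.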
    let cache_dir = cli.cache_dir.unwrap_or_else(|| {
        dirs::cache_dir()
            .unwrap_or_else(|| std::path::PathBuf::from("."))
            .join("ruvllm")
            .to_string_lossy()
            .to_string()
    });

    // Execute command
    let result = match cli.command {
        Commands::Download {
            model,
            quantization,
            force,
            revision,
        } => {
            download::run(
                &model,
                &quantization,
                force,
                revision.as_deref(),
                &cache_dir,
            )
            .await
        }
        Commands::List { downloaded, long } => list::run(downloaded, long, &cache_dir).await,
        Commands::Info { model } => info::run(&model, &cache_dir).await,
        Commands::Serve {
            model,
            host,
            port,
            max_concurrent,
            max_context,
            quantization,
        } => {
            serve::run(
                &model,
                &host,
                port,
                max_concurrent,
                max_context,
                &quantization,
                &cache_dir,
            )
            .await
        }
        Commands::Chat {
            model,
            system,
            max_tokens,
            temperature,
            quantization,
            speculative,
            speculative_lookahead,
        } => {
            chat::run(
                &model,
                system.as_deref(),
                max_tokens,
                temperature,
                &quantization,
                &cache_dir,
                speculative.as_deref(),
                speculative_lookahead,
            )
            .await
        }
        Commands::Benchmark {
            model,
            warmup,
            iterations,
            prompt_length,
            gen_length,
            quantization,
            format,
        } => {
            benchmark::run(
                &model,
                warmup,
                iterations,
                prompt_length,
                gen_length,
                &quantization,
                &format,
                &cache_dir,
            )
            .await
        }
        Commands::Quantize {
            model,
            output,
            quant,
            ane_optimize,
            keep_embed_fp16,
            keep_output_fp16,
            show_stats,
        } => {
            quantize::run(
                &model,
                &output,
                &quant,
                ane_optimize,
                keep_embed_fp16,
                keep_output_fp16,
                show_stats,
                &cache_dir,
            )
            .await
        }
    };

    if let Err(e) = result {
        eprintln!("{} {}", "Error:".red().bold(), e);
        std::process::exit(1);
    }

    Ok(())
}