Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'
This commit is contained in:
368
vendor/ruvector/crates/ruvllm-cli/src/main.rs
vendored
Normal file
368
vendor/ruvector/crates/ruvllm-cli/src/main.rs
vendored
Normal file
@@ -0,0 +1,368 @@
|
||||
//! RuvLLM CLI - Model Management and Inference for Apple Silicon
|
||||
//!
|
||||
//! A command-line interface for downloading, managing, and running LLM models
|
||||
//! optimized for Mac M4 Pro and other Apple Silicon devices.
|
||||
//!
|
||||
//! ## Commands
|
||||
//!
|
||||
//! - `ruvllm download <model>` - Download model from HuggingFace Hub
|
||||
//! - `ruvllm list` - List available/downloaded models
|
||||
//! - `ruvllm info <model>` - Show model information
|
||||
//! - `ruvllm serve <model>` - Start inference server
|
||||
//! - `ruvllm chat <model>` - Interactive chat mode
|
||||
//! - `ruvllm benchmark <model>` - Run performance benchmarks
|
||||
//! - `ruvllm quantize <model>` - Quantize model to GGUF format
|
||||
|
||||
use clap::{Parser, Subcommand};
|
||||
use colored::Colorize;
|
||||
use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt};
|
||||
|
||||
mod commands;
|
||||
mod models;
|
||||
|
||||
use commands::{benchmark, chat, download, info, list, quantize, serve};
|
||||
|
||||
/// RuvLLM - High-performance LLM inference for Apple Silicon
// NOTE: the `///` doc comments on this struct and its fields are emitted by
// clap as runtime `--help` text, so they are left byte-identical here.
#[derive(Parser)]
#[command(name = "ruvllm")]
#[command(author, version, about, long_about = None)]
#[command(propagate_version = true)]
struct Cli {
    /// Enable verbose logging
    // `global = true`: accepted before or after any subcommand.
    #[arg(short, long, global = true)]
    verbose: bool,

    /// Disable colored output
    #[arg(long, global = true)]
    no_color: bool,

    /// Custom cache directory for models
    // May also be supplied through the RUVLLM_CACHE_DIR environment variable;
    // when absent, main() falls back to the platform cache dir (see main).
    #[arg(long, global = true, env = "RUVLLM_CACHE_DIR")]
    cache_dir: Option<String>,

    // The selected subcommand; parsing fails if none is given.
    #[command(subcommand)]
    command: Commands,
}
|
||||
|
||||
#[derive(Subcommand)]
// One variant per `ruvllm` subcommand. The `///` doc comments below are
// rendered verbatim by clap as `--help` text, so they are left unchanged.
enum Commands {
    /// Download a model from HuggingFace Hub
    #[command(alias = "dl")]
    Download {
        /// Model identifier (HuggingFace model ID or alias)
        ///
        /// Aliases: qwen, mistral, phi, llama
        model: String,

        /// Quantization format (q4k, q8, f16, none)
        #[arg(short, long, default_value = "q4k")]
        quantization: String,

        /// Force re-download even if model exists
        #[arg(short, long)]
        force: bool,

        /// Specific revision/branch to download
        #[arg(long)]
        revision: Option<String>,
    },

    /// List available and downloaded models
    #[command(alias = "ls")]
    List {
        /// Show only downloaded models
        #[arg(short, long)]
        downloaded: bool,

        /// Show detailed information
        #[arg(short, long)]
        long: bool,
    },

    /// Show detailed model information
    Info {
        /// Model identifier or alias
        model: String,
    },

    /// Start an OpenAI-compatible inference server
    Serve {
        /// Model to serve
        model: String,

        /// Host to bind to
        #[arg(long, default_value = "127.0.0.1")]
        host: String,

        /// Port to bind to
        #[arg(short, long, default_value = "8080")]
        port: u16,

        /// Maximum concurrent requests
        #[arg(long, default_value = "4")]
        max_concurrent: usize,

        /// Maximum context length
        #[arg(long, default_value = "4096")]
        max_context: usize,

        /// Quantization format
        #[arg(short, long, default_value = "q4k")]
        quantization: String,
    },

    /// Interactive chat mode
    Chat {
        /// Model to use for chat
        model: String,

        /// System prompt
        #[arg(short, long)]
        system: Option<String>,

        /// Maximum tokens to generate per response
        #[arg(long, default_value = "512")]
        max_tokens: usize,

        /// Temperature for sampling (0.0 = deterministic)
        #[arg(short, long, default_value = "0.7")]
        temperature: f32,

        /// Quantization format
        #[arg(short, long, default_value = "q4k")]
        quantization: String,

        /// Enable speculative decoding with a draft model
        ///
        /// Provide the draft model path/ID. Recommended pairings:
        /// - Qwen2.5-14B + Qwen2.5-0.5B
        /// - Mistral-7B + TinyLlama-1.1B
        /// - Llama-3.2-3B + Llama-3.2-1B
        #[arg(long)]
        speculative: Option<String>,

        /// Number of speculative tokens to generate ahead (2-8)
        // NOTE(review): the documented 2-8 range is not enforced here —
        // presumably chat::run validates/clamps it; confirm.
        #[arg(long, default_value = "4")]
        speculative_lookahead: usize,
    },

    /// Run performance benchmarks
    #[command(alias = "bench")]
    Benchmark {
        /// Model to benchmark
        model: String,

        /// Number of warmup iterations
        #[arg(long, default_value = "3")]
        warmup: usize,

        /// Number of benchmark iterations
        #[arg(short, long, default_value = "10")]
        iterations: usize,

        /// Prompt length for benchmarking
        #[arg(long, default_value = "128")]
        prompt_length: usize,

        /// Generation length for benchmarking
        #[arg(long, default_value = "64")]
        gen_length: usize,

        /// Quantization format
        #[arg(short, long, default_value = "q4k")]
        quantization: String,

        /// Output format (text, json, csv)
        #[arg(long, default_value = "text")]
        format: String,
    },

    /// Quantize a model to GGUF format
    ///
    /// Supports Q4_K_M (4-bit), Q5_K_M (5-bit), and Q8_0 (8-bit) quantization.
    /// Optimized for Apple Neural Engine (ANE) inference on M4 Pro.
    ///
    /// Examples:
    /// ruvllm quantize --model qwen-0.5b --output ruvltra-small-q4.gguf --quant q4_k_m
    /// ruvllm quantize --model ./model.safetensors --quant q8_0 --ane-optimize
    #[command(alias = "quant")]
    Quantize {
        /// Model to quantize (path or HuggingFace ID)
        #[arg(short, long)]
        model: String,

        /// Output file path (default: <model>-<quant>.gguf)
        // NOTE(review): empty string is used as a "not provided" sentinel —
        // presumably quantize::run substitutes the documented default name;
        // an Option<String> would express this more directly. Confirm.
        #[arg(short, long, default_value = "")]
        output: String,

        /// Quantization format: q4_k_m, q5_k_m, q8_0, f16
        ///
        /// Memory estimates for 0.5B model:
        /// - q4_k_m: ~300 MB (best quality/size tradeoff)
        /// - q5_k_m: ~375 MB (higher quality)
        /// - q8_0: ~500 MB (near-lossless)
        #[arg(short, long, default_value = "q4_k_m")]
        quant: String,

        /// Enable ANE-optimized weight layouts (16-byte aligned, tiled)
        // NOTE(review): with clap v4 derive, a plain `bool` field becomes an
        // ArgAction::SetTrue flag; combined with `default_value = "true"` the
        // value is always true and cannot be disabled from the CLI. Confirm
        // whether `default_value_t = true` + `action = ArgAction::Set` (or a
        // `--no-*` negation flag) was intended. Same applies to the two
        // keep_*_fp16 flags below.
        #[arg(long, default_value = "true")]
        ane_optimize: bool,

        /// Keep embedding layer in FP16 (recommended for quality)
        // NOTE(review): see ane_optimize — this flag cannot be set to false.
        #[arg(long, default_value = "true")]
        keep_embed_fp16: bool,

        /// Keep output/LM head layer in FP16 (recommended for quality)
        // NOTE(review): see ane_optimize — this flag cannot be set to false.
        #[arg(long, default_value = "true")]
        keep_output_fp16: bool,

        /// Show detailed progress and statistics
        // Subcommand-local flag; distinct from the global --verbose on Cli.
        #[arg(long)]
        verbose: bool,
    },
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
let cli = Cli::parse();
|
||||
|
||||
// Initialize logging
|
||||
let log_level = if cli.verbose { "debug" } else { "info" };
|
||||
tracing_subscriber::registry()
|
||||
.with(
|
||||
tracing_subscriber::EnvFilter::try_from_default_env()
|
||||
.unwrap_or_else(|_| log_level.into()),
|
||||
)
|
||||
.with(tracing_subscriber::fmt::layer().with_target(false))
|
||||
.init();
|
||||
|
||||
// Set up colored output
|
||||
if cli.no_color {
|
||||
colored::control::set_override(false);
|
||||
}
|
||||
|
||||
// Get cache directory
|
||||
let cache_dir = cli.cache_dir.unwrap_or_else(|| {
|
||||
dirs::cache_dir()
|
||||
.unwrap_or_else(|| std::path::PathBuf::from("."))
|
||||
.join("ruvllm")
|
||||
.to_string_lossy()
|
||||
.to_string()
|
||||
});
|
||||
|
||||
// Execute command
|
||||
let result = match cli.command {
|
||||
Commands::Download {
|
||||
model,
|
||||
quantization,
|
||||
force,
|
||||
revision,
|
||||
} => {
|
||||
download::run(
|
||||
&model,
|
||||
&quantization,
|
||||
force,
|
||||
revision.as_deref(),
|
||||
&cache_dir,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
Commands::List { downloaded, long } => list::run(downloaded, long, &cache_dir).await,
|
||||
|
||||
Commands::Info { model } => info::run(&model, &cache_dir).await,
|
||||
|
||||
Commands::Serve {
|
||||
model,
|
||||
host,
|
||||
port,
|
||||
max_concurrent,
|
||||
max_context,
|
||||
quantization,
|
||||
} => {
|
||||
serve::run(
|
||||
&model,
|
||||
&host,
|
||||
port,
|
||||
max_concurrent,
|
||||
max_context,
|
||||
&quantization,
|
||||
&cache_dir,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
Commands::Chat {
|
||||
model,
|
||||
system,
|
||||
max_tokens,
|
||||
temperature,
|
||||
quantization,
|
||||
speculative,
|
||||
speculative_lookahead,
|
||||
} => {
|
||||
chat::run(
|
||||
&model,
|
||||
system.as_deref(),
|
||||
max_tokens,
|
||||
temperature,
|
||||
&quantization,
|
||||
&cache_dir,
|
||||
speculative.as_deref(),
|
||||
speculative_lookahead,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
Commands::Benchmark {
|
||||
model,
|
||||
warmup,
|
||||
iterations,
|
||||
prompt_length,
|
||||
gen_length,
|
||||
quantization,
|
||||
format,
|
||||
} => {
|
||||
benchmark::run(
|
||||
&model,
|
||||
warmup,
|
||||
iterations,
|
||||
prompt_length,
|
||||
gen_length,
|
||||
&quantization,
|
||||
&format,
|
||||
&cache_dir,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
Commands::Quantize {
|
||||
model,
|
||||
output,
|
||||
quant,
|
||||
ane_optimize,
|
||||
keep_embed_fp16,
|
||||
keep_output_fp16,
|
||||
verbose,
|
||||
} => {
|
||||
quantize::run(
|
||||
&model,
|
||||
&output,
|
||||
&quant,
|
||||
ane_optimize,
|
||||
keep_embed_fp16,
|
||||
keep_output_fp16,
|
||||
verbose,
|
||||
&cache_dir,
|
||||
)
|
||||
.await
|
||||
}
|
||||
};
|
||||
|
||||
if let Err(e) = result {
|
||||
eprintln!("{} {}", "Error:".red().bold(), e);
|
||||
std::process::exit(1);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
Reference in New Issue
Block a user