#![allow(
    clippy::all,
    unused_imports,
    unused_variables,
    dead_code,
    unused_mut,
    unused_assignments,
    non_camel_case_types,
    clippy::approx_constant,
    unexpected_cfgs,
    unused_must_use,
    unused_parens
)]
//! RuvLLM Evaluation CLI
//!
//! Run real LLM evaluations using SWE-Bench tasks with the full RuvLLM stack.
//!
//! ## Usage
//!
//! ```bash
//! # Run evaluation with a GGUF model on sample tasks
//! cargo run -p ruvllm --example run_eval --features candle -- \
//!     --model ./models/llama-7b-q4.gguf \
//!     --tasks sample
//!
//! # Run on SWE-bench-lite (downloads and caches)
//! cargo run -p ruvllm --example run_eval --features candle -- \
//!     --model ./models/llama-7b-q4.gguf \
//!     --tasks swe-bench-lite \
//!     --max-tasks 50
//!
//! # Run with specific ablation modes
//! cargo run -p ruvllm --example run_eval --features candle -- \
//!     --model ./models/llama-7b-q4.gguf \
//!     --tasks sample \
//!     --modes baseline,full
//!
//! # Run on a local JSON file
//! cargo run -p ruvllm --example run_eval --features candle -- \
//!     --model ./models/llama-7b-q4.gguf \
//!     --tasks ./my-tasks.json \
//!     --output ./results.json
//! ```
//!
//! ## Environment Variables
//!
//! - `RUVLLM_MODELS_DIR`: Default directory for model files
//! - `RUVLLM_CACHE_DIR`: Cache directory for downloaded datasets

use ruvllm::backends::ModelConfig;
use ruvllm::evaluation::{
    swe_bench::{SweBenchConfig, SweBenchLoader},
    AblationMode, EvalConfig, EvalTask, RealEvaluationHarness, RealInferenceConfig,
};
use std::env;
use std::path::PathBuf;
use std::process;

fn main() {
    // Initialize logging, defaulting to `info` unless RUST_LOG is already set
    if env::var("RUST_LOG").is_err() {
        env::set_var("RUST_LOG", "info");
    }
    tracing_subscriber::fmt::init();

    let args: Vec<String> = env::args().collect();

    if args.len() < 2 || args.contains(&"--help".to_string()) || args.contains(&"-h".to_string()) {
        print_help();
        return;
    }

    // Parse arguments
    let config = match parse_args(&args[1..]) {
        Ok(c) => c,
        Err(e) => {
            eprintln!("Error: {}", e);
            eprintln!("\nRun with --help for usage information.");
            process::exit(1);
        }
    };

    // Run evaluation
    if let Err(e) = run_evaluation(config) {
        eprintln!("Evaluation failed: {}", e);
        process::exit(1);
    }
}
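// The module docs above list `RUVLLM_MODELS_DIR` as the default directory for
// model files, but this example never consults it. The sketch below shows one
// way a fallback lookup could work; `resolve_model_path` and the "model.gguf"
// file name are hypothetical, for illustration only, and nothing in this file
// calls this helper.
fn resolve_model_path(cli_path: Option<PathBuf>) -> Option<PathBuf> {
    cli_path.or_else(|| {
        // Fall back to a placeholder file inside RUVLLM_MODELS_DIR, if set.
        env::var("RUVLLM_MODELS_DIR")
            .ok()
            .map(|dir| PathBuf::from(dir).join("model.gguf"))
    })
}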
fn print_help() {
    println!(
        r#"RuvLLM Evaluation CLI

Run real LLM evaluations on SWE-Bench tasks with SONA learning and HNSW routing.

USAGE:
    run_eval [OPTIONS] --model <PATH>

OPTIONS:
    --model <PATH>             Path to GGUF model file (required)
    --tasks <SOURCE>           Task source: sample, swe-bench-lite, swe-bench, or a file path
                               (default: sample)
    --max-tasks <N>            Maximum number of tasks to evaluate (default: all)
    --modes <MODES>            Comma-separated ablation modes (default: all)
                               Options: baseline, retrieval, adapters, retrieval+adapters, full
    --seeds <SEEDS>            Comma-separated random seeds (default: 42,123,456)
    --output <PATH>            Output file for results JSON (default: stdout summary)
    --quality-threshold <F>    Minimum quality score for acceptance (default: 0.7)
    --cost-target <F>          Target cost per patch in dollars (default: 0.10)
    --no-sona                  Disable SONA learning
    --no-hnsw                  Disable HNSW routing
    --repo <NAME>              Filter tasks by repository name
    --verbose                  Enable verbose output
    -h, --help                 Show this help message

EXAMPLES:
    # Quick test with sample tasks
    run_eval --model ./model.gguf --tasks sample

    # Run a SWE-bench-lite evaluation
    run_eval --model ./model.gguf --tasks swe-bench-lite --max-tasks 100

    # Compare baseline vs full mode
    run_eval --model ./model.gguf --modes baseline,full --output results.json

    # Run on a custom task file
    run_eval --model ./model.gguf --tasks ./my-tasks.json --verbose
"#
    );
}

#[derive(Debug)]
struct CliConfig {
    model_path: PathBuf,
    task_source: TaskSource,
    max_tasks: Option<usize>,
    ablation_modes: Vec<AblationMode>,
    seeds: Vec<u64>,
    output_path: Option<PathBuf>,
    quality_threshold: f64,
    cost_target: f64,
    enable_sona: bool,
    enable_hnsw: bool,
    repo_filter: Option<String>,
    verbose: bool,
}

#[derive(Debug)]
enum TaskSource {
    Sample,
    SweBenchLite,
    SweBenchFull,
    File(PathBuf),
}

fn parse_args(args: &[String]) -> Result<CliConfig, String> {
    let mut model_path: Option<PathBuf> = None;
    let mut task_source = TaskSource::Sample;
    let mut max_tasks = None;
    let mut ablation_modes = Vec::new();
    let mut seeds = vec![42, 123, 456];
    let mut output_path = None;
    let mut quality_threshold = 0.7;
    let mut cost_target = 0.10;
    let mut enable_sona = true;
    let mut enable_hnsw = true;
    let mut repo_filter = None;
    let mut verbose = false;

    let mut i = 0;
    while i < args.len() {
        match args[i].as_str() {
            "--model" => {
                i += 1;
                model_path = Some(PathBuf::from(args.get(i).ok_or("--model requires a path")?));
            }
            "--tasks" => {
                i += 1;
                let source = args.get(i).ok_or("--tasks requires a value")?;
                task_source = match source.as_str() {
                    "sample" => TaskSource::Sample,
                    "swe-bench-lite" => TaskSource::SweBenchLite,
                    "swe-bench" => TaskSource::SweBenchFull,
                    path => TaskSource::File(PathBuf::from(path)),
                };
            }
            "--max-tasks" => {
                i += 1;
                let n: usize = args
                    .get(i)
                    .ok_or("--max-tasks requires a number")?
                    .parse()
                    .map_err(|_| "Invalid number for --max-tasks")?;
                max_tasks = Some(n);
            }
            "--modes" => {
                i += 1;
                let modes_str = args.get(i).ok_or("--modes requires a value")?;
                ablation_modes = parse_modes(modes_str)?;
            }
            "--seeds" => {
                i += 1;
                let seeds_str = args.get(i).ok_or("--seeds requires a value")?;
                seeds = seeds_str
                    .split(',')
                    .map(|s| s.trim().parse().map_err(|_| "Invalid seed"))
                    .collect::<Result<Vec<_>, _>>()?;
            }
            "--output" => {
                i += 1;
                output_path = Some(PathBuf::from(
                    args.get(i).ok_or("--output requires a path")?,
                ));
            }
            "--quality-threshold" => {
                i += 1;
                quality_threshold = args
                    .get(i)
                    .ok_or("--quality-threshold requires a value")?
                    .parse()
                    .map_err(|_| "Invalid quality threshold")?;
            }
            "--cost-target" => {
                i += 1;
                cost_target = args
                    .get(i)
                    .ok_or("--cost-target requires a value")?
                    .parse()
                    .map_err(|_| "Invalid cost target")?;
            }
            "--repo" => {
                i += 1;
                repo_filter = Some(args.get(i).ok_or("--repo requires a value")?.clone());
            }
            "--no-sona" => enable_sona = false,
            "--no-hnsw" => enable_hnsw = false,
            "--verbose" => verbose = true,
            arg => {
                if arg.starts_with('-') {
                    return Err(format!("Unknown option: {}", arg));
                }
            }
        }
        i += 1;
    }

    let model_path = model_path.ok_or("--model is required")?;

    // Default to all modes if none were specified
    if ablation_modes.is_empty() {
        ablation_modes = vec![
            AblationMode::Baseline,
            AblationMode::RetrievalOnly,
            AblationMode::AdaptersOnly,
            AblationMode::RetrievalPlusAdapters,
            AblationMode::Full,
        ];
    }

    Ok(CliConfig {
        model_path,
        task_source,
        max_tasks,
        ablation_modes,
        seeds,
        output_path,
        quality_threshold,
        cost_target,
        enable_sona,
        enable_hnsw,
        repo_filter,
        verbose,
    })
}
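/// Map a comma-separated mode list (e.g. `"baseline,retrieval,full"`) to
/// `AblationMode` values. Hyphenated and underscored alias spellings are
/// accepted, matching the names shown in `--help`; an unknown name produces
/// an error that names the offending mode.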
fn parse_modes(modes_str: &str) -> Result<Vec<AblationMode>, String> {
    modes_str
        .split(',')
        .map(|s| match s.trim().to_lowercase().as_str() {
            "baseline" => Ok(AblationMode::Baseline),
            "retrieval" | "retrieval-only" | "retrieval_only" => Ok(AblationMode::RetrievalOnly),
            "adapters" | "adapters-only" | "adapters_only" => Ok(AblationMode::AdaptersOnly),
            "retrieval+adapters" | "retrieval_plus_adapters" => {
                Ok(AblationMode::RetrievalPlusAdapters)
            }
            "full" => Ok(AblationMode::Full),
            other => Err(format!("Unknown ablation mode: {}", other)),
        })
        .collect()
}
fn run_evaluation(config: CliConfig) -> Result<(), Box<dyn std::error::Error>> {
    println!("RuvLLM Evaluation");
    println!("=================\n");

    // Verify the model file exists before doing any heavier setup
    if !config.model_path.exists() {
        return Err(format!("Model not found: {}", config.model_path.display()).into());
    }
    println!("Model: {}", config.model_path.display());

    // Load tasks
    println!("\nLoading tasks...");
    let tasks = load_tasks(&config)?;
    println!("Loaded {} tasks", tasks.len());

    if config.verbose {
        for task in tasks.iter().take(5) {
            println!("  - {} ({})", task.id, task.repo);
        }
        if tasks.len() > 5 {
            println!("  ... and {} more", tasks.len() - 5);
        }
    }

    // Configure evaluation
    let eval_config = EvalConfig {
        task_count: config.max_tasks.unwrap_or(tasks.len()),
        seeds: config.seeds.clone(),
        ablation_modes: config.ablation_modes.clone(),
        quality_threshold: config.quality_threshold,
        cost_target: config.cost_target,
        ..Default::default()
    };

    println!("\nConfiguration:");
    println!("  Tasks: {}", eval_config.task_count);
    println!("  Seeds: {:?}", eval_config.seeds);
    println!(
        "  Modes: {:?}",
        eval_config
            .ablation_modes
            .iter()
            .map(|m| m.name())
            .collect::<Vec<_>>()
    );
    println!(
        "  Quality threshold: {:.0}%",
        eval_config.quality_threshold * 100.0
    );
    println!(
        "  SONA: {}",
        if config.enable_sona { "enabled" } else { "disabled" }
    );
    println!(
        "  HNSW: {}",
        if config.enable_hnsw { "enabled" } else { "disabled" }
    );

    // Configure inference
    let inference_config = RealInferenceConfig {
        model_path: config.model_path.to_string_lossy().to_string(),
        model_config: ModelConfig::default(),
        enable_sona: config.enable_sona,
        enable_hnsw: config.enable_hnsw,
        ..Default::default()
    };

    // Create the harness
    println!("\nInitializing evaluation harness...");
    let mut harness = RealEvaluationHarness::with_config(eval_config, inference_config)?;

    // Check that the model loaded
    if !harness.is_model_loaded() {
        return Err("Failed to load model".into());
    }
    println!("Model loaded successfully!");

    // Run the evaluation
    println!("\nRunning evaluation...");
    println!("This may take a while depending on model size and task count.\n");

    let runtime = tokio::runtime::Runtime::new()?;
    let report = runtime.block_on(harness.run_evaluation(&tasks))?;

    // Output results
    println!("\n{}", "=".repeat(60));
    println!("EVALUATION COMPLETE");
    println!("{}\n", "=".repeat(60));

    // Print summary
    println!("{}", report.summary());
    println!();

    // Print leaderboard
    println!("Leaderboard:");
    println!("{:-<60}", "");
    println!(
        "{:<5} {:<20} {:>10} {:>10} {:>10}",
        "Rank", "Mode", "Success%", "Quality", "$/patch"
    );
    println!("{:-<60}", "");
    for entry in report.to_leaderboard_entries() {
        println!(
            "{:<5} {:<20} {:>9.1}% {:>10.2} {:>10.4}",
            entry.rank,
            entry.mode.name(),
            entry.success_rate * 100.0,
            entry.quality_score,
            entry.cost_per_patch
        );
    }
    println!();

    // Print ablation analysis
    println!("Ablation Analysis vs Baseline:");
    for comparison in report.compare_all_to_baseline() {
        let direction = if comparison.success_delta > 0.0 { "+" } else { "" };
        let sig = if comparison.is_significant { "*" } else { "" };
        println!(
            "  {}: {}{:.1}%{} success rate",
            comparison.target.name(),
            direction,
            comparison.success_delta * 100.0,
            sig
        );
    }

    // Save to file if requested
    if let Some(output_path) = config.output_path {
        println!("\nSaving results to {}...", output_path.display());
        let json = report.to_json()?;
        std::fs::write(&output_path, json)?;
        println!("Results saved!");

        // Also save a markdown report alongside the JSON
        let md_path = output_path.with_extension("md");
        std::fs::write(&md_path, report.to_markdown())?;
        println!("Markdown report saved to {}", md_path.display());
    }

    Ok(())
}
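// `load_tasks` below substitutes sample tasks for the swe-bench sources; its
// comments note that a real implementation would download the dataset through
// an async API. A minimal, hedged sketch of the sync-to-async bridge is shown
// here: it only assumes *some* future that yields tasks. `block_on_download`
// and its `fut` parameter are illustrative names, not a RuvLLM API, and this
// helper is not called anywhere in this file.
fn block_on_download<F, T>(fut: F) -> Result<T, Box<dyn std::error::Error>>
where
    F: std::future::Future<Output = Result<T, Box<dyn std::error::Error>>>,
{
    // Build a throwaway runtime and drive the download future to completion.
    let runtime = tokio::runtime::Runtime::new()?;
    runtime.block_on(fut)
}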
fn load_tasks(config: &CliConfig) -> Result<Vec<EvalTask>, Box<dyn std::error::Error>> {
    let swe_config = SweBenchConfig {
        max_tasks: config.max_tasks,
        repo_filter: config.repo_filter.clone(),
        ..Default::default()
    };
    let loader = SweBenchLoader::new(swe_config);

    let tasks: Vec<EvalTask> = match &config.task_source {
        TaskSource::Sample => {
            println!("Using sample tasks (3 tasks)");
            SweBenchLoader::sample_tasks()
                .into_iter()
                .map(|t| t.into())
                .collect()
        }
        TaskSource::SweBenchLite => {
            println!("Loading SWE-bench-lite dataset...");
            // For now, use sample tasks, since we don't have async download in a sync context.
            // In a real implementation, we'd use tokio::runtime to download the dataset.
            println!("Note: Using sample tasks. Run with async for full dataset download.");
            SweBenchLoader::sample_tasks()
                .into_iter()
                .map(|t| t.into())
                .collect()
        }
        TaskSource::SweBenchFull => {
            println!("Loading full SWE-bench dataset...");
            println!("Note: Using sample tasks. Run with async for full dataset download.");
            SweBenchLoader::sample_tasks()
                .into_iter()
                .map(|t| t.into())
                .collect()
        }
        TaskSource::File(path) => {
            println!("Loading tasks from {}...", path.display());
            let swe_tasks = if path.extension().map_or(false, |e| e == "jsonl") {
                loader.load_from_jsonl(path)?
            } else {
                loader.load_from_file(path)?
            };

            // Print stats
            let stats = SweBenchLoader::stats(&swe_tasks);
            if config.verbose {
                println!("{}", stats);
            }

            swe_tasks.into_iter().map(|t| t.into()).collect()
        }
    };

    // Apply the max_tasks limit
    let tasks = if let Some(max) = config.max_tasks {
        tasks.into_iter().take(max).collect()
    } else {
        tasks
    };

    Ok(tasks)
}
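// A minimal test sketch for the pure parsing helpers defined above. These
// tests exercise only code in this file; `matches!` is used so that
// `AblationMode` does not need to implement `PartialEq`.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn parse_modes_accepts_aliases() {
        // Whitespace around entries is trimmed, and aliases map as documented.
        let modes = parse_modes("baseline, retrieval-only ,full").unwrap();
        assert_eq!(modes.len(), 3);
        assert!(matches!(modes[0], AblationMode::Baseline));
        assert!(matches!(modes[2], AblationMode::Full));
        // Unknown names are rejected with an error.
        assert!(parse_modes("bogus").is_err());
    }

    #[test]
    fn parse_args_requires_model_and_applies_defaults() {
        // Without --model, parsing fails.
        let args: Vec<String> = vec!["--tasks".into(), "sample".into()];
        assert!(parse_args(&args).is_err());

        // With only --model, the documented defaults apply.
        let args: Vec<String> = vec!["--model".into(), "m.gguf".into()];
        let config = parse_args(&args).unwrap();
        assert_eq!(config.seeds, vec![42, 123, 456]);
        assert!(config.enable_sona && config.enable_hnsw);
    }
}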