510 lines
16 KiB
Rust
510 lines
16 KiB
Rust
// NOTE(review): these crate-wide suppressions silence both rustc and Clippy
// diagnostics; this file does not record why — confirm each is still needed.
#![allow(clippy::all, clippy::approx_constant)]
#![allow(
    dead_code,
    non_camel_case_types,
    unexpected_cfgs,
    unused_assignments,
    unused_imports,
    unused_must_use,
    unused_mut,
    unused_parens,
    unused_variables
)]
|
|
//! RuvLLM Evaluation CLI
//!
//! Run real LLM evaluations using SWE-Bench tasks with the full RuvLLM stack.
//!
//! ## Usage
//!
//! ```bash
//! # Run evaluation with a GGUF model on sample tasks
//! cargo run -p ruvllm --example run_eval --features candle -- \
//!     --model ./models/llama-7b-q4.gguf \
//!     --tasks sample
//!
//! # Run on SWE-bench-lite (downloads and caches)
//! cargo run -p ruvllm --example run_eval --features candle -- \
//!     --model ./models/llama-7b-q4.gguf \
//!     --tasks swe-bench-lite \
//!     --max-tasks 50
//!
//! # Run with specific ablation modes
//! cargo run -p ruvllm --example run_eval --features candle -- \
//!     --model ./models/llama-7b-q4.gguf \
//!     --tasks sample \
//!     --modes baseline,full
//!
//! # Run on local JSON file
//! cargo run -p ruvllm --example run_eval --features candle -- \
//!     --model ./models/llama-7b-q4.gguf \
//!     --tasks ./my-tasks.json \
//!     --output ./results.json
//! ```
//!
//! ## Environment Variables
//!
//! - `RUVLLM_MODELS_DIR`: Default directory for model files
//! - `RUVLLM_CACHE_DIR`: Cache directory for downloaded datasets
|
|
|
|
use ruvllm::backends::ModelConfig;
|
|
use ruvllm::evaluation::{
|
|
swe_bench::{SweBenchConfig, SweBenchLoader},
|
|
AblationMode, EvalConfig, EvalTask, RealEvaluationHarness, RealInferenceConfig,
|
|
};
|
|
use std::env;
|
|
use std::path::PathBuf;
|
|
use std::process;
|
|
|
|
fn main() {
|
|
// Initialize logging
|
|
if env::var("RUST_LOG").is_err() {
|
|
env::set_var("RUST_LOG", "info");
|
|
}
|
|
tracing_subscriber::fmt::init();
|
|
|
|
let args: Vec<String> = env::args().collect();
|
|
|
|
if args.len() < 2 || args.contains(&"--help".to_string()) || args.contains(&"-h".to_string()) {
|
|
print_help();
|
|
return;
|
|
}
|
|
|
|
// Parse arguments
|
|
let config = match parse_args(&args[1..]) {
|
|
Ok(c) => c,
|
|
Err(e) => {
|
|
eprintln!("Error: {}", e);
|
|
eprintln!("\nRun with --help for usage information.");
|
|
process::exit(1);
|
|
}
|
|
};
|
|
|
|
// Run evaluation
|
|
if let Err(e) = run_evaluation(config) {
|
|
eprintln!("Evaluation failed: {}", e);
|
|
process::exit(1);
|
|
}
|
|
}
|
|
|
|
fn print_help() {
|
|
println!(
|
|
r#"RuvLLM Evaluation CLI
|
|
|
|
Run real LLM evaluations on SWE-Bench tasks with SONA learning and HNSW routing.
|
|
|
|
USAGE:
|
|
run_eval [OPTIONS] --model <PATH>
|
|
|
|
OPTIONS:
|
|
--model <PATH> Path to GGUF model file (required)
|
|
--tasks <SOURCE> Task source: sample, swe-bench-lite, swe-bench, or file path
|
|
(default: sample)
|
|
--max-tasks <N> Maximum number of tasks to evaluate (default: all)
|
|
--modes <MODES> Comma-separated ablation modes (default: all)
|
|
Options: baseline, retrieval, adapters, retrieval+adapters, full
|
|
--seeds <SEEDS> Comma-separated random seeds (default: 42,123,456)
|
|
--output <PATH> Output file for results JSON (default: stdout summary)
|
|
--quality-threshold <F> Minimum quality score for acceptance (default: 0.7)
|
|
--cost-target <F> Target cost per patch in dollars (default: 0.10)
|
|
--no-sona Disable SONA learning
|
|
--no-hnsw Disable HNSW routing
|
|
--repo <NAME> Filter tasks by repository name
|
|
--verbose Enable verbose output
|
|
-h, --help Show this help message
|
|
|
|
EXAMPLES:
|
|
# Quick test with sample tasks
|
|
run_eval --model ./model.gguf --tasks sample
|
|
|
|
# Run SWE-bench-lite evaluation
|
|
run_eval --model ./model.gguf --tasks swe-bench-lite --max-tasks 100
|
|
|
|
# Compare baseline vs full mode
|
|
run_eval --model ./model.gguf --modes baseline,full --output results.json
|
|
|
|
# Run on custom task file
|
|
run_eval --model ./model.gguf --tasks ./my-tasks.json --verbose
|
|
"#
|
|
);
|
|
}
|
|
|
|
#[derive(Debug)]
|
|
struct CliConfig {
|
|
model_path: PathBuf,
|
|
task_source: TaskSource,
|
|
max_tasks: Option<usize>,
|
|
ablation_modes: Vec<AblationMode>,
|
|
seeds: Vec<u64>,
|
|
output_path: Option<PathBuf>,
|
|
quality_threshold: f64,
|
|
cost_target: f64,
|
|
enable_sona: bool,
|
|
enable_hnsw: bool,
|
|
repo_filter: Option<String>,
|
|
verbose: bool,
|
|
}
|
|
|
|
/// Origin of the evaluation tasks, as selected with `--tasks`.
#[derive(Debug)]
enum TaskSource {
    /// `--tasks sample` (also the default when `--tasks` is absent).
    Sample,
    /// `--tasks swe-bench-lite`.
    SweBenchLite,
    /// `--tasks swe-bench`.
    SweBenchFull,
    /// Any other `--tasks` value, interpreted as a path to a task file.
    File(PathBuf),
}
|
|
|
|
fn parse_args(args: &[String]) -> Result<CliConfig, String> {
|
|
let mut model_path: Option<PathBuf> = None;
|
|
let mut task_source = TaskSource::Sample;
|
|
let mut max_tasks = None;
|
|
let mut ablation_modes = Vec::new();
|
|
let mut seeds = vec![42, 123, 456];
|
|
let mut output_path = None;
|
|
let mut quality_threshold = 0.7;
|
|
let mut cost_target = 0.10;
|
|
let mut enable_sona = true;
|
|
let mut enable_hnsw = true;
|
|
let mut repo_filter = None;
|
|
let mut verbose = false;
|
|
|
|
let mut i = 0;
|
|
while i < args.len() {
|
|
match args[i].as_str() {
|
|
"--model" => {
|
|
i += 1;
|
|
model_path = Some(PathBuf::from(args.get(i).ok_or("--model requires a path")?));
|
|
}
|
|
"--tasks" => {
|
|
i += 1;
|
|
let source = args.get(i).ok_or("--tasks requires a value")?;
|
|
task_source = match source.as_str() {
|
|
"sample" => TaskSource::Sample,
|
|
"swe-bench-lite" => TaskSource::SweBenchLite,
|
|
"swe-bench" => TaskSource::SweBenchFull,
|
|
path => TaskSource::File(PathBuf::from(path)),
|
|
};
|
|
}
|
|
"--max-tasks" => {
|
|
i += 1;
|
|
let n: usize = args
|
|
.get(i)
|
|
.ok_or("--max-tasks requires a number")?
|
|
.parse()
|
|
.map_err(|_| "Invalid number for --max-tasks")?;
|
|
max_tasks = Some(n);
|
|
}
|
|
"--modes" => {
|
|
i += 1;
|
|
let modes_str = args.get(i).ok_or("--modes requires a value")?;
|
|
ablation_modes = parse_modes(modes_str)?;
|
|
}
|
|
"--seeds" => {
|
|
i += 1;
|
|
let seeds_str = args.get(i).ok_or("--seeds requires a value")?;
|
|
seeds = seeds_str
|
|
.split(',')
|
|
.map(|s| s.trim().parse().map_err(|_| "Invalid seed"))
|
|
.collect::<Result<Vec<_>, _>>()?;
|
|
}
|
|
"--output" => {
|
|
i += 1;
|
|
output_path = Some(PathBuf::from(
|
|
args.get(i).ok_or("--output requires a path")?,
|
|
));
|
|
}
|
|
"--quality-threshold" => {
|
|
i += 1;
|
|
quality_threshold = args
|
|
.get(i)
|
|
.ok_or("--quality-threshold requires a value")?
|
|
.parse()
|
|
.map_err(|_| "Invalid quality threshold")?;
|
|
}
|
|
"--cost-target" => {
|
|
i += 1;
|
|
cost_target = args
|
|
.get(i)
|
|
.ok_or("--cost-target requires a value")?
|
|
.parse()
|
|
.map_err(|_| "Invalid cost target")?;
|
|
}
|
|
"--repo" => {
|
|
i += 1;
|
|
repo_filter = Some(args.get(i).ok_or("--repo requires a value")?.clone());
|
|
}
|
|
"--no-sona" => enable_sona = false,
|
|
"--no-hnsw" => enable_hnsw = false,
|
|
"--verbose" => verbose = true,
|
|
arg => {
|
|
if arg.starts_with('-') {
|
|
return Err(format!("Unknown option: {}", arg));
|
|
}
|
|
}
|
|
}
|
|
i += 1;
|
|
}
|
|
|
|
let model_path = model_path.ok_or("--model is required")?;
|
|
|
|
// Default to all modes if none specified
|
|
if ablation_modes.is_empty() {
|
|
ablation_modes = vec![
|
|
AblationMode::Baseline,
|
|
AblationMode::RetrievalOnly,
|
|
AblationMode::AdaptersOnly,
|
|
AblationMode::RetrievalPlusAdapters,
|
|
AblationMode::Full,
|
|
];
|
|
}
|
|
|
|
Ok(CliConfig {
|
|
model_path,
|
|
task_source,
|
|
max_tasks,
|
|
ablation_modes,
|
|
seeds,
|
|
output_path,
|
|
quality_threshold,
|
|
cost_target,
|
|
enable_sona,
|
|
enable_hnsw,
|
|
repo_filter,
|
|
verbose,
|
|
})
|
|
}
|
|
|
|
fn parse_modes(modes_str: &str) -> Result<Vec<AblationMode>, String> {
|
|
modes_str
|
|
.split(',')
|
|
.map(|s| match s.trim().to_lowercase().as_str() {
|
|
"baseline" => Ok(AblationMode::Baseline),
|
|
"retrieval" | "retrieval-only" | "retrieval_only" => Ok(AblationMode::RetrievalOnly),
|
|
"adapters" | "adapters-only" | "adapters_only" => Ok(AblationMode::AdaptersOnly),
|
|
"retrieval+adapters" | "retrieval_plus_adapters" => {
|
|
Ok(AblationMode::RetrievalPlusAdapters)
|
|
}
|
|
"full" => Ok(AblationMode::Full),
|
|
other => Err(format!("Unknown ablation mode: {}", other)),
|
|
})
|
|
.collect()
|
|
}
|
|
|
|
fn run_evaluation(config: CliConfig) -> Result<(), Box<dyn std::error::Error>> {
|
|
println!("RuvLLM Evaluation");
|
|
println!("=================\n");
|
|
|
|
// Verify model exists
|
|
if !config.model_path.exists() {
|
|
return Err(format!("Model not found: {}", config.model_path.display()).into());
|
|
}
|
|
println!("Model: {}", config.model_path.display());
|
|
|
|
// Load tasks
|
|
println!("\nLoading tasks...");
|
|
let tasks = load_tasks(&config)?;
|
|
println!("Loaded {} tasks", tasks.len());
|
|
|
|
if config.verbose {
|
|
for task in tasks.iter().take(5) {
|
|
println!(" - {} ({})", task.id, task.repo);
|
|
}
|
|
if tasks.len() > 5 {
|
|
println!(" ... and {} more", tasks.len() - 5);
|
|
}
|
|
}
|
|
|
|
// Configure evaluation
|
|
let eval_config = EvalConfig {
|
|
task_count: config.max_tasks.unwrap_or(tasks.len()),
|
|
seeds: config.seeds.clone(),
|
|
ablation_modes: config.ablation_modes.clone(),
|
|
quality_threshold: config.quality_threshold,
|
|
cost_target: config.cost_target,
|
|
..Default::default()
|
|
};
|
|
|
|
println!("\nConfiguration:");
|
|
println!(" Tasks: {}", eval_config.task_count);
|
|
println!(" Seeds: {:?}", eval_config.seeds);
|
|
println!(
|
|
" Modes: {:?}",
|
|
eval_config
|
|
.ablation_modes
|
|
.iter()
|
|
.map(|m| m.name())
|
|
.collect::<Vec<_>>()
|
|
);
|
|
println!(
|
|
" Quality threshold: {:.0}%",
|
|
eval_config.quality_threshold * 100.0
|
|
);
|
|
println!(
|
|
" SONA: {}",
|
|
if config.enable_sona {
|
|
"enabled"
|
|
} else {
|
|
"disabled"
|
|
}
|
|
);
|
|
println!(
|
|
" HNSW: {}",
|
|
if config.enable_hnsw {
|
|
"enabled"
|
|
} else {
|
|
"disabled"
|
|
}
|
|
);
|
|
|
|
// Configure inference
|
|
let inference_config = RealInferenceConfig {
|
|
model_path: config.model_path.to_string_lossy().to_string(),
|
|
model_config: ModelConfig::default(),
|
|
enable_sona: config.enable_sona,
|
|
enable_hnsw: config.enable_hnsw,
|
|
..Default::default()
|
|
};
|
|
|
|
// Create harness
|
|
println!("\nInitializing evaluation harness...");
|
|
let mut harness = RealEvaluationHarness::with_config(eval_config, inference_config)?;
|
|
|
|
// Check if model loaded
|
|
if !harness.is_model_loaded() {
|
|
return Err("Failed to load model".into());
|
|
}
|
|
println!("Model loaded successfully!");
|
|
|
|
// Run evaluation
|
|
println!("\nRunning evaluation...");
|
|
println!("This may take a while depending on model size and task count.\n");
|
|
|
|
let runtime = tokio::runtime::Runtime::new()?;
|
|
let report = runtime.block_on(harness.run_evaluation(&tasks))?;
|
|
|
|
// Output results
|
|
println!("\n{}", "=".repeat(60));
|
|
println!("EVALUATION COMPLETE");
|
|
println!("{}\n", "=".repeat(60));
|
|
|
|
// Print summary
|
|
println!("{}", report.summary());
|
|
println!();
|
|
|
|
// Print leaderboard
|
|
println!("Leaderboard:");
|
|
println!("{:-<60}", "");
|
|
println!(
|
|
"{:<5} {:<20} {:>10} {:>10} {:>10}",
|
|
"Rank", "Mode", "Success%", "Quality", "$/patch"
|
|
);
|
|
println!("{:-<60}", "");
|
|
|
|
for entry in report.to_leaderboard_entries() {
|
|
println!(
|
|
"{:<5} {:<20} {:>9.1}% {:>10.2} {:>10.4}",
|
|
entry.rank,
|
|
entry.mode.name(),
|
|
entry.success_rate * 100.0,
|
|
entry.quality_score,
|
|
entry.cost_per_patch
|
|
);
|
|
}
|
|
println!();
|
|
|
|
// Print ablation analysis
|
|
println!("Ablation Analysis vs Baseline:");
|
|
for comparison in report.compare_all_to_baseline() {
|
|
let direction = if comparison.success_delta > 0.0 {
|
|
"+"
|
|
} else {
|
|
""
|
|
};
|
|
let sig = if comparison.is_significant { "*" } else { "" };
|
|
println!(
|
|
" {}: {}{:.1}%{} success rate",
|
|
comparison.target.name(),
|
|
direction,
|
|
comparison.success_delta * 100.0,
|
|
sig
|
|
);
|
|
}
|
|
|
|
// Save to file if requested
|
|
if let Some(output_path) = config.output_path {
|
|
println!("\nSaving results to {}...", output_path.display());
|
|
let json = report.to_json()?;
|
|
std::fs::write(&output_path, json)?;
|
|
println!("Results saved!");
|
|
|
|
// Also save markdown report
|
|
let md_path = output_path.with_extension("md");
|
|
std::fs::write(&md_path, report.to_markdown())?;
|
|
println!("Markdown report saved to {}", md_path.display());
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
fn load_tasks(config: &CliConfig) -> Result<Vec<EvalTask>, Box<dyn std::error::Error>> {
|
|
let swe_config = SweBenchConfig {
|
|
max_tasks: config.max_tasks,
|
|
repo_filter: config.repo_filter.clone(),
|
|
..Default::default()
|
|
};
|
|
|
|
let loader = SweBenchLoader::new(swe_config);
|
|
|
|
let tasks: Vec<EvalTask> = match &config.task_source {
|
|
TaskSource::Sample => {
|
|
println!("Using sample tasks (3 tasks)");
|
|
SweBenchLoader::sample_tasks()
|
|
.into_iter()
|
|
.map(|t| t.into())
|
|
.collect()
|
|
}
|
|
TaskSource::SweBenchLite => {
|
|
println!("Loading SWE-bench-lite dataset...");
|
|
// For now, use sample tasks since we don't have async download in sync context
|
|
// In a real implementation, we'd use tokio::runtime to download
|
|
println!("Note: Using sample tasks. Run with async for full dataset download.");
|
|
SweBenchLoader::sample_tasks()
|
|
.into_iter()
|
|
.map(|t| t.into())
|
|
.collect()
|
|
}
|
|
TaskSource::SweBenchFull => {
|
|
println!("Loading full SWE-bench dataset...");
|
|
println!("Note: Using sample tasks. Run with async for full dataset download.");
|
|
SweBenchLoader::sample_tasks()
|
|
.into_iter()
|
|
.map(|t| t.into())
|
|
.collect()
|
|
}
|
|
TaskSource::File(path) => {
|
|
println!("Loading tasks from {}...", path.display());
|
|
let swe_tasks = if path.extension().map_or(false, |e| e == "jsonl") {
|
|
loader.load_from_jsonl(path)?
|
|
} else {
|
|
loader.load_from_file(path)?
|
|
};
|
|
|
|
// Print stats
|
|
let stats = SweBenchLoader::stats(&swe_tasks);
|
|
if config.verbose {
|
|
println!("{}", stats);
|
|
}
|
|
|
|
swe_tasks.into_iter().map(|t| t.into()).collect()
|
|
}
|
|
};
|
|
|
|
// Apply max_tasks filter
|
|
let tasks = if let Some(max) = config.max_tasks {
|
|
tasks.into_iter().take(max).collect()
|
|
} else {
|
|
tasks
|
|
};
|
|
|
|
Ok(tasks)
|
|
}
|