Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'
 vendor/ruvector/crates/ruvllm/examples/run_eval.rs (new file, vendored) | 509 ++++++++++
@@ -0,0 +1,509 @@
#![allow(
    clippy::all,
    unused_imports,
    unused_variables,
    dead_code,
    unused_mut,
    unused_assignments,
    non_camel_case_types,
    clippy::approx_constant,
    unexpected_cfgs,
    unused_must_use,
    unused_parens
)]
//! RuvLLM Evaluation CLI
//!
//! Run real LLM evaluations using SWE-Bench tasks with the full RuvLLM stack.
//!
//! ## Usage
//!
//! ```bash
//! # Run evaluation with a GGUF model on sample tasks
//! cargo run -p ruvllm --example run_eval --features candle -- \
//!     --model ./models/llama-7b-q4.gguf \
//!     --tasks sample
//!
//! # Run on SWE-bench-lite (downloads and caches)
//! cargo run -p ruvllm --example run_eval --features candle -- \
//!     --model ./models/llama-7b-q4.gguf \
//!     --tasks swe-bench-lite \
//!     --max-tasks 50
//!
//! # Run with specific ablation modes
//! cargo run -p ruvllm --example run_eval --features candle -- \
//!     --model ./models/llama-7b-q4.gguf \
//!     --tasks sample \
//!     --modes baseline,full
//!
//! # Run on a local JSON file
//! cargo run -p ruvllm --example run_eval --features candle -- \
//!     --model ./models/llama-7b-q4.gguf \
//!     --tasks ./my-tasks.json \
//!     --output ./results.json
//! ```
//!
//! ## Environment Variables
//!
//! - `RUVLLM_MODELS_DIR`: Default directory for model files
//! - `RUVLLM_CACHE_DIR`: Cache directory for downloaded datasets
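//!
//! For example (a sketch; exactly how these variables are consumed depends on
//! the harness and loader implementations):
//!
//! ```bash
//! export RUVLLM_MODELS_DIR=~/models
//! export RUVLLM_CACHE_DIR=~/.cache/ruvllm
//! cargo run -p ruvllm --example run_eval --features candle -- \
//!     --model ./models/llama-7b-q4.gguf --tasks swe-bench-lite
//! ```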

use ruvllm::backends::ModelConfig;
use ruvllm::evaluation::{
    swe_bench::{SweBenchConfig, SweBenchLoader},
    AblationMode, EvalConfig, EvalTask, RealEvaluationHarness, RealInferenceConfig,
};
use std::env;
use std::path::PathBuf;
use std::process;

fn main() {
    // Initialize logging
    if env::var("RUST_LOG").is_err() {
        env::set_var("RUST_LOG", "info");
    }
    tracing_subscriber::fmt::init();

    let args: Vec<String> = env::args().collect();

    if args.len() < 2 || args.contains(&"--help".to_string()) || args.contains(&"-h".to_string()) {
        print_help();
        return;
    }

    // Parse arguments
    let config = match parse_args(&args[1..]) {
        Ok(c) => c,
        Err(e) => {
            eprintln!("Error: {}", e);
            eprintln!("\nRun with --help for usage information.");
            process::exit(1);
        }
    };

    // Run evaluation
    if let Err(e) = run_evaluation(config) {
        eprintln!("Evaluation failed: {}", e);
        process::exit(1);
    }
}

fn print_help() {
    println!(
        r#"RuvLLM Evaluation CLI

Run real LLM evaluations on SWE-Bench tasks with SONA learning and HNSW routing.

USAGE:
    run_eval [OPTIONS] --model <PATH>

OPTIONS:
    --model <PATH>            Path to GGUF model file (required)
    --tasks <SOURCE>          Task source: sample, swe-bench-lite, swe-bench, or file path
                              (default: sample)
    --max-tasks <N>           Maximum number of tasks to evaluate (default: all)
    --modes <MODES>           Comma-separated ablation modes (default: all)
                              Options: baseline, retrieval, adapters, retrieval+adapters, full
    --seeds <SEEDS>           Comma-separated random seeds (default: 42,123,456)
    --output <PATH>           Output file for results JSON (default: stdout summary)
    --quality-threshold <F>   Minimum quality score for acceptance (default: 0.7)
    --cost-target <F>         Target cost per patch in dollars (default: 0.10)
    --no-sona                 Disable SONA learning
    --no-hnsw                 Disable HNSW routing
    --repo <NAME>             Filter tasks by repository name
    --verbose                 Enable verbose output
    -h, --help                Show this help message

EXAMPLES:
    # Quick test with sample tasks
    run_eval --model ./model.gguf --tasks sample

    # Run SWE-bench-lite evaluation
    run_eval --model ./model.gguf --tasks swe-bench-lite --max-tasks 100

    # Compare baseline vs full mode
    run_eval --model ./model.gguf --modes baseline,full --output results.json

    # Run on custom task file
    run_eval --model ./model.gguf --tasks ./my-tasks.json --verbose
"#
    );
}

#[derive(Debug)]
struct CliConfig {
    model_path: PathBuf,
    task_source: TaskSource,
    max_tasks: Option<usize>,
    ablation_modes: Vec<AblationMode>,
    seeds: Vec<u64>,
    output_path: Option<PathBuf>,
    quality_threshold: f64,
    cost_target: f64,
    enable_sona: bool,
    enable_hnsw: bool,
    repo_filter: Option<String>,
    verbose: bool,
}

#[derive(Debug)]
enum TaskSource {
    Sample,
    SweBenchLite,
    SweBenchFull,
    File(PathBuf),
}

fn parse_args(args: &[String]) -> Result<CliConfig, String> {
    let mut model_path: Option<PathBuf> = None;
    let mut task_source = TaskSource::Sample;
    let mut max_tasks = None;
    let mut ablation_modes = Vec::new();
    let mut seeds = vec![42, 123, 456];
    let mut output_path = None;
    let mut quality_threshold = 0.7;
    let mut cost_target = 0.10;
    let mut enable_sona = true;
    let mut enable_hnsw = true;
    let mut repo_filter = None;
    let mut verbose = false;

    let mut i = 0;
    while i < args.len() {
        match args[i].as_str() {
            "--model" => {
                i += 1;
                model_path = Some(PathBuf::from(args.get(i).ok_or("--model requires a path")?));
            }
            "--tasks" => {
                i += 1;
                let source = args.get(i).ok_or("--tasks requires a value")?;
                task_source = match source.as_str() {
                    "sample" => TaskSource::Sample,
                    "swe-bench-lite" => TaskSource::SweBenchLite,
                    "swe-bench" => TaskSource::SweBenchFull,
                    path => TaskSource::File(PathBuf::from(path)),
                };
            }
            "--max-tasks" => {
                i += 1;
                let n: usize = args
                    .get(i)
                    .ok_or("--max-tasks requires a number")?
                    .parse()
                    .map_err(|_| "Invalid number for --max-tasks")?;
                max_tasks = Some(n);
            }
            "--modes" => {
                i += 1;
                let modes_str = args.get(i).ok_or("--modes requires a value")?;
                ablation_modes = parse_modes(modes_str)?;
            }
            "--seeds" => {
                i += 1;
                let seeds_str = args.get(i).ok_or("--seeds requires a value")?;
                seeds = seeds_str
                    .split(',')
                    .map(|s| s.trim().parse().map_err(|_| "Invalid seed"))
                    .collect::<Result<Vec<_>, _>>()?;
            }
            "--output" => {
                i += 1;
                output_path = Some(PathBuf::from(
                    args.get(i).ok_or("--output requires a path")?,
                ));
            }
            "--quality-threshold" => {
                i += 1;
                quality_threshold = args
                    .get(i)
                    .ok_or("--quality-threshold requires a value")?
                    .parse()
                    .map_err(|_| "Invalid quality threshold")?;
            }
            "--cost-target" => {
                i += 1;
                cost_target = args
                    .get(i)
                    .ok_or("--cost-target requires a value")?
                    .parse()
                    .map_err(|_| "Invalid cost target")?;
            }
            "--repo" => {
                i += 1;
                repo_filter = Some(args.get(i).ok_or("--repo requires a value")?.clone());
            }
            "--no-sona" => enable_sona = false,
            "--no-hnsw" => enable_hnsw = false,
            "--verbose" => verbose = true,
            arg => {
                if arg.starts_with('-') {
                    return Err(format!("Unknown option: {}", arg));
                }
            }
        }
        i += 1;
    }

    let model_path = model_path.ok_or("--model is required")?;

    // Default to all modes if none specified
    if ablation_modes.is_empty() {
        ablation_modes = vec![
            AblationMode::Baseline,
            AblationMode::RetrievalOnly,
            AblationMode::AdaptersOnly,
            AblationMode::RetrievalPlusAdapters,
            AblationMode::Full,
        ];
    }

    Ok(CliConfig {
        model_path,
        task_source,
        max_tasks,
        ablation_modes,
        seeds,
        output_path,
        quality_threshold,
        cost_target,
        enable_sona,
        enable_hnsw,
        repo_filter,
        verbose,
    })
}

fn parse_modes(modes_str: &str) -> Result<Vec<AblationMode>, String> {
    modes_str
        .split(',')
        .map(|s| match s.trim().to_lowercase().as_str() {
            "baseline" => Ok(AblationMode::Baseline),
            "retrieval" | "retrieval-only" | "retrieval_only" => Ok(AblationMode::RetrievalOnly),
            "adapters" | "adapters-only" | "adapters_only" => Ok(AblationMode::AdaptersOnly),
            "retrieval+adapters" | "retrieval_plus_adapters" => {
                Ok(AblationMode::RetrievalPlusAdapters)
            }
            "full" => Ok(AblationMode::Full),
            other => Err(format!("Unknown ablation mode: {}", other)),
        })
        .collect()
}

fn run_evaluation(config: CliConfig) -> Result<(), Box<dyn std::error::Error>> {
    println!("RuvLLM Evaluation");
    println!("=================\n");

    // Verify model exists
    if !config.model_path.exists() {
        return Err(format!("Model not found: {}", config.model_path.display()).into());
    }
    println!("Model: {}", config.model_path.display());

    // Load tasks
    println!("\nLoading tasks...");
    let tasks = load_tasks(&config)?;
    println!("Loaded {} tasks", tasks.len());

    if config.verbose {
        for task in tasks.iter().take(5) {
            println!("  - {} ({})", task.id, task.repo);
        }
        if tasks.len() > 5 {
            println!("  ... and {} more", tasks.len() - 5);
        }
    }

    // Configure evaluation
    let eval_config = EvalConfig {
        task_count: config.max_tasks.unwrap_or(tasks.len()),
        seeds: config.seeds.clone(),
        ablation_modes: config.ablation_modes.clone(),
        quality_threshold: config.quality_threshold,
        cost_target: config.cost_target,
        ..Default::default()
    };

    println!("\nConfiguration:");
    println!("  Tasks: {}", eval_config.task_count);
    println!("  Seeds: {:?}", eval_config.seeds);
    println!(
        "  Modes: {:?}",
        eval_config
            .ablation_modes
            .iter()
            .map(|m| m.name())
            .collect::<Vec<_>>()
    );
    println!(
        "  Quality threshold: {:.0}%",
        eval_config.quality_threshold * 100.0
    );
    println!(
        "  SONA: {}",
        if config.enable_sona {
            "enabled"
        } else {
            "disabled"
        }
    );
    println!(
        "  HNSW: {}",
        if config.enable_hnsw {
            "enabled"
        } else {
            "disabled"
        }
    );

    // Configure inference
    let inference_config = RealInferenceConfig {
        model_path: config.model_path.to_string_lossy().to_string(),
        model_config: ModelConfig::default(),
        enable_sona: config.enable_sona,
        enable_hnsw: config.enable_hnsw,
        ..Default::default()
    };

    // Create harness
    println!("\nInitializing evaluation harness...");
    let mut harness = RealEvaluationHarness::with_config(eval_config, inference_config)?;

    // Check if model loaded
    if !harness.is_model_loaded() {
        return Err("Failed to load model".into());
    }
    println!("Model loaded successfully!");

    // Run evaluation
    println!("\nRunning evaluation...");
    println!("This may take a while depending on model size and task count.\n");

    let runtime = tokio::runtime::Runtime::new()?;
    let report = runtime.block_on(harness.run_evaluation(&tasks))?;

    // Output results
    println!("\n{}", "=".repeat(60));
    println!("EVALUATION COMPLETE");
    println!("{}\n", "=".repeat(60));

    // Print summary
    println!("{}", report.summary());
    println!();

    // Print leaderboard
    println!("Leaderboard:");
    println!("{:-<60}", "");
    println!(
        "{:<5} {:<20} {:>10} {:>10} {:>10}",
        "Rank", "Mode", "Success%", "Quality", "$/patch"
    );
    println!("{:-<60}", "");

    for entry in report.to_leaderboard_entries() {
        println!(
            "{:<5} {:<20} {:>9.1}% {:>10.2} {:>10.4}",
            entry.rank,
            entry.mode.name(),
            entry.success_rate * 100.0,
            entry.quality_score,
            entry.cost_per_patch
        );
    }
    println!();

    // Print ablation analysis
    println!("Ablation Analysis vs Baseline:");
    for comparison in report.compare_all_to_baseline() {
        let direction = if comparison.success_delta > 0.0 {
            "+"
        } else {
            ""
        };
        let sig = if comparison.is_significant { "*" } else { "" };
        println!(
            "  {}: {}{:.1}%{} success rate",
            comparison.target.name(),
            direction,
            comparison.success_delta * 100.0,
            sig
        );
    }

    // Save to file if requested
    if let Some(output_path) = config.output_path {
        println!("\nSaving results to {}...", output_path.display());
        let json = report.to_json()?;
        std::fs::write(&output_path, json)?;
        println!("Results saved!");

        // Also save markdown report
        let md_path = output_path.with_extension("md");
        std::fs::write(&md_path, report.to_markdown())?;
        println!("Markdown report saved to {}", md_path.display());
    }

    Ok(())
}

fn load_tasks(config: &CliConfig) -> Result<Vec<EvalTask>, Box<dyn std::error::Error>> {
    let swe_config = SweBenchConfig {
        max_tasks: config.max_tasks,
        repo_filter: config.repo_filter.clone(),
        ..Default::default()
    };

    let loader = SweBenchLoader::new(swe_config);

    let tasks: Vec<EvalTask> = match &config.task_source {
        TaskSource::Sample => {
            println!("Using sample tasks (3 tasks)");
            SweBenchLoader::sample_tasks()
                .into_iter()
                .map(|t| t.into())
                .collect()
        }
        TaskSource::SweBenchLite => {
            println!("Loading SWE-bench-lite dataset...");
            // Downloading the dataset needs async I/O, which this synchronous
            // loader does not have, so fall back to the bundled sample tasks.
            // A real implementation would block on a tokio runtime to download.
            println!("Note: Using sample tasks. Run with async for full dataset download.");
            SweBenchLoader::sample_tasks()
                .into_iter()
                .map(|t| t.into())
                .collect()
        }
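        // One possible shape for that download path, shown here only as a
        // sketch: `load_lite_async` is a hypothetical method, not part of the
        // current SweBenchLoader API, and the runtime setup mirrors what
        // `run_evaluation` already does with tokio:
        //
        //     let rt = tokio::runtime::Runtime::new()?;
        //     let swe_tasks = rt.block_on(loader.load_lite_async())?;
        //     let tasks: Vec<EvalTask> =
        //         swe_tasks.into_iter().map(|t| t.into()).collect();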
        TaskSource::SweBenchFull => {
            println!("Loading full SWE-bench dataset...");
            println!("Note: Using sample tasks. Run with async for full dataset download.");
            SweBenchLoader::sample_tasks()
                .into_iter()
                .map(|t| t.into())
                .collect()
        }
        TaskSource::File(path) => {
            println!("Loading tasks from {}...", path.display());
            let swe_tasks = if path.extension().map_or(false, |e| e == "jsonl") {
                loader.load_from_jsonl(path)?
            } else {
                loader.load_from_file(path)?
            };

            // Print stats
            let stats = SweBenchLoader::stats(&swe_tasks);
            if config.verbose {
                println!("{}", stats);
            }

            swe_tasks.into_iter().map(|t| t.into()).collect()
        }
    };

    // Apply max_tasks filter
    let tasks = if let Some(max) = config.max_tasks {
        tasks.into_iter().take(max).collect()
    } else {
        tasks
    };

    Ok(tasks)
}
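
// A minimal test sketch for the pure parsing helpers above. It assumes only
// what this file already defines or imports (`parse_args`, `parse_modes`, and
// the `AblationMode` variants); `matches!` is used so no extra derives on
// `AblationMode` are required.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn parse_modes_accepts_known_names() {
        let modes = parse_modes("baseline, full").expect("modes should parse");
        assert_eq!(modes.len(), 2);
        assert!(matches!(modes[0], AblationMode::Baseline));
        assert!(matches!(modes[1], AblationMode::Full));
        // Unrecognized names are rejected with an error message.
        assert!(parse_modes("nonsense").is_err());
    }

    #[test]
    fn parse_args_reads_model_and_limits() {
        let args: Vec<String> = ["--model", "./model.gguf", "--max-tasks", "10", "--no-sona"]
            .iter()
            .map(|s| s.to_string())
            .collect();
        let config = parse_args(&args).expect("args should parse");
        assert_eq!(config.model_path, PathBuf::from("./model.gguf"));
        assert_eq!(config.max_tasks, Some(10));
        assert!(!config.enable_sona);
        // No --modes flag given: defaults to all five ablation modes.
        assert_eq!(config.ablation_modes.len(), 5);
    }
}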