Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'
This commit is contained in:
166
vendor/ruvector/examples/benchmarks/src/bin/acceptance_rvf.rs
vendored
Normal file
166
vendor/ruvector/examples/benchmarks/src/bin/acceptance_rvf.rs
vendored
Normal file
@@ -0,0 +1,166 @@
|
||||
//! Publishable RVF Acceptance Test — CLI entry point.
|
||||
//!
|
||||
//! Generates or verifies a deterministic acceptance test manifest with
|
||||
//! SHAKE-256 witness chain (rvf-crypto native). Same seed → same outcomes
|
||||
//! → same root hash.
|
||||
//!
|
||||
//! ```bash
|
||||
//! # Generate manifest (JSON + .rvf binary)
|
||||
//! cargo run --bin acceptance-rvf -- generate -o manifest.json
|
||||
//!
|
||||
//! # Generate with custom config
|
||||
//! cargo run --bin acceptance-rvf -- generate -o manifest.json \
|
||||
//! --holdout 200 --training 200 --cycles 5
|
||||
//!
|
||||
//! # Verify a manifest (re-runs and compares root hash)
|
||||
//! cargo run --bin acceptance-rvf -- verify -i manifest.json
|
||||
//!
|
||||
//! # Verify the .rvf binary witness chain
|
||||
//! cargo run --bin acceptance-rvf -- verify-rvf -i acceptance_manifest.rvf
|
||||
//! ```
|
||||
|
||||
use clap::{Parser, Subcommand};
|
||||
use ruvector_benchmarks::acceptance_test::HoldoutConfig;
|
||||
use ruvector_benchmarks::publishable_rvf::{
|
||||
generate_manifest_with_rvf, verify_manifest, verify_rvf_binary,
|
||||
};
|
||||
|
||||
#[derive(Parser)]
|
||||
#[command(name = "acceptance-rvf")]
|
||||
#[command(about = "Publishable RVF acceptance test with SHAKE-256 witness chain")]
|
||||
struct Cli {
|
||||
#[command(subcommand)]
|
||||
command: Commands,
|
||||
}
|
||||
|
||||
#[derive(Subcommand)]
|
||||
enum Commands {
|
||||
/// Generate a new acceptance test manifest (JSON + .rvf binary)
|
||||
Generate {
|
||||
/// Output JSON file path
|
||||
#[arg(short, long, default_value = "acceptance_manifest.json")]
|
||||
output: String,
|
||||
|
||||
/// Holdout set size
|
||||
#[arg(long, default_value_t = 200)]
|
||||
holdout: usize,
|
||||
|
||||
/// Training puzzles per cycle
|
||||
#[arg(long, default_value_t = 200)]
|
||||
training: usize,
|
||||
|
||||
/// Number of training cycles
|
||||
#[arg(long, default_value_t = 5)]
|
||||
cycles: usize,
|
||||
|
||||
/// Step budget per puzzle
|
||||
#[arg(long, default_value_t = 400)]
|
||||
budget: usize,
|
||||
|
||||
/// Verbose output
|
||||
#[arg(short, long)]
|
||||
verbose: bool,
|
||||
},
|
||||
/// Verify an existing manifest by replaying and comparing root hash
|
||||
Verify {
|
||||
/// Input JSON file path
|
||||
#[arg(short, long)]
|
||||
input: String,
|
||||
},
|
||||
/// Verify a native .rvf binary witness chain
|
||||
VerifyRvf {
|
||||
/// Input .rvf file path
|
||||
#[arg(short, long)]
|
||||
input: String,
|
||||
},
|
||||
}
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
let cli = Cli::parse();
|
||||
|
||||
match cli.command {
|
||||
Commands::Generate {
|
||||
output,
|
||||
holdout,
|
||||
training,
|
||||
cycles,
|
||||
budget,
|
||||
verbose,
|
||||
} => {
|
||||
let config = HoldoutConfig {
|
||||
holdout_size: holdout,
|
||||
training_per_cycle: training,
|
||||
cycles,
|
||||
step_budget: budget,
|
||||
min_accuracy: 0.50,
|
||||
min_dimensions_improved: 1,
|
||||
verbose,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
// Derive .rvf path from JSON output path
|
||||
let rvf_path = output.replace(".json", ".rvf");
|
||||
|
||||
println!("Generating acceptance test manifest...");
|
||||
println!(
|
||||
" holdout={}, training={}, cycles={}, budget={}",
|
||||
holdout, training, cycles, budget
|
||||
);
|
||||
println!();
|
||||
|
||||
let manifest = generate_manifest_with_rvf(&config, Some(&rvf_path))?;
|
||||
manifest.print_summary();
|
||||
|
||||
let json = serde_json::to_string_pretty(&manifest)?;
|
||||
std::fs::write(&output, &json)?;
|
||||
println!(" JSON manifest: {}", output);
|
||||
println!(" RVF binary: {}", rvf_path);
|
||||
println!(" Chain root hash: {}", manifest.chain_root_hash);
|
||||
println!();
|
||||
|
||||
if manifest.all_passed {
|
||||
std::process::exit(0);
|
||||
} else {
|
||||
std::process::exit(1);
|
||||
}
|
||||
}
|
||||
Commands::Verify { input } => {
|
||||
println!("Loading manifest from: {}", input);
|
||||
let json = std::fs::read_to_string(&input)?;
|
||||
let manifest: ruvector_benchmarks::publishable_rvf::RvfManifest =
|
||||
serde_json::from_str(&json)?;
|
||||
|
||||
println!(" Chain length: {}", manifest.chain_length);
|
||||
println!(
|
||||
" Expected root: {}",
|
||||
&manifest.chain_root_hash[..32.min(manifest.chain_root_hash.len())]
|
||||
);
|
||||
println!();
|
||||
println!("Re-running acceptance test with same config...");
|
||||
|
||||
let result = verify_manifest(&manifest)?;
|
||||
result.print();
|
||||
|
||||
if result.passed() {
|
||||
println!(" VERIFICATION: PASSED — outcomes are identical");
|
||||
std::process::exit(0);
|
||||
} else {
|
||||
println!(" VERIFICATION: FAILED — outcomes differ");
|
||||
std::process::exit(1);
|
||||
}
|
||||
}
|
||||
Commands::VerifyRvf { input } => {
|
||||
println!("Verifying .rvf witness chain: {}", input);
|
||||
match verify_rvf_binary(&input) {
|
||||
Ok(count) => {
|
||||
println!(" WITNESS_SEG verified: {} entries, chain intact", count);
|
||||
std::process::exit(0);
|
||||
}
|
||||
Err(e) => {
|
||||
println!(" VERIFICATION FAILED: {}", e);
|
||||
std::process::exit(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
204
vendor/ruvector/examples/benchmarks/src/bin/agi_proof_harness.rs
vendored
Normal file
204
vendor/ruvector/examples/benchmarks/src/bin/agi_proof_harness.rs
vendored
Normal file
@@ -0,0 +1,204 @@
|
||||
//! AGI Proof Harness — Nightly runner that publishes contract metrics.
|
||||
//!
|
||||
//! Publishes:
|
||||
//! - Success rate
|
||||
//! - Cost per solve
|
||||
//! - Robustness under noise
|
||||
//! - Policy compliance
|
||||
//! - Contradiction rate
|
||||
//! - Rollback correctness
|
||||
//! - Viability checklist status
|
||||
//! - Autonomy level
|
||||
//!
|
||||
//! Usage:
|
||||
//! cargo run --bin agi-proof-harness
|
||||
//! cargo run --bin agi-proof-harness -- --holdout 1000 --cycles 10 --verbose
|
||||
//! cargo run --bin agi-proof-harness -- --full # 10K training, 1K holdout, 10 cycles
|
||||
|
||||
use anyhow::Result;
|
||||
use clap::Parser;
|
||||
use ruvector_benchmarks::acceptance_test::{
|
||||
run_ablation_comparison, run_acceptance_test, HoldoutConfig,
|
||||
};
|
||||
use ruvector_benchmarks::agi_contract::{AutonomyEvaluator, ContractHealth, ViabilityChecklist};
|
||||
use ruvector_benchmarks::intelligence_metrics::IntelligenceCalculator;
|
||||
use ruvector_benchmarks::superintelligence::{run_pathway, SIConfig};
|
||||
|
||||
#[derive(Parser, Debug)]
|
||||
#[command(name = "agi-proof-harness")]
|
||||
#[command(about = "AGI contract proof harness — publishes nightly metrics")]
|
||||
struct Args {
|
||||
/// Holdout evaluation set size
|
||||
#[arg(long, default_value = "200")]
|
||||
holdout: usize,
|
||||
|
||||
/// Training tasks per cycle
|
||||
#[arg(long, default_value = "200")]
|
||||
training: usize,
|
||||
|
||||
/// Number of improvement cycles
|
||||
#[arg(long, default_value = "5")]
|
||||
cycles: usize,
|
||||
|
||||
/// Frozen holdout seed
|
||||
#[arg(long, default_value = "3735928559")]
|
||||
holdout_seed: u64,
|
||||
|
||||
/// Training seed
|
||||
#[arg(long, default_value = "42")]
|
||||
training_seed: u64,
|
||||
|
||||
/// Noise injection rate
|
||||
#[arg(long, default_value = "0.25")]
|
||||
noise: f64,
|
||||
|
||||
/// Step budget per task
|
||||
#[arg(long, default_value = "400")]
|
||||
step_budget: usize,
|
||||
|
||||
/// Full acceptance test (10K training, 1K holdout, 10 cycles)
|
||||
#[arg(long)]
|
||||
full: bool,
|
||||
|
||||
/// Minimum accuracy threshold
|
||||
#[arg(long, default_value = "0.80")]
|
||||
min_accuracy: f64,
|
||||
|
||||
/// Run three-mode ablation comparison (A/B/C)
|
||||
#[arg(long)]
|
||||
ablation: bool,
|
||||
|
||||
/// Also run the 5-level SI pathway
|
||||
#[arg(long)]
|
||||
pathway: bool,
|
||||
|
||||
/// Verbose output
|
||||
#[arg(short, long)]
|
||||
verbose: bool,
|
||||
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let args = Args::parse();
|
||||
|
||||
println!();
|
||||
println!("╔══════════════════════════════════════════════════════════════╗");
|
||||
println!("║ AGI PROOF HARNESS ║");
|
||||
println!("║ Contract-based intelligence measurement ║");
|
||||
println!("╚══════════════════════════════════════════════════════════════╝");
|
||||
println!();
|
||||
|
||||
let config = if args.full {
|
||||
HoldoutConfig {
|
||||
holdout_size: 1000,
|
||||
training_per_cycle: 1000,
|
||||
cycles: 10,
|
||||
holdout_seed: args.holdout_seed,
|
||||
training_seed: args.training_seed,
|
||||
noise_rate: args.noise,
|
||||
step_budget: args.step_budget,
|
||||
min_accuracy: 0.95,
|
||||
min_dimensions_improved: 2,
|
||||
verbose: args.verbose,
|
||||
}
|
||||
} else {
|
||||
HoldoutConfig {
|
||||
holdout_size: args.holdout,
|
||||
training_per_cycle: args.training,
|
||||
cycles: args.cycles,
|
||||
holdout_seed: args.holdout_seed,
|
||||
training_seed: args.training_seed,
|
||||
noise_rate: args.noise,
|
||||
step_budget: args.step_budget,
|
||||
min_accuracy: args.min_accuracy,
|
||||
min_dimensions_improved: 2,
|
||||
verbose: args.verbose,
|
||||
}
|
||||
};
|
||||
|
||||
println!(
|
||||
" Config: holdout={}, training/cycle={}, cycles={}, noise={:.0}%",
|
||||
config.holdout_size,
|
||||
config.training_per_cycle,
|
||||
config.cycles,
|
||||
config.noise_rate * 100.0
|
||||
);
|
||||
println!(
|
||||
" Seeds: holdout=0x{:X}, training={}",
|
||||
config.holdout_seed, config.training_seed
|
||||
);
|
||||
println!();
|
||||
|
||||
// ─── Run Acceptance Test ─────────────────────────────────────────
|
||||
println!(" Running acceptance test...");
|
||||
let result = run_acceptance_test(&config)?;
|
||||
result.print();
|
||||
|
||||
// ─── Ablation Comparison ─────────────────────────────────────────
|
||||
if args.ablation {
|
||||
println!(" Running ablation comparison (A / B / C)...");
|
||||
let comparison = run_ablation_comparison(&config)?;
|
||||
comparison.print();
|
||||
}
|
||||
|
||||
// ─── Contract Health Summary ─────────────────────────────────────
|
||||
if let Some(last_cycle) = result.cycles.last() {
|
||||
println!();
|
||||
last_cycle.contract_health.print();
|
||||
|
||||
// ─── Autonomy Level ──────────────────────────────────────────
|
||||
let health_history: Vec<ContractHealth> = result
|
||||
.cycles
|
||||
.iter()
|
||||
.map(|c| c.contract_health.clone())
|
||||
.collect();
|
||||
let evaluator = AutonomyEvaluator::default();
|
||||
let level = evaluator.evaluate(&health_history);
|
||||
println!();
|
||||
evaluator.print_status(level, &last_cycle.contract_health);
|
||||
|
||||
// ─── Viability Checklist ─────────────────────────────────────
|
||||
let viability = ViabilityChecklist::evaluate(&health_history);
|
||||
println!();
|
||||
viability.print();
|
||||
}
|
||||
|
||||
// ─── Optional: SI Pathway ────────────────────────────────────────
|
||||
if args.pathway {
|
||||
println!();
|
||||
println!(" Running 5-level SI pathway...");
|
||||
let si_config = SIConfig {
|
||||
episodes_per_level: 6,
|
||||
tasks_per_episode: 15,
|
||||
verbose: args.verbose,
|
||||
..Default::default()
|
||||
};
|
||||
let pathway_result = run_pathway(&si_config)?;
|
||||
pathway_result.print();
|
||||
|
||||
// Show contract health for peak level
|
||||
if let Some(peak) = pathway_result
|
||||
.levels
|
||||
.iter()
|
||||
.max_by(|a, b| a.iq_score.partial_cmp(&b.iq_score).unwrap())
|
||||
{
|
||||
let health = ContractHealth::from_raw(&peak.raw_metrics);
|
||||
println!(" Peak Level ({}) Contract:", peak.name);
|
||||
health.print();
|
||||
|
||||
let calculator = IntelligenceCalculator::default();
|
||||
let assessment = calculator.calculate(&peak.raw_metrics);
|
||||
println!(" Multi-dimensional IQ: {:.1}", assessment.overall_score);
|
||||
println!(
|
||||
" Cost efficiency: {:.2}",
|
||||
assessment.cost.cost_efficiency
|
||||
);
|
||||
println!(
|
||||
" Robustness score: {:.2}",
|
||||
assessment.robustness.robustness_score
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
println!();
|
||||
Ok(())
|
||||
}
|
||||
355
vendor/ruvector/examples/benchmarks/src/bin/intelligence_assessment.rs
vendored
Normal file
355
vendor/ruvector/examples/benchmarks/src/bin/intelligence_assessment.rs
vendored
Normal file
@@ -0,0 +1,355 @@
|
||||
//! Intelligence Assessment Runner
|
||||
//!
|
||||
//! Runs comprehensive intelligence assessment across all benchmark types.
|
||||
//!
|
||||
//! Usage:
|
||||
//! cargo run --bin intelligence-assessment -- --episodes 10 --puzzles 50
|
||||
|
||||
use anyhow::Result;
|
||||
use clap::Parser;
|
||||
use ruvector_benchmarks::{
|
||||
intelligence_metrics::{
|
||||
print_intelligence_report, DifficultyStats, EpisodeMetrics, IntelligenceCalculator,
|
||||
RawMetrics,
|
||||
},
|
||||
swarm_regret::SwarmController,
|
||||
temporal::{AdaptiveSolver, TemporalSolver},
|
||||
timepuzzles::{PuzzleGenerator, PuzzleGeneratorConfig},
|
||||
};
|
||||
|
||||
#[derive(Parser, Debug)]
|
||||
#[command(name = "intelligence-assessment")]
|
||||
#[command(about = "Run comprehensive intelligence assessment")]
|
||||
struct Args {
|
||||
/// Number of episodes for regret tracking
|
||||
#[arg(short, long, default_value = "10")]
|
||||
episodes: usize,
|
||||
|
||||
/// Tasks per episode
|
||||
#[arg(short, long, default_value = "10")]
|
||||
tasks_per_episode: usize,
|
||||
|
||||
/// Enable calendar tool
|
||||
#[arg(long, default_value = "true")]
|
||||
calendar: bool,
|
||||
|
||||
/// Enable adaptive learning (ReasoningBank)
|
||||
#[arg(long, default_value = "true")]
|
||||
adaptive: bool,
|
||||
|
||||
/// Random seed
|
||||
#[arg(long)]
|
||||
seed: Option<u64>,
|
||||
|
||||
/// Verbose output
|
||||
#[arg(short, long)]
|
||||
verbose: bool,
|
||||
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let args = Args::parse();
|
||||
|
||||
println!("╔══════════════════════════════════════════════════════════════╗");
|
||||
println!("║ Comprehensive Intelligence Assessment ║");
|
||||
println!("║ Measuring Reasoning, Learning & Cognitive Abilities ║");
|
||||
println!("╚══════════════════════════════════════════════════════════════╝");
|
||||
println!();
|
||||
|
||||
// Initialize metrics collector
|
||||
let mut raw_metrics = RawMetrics::default();
|
||||
|
||||
// Initialize components
|
||||
let mut controller = SwarmController::new(args.tasks_per_episode);
|
||||
|
||||
// Choose solver based on adaptive flag
|
||||
let mut adaptive_solver = if args.adaptive {
|
||||
Some(AdaptiveSolver::new())
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let mut basic_solver = if !args.adaptive {
|
||||
let mut s = TemporalSolver::with_tools(args.calendar, false);
|
||||
s.max_steps = 100;
|
||||
Some(s)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let puzzle_config = PuzzleGeneratorConfig {
|
||||
min_difficulty: 1,
|
||||
max_difficulty: 10,
|
||||
constraint_density: 3,
|
||||
seed: args.seed,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
println!("🔧 Configuration:");
|
||||
println!(" Episodes: {}", args.episodes);
|
||||
println!(" Tasks/episode: {}", args.tasks_per_episode);
|
||||
println!(" Calendar tool: {}", args.calendar);
|
||||
println!(" Adaptive learning:{}", args.adaptive);
|
||||
println!();
|
||||
|
||||
println!("🏃 Running assessment...");
|
||||
println!();
|
||||
|
||||
// Run episodes
|
||||
for ep in 0..args.episodes {
|
||||
controller.start_episode();
|
||||
|
||||
// Generate puzzles for this episode
|
||||
let mut generator = PuzzleGenerator::new(puzzle_config.clone());
|
||||
let puzzles = generator.generate_batch(args.tasks_per_episode)?;
|
||||
|
||||
let mut solved = 0;
|
||||
let mut correct = 0;
|
||||
let mut total_steps = 0;
|
||||
let mut total_tool_calls = 0;
|
||||
let mut total_latency = 0u64;
|
||||
|
||||
// Solve puzzles and collect metrics
|
||||
for puzzle in &puzzles {
|
||||
raw_metrics.tasks_attempted += 1;
|
||||
|
||||
// Use adaptive or basic solver
|
||||
let result = if let Some(ref mut solver) = adaptive_solver {
|
||||
solver.solve(puzzle)?
|
||||
} else if let Some(ref mut solver) = basic_solver {
|
||||
solver.solve(puzzle)?
|
||||
} else {
|
||||
unreachable!()
|
||||
};
|
||||
|
||||
if result.solved {
|
||||
solved += 1;
|
||||
raw_metrics.tasks_completed += 1;
|
||||
}
|
||||
if result.correct {
|
||||
correct += 1;
|
||||
raw_metrics.tasks_correct += 1;
|
||||
}
|
||||
|
||||
total_steps += result.steps;
|
||||
total_tool_calls += result.tool_calls;
|
||||
total_latency += result.latency_ms;
|
||||
|
||||
raw_metrics.total_steps += result.steps;
|
||||
raw_metrics.total_tool_calls += result.tool_calls;
|
||||
raw_metrics.total_latency_ms += result.latency_ms;
|
||||
|
||||
// Track by difficulty
|
||||
let entry = raw_metrics
|
||||
.by_difficulty
|
||||
.entry(puzzle.difficulty)
|
||||
.or_insert(DifficultyStats {
|
||||
attempted: 0,
|
||||
completed: 0,
|
||||
correct: 0,
|
||||
avg_steps: 0.0,
|
||||
});
|
||||
entry.attempted += 1;
|
||||
if result.solved {
|
||||
entry.completed += 1;
|
||||
}
|
||||
if result.correct {
|
||||
entry.correct += 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Record episode for swarm controller
|
||||
controller.complete_episode(
|
||||
solved,
|
||||
correct,
|
||||
total_steps,
|
||||
total_tool_calls,
|
||||
total_latency,
|
||||
);
|
||||
|
||||
// Record episode metrics
|
||||
let episode_accuracy = if args.tasks_per_episode > 0 {
|
||||
correct as f64 / args.tasks_per_episode as f64
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
let last_ep = controller.regret.episodes.last().unwrap();
|
||||
raw_metrics.episodes.push(EpisodeMetrics {
|
||||
episode: ep + 1,
|
||||
accuracy: episode_accuracy,
|
||||
reward: last_ep.reward,
|
||||
regret: last_ep.regret(),
|
||||
cumulative_regret: controller.regret.current_cumulative_regret(),
|
||||
});
|
||||
|
||||
if args.verbose {
|
||||
println!(
|
||||
" Episode {:2}: Accuracy {:.1}%, Regret {:.2}",
|
||||
ep + 1,
|
||||
episode_accuracy * 100.0,
|
||||
last_ep.regret()
|
||||
);
|
||||
} else {
|
||||
print!(".");
|
||||
use std::io::Write;
|
||||
std::io::stdout().flush()?;
|
||||
}
|
||||
}
|
||||
|
||||
if !args.verbose {
|
||||
println!();
|
||||
}
|
||||
println!();
|
||||
|
||||
// Update difficulty stats with average steps
|
||||
for (_, stats) in raw_metrics.by_difficulty.iter_mut() {
|
||||
if stats.attempted > 0 {
|
||||
// This is a simplification - we'd need to track this properly
|
||||
stats.avg_steps = raw_metrics.total_steps as f64 / raw_metrics.tasks_attempted as f64;
|
||||
}
|
||||
}
|
||||
|
||||
// Calculate intelligence assessment
|
||||
let calculator = IntelligenceCalculator::default();
|
||||
let assessment = calculator.calculate(&raw_metrics);
|
||||
|
||||
// Print report
|
||||
print_intelligence_report(&assessment);
|
||||
|
||||
// Additional insights
|
||||
println!();
|
||||
println!("╔══════════════════════════════════════════════════════════════╗");
|
||||
println!("║ Performance Summary ║");
|
||||
println!("╚══════════════════════════════════════════════════════════════╝");
|
||||
println!();
|
||||
|
||||
println!("📊 Task Performance:");
|
||||
println!(" Tasks Attempted: {}", raw_metrics.tasks_attempted);
|
||||
println!(" Tasks Completed: {}", raw_metrics.tasks_completed);
|
||||
println!(" Tasks Correct: {}", raw_metrics.tasks_correct);
|
||||
println!(
|
||||
" Overall Accuracy: {:.1}%",
|
||||
raw_metrics.tasks_correct as f64 / raw_metrics.tasks_attempted as f64 * 100.0
|
||||
);
|
||||
println!();
|
||||
|
||||
println!("📈 Learning Progress:");
|
||||
let regret_summary = controller.regret.summary();
|
||||
println!(" Cumulative Regret: {:.2}", regret_summary.total_regret);
|
||||
println!(" Average Regret: {:.4}", regret_summary.average_regret);
|
||||
println!(
|
||||
" Sublinear: {}",
|
||||
if regret_summary.is_sublinear {
|
||||
"Yes ✓"
|
||||
} else {
|
||||
"No ✗"
|
||||
}
|
||||
);
|
||||
println!(
|
||||
" Regret Trend: {:.4} ({})",
|
||||
regret_summary.regret_trend,
|
||||
if regret_summary.regret_trend < 0.0 {
|
||||
"decreasing ✓"
|
||||
} else {
|
||||
"increasing ✗"
|
||||
}
|
||||
);
|
||||
println!();
|
||||
|
||||
// Grade the overall performance
|
||||
let grade = if assessment.overall_score >= 90.0 {
|
||||
"A+ (Excellent)"
|
||||
} else if assessment.overall_score >= 80.0 {
|
||||
"A (Very Good)"
|
||||
} else if assessment.overall_score >= 70.0 {
|
||||
"B (Good)"
|
||||
} else if assessment.overall_score >= 60.0 {
|
||||
"C (Adequate)"
|
||||
} else if assessment.overall_score >= 50.0 {
|
||||
"D (Below Average)"
|
||||
} else {
|
||||
"F (Needs Improvement)"
|
||||
};
|
||||
|
||||
println!("🎯 Final Grade: {}", grade);
|
||||
println!();
|
||||
|
||||
// Recommendations
|
||||
println!("💡 Recommendations:");
|
||||
if assessment.capabilities.temporal_reasoning < 70.0 {
|
||||
println!(" • Improve temporal reasoning with more constraint examples");
|
||||
}
|
||||
if assessment.learning.regret_sublinearity < 0.5 {
|
||||
println!(" • Increase episodes to achieve sublinear regret");
|
||||
}
|
||||
if assessment.tool_use.utilization_effectiveness < 0.7 {
|
||||
println!(" • Better tool selection needed for complex tasks");
|
||||
}
|
||||
if assessment.meta_cognition.strategy_adaptation < 0.5 {
|
||||
println!(" • Enable adaptive strategy switching");
|
||||
}
|
||||
if assessment.overall_score >= 70.0 {
|
||||
println!(" • Good performance! Consider harder difficulty levels");
|
||||
}
|
||||
|
||||
// Show adaptive learning progress if enabled
|
||||
if let Some(ref solver) = adaptive_solver {
|
||||
println!();
|
||||
println!("╔══════════════════════════════════════════════════════════════╗");
|
||||
println!("║ Adaptive Learning Progress ║");
|
||||
println!("╚══════════════════════════════════════════════════════════════╝");
|
||||
println!();
|
||||
|
||||
let progress = solver.learning_progress();
|
||||
println!("🧠 ReasoningBank Statistics:");
|
||||
println!(" Total trajectories: {}", progress.total_trajectories);
|
||||
println!(
|
||||
" Success rate: {:.1}%",
|
||||
progress.success_rate * 100.0
|
||||
);
|
||||
println!(" Improvement rate: {:.4}", progress.improvement_rate);
|
||||
println!(" Patterns learned: {}", progress.patterns_learned);
|
||||
println!(" Strategies tried: {}", progress.strategies_tried);
|
||||
println!(
|
||||
" Is improving: {}",
|
||||
if progress.is_improving {
|
||||
"Yes ✓"
|
||||
} else {
|
||||
"No ✗"
|
||||
}
|
||||
);
|
||||
|
||||
// Show learned patterns
|
||||
if !solver.reasoning_bank.patterns.is_empty() {
|
||||
println!();
|
||||
println!("📚 Learned Patterns:");
|
||||
for (constraint_type, patterns) in &solver.reasoning_bank.patterns {
|
||||
for p in patterns.iter().filter(|p| p.observations >= 3) {
|
||||
println!(
|
||||
" • {}: {} strategy ({:.0}% success, {} obs)",
|
||||
constraint_type,
|
||||
p.best_strategy,
|
||||
p.success_rate * 100.0,
|
||||
p.observations
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Show strategy stats
|
||||
if !solver.reasoning_bank.strategy_stats.is_empty() {
|
||||
println!();
|
||||
println!("📊 Strategy Performance:");
|
||||
for (strategy, stats) in &solver.reasoning_bank.strategy_stats {
|
||||
println!(
|
||||
" • {}: {:.1}% success ({} attempts, {:.1} avg steps)",
|
||||
strategy,
|
||||
stats.success_rate() * 100.0,
|
||||
stats.attempts,
|
||||
stats.avg_steps()
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
180
vendor/ruvector/examples/benchmarks/src/bin/rvf_intelligence_bench.rs
vendored
Normal file
180
vendor/ruvector/examples/benchmarks/src/bin/rvf_intelligence_bench.rs
vendored
Normal file
@@ -0,0 +1,180 @@
|
||||
//! RVF Intelligence Benchmark Runner
|
||||
//!
|
||||
//! Runs head-to-head comparison across 6 intelligence verticals:
|
||||
//! Baseline (no learning) vs. RVF-Learning (full pipeline).
|
||||
//!
|
||||
//! Usage:
|
||||
//! cargo run --bin rvf-intelligence-bench -- --episodes 15 --tasks 25 --verbose
|
||||
//! cargo run --bin rvf-intelligence-bench -- --noise 0.4 --step-budget 300
|
||||
|
||||
use anyhow::Result;
|
||||
use clap::Parser;
|
||||
use ruvector_benchmarks::intelligence_metrics::IntelligenceCalculator;
|
||||
use ruvector_benchmarks::rvf_intelligence_bench::{run_comparison, BenchmarkConfig};
|
||||
|
||||
#[derive(Parser, Debug)]
|
||||
#[command(name = "rvf-intelligence-bench")]
|
||||
#[command(about = "Benchmark intelligence with and without RVF learning across 6 verticals")]
|
||||
struct Args {
|
||||
/// Number of episodes per mode
|
||||
#[arg(short, long, default_value = "10")]
|
||||
episodes: usize,
|
||||
|
||||
/// Tasks per episode
|
||||
#[arg(short, long, default_value = "20")]
|
||||
tasks: usize,
|
||||
|
||||
/// Minimum difficulty (1-10)
|
||||
#[arg(long, default_value = "1")]
|
||||
min_diff: u8,
|
||||
|
||||
/// Maximum difficulty (1-10)
|
||||
#[arg(long, default_value = "10")]
|
||||
max_diff: u8,
|
||||
|
||||
/// Random seed for reproducibility
|
||||
#[arg(long, default_value = "42")]
|
||||
seed: u64,
|
||||
|
||||
/// Noise probability (0.0-1.0)
|
||||
#[arg(long, default_value = "0.25")]
|
||||
noise: f64,
|
||||
|
||||
/// Step budget per episode
|
||||
#[arg(long, default_value = "400")]
|
||||
step_budget: usize,
|
||||
|
||||
/// Max retries for error recovery (RVF only)
|
||||
#[arg(long, default_value = "2")]
|
||||
max_retries: usize,
|
||||
|
||||
/// Retention fraction (0.0-1.0)
|
||||
#[arg(long, default_value = "0.15")]
|
||||
retention: f64,
|
||||
|
||||
/// Token budget per episode (RVF mode)
|
||||
#[arg(long, default_value = "200000")]
|
||||
token_budget: u32,
|
||||
|
||||
/// Tool call budget per episode (RVF mode)
|
||||
#[arg(long, default_value = "50")]
|
||||
tool_budget: u16,
|
||||
|
||||
/// Verbose per-episode output
|
||||
#[arg(short, long)]
|
||||
verbose: bool,
|
||||
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let args = Args::parse();
|
||||
|
||||
println!();
|
||||
println!("================================================================");
|
||||
println!(" RVF Intelligence Benchmark v2 — Six Verticals");
|
||||
println!(" Baseline vs. RVF-Learning (noise + step limits + retry + transfer)");
|
||||
println!("================================================================");
|
||||
println!();
|
||||
println!(" Configuration:");
|
||||
println!(" Episodes: {}", args.episodes);
|
||||
println!(" Tasks/episode: {}", args.tasks);
|
||||
println!(" Difficulty: {}-{}", args.min_diff, args.max_diff);
|
||||
println!(" Seed: {}", args.seed);
|
||||
println!(" Noise prob: {:.0}%", args.noise * 100.0);
|
||||
println!(" Step budget/ep: {}", args.step_budget);
|
||||
println!(" Max retries: {}", args.max_retries);
|
||||
println!(" Retention: {:.0}%", args.retention * 100.0);
|
||||
println!();
|
||||
|
||||
let config = BenchmarkConfig {
|
||||
episodes: args.episodes,
|
||||
tasks_per_episode: args.tasks,
|
||||
min_difficulty: args.min_diff,
|
||||
max_difficulty: args.max_diff,
|
||||
seed: Some(args.seed),
|
||||
token_budget: args.token_budget,
|
||||
tool_call_budget: args.tool_budget,
|
||||
verbose: args.verbose,
|
||||
noise_probability: args.noise,
|
||||
step_budget_per_episode: args.step_budget,
|
||||
max_retries: args.max_retries,
|
||||
retention_fraction: args.retention,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
println!(" Phase 1/2: Running baseline (no learning)...");
|
||||
let report = run_comparison(&config)?;
|
||||
|
||||
// Print comparison report
|
||||
report.print();
|
||||
|
||||
// Full IQ assessment
|
||||
let calculator = IntelligenceCalculator::default();
|
||||
|
||||
println!("----------------------------------------------------------------");
|
||||
println!(" Detailed Intelligence Assessment: Baseline");
|
||||
println!("----------------------------------------------------------------");
|
||||
let base_assessment = calculator.calculate(&report.baseline.raw_metrics);
|
||||
print_compact_assessment(&base_assessment);
|
||||
|
||||
println!();
|
||||
println!("----------------------------------------------------------------");
|
||||
println!(" Detailed Intelligence Assessment: RVF-Learning");
|
||||
println!("----------------------------------------------------------------");
|
||||
let rvf_assessment = calculator.calculate(&report.rvf_learning.raw_metrics);
|
||||
print_compact_assessment(&rvf_assessment);
|
||||
|
||||
// Final IQ comparison
|
||||
println!();
|
||||
println!("================================================================");
|
||||
println!(" Intelligence Score Comparison");
|
||||
println!("================================================================");
|
||||
println!(
|
||||
" Baseline IQ Score: {:.1}/100",
|
||||
base_assessment.overall_score
|
||||
);
|
||||
println!(
|
||||
" RVF-Learning IQ Score: {:.1}/100",
|
||||
rvf_assessment.overall_score
|
||||
);
|
||||
let iq_delta = rvf_assessment.overall_score - base_assessment.overall_score;
|
||||
println!(" Delta: {:+.1}", iq_delta);
|
||||
println!();
|
||||
|
||||
if iq_delta > 10.0 {
|
||||
println!(" >> RVF learning loop provides a DRAMATIC intelligence boost.");
|
||||
} else if iq_delta > 5.0 {
|
||||
println!(" >> RVF learning loop provides a SIGNIFICANT intelligence boost.");
|
||||
} else if iq_delta > 1.0 {
|
||||
println!(" >> RVF learning loop provides a MEASURABLE intelligence improvement.");
|
||||
} else if iq_delta > 0.0 {
|
||||
println!(" >> RVF learning loop provides a MARGINAL intelligence gain.");
|
||||
} else {
|
||||
println!(" >> Performance is comparable. Increase noise or reduce step budget.");
|
||||
}
|
||||
println!();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn print_compact_assessment(a: &ruvector_benchmarks::intelligence_metrics::IntelligenceAssessment) {
|
||||
println!(" Overall Score: {:.1}/100", a.overall_score);
|
||||
println!(
|
||||
" Reasoning: coherence={:.2}, efficiency={:.2}, error_rate={:.2}",
|
||||
a.reasoning.logical_coherence, a.reasoning.reasoning_efficiency, a.reasoning.error_rate,
|
||||
);
|
||||
println!(
|
||||
" Learning: sample_eff={:.2}, regret_sub={:.2}, rate={:.2}, gen={:.2}",
|
||||
a.learning.sample_efficiency,
|
||||
a.learning.regret_sublinearity,
|
||||
a.learning.learning_rate,
|
||||
a.learning.generalization,
|
||||
);
|
||||
println!(
|
||||
" Capabilities: pattern={:.1}, planning={:.1}, adaptation={:.1}",
|
||||
a.capabilities.pattern_recognition, a.capabilities.planning, a.capabilities.adaptation,
|
||||
);
|
||||
println!(
|
||||
" Meta-cog: self_correct={:.2}, strategy_adapt={:.2}",
|
||||
a.meta_cognition.self_correction_rate, a.meta_cognition.strategy_adaptation,
|
||||
);
|
||||
}
|
||||
135
vendor/ruvector/examples/benchmarks/src/bin/superintelligence.rs
vendored
Normal file
135
vendor/ruvector/examples/benchmarks/src/bin/superintelligence.rs
vendored
Normal file
@@ -0,0 +1,135 @@
|
||||
//! Superintelligence Pathway Runner
|
||||
//!
|
||||
//! Runs a 5-level recursive intelligence amplification pipeline and tracks
|
||||
//! IQ progression from foundation (~85) toward superintelligence (~98+).
|
||||
//!
|
||||
//! Usage:
|
||||
//! cargo run --bin superintelligence -- --verbose
|
||||
//! cargo run --bin superintelligence -- --episodes 15 --tasks 30 --target 95
|
||||
|
||||
use anyhow::Result;
|
||||
use clap::Parser;
|
||||
use ruvector_benchmarks::intelligence_metrics::IntelligenceCalculator;
|
||||
use ruvector_benchmarks::superintelligence::{run_pathway, SIConfig};
|
||||
|
||||
// CLI arguments for the superintelligence pathway runner.
// NOTE: the `///` doc comments on each field double as clap's --help text,
// so they are left untouched; review notes use plain `//` comments.
#[derive(Parser, Debug)]
#[command(name = "superintelligence")]
#[command(about = "Run 5-level superintelligence pathway with IQ tracking")]
struct Args {
    /// Episodes per level
    #[arg(short, long, default_value = "12")]
    episodes: usize,

    /// Tasks per episode
    #[arg(short, long, default_value = "25")]
    tasks: usize,

    /// Random seed
    // Fixed default (42) makes runs reproducible unless overridden.
    #[arg(long, default_value = "42")]
    seed: u64,

    /// Noise injection rate (0.0-1.0)
    // NOTE(review): the 0.0-1.0 range is documented but not validated here —
    // presumably enforced (or tolerated) downstream in run_pathway; confirm.
    #[arg(long, default_value = "0.25")]
    noise: f64,

    /// Step budget per episode
    #[arg(long, default_value = "400")]
    step_budget: usize,

    /// Target IQ score
    #[arg(long, default_value = "98.0")]
    target: f64,

    /// Ensemble size for Level 3
    #[arg(long, default_value = "4")]
    ensemble: usize,

    /// Recursive improvement cycles for Level 4
    #[arg(long, default_value = "3")]
    cycles: usize,

    /// Adversarial pressure multiplier for Level 5
    #[arg(long, default_value = "1.5")]
    pressure: f64,

    /// Verbose per-episode output
    #[arg(short, long)]
    verbose: bool,
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let args = Args::parse();
|
||||
|
||||
println!();
|
||||
println!("╔══════════════════════════════════════════════════════════════╗");
|
||||
println!("║ SUPERINTELLIGENCE PATHWAY ENGINE ║");
|
||||
println!("║ 5-Level Recursive Intelligence Amplification ║");
|
||||
println!("╚══════════════════════════════════════════════════════════════╝");
|
||||
println!();
|
||||
println!(
|
||||
" Config: {} eps/level x {} tasks, noise={:.0}%, target IQ={:.0}",
|
||||
args.episodes,
|
||||
args.tasks,
|
||||
args.noise * 100.0,
|
||||
args.target
|
||||
);
|
||||
println!(
|
||||
" Ensemble={}, Cycles={}, Pressure={:.1}",
|
||||
args.ensemble, args.cycles, args.pressure
|
||||
);
|
||||
println!();
|
||||
|
||||
let config = SIConfig {
|
||||
episodes_per_level: args.episodes,
|
||||
tasks_per_episode: args.tasks,
|
||||
seed: args.seed,
|
||||
noise_rate: args.noise,
|
||||
step_budget: args.step_budget,
|
||||
target_iq: args.target,
|
||||
ensemble_size: args.ensemble,
|
||||
recursive_cycles: args.cycles,
|
||||
adversarial_pressure: args.pressure,
|
||||
verbose: args.verbose,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = run_pathway(&config)?;
|
||||
result.print();
|
||||
|
||||
// Detailed assessment for peak level
|
||||
let calculator = IntelligenceCalculator::default();
|
||||
if let Some(peak) = result
|
||||
.levels
|
||||
.iter()
|
||||
.max_by(|a, b| a.iq_score.partial_cmp(&b.iq_score).unwrap())
|
||||
{
|
||||
println!(" Peak Level ({}) Assessment:", peak.name);
|
||||
let assessment = calculator.calculate(&peak.raw_metrics);
|
||||
println!(
|
||||
" Reasoning: coherence={:.2}, efficiency={:.2}, error_rate={:.2}",
|
||||
assessment.reasoning.logical_coherence,
|
||||
assessment.reasoning.reasoning_efficiency,
|
||||
assessment.reasoning.error_rate
|
||||
);
|
||||
println!(
|
||||
" Learning: sample_eff={:.2}, regret_sub={:.2}, rate={:.2}",
|
||||
assessment.learning.sample_efficiency,
|
||||
assessment.learning.regret_sublinearity,
|
||||
assessment.learning.learning_rate
|
||||
);
|
||||
println!(
|
||||
" Capabilities: pattern={:.1}, planning={:.1}, adaptation={:.1}",
|
||||
assessment.capabilities.pattern_recognition,
|
||||
assessment.capabilities.planning,
|
||||
assessment.capabilities.adaptation
|
||||
);
|
||||
println!(
|
||||
" Meta-cog: self_correct={:.2}, strategy_adapt={:.2}",
|
||||
assessment.meta_cognition.self_correction_rate,
|
||||
assessment.meta_cognition.strategy_adaptation
|
||||
);
|
||||
println!();
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
247
vendor/ruvector/examples/benchmarks/src/bin/swarm_regret.rs
vendored
Normal file
247
vendor/ruvector/examples/benchmarks/src/bin/swarm_regret.rs
vendored
Normal file
@@ -0,0 +1,247 @@
|
||||
//! Swarm Regret Tracking Runner
|
||||
//!
|
||||
//! Track sublinear regret across episodes for swarm controller evaluation.
|
||||
//!
|
||||
//! Usage:
|
||||
//! cargo run --bin swarm-regret -- --episodes 20 --tasks-per-episode 20
|
||||
|
||||
use anyhow::Result;
|
||||
use clap::Parser;
|
||||
use ruvector_benchmarks::{
|
||||
logging::BenchmarkLogger,
|
||||
swarm_regret::SwarmController,
|
||||
temporal::TemporalSolver,
|
||||
timepuzzles::{PuzzleGenerator, PuzzleGeneratorConfig},
|
||||
};
|
||||
use std::time::Instant;
|
||||
|
||||
// CLI arguments for the swarm regret tracker.
// NOTE: the `///` doc comments double as clap's --help text.
#[derive(Parser, Debug)]
#[command(name = "swarm-regret")]
#[command(about = "Track sublinear regret for swarm controller")]
struct Args {
    /// Number of episodes to run
    #[arg(short, long, default_value = "20")]
    episodes: usize,

    /// Tasks per episode
    #[arg(short, long, default_value = "20")]
    tasks_per_episode: usize,

    /// Enable calendar tool
    // NOTE(review): a bool arg with default_value = "true" may not be
    // disableable from the CLI without an explicit ArgAction::Set in clap v4
    // — confirm the intended off-switch behavior.
    #[arg(long, default_value = "true")]
    calendar: bool,

    /// Enable web search tool
    #[arg(long, default_value = "false")]
    web_search: bool,

    /// Maximum steps per task
    #[arg(long, default_value = "100")]
    max_steps: usize,

    /// Random seed
    // None means no fixed seed — the run is not reproducible.
    #[arg(long)]
    seed: Option<u64>,

    /// Output log file
    // The summary JSON path is derived from this by swapping the .jsonl suffix.
    #[arg(short, long, default_value = "logs/swarm_regret.jsonl")]
    output: String,

    /// Verbose output
    #[arg(short, long)]
    verbose: bool,
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let args = Args::parse();
|
||||
|
||||
println!("╔══════════════════════════════════════════════════════════════╗");
|
||||
println!("║ Swarm Controller Regret Tracking ║");
|
||||
println!("║ Sublinear Regret for Multi-Agent Control ║");
|
||||
println!("╚══════════════════════════════════════════════════════════════╝");
|
||||
println!();
|
||||
|
||||
// Initialize
|
||||
let mut logger = BenchmarkLogger::new(&args.output)?;
|
||||
logger.log_system("INFO", "Starting regret tracking", "swarm-regret")?;
|
||||
|
||||
let mut controller = SwarmController::new(args.tasks_per_episode);
|
||||
let mut solver = TemporalSolver::with_tools(args.calendar, args.web_search);
|
||||
solver.max_steps = args.max_steps;
|
||||
|
||||
let puzzle_config = PuzzleGeneratorConfig {
|
||||
min_difficulty: 1,
|
||||
max_difficulty: 10,
|
||||
constraint_density: 3,
|
||||
seed: args.seed,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
println!("🔧 Configuration:");
|
||||
println!(" Episodes: {}", args.episodes);
|
||||
println!(" Tasks/episode: {}", args.tasks_per_episode);
|
||||
println!(" Calendar tool: {}", args.calendar);
|
||||
println!(" Web search: {}", args.web_search);
|
||||
println!(" Max steps/task: {}", args.max_steps);
|
||||
println!();
|
||||
|
||||
println!("🏃 Running episodes...");
|
||||
println!();
|
||||
println!("┌────────┬────────┬─────────┬─────────┬──────────┬───────────┐");
|
||||
println!("│Episode │ Acc(%) │ Regret │ Cum.Reg │ Avg.Reg │ Sublinear │");
|
||||
println!("├────────┼────────┼─────────┼─────────┼──────────┼───────────┤");
|
||||
|
||||
let total_start = Instant::now();
|
||||
|
||||
for ep in 0..args.episodes {
|
||||
controller.start_episode();
|
||||
|
||||
// Generate puzzles for this episode
|
||||
let mut generator = PuzzleGenerator::new(puzzle_config.clone());
|
||||
let puzzles = generator.generate_batch(args.tasks_per_episode)?;
|
||||
|
||||
let mut solved = 0;
|
||||
let mut correct = 0;
|
||||
let mut total_steps = 0;
|
||||
let mut total_tool_calls = 0;
|
||||
let mut total_latency = 0u64;
|
||||
|
||||
// Solve puzzles
|
||||
for puzzle in &puzzles {
|
||||
let result = solver.solve(puzzle)?;
|
||||
if result.solved {
|
||||
solved += 1;
|
||||
}
|
||||
if result.correct {
|
||||
correct += 1;
|
||||
}
|
||||
total_steps += result.steps;
|
||||
total_tool_calls += result.tool_calls;
|
||||
total_latency += result.latency_ms;
|
||||
}
|
||||
|
||||
// Record episode
|
||||
controller.complete_episode(
|
||||
solved,
|
||||
correct,
|
||||
total_steps,
|
||||
total_tool_calls,
|
||||
total_latency,
|
||||
);
|
||||
|
||||
// Get status
|
||||
let summary = controller.regret.summary();
|
||||
let last_episode = controller.regret.episodes.last().unwrap();
|
||||
|
||||
// Log episode
|
||||
logger.log_swarm(
|
||||
ep + 1,
|
||||
args.tasks_per_episode,
|
||||
solved,
|
||||
correct,
|
||||
last_episode.reward,
|
||||
last_episode.oracle_reward,
|
||||
summary.total_regret,
|
||||
summary.average_regret,
|
||||
summary.is_sublinear,
|
||||
)?;
|
||||
|
||||
// Print row
|
||||
let sublinear = if summary.is_sublinear { "✓" } else { "✗" };
|
||||
println!(
|
||||
"│ {:6} │ {:5.1} │ {:7.2} │ {:7.2} │ {:8.4} │ {} │",
|
||||
ep + 1,
|
||||
last_episode.accuracy() * 100.0,
|
||||
last_episode.regret(),
|
||||
summary.total_regret,
|
||||
summary.average_regret,
|
||||
sublinear
|
||||
);
|
||||
}
|
||||
|
||||
println!("└────────┴────────┴─────────┴─────────┴──────────┴───────────┘");
|
||||
println!();
|
||||
|
||||
let total_time = total_start.elapsed();
|
||||
|
||||
// Final summary
|
||||
let summary = controller.regret.summary();
|
||||
|
||||
println!("╔══════════════════════════════════════════════════════════════╗");
|
||||
println!("║ Final Summary ║");
|
||||
println!("╚══════════════════════════════════════════════════════════════╝");
|
||||
println!();
|
||||
println!("📊 Regret Analysis:");
|
||||
println!(" Total episodes: {}", summary.total_episodes);
|
||||
println!(" Cumulative regret: {:.2}", summary.total_regret);
|
||||
println!(" Average regret: {:.4}", summary.average_regret);
|
||||
println!(
|
||||
" Regret trend: {:.6} ({})",
|
||||
summary.regret_trend,
|
||||
if summary.regret_trend < 0.0 {
|
||||
"decreasing ✓"
|
||||
} else {
|
||||
"increasing ✗"
|
||||
}
|
||||
);
|
||||
println!(
|
||||
" Sublinear: {}",
|
||||
if summary.is_sublinear {
|
||||
"Yes ✓"
|
||||
} else {
|
||||
"No ✗"
|
||||
}
|
||||
);
|
||||
println!();
|
||||
println!("📈 Performance:");
|
||||
println!(
|
||||
" Average accuracy: {:.1}%",
|
||||
summary.average_accuracy * 100.0
|
||||
);
|
||||
println!(" Average reward: {:.2}", summary.average_reward);
|
||||
println!(
|
||||
" Moving avg reward: {:.2}",
|
||||
summary.moving_average_reward
|
||||
);
|
||||
println!(" Total time: {:.2}s", total_time.as_secs_f64());
|
||||
println!();
|
||||
|
||||
// Regret curve analysis
|
||||
if controller.regret.average_regret.len() >= 5 {
|
||||
println!("📉 Regret Curve (R_k/k):");
|
||||
let regrets = &controller.regret.average_regret;
|
||||
let step = regrets.len().max(10) / 10;
|
||||
for (i, r) in regrets.iter().enumerate() {
|
||||
if i % step == 0 || i == regrets.len() - 1 {
|
||||
let bar_len = (r * 50.0).min(50.0) as usize;
|
||||
let bar = "█".repeat(bar_len);
|
||||
println!(" Episode {:3}: {:.4} {}", i + 1, r, bar);
|
||||
}
|
||||
}
|
||||
println!();
|
||||
}
|
||||
|
||||
// Goal check
|
||||
println!("🎯 Goal Status:");
|
||||
if summary.is_sublinear && summary.regret_trend < 0.0 {
|
||||
println!(" ✓ Achieving sublinear regret - average regret trending to zero");
|
||||
} else if summary.is_sublinear {
|
||||
println!(" ~ Sublinear but trend not clearly decreasing");
|
||||
} else {
|
||||
println!(" ✗ Not yet achieving sublinear regret");
|
||||
println!(" Recommendation: Increase episodes or tune solver parameters");
|
||||
}
|
||||
|
||||
// Flush logs
|
||||
logger.flush()?;
|
||||
println!();
|
||||
println!("📝 Results saved to: {}", args.output);
|
||||
|
||||
// Save summary
|
||||
let summary_path = args.output.replace(".jsonl", "_summary.json");
|
||||
let summary_json = serde_json::to_string_pretty(&summary)?;
|
||||
std::fs::write(&summary_path, summary_json)?;
|
||||
println!("📝 Summary saved to: {}", summary_path);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
262
vendor/ruvector/examples/benchmarks/src/bin/temporal_benchmark.rs
vendored
Normal file
262
vendor/ruvector/examples/benchmarks/src/bin/temporal_benchmark.rs
vendored
Normal file
@@ -0,0 +1,262 @@
|
||||
//! Temporal Benchmark Runner
|
||||
//!
|
||||
//! Run temporal reasoning benchmarks based on TimePuzzles methodology.
|
||||
//!
|
||||
//! Usage:
|
||||
//! cargo run --bin temporal-benchmark -- --puzzles 50 --calendar --web-search
|
||||
|
||||
use anyhow::Result;
|
||||
use clap::Parser;
|
||||
use ruvector_benchmarks::{
|
||||
logging::BenchmarkLogger,
|
||||
temporal::{BenchmarkConfig, BenchmarkResults, TemporalSolver},
|
||||
timepuzzles::{PuzzleGenerator, PuzzleGeneratorConfig, SamplePuzzles},
|
||||
};
|
||||
use std::time::Instant;
|
||||
|
||||
// CLI arguments for the temporal reasoning benchmark.
// NOTE: the `///` doc comments double as clap's --help text.
#[derive(Parser, Debug)]
#[command(name = "temporal-benchmark")]
#[command(about = "Run temporal reasoning benchmarks")]
struct Args {
    /// Number of puzzles to run
    // Ignored when --use-samples is set (the sample set has a fixed size).
    #[arg(short = 'n', long, default_value = "50")]
    puzzles: usize,

    /// Minimum difficulty (1-10)
    #[arg(long, default_value = "1")]
    min_difficulty: u8,

    /// Maximum difficulty (1-10)
    #[arg(long, default_value = "10")]
    max_difficulty: u8,

    /// Enable calendar math tool
    // NOTE(review): bool with default_value = "true" may not be disableable
    // from the CLI without ArgAction::Set in clap v4 — confirm.
    #[arg(long, default_value = "true")]
    calendar: bool,

    /// Enable web search tool
    #[arg(long, default_value = "false")]
    web_search: bool,

    /// Maximum steps per puzzle
    #[arg(long, default_value = "100")]
    max_steps: usize,

    /// Constraint density (1-5)
    #[arg(long, default_value = "3")]
    constraint_density: u8,

    /// Random seed for reproducibility
    // None means unseeded (non-reproducible) generation.
    #[arg(long)]
    seed: Option<u64>,

    /// Output log file
    // The summary JSON path is derived from this by swapping the .jsonl suffix.
    #[arg(short, long, default_value = "logs/temporal_benchmark.jsonl")]
    output: String,

    /// Use sample puzzles instead of generating
    #[arg(long)]
    use_samples: bool,

    /// Verbose output
    #[arg(short, long)]
    verbose: bool,
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let args = Args::parse();
|
||||
|
||||
println!("╔══════════════════════════════════════════════════════════════╗");
|
||||
println!("║ Temporal Reasoning Benchmark Runner ║");
|
||||
println!("║ Based on TimePuzzles (arXiv:2601.07148) ║");
|
||||
println!("╚══════════════════════════════════════════════════════════════╝");
|
||||
println!();
|
||||
|
||||
// Initialize logger
|
||||
let mut logger = BenchmarkLogger::new(&args.output)?;
|
||||
logger.log_system("INFO", "Starting benchmark run", "temporal-benchmark")?;
|
||||
|
||||
// Generate or load puzzles
|
||||
let puzzles = if args.use_samples {
|
||||
println!("📚 Using sample puzzle set (50 puzzles)...");
|
||||
SamplePuzzles::mixed_sample()
|
||||
} else {
|
||||
println!(
|
||||
"🎲 Generating {} puzzles (difficulty {}-{})...",
|
||||
args.puzzles, args.min_difficulty, args.max_difficulty
|
||||
);
|
||||
|
||||
let config = PuzzleGeneratorConfig {
|
||||
min_difficulty: args.min_difficulty,
|
||||
max_difficulty: args.max_difficulty,
|
||||
constraint_density: args.constraint_density,
|
||||
cross_cultural: true,
|
||||
relative_constraints: true,
|
||||
year_range: (2000, 2030),
|
||||
seed: args.seed,
|
||||
};
|
||||
|
||||
let mut generator = PuzzleGenerator::new(config);
|
||||
generator.generate_batch(args.puzzles)?
|
||||
};
|
||||
|
||||
println!("✓ Loaded {} puzzles", puzzles.len());
|
||||
println!();
|
||||
|
||||
// Configure solver
|
||||
let mut solver = TemporalSolver::with_tools(args.calendar, args.web_search);
|
||||
solver.max_steps = args.max_steps;
|
||||
|
||||
println!("🔧 Solver configuration:");
|
||||
println!(" Calendar tool: {}", args.calendar);
|
||||
println!(" Web search: {}", args.web_search);
|
||||
println!(" Max steps: {}", args.max_steps);
|
||||
println!();
|
||||
|
||||
// Run benchmarks
|
||||
println!("🏃 Running benchmarks...");
|
||||
println!();
|
||||
|
||||
let benchmark_id = format!(
|
||||
"bench-{}-{}",
|
||||
chrono::Utc::now().format("%Y%m%d-%H%M%S"),
|
||||
args.seed.unwrap_or(0)
|
||||
);
|
||||
|
||||
let mut results = Vec::new();
|
||||
let start = Instant::now();
|
||||
|
||||
for (i, puzzle) in puzzles.iter().enumerate() {
|
||||
let result = solver.solve(puzzle)?;
|
||||
|
||||
// Log result
|
||||
logger.log_temporal(
|
||||
&benchmark_id,
|
||||
&puzzle.id,
|
||||
puzzle.difficulty,
|
||||
result.solved,
|
||||
result.correct,
|
||||
result.steps,
|
||||
result.tool_calls,
|
||||
result.latency_ms,
|
||||
puzzle.constraints.len(),
|
||||
args.calendar,
|
||||
args.web_search,
|
||||
)?;
|
||||
|
||||
if args.verbose {
|
||||
let status = if result.correct {
|
||||
"✓"
|
||||
} else if result.solved {
|
||||
"~"
|
||||
} else {
|
||||
"✗"
|
||||
};
|
||||
println!(
|
||||
" {} Puzzle {:3}: {} (steps: {}, latency: {}ms)",
|
||||
status,
|
||||
i + 1,
|
||||
puzzle.id,
|
||||
result.steps,
|
||||
result.latency_ms
|
||||
);
|
||||
} else if (i + 1) % 10 == 0 {
|
||||
print!(".");
|
||||
use std::io::Write;
|
||||
std::io::stdout().flush()?;
|
||||
}
|
||||
|
||||
results.push(result);
|
||||
}
|
||||
|
||||
let total_time = start.elapsed();
|
||||
|
||||
if !args.verbose {
|
||||
println!();
|
||||
}
|
||||
println!();
|
||||
|
||||
// Compute aggregate results
|
||||
let config = BenchmarkConfig {
|
||||
num_puzzles: puzzles.len(),
|
||||
difficulty_range: (args.min_difficulty, args.max_difficulty),
|
||||
calendar_tool: args.calendar,
|
||||
web_search_tool: args.web_search,
|
||||
max_steps: args.max_steps,
|
||||
constraint_density: args.constraint_density,
|
||||
};
|
||||
|
||||
let benchmark_results = BenchmarkResults::from_results(config, results);
|
||||
|
||||
// Print results
|
||||
println!("╔══════════════════════════════════════════════════════════════╗");
|
||||
println!("║ Benchmark Results ║");
|
||||
println!("╚══════════════════════════════════════════════════════════════╝");
|
||||
println!();
|
||||
println!("📊 Summary:");
|
||||
println!(" Total puzzles: {}", benchmark_results.total_puzzles);
|
||||
println!(" Solved: {}", benchmark_results.solved_count);
|
||||
println!(" Correct: {}", benchmark_results.correct_count);
|
||||
println!(
|
||||
" Accuracy: {:.1}%",
|
||||
benchmark_results.accuracy * 100.0
|
||||
);
|
||||
println!();
|
||||
println!("⏱️ Performance:");
|
||||
println!(" Avg steps: {:.1}", benchmark_results.avg_steps);
|
||||
println!(" Avg tool calls: {:.1}", benchmark_results.avg_tool_calls);
|
||||
println!(
|
||||
" Avg latency: {:.1}ms",
|
||||
benchmark_results.avg_latency_ms
|
||||
);
|
||||
println!(" Total time: {:.2}s", total_time.as_secs_f64());
|
||||
println!();
|
||||
|
||||
// Compute accuracy by difficulty
|
||||
let mut by_difficulty: std::collections::HashMap<u8, (usize, usize)> =
|
||||
std::collections::HashMap::new();
|
||||
for (puzzle, result) in puzzles.iter().zip(benchmark_results.results.iter()) {
|
||||
let entry = by_difficulty.entry(puzzle.difficulty).or_insert((0, 0));
|
||||
entry.0 += 1;
|
||||
if result.correct {
|
||||
entry.1 += 1;
|
||||
}
|
||||
}
|
||||
|
||||
println!("📈 Accuracy by Difficulty:");
|
||||
let mut difficulties: Vec<_> = by_difficulty.keys().copied().collect();
|
||||
difficulties.sort();
|
||||
for d in difficulties {
|
||||
let (total, correct) = by_difficulty[&d];
|
||||
let acc = correct as f64 / total as f64 * 100.0;
|
||||
println!(" Difficulty {}: {:5.1}% ({}/{})", d, acc, correct, total);
|
||||
}
|
||||
println!();
|
||||
|
||||
// Tool usage analysis
|
||||
if args.calendar {
|
||||
let with_rewriting = benchmark_results
|
||||
.results
|
||||
.iter()
|
||||
.filter(|r| r.tool_calls > 0 && r.correct)
|
||||
.count();
|
||||
println!("🔧 Tool Analysis:");
|
||||
println!(
|
||||
" Calendar rewriting success: {}/{}",
|
||||
with_rewriting, benchmark_results.total_puzzles
|
||||
);
|
||||
}
|
||||
|
||||
// Flush logs
|
||||
logger.flush()?;
|
||||
println!();
|
||||
println!("📝 Results saved to: {}", args.output);
|
||||
|
||||
// Save full results as JSON
|
||||
let results_path = args.output.replace(".jsonl", "_summary.json");
|
||||
let results_json = serde_json::to_string_pretty(&benchmark_results)?;
|
||||
std::fs::write(&results_path, results_json)?;
|
||||
println!("📝 Summary saved to: {}", results_path);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
308
vendor/ruvector/examples/benchmarks/src/bin/timepuzzle_runner.rs
vendored
Normal file
308
vendor/ruvector/examples/benchmarks/src/bin/timepuzzle_runner.rs
vendored
Normal file
@@ -0,0 +1,308 @@
|
||||
//! TimePuzzle Quick Runner
|
||||
//!
|
||||
//! 10-minute probe for temporal reasoning with tool augmentation.
|
||||
//!
|
||||
//! Usage:
|
||||
//! cargo run --bin timepuzzle-runner -- --quick
|
||||
//! cargo run --bin timepuzzle-runner -- --depth 5
|
||||
|
||||
use anyhow::Result;
|
||||
use clap::Parser;
|
||||
use ruvector_benchmarks::{
|
||||
logging::BenchmarkLogger, temporal::TemporalSolver, timepuzzles::SamplePuzzles,
|
||||
};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
// CLI arguments for the quick TimePuzzle probe.
// NOTE: the `///` doc comments double as clap's --help text.
#[derive(Parser, Debug)]
#[command(name = "timepuzzle-runner")]
#[command(about = "Quick TimePuzzle probe for agent testing")]
struct Args {
    /// Quick mode: 50 puzzles, depth-limited steps
    // When set, the puzzle count is forced to 50 (depth still comes from --depth).
    #[arg(long)]
    quick: bool,

    /// Maximum depth (steps) per puzzle
    #[arg(short, long, default_value = "50")]
    depth: usize,

    /// Number of puzzles
    // Ignored when --quick is set.
    #[arg(short = 'n', long, default_value = "50")]
    puzzles: usize,

    /// Tool latency cap (abort if tool > 1.5x median)
    // Multiplier applied to the running median latency; see the probe loop.
    #[arg(long, default_value = "1.5")]
    latency_cap: f64,

    /// Timeout in seconds
    // Wall-clock budget for the whole probe, checked between puzzles.
    #[arg(long, default_value = "600")]
    timeout: u64,

    /// Enable constraint rewriting (calendar math)
    // NOTE(review): bool with default_value = "true" may not be disableable
    // from the CLI without ArgAction::Set in clap v4 — confirm.
    #[arg(long, default_value = "true")]
    rewrite: bool,

    /// Enable web search (for factual anchors)
    #[arg(long, default_value = "false")]
    web_search: bool,

    /// Output file
    #[arg(short, long, default_value = "logs/timepuzzle_probe.jsonl")]
    output: String,

    /// Verbose mode
    #[arg(short, long)]
    verbose: bool,
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let args = Args::parse();
|
||||
|
||||
println!("╔══════════════════════════════════════════════════════════════╗");
|
||||
println!("║ TimePuzzle Quick Probe Runner ║");
|
||||
println!("║ Tool-Augmented Iterative Temporal Reasoning ║");
|
||||
println!("╚══════════════════════════════════════════════════════════════╝");
|
||||
println!();
|
||||
|
||||
let mut logger = BenchmarkLogger::new(&args.output)?;
|
||||
logger.log_system("INFO", "Starting TimePuzzle probe", "timepuzzle-runner")?;
|
||||
|
||||
// Quick mode settings
|
||||
let (num_puzzles, max_depth) = if args.quick {
|
||||
println!("⚡ Quick mode enabled (50 puzzles, depth {})", args.depth);
|
||||
(50, args.depth)
|
||||
} else {
|
||||
(args.puzzles, args.depth)
|
||||
};
|
||||
|
||||
let timeout = Duration::from_secs(args.timeout);
|
||||
|
||||
println!();
|
||||
println!("🔧 Configuration:");
|
||||
println!(" Puzzles: {}", num_puzzles);
|
||||
println!(" Max depth: {}", max_depth);
|
||||
println!(" Rewriting: {}", args.rewrite);
|
||||
println!(" Web search: {}", args.web_search);
|
||||
println!(" Latency cap: {}x median", args.latency_cap);
|
||||
println!(" Timeout: {}s", args.timeout);
|
||||
println!();
|
||||
|
||||
// Generate puzzles with varying constraint density
|
||||
println!("🎲 Generating puzzles...");
|
||||
let puzzles = SamplePuzzles::mixed_sample()
|
||||
.into_iter()
|
||||
.take(num_puzzles)
|
||||
.collect::<Vec<_>>();
|
||||
println!("✓ Loaded {} puzzles", puzzles.len());
|
||||
println!();
|
||||
|
||||
// Configure solver
|
||||
let mut solver = TemporalSolver::with_tools(args.rewrite, args.web_search);
|
||||
solver.max_steps = max_depth;
|
||||
|
||||
// Run probe
|
||||
println!("🏃 Running probe...");
|
||||
println!();
|
||||
|
||||
let probe_start = Instant::now();
|
||||
let mut results = Vec::new();
|
||||
let mut latencies: Vec<u64> = Vec::new();
|
||||
let mut median_latency: f64 = 100.0; // Initial estimate
|
||||
|
||||
for (i, puzzle) in puzzles.iter().enumerate() {
|
||||
// Check timeout
|
||||
if probe_start.elapsed() > timeout {
|
||||
println!("⚠️ Timeout reached after {} puzzles", i);
|
||||
break;
|
||||
}
|
||||
|
||||
let result = solver.solve(puzzle)?;
|
||||
|
||||
// Check latency cap
|
||||
if latencies.len() >= 10 {
|
||||
let mut sorted = latencies.clone();
|
||||
sorted.sort();
|
||||
median_latency = sorted[sorted.len() / 2] as f64;
|
||||
|
||||
if result.latency_ms as f64 > median_latency * args.latency_cap {
|
||||
if args.verbose {
|
||||
println!(
|
||||
" ⚠ Puzzle {} aborted: latency {}ms > {:.0}ms cap",
|
||||
puzzle.id,
|
||||
result.latency_ms,
|
||||
median_latency * args.latency_cap
|
||||
);
|
||||
}
|
||||
// Still record but mark as slow
|
||||
}
|
||||
}
|
||||
|
||||
latencies.push(result.latency_ms);
|
||||
|
||||
// Log
|
||||
logger.log_temporal(
|
||||
"timepuzzle-probe",
|
||||
&puzzle.id,
|
||||
puzzle.difficulty,
|
||||
result.solved,
|
||||
result.correct,
|
||||
result.steps,
|
||||
result.tool_calls,
|
||||
result.latency_ms,
|
||||
puzzle.constraints.len(),
|
||||
args.rewrite,
|
||||
args.web_search,
|
||||
)?;
|
||||
|
||||
if args.verbose {
|
||||
let status = if result.correct {
|
||||
"✓"
|
||||
} else if result.solved {
|
||||
"~"
|
||||
} else {
|
||||
"✗"
|
||||
};
|
||||
println!(
|
||||
" {} [{:2}] {}: steps={}, tools={}, {}ms",
|
||||
status,
|
||||
puzzle.difficulty,
|
||||
puzzle.id,
|
||||
result.steps,
|
||||
result.tool_calls,
|
||||
result.latency_ms
|
||||
);
|
||||
}
|
||||
|
||||
results.push(result);
|
||||
}
|
||||
|
||||
let total_time = probe_start.elapsed();
|
||||
println!();
|
||||
|
||||
// Analyze results
|
||||
let solved = results.iter().filter(|r| r.solved).count();
|
||||
let correct = results.iter().filter(|r| r.correct).count();
|
||||
let total = results.len();
|
||||
let accuracy = correct as f64 / total as f64;
|
||||
|
||||
let avg_steps = results.iter().map(|r| r.steps).sum::<usize>() as f64 / total as f64;
|
||||
let avg_tools = results.iter().map(|r| r.tool_calls).sum::<usize>() as f64 / total as f64;
|
||||
let avg_latency = results.iter().map(|r| r.latency_ms).sum::<u64>() as f64 / total as f64;
|
||||
|
||||
// Tool toggle analysis
|
||||
let with_tool_correct = results
|
||||
.iter()
|
||||
.filter(|r| r.tool_calls > 0 && r.correct)
|
||||
.count();
|
||||
|
||||
println!("╔══════════════════════════════════════════════════════════════╗");
|
||||
println!("║ Probe Results ║");
|
||||
println!("╚══════════════════════════════════════════════════════════════╝");
|
||||
println!();
|
||||
println!("📊 Overall Performance:");
|
||||
println!(" Puzzles run: {}", total);
|
||||
println!(
|
||||
" Solved: {} ({:.1}%)",
|
||||
solved,
|
||||
solved as f64 / total as f64 * 100.0
|
||||
);
|
||||
println!(
|
||||
" Correct: {} ({:.1}%)",
|
||||
correct,
|
||||
accuracy * 100.0
|
||||
);
|
||||
println!();
|
||||
println!("⏱️ Efficiency:");
|
||||
println!(" Avg steps: {:.1}", avg_steps);
|
||||
println!(" Avg tool calls: {:.1}", avg_tools);
|
||||
println!(" Avg latency: {:.1}ms", avg_latency);
|
||||
println!(" Median latency: {:.0}ms", median_latency);
|
||||
println!(" Total time: {:.2}s", total_time.as_secs_f64());
|
||||
println!();
|
||||
|
||||
// Scaling curves
|
||||
println!("📈 Tool Toggle Analysis:");
|
||||
println!(
|
||||
" With rewriting: {}/{} ({:.1}%)",
|
||||
with_tool_correct,
|
||||
total,
|
||||
with_tool_correct as f64 / total as f64 * 100.0
|
||||
);
|
||||
|
||||
// Sensitivity analysis
|
||||
let fast_correct = results
|
||||
.iter()
|
||||
.filter(|r| r.latency_ms < median_latency as u64 && r.correct)
|
||||
.count();
|
||||
let slow_correct = results
|
||||
.iter()
|
||||
.filter(|r| r.latency_ms >= median_latency as u64 && r.correct)
|
||||
.count();
|
||||
let fast_total = results
|
||||
.iter()
|
||||
.filter(|r| r.latency_ms < median_latency as u64)
|
||||
.count();
|
||||
let slow_total = total - fast_total;
|
||||
|
||||
if fast_total > 0 && slow_total > 0 {
|
||||
println!();
|
||||
println!("⚡ Latency Sensitivity:");
|
||||
println!(
|
||||
" Fast (<{:.0}ms): {}/{} ({:.1}%)",
|
||||
median_latency,
|
||||
fast_correct,
|
||||
fast_total,
|
||||
fast_correct as f64 / fast_total as f64 * 100.0
|
||||
);
|
||||
println!(
|
||||
" Slow (>={:.0}ms): {}/{} ({:.1}%)",
|
||||
median_latency,
|
||||
slow_correct,
|
||||
slow_total,
|
||||
slow_correct as f64 / slow_total as f64 * 100.0
|
||||
);
|
||||
}
|
||||
|
||||
// Accuracy by difficulty
|
||||
println!();
|
||||
println!("🎯 Accuracy by Difficulty:");
|
||||
let mut by_diff: std::collections::HashMap<u8, (usize, usize)> =
|
||||
std::collections::HashMap::new();
|
||||
for (p, r) in puzzles.iter().zip(results.iter()) {
|
||||
let e = by_diff.entry(p.difficulty).or_insert((0, 0));
|
||||
e.0 += 1;
|
||||
if r.correct {
|
||||
e.1 += 1;
|
||||
}
|
||||
}
|
||||
let mut diffs: Vec<_> = by_diff.keys().copied().collect();
|
||||
diffs.sort();
|
||||
for d in diffs {
|
||||
let (t, c) = by_diff[&d];
|
||||
let pct = c as f64 / t as f64 * 100.0;
|
||||
let bar = "█".repeat((pct / 5.0) as usize);
|
||||
println!(" Level {:2}: {:5.1}% {}", d, pct, bar);
|
||||
}
|
||||
|
||||
// Recommendations
|
||||
println!();
|
||||
println!("💡 Insights:");
|
||||
if accuracy < 0.5 {
|
||||
println!(" • Low accuracy - consider enabling constraint rewriting");
|
||||
}
|
||||
if avg_steps > max_depth as f64 * 0.8 {
|
||||
println!(" • High step count - search may be inefficient");
|
||||
}
|
||||
if args.web_search && with_tool_correct > correct / 2 {
|
||||
println!(" • Web search providing substantial gains");
|
||||
}
|
||||
if accuracy >= 0.8 {
|
||||
println!(" • Good performance - ready for harder puzzles");
|
||||
}
|
||||
|
||||
// Flush logs
|
||||
logger.flush()?;
|
||||
println!();
|
||||
println!("📝 Results saved to: {}", args.output);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
248
vendor/ruvector/examples/benchmarks/src/bin/vector_benchmark.rs
vendored
Normal file
248
vendor/ruvector/examples/benchmarks/src/bin/vector_benchmark.rs
vendored
Normal file
@@ -0,0 +1,248 @@
|
||||
//! Vector Index Benchmark Runner
|
||||
//!
|
||||
//! Benchmark vector operations with IVF and coherence gating.
|
||||
//!
|
||||
//! Usage:
|
||||
//! cargo run --bin vector-benchmark -- --dim 128 --vectors 10000
|
||||
|
||||
use anyhow::Result;
|
||||
use clap::Parser;
|
||||
use ruvector_benchmarks::{
|
||||
logging::BenchmarkLogger,
|
||||
vector_index::{CoherenceGate, DenseVec, IvfConfig, VectorIndex},
|
||||
};
|
||||
use std::time::Instant;
|
||||
|
||||
// CLI arguments for the vector index benchmark.
// NOTE: the `///` doc comments double as clap's --help text.
#[derive(Parser, Debug)]
#[command(name = "vector-benchmark")]
#[command(about = "Benchmark vector index operations")]
struct Args {
    /// Vector dimensionality
    #[arg(short, long, default_value = "128")]
    dim: usize,

    /// Number of vectors to insert
    #[arg(short = 'n', long, default_value = "10000")]
    vectors: usize,

    /// Number of queries to run
    #[arg(short, long, default_value = "1000")]
    queries: usize,

    /// Top-k results per query
    #[arg(short, long, default_value = "10")]
    top_k: usize,

    /// Enable IVF indexing
    // NOTE(review): bool with default_value = "true" may not be disableable
    // from the CLI without ArgAction::Set in clap v4 — confirm.
    #[arg(long, default_value = "true")]
    ivf: bool,

    /// Number of IVF clusters
    // Only used when --ivf is enabled.
    #[arg(long, default_value = "64")]
    clusters: usize,

    /// Number of clusters to probe
    // Only used when --ivf is enabled.
    #[arg(long, default_value = "4")]
    probes: usize,

    /// Enable coherence gate
    #[arg(long)]
    gate: bool,

    /// Coherence gate threshold
    // Only used when --gate is enabled.
    #[arg(long, default_value = "0.5")]
    gate_threshold: f32,

    /// Output log file
    #[arg(short, long, default_value = "logs/vector_benchmark.jsonl")]
    output: String,

    /// Verbose output
    // Uppercase 'V' — presumably chosen to avoid clashing with another short
    // flag or clap's version switch; confirm against the final flag set.
    #[arg(short = 'V', long)]
    verbose: bool,
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let args = Args::parse();
|
||||
|
||||
println!("╔══════════════════════════════════════════════════════════════╗");
|
||||
println!("║ Vector Index Benchmark Runner ║");
|
||||
println!("╚══════════════════════════════════════════════════════════════╝");
|
||||
println!();
|
||||
|
||||
// Initialize logger
|
||||
let mut logger = BenchmarkLogger::new(&args.output)?;
|
||||
logger.log_system("INFO", "Starting vector benchmark", "vector-benchmark")?;
|
||||
|
||||
// Create index
|
||||
println!("🔧 Configuration:");
|
||||
println!(" Dimensions: {}", args.dim);
|
||||
println!(" Vectors: {}", args.vectors);
|
||||
println!(" Queries: {}", args.queries);
|
||||
println!(" Top-K: {}", args.top_k);
|
||||
println!(" IVF: {}", args.ivf);
|
||||
if args.ivf {
|
||||
println!(" Clusters: {}", args.clusters);
|
||||
println!(" Probes: {}", args.probes);
|
||||
}
|
||||
println!(" Gate: {}", args.gate);
|
||||
if args.gate {
|
||||
println!(" Threshold: {}", args.gate_threshold);
|
||||
}
|
||||
println!();
|
||||
|
||||
let mut index = VectorIndex::new(args.dim);
|
||||
|
||||
if args.gate {
|
||||
index = index.with_gate(CoherenceGate::new(args.gate_threshold));
|
||||
}
|
||||
|
||||
if args.ivf {
|
||||
index = index.with_ivf(IvfConfig::new(args.clusters, args.probes));
|
||||
}
|
||||
|
||||
// Insert vectors
|
||||
println!("📥 Inserting {} vectors...", args.vectors);
|
||||
let insert_start = Instant::now();
|
||||
|
||||
for i in 0..args.vectors {
|
||||
index.insert(DenseVec::random(args.dim))?;
|
||||
if args.verbose && (i + 1) % 1000 == 0 {
|
||||
println!(" Inserted {} vectors", i + 1);
|
||||
}
|
||||
}
|
||||
|
||||
let insert_time = insert_start.elapsed();
|
||||
println!(
|
||||
"✓ Insert complete ({:.2}s, {:.0} vec/s)",
|
||||
insert_time.as_secs_f64(),
|
||||
args.vectors as f64 / insert_time.as_secs_f64()
|
||||
);
|
||||
println!();
|
||||
|
||||
// Build IVF if enabled
|
||||
if args.ivf {
|
||||
println!("🏗️ Building IVF index...");
|
||||
let build_start = Instant::now();
|
||||
index.rebuild_ivf()?;
|
||||
let build_time = build_start.elapsed();
|
||||
println!("✓ IVF build complete ({:.2}s)", build_time.as_secs_f64());
|
||||
println!();
|
||||
}
|
||||
|
||||
// Print index stats
|
||||
let stats = index.stats();
|
||||
println!("📊 Index Statistics:");
|
||||
println!(" Active vectors: {}", stats.active_vectors);
|
||||
println!(" IVF clusters: {}", stats.ivf_clusters);
|
||||
println!();
|
||||
|
||||
// Run queries
|
||||
println!("🔍 Running {} queries...", args.queries);
|
||||
let query_start = Instant::now();
|
||||
|
||||
let mut latencies: Vec<u64> = Vec::with_capacity(args.queries);
|
||||
let mut total_results = 0usize;
|
||||
|
||||
for i in 0..args.queries {
|
||||
let q = DenseVec::random(args.dim);
|
||||
let coherence = if args.gate {
|
||||
rand::random::<f32>()
|
||||
} else {
|
||||
1.0
|
||||
};
|
||||
|
||||
let start = Instant::now();
|
||||
let results = index.search(&q, args.top_k, coherence)?;
|
||||
let latency_us = start.elapsed().as_micros() as u64;
|
||||
|
||||
latencies.push(latency_us);
|
||||
total_results += results.len();
|
||||
|
||||
// Log query
|
||||
logger.log_vector(
|
||||
"search",
|
||||
args.dim,
|
||||
stats.active_vectors,
|
||||
1,
|
||||
args.top_k,
|
||||
args.ivf,
|
||||
coherence,
|
||||
latency_us,
|
||||
results.len(),
|
||||
)?;
|
||||
|
||||
if args.verbose && (i + 1) % 100 == 0 {
|
||||
println!(" Completed {} queries", i + 1);
|
||||
}
|
||||
}
|
||||
|
||||
let query_time = query_start.elapsed();
|
||||
println!(
|
||||
"✓ Queries complete ({:.2}s, {:.0} q/s)",
|
||||
query_time.as_secs_f64(),
|
||||
args.queries as f64 / query_time.as_secs_f64()
|
||||
);
|
||||
println!();
|
||||
|
||||
// Compute statistics
|
||||
latencies.sort();
|
||||
let p50 = latencies[latencies.len() / 2];
|
||||
let p95 = latencies[latencies.len() * 95 / 100];
|
||||
let p99 = latencies[latencies.len() * 99 / 100];
|
||||
let avg = latencies.iter().sum::<u64>() / latencies.len() as u64;
|
||||
let max = *latencies.last().unwrap();
|
||||
|
||||
println!("╔══════════════════════════════════════════════════════════════╗");
|
||||
println!("║ Benchmark Results ║");
|
||||
println!("╚══════════════════════════════════════════════════════════════╝");
|
||||
println!();
|
||||
println!("⏱️ Latency (microseconds):");
|
||||
println!(" Average: {}µs", avg);
|
||||
println!(" P50: {}µs", p50);
|
||||
println!(" P95: {}µs", p95);
|
||||
println!(" P99: {}µs", p99);
|
||||
println!(" Max: {}µs", max);
|
||||
println!();
|
||||
println!("📈 Throughput:");
|
||||
println!(
|
||||
" Queries/sec: {:.0}",
|
||||
args.queries as f64 / query_time.as_secs_f64()
|
||||
);
|
||||
println!(
|
||||
" Insert/sec: {:.0}",
|
||||
args.vectors as f64 / insert_time.as_secs_f64()
|
||||
);
|
||||
println!();
|
||||
println!("📊 Results:");
|
||||
println!(" Total results: {}", total_results);
|
||||
println!(
|
||||
" Avg results: {:.2}",
|
||||
total_results as f64 / args.queries as f64
|
||||
);
|
||||
|
||||
if args.gate {
|
||||
let gated = latencies
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter(|(_, &l)| l < 10)
|
||||
.count();
|
||||
println!(
|
||||
" Gated queries: {:.1}%",
|
||||
gated as f64 / args.queries as f64 * 100.0
|
||||
);
|
||||
}
|
||||
|
||||
// Save index
|
||||
println!();
|
||||
let index_path = "data/vector_index.bin";
|
||||
std::fs::create_dir_all("data")?;
|
||||
index.save_to_file(index_path)?;
|
||||
println!("💾 Index saved to: {}", index_path);
|
||||
|
||||
// Flush logs
|
||||
logger.flush()?;
|
||||
println!("📝 Results saved to: {}", args.output);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
197
vendor/ruvector/examples/benchmarks/src/bin/wasm_solver_bench.rs
vendored
Normal file
197
vendor/ruvector/examples/benchmarks/src/bin/wasm_solver_bench.rs
vendored
Normal file
@@ -0,0 +1,197 @@
|
||||
//! WASM Solver Benchmark — Compares native vs WASM AGI solver performance.
|
||||
//!
|
||||
//! Runs the same acceptance test configuration through:
|
||||
//! 1. Native Rust solver (benchmarks crate)
|
||||
//! 2. Reference metrics comparison
|
||||
//!
|
||||
//! Usage:
|
||||
//! cargo run --bin wasm-solver-bench [-- --holdout <N> --training <N> --cycles <N>]
|
||||
|
||||
use clap::Parser;
|
||||
use ruvector_benchmarks::acceptance_test::{run_acceptance_test_mode, AblationMode, HoldoutConfig};
|
||||
use std::time::Instant;
|
||||
|
||||
/// Command-line configuration for the WASM-vs-native solver benchmark.
///
/// All four fields feed directly into the `HoldoutConfig` built in `main`.
#[derive(Parser)]
#[command(name = "wasm-solver-bench")]
struct Args {
    /// Holdout set size (tasks withheld from training, used for evaluation)
    #[arg(long, default_value = "50")]
    holdout: usize,
    /// Training tasks per cycle
    #[arg(long, default_value = "50")]
    training: usize,
    /// Number of train/evaluate cycles
    #[arg(long, default_value = "3")]
    cycles: usize,
    /// Per-task solver step budget
    #[arg(long, default_value = "200")]
    budget: usize,
}
|
||||
|
||||
fn main() {
|
||||
let args = Args::parse();
|
||||
|
||||
println!("╔══════════════════════════════════════════════════════════════╗");
|
||||
println!("║ WASM vs Native AGI Solver Benchmark ║");
|
||||
println!("╚══════════════════════════════════════════════════════════════╝");
|
||||
println!();
|
||||
println!(
|
||||
" Config: holdout={}, training={}, cycles={}, budget={}",
|
||||
args.holdout, args.training, args.cycles, args.budget
|
||||
);
|
||||
println!();
|
||||
|
||||
let config = HoldoutConfig {
|
||||
holdout_size: args.holdout,
|
||||
training_per_cycle: args.training,
|
||||
cycles: args.cycles,
|
||||
step_budget: args.budget,
|
||||
holdout_seed: 0xDEAD_BEEF,
|
||||
training_seed: 42,
|
||||
noise_rate: 0.25,
|
||||
min_accuracy: 0.50,
|
||||
min_dimensions_improved: 1,
|
||||
verbose: false,
|
||||
};
|
||||
|
||||
// ── Native Mode A (Baseline) ──────────────────────────────────
|
||||
println!(" Running Native Mode A (baseline)...");
|
||||
let t0 = Instant::now();
|
||||
let native_a = run_acceptance_test_mode(&config, &AblationMode::Baseline).unwrap();
|
||||
let native_a_ms = t0.elapsed().as_millis();
|
||||
|
||||
// ── Native Mode B (Compiler) ──────────────────────────────────
|
||||
println!(" Running Native Mode B (compiler)...");
|
||||
let t0 = Instant::now();
|
||||
let native_b = run_acceptance_test_mode(&config, &AblationMode::CompilerOnly).unwrap();
|
||||
let native_b_ms = t0.elapsed().as_millis();
|
||||
|
||||
// ── Native Mode C (Full learned) ──────────────────────────────
|
||||
println!(" Running Native Mode C (full learned)...");
|
||||
let t0 = Instant::now();
|
||||
let native_c = run_acceptance_test_mode(&config, &AblationMode::Full).unwrap();
|
||||
let native_c_ms = t0.elapsed().as_millis();
|
||||
|
||||
println!();
|
||||
println!(" ┌────────────────────────────────────────────────────────┐");
|
||||
println!(" │ NATIVE SOLVER RESULTS │");
|
||||
println!(" ├────────────────────────────────────────────────────────┤");
|
||||
println!(
|
||||
" │ {:<12} {:>8} {:>10} {:>10} {:>8} {:>8} │",
|
||||
"Mode", "Acc%", "Cost", "Noise%", "Time", "Pass"
|
||||
);
|
||||
println!(" │ {} │", "-".repeat(54));
|
||||
|
||||
for (label, result, ms) in [
|
||||
("A baseline", &native_a, native_a_ms),
|
||||
("B compiler", &native_b, native_b_ms),
|
||||
("C learned", &native_c, native_c_ms),
|
||||
] {
|
||||
let last = result.result.cycles.last().unwrap();
|
||||
println!(
|
||||
" │ {:<12} {:>6.1}% {:>9.1} {:>8.1}% {:>5}ms {:>7} │",
|
||||
label,
|
||||
last.holdout_accuracy * 100.0,
|
||||
last.holdout_cost_per_solve,
|
||||
last.holdout_noise_accuracy * 100.0,
|
||||
ms,
|
||||
if result.result.passed { "PASS" } else { "FAIL" }
|
||||
);
|
||||
}
|
||||
println!(" └────────────────────────────────────────────────────────┘");
|
||||
println!();
|
||||
|
||||
// ── WASM Reference Metrics ────────────────────────────────────
|
||||
// Since we can't run WASM directly from Rust without a runtime,
|
||||
// we output the reference metrics that the WASM module should match.
|
||||
println!(" ┌────────────────────────────────────────────────────────┐");
|
||||
println!(" │ WASM REFERENCE METRICS (for validation) │");
|
||||
println!(" ├────────────────────────────────────────────────────────┤");
|
||||
println!(" │ │");
|
||||
println!(" │ The rvf-solver-wasm module should produce: │");
|
||||
println!(" │ │");
|
||||
|
||||
let total_ms = native_a_ms + native_b_ms + native_c_ms;
|
||||
println!(
|
||||
" │ Native total time: {}ms │",
|
||||
total_ms
|
||||
);
|
||||
println!(
|
||||
" │ WASM expected: ~{}ms (2-5x native) │",
|
||||
total_ms * 3
|
||||
);
|
||||
println!(" │ │");
|
||||
|
||||
// PolicyKernel convergence check
|
||||
println!(" │ Mode C PolicyKernel: │");
|
||||
println!(
|
||||
" │ Context buckets: {} │",
|
||||
native_c.policy_context_buckets
|
||||
);
|
||||
println!(
|
||||
" │ Early commit rate: {:.2}% │",
|
||||
native_c.early_commit_rate * 100.0
|
||||
);
|
||||
println!(
|
||||
" │ Compiler hits: {} │",
|
||||
native_c.compiler_hits
|
||||
);
|
||||
println!(" │ │");
|
||||
|
||||
// Thompson Sampling convergence: Mode C should learn differently across contexts
|
||||
let c_unique_modes: std::collections::HashSet<&str> = native_c
|
||||
.skip_mode_distribution
|
||||
.values()
|
||||
.flat_map(|m| m.keys())
|
||||
.map(|s| s.as_str())
|
||||
.collect();
|
||||
println!(" │ Thompson Sampling convergence: │");
|
||||
println!(
|
||||
" │ Unique skip modes: {} (need >=2) │",
|
||||
c_unique_modes.len()
|
||||
);
|
||||
println!(" │ Skip distribution: │");
|
||||
for (bucket, dist) in &native_c.skip_mode_distribution {
|
||||
let total = dist.values().sum::<usize>().max(1);
|
||||
let parts: Vec<String> = dist
|
||||
.iter()
|
||||
.map(|(m, c)| format!("{}:{:.0}%", m, *c as f64 / total as f64 * 100.0))
|
||||
.collect();
|
||||
if parts.len() > 0 {
|
||||
println!(" │ {:<16} {} │", bucket, parts.join(" "));
|
||||
}
|
||||
}
|
||||
println!(" │ │");
|
||||
|
||||
// Ablation assertions
|
||||
let last_a = native_a.result.cycles.last().unwrap();
|
||||
let last_b = native_b.result.cycles.last().unwrap();
|
||||
let last_c = native_c.result.cycles.last().unwrap();
|
||||
let cost_decrease = if last_a.holdout_cost_per_solve > 0.0 {
|
||||
(1.0 - last_b.holdout_cost_per_solve / last_a.holdout_cost_per_solve) * 100.0
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
let robustness_gain = (last_c.holdout_noise_accuracy - last_b.holdout_noise_accuracy) * 100.0;
|
||||
|
||||
println!(" │ Ablation assertions: │");
|
||||
println!(
|
||||
" │ B vs A cost decrease: {:.1}% (need >=15%) │",
|
||||
cost_decrease
|
||||
);
|
||||
println!(
|
||||
" │ C vs B robustness: {:.1}% (need >=10%) │",
|
||||
robustness_gain
|
||||
);
|
||||
println!(" │ │");
|
||||
println!(" │ WASM module must match these learning characteristics │");
|
||||
println!(" │ (exact values may differ due to float precision) │");
|
||||
println!(" └────────────────────────────────────────────────────────┘");
|
||||
println!();
|
||||
|
||||
// Final summary
|
||||
let all_passed = native_a.result.passed && native_b.result.passed && native_c.result.passed;
|
||||
if all_passed {
|
||||
println!(" NATIVE BENCHMARK: ALL MODES PASSED");
|
||||
} else {
|
||||
println!(" NATIVE BENCHMARK: SOME MODES FAILED");
|
||||
}
|
||||
println!(" Binary size: rvf-solver-wasm.wasm ~160 KB");
|
||||
println!();
|
||||
}
|
||||
Reference in New Issue
Block a user