Squashed 'vendor/ruvector/' content from commit b64c2172

git-subtree-dir: vendor/ruvector
git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
This commit is contained in:
ruv
2026-02-28 14:39:40 -05:00
commit d803bfe2b1
7854 changed files with 3522914 additions and 0 deletions

View File

@@ -0,0 +1,110 @@
# Cargo manifest for the ruvector benchmark suite (workspace-internal, never published).
[package]
name = "ruvector-benchmarks"
version = "0.1.0"
edition = "2021"
description = "Comprehensive benchmarks for temporal reasoning and vector operations"
publish = false
[dependencies]
# Core ruvector
ruvector-core = { path = "../../crates/ruvector-core", default-features = false, features = ["parallel"] }
# Serialization
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
# NOTE(review): 2.0.0-rc.3 is a pre-release pin — confirm this is intentional
# and bump to a stable 2.x when available.
bincode = { version = "2.0.0-rc.3", features = ["serde"] }
# Error handling
anyhow = "1.0"
thiserror = "2.0"
# Random and numerics
rand = "0.8"
rand_distr = "0.4"
# Parallel processing
rayon = "1.10"
# CLI and progress
clap = { version = "4.5", features = ["derive"] }
indicatif = "0.17"
console = "0.15"
# Async
tokio = { version = "1.41", features = ["rt-multi-thread", "sync", "macros", "time", "fs"] }
futures = "0.3"
# Time handling (critical for temporal benchmarks)
chrono = { version = "0.4", features = ["serde"] }
# Logging and tracing
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] }
# Crypto for witness chains
sha2 = "0.10"
# RVF native format integration
rvf-types = { path = "../../crates/rvf/rvf-types" }
rvf-crypto = { path = "../../crates/rvf/rvf-crypto" }
rvf-wire = { path = "../../crates/rvf/rvf-wire" }
# Statistics
statistical = "1.0"
hdrhistogram = "7.5"
# HTTP for tool-augmented tests
# NOTE(review): reqwest 0.11 predates the 0.12 line (hyper 1.x) — verify it is
# compatible with the pinned tokio 1.41 feature set.
reqwest = { version = "0.11", features = ["json"] }
# Visualization (only built with the `visualize` feature)
plotters = { version = "0.3", optional = true }
# Type theory for verified reasoning (lean-agentic)
lean-agentic = "0.1"
[dev-dependencies]
tempfile = "3.13"
[features]
default = []
visualize = ["plotters"]
# One [[bin]] entry per benchmark executable; names use kebab-case,
# source files use snake_case.
[[bin]]
name = "temporal-benchmark"
path = "src/bin/temporal_benchmark.rs"
[[bin]]
name = "vector-benchmark"
path = "src/bin/vector_benchmark.rs"
[[bin]]
name = "swarm-regret"
path = "src/bin/swarm_regret.rs"
[[bin]]
name = "timepuzzle-runner"
path = "src/bin/timepuzzle_runner.rs"
[[bin]]
name = "intelligence-assessment"
path = "src/bin/intelligence_assessment.rs"
[[bin]]
name = "rvf-intelligence-bench"
path = "src/bin/rvf_intelligence_bench.rs"
[[bin]]
name = "superintelligence"
path = "src/bin/superintelligence.rs"
[[bin]]
name = "agi-proof-harness"
path = "src/bin/agi_proof_harness.rs"
[[bin]]
name = "acceptance-rvf"
path = "src/bin/acceptance_rvf.rs"
[[bin]]
name = "wasm-solver-bench"
path = "src/bin/wasm_solver_bench.rs"

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,627 @@
//! AGI Contract — Defines intelligence as a measurable, falsifiable contract.
//!
//! The AGI contract states: a system improves utility over time without violating
//! policy, while maintaining structural health.
//!
//! ## Core Metrics (all deterministic, all auditable)
//!
//! - **Solved tasks per cost** — graded outcomes normalized by compute
//! - **Stability under noise** — accuracy retention when inputs are corrupted
//! - **Contradiction rate** — solved-but-wrong / total attempted
//! - **Rollback correctness** — recovery rate when bad inputs are detected
//! - **Policy violations** — budget overruns + contradictions (must be zero)
//!
//! ## Autonomy Ladder
//!
//! Each level requires sustained health metrics before advancement:
//! 0. Read-only (observe only)
//! 1. Write to memory (store episodes, no execution)
//! 2. Execute tools (run solver, generate puzzles)
//! 3. Write to external systems (publish results)
//! 4. Deploy and operate (self-directed improvement)
use crate::intelligence_metrics::{IntelligenceAssessment, RawMetrics};
use serde::{Deserialize, Serialize};
// ═══════════════════════════════════════════════════════════════════════════
// Contract Health Snapshot
// ═══════════════════════════════════════════════════════════════════════════
/// A single point-in-time health measurement against the AGI contract.
///
/// All rate fields are fractions in `[0.0, 1.0]`. Snapshots are produced
/// by [`ContractHealth::from_raw`] and compared over time by `ContractDelta`.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ContractHealth {
    /// Solved tasks per unit cost (tasks_correct / total_steps)
    pub solved_per_cost: f64,
    /// Accuracy on noise-injected tasks
    pub noise_stability: f64,
    /// Contradiction rate: solved-but-wrong / attempted
    pub contradiction_rate: f64,
    /// Rollback correctness: successful rollbacks / attempted rollbacks
    /// (reported as 1.0 when no rollback was ever attempted)
    pub rollback_correctness: f64,
    /// Total policy violations (must be zero for contract compliance)
    pub policy_violations: usize,
    /// Clean accuracy (graded outcome baseline)
    pub accuracy: f64,
    /// Cost efficiency (0-1, higher = cheaper per solve)
    pub cost_efficiency: f64,
    /// Whether the contract is satisfied: zero violations, contradiction
    /// rate below 1%, and accuracy of at least 90% (see `from_raw`)
    pub compliant: bool,
}
impl ContractHealth {
    /// Build a health snapshot from raw benchmark counters.
    ///
    /// Ratios with a zero denominator fall back to 0.0, except rollback
    /// correctness, which defaults to 1.0: if no rollback was ever needed,
    /// recovery is counted as perfect.
    pub fn from_raw(raw: &RawMetrics) -> Self {
        // Safe division: `fallback` is returned when the denominator is zero.
        let ratio = |num: f64, den: f64, fallback: f64| {
            if den > 0.0 {
                num / den
            } else {
                fallback
            }
        };

        let accuracy = ratio(raw.tasks_correct as f64, raw.tasks_attempted as f64, 0.0);
        let solved_per_cost = ratio(raw.tasks_correct as f64, raw.total_steps as f64, 0.0);
        let noise_stability = ratio(
            raw.noise_tasks_correct as f64,
            raw.noise_tasks_attempted as f64,
            0.0,
        );
        let contradiction_rate = ratio(raw.contradictions as f64, raw.tasks_attempted as f64, 0.0);
        let rollback_correctness = ratio(
            raw.rollback_successes as f64,
            raw.rollback_attempts as f64,
            1.0, // no rollbacks needed => perfect
        );

        // Steps needed per correct solve; pessimistic (100.0) when nothing
        // was solved, which clamps efficiency to zero below.
        let steps_per_solve = if raw.tasks_correct > 0 {
            raw.total_steps as f64 / raw.tasks_correct as f64
        } else {
            100.0
        };
        // Linear map: 5 steps/solve -> 1.0 efficiency, 100 steps/solve -> 0.0.
        let cost_efficiency = (1.0 - (steps_per_solve - 5.0) / 95.0).clamp(0.0, 1.0);

        // Contract compliance: no violations, <1% contradictions, >=90% accuracy.
        let compliant =
            raw.policy_violations == 0 && contradiction_rate < 0.01 && accuracy >= 0.90;

        ContractHealth {
            solved_per_cost,
            noise_stability,
            contradiction_rate,
            rollback_correctness,
            policy_violations: raw.policy_violations,
            accuracy,
            cost_efficiency,
            compliant,
        }
    }

    /// Evaluate contract health from an `IntelligenceAssessment`.
    pub fn from_assessment(assessment: &IntelligenceAssessment) -> Self {
        Self::from_raw(&assessment.raw_data)
    }

    /// Print a formatted contract health report to stdout.
    pub fn print(&self) {
        println!(" Contract Health:");
        println!(" Solved/Cost: {:.4}", self.solved_per_cost);
        println!(" Noise Stability: {:.2}%", self.noise_stability * 100.0);
        println!(" Contradiction Rate: {:.4}%", self.contradiction_rate * 100.0);
        println!(" Rollback Correct: {:.2}%", self.rollback_correctness * 100.0);
        println!(" Policy Violations: {}", self.policy_violations);
        println!(" Accuracy: {:.2}%", self.accuracy * 100.0);
        println!(" Cost Efficiency: {:.2}%", self.cost_efficiency * 100.0);
        println!(" Compliant: {}", if self.compliant { "YES" } else { "NO" });
    }
}
// ═══════════════════════════════════════════════════════════════════════════
// Contract Trend — compares two snapshots
// ═══════════════════════════════════════════════════════════════════════════
/// Tracks improvement across contract dimensions between two measurement points.
///
/// Produced by [`ContractDelta::between`]; deltas smaller than a small
/// epsilon are not counted as improvements or regressions.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ContractDelta {
    /// Change in solved-per-cost (positive = improving)
    pub solved_per_cost_delta: f64,
    /// Change in noise stability (positive = more robust)
    pub noise_stability_delta: f64,
    /// Change in contradiction rate (negative = improving)
    pub contradiction_rate_delta: f64,
    /// Change in rollback correctness (positive = better recovery)
    pub rollback_delta: f64,
    /// Change in accuracy (positive = better)
    pub accuracy_delta: f64,
    /// Change in cost efficiency (positive = cheaper)
    pub cost_efficiency_delta: f64,
    /// Number of dimensions (out of 6) that improved
    pub dimensions_improved: usize,
    /// Number of dimensions (out of 6) that regressed
    pub dimensions_regressed: usize,
}
impl ContractDelta {
    /// Threshold below which a change is treated as float noise rather than
    /// a genuine improvement or regression.
    const EPSILON: f64 = 0.001;

    /// Compute the delta between two health snapshots (`after - before`).
    pub fn between(before: &ContractHealth, after: &ContractHealth) -> Self {
        let solved_per_cost_delta = after.solved_per_cost - before.solved_per_cost;
        let noise_stability_delta = after.noise_stability - before.noise_stability;
        let contradiction_rate_delta = after.contradiction_rate - before.contradiction_rate;
        let rollback_delta = after.rollback_correctness - before.rollback_correctness;
        let accuracy_delta = after.accuracy - before.accuracy;
        let cost_efficiency_delta = after.cost_efficiency - before.cost_efficiency;
        // Count improvements (positive is better for all except contradiction_rate)
        let deltas = [
            solved_per_cost_delta > Self::EPSILON,
            noise_stability_delta > Self::EPSILON,
            contradiction_rate_delta < -Self::EPSILON, // decrease = improvement
            rollback_delta > Self::EPSILON,
            accuracy_delta > Self::EPSILON,
            cost_efficiency_delta > Self::EPSILON,
        ];
        // FIX: the accuracy regression threshold was -0.01 while every other
        // dimension used -0.001, so small accuracy drops were silently
        // ignored. All six dimensions now share the same epsilon.
        let regressions = [
            solved_per_cost_delta < -Self::EPSILON,
            noise_stability_delta < -Self::EPSILON,
            contradiction_rate_delta > Self::EPSILON, // increase = regression
            rollback_delta < -Self::EPSILON,
            accuracy_delta < -Self::EPSILON,
            cost_efficiency_delta < -Self::EPSILON,
        ];
        ContractDelta {
            solved_per_cost_delta,
            noise_stability_delta,
            contradiction_rate_delta,
            rollback_delta,
            accuracy_delta,
            cost_efficiency_delta,
            dimensions_improved: deltas.iter().filter(|&&d| d).count(),
            dimensions_regressed: regressions.iter().filter(|&&r| r).count(),
        }
    }

    /// Print a formatted delta report; each line carries a `+`/`=`/`-`
    /// trend marker (direction-aware for contradiction rate).
    pub fn print(&self) {
        // `invert` flips the sign convention for dimensions where a
        // decrease is the improvement (contradiction rate).
        let arrow = |v: f64, invert: bool| {
            let positive = if invert { v < 0.0 } else { v > 0.0 };
            if positive {
                "+"
            } else if v == 0.0 {
                "="
            } else {
                "-"
            }
        };
        println!(" Contract Delta:");
        println!(
            " Solved/Cost: {:>+.4} [{}]",
            self.solved_per_cost_delta,
            arrow(self.solved_per_cost_delta, false)
        );
        println!(
            " Noise Stability: {:>+.4} [{}]",
            self.noise_stability_delta,
            arrow(self.noise_stability_delta, false)
        );
        println!(
            " Contradiction: {:>+.4} [{}]",
            self.contradiction_rate_delta,
            arrow(self.contradiction_rate_delta, true)
        );
        println!(
            " Rollback: {:>+.4} [{}]",
            self.rollback_delta,
            arrow(self.rollback_delta, false)
        );
        println!(
            " Accuracy: {:>+.4} [{}]",
            self.accuracy_delta,
            arrow(self.accuracy_delta, false)
        );
        println!(
            " Cost Efficiency: {:>+.4} [{}]",
            self.cost_efficiency_delta,
            arrow(self.cost_efficiency_delta, false)
        );
        println!(" Dimensions improved: {}/6", self.dimensions_improved);
        println!(" Dimensions regressed: {}/6", self.dimensions_regressed);
    }
}
// ═══════════════════════════════════════════════════════════════════════════
// Autonomy Ladder
// ═══════════════════════════════════════════════════════════════════════════
/// Autonomy level gated by sustained contract health.
///
/// Discriminant values double as indices into the per-level threshold
/// arrays in [`AutonomyGates`]. `Ord` is derived so levels compare by
/// increasing autonomy.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub enum AutonomyLevel {
    /// Level 0: Read-only observation
    ReadOnly = 0,
    /// Level 1: Write to memory (store episodes)
    WriteMemory = 1,
    /// Level 2: Execute tools (run solver)
    ExecuteTools = 2,
    /// Level 3: Write to external systems (publish results)
    WriteExternal = 3,
    /// Level 4: Deploy and operate (self-directed improvement)
    DeployOperate = 4,
}
/// Thresholds for advancing autonomy levels.
///
/// Each `[f64; 5]` array is indexed by the `AutonomyLevel` discriminant
/// (0 = ReadOnly … 4 = DeployOperate).
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct AutonomyGates {
    /// Minimum consecutive compliant cycles to advance
    pub min_compliant_cycles: usize,
    /// Maximum allowed contradiction rate per level
    pub max_contradiction_rate: [f64; 5],
    /// Minimum accuracy per level
    pub min_accuracy: [f64; 5],
    /// Minimum cost efficiency per level
    pub min_cost_efficiency: [f64; 5],
    /// Minimum noise stability per level
    pub min_noise_stability: [f64; 5],
    /// Levels at or above this value additionally require zero policy
    /// violations (see `AutonomyEvaluator::evaluate`)
    pub zero_violations_above: AutonomyLevel,
}
impl Default for AutonomyGates {
    /// Default gate table: thresholds tighten monotonically with each
    /// autonomy level; level 0 (read-only) is effectively ungated.
    fn default() -> Self {
        Self {
            min_compliant_cycles: 3,
            // L0 L1 L2 L3 L4
            max_contradiction_rate: [1.0, 0.05, 0.02, 0.01, 0.005],
            min_accuracy: [0.0, 0.70, 0.85, 0.92, 0.96],
            min_cost_efficiency: [0.0, 0.20, 0.40, 0.60, 0.75],
            min_noise_stability: [0.0, 0.50, 0.65, 0.80, 0.90],
            zero_violations_above: AutonomyLevel::ExecuteTools,
        }
    }
}
/// Evaluator that determines current autonomy level from contract history.
///
/// Stateless apart from its gate configuration; see [`AutonomyGates`].
pub struct AutonomyEvaluator {
    /// Per-level thresholds used by `evaluate`.
    pub gates: AutonomyGates,
}
impl Default for AutonomyEvaluator {
fn default() -> Self {
Self {
gates: AutonomyGates::default(),
}
}
}
impl AutonomyEvaluator {
    /// Determine the highest autonomy level supported by the health history.
    /// `history` is ordered oldest-first.
    ///
    /// A level is granted only when every one of the last
    /// `min_compliant_cycles` snapshots satisfies that level's gates;
    /// evaluation stops at the first level whose gates fail, so levels
    /// cannot be skipped.
    pub fn evaluate(&self, history: &[ContractHealth]) -> AutonomyLevel {
        let required = self.gates.min_compliant_cycles;
        // Insufficient (or no) history: stay read-only. (The `required`
        // check was previously re-evaluated inside the loop even though
        // it is loop-invariant.)
        if history.is_empty() || history.len() < required {
            return AutonomyLevel::ReadOnly;
        }
        // The recent window is the same for every candidate level — hoisted.
        let recent = &history[history.len() - required..];
        let levels = [
            AutonomyLevel::WriteMemory,
            AutonomyLevel::ExecuteTools,
            AutonomyLevel::WriteExternal,
            AutonomyLevel::DeployOperate,
        ];
        let mut level = AutonomyLevel::ReadOnly;
        for &candidate in &levels {
            let idx = candidate as usize;
            let all_pass = recent.iter().all(|h| {
                h.accuracy >= self.gates.min_accuracy[idx]
                    && h.contradiction_rate <= self.gates.max_contradiction_rate[idx]
                    && h.cost_efficiency >= self.gates.min_cost_efficiency[idx]
                    && h.noise_stability >= self.gates.min_noise_stability[idx]
                    && (candidate < self.gates.zero_violations_above || h.policy_violations == 0)
            });
            if all_pass {
                level = candidate;
            } else {
                break; // gates for this level failed — cannot advance further
            }
        }
        level
    }

    /// Print the current autonomy level and the gate values required for
    /// the next level. At the top level (4) the "next level" gates shown
    /// are level 4's own gates, since the index clamps at 4.
    pub fn print_status(&self, level: AutonomyLevel, health: &ContractHealth) {
        let labels = [
            "Read-Only",
            "Write Memory",
            "Execute Tools",
            "Write External",
            "Deploy & Operate",
        ];
        println!(
            " Autonomy Level: {} ({})",
            level as usize, labels[level as usize]
        );
        println!(" Gates for next level:");
        let next = (level as usize + 1).min(4);
        println!(
            " Accuracy: {:.0}% (need {:.0}%)",
            health.accuracy * 100.0,
            self.gates.min_accuracy[next] * 100.0
        );
        println!(
            " Contradiction: {:.3}% (need <{:.3}%)",
            health.contradiction_rate * 100.0,
            self.gates.max_contradiction_rate[next] * 100.0
        );
        println!(
            " Cost Eff: {:.0}% (need {:.0}%)",
            health.cost_efficiency * 100.0,
            self.gates.min_cost_efficiency[next] * 100.0
        );
        println!(
            " Noise Stab: {:.0}% (need {:.0}%)",
            health.noise_stability * 100.0,
            self.gates.min_noise_stability[next] * 100.0
        );
    }
}
// ═══════════════════════════════════════════════════════════════════════════
// Viability Checklist
// ═══════════════════════════════════════════════════════════════════════════
/// The 5 viability checks that determine if the system is on an AGI trajectory.
///
/// Evaluated from a contract health history via [`ViabilityChecklist::evaluate`];
/// all five must hold for [`ViabilityChecklist::all_pass`] to return true.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ViabilityChecklist {
    /// Can replay runs and get identical grades
    pub deterministic_replay: bool,
    /// Improves utility over time without raising policy violations
    pub improving_without_violations: bool,
    /// Can roll back bad learning reliably
    pub reliable_rollback: bool,
    /// Can generate infinite novel tasks with automatic grading
    pub infinite_gradeable_tasks: bool,
    /// Cost per solve trending down over weeks
    pub cost_trending_down: bool,
}
impl ViabilityChecklist {
    /// Evaluate the checklist from a contract health history
    /// (ordered oldest-first).
    pub fn evaluate(history: &[ContractHealth]) -> Self {
        // Deterministic replay: verified externally (always true in our harness)
        let deterministic_replay = true;
        // Improving without violations: final accuracy is at least the
        // initial accuracy, and no snapshot ever recorded a violation.
        // (The former explicit `last.policy_violations == 0` check was
        // redundant — the `all` over the whole history covers the last
        // element — and has been removed.)
        let improving_without_violations = if history.len() >= 2 {
            let first = &history[0];
            let last = &history[history.len() - 1];
            last.accuracy >= first.accuracy
                && history.iter().all(|h| h.policy_violations == 0)
        } else {
            false
        };
        // Reliable rollback: rollback correctness >= 80% in every snapshot.
        let reliable_rollback = history.iter().all(|h| h.rollback_correctness >= 0.8);
        // Infinite gradeable tasks: always true (PuzzleGenerator is unbounded)
        let infinite_gradeable_tasks = true;
        // Cost trending down: mean solved-per-cost over the last third of
        // the history exceeds the mean over the first third. `len >= 3`
        // guarantees both windows are non-empty, so neither mean divides
        // by zero.
        let cost_trending_down = if history.len() >= 3 {
            let first_len = history.len() / 3;
            let first_third: f64 = history[..first_len]
                .iter()
                .map(|h| h.solved_per_cost)
                .sum::<f64>()
                / first_len as f64;
            let last_start = history.len() * 2 / 3;
            let last_third: f64 = history[last_start..]
                .iter()
                .map(|h| h.solved_per_cost)
                .sum::<f64>()
                / (history.len() - last_start) as f64;
            last_third > first_third
        } else {
            false
        };
        ViabilityChecklist {
            deterministic_replay,
            improving_without_violations,
            reliable_rollback,
            infinite_gradeable_tasks,
            cost_trending_down,
        }
    }

    /// True only when every one of the five checks passes.
    pub fn all_pass(&self) -> bool {
        self.deterministic_replay
            && self.improving_without_violations
            && self.reliable_rollback
            && self.infinite_gradeable_tasks
            && self.cost_trending_down
    }

    /// Print a PASS/FAIL line per check plus the overall verdict.
    pub fn print(&self) {
        let check = |b: bool| if b { "PASS" } else { "FAIL" };
        println!(" Viability Checklist:");
        println!(
            " 1. Deterministic replay: {}",
            check(self.deterministic_replay)
        );
        println!(
            " 2. Improving w/o violations: {}",
            check(self.improving_without_violations)
        );
        println!(
            " 3. Reliable rollback: {}",
            check(self.reliable_rollback)
        );
        println!(
            " 4. Infinite gradeable tasks: {}",
            check(self.infinite_gradeable_tasks)
        );
        println!(
            " 5. Cost trending down: {}",
            check(self.cost_trending_down)
        );
        println!(
            " Overall: {}",
            if self.all_pass() {
                "VIABLE AGI TRAJECTORY"
            } else {
                "NOT YET VIABLE"
            }
        );
    }
}
// ═══════════════════════════════════════════════════════════════════════════
// Tests
// ═══════════════════════════════════════════════════════════════════════════
#[cfg(test)]
mod tests {
    use super::*;

    /// `from_raw` computes every ratio from the counters and flags a
    /// compliant snapshot when violations are zero, contradictions are
    /// below 1%, and accuracy is at least 90%.
    #[test]
    fn contract_health_from_raw() {
        let mut raw = RawMetrics::default();
        raw.tasks_attempted = 100;
        raw.tasks_completed = 95;
        raw.tasks_correct = 92;
        raw.total_steps = 600;
        raw.noise_tasks_attempted = 30;
        raw.noise_tasks_correct = 25;
        raw.contradictions = 0; // zero contradictions for compliance
        raw.rollback_attempts = 5;
        raw.rollback_successes = 4;
        let health = ContractHealth::from_raw(&raw);
        assert!((health.accuracy - 0.92).abs() < 0.01);
        assert!((health.solved_per_cost - 92.0 / 600.0).abs() < 0.01);
        assert!((health.noise_stability - 25.0 / 30.0).abs() < 0.01);
        assert!((health.contradiction_rate).abs() < 0.001);
        assert!((health.rollback_correctness - 0.8).abs() < 0.01);
        assert!(health.compliant); // 0 violations, 0% contradictions, >=90% accuracy
    }

    /// When every dimension moves in its "better" direction between two
    /// snapshots, all six count as improved and none as regressed.
    #[test]
    fn contract_delta_detects_improvement() {
        let before = ContractHealth {
            solved_per_cost: 0.10,
            noise_stability: 0.70,
            contradiction_rate: 0.03,
            rollback_correctness: 0.80,
            policy_violations: 0,
            accuracy: 0.85,
            cost_efficiency: 0.50,
            compliant: false,
        };
        let after = ContractHealth {
            solved_per_cost: 0.15,
            noise_stability: 0.85,
            contradiction_rate: 0.01,
            rollback_correctness: 0.90,
            policy_violations: 0,
            accuracy: 0.93,
            cost_efficiency: 0.70,
            compliant: true,
        };
        let delta = ContractDelta::between(&before, &after);
        assert_eq!(delta.dimensions_improved, 6);
        assert_eq!(delta.dimensions_regressed, 0);
    }

    /// Empty history stays at ReadOnly; three consecutive snapshots that
    /// clear the L1 gates (but not L2's) advance exactly one level.
    #[test]
    fn autonomy_ladder_advances() {
        let evaluator = AutonomyEvaluator::default();
        // No history => ReadOnly
        assert_eq!(evaluator.evaluate(&[]), AutonomyLevel::ReadOnly);
        // 3 compliant cycles at L1 level
        let h = ContractHealth {
            solved_per_cost: 0.15,
            noise_stability: 0.55,
            contradiction_rate: 0.04,
            rollback_correctness: 1.0,
            policy_violations: 0,
            accuracy: 0.75,
            cost_efficiency: 0.30,
            compliant: true,
        };
        let history = vec![h.clone(), h.clone(), h.clone()];
        assert_eq!(evaluator.evaluate(&history), AutonomyLevel::WriteMemory);
    }

    /// A monotonically improving, violation-free 3-snapshot history passes
    /// all five viability checks.
    #[test]
    fn viability_checklist_basic() {
        let h1 = ContractHealth {
            solved_per_cost: 0.10,
            noise_stability: 0.70,
            contradiction_rate: 0.01,
            rollback_correctness: 0.90,
            policy_violations: 0,
            accuracy: 0.85,
            cost_efficiency: 0.50,
            compliant: true,
        };
        let h2 = ContractHealth {
            solved_per_cost: 0.12,
            noise_stability: 0.80,
            contradiction_rate: 0.005,
            rollback_correctness: 0.95,
            policy_violations: 0,
            accuracy: 0.90,
            cost_efficiency: 0.60,
            compliant: true,
        };
        let h3 = ContractHealth {
            solved_per_cost: 0.15,
            noise_stability: 0.85,
            contradiction_rate: 0.002,
            rollback_correctness: 0.95,
            policy_violations: 0,
            accuracy: 0.93,
            cost_efficiency: 0.70,
            compliant: true,
        };
        let viability = ViabilityChecklist::evaluate(&[h1, h2, h3]);
        assert!(viability.deterministic_replay);
        assert!(viability.improving_without_violations);
        assert!(viability.reliable_rollback);
        assert!(viability.infinite_gradeable_tasks);
        assert!(viability.cost_trending_down);
        assert!(viability.all_pass());
    }
}

View File

@@ -0,0 +1,166 @@
//! Publishable RVF Acceptance Test — CLI entry point.
//!
//! Generates or verifies a deterministic acceptance test manifest with
//! SHAKE-256 witness chain (rvf-crypto native). Same seed → same outcomes
//! → same root hash.
//!
//! ```bash
//! # Generate manifest (JSON + .rvf binary)
//! cargo run --bin acceptance-rvf -- generate -o manifest.json
//!
//! # Generate with custom config
//! cargo run --bin acceptance-rvf -- generate -o manifest.json \
//! --holdout 200 --training 200 --cycles 5
//!
//! # Verify a manifest (re-runs and compares root hash)
//! cargo run --bin acceptance-rvf -- verify -i manifest.json
//!
//! # Verify the .rvf binary witness chain
//! cargo run --bin acceptance-rvf -- verify-rvf -i acceptance_manifest.rvf
//! ```
use clap::{Parser, Subcommand};
use ruvector_benchmarks::acceptance_test::HoldoutConfig;
use ruvector_benchmarks::publishable_rvf::{
generate_manifest_with_rvf, verify_manifest, verify_rvf_binary,
};
/// Top-level CLI: dispatches to one of the `Commands` subcommands.
#[derive(Parser)]
#[command(name = "acceptance-rvf")]
#[command(about = "Publishable RVF acceptance test with SHAKE-256 witness chain")]
struct Cli {
    /// Subcommand to run (generate / verify / verify-rvf).
    #[command(subcommand)]
    command: Commands,
}
/// Subcommands for the acceptance-rvf binary.
#[derive(Subcommand)]
enum Commands {
    /// Generate a new acceptance test manifest (JSON + .rvf binary)
    Generate {
        /// Output JSON file path
        #[arg(short, long, default_value = "acceptance_manifest.json")]
        output: String,
        /// Holdout set size
        #[arg(long, default_value_t = 200)]
        holdout: usize,
        /// Training puzzles per cycle
        #[arg(long, default_value_t = 200)]
        training: usize,
        /// Number of training cycles
        #[arg(long, default_value_t = 5)]
        cycles: usize,
        /// Step budget per puzzle
        #[arg(long, default_value_t = 400)]
        budget: usize,
        /// Verbose output
        #[arg(short, long)]
        verbose: bool,
    },
    /// Verify an existing manifest by replaying and comparing root hash
    Verify {
        /// Input JSON file path
        #[arg(short, long)]
        input: String,
    },
    /// Verify a native .rvf binary witness chain
    VerifyRvf {
        /// Input .rvf file path
        #[arg(short, long)]
        input: String,
    },
}
/// CLI entry point: generate a manifest, verify a JSON manifest by replay,
/// or verify a .rvf binary witness chain. Exits with status 0 on success
/// and 1 on any failed check.
fn main() -> anyhow::Result<()> {
    let cli = Cli::parse();
    match cli.command {
        Commands::Generate {
            output,
            holdout,
            training,
            cycles,
            budget,
            verbose,
        } => {
            let config = HoldoutConfig {
                holdout_size: holdout,
                training_per_cycle: training,
                cycles,
                step_budget: budget,
                min_accuracy: 0.50,
                min_dimensions_improved: 1,
                verbose,
                ..Default::default()
            };
            // Derive the .rvf path by swapping the output's file extension.
            // FIX: the previous `output.replace(".json", ".rvf")` rewrote
            // every occurrence of ".json" anywhere in the path, and when the
            // output had no ".json" suffix at all it produced the SAME path,
            // so the JSON write below would clobber the .rvf binary.
            let rvf_path = std::path::Path::new(&output)
                .with_extension("rvf")
                .to_string_lossy()
                .into_owned();
            println!("Generating acceptance test manifest...");
            println!(
                " holdout={}, training={}, cycles={}, budget={}",
                holdout, training, cycles, budget
            );
            println!();
            let manifest = generate_manifest_with_rvf(&config, Some(&rvf_path))?;
            manifest.print_summary();
            let json = serde_json::to_string_pretty(&manifest)?;
            std::fs::write(&output, &json)?;
            println!(" JSON manifest: {}", output);
            println!(" RVF binary: {}", rvf_path);
            println!(" Chain root hash: {}", manifest.chain_root_hash);
            println!();
            // Exit code signals pass/fail to CI.
            if manifest.all_passed {
                std::process::exit(0);
            } else {
                std::process::exit(1);
            }
        }
        Commands::Verify { input } => {
            println!("Loading manifest from: {}", input);
            let json = std::fs::read_to_string(&input)?;
            let manifest: ruvector_benchmarks::publishable_rvf::RvfManifest =
                serde_json::from_str(&json)?;
            println!(" Chain length: {}", manifest.chain_length);
            // Show only a hash prefix; guard against hashes shorter than 32 chars.
            println!(
                " Expected root: {}",
                &manifest.chain_root_hash[..32.min(manifest.chain_root_hash.len())]
            );
            println!();
            println!("Re-running acceptance test with same config...");
            let result = verify_manifest(&manifest)?;
            result.print();
            if result.passed() {
                println!(" VERIFICATION: PASSED — outcomes are identical");
                std::process::exit(0);
            } else {
                println!(" VERIFICATION: FAILED — outcomes differ");
                std::process::exit(1);
            }
        }
        Commands::VerifyRvf { input } => {
            println!("Verifying .rvf witness chain: {}", input);
            match verify_rvf_binary(&input) {
                Ok(count) => {
                    println!(" WITNESS_SEG verified: {} entries, chain intact", count);
                    std::process::exit(0);
                }
                Err(e) => {
                    println!(" VERIFICATION FAILED: {}", e);
                    std::process::exit(1);
                }
            }
        }
    }
}

View File

@@ -0,0 +1,204 @@
//! AGI Proof Harness — Nightly runner that publishes contract metrics.
//!
//! Publishes:
//! - Success rate
//! - Cost per solve
//! - Robustness under noise
//! - Policy compliance
//! - Contradiction rate
//! - Rollback correctness
//! - Viability checklist status
//! - Autonomy level
//!
//! Usage:
//! cargo run --bin agi-proof-harness
//! cargo run --bin agi-proof-harness -- --holdout 1000 --cycles 10 --verbose
//! cargo run --bin agi-proof-harness -- --full # 10K training, 1K holdout, 10 cycles
use anyhow::Result;
use clap::Parser;
use ruvector_benchmarks::acceptance_test::{
run_ablation_comparison, run_acceptance_test, HoldoutConfig,
};
use ruvector_benchmarks::agi_contract::{AutonomyEvaluator, ContractHealth, ViabilityChecklist};
use ruvector_benchmarks::intelligence_metrics::IntelligenceCalculator;
use ruvector_benchmarks::superintelligence::{run_pathway, SIConfig};
/// CLI arguments for the proof harness; `--full` overrides the sizing
/// flags with the 10K-training / 1K-holdout / 10-cycle configuration.
#[derive(Parser, Debug)]
#[command(name = "agi-proof-harness")]
#[command(about = "AGI contract proof harness — publishes nightly metrics")]
struct Args {
    /// Holdout evaluation set size
    #[arg(long, default_value = "200")]
    holdout: usize,
    /// Training tasks per cycle
    #[arg(long, default_value = "200")]
    training: usize,
    /// Number of improvement cycles
    #[arg(long, default_value = "5")]
    cycles: usize,
    /// Frozen holdout seed (default 3735928559 == 0xDEADBEEF)
    #[arg(long, default_value = "3735928559")]
    holdout_seed: u64,
    /// Training seed
    #[arg(long, default_value = "42")]
    training_seed: u64,
    /// Noise injection rate (fraction of tasks, 0.0-1.0)
    #[arg(long, default_value = "0.25")]
    noise: f64,
    /// Step budget per task
    #[arg(long, default_value = "400")]
    step_budget: usize,
    /// Full acceptance test (10K training, 1K holdout, 10 cycles)
    #[arg(long)]
    full: bool,
    /// Minimum accuracy threshold (ignored when --full is set, which uses 0.95)
    #[arg(long, default_value = "0.80")]
    min_accuracy: f64,
    /// Run three-mode ablation comparison (A/B/C)
    #[arg(long)]
    ablation: bool,
    /// Also run the 5-level SI pathway
    #[arg(long)]
    pathway: bool,
    /// Verbose output
    #[arg(short, long)]
    verbose: bool,
}
/// Harness entry point: runs the acceptance test, optionally the ablation
/// comparison and SI pathway, then prints contract health, autonomy level,
/// and the viability checklist for the final cycle.
fn main() -> Result<()> {
    let args = Args::parse();
    println!();
    println!("╔══════════════════════════════════════════════════════════════╗");
    println!("║ AGI PROOF HARNESS ║");
    println!("║ Contract-based intelligence measurement ║");
    println!("╚══════════════════════════════════════════════════════════════╝");
    println!();
    // `--full` pins the heavyweight configuration; otherwise the sizing
    // flags are taken from the CLI. Seeds and noise come from the CLI in
    // both modes.
    let config = if args.full {
        HoldoutConfig {
            holdout_size: 1000,
            training_per_cycle: 1000,
            cycles: 10,
            holdout_seed: args.holdout_seed,
            training_seed: args.training_seed,
            noise_rate: args.noise,
            step_budget: args.step_budget,
            min_accuracy: 0.95,
            min_dimensions_improved: 2,
            verbose: args.verbose,
        }
    } else {
        HoldoutConfig {
            holdout_size: args.holdout,
            training_per_cycle: args.training,
            cycles: args.cycles,
            holdout_seed: args.holdout_seed,
            training_seed: args.training_seed,
            noise_rate: args.noise,
            step_budget: args.step_budget,
            min_accuracy: args.min_accuracy,
            min_dimensions_improved: 2,
            verbose: args.verbose,
        }
    };
    println!(
        " Config: holdout={}, training/cycle={}, cycles={}, noise={:.0}%",
        config.holdout_size,
        config.training_per_cycle,
        config.cycles,
        config.noise_rate * 100.0
    );
    println!(
        " Seeds: holdout=0x{:X}, training={}",
        config.holdout_seed, config.training_seed
    );
    println!();
    // ─── Run Acceptance Test ─────────────────────────────────────────
    println!(" Running acceptance test...");
    let result = run_acceptance_test(&config)?;
    result.print();
    // ─── Ablation Comparison ─────────────────────────────────────────
    if args.ablation {
        println!(" Running ablation comparison (A / B / C)...");
        let comparison = run_ablation_comparison(&config)?;
        comparison.print();
    }
    // ─── Contract Health Summary ─────────────────────────────────────
    if let Some(last_cycle) = result.cycles.last() {
        println!();
        last_cycle.contract_health.print();
        // ─── Autonomy Level ──────────────────────────────────────────
        let health_history: Vec<ContractHealth> = result
            .cycles
            .iter()
            .map(|c| c.contract_health.clone())
            .collect();
        let evaluator = AutonomyEvaluator::default();
        let level = evaluator.evaluate(&health_history);
        println!();
        evaluator.print_status(level, &last_cycle.contract_health);
        // ─── Viability Checklist ─────────────────────────────────────
        let viability = ViabilityChecklist::evaluate(&health_history);
        println!();
        viability.print();
    }
    // ─── Optional: SI Pathway ────────────────────────────────────────
    if args.pathway {
        println!();
        println!(" Running 5-level SI pathway...");
        let si_config = SIConfig {
            episodes_per_level: 6,
            tasks_per_episode: 15,
            verbose: args.verbose,
            ..Default::default()
        };
        let pathway_result = run_pathway(&si_config)?;
        pathway_result.print();
        // Show contract health for peak level.
        // FIX: `partial_cmp(..).unwrap()` would panic if any iq_score were
        // NaN; `f64::total_cmp` gives a total order and cannot panic.
        if let Some(peak) = pathway_result
            .levels
            .iter()
            .max_by(|a, b| a.iq_score.total_cmp(&b.iq_score))
        {
            let health = ContractHealth::from_raw(&peak.raw_metrics);
            println!(" Peak Level ({}) Contract:", peak.name);
            health.print();
            let calculator = IntelligenceCalculator::default();
            let assessment = calculator.calculate(&peak.raw_metrics);
            println!(" Multi-dimensional IQ: {:.1}", assessment.overall_score);
            println!(
                " Cost efficiency: {:.2}",
                assessment.cost.cost_efficiency
            );
            println!(
                " Robustness score: {:.2}",
                assessment.robustness.robustness_score
            );
        }
    }
    println!();
    Ok(())
}

View File

@@ -0,0 +1,355 @@
//! Intelligence Assessment Runner
//!
//! Runs comprehensive intelligence assessment across all benchmark types.
//!
//! Usage:
//! cargo run --bin intelligence-assessment -- --episodes 10 --puzzles 50
use anyhow::Result;
use clap::Parser;
use ruvector_benchmarks::{
intelligence_metrics::{
print_intelligence_report, DifficultyStats, EpisodeMetrics, IntelligenceCalculator,
RawMetrics,
},
swarm_regret::SwarmController,
temporal::{AdaptiveSolver, TemporalSolver},
timepuzzles::{PuzzleGenerator, PuzzleGeneratorConfig},
};
/// CLI arguments for the intelligence assessment runner.
#[derive(Parser, Debug)]
#[command(name = "intelligence-assessment")]
#[command(about = "Run comprehensive intelligence assessment")]
struct Args {
    /// Number of episodes for regret tracking
    #[arg(short, long, default_value = "10")]
    episodes: usize,
    /// Tasks per episode
    #[arg(short, long, default_value = "10")]
    tasks_per_episode: usize,
    /// Enable calendar tool
    // NOTE(review): a bool arg with `default_value = "true"` — confirm how
    // clap 4 parses this (whether `--calendar false` is accepted, or the
    // flag can only re-assert true).
    #[arg(long, default_value = "true")]
    calendar: bool,
    /// Enable adaptive learning (ReasoningBank)
    // NOTE(review): same bool/default_value caveat as `calendar`.
    #[arg(long, default_value = "true")]
    adaptive: bool,
    /// Random seed (None => non-deterministic puzzle generation)
    #[arg(long)]
    seed: Option<u64>,
    /// Verbose output
    #[arg(short, long)]
    verbose: bool,
}
/// Entry point for the comprehensive intelligence assessment.
///
/// Runs `args.episodes` episodes of generated temporal puzzles through either
/// the adaptive (`AdaptiveSolver`) or the basic (`TemporalSolver`) pipeline,
/// accumulates `RawMetrics`, scores them with `IntelligenceCalculator`, and
/// prints a graded report with recommendations.
fn main() -> Result<()> {
    let args = Args::parse();
    println!("╔══════════════════════════════════════════════════════════════╗");
    println!("║            Comprehensive Intelligence Assessment             ║");
    println!("║     Measuring Reasoning, Learning & Cognitive Abilities      ║");
    println!("╚══════════════════════════════════════════════════════════════╝");
    println!();
    // Initialize metrics collector
    let mut raw_metrics = RawMetrics::default();
    // Initialize components
    let mut controller = SwarmController::new(args.tasks_per_episode);
    // Choose solver based on adaptive flag: exactly one of the two is Some.
    let mut adaptive_solver = if args.adaptive {
        Some(AdaptiveSolver::new())
    } else {
        None
    };
    let mut basic_solver = if !args.adaptive {
        let mut s = TemporalSolver::with_tools(args.calendar, false);
        s.max_steps = 100;
        Some(s)
    } else {
        None
    };
    let puzzle_config = PuzzleGeneratorConfig {
        min_difficulty: 1,
        max_difficulty: 10,
        constraint_density: 3,
        seed: args.seed,
        ..Default::default()
    };
    println!("🔧 Configuration:");
    println!("    Episodes:          {}", args.episodes);
    println!("    Tasks/episode:     {}", args.tasks_per_episode);
    println!("    Calendar tool:     {}", args.calendar);
    // BUG FIX: the original format string had no space after the colon
    // ("Adaptive learning:{}"), misaligning this value with the lines above.
    println!("    Adaptive learning: {}", args.adaptive);
    println!();
    println!("🏃 Running assessment...");
    println!();
    // Run episodes
    for ep in 0..args.episodes {
        controller.start_episode();
        // Generate a fresh batch of puzzles for this episode
        let mut generator = PuzzleGenerator::new(puzzle_config.clone());
        let puzzles = generator.generate_batch(args.tasks_per_episode)?;
        let mut solved = 0;
        let mut correct = 0;
        let mut total_steps = 0;
        let mut total_tool_calls = 0;
        let mut total_latency = 0u64;
        // Solve puzzles and collect metrics
        for puzzle in &puzzles {
            raw_metrics.tasks_attempted += 1;
            // Use adaptive or basic solver; one of them was built above.
            let result = if let Some(ref mut solver) = adaptive_solver {
                solver.solve(puzzle)?
            } else if let Some(ref mut solver) = basic_solver {
                solver.solve(puzzle)?
            } else {
                // Exactly one solver is constructed from `args.adaptive`.
                unreachable!()
            };
            if result.solved {
                solved += 1;
                raw_metrics.tasks_completed += 1;
            }
            if result.correct {
                correct += 1;
                raw_metrics.tasks_correct += 1;
            }
            total_steps += result.steps;
            total_tool_calls += result.tool_calls;
            total_latency += result.latency_ms;
            raw_metrics.total_steps += result.steps;
            raw_metrics.total_tool_calls += result.tool_calls;
            raw_metrics.total_latency_ms += result.latency_ms;
            // Track by difficulty
            let entry = raw_metrics
                .by_difficulty
                .entry(puzzle.difficulty)
                .or_insert(DifficultyStats {
                    attempted: 0,
                    completed: 0,
                    correct: 0,
                    avg_steps: 0.0,
                });
            entry.attempted += 1;
            if result.solved {
                entry.completed += 1;
            }
            if result.correct {
                entry.correct += 1;
            }
        }
        // Record episode for swarm controller
        controller.complete_episode(
            solved,
            correct,
            total_steps,
            total_tool_calls,
            total_latency,
        );
        // Record episode metrics
        let episode_accuracy = if args.tasks_per_episode > 0 {
            correct as f64 / args.tasks_per_episode as f64
        } else {
            0.0
        };
        let last_ep = controller
            .regret
            .episodes
            .last()
            .expect("complete_episode() records an episode");
        raw_metrics.episodes.push(EpisodeMetrics {
            episode: ep + 1,
            accuracy: episode_accuracy,
            reward: last_ep.reward,
            regret: last_ep.regret(),
            cumulative_regret: controller.regret.current_cumulative_regret(),
        });
        if args.verbose {
            println!(
                "  Episode {:2}: Accuracy {:.1}%, Regret {:.2}",
                ep + 1,
                episode_accuracy * 100.0,
                last_ep.regret()
            );
        } else {
            // Quiet mode: one progress dot per episode.
            print!(".");
            use std::io::Write;
            std::io::stdout().flush()?;
        }
    }
    if !args.verbose {
        println!();
    }
    println!();
    // Update difficulty stats with average steps
    for (_, stats) in raw_metrics.by_difficulty.iter_mut() {
        if stats.attempted > 0 {
            // This is a simplification - the global average is applied to
            // every bucket; exact figures would require per-difficulty
            // step tracking.
            stats.avg_steps = raw_metrics.total_steps as f64 / raw_metrics.tasks_attempted as f64;
        }
    }
    // Calculate intelligence assessment
    let calculator = IntelligenceCalculator::default();
    let assessment = calculator.calculate(&raw_metrics);
    // Print report
    print_intelligence_report(&assessment);
    // Additional insights
    println!();
    println!("╔══════════════════════════════════════════════════════════════╗");
    println!("║                     Performance Summary                      ║");
    println!("╚══════════════════════════════════════════════════════════════╝");
    println!();
    println!("📊 Task Performance:");
    println!("    Tasks Attempted:  {}", raw_metrics.tasks_attempted);
    println!("    Tasks Completed:  {}", raw_metrics.tasks_completed);
    println!("    Tasks Correct:    {}", raw_metrics.tasks_correct);
    // BUG FIX: guard the division so `--episodes 0` (or zero tasks) prints
    // 0.0% instead of NaN%.
    let overall_accuracy = if raw_metrics.tasks_attempted > 0 {
        raw_metrics.tasks_correct as f64 / raw_metrics.tasks_attempted as f64 * 100.0
    } else {
        0.0
    };
    println!("    Overall Accuracy: {:.1}%", overall_accuracy);
    println!();
    println!("📈 Learning Progress:");
    let regret_summary = controller.regret.summary();
    println!("    Cumulative Regret: {:.2}", regret_summary.total_regret);
    println!("    Average Regret:    {:.4}", regret_summary.average_regret);
    println!(
        "    Sublinear:         {}",
        if regret_summary.is_sublinear {
            "Yes ✓"
        } else {
            "No ✗"
        }
    );
    println!(
        "    Regret Trend:      {:.4} ({})",
        regret_summary.regret_trend,
        if regret_summary.regret_trend < 0.0 {
            "decreasing ✓"
        } else {
            "increasing ✗"
        }
    );
    println!();
    // Grade the overall performance on a simple letter scale.
    let grade = if assessment.overall_score >= 90.0 {
        "A+ (Excellent)"
    } else if assessment.overall_score >= 80.0 {
        "A (Very Good)"
    } else if assessment.overall_score >= 70.0 {
        "B (Good)"
    } else if assessment.overall_score >= 60.0 {
        "C (Adequate)"
    } else if assessment.overall_score >= 50.0 {
        "D (Below Average)"
    } else {
        "F (Needs Improvement)"
    };
    println!("🎯 Final Grade: {}", grade);
    println!();
    // Recommendations driven by the weakest sub-scores.
    println!("💡 Recommendations:");
    if assessment.capabilities.temporal_reasoning < 70.0 {
        println!("   • Improve temporal reasoning with more constraint examples");
    }
    if assessment.learning.regret_sublinearity < 0.5 {
        println!("   • Increase episodes to achieve sublinear regret");
    }
    if assessment.tool_use.utilization_effectiveness < 0.7 {
        println!("   • Better tool selection needed for complex tasks");
    }
    if assessment.meta_cognition.strategy_adaptation < 0.5 {
        println!("   • Enable adaptive strategy switching");
    }
    if assessment.overall_score >= 70.0 {
        println!("   • Good performance! Consider harder difficulty levels");
    }
    // Show adaptive learning progress if enabled
    if let Some(ref solver) = adaptive_solver {
        println!();
        println!("╔══════════════════════════════════════════════════════════════╗");
        println!("║                  Adaptive Learning Progress                  ║");
        println!("╚══════════════════════════════════════════════════════════════╝");
        println!();
        let progress = solver.learning_progress();
        println!("🧠 ReasoningBank Statistics:");
        println!("    Total trajectories: {}", progress.total_trajectories);
        println!(
            "    Success rate:       {:.1}%",
            progress.success_rate * 100.0
        );
        println!("    Improvement rate:   {:.4}", progress.improvement_rate);
        println!("    Patterns learned:   {}", progress.patterns_learned);
        println!("    Strategies tried:   {}", progress.strategies_tried);
        println!(
            "    Is improving:       {}",
            if progress.is_improving {
                "Yes ✓"
            } else {
                "No ✗"
            }
        );
        // Show learned patterns with enough observations to be meaningful.
        if !solver.reasoning_bank.patterns.is_empty() {
            println!();
            println!("📚 Learned Patterns:");
            for (constraint_type, patterns) in &solver.reasoning_bank.patterns {
                for p in patterns.iter().filter(|p| p.observations >= 3) {
                    println!(
                        "{}: {} strategy ({:.0}% success, {} obs)",
                        constraint_type,
                        p.best_strategy,
                        p.success_rate * 100.0,
                        p.observations
                    );
                }
            }
        }
        // Show per-strategy aggregate statistics.
        if !solver.reasoning_bank.strategy_stats.is_empty() {
            println!();
            println!("📊 Strategy Performance:");
            for (strategy, stats) in &solver.reasoning_bank.strategy_stats {
                println!(
                    "{}: {:.1}% success ({} attempts, {:.1} avg steps)",
                    strategy,
                    stats.success_rate() * 100.0,
                    stats.attempts,
                    stats.avg_steps()
                );
            }
        }
    }
    Ok(())
}

View File

@@ -0,0 +1,180 @@
//! RVF Intelligence Benchmark Runner
//!
//! Runs head-to-head comparison across 6 intelligence verticals:
//! Baseline (no learning) vs. RVF-Learning (full pipeline).
//!
//! Usage:
//! cargo run --bin rvf-intelligence-bench -- --episodes 15 --tasks 25 --verbose
//! cargo run --bin rvf-intelligence-bench -- --noise 0.4 --step-budget 300
use anyhow::Result;
use clap::Parser;
use ruvector_benchmarks::intelligence_metrics::IntelligenceCalculator;
use ruvector_benchmarks::rvf_intelligence_bench::{run_comparison, BenchmarkConfig};
#[derive(Parser, Debug)]
#[command(name = "rvf-intelligence-bench")]
#[command(about = "Benchmark intelligence with and without RVF learning across 6 verticals")]
// CLI arguments (clap derive). The `///` doc comments on the fields below are
// user-facing: clap renders them verbatim as the `--help` text for each flag.
// NOTE(review): numeric ranges stated in the help text (e.g. 1-10, 0.0-1.0)
// are not validated here — presumably checked downstream; confirm.
struct Args {
    /// Number of episodes per mode
    #[arg(short, long, default_value = "10")]
    episodes: usize,
    /// Tasks per episode
    #[arg(short, long, default_value = "20")]
    tasks: usize,
    /// Minimum difficulty (1-10)
    #[arg(long, default_value = "1")]
    min_diff: u8,
    /// Maximum difficulty (1-10)
    #[arg(long, default_value = "10")]
    max_diff: u8,
    /// Random seed for reproducibility
    #[arg(long, default_value = "42")]
    seed: u64,
    /// Noise probability (0.0-1.0)
    #[arg(long, default_value = "0.25")]
    noise: f64,
    /// Step budget per episode
    #[arg(long, default_value = "400")]
    step_budget: usize,
    /// Max retries for error recovery (RVF only)
    #[arg(long, default_value = "2")]
    max_retries: usize,
    /// Retention fraction (0.0-1.0)
    #[arg(long, default_value = "0.15")]
    retention: f64,
    /// Token budget per episode (RVF mode)
    #[arg(long, default_value = "200000")]
    token_budget: u32,
    /// Tool call budget per episode (RVF mode)
    #[arg(long, default_value = "50")]
    tool_budget: u16,
    /// Verbose per-episode output
    #[arg(short, long)]
    verbose: bool,
}
/// Entry point: parses CLI arguments, runs the baseline-vs-RVF comparison,
/// and prints the comparison report, both detailed IQ assessments, and a
/// final verdict on the IQ delta.
fn main() -> Result<()> {
    let cli = Args::parse();
    println!();
    println!("================================================================");
    println!("  RVF Intelligence Benchmark v2 — Six Verticals");
    println!("  Baseline vs. RVF-Learning (noise + step limits + retry + transfer)");
    println!("================================================================");
    println!();
    println!("  Configuration:");
    println!("    Episodes:       {}", cli.episodes);
    println!("    Tasks/episode:  {}", cli.tasks);
    println!("    Difficulty:     {}-{}", cli.min_diff, cli.max_diff);
    println!("    Seed:           {}", cli.seed);
    println!("    Noise prob:     {:.0}%", cli.noise * 100.0);
    println!("    Step budget/ep: {}", cli.step_budget);
    println!("    Max retries:    {}", cli.max_retries);
    println!("    Retention:      {:.0}%", cli.retention * 100.0);
    println!();
    // Map the CLI onto the benchmark configuration; everything not exposed
    // as a flag keeps its default.
    let config = BenchmarkConfig {
        episodes: cli.episodes,
        tasks_per_episode: cli.tasks,
        min_difficulty: cli.min_diff,
        max_difficulty: cli.max_diff,
        seed: Some(cli.seed),
        token_budget: cli.token_budget,
        tool_call_budget: cli.tool_budget,
        verbose: cli.verbose,
        noise_probability: cli.noise,
        step_budget_per_episode: cli.step_budget,
        max_retries: cli.max_retries,
        retention_fraction: cli.retention,
        ..Default::default()
    };
    println!("  Phase 1/2: Running baseline (no learning)...");
    let report = run_comparison(&config)?;
    // Print comparison report
    report.print();
    // Score both runs with the same calculator so the comparison is fair.
    let calculator = IntelligenceCalculator::default();
    println!("----------------------------------------------------------------");
    println!("  Detailed Intelligence Assessment: Baseline");
    println!("----------------------------------------------------------------");
    let baseline_iq = calculator.calculate(&report.baseline.raw_metrics);
    print_compact_assessment(&baseline_iq);
    println!();
    println!("----------------------------------------------------------------");
    println!("  Detailed Intelligence Assessment: RVF-Learning");
    println!("----------------------------------------------------------------");
    let rvf_iq = calculator.calculate(&report.rvf_learning.raw_metrics);
    print_compact_assessment(&rvf_iq);
    // Final IQ comparison
    println!();
    println!("================================================================");
    println!("  Intelligence Score Comparison");
    println!("================================================================");
    println!(
        "  Baseline IQ Score:       {:.1}/100",
        baseline_iq.overall_score
    );
    println!(
        "  RVF-Learning IQ Score:   {:.1}/100",
        rvf_iq.overall_score
    );
    let delta = rvf_iq.overall_score - baseline_iq.overall_score;
    println!("  Delta:                   {:+.1}", delta);
    println!();
    // Classify the improvement into a human-readable verdict.
    let verdict = if delta > 10.0 {
        "  >> RVF learning loop provides a DRAMATIC intelligence boost."
    } else if delta > 5.0 {
        "  >> RVF learning loop provides a SIGNIFICANT intelligence boost."
    } else if delta > 1.0 {
        "  >> RVF learning loop provides a MEASURABLE intelligence improvement."
    } else if delta > 0.0 {
        "  >> RVF learning loop provides a MARGINAL intelligence gain."
    } else {
        "  >> Performance is comparable. Increase noise or reduce step budget."
    };
    println!("{verdict}");
    println!();
    Ok(())
}
/// Print a condensed five-line view of an intelligence assessment: the
/// overall score followed by the reasoning, learning, capability, and
/// meta-cognition sub-scores.
fn print_compact_assessment(assessment: &ruvector_benchmarks::intelligence_metrics::IntelligenceAssessment) {
    println!("    Overall Score: {:.1}/100", assessment.overall_score);
    let r = &assessment.reasoning;
    println!(
        "    Reasoning: coherence={:.2}, efficiency={:.2}, error_rate={:.2}",
        r.logical_coherence, r.reasoning_efficiency, r.error_rate,
    );
    let l = &assessment.learning;
    println!(
        "    Learning: sample_eff={:.2}, regret_sub={:.2}, rate={:.2}, gen={:.2}",
        l.sample_efficiency, l.regret_sublinearity, l.learning_rate, l.generalization,
    );
    let c = &assessment.capabilities;
    println!(
        "    Capabilities: pattern={:.1}, planning={:.1}, adaptation={:.1}",
        c.pattern_recognition, c.planning, c.adaptation,
    );
    let m = &assessment.meta_cognition;
    println!(
        "    Meta-cog: self_correct={:.2}, strategy_adapt={:.2}",
        m.self_correction_rate, m.strategy_adaptation,
    );
}

View File

@@ -0,0 +1,135 @@
//! Superintelligence Pathway Runner
//!
//! Runs a 5-level recursive intelligence amplification pipeline and tracks
//! IQ progression from foundation (~85) toward superintelligence (~98+).
//!
//! Usage:
//! cargo run --bin superintelligence -- --verbose
//! cargo run --bin superintelligence -- --episodes 15 --tasks 30 --target 95
use anyhow::Result;
use clap::Parser;
use ruvector_benchmarks::intelligence_metrics::IntelligenceCalculator;
use ruvector_benchmarks::superintelligence::{run_pathway, SIConfig};
#[derive(Parser, Debug)]
#[command(name = "superintelligence")]
#[command(about = "Run 5-level superintelligence pathway with IQ tracking")]
// CLI arguments (clap derive). The `///` doc comments on the fields below are
// user-facing: clap renders them verbatim as the `--help` text for each flag.
struct Args {
    /// Episodes per level
    #[arg(short, long, default_value = "12")]
    episodes: usize,
    /// Tasks per episode
    #[arg(short, long, default_value = "25")]
    tasks: usize,
    /// Random seed
    #[arg(long, default_value = "42")]
    seed: u64,
    /// Noise injection rate (0.0-1.0)
    #[arg(long, default_value = "0.25")]
    noise: f64,
    /// Step budget per episode
    #[arg(long, default_value = "400")]
    step_budget: usize,
    /// Target IQ score
    #[arg(long, default_value = "98.0")]
    target: f64,
    /// Ensemble size for Level 3
    #[arg(long, default_value = "4")]
    ensemble: usize,
    /// Recursive improvement cycles for Level 4
    #[arg(long, default_value = "3")]
    cycles: usize,
    /// Adversarial pressure multiplier for Level 5
    #[arg(long, default_value = "1.5")]
    pressure: f64,
    /// Verbose per-episode output
    #[arg(short, long)]
    verbose: bool,
}
/// Entry point: runs the 5-level superintelligence pathway and prints a
/// detailed intelligence assessment for the level that reached the highest
/// IQ score.
fn main() -> Result<()> {
    let args = Args::parse();
    println!();
    println!("╔══════════════════════════════════════════════════════════════╗");
    println!("║              SUPERINTELLIGENCE PATHWAY ENGINE                ║");
    println!("║         5-Level Recursive Intelligence Amplification         ║");
    println!("╚══════════════════════════════════════════════════════════════╝");
    println!();
    println!(
        "  Config: {} eps/level x {} tasks, noise={:.0}%, target IQ={:.0}",
        args.episodes,
        args.tasks,
        args.noise * 100.0,
        args.target
    );
    println!(
        "  Ensemble={}, Cycles={}, Pressure={:.1}",
        args.ensemble, args.cycles, args.pressure
    );
    println!();
    // Map the CLI onto the pathway configuration; unexposed fields keep
    // their defaults.
    let config = SIConfig {
        episodes_per_level: args.episodes,
        tasks_per_episode: args.tasks,
        seed: args.seed,
        noise_rate: args.noise,
        step_budget: args.step_budget,
        target_iq: args.target,
        ensemble_size: args.ensemble,
        recursive_cycles: args.cycles,
        adversarial_pressure: args.pressure,
        verbose: args.verbose,
        ..Default::default()
    };
    let result = run_pathway(&config)?;
    result.print();
    // Detailed assessment for the peak level.
    let calculator = IntelligenceCalculator::default();
    // BUG FIX: `total_cmp` is a total order over floats, so a NaN iq_score
    // can no longer panic the comparator (the previous
    // `partial_cmp(..).unwrap()` would).
    if let Some(peak) = result
        .levels
        .iter()
        .max_by(|a, b| a.iq_score.total_cmp(&b.iq_score))
    {
        println!("  Peak Level ({}) Assessment:", peak.name);
        let assessment = calculator.calculate(&peak.raw_metrics);
        println!(
            "    Reasoning: coherence={:.2}, efficiency={:.2}, error_rate={:.2}",
            assessment.reasoning.logical_coherence,
            assessment.reasoning.reasoning_efficiency,
            assessment.reasoning.error_rate
        );
        println!(
            "    Learning: sample_eff={:.2}, regret_sub={:.2}, rate={:.2}",
            assessment.learning.sample_efficiency,
            assessment.learning.regret_sublinearity,
            assessment.learning.learning_rate
        );
        println!(
            "    Capabilities: pattern={:.1}, planning={:.1}, adaptation={:.1}",
            assessment.capabilities.pattern_recognition,
            assessment.capabilities.planning,
            assessment.capabilities.adaptation
        );
        println!(
            "    Meta-cog: self_correct={:.2}, strategy_adapt={:.2}",
            assessment.meta_cognition.self_correction_rate,
            assessment.meta_cognition.strategy_adaptation
        );
        println!();
    }
    Ok(())
}

View File

@@ -0,0 +1,247 @@
//! Swarm Regret Tracking Runner
//!
//! Track sublinear regret across episodes for swarm controller evaluation.
//!
//! Usage:
//! cargo run --bin swarm-regret -- --episodes 20 --tasks-per-episode 20
use anyhow::Result;
use clap::Parser;
use ruvector_benchmarks::{
logging::BenchmarkLogger,
swarm_regret::SwarmController,
temporal::TemporalSolver,
timepuzzles::{PuzzleGenerator, PuzzleGeneratorConfig},
};
use std::time::Instant;
#[derive(Parser, Debug)]
#[command(name = "swarm-regret")]
#[command(about = "Track sublinear regret for swarm controller")]
// CLI arguments (clap derive). The `///` doc comments on the fields below are
// user-facing: clap renders them verbatim as the `--help` text for each flag.
// NOTE(review): clap derives `bool` flags as SetTrue; with
// default_value = "true" the `--calendar` flag cannot be switched off from
// the CLI — confirm intended.
struct Args {
    /// Number of episodes to run
    #[arg(short, long, default_value = "20")]
    episodes: usize,
    /// Tasks per episode
    #[arg(short, long, default_value = "20")]
    tasks_per_episode: usize,
    /// Enable calendar tool
    #[arg(long, default_value = "true")]
    calendar: bool,
    /// Enable web search tool
    #[arg(long, default_value = "false")]
    web_search: bool,
    /// Maximum steps per task
    #[arg(long, default_value = "100")]
    max_steps: usize,
    /// Random seed
    // `None` means non-deterministic puzzle generation.
    #[arg(long)]
    seed: Option<u64>,
    /// Output log file
    #[arg(short, long, default_value = "logs/swarm_regret.jsonl")]
    output: String,
    /// Verbose output
    #[arg(short, long)]
    verbose: bool,
}
/// Entry point: runs `args.episodes` episodes of temporal puzzles through the
/// swarm controller, logs per-episode regret, prints a regret table and
/// summary, and writes the JSONL log plus a JSON summary file.
fn main() -> Result<()> {
    let args = Args::parse();
    println!("╔══════════════════════════════════════════════════════════════╗");
    println!("║               Swarm Controller Regret Tracking               ║");
    println!("║           Sublinear Regret for Multi-Agent Control           ║");
    println!("╚══════════════════════════════════════════════════════════════╝");
    println!();
    // Initialize
    let mut logger = BenchmarkLogger::new(&args.output)?;
    logger.log_system("INFO", "Starting regret tracking", "swarm-regret")?;
    let mut controller = SwarmController::new(args.tasks_per_episode);
    let mut solver = TemporalSolver::with_tools(args.calendar, args.web_search);
    solver.max_steps = args.max_steps;
    let puzzle_config = PuzzleGeneratorConfig {
        min_difficulty: 1,
        max_difficulty: 10,
        constraint_density: 3,
        seed: args.seed,
        ..Default::default()
    };
    println!("🔧 Configuration:");
    println!("    Episodes:        {}", args.episodes);
    println!("    Tasks/episode:   {}", args.tasks_per_episode);
    println!("    Calendar tool:   {}", args.calendar);
    println!("    Web search:      {}", args.web_search);
    println!("    Max steps/task:  {}", args.max_steps);
    println!();
    println!("🏃 Running episodes...");
    println!();
    println!("┌────────┬────────┬─────────┬─────────┬──────────┬───────────┐");
    println!("│Episode │ Acc(%) │ Regret  │ Cum.Reg │ Avg.Reg  │ Sublinear │");
    println!("├────────┼────────┼─────────┼─────────┼──────────┼───────────┤");
    let total_start = Instant::now();
    for ep in 0..args.episodes {
        controller.start_episode();
        // Generate puzzles for this episode
        let mut generator = PuzzleGenerator::new(puzzle_config.clone());
        let puzzles = generator.generate_batch(args.tasks_per_episode)?;
        let mut solved = 0;
        let mut correct = 0;
        let mut total_steps = 0;
        let mut total_tool_calls = 0;
        let mut total_latency = 0u64;
        // Solve puzzles
        for puzzle in &puzzles {
            let result = solver.solve(puzzle)?;
            if result.solved {
                solved += 1;
            }
            if result.correct {
                correct += 1;
            }
            total_steps += result.steps;
            total_tool_calls += result.tool_calls;
            total_latency += result.latency_ms;
        }
        // Record episode
        controller.complete_episode(
            solved,
            correct,
            total_steps,
            total_tool_calls,
            total_latency,
        );
        // Get status
        let summary = controller.regret.summary();
        let last_episode = controller
            .regret
            .episodes
            .last()
            .expect("complete_episode() records an episode");
        // Log episode
        logger.log_swarm(
            ep + 1,
            args.tasks_per_episode,
            solved,
            correct,
            last_episode.reward,
            last_episode.oracle_reward,
            summary.total_regret,
            summary.average_regret,
            summary.is_sublinear,
        )?;
        // Print row.
        // BUG FIX: the sublinear marker was the empty string in both
        // branches and the row format carried no `│` separators, so the
        // table body never lined up with the bordered header above.
        let sublinear = if summary.is_sublinear { "✓" } else { "✗" };
        println!(
            "│ {:>6} │ {:>6.1} │ {:>7.2} │ {:>7.2} │ {:>8.4} │ {:^9} │",
            ep + 1,
            last_episode.accuracy() * 100.0,
            last_episode.regret(),
            summary.total_regret,
            summary.average_regret,
            sublinear
        );
    }
    println!("└────────┴────────┴─────────┴─────────┴──────────┴───────────┘");
    println!();
    let total_time = total_start.elapsed();
    // Final summary
    let summary = controller.regret.summary();
    println!("╔══════════════════════════════════════════════════════════════╗");
    println!("║                        Final Summary                         ║");
    println!("╚══════════════════════════════════════════════════════════════╝");
    println!();
    println!("📊 Regret Analysis:");
    println!("    Total episodes:    {}", summary.total_episodes);
    println!("    Cumulative regret: {:.2}", summary.total_regret);
    println!("    Average regret:    {:.4}", summary.average_regret);
    println!(
        "    Regret trend:      {:.6} ({})",
        summary.regret_trend,
        if summary.regret_trend < 0.0 {
            "decreasing ✓"
        } else {
            "increasing ✗"
        }
    );
    println!(
        "    Sublinear:         {}",
        if summary.is_sublinear {
            "Yes ✓"
        } else {
            "No ✗"
        }
    );
    println!();
    println!("📈 Performance:");
    println!(
        "    Average accuracy:  {:.1}%",
        summary.average_accuracy * 100.0
    );
    println!("    Average reward:    {:.2}", summary.average_reward);
    println!(
        "    Moving avg reward: {:.2}",
        summary.moving_average_reward
    );
    println!("    Total time:        {:.2}s", total_time.as_secs_f64());
    println!();
    // Regret curve: plot average regret R_k/k as an ASCII bar chart,
    // sampling roughly ten evenly spaced episodes.
    if controller.regret.average_regret.len() >= 5 {
        println!("📉 Regret Curve (R_k/k):");
        let regrets = &controller.regret.average_regret;
        // `max(10)` keeps the sampling step at least 1 for short runs.
        let step = regrets.len().max(10) / 10;
        for (i, r) in regrets.iter().enumerate() {
            if i % step == 0 || i == regrets.len() - 1 {
                let bar_len = (r * 50.0).min(50.0) as usize;
                // BUG FIX: the bar glyph was an empty string, so the chart
                // rendered no bars at all.
                let bar = "█".repeat(bar_len);
                println!("    Episode {:3}: {:.4} {}", i + 1, r, bar);
            }
        }
        println!();
    }
    // Goal check
    println!("🎯 Goal Status:");
    if summary.is_sublinear && summary.regret_trend < 0.0 {
        println!("  ✓ Achieving sublinear regret - average regret trending to zero");
    } else if summary.is_sublinear {
        println!("  ~ Sublinear but trend not clearly decreasing");
    } else {
        println!("  ✗ Not yet achieving sublinear regret");
        println!("    Recommendation: Increase episodes or tune solver parameters");
    }
    // Flush logs
    logger.flush()?;
    println!();
    println!("📝 Results saved to: {}", args.output);
    // Save summary.
    // BUG FIX: derive the path with `strip_suffix` — the previous
    // `replace(".jsonl", ...)` rewrote a `.jsonl` occurring anywhere in the
    // path, and when the output had no `.jsonl` extension at all the summary
    // path equalled the log path and silently overwrote the log just written.
    let summary_path = match args.output.strip_suffix(".jsonl") {
        Some(stem) => format!("{stem}_summary.json"),
        None => format!("{}_summary.json", args.output),
    };
    let summary_json = serde_json::to_string_pretty(&summary)?;
    std::fs::write(&summary_path, summary_json)?;
    println!("📝 Summary saved to: {}", summary_path);
    Ok(())
}

View File

@@ -0,0 +1,262 @@
//! Temporal Benchmark Runner
//!
//! Run temporal reasoning benchmarks based on TimePuzzles methodology.
//!
//! Usage:
//! cargo run --bin temporal-benchmark -- --puzzles 50 --calendar --web-search
use anyhow::Result;
use clap::Parser;
use ruvector_benchmarks::{
logging::BenchmarkLogger,
temporal::{BenchmarkConfig, BenchmarkResults, TemporalSolver},
timepuzzles::{PuzzleGenerator, PuzzleGeneratorConfig, SamplePuzzles},
};
use std::time::Instant;
#[derive(Parser, Debug)]
#[command(name = "temporal-benchmark")]
#[command(about = "Run temporal reasoning benchmarks")]
// CLI arguments (clap derive). The `///` doc comments on the fields below are
// user-facing: clap renders them verbatim as the `--help` text for each flag.
// NOTE(review): clap derives `bool` flags as SetTrue; with
// default_value = "true" the `--calendar` flag cannot be switched off from
// the CLI — confirm intended.
struct Args {
    /// Number of puzzles to run
    #[arg(short = 'n', long, default_value = "50")]
    puzzles: usize,
    /// Minimum difficulty (1-10)
    #[arg(long, default_value = "1")]
    min_difficulty: u8,
    /// Maximum difficulty (1-10)
    #[arg(long, default_value = "10")]
    max_difficulty: u8,
    /// Enable calendar math tool
    #[arg(long, default_value = "true")]
    calendar: bool,
    /// Enable web search tool
    #[arg(long, default_value = "false")]
    web_search: bool,
    /// Maximum steps per puzzle
    #[arg(long, default_value = "100")]
    max_steps: usize,
    /// Constraint density (1-5)
    #[arg(long, default_value = "3")]
    constraint_density: u8,
    /// Random seed for reproducibility
    // `None` means non-deterministic puzzle generation.
    #[arg(long)]
    seed: Option<u64>,
    /// Output log file
    #[arg(short, long, default_value = "logs/temporal_benchmark.jsonl")]
    output: String,
    /// Use sample puzzles instead of generating
    #[arg(long)]
    use_samples: bool,
    /// Verbose output
    #[arg(short, long)]
    verbose: bool,
}
/// Entry point: generates (or loads sample) temporal puzzles, solves each one
/// with the configured `TemporalSolver`, logs per-puzzle results, and prints
/// plus saves aggregate benchmark statistics.
fn main() -> Result<()> {
    let args = Args::parse();
    println!("╔══════════════════════════════════════════════════════════════╗");
    println!("║             Temporal Reasoning Benchmark Runner              ║");
    println!("║           Based on TimePuzzles (arXiv:2601.07148)            ║");
    println!("╚══════════════════════════════════════════════════════════════╝");
    println!();
    // Initialize logger
    let mut logger = BenchmarkLogger::new(&args.output)?;
    logger.log_system("INFO", "Starting benchmark run", "temporal-benchmark")?;
    // Generate or load puzzles
    let puzzles = if args.use_samples {
        println!("📚 Using sample puzzle set (50 puzzles)...");
        SamplePuzzles::mixed_sample()
    } else {
        println!(
            "🎲 Generating {} puzzles (difficulty {}-{})...",
            args.puzzles, args.min_difficulty, args.max_difficulty
        );
        let config = PuzzleGeneratorConfig {
            min_difficulty: args.min_difficulty,
            max_difficulty: args.max_difficulty,
            constraint_density: args.constraint_density,
            cross_cultural: true,
            relative_constraints: true,
            year_range: (2000, 2030),
            seed: args.seed,
        };
        let mut generator = PuzzleGenerator::new(config);
        generator.generate_batch(args.puzzles)?
    };
    println!("✓ Loaded {} puzzles", puzzles.len());
    println!();
    // Configure solver
    let mut solver = TemporalSolver::with_tools(args.calendar, args.web_search);
    solver.max_steps = args.max_steps;
    println!("🔧 Solver configuration:");
    println!("    Calendar tool: {}", args.calendar);
    println!("    Web search:    {}", args.web_search);
    println!("    Max steps:     {}", args.max_steps);
    println!();
    // Run benchmarks
    println!("🏃 Running benchmarks...");
    println!();
    // Unique id so runs with the same seed remain distinguishable in logs.
    let benchmark_id = format!(
        "bench-{}-{}",
        chrono::Utc::now().format("%Y%m%d-%H%M%S"),
        args.seed.unwrap_or(0)
    );
    let mut results = Vec::new();
    let start = Instant::now();
    for (i, puzzle) in puzzles.iter().enumerate() {
        let result = solver.solve(puzzle)?;
        // Log result
        logger.log_temporal(
            &benchmark_id,
            &puzzle.id,
            puzzle.difficulty,
            result.solved,
            result.correct,
            result.steps,
            result.tool_calls,
            result.latency_ms,
            puzzle.constraints.len(),
            args.calendar,
            args.web_search,
        )?;
        if args.verbose {
            // BUG FIX: the correct/failed markers were both empty strings,
            // making verbose rows indistinguishable; use ✓ / ~ / ✗ for
            // correct / solved-but-wrong / unsolved.
            let status = if result.correct {
                "✓"
            } else if result.solved {
                "~"
            } else {
                "✗"
            };
            println!(
                "  {} Puzzle {:3}: {} (steps: {}, latency: {}ms)",
                status,
                i + 1,
                puzzle.id,
                result.steps,
                result.latency_ms
            );
        } else if (i + 1) % 10 == 0 {
            // Quiet mode: one progress dot every ten puzzles.
            print!(".");
            use std::io::Write;
            std::io::stdout().flush()?;
        }
        results.push(result);
    }
    let total_time = start.elapsed();
    if !args.verbose {
        println!();
    }
    println!();
    // Compute aggregate results
    let config = BenchmarkConfig {
        num_puzzles: puzzles.len(),
        difficulty_range: (args.min_difficulty, args.max_difficulty),
        calendar_tool: args.calendar,
        web_search_tool: args.web_search,
        max_steps: args.max_steps,
        constraint_density: args.constraint_density,
    };
    let benchmark_results = BenchmarkResults::from_results(config, results);
    // Print results
    println!("╔══════════════════════════════════════════════════════════════╗");
    println!("║                      Benchmark Results                       ║");
    println!("╚══════════════════════════════════════════════════════════════╝");
    println!();
    println!("📊 Summary:");
    println!("    Total puzzles: {}", benchmark_results.total_puzzles);
    println!("    Solved:        {}", benchmark_results.solved_count);
    println!("    Correct:       {}", benchmark_results.correct_count);
    println!(
        "    Accuracy:      {:.1}%",
        benchmark_results.accuracy * 100.0
    );
    println!();
    println!("⏱️ Performance:");
    println!("    Avg steps:      {:.1}", benchmark_results.avg_steps);
    println!("    Avg tool calls: {:.1}", benchmark_results.avg_tool_calls);
    println!(
        "    Avg latency:    {:.1}ms",
        benchmark_results.avg_latency_ms
    );
    println!("    Total time:     {:.2}s", total_time.as_secs_f64());
    println!();
    // Compute accuracy by difficulty: (attempted, correct) per level.
    let mut by_difficulty: std::collections::HashMap<u8, (usize, usize)> =
        std::collections::HashMap::new();
    for (puzzle, result) in puzzles.iter().zip(benchmark_results.results.iter()) {
        let entry = by_difficulty.entry(puzzle.difficulty).or_insert((0, 0));
        entry.0 += 1;
        if result.correct {
            entry.1 += 1;
        }
    }
    println!("📈 Accuracy by Difficulty:");
    let mut difficulties: Vec<_> = by_difficulty.keys().copied().collect();
    difficulties.sort_unstable();
    for d in difficulties {
        // `total` is at least 1 for every key present in the map.
        let (total, correct) = by_difficulty[&d];
        let acc = correct as f64 / total as f64 * 100.0;
        println!("    Difficulty {}: {:5.1}% ({}/{})", d, acc, correct, total);
    }
    println!();
    // Tool usage analysis
    if args.calendar {
        let with_rewriting = benchmark_results
            .results
            .iter()
            .filter(|r| r.tool_calls > 0 && r.correct)
            .count();
        println!("🔧 Tool Analysis:");
        println!(
            "    Calendar rewriting success: {}/{}",
            with_rewriting, benchmark_results.total_puzzles
        );
    }
    // Flush logs
    logger.flush()?;
    println!();
    println!("📝 Results saved to: {}", args.output);
    // Save full results as JSON.
    // BUG FIX: derive the path with `strip_suffix` — the previous
    // `replace(".jsonl", ...)` rewrote a `.jsonl` occurring anywhere in the
    // path, and when the output lacked the `.jsonl` extension the summary
    // path equalled the log path and silently overwrote the log just written.
    let results_path = match args.output.strip_suffix(".jsonl") {
        Some(stem) => format!("{stem}_summary.json"),
        None => format!("{}_summary.json", args.output),
    };
    let results_json = serde_json::to_string_pretty(&benchmark_results)?;
    std::fs::write(&results_path, results_json)?;
    println!("📝 Summary saved to: {}", results_path);
    Ok(())
}

View File

@@ -0,0 +1,308 @@
//! TimePuzzle Quick Runner
//!
//! 10-minute probe for temporal reasoning with tool augmentation.
//!
//! Usage:
//! cargo run --bin timepuzzle-runner -- --quick
//! cargo run --bin timepuzzle-runner -- --depth 5
use anyhow::Result;
use clap::Parser;
use ruvector_benchmarks::{
logging::BenchmarkLogger, temporal::TemporalSolver, timepuzzles::SamplePuzzles,
};
use std::time::{Duration, Instant};
#[derive(Parser, Debug)]
#[command(name = "timepuzzle-runner")]
#[command(about = "Quick TimePuzzle probe for agent testing")]
// CLI arguments (clap derive). The `///` doc comments on the fields below are
// user-facing: clap renders them verbatim as the `--help` text for each flag.
// NOTE(review): clap derives `bool` flags as SetTrue; with
// default_value = "true" the `--rewrite` flag cannot be switched off from
// the CLI — confirm intended.
struct Args {
    /// Quick mode: 50 puzzles, depth-limited steps
    #[arg(long)]
    quick: bool,
    /// Maximum depth (steps) per puzzle
    #[arg(short, long, default_value = "50")]
    depth: usize,
    /// Number of puzzles
    #[arg(short = 'n', long, default_value = "50")]
    puzzles: usize,
    /// Tool latency cap (abort if tool > 1.5x median)
    #[arg(long, default_value = "1.5")]
    latency_cap: f64,
    /// Timeout in seconds
    #[arg(long, default_value = "600")]
    timeout: u64,
    /// Enable constraint rewriting (calendar math)
    #[arg(long, default_value = "true")]
    rewrite: bool,
    /// Enable web search (for factual anchors)
    #[arg(long, default_value = "false")]
    web_search: bool,
    /// Output file
    #[arg(short, long, default_value = "logs/timepuzzle_probe.jsonl")]
    output: String,
    /// Verbose mode
    #[arg(short, long)]
    verbose: bool,
}
fn main() -> Result<()> {
let args = Args::parse();
println!("╔══════════════════════════════════════════════════════════════╗");
println!("║ TimePuzzle Quick Probe Runner ║");
println!("║ Tool-Augmented Iterative Temporal Reasoning ║");
println!("╚══════════════════════════════════════════════════════════════╝");
println!();
let mut logger = BenchmarkLogger::new(&args.output)?;
logger.log_system("INFO", "Starting TimePuzzle probe", "timepuzzle-runner")?;
// Quick mode settings
let (num_puzzles, max_depth) = if args.quick {
println!("⚡ Quick mode enabled (50 puzzles, depth {})", args.depth);
(50, args.depth)
} else {
(args.puzzles, args.depth)
};
let timeout = Duration::from_secs(args.timeout);
println!();
println!("🔧 Configuration:");
println!(" Puzzles: {}", num_puzzles);
println!(" Max depth: {}", max_depth);
println!(" Rewriting: {}", args.rewrite);
println!(" Web search: {}", args.web_search);
println!(" Latency cap: {}x median", args.latency_cap);
println!(" Timeout: {}s", args.timeout);
println!();
// Generate puzzles with varying constraint density
println!("🎲 Generating puzzles...");
let puzzles = SamplePuzzles::mixed_sample()
.into_iter()
.take(num_puzzles)
.collect::<Vec<_>>();
println!("✓ Loaded {} puzzles", puzzles.len());
println!();
// Configure solver
let mut solver = TemporalSolver::with_tools(args.rewrite, args.web_search);
solver.max_steps = max_depth;
// Run probe
println!("🏃 Running probe...");
println!();
let probe_start = Instant::now();
let mut results = Vec::new();
let mut latencies: Vec<u64> = Vec::new();
let mut median_latency: f64 = 100.0; // Initial estimate
for (i, puzzle) in puzzles.iter().enumerate() {
// Check timeout
if probe_start.elapsed() > timeout {
println!("⚠️ Timeout reached after {} puzzles", i);
break;
}
let result = solver.solve(puzzle)?;
// Check latency cap
if latencies.len() >= 10 {
let mut sorted = latencies.clone();
sorted.sort();
median_latency = sorted[sorted.len() / 2] as f64;
if result.latency_ms as f64 > median_latency * args.latency_cap {
if args.verbose {
println!(
" ⚠ Puzzle {} aborted: latency {}ms > {:.0}ms cap",
puzzle.id,
result.latency_ms,
median_latency * args.latency_cap
);
}
// Still record but mark as slow
}
}
latencies.push(result.latency_ms);
// Log
logger.log_temporal(
"timepuzzle-probe",
&puzzle.id,
puzzle.difficulty,
result.solved,
result.correct,
result.steps,
result.tool_calls,
result.latency_ms,
puzzle.constraints.len(),
args.rewrite,
args.web_search,
)?;
if args.verbose {
let status = if result.correct {
""
} else if result.solved {
"~"
} else {
""
};
println!(
" {} [{:2}] {}: steps={}, tools={}, {}ms",
status,
puzzle.difficulty,
puzzle.id,
result.steps,
result.tool_calls,
result.latency_ms
);
}
results.push(result);
}
let total_time = probe_start.elapsed();
println!();
// Analyze results
let solved = results.iter().filter(|r| r.solved).count();
let correct = results.iter().filter(|r| r.correct).count();
let total = results.len();
let accuracy = correct as f64 / total as f64;
let avg_steps = results.iter().map(|r| r.steps).sum::<usize>() as f64 / total as f64;
let avg_tools = results.iter().map(|r| r.tool_calls).sum::<usize>() as f64 / total as f64;
let avg_latency = results.iter().map(|r| r.latency_ms).sum::<u64>() as f64 / total as f64;
// Tool toggle analysis
let with_tool_correct = results
.iter()
.filter(|r| r.tool_calls > 0 && r.correct)
.count();
println!("╔══════════════════════════════════════════════════════════════╗");
println!("║ Probe Results ║");
println!("╚══════════════════════════════════════════════════════════════╝");
println!();
println!("📊 Overall Performance:");
println!(" Puzzles run: {}", total);
println!(
" Solved: {} ({:.1}%)",
solved,
solved as f64 / total as f64 * 100.0
);
println!(
" Correct: {} ({:.1}%)",
correct,
accuracy * 100.0
);
println!();
println!("⏱️ Efficiency:");
println!(" Avg steps: {:.1}", avg_steps);
println!(" Avg tool calls: {:.1}", avg_tools);
println!(" Avg latency: {:.1}ms", avg_latency);
println!(" Median latency: {:.0}ms", median_latency);
println!(" Total time: {:.2}s", total_time.as_secs_f64());
println!();
// Scaling curves
println!("📈 Tool Toggle Analysis:");
println!(
" With rewriting: {}/{} ({:.1}%)",
with_tool_correct,
total,
with_tool_correct as f64 / total as f64 * 100.0
);
// Sensitivity analysis
let fast_correct = results
.iter()
.filter(|r| r.latency_ms < median_latency as u64 && r.correct)
.count();
let slow_correct = results
.iter()
.filter(|r| r.latency_ms >= median_latency as u64 && r.correct)
.count();
let fast_total = results
.iter()
.filter(|r| r.latency_ms < median_latency as u64)
.count();
let slow_total = total - fast_total;
if fast_total > 0 && slow_total > 0 {
println!();
println!("⚡ Latency Sensitivity:");
println!(
" Fast (<{:.0}ms): {}/{} ({:.1}%)",
median_latency,
fast_correct,
fast_total,
fast_correct as f64 / fast_total as f64 * 100.0
);
println!(
" Slow (>={:.0}ms): {}/{} ({:.1}%)",
median_latency,
slow_correct,
slow_total,
slow_correct as f64 / slow_total as f64 * 100.0
);
}
// Accuracy by difficulty
println!();
println!("🎯 Accuracy by Difficulty:");
let mut by_diff: std::collections::HashMap<u8, (usize, usize)> =
std::collections::HashMap::new();
for (p, r) in puzzles.iter().zip(results.iter()) {
let e = by_diff.entry(p.difficulty).or_insert((0, 0));
e.0 += 1;
if r.correct {
e.1 += 1;
}
}
let mut diffs: Vec<_> = by_diff.keys().copied().collect();
diffs.sort();
for d in diffs {
let (t, c) = by_diff[&d];
let pct = c as f64 / t as f64 * 100.0;
let bar = "".repeat((pct / 5.0) as usize);
println!(" Level {:2}: {:5.1}% {}", d, pct, bar);
}
// Recommendations
println!();
println!("💡 Insights:");
if accuracy < 0.5 {
println!(" • Low accuracy - consider enabling constraint rewriting");
}
if avg_steps > max_depth as f64 * 0.8 {
println!(" • High step count - search may be inefficient");
}
if args.web_search && with_tool_correct > correct / 2 {
println!(" • Web search providing substantial gains");
}
if accuracy >= 0.8 {
println!(" • Good performance - ready for harder puzzles");
}
// Flush logs
logger.flush()?;
println!();
println!("📝 Results saved to: {}", args.output);
Ok(())
}

View File

@@ -0,0 +1,248 @@
//! Vector Index Benchmark Runner
//!
//! Benchmark vector operations with IVF and coherence gating.
//!
//! Usage:
//! cargo run --bin vector-benchmark -- --dim 128 --vectors 10000
use anyhow::Result;
use clap::Parser;
use ruvector_benchmarks::{
logging::BenchmarkLogger,
vector_index::{CoherenceGate, DenseVec, IvfConfig, VectorIndex},
};
use std::time::Instant;
#[derive(Parser, Debug)]
#[command(name = "vector-benchmark")]
#[command(about = "Benchmark vector index operations")]
// CLI flags for the vector index benchmark binary.
// NOTE(review): the `///` doc comments below double as clap help text, i.e.
// they are runtime-visible strings — deliberately left untouched.
struct Args {
    /// Vector dimensionality
    #[arg(short, long, default_value = "128")]
    dim: usize,
    /// Number of vectors to insert
    #[arg(short = 'n', long, default_value = "10000")]
    vectors: usize,
    /// Number of queries to run
    #[arg(short, long, default_value = "1000")]
    queries: usize,
    /// Top-k results per query
    #[arg(short, long, default_value = "10")]
    top_k: usize,
    /// Enable IVF indexing
    #[arg(long, default_value = "true")]
    ivf: bool,
    /// Number of IVF clusters
    #[arg(long, default_value = "64")]
    clusters: usize,
    /// Number of clusters to probe
    #[arg(long, default_value = "4")]
    probes: usize,
    /// Enable coherence gate
    #[arg(long)]
    gate: bool,
    /// Coherence gate threshold
    #[arg(long, default_value = "0.5")]
    gate_threshold: f32,
    /// Output log file
    #[arg(short, long, default_value = "logs/vector_benchmark.jsonl")]
    output: String,
    /// Verbose output
    // NOTE(review): capital 'V' short flag — presumably chosen to avoid a
    // clash with another short flag; confirm it does not collide with clap's
    // auto-generated version flag.
    #[arg(short = 'V', long)]
    verbose: bool,
}
/// Entry point: build a `VectorIndex` from the CLI flags, bulk-insert random
/// vectors, optionally build the IVF index, run timed queries, report
/// latency/throughput statistics, then persist the index and JSONL logs.
///
/// # Errors
/// Propagates logger, index, and filesystem errors via `anyhow::Result`.
fn main() -> Result<()> {
    let args = Args::parse();
    println!("╔══════════════════════════════════════════════════════════════╗");
    println!("║ Vector Index Benchmark Runner ║");
    println!("╚══════════════════════════════════════════════════════════════╝");
    println!();
    // Initialize logger
    let mut logger = BenchmarkLogger::new(&args.output)?;
    logger.log_system("INFO", "Starting vector benchmark", "vector-benchmark")?;
    // Echo the effective configuration before doing any work.
    println!("🔧 Configuration:");
    println!(" Dimensions: {}", args.dim);
    println!(" Vectors: {}", args.vectors);
    println!(" Queries: {}", args.queries);
    println!(" Top-K: {}", args.top_k);
    println!(" IVF: {}", args.ivf);
    if args.ivf {
        println!(" Clusters: {}", args.clusters);
        println!(" Probes: {}", args.probes);
    }
    println!(" Gate: {}", args.gate);
    if args.gate {
        println!(" Threshold: {}", args.gate_threshold);
    }
    println!();
    // Create index, layering on optional gate/IVF behavior.
    let mut index = VectorIndex::new(args.dim);
    if args.gate {
        index = index.with_gate(CoherenceGate::new(args.gate_threshold));
    }
    if args.ivf {
        index = index.with_ivf(IvfConfig::new(args.clusters, args.probes));
    }
    // Insert vectors
    println!("📥 Inserting {} vectors...", args.vectors);
    let insert_start = Instant::now();
    for i in 0..args.vectors {
        index.insert(DenseVec::random(args.dim))?;
        if args.verbose && (i + 1) % 1000 == 0 {
            println!(" Inserted {} vectors", i + 1);
        }
    }
    let insert_time = insert_start.elapsed();
    println!(
        "✓ Insert complete ({:.2}s, {:.0} vec/s)",
        insert_time.as_secs_f64(),
        args.vectors as f64 / insert_time.as_secs_f64()
    );
    println!();
    // Build IVF if enabled (clustering happens after bulk insert).
    if args.ivf {
        println!("🏗️ Building IVF index...");
        let build_start = Instant::now();
        index.rebuild_ivf()?;
        let build_time = build_start.elapsed();
        println!("✓ IVF build complete ({:.2}s)", build_time.as_secs_f64());
        println!();
    }
    // Print index stats
    let stats = index.stats();
    println!("📊 Index Statistics:");
    println!(" Active vectors: {}", stats.active_vectors);
    println!(" IVF clusters: {}", stats.ivf_clusters);
    println!();
    // Run queries
    println!("🔍 Running {} queries...", args.queries);
    let query_start = Instant::now();
    let mut latencies: Vec<u64> = Vec::with_capacity(args.queries);
    let mut total_results = 0usize;
    for i in 0..args.queries {
        let q = DenseVec::random(args.dim);
        // With gating enabled, exercise the gate with a random coherence
        // value; otherwise pass 1.0 so every query goes through.
        let coherence = if args.gate {
            rand::random::<f32>()
        } else {
            1.0
        };
        let start = Instant::now();
        let results = index.search(&q, args.top_k, coherence)?;
        let latency_us = start.elapsed().as_micros() as u64;
        latencies.push(latency_us);
        total_results += results.len();
        // Log query
        logger.log_vector(
            "search",
            args.dim,
            stats.active_vectors,
            1,
            args.top_k,
            args.ivf,
            coherence,
            latency_us,
            results.len(),
        )?;
        if args.verbose && (i + 1) % 100 == 0 {
            println!(" Completed {} queries", i + 1);
        }
    }
    let query_time = query_start.elapsed();
    println!(
        "✓ Queries complete ({:.2}s, {:.0} q/s)",
        query_time.as_secs_f64(),
        args.queries as f64 / query_time.as_secs_f64()
    );
    println!();
    // Compute statistics. Guarded: with `--queries 0` the percentile indexing
    // below would previously panic on an empty latency vector.
    if latencies.is_empty() {
        println!("⚠️ No queries executed; skipping latency statistics.");
    } else {
        // u64 latencies have a total order and duplicates are
        // indistinguishable, so the faster unstable sort is safe here.
        latencies.sort_unstable();
        let p50 = latencies[latencies.len() / 2];
        let p95 = latencies[latencies.len() * 95 / 100];
        let p99 = latencies[latencies.len() * 99 / 100];
        let avg = latencies.iter().sum::<u64>() / latencies.len() as u64;
        let max = *latencies.last().unwrap();
        println!("╔══════════════════════════════════════════════════════════════╗");
        println!("║ Benchmark Results ║");
        println!("╚══════════════════════════════════════════════════════════════╝");
        println!();
        println!("⏱️ Latency (microseconds):");
        println!(" Average: {}µs", avg);
        println!(" P50: {}µs", p50);
        println!(" P95: {}µs", p95);
        println!(" P99: {}µs", p99);
        println!(" Max: {}µs", max);
        println!();
        println!("📈 Throughput:");
        println!(
            " Queries/sec: {:.0}",
            args.queries as f64 / query_time.as_secs_f64()
        );
        println!(
            " Insert/sec: {:.0}",
            args.vectors as f64 / insert_time.as_secs_f64()
        );
        println!();
        println!("📊 Results:");
        println!(" Total results: {}", total_results);
        println!(
            " Avg results: {:.2}",
            total_results as f64 / args.queries as f64
        );
        if args.gate {
            // NOTE(review): heuristic — a sub-10µs query is counted as gated
            // (assumed early-rejected); confirm against CoherenceGate's
            // actual behavior.
            let gated = latencies.iter().filter(|&&l| l < 10).count();
            println!(
                " Gated queries: {:.1}%",
                gated as f64 / args.queries as f64 * 100.0
            );
        }
    }
    // Save index
    println!();
    let index_path = "data/vector_index.bin";
    std::fs::create_dir_all("data")?;
    index.save_to_file(index_path)?;
    println!("💾 Index saved to: {}", index_path);
    // Flush logs
    logger.flush()?;
    println!("📝 Results saved to: {}", args.output);
    Ok(())
}

View File

@@ -0,0 +1,197 @@
//! WASM Solver Benchmark — Compares native vs WASM AGI solver performance.
//!
//! Runs the same acceptance test configuration through:
//! 1. Native Rust solver (benchmarks crate)
//! 2. Reference metrics comparison
//!
//! Usage:
//! cargo run --bin wasm-solver-bench [-- --holdout <N> --training <N> --cycles <N>]
use clap::Parser;
use ruvector_benchmarks::acceptance_test::{run_acceptance_test_mode, AblationMode, HoldoutConfig};
use std::time::Instant;
#[derive(Parser)]
#[command(name = "wasm-solver-bench")]
// CLI flags for the WASM-vs-native solver benchmark.
// NOTE(review): fields deliberately carry `//` comments, not `///` doc
// comments — the latter would become clap help text and change --help output.
struct Args {
    // Number of holdout puzzles used for evaluation.
    #[arg(long, default_value = "50")]
    holdout: usize,
    // Number of training puzzles per learning cycle.
    #[arg(long, default_value = "50")]
    training: usize,
    // Number of train/evaluate cycles to run.
    #[arg(long, default_value = "3")]
    cycles: usize,
    // Per-puzzle solver step budget.
    #[arg(long, default_value = "200")]
    budget: usize,
}
/// Entry point: run the native AGI-solver acceptance test in all three
/// ablation modes (baseline, compiler-only, full learned), print a results
/// table, then emit the reference metrics a WASM build is expected to match.
fn main() {
    let args = Args::parse();
    println!("╔══════════════════════════════════════════════════════════════╗");
    println!("║ WASM vs Native AGI Solver Benchmark ║");
    println!("╚══════════════════════════════════════════════════════════════╝");
    println!();
    println!(
        " Config: holdout={}, training={}, cycles={}, budget={}",
        args.holdout, args.training, args.cycles, args.budget
    );
    println!();
    // Shared configuration: fixed seeds keep the holdout/training sets
    // identical across the three modes so results are directly comparable.
    let config = HoldoutConfig {
        holdout_size: args.holdout,
        training_per_cycle: args.training,
        cycles: args.cycles,
        step_budget: args.budget,
        holdout_seed: 0xDEAD_BEEF,
        training_seed: 42,
        noise_rate: 0.25,
        min_accuracy: 0.50,
        min_dimensions_improved: 1,
        verbose: false,
    };
    // ── Native Mode A (Baseline) ──────────────────────────────────
    println!(" Running Native Mode A (baseline)...");
    let t0 = Instant::now();
    // `expect` instead of bare `unwrap`: if a run errors out, the panic
    // message now says which mode failed.
    let native_a = run_acceptance_test_mode(&config, &AblationMode::Baseline)
        .expect("Mode A (baseline) acceptance test failed to run");
    let native_a_ms = t0.elapsed().as_millis();
    // ── Native Mode B (Compiler) ──────────────────────────────────
    println!(" Running Native Mode B (compiler)...");
    let t0 = Instant::now();
    let native_b = run_acceptance_test_mode(&config, &AblationMode::CompilerOnly)
        .expect("Mode B (compiler) acceptance test failed to run");
    let native_b_ms = t0.elapsed().as_millis();
    // ── Native Mode C (Full learned) ──────────────────────────────
    println!(" Running Native Mode C (full learned)...");
    let t0 = Instant::now();
    let native_c = run_acceptance_test_mode(&config, &AblationMode::Full)
        .expect("Mode C (full learned) acceptance test failed to run");
    let native_c_ms = t0.elapsed().as_millis();
    println!();
    println!(" ┌────────────────────────────────────────────────────────┐");
    println!(" │ NATIVE SOLVER RESULTS │");
    println!(" ├────────────────────────────────────────────────────────┤");
    println!(
        "{:<12} {:>8} {:>10} {:>10} {:>8} {:>8}",
        "Mode", "Acc%", "Cost", "Noise%", "Time", "Pass"
    );
    println!("{}", "-".repeat(54));
    // One table row per mode, taken from each run's final cycle.
    for (label, result, ms) in [
        ("A baseline", &native_a, native_a_ms),
        ("B compiler", &native_b, native_b_ms),
        ("C learned", &native_c, native_c_ms),
    ] {
        let last = result
            .result
            .cycles
            .last()
            .expect("acceptance test produced no cycles");
        println!(
            "{:<12} {:>6.1}% {:>9.1} {:>8.1}% {:>5}ms {:>7}",
            label,
            last.holdout_accuracy * 100.0,
            last.holdout_cost_per_solve,
            last.holdout_noise_accuracy * 100.0,
            ms,
            if result.result.passed { "PASS" } else { "FAIL" }
        );
    }
    println!(" └────────────────────────────────────────────────────────┘");
    println!();
    // ── WASM Reference Metrics ────────────────────────────────────
    // Since we can't run WASM directly from Rust without a runtime,
    // we output the reference metrics that the WASM module should match.
    println!(" ┌────────────────────────────────────────────────────────┐");
    println!(" │ WASM REFERENCE METRICS (for validation) │");
    println!(" ├────────────────────────────────────────────────────────┤");
    println!(" │ │");
    println!(" │ The rvf-solver-wasm module should produce: │");
    println!(" │ │");
    let total_ms = native_a_ms + native_b_ms + native_c_ms;
    println!(
        " │ Native total time: {}ms │",
        total_ms
    );
    println!(
        " │ WASM expected: ~{}ms (2-5x native) │",
        total_ms * 3
    );
    println!(" │ │");
    // PolicyKernel convergence check
    println!(" │ Mode C PolicyKernel: │");
    println!(
        " │ Context buckets: {}",
        native_c.policy_context_buckets
    );
    println!(
        " │ Early commit rate: {:.2}% │",
        native_c.early_commit_rate * 100.0
    );
    println!(
        " │ Compiler hits: {}",
        native_c.compiler_hits
    );
    println!(" │ │");
    // Thompson Sampling convergence: Mode C should learn differently across contexts
    let c_unique_modes: std::collections::HashSet<&str> = native_c
        .skip_mode_distribution
        .values()
        .flat_map(|m| m.keys())
        .map(|s| s.as_str())
        .collect();
    println!(" │ Thompson Sampling convergence: │");
    println!(
        " │ Unique skip modes: {} (need >=2) │",
        c_unique_modes.len()
    );
    println!(" │ Skip distribution: │");
    for (bucket, dist) in &native_c.skip_mode_distribution {
        // `.max(1)` guards the percentage division against an empty bucket.
        let total = dist.values().sum::<usize>().max(1);
        let parts: Vec<String> = dist
            .iter()
            .map(|(m, c)| format!("{}:{:.0}%", m, *c as f64 / total as f64 * 100.0))
            .collect();
        if !parts.is_empty() {
            println!("{:<16} {}", bucket, parts.join(" "));
        }
    }
    println!(" │ │");
    // Ablation assertions
    let last_a = native_a
        .result
        .cycles
        .last()
        .expect("mode A produced no cycles");
    let last_b = native_b
        .result
        .cycles
        .last()
        .expect("mode B produced no cycles");
    let last_c = native_c
        .result
        .cycles
        .last()
        .expect("mode C produced no cycles");
    // Guard the relative-cost division when mode A reports zero cost.
    let cost_decrease = if last_a.holdout_cost_per_solve > 0.0 {
        (1.0 - last_b.holdout_cost_per_solve / last_a.holdout_cost_per_solve) * 100.0
    } else {
        0.0
    };
    let robustness_gain = (last_c.holdout_noise_accuracy - last_b.holdout_noise_accuracy) * 100.0;
    println!(" │ Ablation assertions: │");
    println!(
        " │ B vs A cost decrease: {:.1}% (need >=15%) │",
        cost_decrease
    );
    println!(
        " │ C vs B robustness: {:.1}% (need >=10%) │",
        robustness_gain
    );
    println!(" │ │");
    println!(" │ WASM module must match these learning characteristics │");
    println!(" │ (exact values may differ due to float precision) │");
    println!(" └────────────────────────────────────────────────────────┘");
    println!();
    // Final summary
    let all_passed = native_a.result.passed && native_b.result.passed && native_c.result.passed;
    if all_passed {
        println!(" NATIVE BENCHMARK: ALL MODES PASSED");
    } else {
        println!(" NATIVE BENCHMARK: SOME MODES FAILED");
    }
    println!(" Binary size: rvf-solver-wasm.wasm ~160 KB");
    println!();
}

View File

@@ -0,0 +1,960 @@
//! Intelligence Metrics Module
//!
//! Measures cognitive capabilities, reasoning quality, and learning indicators
//! for agent evaluation based on established AI benchmarking methodologies.
//!
//! Key metrics tracked:
//! - Reasoning quality (logical coherence, constraint satisfaction)
//! - Learning efficiency (regret curves, sample efficiency)
//! - Working memory (context utilization, information integration)
//! - Tool use proficiency (appropriate selection, effective utilization)
//! - Meta-cognitive awareness (self-correction, uncertainty estimation)
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
/// Intelligence assessment result
///
/// Aggregates every metric family produced by
/// `IntelligenceCalculator::calculate`, plus the raw counters it was derived
/// from so downstream reports can audit or recompute scores.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct IntelligenceAssessment {
    /// Overall intelligence score (0-100)
    pub overall_score: f64,
    /// Individual capability scores
    pub capabilities: CapabilityScores,
    /// Reasoning quality metrics
    pub reasoning: ReasoningMetrics,
    /// Learning efficiency metrics
    pub learning: LearningMetrics,
    /// Tool use proficiency
    pub tool_use: ToolUseMetrics,
    /// Meta-cognitive indicators
    pub meta_cognition: MetaCognitiveMetrics,
    /// Cost efficiency metrics
    pub cost: CostMetrics,
    /// Robustness under noise
    pub robustness: RobustnessMetrics,
    /// Raw performance data
    pub raw_data: RawMetrics,
}
/// Capability scores across dimensions
///
/// Each score is on a 0-100 scale. `Default` (derived) yields all zeros —
/// identical to the previous hand-written impl, which was redundant since
/// every field is an `f64` whose type default is `0.0`.
#[derive(Clone, Debug, Serialize, Deserialize, Default)]
pub struct CapabilityScores {
    /// Temporal reasoning (date inference, calendar math)
    pub temporal_reasoning: f64,
    /// Constraint satisfaction (multi-constraint solving)
    pub constraint_satisfaction: f64,
    /// Information retrieval (semantic search, recall)
    pub information_retrieval: f64,
    /// Pattern recognition (learning from examples)
    pub pattern_recognition: f64,
    /// Planning and sequencing
    pub planning: f64,
    /// Error recovery and adaptation
    pub adaptation: f64,
}
impl CapabilityScores {
    /// Weighted average of the six capability scores.
    ///
    /// `weights` pairs positionally with [temporal, constraint, retrieval,
    /// pattern, planning, adaptation]. Returns 0.0 when all weights are zero
    /// to avoid dividing by zero.
    pub fn weighted_average(&self, weights: &[f64; 6]) -> f64 {
        let denom: f64 = weights.iter().sum();
        if denom == 0.0 {
            return 0.0;
        }
        let values = [
            self.temporal_reasoning,
            self.constraint_satisfaction,
            self.information_retrieval,
            self.pattern_recognition,
            self.planning,
            self.adaptation,
        ];
        let mut weighted = 0.0;
        for (value, weight) in values.iter().zip(weights.iter()) {
            weighted += value * weight;
        }
        weighted / denom
    }
}
/// Reasoning quality metrics
///
/// All values are fractions in [0, 1]. `Default` (derived) is all zeros —
/// identical to the previous hand-written impl, which was redundant since
/// every field is an `f64` whose type default is `0.0`.
#[derive(Clone, Debug, Serialize, Deserialize, Default)]
pub struct ReasoningMetrics {
    /// Logical coherence (steps follow logically)
    pub logical_coherence: f64,
    /// Constraint satisfaction rate
    pub constraint_satisfaction_rate: f64,
    /// Solution optimality (vs. best possible)
    pub solution_optimality: f64,
    /// Reasoning efficiency (steps to solution)
    pub reasoning_efficiency: f64,
    /// Error rate in logical steps
    pub error_rate: f64,
}
/// Learning efficiency metrics
///
/// `Default` (derived) is all zeros — identical to the previous
/// hand-written impl, which was redundant since every field is an `f64`
/// whose type default is `0.0`.
#[derive(Clone, Debug, Serialize, Deserialize, Default)]
pub struct LearningMetrics {
    /// Sample efficiency (performance vs. examples seen)
    pub sample_efficiency: f64,
    /// Regret trajectory (sublinear indicator)
    pub regret_sublinearity: f64,
    /// Transfer learning capability
    pub transfer_capability: f64,
    /// Learning rate (improvement per episode)
    pub learning_rate: f64,
    /// Generalization ability
    pub generalization: f64,
}
/// Tool use proficiency metrics
///
/// `Default` (derived) is all zeros — identical to the previous
/// hand-written impl, which was redundant since every field is an `f64`
/// whose type default is `0.0`.
#[derive(Clone, Debug, Serialize, Deserialize, Default)]
pub struct ToolUseMetrics {
    /// Tool selection appropriateness
    pub selection_appropriateness: f64,
    /// Tool utilization effectiveness
    pub utilization_effectiveness: f64,
    /// Tool composition (combining tools)
    pub composition_ability: f64,
    /// Tool discovery (finding needed tools)
    pub discovery_ability: f64,
}
/// Meta-cognitive metrics
///
/// `Default` (derived) is all zeros — identical to the previous
/// hand-written impl, which was redundant since every field is an `f64`
/// whose type default is `0.0`.
#[derive(Clone, Debug, Serialize, Deserialize, Default)]
pub struct MetaCognitiveMetrics {
    /// Self-correction rate
    pub self_correction_rate: f64,
    /// Uncertainty calibration (confidence vs. accuracy)
    pub uncertainty_calibration: f64,
    /// Strategy adaptation
    pub strategy_adaptation: f64,
    /// Progress monitoring accuracy
    pub progress_monitoring: f64,
}
/// Cost efficiency metrics — first-class IQ dimension
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct CostMetrics {
    /// Steps per correct solve (lower = better)
    pub steps_per_solve: f64,
    /// Tool calls per correct solve (lower = better)
    pub tools_per_solve: f64,
    /// Cost efficiency score (0-1, higher = cheaper)
    pub cost_efficiency: f64,
    /// Cost trend over episodes (positive = improving)
    pub cost_trend: f64,
}
impl Default for CostMetrics {
    // Defaults are deliberately pessimistic sentinels rather than zeros:
    // with no data, cost reads as expensive (100 steps / 10 tool calls per
    // solve) so an empty run cannot score as "efficient". This is why the
    // impl stays hand-written instead of derived.
    fn default() -> Self {
        Self {
            steps_per_solve: 100.0,
            tools_per_solve: 10.0,
            cost_efficiency: 0.0,
            cost_trend: 0.0,
        }
    }
}
/// Robustness under adversarial conditions — first-class IQ dimension
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct RobustnessMetrics {
    /// Accuracy on noise-injected tasks
    pub noise_accuracy: f64,
    /// Accuracy drop from clean to noisy (lower = more robust)
    pub noise_degradation: f64,
    /// Per-episode accuracy consistency (higher = steadier)
    pub consistency: f64,
    /// Composite robustness score (0-1)
    pub robustness_score: f64,
}
impl Default for RobustnessMetrics {
    // Worst-case prior rather than all zeros: with no data, degradation
    // defaults to the maximum (1.0), so an empty run cannot read as robust.
    // This non-zero default is why the impl stays hand-written.
    fn default() -> Self {
        Self {
            noise_accuracy: 0.0,
            noise_degradation: 1.0,
            consistency: 0.0,
            robustness_score: 0.0,
        }
    }
}
/// Raw metrics from benchmarks
///
/// `Default` (derived) produces zeroed counters and empty collections —
/// identical to the previous hand-written impl (`0` for integers,
/// `HashMap::new()` / `Vec::new()` for containers), which was therefore
/// redundant.
#[derive(Clone, Debug, Serialize, Deserialize, Default)]
pub struct RawMetrics {
    /// Total tasks attempted
    pub tasks_attempted: usize,
    /// Tasks completed successfully
    pub tasks_completed: usize,
    /// Tasks with correct solutions
    pub tasks_correct: usize,
    /// Total steps taken
    pub total_steps: usize,
    /// Total tool calls
    pub total_tool_calls: usize,
    /// Total latency in ms
    pub total_latency_ms: u64,
    /// Performance by difficulty
    pub by_difficulty: HashMap<u8, DifficultyStats>,
    /// Episode-level metrics
    pub episodes: Vec<EpisodeMetrics>,
    /// Tasks attempted under noise injection
    pub noise_tasks_attempted: usize,
    /// Tasks correct under noise injection
    pub noise_tasks_correct: usize,
    /// Policy violations (contradictions, budget overruns)
    pub policy_violations: usize,
    /// Solved-but-incorrect count (contradiction rate numerator)
    pub contradictions: usize,
    /// Successful rollbacks from noisy to clean
    pub rollback_successes: usize,
    /// Attempted rollbacks from noisy to clean
    pub rollback_attempts: usize,
}
/// Stats per difficulty level
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct DifficultyStats {
    /// Tasks attempted at this difficulty level.
    pub attempted: usize,
    /// Tasks that produced a solution (correct or not — see `correct`).
    pub completed: usize,
    /// Tasks solved correctly.
    pub correct: usize,
    /// Mean steps per attempted task at this difficulty.
    pub avg_steps: f64,
}
/// Per-episode metrics
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct EpisodeMetrics {
    /// Episode index (assigned by the caller).
    pub episode: usize,
    /// Fraction of tasks answered correctly in this episode.
    pub accuracy: f64,
    /// Reward obtained this episode.
    pub reward: f64,
    // NOTE(review): presumably oracle reward minus obtained reward (see
    // `IntelligenceCalculator::oracle_reward`) — confirm against the caller.
    /// Regret for this episode.
    pub regret: f64,
    /// Running total of `regret` up to and including this episode.
    pub cumulative_regret: f64,
}
/// Intelligence metrics calculator
///
/// Stateless scorer: `calculate` turns a `RawMetrics` snapshot into a full
/// `IntelligenceAssessment`.
pub struct IntelligenceCalculator {
    /// Weights for capability scoring
    // Order matches the six CapabilityScores fields: temporal, constraint,
    // retrieval, pattern, planning, adaptation (see `weighted_average`).
    pub capability_weights: [f64; 6],
    /// Baseline for comparison
    pub baseline_accuracy: f64,
    /// Oracle performance for regret calculation
    pub oracle_reward: f64,
}
impl Default for IntelligenceCalculator {
    // Equal capability weights, a coin-flip (0.5) accuracy baseline, and a
    // 100-point oracle reward.
    fn default() -> Self {
        Self {
            capability_weights: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
            baseline_accuracy: 0.5,
            oracle_reward: 100.0,
        }
    }
}
impl IntelligenceCalculator {
    /// Calculate intelligence assessment from raw metrics
    ///
    /// Computes each metric family independently, combines them through
    /// `calculate_overall_score`, and returns the assessment together with a
    /// clone of the raw counters for later audit.
    pub fn calculate(&self, raw: &RawMetrics) -> IntelligenceAssessment {
        let capabilities = self.calculate_capabilities(raw);
        let reasoning = self.calculate_reasoning(raw);
        let learning = self.calculate_learning(raw);
        let tool_use = self.calculate_tool_use(raw);
        let meta_cognition = self.calculate_meta_cognition(raw);
        let cost = self.calculate_cost(raw);
        let robustness = self.calculate_robustness(raw);
        // Overall score: three equal pillars — graded outcomes, cost, robustness
        let overall_score = self.calculate_overall_score(
            &capabilities,
            &reasoning,
            &learning,
            &tool_use,
            &meta_cognition,
            &cost,
            &robustness,
        );
        IntelligenceAssessment {
            overall_score,
            capabilities,
            reasoning,
            learning,
            tool_use,
            meta_cognition,
            cost,
            robustness,
            raw_data: raw.clone(),
        }
    }
fn calculate_capabilities(&self, raw: &RawMetrics) -> CapabilityScores {
let base_accuracy = if raw.tasks_attempted > 0 {
raw.tasks_correct as f64 / raw.tasks_attempted as f64
} else {
0.0
};
// Temporal reasoning: accuracy on time-based tasks
let temporal_reasoning = base_accuracy * 100.0;
// Constraint satisfaction: correct solutions
let constraint_satisfaction = base_accuracy * 100.0;
// Information retrieval: based on steps to solution
let avg_steps = if raw.tasks_attempted > 0 {
raw.total_steps as f64 / raw.tasks_attempted as f64
} else {
100.0
};
let information_retrieval = (100.0 - avg_steps).max(0.0).min(100.0);
// Pattern recognition: performance improvement across difficulties
let pattern_recognition = self.calculate_pattern_recognition(raw);
// Planning: efficiency of tool use
let avg_tools = if raw.tasks_attempted > 0 {
raw.total_tool_calls as f64 / raw.tasks_attempted as f64
} else {
0.0
};
let planning = if avg_tools > 0.0 && avg_tools <= 2.0 {
100.0 * (1.0 - (avg_tools - 1.0).abs() / 2.0)
} else {
50.0
};
// Adaptation: improvement over episodes
let adaptation = self.calculate_adaptation(raw);
CapabilityScores {
temporal_reasoning,
constraint_satisfaction,
information_retrieval,
pattern_recognition,
planning,
adaptation,
}
}
fn calculate_pattern_recognition(&self, raw: &RawMetrics) -> f64 {
if raw.by_difficulty.len() < 2 {
return 50.0;
}
// Check if harder problems are still solvable
let mut difficulties: Vec<_> = raw.by_difficulty.keys().copied().collect();
difficulties.sort();
let mut scores = Vec::new();
for d in &difficulties {
if let Some(stats) = raw.by_difficulty.get(d) {
if stats.attempted > 0 {
scores.push(stats.correct as f64 / stats.attempted as f64);
}
}
}
if scores.is_empty() {
return 50.0;
}
// Average accuracy across difficulties
let avg: f64 = scores.iter().sum::<f64>() / scores.len() as f64;
avg * 100.0
}
fn calculate_adaptation(&self, raw: &RawMetrics) -> f64 {
if raw.episodes.len() < 3 {
return 50.0;
}
// Check if accuracy improves over episodes
let first_half: f64 = raw.episodes[..raw.episodes.len() / 2]
.iter()
.map(|e| e.accuracy)
.sum::<f64>()
/ (raw.episodes.len() / 2) as f64;
let second_half: f64 = raw.episodes[raw.episodes.len() / 2..]
.iter()
.map(|e| e.accuracy)
.sum::<f64>()
/ (raw.episodes.len() - raw.episodes.len() / 2) as f64;
let improvement = second_half - first_half;
// Scale: -0.2 to +0.2 improvement maps to 0-100
((improvement + 0.2) / 0.4 * 100.0).max(0.0).min(100.0)
}
fn calculate_reasoning(&self, raw: &RawMetrics) -> ReasoningMetrics {
let constraint_satisfaction_rate = if raw.tasks_attempted > 0 {
raw.tasks_correct as f64 / raw.tasks_attempted as f64
} else {
0.0
};
let avg_steps = if raw.tasks_attempted > 0 {
raw.total_steps as f64 / raw.tasks_attempted as f64
} else {
100.0
};
// Reasoning efficiency: inverse of steps (normalized)
let reasoning_efficiency = (100.0 - avg_steps).max(0.0).min(100.0) / 100.0;
// Logical coherence: based on completion rate vs correct rate
let completion_rate = if raw.tasks_attempted > 0 {
raw.tasks_completed as f64 / raw.tasks_attempted as f64
} else {
0.0
};
let logical_coherence = if completion_rate > 0.0 {
constraint_satisfaction_rate / completion_rate
} else {
0.0
};
ReasoningMetrics {
logical_coherence,
constraint_satisfaction_rate,
solution_optimality: constraint_satisfaction_rate,
reasoning_efficiency,
error_rate: 1.0 - constraint_satisfaction_rate,
}
}
fn calculate_learning(&self, raw: &RawMetrics) -> LearningMetrics {
let mut learning = LearningMetrics::default();
if raw.episodes.is_empty() {
return learning;
}
// Sample efficiency: accuracy per episode
learning.sample_efficiency =
raw.episodes.iter().map(|e| e.accuracy).sum::<f64>() / raw.episodes.len() as f64;
// Regret sublinearity: check if cumulative regret grows sublinearly
// True sublinearity means R_k/k → 0 as k → ∞ (regret per episode decreasing)
if raw.episodes.len() >= 5 {
// Calculate regret trend using linear regression
let n = raw.episodes.len() as f64;
let mut sum_x = 0.0;
let mut sum_y = 0.0;
let mut sum_xy = 0.0;
let mut sum_xx = 0.0;
for (i, ep) in raw.episodes.iter().enumerate() {
let x = (i + 1) as f64;
let y = ep.regret;
sum_x += x;
sum_y += y;
sum_xy += x * y;
sum_xx += x * x;
}
let slope = (n * sum_xy - sum_x * sum_y) / (n * sum_xx - sum_x * sum_x);
// Negative slope = decreasing regret = sublinear
// Transform: slope < 0 → sublinearity > 0
if slope < 0.0 {
// Stronger negative slope = better sublinearity (cap at 1.0)
learning.regret_sublinearity = (-slope / 10.0).min(1.0);
}
// Also check cumulative average
let last = raw.episodes.last().unwrap();
let avg_regret = last.cumulative_regret / n;
let first_half_avg = raw
.episodes
.iter()
.take(raw.episodes.len() / 2)
.map(|e| e.regret)
.sum::<f64>()
/ (n / 2.0);
// If second half has lower per-episode regret, that's sublinear
if avg_regret < first_half_avg && learning.regret_sublinearity == 0.0 {
learning.regret_sublinearity =
((first_half_avg - avg_regret) / first_half_avg).max(0.0);
}
}
// Learning rate: improvement in accuracy over episodes
if raw.episodes.len() >= 2 {
let first_acc = raw.episodes[0].accuracy;
let last_acc = raw.episodes.last().unwrap().accuracy;
learning.learning_rate = (last_acc - first_acc + 1.0) / 2.0;
}
// Generalization: consistency across difficulties
if raw.by_difficulty.len() >= 2 {
let accuracies: Vec<f64> = raw
.by_difficulty
.values()
.filter(|s| s.attempted > 0)
.map(|s| s.correct as f64 / s.attempted as f64)
.collect();
if !accuracies.is_empty() {
let mean = accuracies.iter().sum::<f64>() / accuracies.len() as f64;
let variance = accuracies.iter().map(|a| (a - mean).powi(2)).sum::<f64>()
/ accuracies.len() as f64;
let std_dev = variance.sqrt();
// Lower variance = better generalization
learning.generalization = (1.0 - std_dev).max(0.0);
}
}
learning
}
fn calculate_tool_use(&self, raw: &RawMetrics) -> ToolUseMetrics {
let avg_tools = if raw.tasks_attempted > 0 {
raw.total_tool_calls as f64 / raw.tasks_attempted as f64
} else {
0.0
};
// Selection appropriateness: using tools when helpful
let accuracy = if raw.tasks_attempted > 0 {
raw.tasks_correct as f64 / raw.tasks_attempted as f64
} else {
0.0
};
// Effectiveness: accuracy when tools are used
let utilization_effectiveness = accuracy;
// Appropriateness: not overusing tools
let selection_appropriateness = if avg_tools > 0.0 {
(accuracy / avg_tools.min(2.0)).min(1.0)
} else {
0.5
};
ToolUseMetrics {
selection_appropriateness,
utilization_effectiveness,
composition_ability: avg_tools.min(1.0), // Using multiple tools
discovery_ability: accuracy, // Finding solutions
}
}
fn calculate_meta_cognition(&self, raw: &RawMetrics) -> MetaCognitiveMetrics {
// Self-correction: completed but not correct -> corrected
let completed_but_wrong = raw.tasks_completed.saturating_sub(raw.tasks_correct);
let self_correction_rate = if completed_but_wrong > 0 {
0.0 // No self-correction if still wrong
} else if raw.tasks_completed > 0 {
1.0 // All completed are correct
} else {
0.5
};
// Strategy adaptation: improvement over episodes
let strategy_adaptation = if raw.episodes.len() >= 3 {
let trend: f64 = raw
.episodes
.windows(2)
.map(|w| {
if w[1].accuracy > w[0].accuracy {
1.0
} else {
0.0
}
})
.sum::<f64>();
trend / (raw.episodes.len() - 1) as f64
} else {
0.5
};
MetaCognitiveMetrics {
self_correction_rate,
uncertainty_calibration: 0.5, // Would need confidence scores
strategy_adaptation,
progress_monitoring: strategy_adaptation, // Similar metric
}
}
/// Derive cost metrics: steps/tools per solved task plus an efficiency score
/// and a cost trend over the episode history.
fn calculate_cost(&self, raw: &RawMetrics) -> CostMetrics {
    // Steps per correct solve; fall back to the raw step total when nothing
    // was solved, and a pessimistic constant when nothing was attempted.
    let steps_per_solve = if raw.tasks_correct > 0 {
        raw.total_steps as f64 / raw.tasks_correct as f64
    } else if raw.tasks_attempted > 0 {
        raw.total_steps as f64
    } else {
        100.0
    };
    let tools_per_solve = match raw.tasks_correct {
        0 => 10.0,
        n => raw.total_tool_calls as f64 / n as f64,
    };
    // Efficiency: 1.0 at <=5 steps/solve, 0.0 at >=100 steps/solve
    let cost_efficiency = (1.0 - (steps_per_solve - 5.0) / 95.0).clamp(0.0, 1.0);
    // Cost trend: relative accuracy change between the first and second half
    // of the episode history; improving accuracy means effective cost per
    // solve is dropping. Requires at least 4 episodes.
    let cost_trend = if raw.episodes.len() < 4 {
        0.0
    } else {
        let (early, late) = raw.episodes.split_at(raw.episodes.len() / 2);
        let early_acc = early.iter().map(|e| e.accuracy).sum::<f64>() / early.len() as f64;
        let late_acc = late.iter().map(|e| e.accuracy).sum::<f64>() / late.len() as f64;
        if early_acc > 0.01 {
            (late_acc - early_acc) / early_acc
        } else {
            0.0
        }
    };
    CostMetrics {
        steps_per_solve,
        tools_per_solve,
        cost_efficiency,
        cost_trend,
    }
}
/// Derive robustness metrics by comparing accuracy on noise-injected tasks
/// against accuracy on clean tasks, plus episode-to-episode consistency.
fn calculate_robustness(&self, raw: &RawMetrics) -> RobustnessMetrics {
    let noise_accuracy = match raw.noise_tasks_attempted {
        0 => 0.5, // no noise data -> neutral prior
        n => raw.noise_tasks_correct as f64 / n as f64,
    };
    // Clean-task counters are the totals minus the noise-task counters.
    let clean_attempted = raw
        .tasks_attempted
        .saturating_sub(raw.noise_tasks_attempted);
    let clean_correct = raw.tasks_correct.saturating_sub(raw.noise_tasks_correct);
    let clean_accuracy = match clean_attempted {
        0 => 0.0,
        n => clean_correct as f64 / n as f64,
    };
    // Only penalize when noise actually reduces accuracy.
    let noise_degradation = (clean_accuracy - noise_accuracy).max(0.0);
    // Consistency: 1 minus the standard deviation of per-episode accuracy,
    // floored at 0; neutral 0.5 prior with fewer than 2 episodes.
    let consistency = if raw.episodes.len() < 2 {
        0.5
    } else {
        let n = raw.episodes.len() as f64;
        let mean = raw.episodes.iter().map(|e| e.accuracy).sum::<f64>() / n;
        let variance = raw
            .episodes
            .iter()
            .map(|e| (e.accuracy - mean).powi(2))
            .sum::<f64>()
            / n;
        (1.0 - variance.sqrt()).max(0.0)
    };
    // Weighted blend of noisy accuracy, degradation resistance, consistency.
    let robustness_score =
        noise_accuracy * 0.4 + (1.0 - noise_degradation.min(1.0)) * 0.3 + consistency * 0.3;
    RobustnessMetrics {
        noise_accuracy,
        noise_degradation,
        consistency,
        robustness_score,
    }
}
/// Blend every sub-metric group into the single overall score (0-100 scale).
///
/// The weights below form three roughly equal pillars: graded outcomes
/// (~0.34 = capabilities 0.12 + reasoning 0.10 + learning 0.06 + tool use
/// 0.03 + meta-cognition 0.03), cost 0.33, and robustness 0.33.
fn calculate_overall_score(
    &self,
    capabilities: &CapabilityScores,
    reasoning: &ReasoningMetrics,
    learning: &LearningMetrics,
    tool_use: &ToolUseMetrics,
    meta_cognition: &MetaCognitiveMetrics,
    cost: &CostMetrics,
    robustness: &RobustnessMetrics,
) -> f64 {
    // Sub-scores (0-100 scale)
    let cap_score = capabilities.weighted_average(&self.capability_weights);
    // Each group is the unweighted mean of its unit-interval components x100.
    // NOTE(review): reasoning.error_rate is deliberately not folded in here —
    // confirm it is meant to be reporting-only.
    let reasoning_score = (reasoning.logical_coherence
        + reasoning.constraint_satisfaction_rate
        + reasoning.solution_optimality
        + reasoning.reasoning_efficiency)
        / 4.0
        * 100.0;
    let learning_score = (learning.sample_efficiency
        + learning.regret_sublinearity
        + learning.learning_rate
        + learning.generalization)
        / 4.0
        * 100.0;
    let tool_score = (tool_use.selection_appropriateness
        + tool_use.utilization_effectiveness
        + tool_use.composition_ability
        + tool_use.discovery_ability)
        / 4.0
        * 100.0;
    // uncertainty_calibration is excluded (it is a fixed 0.5 placeholder).
    let meta_score = (meta_cognition.self_correction_rate
        + meta_cognition.strategy_adaptation
        + meta_cognition.progress_monitoring)
        / 3.0
        * 100.0;
    let cost_score = cost.cost_efficiency * 100.0;
    let robustness_score = robustness.robustness_score * 100.0;
    // Three equal pillars: graded outcomes (~0.34), cost (~0.33), robustness (~0.33)
    // Graded outcomes = capabilities + reasoning + learning + tool + meta
    cap_score * 0.12
        + reasoning_score * 0.10
        + learning_score * 0.06
        + tool_score * 0.03
        + meta_score * 0.03
        + cost_score * 0.33
        + robustness_score * 0.33
}
}
/// Print a formatted intelligence report
pub fn print_intelligence_report(assessment: &IntelligenceAssessment) {
println!("╔══════════════════════════════════════════════════════════════╗");
println!("║ Intelligence Assessment Report ║");
println!("╚══════════════════════════════════════════════════════════════╝");
println!();
println!(
"🧠 Overall Intelligence Score: {:.1}/100",
assessment.overall_score
);
println!();
println!("📊 Capability Scores:");
println!(
" Temporal Reasoning: {:5.1}",
assessment.capabilities.temporal_reasoning
);
println!(
" Constraint Satisfaction:{:5.1}",
assessment.capabilities.constraint_satisfaction
);
println!(
" Information Retrieval: {:5.1}",
assessment.capabilities.information_retrieval
);
println!(
" Pattern Recognition: {:5.1}",
assessment.capabilities.pattern_recognition
);
println!(
" Planning: {:5.1}",
assessment.capabilities.planning
);
println!(
" Adaptation: {:5.1}",
assessment.capabilities.adaptation
);
println!();
println!("🔍 Reasoning Quality:");
println!(
" Logical Coherence: {:.2}",
assessment.reasoning.logical_coherence
);
println!(
" Constraint Satisfaction:{:.2}",
assessment.reasoning.constraint_satisfaction_rate
);
println!(
" Solution Optimality: {:.2}",
assessment.reasoning.solution_optimality
);
println!(
" Reasoning Efficiency: {:.2}",
assessment.reasoning.reasoning_efficiency
);
println!(
" Error Rate: {:.2}",
assessment.reasoning.error_rate
);
println!();
println!("📈 Learning Metrics:");
println!(
" Sample Efficiency: {:.2}",
assessment.learning.sample_efficiency
);
println!(
" Regret Sublinearity: {:.2}",
assessment.learning.regret_sublinearity
);
println!(
" Learning Rate: {:.2}",
assessment.learning.learning_rate
);
println!(
" Generalization: {:.2}",
assessment.learning.generalization
);
println!();
println!("🔧 Tool Use Proficiency:");
println!(
" Selection: {:.2}",
assessment.tool_use.selection_appropriateness
);
println!(
" Effectiveness: {:.2}",
assessment.tool_use.utilization_effectiveness
);
println!(
" Composition: {:.2}",
assessment.tool_use.composition_ability
);
println!();
println!("🪞 Meta-Cognitive Indicators:");
println!(
" Self-Correction: {:.2}",
assessment.meta_cognition.self_correction_rate
);
println!(
" Strategy Adaptation: {:.2}",
assessment.meta_cognition.strategy_adaptation
);
println!(
" Progress Monitoring: {:.2}",
assessment.meta_cognition.progress_monitoring
);
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: a mostly-successful run must yield positive scores.
    ///
    /// Uses struct-update syntax instead of mutating a `Default` value
    /// field-by-field (clippy::field_reassign_with_default).
    #[test]
    fn test_intelligence_calculation() {
        let raw = RawMetrics {
            tasks_attempted: 100,
            tasks_completed: 90,
            tasks_correct: 80,
            total_steps: 500,
            total_tool_calls: 100,
            ..Default::default()
        };
        let calculator = IntelligenceCalculator::default();
        let assessment = calculator.calculate(&raw);
        assert!(assessment.overall_score > 0.0);
        assert!(assessment.capabilities.temporal_reasoning > 0.0);
    }

    /// Monotonically improving episodes must register as learning.
    #[test]
    fn test_learning_metrics() {
        let mut raw = RawMetrics {
            tasks_attempted: 50,
            tasks_correct: 40,
            ..Default::default()
        };
        // Add episodes showing improvement: accuracy rises 0.50 -> 0.86,
        // regret falls symmetrically.
        for i in 0..10 {
            raw.episodes.push(EpisodeMetrics {
                episode: i + 1,
                accuracy: 0.5 + 0.04 * i as f64,
                reward: 50.0 + 4.0 * i as f64,
                regret: 50.0 - 4.0 * i as f64,
                cumulative_regret: (0..=i).map(|j| 50.0 - 4.0 * j as f64).sum(),
            });
        }
        let calculator = IntelligenceCalculator::default();
        let assessment = calculator.calculate(&raw);
        // Should show learning (improvement over time)
        assert!(assessment.learning.learning_rate > 0.5);
    }
}

View File

@@ -0,0 +1,38 @@
//! RuVector Benchmarks Library
//!
//! Comprehensive benchmarking suite for:
//! - Temporal reasoning (TimePuzzles-style constraint inference)
//! - Vector index operations (IVF, coherence-gated search)
//! - Swarm controller regret tracking
//! - Intelligence metrics and cognitive capability assessment
//! - Adaptive learning with ReasoningBank trajectory tracking
//!
//! Based on research from:
//! - TimePuzzles benchmark (arXiv:2601.07148)
//! - Sublinear regret in multi-agent control
//! - Tool-augmented iterative temporal reasoning
//! - Cognitive capability assessment frameworks
//! - lean-agentic type theory for verified reasoning
pub mod acceptance_test;
pub mod agi_contract;
pub mod intelligence_metrics;
pub mod logging;
pub mod loop_gating;
pub mod publishable_rvf;
pub mod reasoning_bank;
pub mod rvf_artifact;
pub mod rvf_intelligence_bench;
pub mod superintelligence;
pub mod swarm_regret;
pub mod temporal;
pub mod timepuzzles;
pub mod vector_index;
pub use intelligence_metrics::*;
pub use logging::*;
pub use reasoning_bank::*;
pub use swarm_regret::*;
pub use temporal::*;
pub use timepuzzles::*;
pub use vector_index::*;

View File

@@ -0,0 +1,421 @@
//! Logging Schema for Benchmark Results
//!
//! Comprehensive logging for:
//! - Temporal reasoning benchmarks
//! - Vector operations
//! - Swarm controller metrics
//! - Tool usage tracking
use anyhow::Result;
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use std::fs::{self, File, OpenOptions};
use std::io::{BufWriter, Write};
use std::path::Path;
/// Log entry types
///
/// Serialized via `#[serde(tag = "type")]` as internally-tagged JSON
/// (`{"type": "<Variant>", ...}`), one object per line in the log file.
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(tag = "type")]
pub enum LogEntry {
    /// Temporal benchmark run
    TemporalBenchmark(TemporalBenchmarkLog),
    /// Vector operation
    VectorOperation(VectorOperationLog),
    /// Swarm episode
    SwarmEpisode(SwarmEpisodeLog),
    /// Tool call
    ToolCall(ToolCallLog),
    /// System event
    System(SystemLog),
}
/// Temporal benchmark log entry
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct TemporalBenchmarkLog {
    /// When the entry was recorded (UTC; set to `Utc::now()` at log time).
    pub timestamp: DateTime<Utc>,
    /// Identifier of the benchmark run this entry belongs to.
    pub benchmark_id: String,
    /// Identifier of the individual puzzle.
    pub puzzle_id: String,
    /// Puzzle difficulty rating.
    pub difficulty: u8,
    /// Whether the solver produced an answer.
    pub solved: bool,
    /// Whether the produced answer was correct.
    pub correct: bool,
    /// Number of solver steps taken.
    pub steps: usize,
    /// Number of tool invocations made during the solve.
    pub tool_calls: usize,
    /// End-to-end solve latency in milliseconds.
    pub latency_ms: u64,
    /// Number of constraints in the puzzle.
    pub constraint_count: usize,
    /// Whether the calendar tool was enabled for this run.
    pub calendar_tool_enabled: bool,
    /// Whether web search was enabled for this run.
    pub web_search_enabled: bool,
}
/// Vector operation log entry
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct VectorOperationLog {
    /// When the entry was recorded (UTC; set to `Utc::now()` at log time).
    pub timestamp: DateTime<Utc>,
    /// Name of the operation performed (e.g. insert, search).
    pub operation: String,
    /// Dimensionality of the vectors in the index.
    pub index_dim: usize,
    /// Number of vectors in the index.
    pub index_size: usize,
    /// Number of queries executed in this operation.
    pub query_count: usize,
    /// Requested number of nearest neighbors per query.
    pub top_k: usize,
    /// Whether IVF (inverted-file) indexing was enabled.
    pub ivf_enabled: bool,
    /// Coherence score reported by the index — semantics defined by the producer.
    pub coherence_score: f32,
    /// Operation latency in microseconds.
    pub latency_us: u64,
    /// Number of results returned.
    pub results_count: usize,
}
/// Swarm episode log entry
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SwarmEpisodeLog {
    /// When the entry was recorded (UTC; set to `Utc::now()` at log time).
    pub timestamp: DateTime<Utc>,
    /// Episode index.
    pub episode: usize,
    /// Number of tasks in the episode.
    pub num_tasks: usize,
    /// Tasks for which an answer was produced.
    pub solved: usize,
    /// Tasks answered correctly.
    pub correct: usize,
    /// Reward achieved by the controller this episode.
    pub reward: f64,
    /// Reward an oracle controller would have achieved.
    pub oracle_reward: f64,
    /// Per-episode regret; computed as `oracle_reward - reward` at log time.
    pub regret: f64,
    /// Regret accumulated across all episodes so far.
    pub cumulative_regret: f64,
    /// Cumulative regret averaged per episode.
    pub average_regret: f64,
    /// Whether regret growth is judged sublinear by the caller.
    pub is_sublinear: bool,
}
/// Tool call log entry
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ToolCallLog {
    /// When the entry was recorded (UTC; set to `Utc::now()` at log time).
    pub timestamp: DateTime<Utc>,
    /// Name of the tool that was invoked.
    pub tool_name: String,
    /// Category/type of the tool.
    pub tool_type: String,
    /// Short human-readable summary of the tool input.
    pub input_summary: String,
    /// Whether the call succeeded.
    pub success: bool,
    /// Call latency in milliseconds.
    pub latency_ms: u64,
    /// Free-form context describing where the call happened.
    pub context: String,
}
/// System log entry
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SystemLog {
    /// When the entry was recorded (UTC; set to `Utc::now()` at log time).
    pub timestamp: DateTime<Utc>,
    /// Severity level as a free-form string (e.g. "info", "error").
    pub level: String,
    /// Log message text.
    pub message: String,
    /// Component that emitted the message.
    pub component: String,
}
/// Benchmark logger
///
/// Buffers entries in memory and appends them to a JSON Lines file,
/// flushing automatically once `flush_threshold` entries accumulate
/// (and on `flush`, `close`, or `Drop`).
pub struct BenchmarkLogger {
    /// Log file path
    path: String,
    /// Writer over the append-mode log file; `None` once `close()` has run.
    writer: Option<BufWriter<File>>,
    /// In-memory buffer for batch writes
    buffer: Vec<LogEntry>,
    /// Buffer size before flush
    flush_threshold: usize,
}
impl BenchmarkLogger {
    /// Create a new logger appending to `path`, creating any missing parent
    /// directories. The file is opened in create+append mode.
    pub fn new(path: impl Into<String>) -> Result<Self> {
        let path = path.into();
        // Create parent directories
        if let Some(parent) = Path::new(&path).parent() {
            fs::create_dir_all(parent)?;
        }
        let file = OpenOptions::new().create(true).append(true).open(&path)?;
        Ok(Self {
            path,
            writer: Some(BufWriter::new(file)),
            buffer: Vec::new(),
            flush_threshold: 100,
        })
    }
    /// Buffer an entry; flushes to disk once `flush_threshold` entries accumulate.
    pub fn log(&mut self, entry: LogEntry) -> Result<()> {
        self.buffer.push(entry);
        if self.buffer.len() >= self.flush_threshold {
            self.flush()?;
        }
        Ok(())
    }
    /// Log a temporal benchmark result
    pub fn log_temporal(
        &mut self,
        benchmark_id: impl Into<String>,
        puzzle_id: impl Into<String>,
        difficulty: u8,
        solved: bool,
        correct: bool,
        steps: usize,
        tool_calls: usize,
        latency_ms: u64,
        constraint_count: usize,
        calendar_tool: bool,
        web_search: bool,
    ) -> Result<()> {
        self.log(LogEntry::TemporalBenchmark(TemporalBenchmarkLog {
            timestamp: Utc::now(),
            benchmark_id: benchmark_id.into(),
            puzzle_id: puzzle_id.into(),
            difficulty,
            solved,
            correct,
            steps,
            tool_calls,
            latency_ms,
            constraint_count,
            calendar_tool_enabled: calendar_tool,
            web_search_enabled: web_search,
        }))
    }
    /// Log a vector operation
    pub fn log_vector(
        &mut self,
        operation: impl Into<String>,
        index_dim: usize,
        index_size: usize,
        query_count: usize,
        top_k: usize,
        ivf_enabled: bool,
        coherence_score: f32,
        latency_us: u64,
        results_count: usize,
    ) -> Result<()> {
        self.log(LogEntry::VectorOperation(VectorOperationLog {
            timestamp: Utc::now(),
            operation: operation.into(),
            index_dim,
            index_size,
            query_count,
            top_k,
            ivf_enabled,
            coherence_score,
            latency_us,
            results_count,
        }))
    }
    /// Log a swarm episode. `regret` is derived as `oracle_reward - reward`.
    pub fn log_swarm(
        &mut self,
        episode: usize,
        num_tasks: usize,
        solved: usize,
        correct: usize,
        reward: f64,
        oracle_reward: f64,
        cumulative_regret: f64,
        average_regret: f64,
        is_sublinear: bool,
    ) -> Result<()> {
        self.log(LogEntry::SwarmEpisode(SwarmEpisodeLog {
            timestamp: Utc::now(),
            episode,
            num_tasks,
            solved,
            correct,
            reward,
            oracle_reward,
            regret: oracle_reward - reward,
            cumulative_regret,
            average_regret,
            is_sublinear,
        }))
    }
    /// Log a tool call
    pub fn log_tool(
        &mut self,
        tool_name: impl Into<String>,
        tool_type: impl Into<String>,
        input_summary: impl Into<String>,
        success: bool,
        latency_ms: u64,
        context: impl Into<String>,
    ) -> Result<()> {
        self.log(LogEntry::ToolCall(ToolCallLog {
            timestamp: Utc::now(),
            tool_name: tool_name.into(),
            tool_type: tool_type.into(),
            input_summary: input_summary.into(),
            success,
            latency_ms,
            context: context.into(),
        }))
    }
    /// Log a system message
    pub fn log_system(
        &mut self,
        level: impl Into<String>,
        message: impl Into<String>,
        component: impl Into<String>,
    ) -> Result<()> {
        self.log(LogEntry::System(SystemLog {
            timestamp: Utc::now(),
            level: level.into(),
            message: message.into(),
            component: component.into(),
        }))
    }
    /// Flush buffered entries to the log file as JSON Lines.
    ///
    /// The buffer is cleared only after the whole batch has been serialized
    /// and written: an error no longer silently drops the entries that were
    /// still pending (the previous implementation drained the buffer while
    /// writing, losing unwritten entries on failure). A retry after a partial
    /// write may duplicate lines that were written before the error.
    pub fn flush(&mut self) -> Result<()> {
        if let Some(ref mut writer) = self.writer {
            for entry in &self.buffer {
                let json = serde_json::to_string(entry)?;
                writeln!(writer, "{}", json)?;
            }
            self.buffer.clear();
            writer.flush()?;
        }
        Ok(())
    }
    /// Close the logger: flush remaining entries and drop the file handle.
    /// Subsequent `flush` calls become no-ops.
    pub fn close(&mut self) -> Result<()> {
        self.flush()?;
        self.writer = None;
        Ok(())
    }
    /// Get log file path
    pub fn path(&self) -> &str {
        &self.path
    }
}
impl Drop for BenchmarkLogger {
    fn drop(&mut self) {
        // Best-effort flush on drop; errors cannot propagate out of Drop,
        // so callers that must observe I/O failures should call `close()`.
        let _ = self.flush();
    }
}
/// Log reader for analysis
///
/// Parses a JSON Lines file produced by `BenchmarkLogger`.
pub struct LogReader {
    /// Path to the log file to read.
    path: String,
}
impl LogReader {
/// Create a new reader
pub fn new(path: impl Into<String>) -> Self {
Self { path: path.into() }
}
/// Read all entries
pub fn read_all(&self) -> Result<Vec<LogEntry>> {
let content = fs::read_to_string(&self.path)?;
let mut entries = Vec::new();
for line in content.lines() {
if !line.is_empty() {
let entry: LogEntry = serde_json::from_str(line)?;
entries.push(entry);
}
}
Ok(entries)
}
/// Read temporal benchmark entries only
pub fn read_temporal(&self) -> Result<Vec<TemporalBenchmarkLog>> {
let entries = self.read_all()?;
Ok(entries
.into_iter()
.filter_map(|e| match e {
LogEntry::TemporalBenchmark(t) => Some(t),
_ => None,
})
.collect())
}
/// Read swarm episode entries only
pub fn read_swarm(&self) -> Result<Vec<SwarmEpisodeLog>> {
let entries = self.read_all()?;
Ok(entries
.into_iter()
.filter_map(|e| match e {
LogEntry::SwarmEpisode(s) => Some(s),
_ => None,
})
.collect())
}
/// Compute aggregate statistics
pub fn aggregate_temporal(&self) -> Result<TemporalAggregates> {
let logs = self.read_temporal()?;
if logs.is_empty() {
return Ok(TemporalAggregates::default());
}
let total = logs.len();
let solved = logs.iter().filter(|l| l.solved).count();
let correct = logs.iter().filter(|l| l.correct).count();
let avg_steps = logs.iter().map(|l| l.steps).sum::<usize>() as f64 / total as f64;
let avg_latency = logs.iter().map(|l| l.latency_ms).sum::<u64>() as f64 / total as f64;
let avg_tools = logs.iter().map(|l| l.tool_calls).sum::<usize>() as f64 / total as f64;
// By difficulty
let mut by_difficulty: std::collections::HashMap<u8, (usize, usize)> =
std::collections::HashMap::new();
for log in &logs {
let entry = by_difficulty.entry(log.difficulty).or_insert((0, 0));
entry.0 += 1;
if log.correct {
entry.1 += 1;
}
}
Ok(TemporalAggregates {
total_puzzles: total,
solved_count: solved,
correct_count: correct,
accuracy: correct as f64 / total as f64,
avg_steps,
avg_latency_ms: avg_latency,
avg_tool_calls: avg_tools,
accuracy_by_difficulty: by_difficulty
.into_iter()
.map(|(d, (t, c))| (d, c as f64 / t as f64))
.collect(),
})
}
}
/// Aggregate statistics for temporal benchmarks
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct TemporalAggregates {
    /// Total number of temporal benchmark entries aggregated.
    pub total_puzzles: usize,
    /// Entries flagged as solved.
    pub solved_count: usize,
    /// Entries flagged as correct.
    pub correct_count: usize,
    /// `correct_count / total_puzzles`.
    pub accuracy: f64,
    /// Mean solver steps per puzzle.
    pub avg_steps: f64,
    /// Mean latency per puzzle in milliseconds.
    pub avg_latency_ms: f64,
    /// Mean tool calls per puzzle.
    pub avg_tool_calls: f64,
    /// Accuracy keyed by puzzle difficulty level.
    pub accuracy_by_difficulty: std::collections::HashMap<u8, f64>,
}
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    /// Round-trip: one logged entry must come back from the reader.
    #[test]
    fn test_logger() {
        let dir = tempdir().unwrap();
        let path = dir.path().join("test.log");
        let path_str = path.to_str().unwrap();

        let mut logger = BenchmarkLogger::new(path_str).unwrap();
        logger
            .log_temporal(
                "bench-1", "puzzle-1", 5, true, true, 10, 2, 100, 3, true, false,
            )
            .unwrap();
        logger.flush().unwrap();

        let entries = LogReader::new(path_str).read_all().unwrap();
        assert_eq!(entries.len(), 1);
    }
}

View File

@@ -0,0 +1,603 @@
//! Three-Loop Gating Architecture
//!
//! Separates the intelligence engine into three explicit loops with strict gating:
//!
//! ## Fast Loop (per step)
//! - Runs every step of every solver invocation
//! - No planning, no model calls
//! - Only checks invariants: allow, block, quarantine, or rollback
//! - Outputs: GateDecision, HealthDelta, WitnessRecord
//!
//! ## Medium Loop (per attempt)
//! - Runs per solve attempt (one puzzle)
//! - Multi-strategy solver, ensemble vote, cascade passes
//! - Can PROPOSE memory writes, but cannot COMMIT them
//! - Outputs: CandidateSolution, AttemptTrace, ProposedMemoryWrites
//!
//! ## Slow Loop (per cycle)
//! - Runs per training/evaluation cycle
//! - Consolidation, compiler updates, promotion review, meta parameter updates
//! - Only component that can PROMOTE patterns (Volatile → Trusted)
//! - Outputs: NewPolicyCheckpoint, NewMemoryRoot, PromotionLog
//!
//! ## Critical Gating Rule
//! Medium loop can propose memory writes.
//! Fast loop is the only component allowed to commit them.
//! Slow loop is the only component allowed to promote them.
use serde::{Deserialize, Serialize};
use crate::agi_contract::ContractHealth;
use crate::reasoning_bank::{
Counterexample, MemoryCheckpoint, MemoryClass, ReasoningBank, RollbackWitness, Trajectory,
Verdict,
};
// ═══════════════════════════════════════════════════════════════════════════
// Fast Loop: per-step invariant gating
// ═══════════════════════════════════════════════════════════════════════════
/// Decision made by the fast loop gate on each step.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub enum GateDecision {
    /// Allow the step to proceed
    Allow,
    /// Block: step would violate a policy (e.g. the step budget is exhausted)
    Block { reason: String },
    /// Quarantine: result is suspicious, hold for review
    Quarantine { reason: String },
    /// Rollback: regression detected, revert to checkpoint
    Rollback {
        /// Identifier of the memory checkpoint to revert to.
        checkpoint_id: usize,
        /// Human-readable explanation for the rollback.
        reason: String,
    },
}
/// Health delta tracked per step.
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct HealthDelta {
    /// Latest step index seen by the gate this attempt (not a cumulative count).
    pub steps_taken: usize,
    /// Number of solved-but-incorrect results observed this attempt.
    pub contradictions_detected: usize,
    /// Policy violation count.
    /// NOTE(review): never incremented by `FastGate::check_step` — confirm producer.
    pub policy_violations: usize,
    /// Accumulated cost.
    /// NOTE(review): never incremented by `FastGate::check_step` — confirm producer.
    pub cost_accumulated: f64,
}
/// Fast loop gate: checks invariants on every step.
/// This is the ONLY component allowed to commit memory writes.
#[derive(Clone, Debug)]
pub struct FastGate {
    /// Maximum steps before forced halt
    pub step_limit: usize,
    /// Maximum cost accumulation before halt
    /// NOTE(review): not currently enforced by `check_step` — confirm intent.
    pub cost_limit: f64,
    /// Contradiction threshold before quarantine
    pub contradiction_threshold: usize,
    /// Running health delta
    pub delta: HealthDelta,
    /// Pending writes from medium loop (committed by fast loop)
    pub pending_writes: Vec<ProposedWrite>,
    /// Gate decisions log
    pub decisions: Vec<GateDecision>,
}
impl FastGate {
    /// Build a gate with the given per-attempt step budget and permissive
    /// defaults: unbounded cost and 3 contradictions before quarantine.
    pub fn new(step_limit: usize) -> Self {
        Self {
            step_limit,
            cost_limit: f64::MAX,
            contradiction_threshold: 3,
            delta: HealthDelta::default(),
            pending_writes: Vec::new(),
            decisions: Vec::new(),
        }
    }

    /// Check a step and return a gate decision, logging it in `decisions`.
    pub fn check_step(&mut self, step: usize, solved: bool, correct: bool) -> GateDecision {
        self.delta.steps_taken = step;
        let decision = if step >= self.step_limit {
            // Step budget exhausted: hard block.
            GateDecision::Block {
                reason: format!("step budget exhausted ({}/{})", step, self.step_limit),
            }
        } else if solved && !correct {
            // Solved-but-wrong is a contradiction; quarantine once the
            // threshold is crossed, otherwise keep allowing.
            self.delta.contradictions_detected += 1;
            if self.delta.contradictions_detected >= self.contradiction_threshold {
                GateDecision::Quarantine {
                    reason: format!(
                        "{} contradictions in this attempt",
                        self.delta.contradictions_detected,
                    ),
                }
            } else {
                GateDecision::Allow
            }
        } else {
            GateDecision::Allow
        };
        self.decisions.push(decision.clone());
        decision
    }

    /// Commit pending writes from the medium loop into the bank, returning
    /// how many were committed. Only the fast loop has authority to do this.
    pub fn commit_writes(&mut self, bank: &mut ReasoningBank) -> usize {
        let count = self.pending_writes.len();
        for write in self.pending_writes.drain(..) {
            match write {
                ProposedWrite::RecordTrajectory(traj) => {
                    bank.record_trajectory_gated(traj);
                }
                ProposedWrite::RecordCounterexample {
                    constraint_type,
                    trajectory,
                } => {
                    bank.record_counterexample(&constraint_type, trajectory);
                }
                ProposedWrite::QuarantineTrajectory { trajectory, reason } => {
                    bank.quarantine_trajectory(trajectory, &reason);
                }
            }
        }
        count
    }

    /// Reset per-attempt state (health delta and decision log) for the next attempt.
    pub fn reset(&mut self) {
        self.delta = HealthDelta::default();
        self.decisions.clear();
    }
}
/// A proposed memory write from the medium loop.
/// Cannot be committed directly — must go through FastGate.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum ProposedWrite {
    /// Record a completed trajectory in the bank (via the gated path).
    RecordTrajectory(Trajectory),
    /// Record a counterexample trajectory for a specific constraint type.
    RecordCounterexample {
        constraint_type: String,
        trajectory: Trajectory,
    },
    /// Quarantine a suspicious trajectory with a human-readable reason.
    QuarantineTrajectory {
        trajectory: Trajectory,
        reason: String,
    },
}
// ═══════════════════════════════════════════════════════════════════════════
// Medium Loop: per-attempt solving
// ═══════════════════════════════════════════════════════════════════════════
/// Trace of a single solve attempt.
///
/// Produced by `MediumLoop::process_result`; its `proposed_writes` are only
/// committed later by `FastGate::commit_writes`.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct AttemptTrace {
    /// Puzzle ID
    pub puzzle_id: String,
    /// Strategy used
    pub strategy: String,
    /// Steps taken
    pub steps: usize,
    /// Whether the answer was correct
    pub correct: bool,
    /// Whether a retry was attempted
    pub retried: bool,
    /// Gate decisions during this attempt
    pub gate_decisions: Vec<GateDecision>,
    /// Proposed memory writes (not yet committed)
    pub proposed_writes: Vec<ProposedWrite>,
}
/// Medium loop: handles one puzzle solve attempt.
/// Can propose memory writes but cannot commit them — commitment is the
/// exclusive authority of the embedded `FastGate`.
pub struct MediumLoop {
    /// Fast gate for step-level invariant checking
    pub gate: FastGate,
}
impl MediumLoop {
    /// Build a medium loop whose fast gate enforces `step_limit`.
    pub fn new(step_limit: usize) -> Self {
        Self {
            gate: FastGate::new(step_limit),
        }
    }

    /// Process a solve result and produce an attempt trace.
    /// Proposes memory writes but does NOT commit them.
    pub fn process_result(
        &mut self,
        puzzle_id: &str,
        difficulty: u8,
        strategy: &str,
        steps: usize,
        solved: bool,
        correct: bool,
        constraint_types: &[String],
    ) -> AttemptTrace {
        // Fast-loop invariant check for this attempt.
        let decision = self.gate.check_step(steps, solved, correct);

        // Build the trajectory describing this attempt.
        let mut traj = Trajectory::new(puzzle_id, difficulty);
        traj.constraint_types = constraint_types.to_vec();
        let (outcome, confidence) = if correct {
            ("correct".to_string(), 0.9)
        } else {
            ("incorrect".to_string(), 0.2)
        };
        traj.record_attempt(outcome, confidence, steps, 1, strategy);
        let verdict = if correct {
            Verdict::Success
        } else {
            Verdict::Failed
        };
        traj.set_verdict(verdict, None);

        // Translate the gate decision into proposed (uncommitted) writes.
        let mut proposed_writes = Vec::new();
        match decision {
            GateDecision::Allow => {
                // Propose recording the trajectory.
                proposed_writes.push(ProposedWrite::RecordTrajectory(traj));
            }
            GateDecision::Block { .. } => {
                // Don't record — budget exhausted.
            }
            GateDecision::Quarantine { ref reason } => {
                // Quarantine the trajectory and file a counterexample per
                // constraint type involved.
                proposed_writes.push(ProposedWrite::QuarantineTrajectory {
                    trajectory: traj.clone(),
                    reason: reason.clone(),
                });
                for ct in constraint_types {
                    proposed_writes.push(ProposedWrite::RecordCounterexample {
                        constraint_type: ct.clone(),
                        trajectory: traj.clone(),
                    });
                }
            }
            GateDecision::Rollback { .. } => {
                // Rollback handled at fast loop level
            }
        }

        AttemptTrace {
            puzzle_id: puzzle_id.to_string(),
            strategy: strategy.to_string(),
            steps,
            correct,
            retried: false,
            gate_decisions: vec![decision],
            proposed_writes,
        }
    }

    /// Finalize: transfer proposed writes to the fast gate for commitment.
    pub fn finalize(&mut self, trace: &AttemptTrace) {
        self.gate
            .pending_writes
            .extend(trace.proposed_writes.iter().cloned());
    }

    /// Reset per-attempt gate state for the next attempt.
    pub fn reset(&mut self) {
        self.gate.reset();
    }
}
// ═══════════════════════════════════════════════════════════════════════════
// Slow Loop: per-cycle consolidation
// ═══════════════════════════════════════════════════════════════════════════
/// Log of pattern promotions during a cycle.
///
/// Produced by `SlowLoop::consolidate` from the bank's post-promotion counts.
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct PromotionLog {
    /// Patterns promoted from Volatile → Trusted
    pub promoted: usize,
    /// Patterns demoted from Trusted → Quarantined
    pub demoted: usize,
    /// Patterns remaining in Volatile
    pub volatile_remaining: usize,
    /// Patterns in Trusted
    pub trusted_total: usize,
    /// Patterns in Quarantined
    pub quarantined_total: usize,
}
/// Result of a slow loop cycle.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct CycleConsolidation {
    /// Cycle number
    pub cycle: usize,
    /// Checkpoint created at start of cycle
    pub checkpoint_id: usize,
    /// Promotion log
    pub promotion_log: PromotionLog,
    /// Contract health after consolidation
    /// NOTE(review): always `None` in `consolidate` — confirm who populates it.
    pub contract_health: Option<ContractHealth>,
    /// Whether a rollback occurred
    pub rolled_back: bool,
    /// Rollback witness if rollback occurred
    pub rollback_witness: Option<RollbackWitness>,
}
/// Slow loop: handles per-cycle consolidation.
/// Only component allowed to promote patterns.
pub struct SlowLoop {
    /// History of consolidations, appended in cycle order by `consolidate`.
    pub history: Vec<CycleConsolidation>,
}
impl SlowLoop {
    /// Create an empty slow loop with no consolidation history.
    pub fn new() -> Self {
        Self {
            history: Vec::new(),
        }
    }
    /// Run consolidation: promote eligible patterns, demote failing ones.
    /// This is the ONLY place where pattern promotion happens.
    ///
    /// If `prev_accuracy` is set and holdout accuracy regressed by more than
    /// 0.05, the bank is rolled back to `checkpoint_id` (with a witness)
    /// before promotion runs.
    pub fn consolidate(
        &mut self,
        bank: &mut ReasoningBank,
        cycle: usize,
        checkpoint_id: usize,
        holdout_accuracy: f64,
        prev_accuracy: Option<f64>,
    ) -> CycleConsolidation {
        let mut rolled_back = false;
        let mut rollback_witness = None;
        // Check for regression — if accuracy dropped, rollback
        if let Some(prev) = prev_accuracy {
            if holdout_accuracy < prev - 0.05 {
                let ok = bank.rollback_with_witness(
                    checkpoint_id,
                    "slow loop: accuracy regression",
                    prev,
                    holdout_accuracy,
                );
                if ok {
                    rolled_back = true;
                    rollback_witness = bank.rollback_witnesses.last().cloned();
                }
            }
        }
        // Promote eligible patterns (requires counterexample)
        let promoted = bank.promote_patterns();
        let log = PromotionLog {
            promoted,
            demoted: 0, // Demotions happen in the fast loop
            volatile_remaining: bank.volatile_count(),
            trusted_total: bank.trusted_count(),
            quarantined_total: bank.quarantined_pattern_count(),
        };
        let consolidation = CycleConsolidation {
            cycle,
            checkpoint_id,
            promotion_log: log,
            contract_health: None,
            rolled_back,
            rollback_witness,
        };
        self.history.push(consolidation.clone());
        consolidation
    }
}

/// `Default` mirrors `new()` so `SlowLoop` composes with derived `Default`
/// containers and satisfies clippy's `new_without_default` lint.
impl Default for SlowLoop {
    fn default() -> Self {
        Self::new()
    }
}
// ═══════════════════════════════════════════════════════════════════════════
// Tests
// ═══════════════════════════════════════════════════════════════════════════
#[cfg(test)]
mod tests {
    use super::*;
    // A normal in-budget, non-contradictory step passes the gate.
    #[test]
    fn fast_gate_allows_normal_step() {
        let mut gate = FastGate::new(100);
        let decision = gate.check_step(5, false, false);
        assert_eq!(decision, GateDecision::Allow);
    }
    // Reaching the step limit (boundary is inclusive) must block.
    #[test]
    fn fast_gate_blocks_over_budget() {
        let mut gate = FastGate::new(10);
        let decision = gate.check_step(10, false, false);
        assert!(matches!(decision, GateDecision::Block { .. }));
    }
    // Solved-but-wrong results accumulate; the threshold-th one quarantines.
    #[test]
    fn fast_gate_quarantines_contradictions() {
        let mut gate = FastGate::new(100);
        gate.contradiction_threshold = 2;
        // First contradiction: still allowed
        let d1 = gate.check_step(1, true, false);
        assert_eq!(d1, GateDecision::Allow);
        // Second contradiction: quarantine
        let d2 = gate.check_step(2, true, false);
        assert!(matches!(d2, GateDecision::Quarantine { .. }));
    }
    // commit_writes drains pending writes into the bank and reports the count.
    #[test]
    fn fast_gate_commits_pending_writes() {
        let mut gate = FastGate::new(100);
        let mut bank = ReasoningBank::new();
        let mut traj = Trajectory::new("test_1", 5);
        traj.constraint_types.push("Before".to_string());
        traj.record_attempt("answer".into(), 0.9, 10, 1, "default");
        traj.set_verdict(Verdict::Success, None);
        gate.pending_writes
            .push(ProposedWrite::RecordTrajectory(traj));
        let committed = gate.commit_writes(&mut bank);
        assert_eq!(committed, 1);
        assert_eq!(bank.trajectories.len(), 1);
    }
    // A correct, in-budget result yields exactly one RecordTrajectory proposal.
    #[test]
    fn medium_loop_proposes_writes() {
        let mut medium = MediumLoop::new(100);
        let trace = medium.process_result(
            "puzzle_1",
            5,
            "adaptive",
            15,
            true,
            true,
            &["Before".to_string()],
        );
        assert!(trace.correct);
        assert_eq!(trace.proposed_writes.len(), 1);
        assert!(matches!(
            trace.proposed_writes[0],
            ProposedWrite::RecordTrajectory(_)
        ));
    }
    // A quarantined attempt proposes a quarantine write plus one
    // counterexample write per constraint type.
    #[test]
    fn medium_loop_quarantines_contradictions() {
        let mut medium = MediumLoop::new(100);
        medium.gate.contradiction_threshold = 1;
        // Solved but wrong → quarantine (threshold 1)
        let trace = medium.process_result(
            "puzzle_1",
            5,
            "default",
            15,
            true,
            false,
            &["Month".to_string()],
        );
        assert!(!trace.correct);
        // Should have quarantine + counterexample writes
        assert!(trace.proposed_writes.len() >= 2);
        assert!(trace
            .proposed_writes
            .iter()
            .any(|w| matches!(w, ProposedWrite::QuarantineTrajectory { .. })));
    }
    // With enough successful evidence AND a counterexample on record, the
    // slow loop promotes exactly one pattern to Trusted.
    #[test]
    fn slow_loop_promotes_patterns() {
        let mut bank = ReasoningBank::new();
        bank.evidence_threshold = 3;
        // Build enough observations
        for i in 0..5 {
            let mut traj = Trajectory::new(&format!("s_{}", i), 5);
            traj.constraint_types.push("Year".to_string());
            traj.record_attempt("2024".into(), 0.9, 10, 1, "default");
            traj.set_verdict(Verdict::Success, None);
            bank.record_trajectory(traj);
        }
        // Add counterexample (required for promotion)
        let ce_traj = Trajectory::new("fail_1", 5);
        bank.record_counterexample("Year", ce_traj);
        let cp = bank.checkpoint();
        let mut slow = SlowLoop::new();
        let result = slow.consolidate(&mut bank, 0, cp, 0.95, None);
        assert_eq!(result.promotion_log.promoted, 1);
        assert_eq!(result.promotion_log.trusted_total, 1);
        assert!(!result.rolled_back);
    }
    // An accuracy drop beyond 0.05 triggers a witnessed rollback to the
    // checkpoint, discarding the post-checkpoint trajectories.
    #[test]
    fn slow_loop_rolls_back_on_regression() {
        let mut bank = ReasoningBank::new();
        for i in 0..3 {
            let mut traj = Trajectory::new(&format!("r_{}", i), 5);
            traj.constraint_types.push("DayOfWeek".to_string());
            traj.record_attempt("answer".into(), 0.9, 10, 1, "default");
            traj.set_verdict(Verdict::Success, None);
            bank.record_trajectory(traj);
        }
        let cp = bank.checkpoint();
        // Simulate bad learning
        for i in 3..6 {
            let mut traj = Trajectory::new(&format!("r_{}", i), 5);
            traj.constraint_types.push("DayOfWeek".to_string());
            traj.record_attempt("wrong".into(), 0.1, 50, 1, "default");
            traj.set_verdict(Verdict::Failed, None);
            bank.record_trajectory(traj);
        }
        let mut slow = SlowLoop::new();
        // Previous accuracy 0.95, current 0.80 → regression > 0.05
        let result = slow.consolidate(&mut bank, 1, cp, 0.80, Some(0.95));
        assert!(result.rolled_back);
        assert!(result.rollback_witness.is_some());
        assert_eq!(bank.trajectories.len(), 3); // Rolled back to checkpoint
    }
    // End-to-end: medium loop proposes, fast loop commits, slow loop promotes.
    #[test]
    fn three_loop_integration() {
        let mut bank = ReasoningBank::new();
        bank.evidence_threshold = 2;
        // === Cycle 1 ===
        let cp = bank.checkpoint();
        // Medium loop: solve puzzles
        let mut medium = MediumLoop::new(100);
        for i in 0..5 {
            let trace = medium.process_result(
                &format!("p_{}", i),
                5,
                "adaptive",
                10,
                true,
                true,
                &["Before".to_string()],
            );
            medium.finalize(&trace);
        }
        // Fast loop: commit writes
        let committed = medium.gate.commit_writes(&mut bank);
        assert_eq!(committed, 5);
        medium.reset();
        // Add counterexample (for promotion eligibility)
        let ce = Trajectory::new("ce_1", 5);
        bank.record_counterexample("Before", ce);
        // Slow loop: consolidate
        let mut slow = SlowLoop::new();
        let consolidation = slow.consolidate(&mut bank, 0, cp, 0.90, None);
        assert!(consolidation.promotion_log.promoted > 0);
        assert_eq!(bank.trusted_count(), 1);
    }
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,648 @@
//! RVF Artifact Packaging
//!
//! Packages an intelligence experiment as a self-contained, reproducible artifact.
//! Aligns with the "identical graded outcomes, not identical tokens" promise.
//!
//! ## Contents
//!
//! 1. **Manifest**: Engine version, pinned configs, seed set, holdout IDs
//! 2. **Memory Snapshot**: ReasoningBank serialized, KnowledgeCompiler cache, promotion log
//! 3. **Graders**: Deterministic scoring + ContractHealth evaluation
//! 4. **Witness Chain**: Per-episode input/config/grade/memory hashes
//!
//! ## Run Modes
//!
//! - **Replay**: Uses stored tasks, stored grades, verifies witness chain
//! - **Verify**: Regenerates tasks from seeds, reruns grader, must match grades exactly
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use crate::agi_contract::ContractHealth;
use crate::reasoning_bank::{MemoryClass, RollbackWitness};
// ═══════════════════════════════════════════════════════════════════════════
// Manifest
// ═══════════════════════════════════════════════════════════════════════════
/// RVF Artifact Manifest — top-level metadata.
///
/// Pins every input needed to reproduce the experiment deterministically:
/// engine version, solver/generator configuration, RNG seeds, and the
/// frozen holdout set.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct RvfManifest {
    /// Format version
    pub rvf_version: String,
    /// Engine version that produced this artifact
    pub engine_version: String,
    /// Pinned solver configuration
    pub solver_config: SolverConfig,
    /// Pinned generator configuration
    pub generator_config: GeneratorConfig,
    /// Seed set used for generation
    pub seed_set: SeedSet,
    /// Holdout puzzle IDs (frozen set)
    pub holdout_ids: Vec<String>,
    /// Number of training cycles
    pub cycles: usize,
    /// Creation timestamp (tests use RFC 3339 strings — TODO confirm producers do too)
    pub created_at: String,
    /// SHA-256 of the full artifact (computed after serialization; `None` until set)
    pub artifact_hash: Option<String>,
}
/// Pinned solver configuration.
///
/// Stored in the manifest so a Verify run can rebuild the exact solver.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SolverConfig {
    /// Step budget per task
    pub step_budget: usize,
    /// Noise injection rate (presumably 0.0–1.0 — confirm)
    pub noise_rate: f64,
    /// Retry enabled
    pub retry_enabled: bool,
    /// Beam width
    pub beam_width: usize,
    /// Minimum accuracy threshold
    pub min_accuracy: f64,
}
/// Pinned generator configuration.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct GeneratorConfig {
    /// Min difficulty
    pub min_difficulty: u8,
    /// Max difficulty
    pub max_difficulty: u8,
    /// Constraint density
    /// NOTE(review): `usize` here while difficulty bounds are `u8` — confirm
    /// the intended integer width for density.
    pub constraint_density: usize,
    /// Domain type (e.g., "temporal_puzzles", "program_synthesis")
    pub domain: String,
}
/// Seed set for deterministic replay.
///
/// Three independent seeds keep holdout generation, training generation,
/// and noise injection decoupled from one another.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SeedSet {
    /// Holdout generation seed (frozen)
    pub holdout_seed: u64,
    /// Training base seed
    pub training_seed: u64,
    /// Noise RNG seed
    pub noise_seed: u64,
}
// ═══════════════════════════════════════════════════════════════════════════
// Memory Snapshot
// ═══════════════════════════════════════════════════════════════════════════
/// Serialized memory state at a point in time.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct MemorySnapshot {
    /// Serialized ReasoningBank (bincode or JSON — format chosen by the producer)
    pub reasoning_bank_data: Vec<u8>,
    /// KnowledgeCompiler cache entries
    pub compiler_cache: Vec<CompiledEntry>,
    /// Promotion log: patterns promoted during this experiment
    pub promotion_log: Vec<PromotionRecord>,
    /// Memory class summary
    pub class_summary: MemoryClassSummary,
}
/// A compiled knowledge entry (from KnowledgeCompiler).
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct CompiledEntry {
    /// Constraint signature
    pub signature: String,
    /// Compiled solution
    pub solution: String,
    /// Max steps the compiled path takes
    pub max_steps: usize,
    /// Confidence in compiled solution (presumably 0.0–1.0 — confirm)
    pub confidence: f64,
    /// Number of times this entry was used
    pub hit_count: usize,
}
/// Record of a pattern promotion.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct PromotionRecord {
    /// Constraint type
    pub constraint_type: String,
    /// Strategy name
    pub strategy: String,
    /// From class (presumably a stringified `MemoryClass` — confirm)
    pub from_class: String,
    /// To class (presumably a stringified `MemoryClass` — confirm)
    pub to_class: String,
    /// Number of observations at promotion time
    pub observations: usize,
    /// Number of counterexamples at promotion time
    pub counterexamples: usize,
    /// Cycle when promotion occurred
    pub cycle: usize,
}
/// Summary of memory classes.
///
/// Counters presumably mirror the `MemoryClass` variants — confirm against
/// `reasoning_bank`.
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct MemoryClassSummary {
    /// Count of volatile (not-yet-promoted) memories
    pub volatile: usize,
    /// Count of trusted (promoted) memories
    pub trusted: usize,
    /// Count of quarantined memories
    pub quarantined: usize,
    /// Total counterexamples recorded
    pub total_counterexamples: usize,
    /// Total rollback witnesses recorded
    pub total_rollback_witnesses: usize,
}
// ═══════════════════════════════════════════════════════════════════════════
// Witness Chain
// ═══════════════════════════════════════════════════════════════════════════
/// Per-episode witness record for auditability.
///
/// NOTE(review): field docs say SHA-256, but the in-module tests populate
/// these fields with `fnv_hash` (64-bit FNV-1a) — confirm which digest
/// producers are required to use.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct WitnessRecord {
    /// Episode/cycle number
    pub episode: usize,
    /// SHA-256 of input (puzzle set)
    pub input_hash: String,
    /// SHA-256 of config
    pub config_hash: String,
    /// SHA-256 of grade outputs
    pub grade_hash: String,
    /// Memory root hash before this episode
    pub memory_root_before: String,
    /// Memory root hash after this episode (must equal the next record's `memory_root_before`)
    pub memory_root_after: String,
    /// Gate decisions hash
    pub gate_decisions_hash: String,
    /// Contract health at end of episode
    pub contract_health: ContractHealth,
}
/// Complete witness chain for the experiment.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct WitnessChain {
    /// Ordered witness records (one per cycle)
    pub records: Vec<WitnessRecord>,
    /// Rollback witnesses that occurred during the experiment
    pub rollback_witnesses: Vec<RollbackWitness>,
    /// Final combined hash of the entire chain (`None` until computed)
    pub chain_hash: Option<String>,
}
// ═══════════════════════════════════════════════════════════════════════════
// RVF Artifact (top-level)
// ═══════════════════════════════════════════════════════════════════════════
/// Complete RVF artifact — everything needed to replay or verify an experiment.
///
/// Integrity of the embedded chain is checked by `verify_witness_chain`.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct RvfArtifact {
    /// Manifest with pinned configuration
    pub manifest: RvfManifest,
    /// Memory snapshot
    pub memory: MemorySnapshot,
    /// Witness chain
    pub witness_chain: WitnessChain,
    /// Final contract health
    pub final_health: ContractHealth,
    /// Final IQ score
    pub final_iq: f64,
}
/// Run mode for artifact verification.
#[derive(Clone, Debug, PartialEq)]
pub enum RunMode {
/// Use stored tasks, stored grades, verify witness chain
Replay,
/// Regenerate tasks from seeds, rerun grader, grades must match
Verify,
}
// ═══════════════════════════════════════════════════════════════════════════
// Builder
// ═══════════════════════════════════════════════════════════════════════════
/// Builder for assembling an RVF artifact from experiment results.
///
/// `manifest`, `memory`, and `final_health` are required for `build`;
/// witnesses are appended incrementally and `final_iq` defaults to 0.0.
pub struct RvfArtifactBuilder {
    /// Pinned manifest (required)
    manifest: Option<RvfManifest>,
    /// Memory snapshot (required)
    memory: Option<MemorySnapshot>,
    /// Ordered per-episode witness records
    witness_records: Vec<WitnessRecord>,
    /// Rollback witnesses gathered during the run
    rollback_witnesses: Vec<RollbackWitness>,
    /// Final contract health (required)
    final_health: Option<ContractHealth>,
    /// Final IQ score (0.0 unless `final_iq` is called)
    final_iq: f64,
}
impl RvfArtifactBuilder {
    /// Create an empty builder with no manifest, memory, or witnesses.
    pub fn new() -> Self {
        Self {
            manifest: None,
            memory: None,
            witness_records: Vec::new(),
            rollback_witnesses: Vec::new(),
            final_health: None,
            final_iq: 0.0,
        }
    }
    /// Set the pinned manifest (required for `build`).
    pub fn manifest(mut self, manifest: RvfManifest) -> Self {
        self.manifest = Some(manifest);
        self
    }
    /// Set the memory snapshot (required for `build`).
    pub fn memory(mut self, memory: MemorySnapshot) -> Self {
        self.memory = Some(memory);
        self
    }
    /// Append a per-episode witness record (insertion order defines the chain).
    pub fn add_witness(&mut self, record: WitnessRecord) {
        self.witness_records.push(record);
    }
    /// Append a rollback witness observed during the experiment.
    pub fn add_rollback_witness(&mut self, witness: RollbackWitness) {
        self.rollback_witnesses.push(witness);
    }
    /// Set the final contract health (required for `build`).
    pub fn final_health(mut self, health: ContractHealth) -> Self {
        self.final_health = Some(health);
        self
    }
    /// Set the final IQ score (defaults to 0.0 if never called).
    pub fn final_iq(mut self, iq: f64) -> Self {
        self.final_iq = iq;
        self
    }
    /// Build the artifact. Returns None if required fields are missing.
    ///
    /// The chain's `chain_hash` is left as `None`; callers compute it after
    /// serialization.
    pub fn build(self) -> Option<RvfArtifact> {
        let manifest = self.manifest?;
        let memory = self.memory?;
        let final_health = self.final_health?;
        Some(RvfArtifact {
            manifest,
            memory,
            witness_chain: WitnessChain {
                records: self.witness_records,
                rollback_witnesses: self.rollback_witnesses,
                chain_hash: None,
            },
            final_health,
            final_iq: self.final_iq,
        })
    }
}
/// `Default` mirrors `new()`; satisfies clippy's `new_without_default` and
/// lets the builder be used where a `Default` bound is expected.
impl Default for RvfArtifactBuilder {
    fn default() -> Self {
        Self::new()
    }
}
// ═══════════════════════════════════════════════════════════════════════════
// Hash utilities (simple deterministic hashing for witness chain)
// ═══════════════════════════════════════════════════════════════════════════
/// Simple deterministic hash for reproducibility checks.
///
/// 64-bit FNV-1a over the raw bytes, rendered as a fixed-width 16-character
/// lowercase hex string. Not cryptographic — used only for cheap
/// tamper-evidence in the witness chain.
pub fn fnv_hash(data: &[u8]) -> String {
    const FNV_OFFSET_BASIS: u64 = 0xcbf29ce484222325;
    const FNV_PRIME: u64 = 0x100000001b3;
    let digest = data
        .iter()
        .fold(FNV_OFFSET_BASIS, |acc, &byte| {
            (acc ^ u64::from(byte)).wrapping_mul(FNV_PRIME)
        });
    format!("{digest:016x}")
}
/// Hash a serializable value.
///
/// Serializes `value` to JSON and FNV-hashes the bytes. NOTE(review):
/// `unwrap_or_default` means a serialization failure silently hashes the
/// empty byte string, so all unserializable values collide on one hash —
/// confirm this fallback is intended.
pub fn hash_value<T: Serialize>(value: &T) -> String {
    let json = serde_json::to_vec(value).unwrap_or_default();
    fnv_hash(&json)
}
// ═══════════════════════════════════════════════════════════════════════════
// Verification
// ═══════════════════════════════════════════════════════════════════════════
/// Result of artifact verification.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct VerificationResult {
    /// Overall pass/fail (`mismatches == 0 && chain_intact`)
    pub passed: bool,
    /// Per-witness verification
    pub witness_checks: Vec<WitnessCheck>,
    /// Number of failed checks (empty hashes and broken memory transitions)
    pub mismatches: usize,
    /// Chain integrity (each record's before-root matches the previous after-root)
    pub chain_intact: bool,
}
/// Single witness check result.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct WitnessCheck {
    /// Episode number copied from the checked record
    pub episode: usize,
    /// Input hash was non-empty
    pub input_hash_ok: bool,
    /// Grade hash was non-empty
    pub grade_hash_ok: bool,
    /// `memory_root_before` matched the previous record's `memory_root_after`
    /// (always true for the first record)
    pub memory_transition_ok: bool,
}
/// Verify an artifact's witness chain integrity.
///
/// Per record, three checks are made: the input hash is non-empty, the grade
/// hash is non-empty, and `memory_root_before` equals the previous record's
/// `memory_root_after` (the first record is exempt). Every failed check adds
/// one mismatch; a broken memory transition additionally marks the chain as
/// not intact.
pub fn verify_witness_chain(artifact: &RvfArtifact) -> VerificationResult {
    let records = &artifact.witness_chain.records;
    let mut witness_checks = Vec::with_capacity(records.len());
    let mut mismatches = 0usize;
    let mut chain_intact = true;
    // `None` marks "no predecessor yet" for the first record.
    let mut expected_before: Option<String> = None;

    for record in records {
        let input_hash_ok = !record.input_hash.is_empty();
        let grade_hash_ok = !record.grade_hash.is_empty();
        // Memory transition: after(N-1) == before(N).
        let memory_transition_ok = match expected_before.as_deref() {
            None => true,
            Some(prev_after) => record.memory_root_before == prev_after,
        };

        mismatches += [input_hash_ok, grade_hash_ok, memory_transition_ok]
            .iter()
            .filter(|&&ok| !ok)
            .count();
        if !memory_transition_ok {
            chain_intact = false;
        }

        expected_before = Some(record.memory_root_after.clone());
        witness_checks.push(WitnessCheck {
            episode: record.episode,
            input_hash_ok,
            grade_hash_ok,
            memory_transition_ok,
        });
    }

    VerificationResult {
        passed: mismatches == 0 && chain_intact,
        witness_checks,
        mismatches,
        chain_intact,
    }
}
// ═══════════════════════════════════════════════════════════════════════════
// Tests
// ═══════════════════════════════════════════════════════════════════════════
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn fnv_hash_deterministic() {
    // Identical bytes must always produce the identical digest…
    assert_eq!(fnv_hash(b"hello world"), fnv_hash(b"hello world"));
    // …while a single-byte difference must change it.
    assert_ne!(fnv_hash(b"hello world"), fnv_hash(b"hello world!"));
}
#[test]
fn artifact_builder_works() {
    // Fully-populated manifest with pinned configs and seeds.
    let manifest = RvfManifest {
        rvf_version: "1.0".to_string(),
        engine_version: "0.1.0".to_string(),
        solver_config: SolverConfig {
            step_budget: 400,
            noise_rate: 0.25,
            retry_enabled: true,
            beam_width: 3,
            min_accuracy: 0.80,
        },
        generator_config: GeneratorConfig {
            min_difficulty: 1,
            max_difficulty: 10,
            constraint_density: 3,
            domain: "temporal_puzzles".to_string(),
        },
        seed_set: SeedSet {
            holdout_seed: 0xDEAD_BEEF,
            training_seed: 42,
            noise_seed: 31337,
        },
        holdout_ids: vec!["p1".into(), "p2".into()],
        cycles: 10,
        created_at: "2026-02-15T00:00:00Z".to_string(),
        artifact_hash: None,
    };
    let memory = MemorySnapshot {
        reasoning_bank_data: vec![1, 2, 3],
        compiler_cache: Vec::new(),
        promotion_log: Vec::new(),
        class_summary: MemoryClassSummary::default(),
    };
    let health = ContractHealth {
        solved_per_cost: 0.85,
        noise_stability: 0.92,
        contradiction_rate: 0.01,
        rollback_correctness: 1.0,
        policy_violations: 0,
        accuracy: 0.95,
        cost_efficiency: 0.85,
        compliant: true,
    };
    // All three required builder fields supplied, so build() must succeed.
    let artifact = RvfArtifactBuilder::new()
        .manifest(manifest)
        .memory(memory)
        .final_health(health)
        .final_iq(95.0)
        .build();
    assert!(artifact.is_some());
    let a = artifact.unwrap();
    assert_eq!(a.manifest.rvf_version, "1.0");
    assert_eq!(a.final_iq, 95.0);
    assert!(a.final_health.compliant);
}
#[test]
fn witness_chain_verification() {
    let mut builder = RvfArtifactBuilder::new();
    // Build a 3-episode witness chain with consistent memory transitions
    let mem_root_0 = fnv_hash(b"initial");
    let mem_root_1 = fnv_hash(b"after_cycle_1");
    let mem_root_2 = fnv_hash(b"after_cycle_2");
    let mem_root_3 = fnv_hash(b"after_cycle_3");
    let health = ContractHealth {
        solved_per_cost: 0.9,
        noise_stability: 0.95,
        contradiction_rate: 0.0,
        rollback_correctness: 1.0,
        policy_violations: 0,
        accuracy: 0.95,
        cost_efficiency: 0.90,
        compliant: true,
    };
    builder.add_witness(WitnessRecord {
        episode: 0,
        input_hash: fnv_hash(b"input_0"),
        config_hash: fnv_hash(b"config"),
        grade_hash: fnv_hash(b"grade_0"),
        memory_root_before: mem_root_0.clone(),
        memory_root_after: mem_root_1.clone(),
        gate_decisions_hash: fnv_hash(b"gates_0"),
        contract_health: health.clone(),
    });
    builder.add_witness(WitnessRecord {
        episode: 1,
        input_hash: fnv_hash(b"input_1"),
        config_hash: fnv_hash(b"config"),
        grade_hash: fnv_hash(b"grade_1"),
        memory_root_before: mem_root_1.clone(), // matches prev after
        memory_root_after: mem_root_2.clone(),
        gate_decisions_hash: fnv_hash(b"gates_1"),
        contract_health: health.clone(),
    });
    builder.add_witness(WitnessRecord {
        episode: 2,
        input_hash: fnv_hash(b"input_2"),
        config_hash: fnv_hash(b"config"),
        grade_hash: fnv_hash(b"grade_2"),
        memory_root_before: mem_root_2.clone(), // matches prev after
        memory_root_after: mem_root_3.clone(),
        gate_decisions_hash: fnv_hash(b"gates_2"),
        contract_health: health.clone(),
    });
    let manifest = RvfManifest {
        rvf_version: "1.0".to_string(),
        engine_version: "0.1.0".to_string(),
        solver_config: SolverConfig {
            step_budget: 400,
            noise_rate: 0.25,
            retry_enabled: true,
            beam_width: 3,
            min_accuracy: 0.80,
        },
        generator_config: GeneratorConfig {
            min_difficulty: 1,
            max_difficulty: 10,
            constraint_density: 3,
            domain: "temporal_puzzles".to_string(),
        },
        seed_set: SeedSet {
            holdout_seed: 0xDEAD_BEEF,
            training_seed: 42,
            noise_seed: 31337,
        },
        holdout_ids: Vec::new(),
        cycles: 3,
        created_at: "2026-02-15T00:00:00Z".to_string(),
        artifact_hash: None,
    };
    let artifact = RvfArtifactBuilder::new()
        .manifest(manifest)
        .memory(MemorySnapshot {
            reasoning_bank_data: Vec::new(),
            compiler_cache: Vec::new(),
            promotion_log: Vec::new(),
            class_summary: MemoryClassSummary::default(),
        })
        .final_health(health)
        .final_iq(90.0);
    // Transfer witnesses
    // (the chained builder above is a second instance; the records were
    // accumulated on `builder`, so splice them in after build()).
    let mut artifact_raw = artifact.build().unwrap();
    artifact_raw.witness_chain.records = builder.witness_records;
    let result = verify_witness_chain(&artifact_raw);
    // Well-formed chain: all hashes present, every transition consistent.
    assert!(result.passed);
    assert!(result.chain_intact);
    assert_eq!(result.mismatches, 0);
    assert_eq!(result.witness_checks.len(), 3);
}
#[test]
fn witness_chain_detects_tampering() {
    let health = ContractHealth {
        solved_per_cost: 0.9,
        noise_stability: 0.95,
        contradiction_rate: 0.0,
        rollback_correctness: 1.0,
        policy_violations: 0,
        accuracy: 0.95,
        cost_efficiency: 0.90,
        compliant: true,
    };
    // Artifact constructed directly (no builder) with a deliberately broken
    // memory transition between episode 0 and episode 1.
    let mut artifact = RvfArtifact {
        manifest: RvfManifest {
            rvf_version: "1.0".to_string(),
            engine_version: "0.1.0".to_string(),
            solver_config: SolverConfig {
                step_budget: 400,
                noise_rate: 0.25,
                retry_enabled: true,
                beam_width: 3,
                min_accuracy: 0.80,
            },
            generator_config: GeneratorConfig {
                min_difficulty: 1,
                max_difficulty: 10,
                constraint_density: 3,
                domain: "temporal_puzzles".to_string(),
            },
            seed_set: SeedSet {
                holdout_seed: 0xDEAD_BEEF,
                training_seed: 42,
                noise_seed: 31337,
            },
            holdout_ids: Vec::new(),
            cycles: 2,
            created_at: "2026-02-15T00:00:00Z".to_string(),
            artifact_hash: None,
        },
        memory: MemorySnapshot {
            reasoning_bank_data: Vec::new(),
            compiler_cache: Vec::new(),
            promotion_log: Vec::new(),
            class_summary: MemoryClassSummary::default(),
        },
        witness_chain: WitnessChain {
            records: vec![
                WitnessRecord {
                    episode: 0,
                    input_hash: fnv_hash(b"in_0"),
                    config_hash: fnv_hash(b"cfg"),
                    grade_hash: fnv_hash(b"gr_0"),
                    memory_root_before: fnv_hash(b"mem_0"),
                    memory_root_after: fnv_hash(b"mem_1"),
                    gate_decisions_hash: fnv_hash(b"g_0"),
                    contract_health: health.clone(),
                },
                WitnessRecord {
                    episode: 1,
                    input_hash: fnv_hash(b"in_1"),
                    config_hash: fnv_hash(b"cfg"),
                    grade_hash: fnv_hash(b"gr_1"),
                    // TAMPERED: memory_root_before doesn't match previous after
                    memory_root_before: fnv_hash(b"WRONG"),
                    memory_root_after: fnv_hash(b"mem_2"),
                    gate_decisions_hash: fnv_hash(b"g_1"),
                    contract_health: health.clone(),
                },
            ],
            rollback_witnesses: Vec::new(),
            chain_hash: None,
        },
        final_health: health,
        final_iq: 90.0,
    };
    let result = verify_witness_chain(&artifact);
    // The broken transition must fail the chain and count as a mismatch.
    assert!(!result.passed);
    assert!(!result.chain_intact);
    assert!(result.mismatches > 0);
}
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,382 @@
//! Swarm Controller Regret Tracking
//!
//! Implements sublinear regret metrics for multi-agent control:
//! - Episode-based regret computation
//! - Oracle baseline comparison
//! - Regret curve tracking (R_k/k should decrease)
//!
//! Based on research on sublinear regret in multi-agent and LLM-agent settings
use serde::{Deserialize, Serialize};
use std::collections::VecDeque;
/// Episode result from agent execution
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct EpisodeResult {
    /// Episode number
    pub episode: usize,
    /// Number of puzzles/tasks in episode
    pub num_tasks: usize,
    /// Tasks solved
    pub solved: usize,
    /// Correct solutions (numerator of `accuracy()`)
    pub correct: usize,
    /// Total steps taken
    pub total_steps: usize,
    /// Total tool calls
    pub tool_calls: usize,
    /// Total latency in ms
    pub latency_ms: u64,
    /// Agent reward (e.g., accuracy * 100 - steps / 10)
    pub reward: f64,
    /// Oracle reward (best possible performance); regret is the gap to this
    pub oracle_reward: f64,
}
impl EpisodeResult {
    /// Instantaneous regret: how far the agent fell short of the oracle,
    /// floored at zero (beating the oracle earns no negative regret).
    pub fn regret(&self) -> f64 {
        let shortfall = self.oracle_reward - self.reward;
        if shortfall > 0.0 { shortfall } else { 0.0 }
    }
    /// Fraction of tasks answered correctly; 0.0 for an empty episode.
    pub fn accuracy(&self) -> f64 {
        match self.num_tasks {
            0 => 0.0,
            n => self.correct as f64 / n as f64,
        }
    }
}
/// Regret tracker for swarm controller
///
/// Maintains parallel per-episode curves (cumulative regret and average
/// regret R_k/k) plus a bounded window of recent rewards.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct RegretTracker {
    /// Episode results
    pub episodes: Vec<EpisodeResult>,
    /// Cumulative regret history (one entry per recorded episode)
    pub cumulative_regret: Vec<f64>,
    /// Average regret history (R_k/k, one entry per recorded episode)
    pub average_regret: Vec<f64>,
    /// Window size for moving average
    pub window_size: usize,
    /// Recent rewards for moving average (length bounded by `window_size`)
    recent_rewards: VecDeque<f64>,
}
impl Default for RegretTracker {
    /// Default tracker uses a 20-episode moving-average window.
    fn default() -> Self {
        Self::new(20)
    }
}
impl RegretTracker {
    /// Create a new tracker with the given moving-average window size.
    pub fn new(window_size: usize) -> Self {
        Self {
            episodes: Vec::new(),
            cumulative_regret: Vec::new(),
            average_regret: Vec::new(),
            window_size,
            recent_rewards: VecDeque::with_capacity(window_size),
        }
    }
    /// Record an episode: extend both regret curves and fold the reward
    /// into the sliding window.
    pub fn record_episode(&mut self, result: EpisodeResult) {
        let cumulative = self.current_cumulative_regret() + result.regret();
        self.cumulative_regret.push(cumulative);
        // Average regret R_k/k, where k counts episodes including this one.
        let k = (self.episodes.len() + 1) as f64;
        self.average_regret.push(cumulative / k);
        // Sliding reward window: evict the oldest entry once over capacity.
        self.recent_rewards.push_back(result.reward);
        while self.recent_rewards.len() > self.window_size {
            self.recent_rewards.pop_front();
        }
        self.episodes.push(result);
    }
    /// Total regret accumulated so far (0.0 before any episode).
    pub fn current_cumulative_regret(&self) -> f64 {
        self.cumulative_regret.last().copied().unwrap_or(0.0)
    }
    /// Latest R_k/k value (0.0 before any episode).
    pub fn current_average_regret(&self) -> f64 {
        self.average_regret.last().copied().unwrap_or(0.0)
    }
    /// Heuristic sublinearity check: the newest average-regret value must
    /// be below the one from five episodes ago. With fewer than five data
    /// points this optimistically reports `true`.
    pub fn is_sublinear(&self) -> bool {
        let n = self.average_regret.len();
        if n < 5 {
            return true; // Not enough data
        }
        self.average_regret[n - 1] < self.average_regret[n - 5]
    }
    /// Slope of a least-squares line through the last (up to 10)
    /// average-regret values; negative means regret is shrinking.
    pub fn regret_trend(&self) -> f64 {
        let n = self.average_regret.len();
        if n < 2 {
            return 0.0;
        }
        let window = n.min(10);
        let recent = &self.average_regret[n - window..];
        let x_mean = (window - 1) as f64 / 2.0;
        let y_mean = recent.iter().sum::<f64>() / window as f64;
        // Accumulate the regression numerator and denominator in one pass.
        let (num, den) = recent
            .iter()
            .enumerate()
            .fold((0.0, 0.0), |(num, den), (i, y)| {
                let dx = i as f64 - x_mean;
                (num + dx * (y - y_mean), den + dx * dx)
            });
        if den.abs() < 1e-10 {
            0.0
        } else {
            num / den
        }
    }
    /// Mean reward over the sliding window (0.0 when empty).
    pub fn moving_average_reward(&self) -> f64 {
        let count = self.recent_rewards.len();
        if count == 0 {
            return 0.0;
        }
        self.recent_rewards.iter().sum::<f64>() / count as f64
    }
    /// Snapshot of all summary statistics.
    pub fn summary(&self) -> RegretSummary {
        let total_episodes = self.episodes.len();
        // Mean of an arbitrary per-episode metric; 0.0 for an empty tracker.
        let mean_of = |metric: &dyn Fn(&EpisodeResult) -> f64| -> f64 {
            if total_episodes == 0 {
                0.0
            } else {
                self.episodes.iter().map(|e| metric(e)).sum::<f64>() / total_episodes as f64
            }
        };
        RegretSummary {
            total_episodes,
            total_regret: self.current_cumulative_regret(),
            average_regret: self.current_average_regret(),
            regret_trend: self.regret_trend(),
            is_sublinear: self.is_sublinear(),
            average_accuracy: mean_of(&|e| e.accuracy()),
            average_reward: mean_of(&|e| e.reward),
            moving_average_reward: self.moving_average_reward(),
        }
    }
}
/// Regret summary statistics
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct RegretSummary {
    /// Number of recorded episodes
    pub total_episodes: usize,
    /// Cumulative regret over all episodes
    pub total_regret: f64,
    /// Latest average regret (R_k/k)
    pub average_regret: f64,
    /// Least-squares slope of recent average regret (negative = improving)
    pub regret_trend: f64,
    /// Whether average regret is trending down (sublinear regret)
    pub is_sublinear: bool,
    /// Mean per-episode accuracy
    pub average_accuracy: f64,
    /// Mean per-episode reward
    pub average_reward: f64,
    /// Mean reward over the sliding window
    pub moving_average_reward: f64,
}
/// Oracle baseline for computing optimal rewards
///
/// NOTE(review): unlike the neighboring types this one does not derive the
/// serde traits — confirm it never needs to be persisted.
#[derive(Clone, Debug)]
pub struct OracleBaseline {
    /// Perfect accuracy reward
    pub perfect_accuracy_reward: f64,
    /// Step penalty factor (cost per step)
    pub step_penalty: f64,
    /// Minimum steps for optimal solution
    pub min_steps: usize,
}
impl Default for OracleBaseline {
    /// Defaults: 100-point accuracy payoff, 0.1 cost per step, and an
    /// assumed optimum of 5 steps per task.
    fn default() -> Self {
        Self {
            perfect_accuracy_reward: 100.0,
            step_penalty: 0.1,
            min_steps: 5,
        }
    }
}
impl OracleBaseline {
    /// Reward an oracle would earn on `num_tasks`: the full accuracy payoff
    /// minus the step penalty for solving every task in `min_steps`.
    pub fn compute_reward(&self, num_tasks: usize) -> f64 {
        let optimal_steps = (self.min_steps * num_tasks) as f64;
        self.perfect_accuracy_reward - optimal_steps * self.step_penalty
    }
}
/// Swarm controller with regret tracking
pub struct SwarmController {
    /// Regret tracker
    pub regret: RegretTracker,
    /// Oracle baseline
    pub oracle: OracleBaseline,
    /// Current episode number (1-based after the first `start_episode`)
    pub current_episode: usize,
    /// Tasks per episode
    pub tasks_per_episode: usize,
}
impl Default for SwarmController {
    /// Default controller runs 20 tasks per episode.
    fn default() -> Self {
        Self::new(20)
    }
}
impl SwarmController {
    /// Create a controller expecting `tasks_per_episode` tasks per episode,
    /// with a 20-episode regret window and the default oracle.
    pub fn new(tasks_per_episode: usize) -> Self {
        Self {
            regret: RegretTracker::new(20),
            oracle: OracleBaseline::default(),
            current_episode: 0,
            tasks_per_episode,
        }
    }
    /// Advance the episode counter.
    pub fn start_episode(&mut self) {
        self.current_episode += 1;
    }
    /// Record a finished episode: derive the agent and oracle rewards and
    /// hand the result to the regret tracker.
    pub fn complete_episode(
        &mut self,
        solved: usize,
        correct: usize,
        total_steps: usize,
        tool_calls: usize,
        latency_ms: u64,
    ) {
        let num_tasks = self.tasks_per_episode;
        let accuracy = match num_tasks {
            0 => 0.0,
            n => correct as f64 / n as f64,
        };
        // Agent reward: accuracy payoff minus a per-step cost.
        let accuracy_payoff = accuracy * self.oracle.perfect_accuracy_reward;
        let step_cost = total_steps as f64 * self.oracle.step_penalty;
        self.regret.record_episode(EpisodeResult {
            episode: self.current_episode,
            num_tasks,
            solved,
            correct,
            total_steps,
            tool_calls,
            latency_ms,
            reward: accuracy_payoff - step_cost,
            oracle_reward: self.oracle.compute_reward(num_tasks),
        });
    }
    /// Condensed status view for dashboards/logging.
    pub fn status(&self) -> SwarmStatus {
        let summary = self.regret.summary();
        SwarmStatus {
            episode: self.current_episode,
            cumulative_regret: summary.total_regret,
            average_regret: summary.average_regret,
            is_improving: summary.is_sublinear,
            accuracy: summary.average_accuracy,
        }
    }
}
/// Swarm controller status
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SwarmStatus {
    /// Current episode number
    pub episode: usize,
    /// Total regret accumulated so far
    pub cumulative_regret: f64,
    /// Latest average regret (R_k/k)
    pub average_regret: f64,
    /// True when average regret is trending down (sublinear regret)
    pub is_improving: bool,
    /// Mean per-episode accuracy
    pub accuracy: f64,
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_regret_tracking() {
    let mut tracker = RegretTracker::new(10);
    // Simulate improving performance: accuracy climbs 0.50 -> 0.95 while the
    // step count falls, so per-episode regret against the fixed 99.0 oracle
    // reward shrinks every episode.
    for i in 0..10 {
        let accuracy = 0.5 + 0.05 * i as f64;
        let result = EpisodeResult {
            episode: i + 1,
            num_tasks: 20,
            solved: (20.0 * accuracy) as usize,
            correct: (20.0 * accuracy) as usize,
            total_steps: 100 - i * 5,
            tool_calls: 20,
            latency_ms: 1000,
            reward: accuracy * 100.0 - (100 - i * 5) as f64 * 0.1,
            oracle_reward: 99.0,
        };
        tracker.record_episode(result);
    }
    // Shrinking regret must register as sublinear with a negative trend slope.
    assert!(tracker.is_sublinear());
    assert!(tracker.regret_trend() < 0.0);
}
#[test]
fn test_swarm_controller() {
    let mut controller = SwarmController::new(20);
    // Run five identical, high-accuracy episodes (17/20 correct).
    (0..5).for_each(|_| {
        controller.start_episode();
        controller.complete_episode(18, 17, 80, 20, 500);
    });
    let SwarmStatus { episode, accuracy, .. } = controller.status();
    assert_eq!(episode, 5);
    assert!(accuracy > 0.8);
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,657 @@
//! TimePuzzles Generator
//!
//! Generates constraint-based temporal reasoning puzzles
//! based on the TimePuzzles benchmark methodology (arXiv:2601.07148)
//!
//! Key features:
//! - Factual temporal anchors with calendar relations
//! - Cross-cultural date systems
//! - Controlled difficulty levels
//! - Dynamic puzzle generation
use crate::temporal::{TemporalConstraint, TemporalPuzzle};
use anyhow::Result;
use chrono::{Datelike, NaiveDate};
use rand::prelude::*;
use serde::{Deserialize, Serialize};
/// Multi-dimensional difficulty vector.
///
/// Replaces single-axis difficulty to prevent collapsing effects.
/// Higher difficulty = more work and more ambiguity, NOT tighter posterior.
/// Use `DifficultyVector::from_scalar` for the legacy 1-10 mapping.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct DifficultyVector {
    /// Size of the search range (days)
    pub range_size: usize,
    /// Target number of valid candidates in posterior
    pub posterior_target: usize,
    /// Rate of distractor constraints (0.0 - 1.0)
    pub distractor_rate: f64,
    /// Rate of noise injection (0.0 - 1.0)
    pub noise_rate: f64,
    /// Number of ambiguous solutions (dates that almost satisfy constraints)
    pub ambiguity_count: usize,
}
impl Default for DifficultyVector {
    /// Baseline difficulty: a 60-day range whose posterior target equals the
    /// full range (no pruning pressure), with no distractors, noise, or
    /// ambiguity.
    fn default() -> Self {
        Self {
            range_size: 60,
            posterior_target: 60,
            distractor_rate: 0.0,
            noise_rate: 0.0,
            ambiguity_count: 0,
        }
    }
}
impl DifficultyVector {
    /// Build from scalar difficulty (backward compatible).
    ///
    /// Higher difficulty = wider range, more distractors, more ambiguity.
    /// The input is clamped onto the supported 1..=10 scale before mapping.
    pub fn from_scalar(difficulty: u8) -> Self {
        // `clamp` replaces the old `min(10).max(1)` chain — identical result,
        // clearer intent.
        let d = difficulty.clamp(1, 10);
        Self {
            range_size: difficulty_to_range_size(d),
            posterior_target: difficulty_to_posterior(d),
            distractor_rate: difficulty_to_distractor_rate(d),
            noise_rate: difficulty_to_noise_rate(d),
            ambiguity_count: difficulty_to_ambiguity(d),
        }
    }
    /// Scalar difficulty estimate (for backward compat).
    ///
    /// Weighted combination of range, distractor, and ambiguity scores,
    /// truncated to `u8` and clamped back onto the 1..=10 scale.
    pub fn scalar(&self) -> u8 {
        let range_score = (self.range_size as f64 / 365.0 * 10.0).min(10.0);
        let distractor_score = self.distractor_rate * 10.0;
        let ambiguity_score = (self.ambiguity_count as f64 / 5.0 * 10.0).min(10.0);
        // `as u8` truncates toward zero; with the documented 0.0-1.0
        // distractor rate and the capped scores the weighted sum stays <= 10,
        // so the cast cannot overflow.
        let combined = (range_score * 0.3 + distractor_score * 0.3 + ambiguity_score * 0.4) as u8;
        combined.clamp(1, 10)
    }
}
/// Puzzle generator configuration
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct PuzzleGeneratorConfig {
    /// Minimum difficulty (1-10)
    pub min_difficulty: u8,
    /// Maximum difficulty (1-10)
    pub max_difficulty: u8,
    /// Constraint density (1-5)
    pub constraint_density: u8,
    /// Include cross-cultural references
    pub cross_cultural: bool,
    /// Include relative constraints (anchor-based DaysAfter/DaysBefore)
    pub relative_constraints: bool,
    /// Year range for puzzles (inclusive on both ends)
    pub year_range: (i32, i32),
    /// Random seed (optional; `None` draws the RNG from OS entropy)
    pub seed: Option<u64>,
}
impl Default for PuzzleGeneratorConfig {
    /// Full 1-10 difficulty span, density 3, all constraint families
    /// enabled, years 2000-2030, entropy-seeded RNG.
    fn default() -> Self {
        Self {
            min_difficulty: 1,
            max_difficulty: 10,
            constraint_density: 3,
            cross_cultural: true,
            relative_constraints: true,
            year_range: (2000, 2030),
            seed: None,
        }
    }
}
/// Known events for temporal anchoring
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct TemporalAnchor {
    /// Human-readable event name (also referenced by relative constraints)
    pub name: String,
    /// Calendar date of the event
    pub date: NaiveDate,
    /// Event category (e.g. "holiday", "historical")
    pub category: String,
    /// Cultural origin (e.g. "western", "chinese", "global")
    pub culture: String,
}
impl TemporalAnchor {
    /// Construct an anchor at a fixed calendar date.
    ///
    /// # Panics
    ///
    /// Panics if `year`/`month`/`day` do not form a valid calendar date
    /// (e.g. February 30). All call sites in this module pass hard-coded,
    /// valid dates; the `expect` message makes a future bad literal easy
    /// to diagnose (the old bare `unwrap` gave no context).
    pub fn new(
        name: impl Into<String>,
        year: i32,
        month: u32,
        day: u32,
        category: impl Into<String>,
        culture: impl Into<String>,
    ) -> Self {
        let date = NaiveDate::from_ymd_opt(year, month, day)
            .expect("TemporalAnchor::new requires a valid calendar date");
        Self {
            name: name.into(),
            date,
            category: category.into(),
            culture: culture.into(),
        }
    }
}
/// TimePuzzles generator
pub struct PuzzleGenerator {
    /// Generation parameters (difficulty bounds, year range, seed, ...)
    config: PuzzleGeneratorConfig,
    /// Anchor events available for relative constraints
    anchors: Vec<TemporalAnchor>,
    /// RNG; seeded from `config.seed` when provided, for deterministic replay
    rng: StdRng,
}
impl PuzzleGenerator {
/// Create a new generator with config
///
/// A seeded generator is fully deterministic; without a seed the RNG is
/// drawn from OS entropy.
pub fn new(config: PuzzleGeneratorConfig) -> Self {
    let rng = config
        .seed
        .map_or_else(StdRng::from_entropy, StdRng::seed_from_u64);
    let mut generator = Self {
        config,
        anchors: Vec::new(),
        rng,
    };
    generator.init_anchors();
    generator
}
/// Initialize standard temporal anchors
///
/// Populates the anchor table in a fixed order — western holidays, then
/// (optionally) cross-cultural holidays, then historical events — so that
/// seeded anchor selection stays deterministic.
fn init_anchors(&mut self) {
    type AnchorSpec = (&'static str, i32, u32, u32, &'static str, &'static str);
    // Western holidays
    const WESTERN: [AnchorSpec; 5] = [
        ("Christmas", 2024, 12, 25, "holiday", "western"),
        ("New Year", 2024, 1, 1, "holiday", "western"),
        ("Independence Day", 2024, 7, 4, "holiday", "american"),
        ("Halloween", 2024, 10, 31, "holiday", "western"),
        ("Valentine's Day", 2024, 2, 14, "holiday", "western"),
    ];
    // Cross-cultural events (Chinese New Year 2024 = Year of the Dragon;
    // Hanukkah 2024 date is its first night)
    const CROSS_CULTURAL: [AnchorSpec; 4] = [
        ("Chinese New Year 2024", 2024, 2, 10, "holiday", "chinese"),
        ("Diwali 2024", 2024, 11, 1, "holiday", "indian"),
        ("Eid al-Fitr 2024", 2024, 4, 10, "holiday", "islamic"),
        ("Hanukkah 2024", 2024, 12, 25, "holiday", "jewish"),
    ];
    // Historical events
    const HISTORICAL: [AnchorSpec; 3] = [
        ("Moon Landing", 1969, 7, 20, "historical", "global"),
        ("Fall of Berlin Wall", 1989, 11, 9, "historical", "global"),
        ("Y2K", 2000, 1, 1, "historical", "global"),
    ];

    let include_cross_cultural = self.config.cross_cultural;
    let anchors = &mut self.anchors;
    let mut push_all = |entries: &[AnchorSpec]| {
        for &(name, year, month, day, category, culture) in entries {
            anchors.push(TemporalAnchor::new(name, year, month, day, category, culture));
        }
    };
    push_all(&WESTERN);
    if include_cross_cultural {
        push_all(&CROSS_CULTURAL);
    }
    push_all(&HISTORICAL);
}
/// Generate a single puzzle with multi-dimensional difficulty vector.
///
/// Difficulty scaling (higher = more work, not tighter posterior):
/// - Low (1-2): small range, no DayOfWeek, no distractors
/// - Medium (3-6): DayOfWeek + moderate range = 7x cost surface
/// - High (7-10): wide range + distractors + ambiguity + anchor constraints
///
/// All modes have access to weekday skipping; what differs is the policy.
///
/// The body has no failure path today (no `?`, always ends in `Ok`); the
/// `Result` return keeps the signature open for fallible generation later.
///
/// NOTE(review): the order of `self.rng` draws below is load-bearing for
/// seeded reproducibility — do not reorder the sampling statements.
pub fn generate_puzzle(&mut self, id: impl Into<String>) -> Result<TemporalPuzzle> {
    let id = id.into();
    // Draw 1: scalar difficulty within the configured bounds.
    let difficulty = self
        .rng
        .gen_range(self.config.min_difficulty..=self.config.max_difficulty);
    // Build difficulty vector from scalar
    let dv = DifficultyVector::from_scalar(difficulty);
    // DayOfWeek (difficulty 3+): creates cost surface for policy decisions
    let use_day_of_week = difficulty >= 3;
    // Range size from difficulty vector (wider range at higher difficulty)
    let range_days = dv.range_size as i64;
    // Pick target date (draws 2-4: year, month, day)
    let year = self
        .rng
        .gen_range(self.config.year_range.0..=self.config.year_range.1);
    let month = self.rng.gen_range(1..=12);
    let max_day = days_in_month(year, month);
    let day = self.rng.gen_range(1..=max_day);
    // Unwrap is safe: day was sampled within days_in_month(year, month).
    let target = NaiveDate::from_ymd_opt(year, month, day).unwrap();
    // Build Between range centered on target, clamped to year
    let year_start = NaiveDate::from_ymd_opt(year, 1, 1).unwrap();
    let year_end = NaiveDate::from_ymd_opt(year, 12, 31).unwrap();
    // Integer division: for even range_days the window sits one day
    // asymmetric around the target before clamping to the year.
    let half = range_days / 2;
    let range_start = (target - chrono::Duration::days(half)).max(year_start);
    let range_end = (range_start + chrono::Duration::days(range_days - 1)).min(year_end);
    let mut puzzle = TemporalPuzzle::new(id.clone(), format!("Find the date (puzzle {})", id))
        .with_difficulty(difficulty)
        .with_solutions(vec![target]);
    // Attach difficulty vector
    puzzle.difficulty_vector = Some(dv.clone());
    // Base constraints: InYear + Between (defines search range)
    puzzle
        .constraints
        .push(TemporalConstraint::InYear(target.year()));
    puzzle
        .constraints
        .push(TemporalConstraint::Between(range_start, range_end));
    let mut used_anchors: Vec<TemporalAnchor> = Vec::new();
    // DayOfWeek (difficulty 3+): creates cost surface for all modes
    if use_day_of_week {
        puzzle
            .constraints
            .push(TemporalConstraint::DayOfWeek(target.weekday()));
    }
    // Anchor reference for high difficulty (7+)
    if difficulty >= 7 && self.config.relative_constraints {
        if let Some(anchor) = self.anchors.choose(&mut self.rng).cloned() {
            // Express the target relative to the chosen anchor; the sign of
            // the day delta selects DaysAfter vs DaysBefore.
            let diff = (target - anchor.date).num_days();
            let constraint = if diff >= 0 {
                TemporalConstraint::DaysAfter(anchor.name.clone(), diff)
            } else {
                TemporalConstraint::DaysBefore(anchor.name.clone(), diff.abs())
            };
            puzzle.constraints.push(constraint);
            used_anchors.push(anchor);
        }
    }
    // Add anchor references so the solver can resolve the anchor names.
    for anchor in used_anchors {
        puzzle.references.insert(anchor.name.clone(), anchor.date);
    }
    // Distractor injection (from difficulty vector rate; capped at 0.99 so
    // injection is never a certainty)
    if dv.distractor_rate > 0.0 && self.rng.gen_bool(dv.distractor_rate.min(0.99)) {
        let distractor = self.generate_distractor(target, range_start, range_end);
        puzzle.constraints.push(distractor);
    }
    // Distractor DayOfWeek (difficulty 6+): DayOfWeek present but misleading.
    // Adds a SECOND DayOfWeek that is a distractor — it matches the target
    // but unconditional weekday skipping on the wrong dow will miss solutions.
    // This creates a real tradeoff for the PolicyKernel.
    if difficulty >= 6 && use_day_of_week {
        // Chance scales with difficulty; `_ => 0.0` is unreachable given the
        // `difficulty >= 6` guard but keeps the match total.
        let distractor_dow_chance: f64 = match difficulty {
            6 => 0.15,
            7 => 0.25,
            8 => 0.35,
            9..=10 => 0.50,
            _ => 0.0,
        };
        if self.rng.gen_bool(distractor_dow_chance.min(0.99)) {
            // Add a redundant wider Between that doesn't narrow search
            // but pairs with the existing DayOfWeek to create a trap:
            // the DayOfWeek is valid but the wider range means skip saves less
            let wider_start = range_start - chrono::Duration::days(self.rng.gen_range(14..60));
            let wider_end = range_end + chrono::Duration::days(self.rng.gen_range(14..60));
            puzzle
                .constraints
                .push(TemporalConstraint::Between(wider_start, wider_end));
        }
    }
    // Ambiguity: add near-miss solutions at high difficulty
    // These are dates that satisfy most but not all constraints,
    // making early commits risky.
    if dv.ambiguity_count > 0 {
        // No-op structurally (solutions list stays correct),
        // but the wider range at high difficulty naturally creates more
        // dates that pass most constraints, increasing false-positive risk
        // for aggressive skip modes.
    }
    // Count actual distractors injected (deterministic, observable)
    let actual_distractor_count = crate::temporal::count_distractors(&puzzle);
    // Tags: all features visible to policies for deterministic observability
    puzzle.tags = vec![
        format!("difficulty:{}", difficulty),
        format!("year:{}", year),
        format!("range_size:{}", dv.range_size),
        format!("distractor_rate:{:.2}", dv.distractor_rate),
        format!("distractor_count:{}", actual_distractor_count),
        format!("ambiguity:{}", dv.ambiguity_count),
        format!("has_dow:{}", use_day_of_week),
    ];
    Ok(puzzle)
}
/// Generate a distractor constraint: true for the target but doesn't narrow the search.
///
/// Three variants, chosen uniformly:
/// 0. a `Between` that is a strict superset of the existing range,
/// 1. a redundant `InYear` (already present on every generated puzzle),
/// 2. an `After` anchored strictly before `range_start`.
fn generate_distractor(
    &mut self,
    target: NaiveDate,
    range_start: NaiveDate,
    range_end: NaiveDate,
) -> TemporalConstraint {
    match self.rng.gen_range(0u8..3) {
        0 => {
            // Wider Between (superset of existing range → no shrink)
            let wider_start = range_start - chrono::Duration::days(self.rng.gen_range(10..60));
            let wider_end = range_end + chrono::Duration::days(self.rng.gen_range(10..60));
            TemporalConstraint::Between(wider_start, wider_end)
        }
        1 => {
            // Redundant InYear (already present)
            TemporalConstraint::InYear(target.year())
        }
        _ => {
            // After a date strictly before the range. Anchoring on
            // `range_start` (not `target`) guarantees no shrink: at high
            // difficulty the range half-width can reach ~182 days, which
            // exceeds the sampled 30..180 offset, so a target-relative
            // cutoff could land inside the range and narrow the search.
            let days_before = self.rng.gen_range(30..180) as i64;
            TemporalConstraint::After(range_start - chrono::Duration::days(days_before))
        }
    }
}
/// Generate a batch of `count` puzzles with sequential zero-padded ids
/// (`puzzle-0001`, `puzzle-0002`, ...). Stops at the first error.
pub fn generate_batch(&mut self, count: usize) -> Result<Vec<TemporalPuzzle>> {
    (1..=count)
        .map(|n| self.generate_puzzle(format!("puzzle-{:04}", n)))
        .collect()
}
/// Generate `count` puzzles pinned to a single difficulty level.
///
/// Temporarily narrows the generator's difficulty bounds to exactly
/// `difficulty`, then restores the original bounds before returning —
/// on both the success and error paths.
pub fn generate_at_difficulty(
    &mut self,
    count: usize,
    difficulty: u8,
) -> Result<Vec<TemporalPuzzle>> {
    let (saved_min, saved_max) = (self.config.min_difficulty, self.config.max_difficulty);
    self.config.min_difficulty = difficulty;
    self.config.max_difficulty = difficulty;
    let result = self.generate_batch(count);
    self.config.min_difficulty = saved_min;
    self.config.max_difficulty = saved_max;
    result
}
}
/// Range size by difficulty level.
/// Higher difficulty → wider range → more work for the solver.
fn difficulty_to_range_size(difficulty: u8) -> usize {
    // Indexed by difficulty 1..=10 (3 → 56 days = 8 weeks, 4 → 84 = 12
    // weeks, 10 → a full year). Slot 0 mirrors the out-of-range fallback.
    const SIZES: [usize; 11] = [120, 14, 30, 56, 84, 120, 150, 200, 250, 300, 365];
    *SIZES.get(difficulty as usize).unwrap_or(&120)
}
/// Posterior target by difficulty level.
/// Higher difficulty → more valid candidates → more ambiguity.
/// (Flipped from old model: difficulty increases ambiguity, not reduces it.)
fn difficulty_to_posterior(difficulty: u8) -> usize {
    // Indexed by difficulty 1..=10; slot 0 mirrors the fallback (18).
    const POSTERIOR: [usize; 11] = [18, 2, 4, 8, 12, 18, 25, 35, 50, 70, 100];
    *POSTERIOR.get(difficulty as usize).unwrap_or(&18)
}
/// Distractor rate by difficulty level.
/// No distractors through difficulty 3, ramping to 60% at the top end.
fn difficulty_to_distractor_rate(difficulty: u8) -> f64 {
    // Indexed by difficulty 1..=10; slot 0 mirrors the fallback (0.10).
    const RATES: [f64; 11] = [0.10, 0.0, 0.0, 0.0, 0.05, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60];
    *RATES.get(difficulty as usize).unwrap_or(&0.10)
}
/// Noise rate by difficulty level.
/// Steps up in two-level bands: 0 through difficulty 3, 0.40 at 10.
fn difficulty_to_noise_rate(difficulty: u8) -> f64 {
    // Indexed by difficulty 1..=10; slot 0 mirrors the fallback (0.10).
    const NOISE: [f64; 11] = [0.10, 0.0, 0.0, 0.0, 0.10, 0.10, 0.20, 0.20, 0.30, 0.30, 0.40];
    *NOISE.get(difficulty as usize).unwrap_or(&0.10)
}
/// Ambiguity count by difficulty level (near-miss solutions).
/// Zero through difficulty 4, then ramping to 5 near-misses at 10.
fn difficulty_to_ambiguity(difficulty: u8) -> usize {
    // Indexed by difficulty 1..=10; slot 0 mirrors the fallback (0).
    const AMBIGUITY: [usize; 11] = [0, 0, 0, 0, 0, 1, 1, 2, 2, 3, 5];
    *AMBIGUITY.get(difficulty as usize).unwrap_or(&0)
}
/// Days in a given month (handles Gregorian leap years).
///
/// `month` must be in `1..=12`; this is guarded with a `debug_assert!` so
/// invalid months surface in debug builds instead of silently yielding 31
/// (the previous behavior for e.g. month 0 or 13).
fn days_in_month(year: i32, month: u32) -> u32 {
    debug_assert!((1..=12).contains(&month), "month out of range: {month}");
    match month {
        4 | 6 | 9 | 11 => 30,
        2 => {
            // Gregorian rule: divisible by 4, except centuries, unless
            // divisible by 400 (1900 → 28, 2000 → 29).
            if year % 4 == 0 && (year % 100 != 0 || year % 400 == 0) {
                29
            } else {
                28
            }
        }
        _ => 31,
    }
}
/// Sample puzzle sets
pub struct SamplePuzzles;
impl SamplePuzzles {
/// Get easy puzzles (difficulty 1-3)
pub fn easy() -> Vec<TemporalPuzzle> {
let mut gen = PuzzleGenerator::new(PuzzleGeneratorConfig {
min_difficulty: 1,
max_difficulty: 3,
seed: Some(42),
..Default::default()
});
gen.generate_batch(10).unwrap()
}
/// Get medium puzzles (difficulty 4-6)
pub fn medium() -> Vec<TemporalPuzzle> {
let mut gen = PuzzleGenerator::new(PuzzleGeneratorConfig {
min_difficulty: 4,
max_difficulty: 6,
seed: Some(42),
..Default::default()
});
gen.generate_batch(10).unwrap()
}
/// Get hard puzzles (difficulty 7-10)
pub fn hard() -> Vec<TemporalPuzzle> {
let mut gen = PuzzleGenerator::new(PuzzleGeneratorConfig {
min_difficulty: 7,
max_difficulty: 10,
seed: Some(42),
..Default::default()
});
gen.generate_batch(10).unwrap()
}
/// Get cross-cultural puzzles
pub fn cross_cultural() -> Vec<TemporalPuzzle> {
let mut gen = PuzzleGenerator::new(PuzzleGeneratorConfig {
cross_cultural: true,
relative_constraints: true,
min_difficulty: 5,
max_difficulty: 8,
seed: Some(42),
..Default::default()
});
gen.generate_batch(10).unwrap()
}
/// Get a mixed sample set (50 puzzles across all difficulties)
pub fn mixed_sample() -> Vec<TemporalPuzzle> {
let mut all = Vec::new();
all.extend(Self::easy());
all.extend(Self::medium());
all.extend(Self::hard());
all.extend(Self::cross_cultural());
// Add more easy/medium to match TimePuzzles distribution
let mut gen = PuzzleGenerator::new(PuzzleGeneratorConfig {
min_difficulty: 2,
max_difficulty: 5,
seed: Some(123),
..Default::default()
});
all.extend(gen.generate_batch(10).unwrap());
all
}
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_puzzle_generation() {
        let mut generator = PuzzleGenerator::new(PuzzleGeneratorConfig {
            seed: Some(42),
            ..Default::default()
        });
        // A generated puzzle always carries constraints and solutions.
        let puzzle = generator.generate_puzzle("test-1").unwrap();
        assert!(!puzzle.constraints.is_empty());
        assert!(!puzzle.solutions.is_empty());
    }

    #[test]
    fn test_batch_generation() {
        let mut generator = PuzzleGenerator::new(PuzzleGeneratorConfig {
            seed: Some(42),
            ..Default::default()
        });
        // Batch size is honored exactly.
        assert_eq!(generator.generate_batch(20).unwrap().len(), 20);
    }

    #[test]
    fn test_sample_puzzles() {
        // Canned sets respect their advertised difficulty bands.
        let easy = SamplePuzzles::easy();
        assert_eq!(easy.len(), 10);
        assert!(easy.iter().all(|p| p.difficulty <= 3));
        assert!(SamplePuzzles::hard().iter().all(|p| p.difficulty >= 7));
    }
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,417 @@
//! Integration tests for benchmark suite
use chrono::{NaiveDate, Weekday};
use ruvector_benchmarks::{
logging::BenchmarkLogger,
swarm_regret::{EpisodeResult, RegretTracker, SwarmController},
temporal::{TemporalConstraint, TemporalPuzzle, TemporalSolver},
timepuzzles::{PuzzleGenerator, PuzzleGeneratorConfig, SamplePuzzles},
vector_index::{CoherenceGate, DenseVec, IvfConfig, VectorIndex},
};
use tempfile::tempdir;
// ============================================================================
// Vector Index Tests
// ============================================================================
#[test]
fn test_vector_index_insert_search() {
    let mut index = VectorIndex::new(4);
    let nearest = index.insert(DenseVec::new(vec![1.0, 0.0, 0.0, 0.0])).unwrap();
    let _close = index.insert(DenseVec::new(vec![0.9, 0.1, 0.0, 0.0])).unwrap();
    let _far = index.insert(DenseVec::new(vec![0.0, 1.0, 0.0, 0.0])).unwrap();
    // Query identical to the first vector: it must rank first with a
    // strictly higher score than the runner-up.
    let query = DenseVec::new(vec![1.0, 0.0, 0.0, 0.0]);
    let hits = index.search(&query, 2, 1.0).unwrap();
    assert_eq!(hits.len(), 2);
    assert_eq!(hits[0].id, nearest);
    assert!(hits[0].score > hits[1].score);
}
#[test]
fn test_vector_index_coherence_gate() {
    let mut index = VectorIndex::new(4).with_gate(CoherenceGate::new(0.5));
    index.insert(DenseVec::new(vec![1.0, 0.0, 0.0, 0.0])).unwrap();
    index.insert(DenseVec::new(vec![0.0, 1.0, 0.0, 0.0])).unwrap();
    let query = DenseVec::new(vec![1.0, 0.0, 0.0, 0.0]);
    // Below the 0.5 gate threshold every result is blocked...
    assert!(index.search(&query, 10, 0.3).unwrap().is_empty());
    // ...above it, results flow through.
    assert!(!index.search(&query, 10, 0.7).unwrap().is_empty());
}
#[test]
fn test_vector_index_ivf() {
    let mut index = VectorIndex::new(8).with_ivf(IvfConfig::new(4, 2));
    // Enough random vectors for clustering to be meaningful.
    for _ in 0..100 {
        index.insert(DenseVec::random(8)).unwrap();
    }
    index.rebuild_ivf().unwrap();
    // Rebuild must enable IVF and produce at least one cluster.
    let stats = index.stats();
    assert!(stats.ivf_enabled);
    assert!(stats.ivf_clusters > 0);
    // Search through the IVF path still returns at most k hits.
    let hits = index.search(&DenseVec::random(8), 5, 1.0).unwrap();
    assert!(hits.len() <= 5);
}
#[test]
fn test_vector_index_persistence() {
    let dir = tempdir().unwrap();
    let path = dir.path().join("test_index.bin");
    let mut original = VectorIndex::new(4);
    original.insert(DenseVec::new(vec![1.0, 2.0, 3.0, 4.0])).unwrap();
    original.insert(DenseVec::new(vec![5.0, 6.0, 7.0, 8.0])).unwrap();
    original.save_to_file(&path).unwrap();
    // Round-trip: element count and dimensionality survive reload.
    let restored = VectorIndex::load_from_file(&path).unwrap();
    assert_eq!(restored.len(), 2);
    assert_eq!(restored.dim(), 4);
}
// ============================================================================
// Temporal Reasoning Tests
// ============================================================================
#[test]
fn test_temporal_puzzle_exact_date() {
    let target = NaiveDate::from_ymd_opt(2024, 6, 15).unwrap();
    let day_before = NaiveDate::from_ymd_opt(2024, 6, 14).unwrap();
    let puzzle = TemporalPuzzle::new("test", "Find June 15, 2024")
        .with_constraint(TemporalConstraint::Exact(target))
        .with_solutions(vec![target]);
    // Only the exact date satisfies an Exact constraint.
    assert!(puzzle.check_date(target).unwrap());
    assert!(!puzzle.check_date(day_before).unwrap());
}
#[test]
fn test_temporal_puzzle_range() {
    let january = TemporalPuzzle::new("test", "Find a date in January 2024").with_constraint(
        TemporalConstraint::Between(
            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
            NaiveDate::from_ymd_opt(2024, 1, 31).unwrap(),
        ),
    );
    // Inside the window passes; the first day outside fails.
    assert!(january
        .check_date(NaiveDate::from_ymd_opt(2024, 1, 15).unwrap())
        .unwrap());
    assert!(!january
        .check_date(NaiveDate::from_ymd_opt(2024, 2, 1).unwrap())
        .unwrap());
}
#[test]
fn test_temporal_puzzle_day_of_week() {
    let mondays_2024 = TemporalPuzzle::new("test", "Find a Monday in 2024")
        .with_constraint(TemporalConstraint::InYear(2024))
        .with_constraint(TemporalConstraint::DayOfWeek(Weekday::Mon));
    let monday = NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(); // Jan 1, 2024 is a Monday
    let tuesday = NaiveDate::from_ymd_opt(2024, 1, 2).unwrap(); // Jan 2, 2024 is a Tuesday
    assert!(mondays_2024.check_date(monday).unwrap());
    assert!(!mondays_2024.check_date(tuesday).unwrap());
}
#[test]
fn test_temporal_puzzle_relative() {
    let base = NaiveDate::from_ymd_opt(2024, 3, 1).unwrap();
    let expected = NaiveDate::from_ymd_opt(2024, 3, 11).unwrap();
    // DaysAfter resolves against the named "base" reference.
    let puzzle = TemporalPuzzle::new("test", "Find 10 days after base")
        .with_reference("base", base)
        .with_constraint(TemporalConstraint::DaysAfter("base".to_string(), 10));
    assert!(puzzle.check_date(expected).unwrap());
}
#[test]
fn test_temporal_solver_basic() {
    let target = NaiveDate::from_ymd_opt(2024, 5, 20).unwrap();
    let puzzle = TemporalPuzzle::new("test", "Simple puzzle")
        .with_constraint(TemporalConstraint::Exact(target))
        .with_solutions(vec![target]);
    // An Exact constraint should be trivially solvable and correct.
    let mut solver = TemporalSolver::with_tools(true, false);
    let outcome = solver.solve(&puzzle).unwrap();
    assert!(outcome.solved);
    assert!(outcome.correct);
}
#[test]
fn test_temporal_solver_with_rewriting() {
    let base = NaiveDate::from_ymd_opt(2024, 7, 4).unwrap();
    let expected = NaiveDate::from_ymd_opt(2024, 7, 14).unwrap();
    let puzzle = TemporalPuzzle::new("test", "Relative puzzle")
        .with_reference("event", base)
        .with_constraint(TemporalConstraint::DaysAfter("event".to_string(), 10))
        .with_solutions(vec![expected]);
    let mut solver = TemporalSolver::with_tools(true, false);
    let outcome = solver.solve(&puzzle).unwrap();
    assert!(outcome.solved);
    assert!(outcome.correct);
    // Resolving the relative constraint requires at least one tool call.
    assert!(outcome.tool_calls > 0);
}
// ============================================================================
// TimePuzzles Generator Tests
// ============================================================================
#[test]
fn test_puzzle_generator_basic() {
    let mut generator = PuzzleGenerator::new(PuzzleGeneratorConfig {
        seed: Some(42),
        ..Default::default()
    });
    let puzzle = generator.generate_puzzle("test-1").unwrap();
    // Every generated puzzle has constraints, solutions, and a
    // difficulty on the 1..=10 scale.
    assert!(!puzzle.constraints.is_empty());
    assert!(!puzzle.solutions.is_empty());
    assert!((1u8..=10).contains(&puzzle.difficulty));
}
#[test]
fn test_puzzle_generator_batch() {
    let mut generator = PuzzleGenerator::new(PuzzleGeneratorConfig {
        seed: Some(42),
        ..Default::default()
    });
    let puzzles = generator.generate_batch(20).unwrap();
    assert_eq!(puzzles.len(), 20);
    // Every puzzle in the batch is well-formed.
    assert!(puzzles
        .iter()
        .all(|p| !p.constraints.is_empty() && !p.solutions.is_empty()));
}
#[test]
fn test_puzzle_generator_difficulty() {
    let mut generator = PuzzleGenerator::new(PuzzleGeneratorConfig {
        min_difficulty: 7,
        max_difficulty: 10,
        seed: Some(42),
        ..Default::default()
    });
    // Configured difficulty bounds are honored for every puzzle.
    let puzzles = generator.generate_batch(10).unwrap();
    assert!(puzzles
        .iter()
        .all(|p| (7u8..=10).contains(&p.difficulty)));
}
#[test]
fn test_sample_puzzles() {
    // Each canned set stays inside its advertised difficulty band.
    let easy = SamplePuzzles::easy();
    assert_eq!(easy.len(), 10);
    assert!(easy.iter().all(|p| p.difficulty <= 3));
    assert!(SamplePuzzles::medium()
        .iter()
        .all(|p| (4u8..=6).contains(&p.difficulty)));
    assert!(SamplePuzzles::hard().iter().all(|p| p.difficulty >= 7));
    assert!(SamplePuzzles::mixed_sample().len() >= 40);
}
// ============================================================================
// Swarm Regret Tests
// ============================================================================
#[test]
fn test_regret_tracker_basic() {
    let mut tracker = RegretTracker::new(10);
    tracker.record_episode(EpisodeResult {
        episode: 1,
        num_tasks: 20,
        solved: 18,
        correct: 17,
        total_steps: 100,
        tool_calls: 20,
        latency_ms: 1000,
        reward: 80.0,
        oracle_reward: 99.0,
    });
    assert_eq!(tracker.episodes.len(), 1);
    // Regret after one episode is the oracle/actual gap: 99 - 80 = 19.
    assert!((tracker.current_cumulative_regret() - 19.0).abs() < 0.01);
}
#[test]
fn test_regret_tracker_sublinear() {
    let mut tracker = RegretTracker::new(10);
    // Simulate improving performance (decreasing regret): accuracy climbs
    // from 0.50 toward 0.95 in 0.05 steps while the step count falls by 5
    // per episode, so per-episode reward approaches the fixed oracle (99.0)
    // and per-episode regret shrinks monotonically.
    for i in 0..10 {
        let accuracy = 0.5 + 0.05 * i as f64;
        let result = EpisodeResult {
            episode: i + 1,
            num_tasks: 20,
            solved: (20.0 * accuracy) as usize,
            correct: (20.0 * accuracy) as usize,
            total_steps: 100 - i * 5,
            tool_calls: 20,
            latency_ms: 1000,
            // Synthetic reward model: accuracy payoff minus 0.1 per step.
            reward: accuracy * 100.0 - (100 - i * 5) as f64 * 0.1,
            oracle_reward: 99.0,
        };
        tracker.record_episode(result);
    }
    // Average regret should be decreasing
    assert!(tracker.is_sublinear());
    assert!(tracker.regret_trend() < 0.0);
}
#[test]
fn test_swarm_controller() {
    let mut controller = SwarmController::new(20);
    // Five identical episodes at 18/20 solved, 17 correct.
    // NOTE(review): positional args presumably mirror EpisodeResult's
    // (solved, correct, steps, tool_calls, latency) — confirm in swarm_regret.
    for _ in 0..5 {
        controller.start_episode();
        controller.complete_episode(18, 17, 80, 20, 500);
    }
    let status = controller.status();
    assert_eq!(status.episode, 5);
    assert!(status.accuracy > 0.8);
}
// ============================================================================
// Logging Tests
// ============================================================================
#[test]
fn test_benchmark_logger() {
    let dir = tempdir().unwrap();
    let path = dir.path().join("test.log");
    let mut logger = BenchmarkLogger::new(path.to_str().unwrap()).unwrap();
    // One entry per log family. NOTE(review): the positional args presumably
    // mirror the corresponding log-entry fields (ids, difficulty,
    // solved/correct flags, step and tool-call counts, latency) — confirm
    // against the logging module's signatures.
    logger
        .log_temporal(
            "bench-1", "puzzle-1", 5, true, true, 10, 2, 100, 3, true, false,
        )
        .unwrap();
    logger
        .log_vector("search", 128, 10000, 1, 10, true, 0.9, 500, 10)
        .unwrap();
    logger
        .log_swarm(1, 20, 18, 17, 85.0, 99.0, 14.0, 14.0, true)
        .unwrap();
    logger.flush().unwrap();
    // Read back: all three entries must round-trip through the log file.
    let reader = ruvector_benchmarks::logging::LogReader::new(path.to_str().unwrap());
    let entries = reader.read_all().unwrap();
    assert_eq!(entries.len(), 3);
}
// ============================================================================
// End-to-End Tests
// ============================================================================
#[test]
fn test_full_benchmark_workflow() {
    // Generate a reproducible batch of easy-to-medium puzzles.
    let mut generator = PuzzleGenerator::new(PuzzleGeneratorConfig {
        min_difficulty: 2,
        max_difficulty: 5,
        seed: Some(12345),
        ..Default::default()
    });
    let puzzles = generator.generate_batch(10).unwrap();
    // Create solver (budget must cover wider posterior-based ranges).
    let mut solver = TemporalSolver::with_tools(true, false);
    solver.max_steps = 400;
    // Run every puzzle through the solver.
    let results: Vec<_> = puzzles.iter().map(|p| solver.solve(p).unwrap()).collect();
    // Most easy/medium puzzles should be solved correctly.
    assert!(results.iter().filter(|r| r.solved).count() >= 5);
    assert!(results.iter().filter(|r| r.correct).count() >= 5);
}
#[test]
fn test_vector_temporal_integration() {
    // Store simplified hand-built date "embeddings" in the vector index:
    // slot 0 = day, slot 1 = month, slot 2 = year, all scaled to [0, 1].
    let mut index = VectorIndex::new(64);
    for day in 1..=31 {
        let mut embedding = vec![0.0f32; 64];
        embedding[0] = day as f32 / 31.0; // day component
        embedding[1] = 1.0 / 12.0; // month component (January)
        embedding[2] = 2024.0 / 3000.0; // year component
        index.insert(DenseVec::new(embedding)).unwrap();
    }
    // Probe for mid-January 2024; nearby days should come back.
    let mut probe = vec![0.0f32; 64];
    probe[0] = 15.0 / 31.0;
    probe[1] = 1.0 / 12.0;
    probe[2] = 2024.0 / 3000.0;
    let hits = index.search(&DenseVec::new(probe), 5, 1.0).unwrap();
    assert!(!hits.is_empty());
}