Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'
This commit is contained in:
110
vendor/ruvector/examples/benchmarks/Cargo.toml
vendored
Normal file
110
vendor/ruvector/examples/benchmarks/Cargo.toml
vendored
Normal file
@@ -0,0 +1,110 @@
|
||||
[package]
|
||||
name = "ruvector-benchmarks"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
description = "Comprehensive benchmarks for temporal reasoning and vector operations"
|
||||
publish = false
|
||||
|
||||
[dependencies]
|
||||
# Core ruvector
|
||||
ruvector-core = { path = "../../crates/ruvector-core", default-features = false, features = ["parallel"] }
|
||||
|
||||
# Serialization
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1.0"
|
||||
bincode = { version = "2.0.0-rc.3", features = ["serde"] }
|
||||
|
||||
# Error handling
|
||||
anyhow = "1.0"
|
||||
thiserror = "2.0"
|
||||
|
||||
# Random and numerics
|
||||
rand = "0.8"
|
||||
rand_distr = "0.4"
|
||||
|
||||
# Parallel processing
|
||||
rayon = "1.10"
|
||||
|
||||
# CLI and progress
|
||||
clap = { version = "4.5", features = ["derive"] }
|
||||
indicatif = "0.17"
|
||||
console = "0.15"
|
||||
|
||||
# Async
|
||||
tokio = { version = "1.41", features = ["rt-multi-thread", "sync", "macros", "time", "fs"] }
|
||||
futures = "0.3"
|
||||
|
||||
# Time handling (critical for temporal benchmarks)
|
||||
chrono = { version = "0.4", features = ["serde"] }
|
||||
|
||||
# Logging and tracing
|
||||
tracing = "0.1"
|
||||
tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] }
|
||||
|
||||
# Crypto for witness chains
|
||||
sha2 = "0.10"
|
||||
|
||||
# RVF native format integration
|
||||
rvf-types = { path = "../../crates/rvf/rvf-types" }
|
||||
rvf-crypto = { path = "../../crates/rvf/rvf-crypto" }
|
||||
rvf-wire = { path = "../../crates/rvf/rvf-wire" }
|
||||
|
||||
# Statistics
|
||||
statistical = "1.0"
|
||||
hdrhistogram = "7.5"
|
||||
|
||||
# HTTP for tool-augmented tests
|
||||
reqwest = { version = "0.11", features = ["json"] }
|
||||
|
||||
# Visualization
|
||||
plotters = { version = "0.3", optional = true }
|
||||
|
||||
# Type theory for verified reasoning (lean-agentic)
|
||||
lean-agentic = "0.1"
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = "3.13"
|
||||
|
||||
[features]
|
||||
default = []
|
||||
visualize = ["plotters"]
|
||||
|
||||
[[bin]]
|
||||
name = "temporal-benchmark"
|
||||
path = "src/bin/temporal_benchmark.rs"
|
||||
|
||||
[[bin]]
|
||||
name = "vector-benchmark"
|
||||
path = "src/bin/vector_benchmark.rs"
|
||||
|
||||
[[bin]]
|
||||
name = "swarm-regret"
|
||||
path = "src/bin/swarm_regret.rs"
|
||||
|
||||
[[bin]]
|
||||
name = "timepuzzle-runner"
|
||||
path = "src/bin/timepuzzle_runner.rs"
|
||||
|
||||
[[bin]]
|
||||
name = "intelligence-assessment"
|
||||
path = "src/bin/intelligence_assessment.rs"
|
||||
|
||||
[[bin]]
|
||||
name = "rvf-intelligence-bench"
|
||||
path = "src/bin/rvf_intelligence_bench.rs"
|
||||
|
||||
[[bin]]
|
||||
name = "superintelligence"
|
||||
path = "src/bin/superintelligence.rs"
|
||||
|
||||
[[bin]]
|
||||
name = "agi-proof-harness"
|
||||
path = "src/bin/agi_proof_harness.rs"
|
||||
|
||||
[[bin]]
|
||||
name = "acceptance-rvf"
|
||||
path = "src/bin/acceptance_rvf.rs"
|
||||
|
||||
[[bin]]
|
||||
name = "wasm-solver-bench"
|
||||
path = "src/bin/wasm_solver_bench.rs"
|
||||
1165
vendor/ruvector/examples/benchmarks/src/acceptance_test.rs
vendored
Normal file
1165
vendor/ruvector/examples/benchmarks/src/acceptance_test.rs
vendored
Normal file
File diff suppressed because it is too large
Load Diff
627
vendor/ruvector/examples/benchmarks/src/agi_contract.rs
vendored
Normal file
627
vendor/ruvector/examples/benchmarks/src/agi_contract.rs
vendored
Normal file
@@ -0,0 +1,627 @@
|
||||
//! AGI Contract — Defines intelligence as a measurable, falsifiable contract.
|
||||
//!
|
||||
//! The AGI contract states: a system improves utility over time without violating
|
||||
//! policy, while maintaining structural health.
|
||||
//!
|
||||
//! ## Core Metrics (all deterministic, all auditable)
|
||||
//!
|
||||
//! - **Solved tasks per cost** — graded outcomes normalized by compute
|
||||
//! - **Stability under noise** — accuracy retention when inputs are corrupted
|
||||
//! - **Contradiction rate** — solved-but-wrong / total attempted
|
||||
//! - **Rollback correctness** — recovery rate when bad inputs are detected
|
||||
//! - **Policy violations** — budget overruns + contradictions (must be zero)
|
||||
//!
|
||||
//! ## Autonomy Ladder
|
||||
//!
|
||||
//! Each level requires sustained health metrics before advancement:
|
||||
//! 0. Read-only (observe only)
|
||||
//! 1. Write to memory (store episodes, no execution)
|
||||
//! 2. Execute tools (run solver, generate puzzles)
|
||||
//! 3. Write to external systems (publish results)
|
||||
//! 4. Deploy and operate (self-directed improvement)
|
||||
|
||||
use crate::intelligence_metrics::{IntelligenceAssessment, RawMetrics};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Contract Health Snapshot
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/// A single point-in-time health measurement against the AGI contract.
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct ContractHealth {
|
||||
/// Solved tasks per unit cost (tasks_correct / total_steps)
|
||||
pub solved_per_cost: f64,
|
||||
/// Accuracy on noise-injected tasks
|
||||
pub noise_stability: f64,
|
||||
/// Contradiction rate: solved-but-wrong / attempted
|
||||
pub contradiction_rate: f64,
|
||||
/// Rollback correctness: successful rollbacks / attempted rollbacks
|
||||
pub rollback_correctness: f64,
|
||||
/// Total policy violations (must be zero for contract compliance)
|
||||
pub policy_violations: usize,
|
||||
/// Clean accuracy (graded outcome baseline)
|
||||
pub accuracy: f64,
|
||||
/// Cost efficiency (0-1, higher = cheaper per solve)
|
||||
pub cost_efficiency: f64,
|
||||
/// Whether the contract is satisfied
|
||||
pub compliant: bool,
|
||||
}
|
||||
|
||||
impl ContractHealth {
|
||||
/// Evaluate contract health from raw metrics.
|
||||
pub fn from_raw(raw: &RawMetrics) -> Self {
|
||||
let accuracy = if raw.tasks_attempted > 0 {
|
||||
raw.tasks_correct as f64 / raw.tasks_attempted as f64
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
let solved_per_cost = if raw.total_steps > 0 {
|
||||
raw.tasks_correct as f64 / raw.total_steps as f64
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
let noise_stability = if raw.noise_tasks_attempted > 0 {
|
||||
raw.noise_tasks_correct as f64 / raw.noise_tasks_attempted as f64
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
let contradiction_rate = if raw.tasks_attempted > 0 {
|
||||
raw.contradictions as f64 / raw.tasks_attempted as f64
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
let rollback_correctness = if raw.rollback_attempts > 0 {
|
||||
raw.rollback_successes as f64 / raw.rollback_attempts as f64
|
||||
} else {
|
||||
1.0 // no rollbacks needed => perfect
|
||||
};
|
||||
|
||||
let cost_efficiency = (1.0 - {
|
||||
let sps = if raw.tasks_correct > 0 {
|
||||
raw.total_steps as f64 / raw.tasks_correct as f64
|
||||
} else {
|
||||
100.0
|
||||
};
|
||||
(sps - 5.0) / 95.0
|
||||
})
|
||||
.clamp(0.0, 1.0);
|
||||
|
||||
let compliant = raw.policy_violations == 0 && contradiction_rate < 0.01 && accuracy >= 0.90;
|
||||
|
||||
ContractHealth {
|
||||
solved_per_cost,
|
||||
noise_stability,
|
||||
contradiction_rate,
|
||||
rollback_correctness,
|
||||
policy_violations: raw.policy_violations,
|
||||
accuracy,
|
||||
cost_efficiency,
|
||||
compliant,
|
||||
}
|
||||
}
|
||||
|
||||
/// Evaluate contract health from an IntelligenceAssessment.
|
||||
pub fn from_assessment(assessment: &IntelligenceAssessment) -> Self {
|
||||
Self::from_raw(&assessment.raw_data)
|
||||
}
|
||||
|
||||
/// Print formatted contract health report.
|
||||
pub fn print(&self) {
|
||||
println!(" Contract Health:");
|
||||
println!(" Solved/Cost: {:.4}", self.solved_per_cost);
|
||||
println!(
|
||||
" Noise Stability: {:.2}%",
|
||||
self.noise_stability * 100.0
|
||||
);
|
||||
println!(
|
||||
" Contradiction Rate: {:.4}%",
|
||||
self.contradiction_rate * 100.0
|
||||
);
|
||||
println!(
|
||||
" Rollback Correct: {:.2}%",
|
||||
self.rollback_correctness * 100.0
|
||||
);
|
||||
println!(" Policy Violations: {}", self.policy_violations);
|
||||
println!(" Accuracy: {:.2}%", self.accuracy * 100.0);
|
||||
println!(
|
||||
" Cost Efficiency: {:.2}%",
|
||||
self.cost_efficiency * 100.0
|
||||
);
|
||||
println!(
|
||||
" Compliant: {}",
|
||||
if self.compliant { "YES" } else { "NO" }
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Contract Trend — compares two snapshots
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/// Tracks improvement across contract dimensions between two measurement points.
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct ContractDelta {
|
||||
/// Change in solved-per-cost (positive = improving)
|
||||
pub solved_per_cost_delta: f64,
|
||||
/// Change in noise stability (positive = more robust)
|
||||
pub noise_stability_delta: f64,
|
||||
/// Change in contradiction rate (negative = improving)
|
||||
pub contradiction_rate_delta: f64,
|
||||
/// Change in rollback correctness (positive = better recovery)
|
||||
pub rollback_delta: f64,
|
||||
/// Change in accuracy (positive = better)
|
||||
pub accuracy_delta: f64,
|
||||
/// Change in cost efficiency (positive = cheaper)
|
||||
pub cost_efficiency_delta: f64,
|
||||
/// Number of dimensions that improved
|
||||
pub dimensions_improved: usize,
|
||||
/// Number of dimensions that regressed
|
||||
pub dimensions_regressed: usize,
|
||||
}
|
||||
|
||||
impl ContractDelta {
|
||||
/// Compute delta between two health snapshots.
|
||||
pub fn between(before: &ContractHealth, after: &ContractHealth) -> Self {
|
||||
let solved_per_cost_delta = after.solved_per_cost - before.solved_per_cost;
|
||||
let noise_stability_delta = after.noise_stability - before.noise_stability;
|
||||
let contradiction_rate_delta = after.contradiction_rate - before.contradiction_rate;
|
||||
let rollback_delta = after.rollback_correctness - before.rollback_correctness;
|
||||
let accuracy_delta = after.accuracy - before.accuracy;
|
||||
let cost_efficiency_delta = after.cost_efficiency - before.cost_efficiency;
|
||||
|
||||
// Count improvements (positive is better for all except contradiction_rate)
|
||||
let deltas = [
|
||||
solved_per_cost_delta > 0.001,
|
||||
noise_stability_delta > 0.001,
|
||||
contradiction_rate_delta < -0.001, // decrease = improvement
|
||||
rollback_delta > 0.001,
|
||||
accuracy_delta > 0.001,
|
||||
cost_efficiency_delta > 0.001,
|
||||
];
|
||||
let regressions = [
|
||||
solved_per_cost_delta < -0.001,
|
||||
noise_stability_delta < -0.001,
|
||||
contradiction_rate_delta > 0.001,
|
||||
rollback_delta < -0.001,
|
||||
accuracy_delta < -0.01,
|
||||
cost_efficiency_delta < -0.001,
|
||||
];
|
||||
|
||||
ContractDelta {
|
||||
solved_per_cost_delta,
|
||||
noise_stability_delta,
|
||||
contradiction_rate_delta,
|
||||
rollback_delta,
|
||||
accuracy_delta,
|
||||
cost_efficiency_delta,
|
||||
dimensions_improved: deltas.iter().filter(|&&d| d).count(),
|
||||
dimensions_regressed: regressions.iter().filter(|&&r| r).count(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn print(&self) {
|
||||
let arrow = |v: f64, invert: bool| {
|
||||
let positive = if invert { v < 0.0 } else { v > 0.0 };
|
||||
if positive {
|
||||
"+"
|
||||
} else if v == 0.0 {
|
||||
"="
|
||||
} else {
|
||||
"-"
|
||||
}
|
||||
};
|
||||
println!(" Contract Delta:");
|
||||
println!(
|
||||
" Solved/Cost: {:>+.4} [{}]",
|
||||
self.solved_per_cost_delta,
|
||||
arrow(self.solved_per_cost_delta, false)
|
||||
);
|
||||
println!(
|
||||
" Noise Stability: {:>+.4} [{}]",
|
||||
self.noise_stability_delta,
|
||||
arrow(self.noise_stability_delta, false)
|
||||
);
|
||||
println!(
|
||||
" Contradiction: {:>+.4} [{}]",
|
||||
self.contradiction_rate_delta,
|
||||
arrow(self.contradiction_rate_delta, true)
|
||||
);
|
||||
println!(
|
||||
" Rollback: {:>+.4} [{}]",
|
||||
self.rollback_delta,
|
||||
arrow(self.rollback_delta, false)
|
||||
);
|
||||
println!(
|
||||
" Accuracy: {:>+.4} [{}]",
|
||||
self.accuracy_delta,
|
||||
arrow(self.accuracy_delta, false)
|
||||
);
|
||||
println!(
|
||||
" Cost Efficiency: {:>+.4} [{}]",
|
||||
self.cost_efficiency_delta,
|
||||
arrow(self.cost_efficiency_delta, false)
|
||||
);
|
||||
println!(" Dimensions improved: {}/6", self.dimensions_improved);
|
||||
println!(" Dimensions regressed: {}/6", self.dimensions_regressed);
|
||||
}
|
||||
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Autonomy Ladder
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/// Autonomy level gated by sustained contract health.
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
|
||||
pub enum AutonomyLevel {
|
||||
/// Level 0: Read-only observation
|
||||
ReadOnly = 0,
|
||||
/// Level 1: Write to memory (store episodes)
|
||||
WriteMemory = 1,
|
||||
/// Level 2: Execute tools (run solver)
|
||||
ExecuteTools = 2,
|
||||
/// Level 3: Write to external systems (publish results)
|
||||
WriteExternal = 3,
|
||||
/// Level 4: Deploy and operate (self-directed improvement)
|
||||
DeployOperate = 4,
|
||||
}
|
||||
|
||||
/// Thresholds for advancing autonomy levels.
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct AutonomyGates {
|
||||
/// Minimum consecutive compliant cycles to advance
|
||||
pub min_compliant_cycles: usize,
|
||||
/// Maximum allowed contradiction rate per level
|
||||
pub max_contradiction_rate: [f64; 5],
|
||||
/// Minimum accuracy per level
|
||||
pub min_accuracy: [f64; 5],
|
||||
/// Minimum cost efficiency per level
|
||||
pub min_cost_efficiency: [f64; 5],
|
||||
/// Minimum noise stability per level
|
||||
pub min_noise_stability: [f64; 5],
|
||||
/// Must have zero policy violations for levels >= 2
|
||||
pub zero_violations_above: AutonomyLevel,
|
||||
}
|
||||
|
||||
impl Default for AutonomyGates {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
min_compliant_cycles: 3,
|
||||
// L0 L1 L2 L3 L4
|
||||
max_contradiction_rate: [1.0, 0.05, 0.02, 0.01, 0.005],
|
||||
min_accuracy: [0.0, 0.70, 0.85, 0.92, 0.96],
|
||||
min_cost_efficiency: [0.0, 0.20, 0.40, 0.60, 0.75],
|
||||
min_noise_stability: [0.0, 0.50, 0.65, 0.80, 0.90],
|
||||
zero_violations_above: AutonomyLevel::ExecuteTools,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Evaluator that determines current autonomy level from contract history.
|
||||
pub struct AutonomyEvaluator {
|
||||
pub gates: AutonomyGates,
|
||||
}
|
||||
|
||||
impl Default for AutonomyEvaluator {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
gates: AutonomyGates::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AutonomyEvaluator {
|
||||
/// Determine the highest autonomy level supported by the health history.
|
||||
/// `history` is ordered oldest-first.
|
||||
pub fn evaluate(&self, history: &[ContractHealth]) -> AutonomyLevel {
|
||||
if history.is_empty() {
|
||||
return AutonomyLevel::ReadOnly;
|
||||
}
|
||||
|
||||
let mut level = AutonomyLevel::ReadOnly;
|
||||
let levels = [
|
||||
AutonomyLevel::WriteMemory,
|
||||
AutonomyLevel::ExecuteTools,
|
||||
AutonomyLevel::WriteExternal,
|
||||
AutonomyLevel::DeployOperate,
|
||||
];
|
||||
|
||||
for &candidate in &levels {
|
||||
let idx = candidate as usize;
|
||||
let required = self.gates.min_compliant_cycles;
|
||||
|
||||
// Need enough recent history
|
||||
if history.len() < required {
|
||||
break;
|
||||
}
|
||||
|
||||
let recent = &history[history.len().saturating_sub(required)..];
|
||||
let all_pass = recent.iter().all(|h| {
|
||||
h.accuracy >= self.gates.min_accuracy[idx]
|
||||
&& h.contradiction_rate <= self.gates.max_contradiction_rate[idx]
|
||||
&& h.cost_efficiency >= self.gates.min_cost_efficiency[idx]
|
||||
&& h.noise_stability >= self.gates.min_noise_stability[idx]
|
||||
&& (candidate < self.gates.zero_violations_above || h.policy_violations == 0)
|
||||
});
|
||||
|
||||
if all_pass {
|
||||
level = candidate;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
level
|
||||
}
|
||||
|
||||
pub fn print_status(&self, level: AutonomyLevel, health: &ContractHealth) {
|
||||
let labels = [
|
||||
"Read-Only",
|
||||
"Write Memory",
|
||||
"Execute Tools",
|
||||
"Write External",
|
||||
"Deploy & Operate",
|
||||
];
|
||||
println!(
|
||||
" Autonomy Level: {} ({})",
|
||||
level as usize, labels[level as usize]
|
||||
);
|
||||
println!(" Gates for next level:");
|
||||
let next = (level as usize + 1).min(4);
|
||||
println!(
|
||||
" Accuracy: {:.0}% (need {:.0}%)",
|
||||
health.accuracy * 100.0,
|
||||
self.gates.min_accuracy[next] * 100.0
|
||||
);
|
||||
println!(
|
||||
" Contradiction: {:.3}% (need <{:.3}%)",
|
||||
health.contradiction_rate * 100.0,
|
||||
self.gates.max_contradiction_rate[next] * 100.0
|
||||
);
|
||||
println!(
|
||||
" Cost Eff: {:.0}% (need {:.0}%)",
|
||||
health.cost_efficiency * 100.0,
|
||||
self.gates.min_cost_efficiency[next] * 100.0
|
||||
);
|
||||
println!(
|
||||
" Noise Stab: {:.0}% (need {:.0}%)",
|
||||
health.noise_stability * 100.0,
|
||||
self.gates.min_noise_stability[next] * 100.0
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Viability Checklist
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/// The 5 viability checks that determine if the system is on an AGI trajectory.
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct ViabilityChecklist {
|
||||
/// Can replay runs and get identical grades
|
||||
pub deterministic_replay: bool,
|
||||
/// Improves utility over time without raising policy violations
|
||||
pub improving_without_violations: bool,
|
||||
/// Can roll back bad learning reliably
|
||||
pub reliable_rollback: bool,
|
||||
/// Can generate infinite novel tasks with automatic grading
|
||||
pub infinite_gradeable_tasks: bool,
|
||||
/// Cost per solve trending down over weeks
|
||||
pub cost_trending_down: bool,
|
||||
}
|
||||
|
||||
impl ViabilityChecklist {
|
||||
/// Evaluate from contract health history.
|
||||
pub fn evaluate(history: &[ContractHealth]) -> Self {
|
||||
// Deterministic replay: verified externally (always true in our harness)
|
||||
let deterministic_replay = true;
|
||||
|
||||
// Improving without violations: later health better than earlier, zero violations
|
||||
let improving_without_violations = if history.len() >= 2 {
|
||||
let first = &history[0];
|
||||
let last = &history[history.len() - 1];
|
||||
last.accuracy >= first.accuracy
|
||||
&& last.policy_violations == 0
|
||||
&& history.iter().all(|h| h.policy_violations == 0)
|
||||
} else {
|
||||
false
|
||||
};
|
||||
|
||||
// Reliable rollback: rollback correctness >= 80% when attempted
|
||||
let reliable_rollback = history.iter().all(|h| h.rollback_correctness >= 0.8);
|
||||
|
||||
// Infinite gradeable tasks: always true (PuzzleGenerator is unbounded)
|
||||
let infinite_gradeable_tasks = true;
|
||||
|
||||
// Cost trending down: solved_per_cost increases over time
|
||||
let cost_trending_down = if history.len() >= 3 {
|
||||
let first_third: f64 = history[..history.len() / 3]
|
||||
.iter()
|
||||
.map(|h| h.solved_per_cost)
|
||||
.sum::<f64>()
|
||||
/ (history.len() / 3) as f64;
|
||||
let last_third: f64 = history[history.len() * 2 / 3..]
|
||||
.iter()
|
||||
.map(|h| h.solved_per_cost)
|
||||
.sum::<f64>()
|
||||
/ (history.len() - history.len() * 2 / 3) as f64;
|
||||
last_third > first_third
|
||||
} else {
|
||||
false
|
||||
};
|
||||
|
||||
ViabilityChecklist {
|
||||
deterministic_replay,
|
||||
improving_without_violations,
|
||||
reliable_rollback,
|
||||
infinite_gradeable_tasks,
|
||||
cost_trending_down,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn all_pass(&self) -> bool {
|
||||
self.deterministic_replay
|
||||
&& self.improving_without_violations
|
||||
&& self.reliable_rollback
|
||||
&& self.infinite_gradeable_tasks
|
||||
&& self.cost_trending_down
|
||||
}
|
||||
|
||||
pub fn print(&self) {
|
||||
let check = |b: bool| if b { "PASS" } else { "FAIL" };
|
||||
println!(" Viability Checklist:");
|
||||
println!(
|
||||
" 1. Deterministic replay: {}",
|
||||
check(self.deterministic_replay)
|
||||
);
|
||||
println!(
|
||||
" 2. Improving w/o violations: {}",
|
||||
check(self.improving_without_violations)
|
||||
);
|
||||
println!(
|
||||
" 3. Reliable rollback: {}",
|
||||
check(self.reliable_rollback)
|
||||
);
|
||||
println!(
|
||||
" 4. Infinite gradeable tasks: {}",
|
||||
check(self.infinite_gradeable_tasks)
|
||||
);
|
||||
println!(
|
||||
" 5. Cost trending down: {}",
|
||||
check(self.cost_trending_down)
|
||||
);
|
||||
println!(
|
||||
" Overall: {}",
|
||||
if self.all_pass() {
|
||||
"VIABLE AGI TRAJECTORY"
|
||||
} else {
|
||||
"NOT YET VIABLE"
|
||||
}
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Tests
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn contract_health_from_raw() {
|
||||
let mut raw = RawMetrics::default();
|
||||
raw.tasks_attempted = 100;
|
||||
raw.tasks_completed = 95;
|
||||
raw.tasks_correct = 92;
|
||||
raw.total_steps = 600;
|
||||
raw.noise_tasks_attempted = 30;
|
||||
raw.noise_tasks_correct = 25;
|
||||
raw.contradictions = 0; // zero contradictions for compliance
|
||||
raw.rollback_attempts = 5;
|
||||
raw.rollback_successes = 4;
|
||||
|
||||
let health = ContractHealth::from_raw(&raw);
|
||||
assert!((health.accuracy - 0.92).abs() < 0.01);
|
||||
assert!((health.solved_per_cost - 92.0 / 600.0).abs() < 0.01);
|
||||
assert!((health.noise_stability - 25.0 / 30.0).abs() < 0.01);
|
||||
assert!((health.contradiction_rate).abs() < 0.001);
|
||||
assert!((health.rollback_correctness - 0.8).abs() < 0.01);
|
||||
assert!(health.compliant); // 0 violations, 0% contradictions, >=90% accuracy
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn contract_delta_detects_improvement() {
|
||||
let before = ContractHealth {
|
||||
solved_per_cost: 0.10,
|
||||
noise_stability: 0.70,
|
||||
contradiction_rate: 0.03,
|
||||
rollback_correctness: 0.80,
|
||||
policy_violations: 0,
|
||||
accuracy: 0.85,
|
||||
cost_efficiency: 0.50,
|
||||
compliant: false,
|
||||
};
|
||||
let after = ContractHealth {
|
||||
solved_per_cost: 0.15,
|
||||
noise_stability: 0.85,
|
||||
contradiction_rate: 0.01,
|
||||
rollback_correctness: 0.90,
|
||||
policy_violations: 0,
|
||||
accuracy: 0.93,
|
||||
cost_efficiency: 0.70,
|
||||
compliant: true,
|
||||
};
|
||||
let delta = ContractDelta::between(&before, &after);
|
||||
assert_eq!(delta.dimensions_improved, 6);
|
||||
assert_eq!(delta.dimensions_regressed, 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn autonomy_ladder_advances() {
|
||||
let evaluator = AutonomyEvaluator::default();
|
||||
|
||||
// No history => ReadOnly
|
||||
assert_eq!(evaluator.evaluate(&[]), AutonomyLevel::ReadOnly);
|
||||
|
||||
// 3 compliant cycles at L1 level
|
||||
let h = ContractHealth {
|
||||
solved_per_cost: 0.15,
|
||||
noise_stability: 0.55,
|
||||
contradiction_rate: 0.04,
|
||||
rollback_correctness: 1.0,
|
||||
policy_violations: 0,
|
||||
accuracy: 0.75,
|
||||
cost_efficiency: 0.30,
|
||||
compliant: true,
|
||||
};
|
||||
let history = vec![h.clone(), h.clone(), h.clone()];
|
||||
assert_eq!(evaluator.evaluate(&history), AutonomyLevel::WriteMemory);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn viability_checklist_basic() {
|
||||
let h1 = ContractHealth {
|
||||
solved_per_cost: 0.10,
|
||||
noise_stability: 0.70,
|
||||
contradiction_rate: 0.01,
|
||||
rollback_correctness: 0.90,
|
||||
policy_violations: 0,
|
||||
accuracy: 0.85,
|
||||
cost_efficiency: 0.50,
|
||||
compliant: true,
|
||||
};
|
||||
let h2 = ContractHealth {
|
||||
solved_per_cost: 0.12,
|
||||
noise_stability: 0.80,
|
||||
contradiction_rate: 0.005,
|
||||
rollback_correctness: 0.95,
|
||||
policy_violations: 0,
|
||||
accuracy: 0.90,
|
||||
cost_efficiency: 0.60,
|
||||
compliant: true,
|
||||
};
|
||||
let h3 = ContractHealth {
|
||||
solved_per_cost: 0.15,
|
||||
noise_stability: 0.85,
|
||||
contradiction_rate: 0.002,
|
||||
rollback_correctness: 0.95,
|
||||
policy_violations: 0,
|
||||
accuracy: 0.93,
|
||||
cost_efficiency: 0.70,
|
||||
compliant: true,
|
||||
};
|
||||
let viability = ViabilityChecklist::evaluate(&[h1, h2, h3]);
|
||||
assert!(viability.deterministic_replay);
|
||||
assert!(viability.improving_without_violations);
|
||||
assert!(viability.reliable_rollback);
|
||||
assert!(viability.infinite_gradeable_tasks);
|
||||
assert!(viability.cost_trending_down);
|
||||
assert!(viability.all_pass());
|
||||
}
|
||||
}
|
||||
166
vendor/ruvector/examples/benchmarks/src/bin/acceptance_rvf.rs
vendored
Normal file
166
vendor/ruvector/examples/benchmarks/src/bin/acceptance_rvf.rs
vendored
Normal file
@@ -0,0 +1,166 @@
|
||||
//! Publishable RVF Acceptance Test — CLI entry point.
|
||||
//!
|
||||
//! Generates or verifies a deterministic acceptance test manifest with
|
||||
//! SHAKE-256 witness chain (rvf-crypto native). Same seed → same outcomes
|
||||
//! → same root hash.
|
||||
//!
|
||||
//! ```bash
|
||||
//! # Generate manifest (JSON + .rvf binary)
|
||||
//! cargo run --bin acceptance-rvf -- generate -o manifest.json
|
||||
//!
|
||||
//! # Generate with custom config
|
||||
//! cargo run --bin acceptance-rvf -- generate -o manifest.json \
|
||||
//! --holdout 200 --training 200 --cycles 5
|
||||
//!
|
||||
//! # Verify a manifest (re-runs and compares root hash)
|
||||
//! cargo run --bin acceptance-rvf -- verify -i manifest.json
|
||||
//!
|
||||
//! # Verify the .rvf binary witness chain
|
||||
//! cargo run --bin acceptance-rvf -- verify-rvf -i acceptance_manifest.rvf
|
||||
//! ```
|
||||
|
||||
use clap::{Parser, Subcommand};
|
||||
use ruvector_benchmarks::acceptance_test::HoldoutConfig;
|
||||
use ruvector_benchmarks::publishable_rvf::{
|
||||
generate_manifest_with_rvf, verify_manifest, verify_rvf_binary,
|
||||
};
|
||||
|
||||
#[derive(Parser)]
|
||||
#[command(name = "acceptance-rvf")]
|
||||
#[command(about = "Publishable RVF acceptance test with SHAKE-256 witness chain")]
|
||||
struct Cli {
|
||||
#[command(subcommand)]
|
||||
command: Commands,
|
||||
}
|
||||
|
||||
#[derive(Subcommand)]
|
||||
enum Commands {
|
||||
/// Generate a new acceptance test manifest (JSON + .rvf binary)
|
||||
Generate {
|
||||
/// Output JSON file path
|
||||
#[arg(short, long, default_value = "acceptance_manifest.json")]
|
||||
output: String,
|
||||
|
||||
/// Holdout set size
|
||||
#[arg(long, default_value_t = 200)]
|
||||
holdout: usize,
|
||||
|
||||
/// Training puzzles per cycle
|
||||
#[arg(long, default_value_t = 200)]
|
||||
training: usize,
|
||||
|
||||
/// Number of training cycles
|
||||
#[arg(long, default_value_t = 5)]
|
||||
cycles: usize,
|
||||
|
||||
/// Step budget per puzzle
|
||||
#[arg(long, default_value_t = 400)]
|
||||
budget: usize,
|
||||
|
||||
/// Verbose output
|
||||
#[arg(short, long)]
|
||||
verbose: bool,
|
||||
},
|
||||
/// Verify an existing manifest by replaying and comparing root hash
|
||||
Verify {
|
||||
/// Input JSON file path
|
||||
#[arg(short, long)]
|
||||
input: String,
|
||||
},
|
||||
/// Verify a native .rvf binary witness chain
|
||||
VerifyRvf {
|
||||
/// Input .rvf file path
|
||||
#[arg(short, long)]
|
||||
input: String,
|
||||
},
|
||||
}
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
let cli = Cli::parse();
|
||||
|
||||
match cli.command {
|
||||
Commands::Generate {
|
||||
output,
|
||||
holdout,
|
||||
training,
|
||||
cycles,
|
||||
budget,
|
||||
verbose,
|
||||
} => {
|
||||
let config = HoldoutConfig {
|
||||
holdout_size: holdout,
|
||||
training_per_cycle: training,
|
||||
cycles,
|
||||
step_budget: budget,
|
||||
min_accuracy: 0.50,
|
||||
min_dimensions_improved: 1,
|
||||
verbose,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
// Derive .rvf path from JSON output path
|
||||
let rvf_path = output.replace(".json", ".rvf");
|
||||
|
||||
println!("Generating acceptance test manifest...");
|
||||
println!(
|
||||
" holdout={}, training={}, cycles={}, budget={}",
|
||||
holdout, training, cycles, budget
|
||||
);
|
||||
println!();
|
||||
|
||||
let manifest = generate_manifest_with_rvf(&config, Some(&rvf_path))?;
|
||||
manifest.print_summary();
|
||||
|
||||
let json = serde_json::to_string_pretty(&manifest)?;
|
||||
std::fs::write(&output, &json)?;
|
||||
println!(" JSON manifest: {}", output);
|
||||
println!(" RVF binary: {}", rvf_path);
|
||||
println!(" Chain root hash: {}", manifest.chain_root_hash);
|
||||
println!();
|
||||
|
||||
if manifest.all_passed {
|
||||
std::process::exit(0);
|
||||
} else {
|
||||
std::process::exit(1);
|
||||
}
|
||||
}
|
||||
Commands::Verify { input } => {
|
||||
println!("Loading manifest from: {}", input);
|
||||
let json = std::fs::read_to_string(&input)?;
|
||||
let manifest: ruvector_benchmarks::publishable_rvf::RvfManifest =
|
||||
serde_json::from_str(&json)?;
|
||||
|
||||
println!(" Chain length: {}", manifest.chain_length);
|
||||
println!(
|
||||
" Expected root: {}",
|
||||
&manifest.chain_root_hash[..32.min(manifest.chain_root_hash.len())]
|
||||
);
|
||||
println!();
|
||||
println!("Re-running acceptance test with same config...");
|
||||
|
||||
let result = verify_manifest(&manifest)?;
|
||||
result.print();
|
||||
|
||||
if result.passed() {
|
||||
println!(" VERIFICATION: PASSED — outcomes are identical");
|
||||
std::process::exit(0);
|
||||
} else {
|
||||
println!(" VERIFICATION: FAILED — outcomes differ");
|
||||
std::process::exit(1);
|
||||
}
|
||||
}
|
||||
Commands::VerifyRvf { input } => {
|
||||
println!("Verifying .rvf witness chain: {}", input);
|
||||
match verify_rvf_binary(&input) {
|
||||
Ok(count) => {
|
||||
println!(" WITNESS_SEG verified: {} entries, chain intact", count);
|
||||
std::process::exit(0);
|
||||
}
|
||||
Err(e) => {
|
||||
println!(" VERIFICATION FAILED: {}", e);
|
||||
std::process::exit(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
204
vendor/ruvector/examples/benchmarks/src/bin/agi_proof_harness.rs
vendored
Normal file
204
vendor/ruvector/examples/benchmarks/src/bin/agi_proof_harness.rs
vendored
Normal file
@@ -0,0 +1,204 @@
|
||||
//! AGI Proof Harness — Nightly runner that publishes contract metrics.
|
||||
//!
|
||||
//! Publishes:
|
||||
//! - Success rate
|
||||
//! - Cost per solve
|
||||
//! - Robustness under noise
|
||||
//! - Policy compliance
|
||||
//! - Contradiction rate
|
||||
//! - Rollback correctness
|
||||
//! - Viability checklist status
|
||||
//! - Autonomy level
|
||||
//!
|
||||
//! Usage:
|
||||
//! cargo run --bin agi-proof-harness
|
||||
//! cargo run --bin agi-proof-harness -- --holdout 1000 --cycles 10 --verbose
|
||||
//! cargo run --bin agi-proof-harness -- --full # 10K training, 1K holdout, 10 cycles
|
||||
|
||||
use anyhow::Result;
|
||||
use clap::Parser;
|
||||
use ruvector_benchmarks::acceptance_test::{
|
||||
run_ablation_comparison, run_acceptance_test, HoldoutConfig,
|
||||
};
|
||||
use ruvector_benchmarks::agi_contract::{AutonomyEvaluator, ContractHealth, ViabilityChecklist};
|
||||
use ruvector_benchmarks::intelligence_metrics::IntelligenceCalculator;
|
||||
use ruvector_benchmarks::superintelligence::{run_pathway, SIConfig};
|
||||
|
||||
// CLI for the nightly AGI proof harness. Field doc comments double as clap
// help text, so they are kept verbatim; reviewer notes are plain `//`.
#[derive(Parser, Debug)]
#[command(name = "agi-proof-harness")]
#[command(about = "AGI contract proof harness — publishes nightly metrics")]
struct Args {
    /// Holdout evaluation set size
    #[arg(long, default_value = "200")]
    holdout: usize,

    /// Training tasks per cycle
    #[arg(long, default_value = "200")]
    training: usize,

    /// Number of improvement cycles
    #[arg(long, default_value = "5")]
    cycles: usize,

    // 3735928559 == 0xDEADBEEF; kept stable so the holdout set never drifts
    // between nightly runs.
    /// Frozen holdout seed
    #[arg(long, default_value = "3735928559")]
    holdout_seed: u64,

    /// Training seed
    #[arg(long, default_value = "42")]
    training_seed: u64,

    /// Noise injection rate
    #[arg(long, default_value = "0.25")]
    noise: f64,

    /// Step budget per task
    #[arg(long, default_value = "400")]
    step_budget: usize,

    // NOTE(review): the module docs advertise "--full # 10K training", but
    // main() builds the full config with training_per_cycle = 1000 — confirm
    // which of the two is intended.
    /// Full acceptance test (10K training, 1K holdout, 10 cycles)
    #[arg(long)]
    full: bool,

    /// Minimum accuracy threshold
    #[arg(long, default_value = "0.80")]
    min_accuracy: f64,

    /// Run three-mode ablation comparison (A/B/C)
    #[arg(long)]
    ablation: bool,

    /// Also run the 5-level SI pathway
    #[arg(long)]
    pathway: bool,

    /// Verbose output
    #[arg(short, long)]
    verbose: bool,
}
|
||||
|
||||
/// Entry point: builds a `HoldoutConfig` from the CLI (or the hard-coded
/// "--full" profile), runs the acceptance test, then optionally the A/B/C
/// ablation, the contract-health / autonomy / viability reports, and the
/// 5-level SI pathway.
fn main() -> Result<()> {
    let args = Args::parse();

    println!();
    println!("╔══════════════════════════════════════════════════════════════╗");
    println!("║ AGI PROOF HARNESS ║");
    println!("║ Contract-based intelligence measurement ║");
    println!("╚══════════════════════════════════════════════════════════════╝");
    println!();

    // "--full" pins size/cycle/accuracy parameters for the nightly run;
    // otherwise every knob comes from the CLI. Seeds and noise are shared by
    // both profiles so results stay comparable.
    // NOTE(review): module docs say full mode is 10K training — here it is
    // 1000; confirm which is intended.
    let config = if args.full {
        HoldoutConfig {
            holdout_size: 1000,
            training_per_cycle: 1000,
            cycles: 10,
            holdout_seed: args.holdout_seed,
            training_seed: args.training_seed,
            noise_rate: args.noise,
            step_budget: args.step_budget,
            min_accuracy: 0.95,
            min_dimensions_improved: 2,
            verbose: args.verbose,
        }
    } else {
        HoldoutConfig {
            holdout_size: args.holdout,
            training_per_cycle: args.training,
            cycles: args.cycles,
            holdout_seed: args.holdout_seed,
            training_seed: args.training_seed,
            noise_rate: args.noise,
            step_budget: args.step_budget,
            min_accuracy: args.min_accuracy,
            min_dimensions_improved: 2,
            verbose: args.verbose,
        }
    };

    println!(
        " Config: holdout={}, training/cycle={}, cycles={}, noise={:.0}%",
        config.holdout_size,
        config.training_per_cycle,
        config.cycles,
        config.noise_rate * 100.0
    );
    println!(
        " Seeds: holdout=0x{:X}, training={}",
        config.holdout_seed, config.training_seed
    );
    println!();

    // ─── Run Acceptance Test ─────────────────────────────────────────
    println!(" Running acceptance test...");
    let result = run_acceptance_test(&config)?;
    result.print();

    // ─── Ablation Comparison ─────────────────────────────────────────
    if args.ablation {
        println!(" Running ablation comparison (A / B / C)...");
        let comparison = run_ablation_comparison(&config)?;
        comparison.print();
    }

    // ─── Contract Health Summary ─────────────────────────────────────
    // Reported only when at least one cycle ran; the last cycle's health is
    // treated as the current state.
    if let Some(last_cycle) = result.cycles.last() {
        println!();
        last_cycle.contract_health.print();

        // ─── Autonomy Level ──────────────────────────────────────────
        // Autonomy is judged over the whole history of cycle healths, not
        // just the final one.
        let health_history: Vec<ContractHealth> = result
            .cycles
            .iter()
            .map(|c| c.contract_health.clone())
            .collect();
        let evaluator = AutonomyEvaluator::default();
        let level = evaluator.evaluate(&health_history);
        println!();
        evaluator.print_status(level, &last_cycle.contract_health);

        // ─── Viability Checklist ─────────────────────────────────────
        let viability = ViabilityChecklist::evaluate(&health_history);
        println!();
        viability.print();
    }

    // ─── Optional: SI Pathway ────────────────────────────────────────
    // A deliberately small pathway run (6 eps x 15 tasks) — the full pathway
    // has its own dedicated binary.
    if args.pathway {
        println!();
        println!(" Running 5-level SI pathway...");
        let si_config = SIConfig {
            episodes_per_level: 6,
            tasks_per_episode: 15,
            verbose: args.verbose,
            ..Default::default()
        };
        let pathway_result = run_pathway(&si_config)?;
        pathway_result.print();

        // Show contract health for peak level
        // partial_cmp().unwrap() panics only if an iq_score is NaN.
        if let Some(peak) = pathway_result
            .levels
            .iter()
            .max_by(|a, b| a.iq_score.partial_cmp(&b.iq_score).unwrap())
        {
            let health = ContractHealth::from_raw(&peak.raw_metrics);
            println!(" Peak Level ({}) Contract:", peak.name);
            health.print();

            let calculator = IntelligenceCalculator::default();
            let assessment = calculator.calculate(&peak.raw_metrics);
            println!(" Multi-dimensional IQ: {:.1}", assessment.overall_score);
            println!(
                " Cost efficiency: {:.2}",
                assessment.cost.cost_efficiency
            );
            println!(
                " Robustness score: {:.2}",
                assessment.robustness.robustness_score
            );
        }
    }

    println!();
    Ok(())
}
|
||||
355
vendor/ruvector/examples/benchmarks/src/bin/intelligence_assessment.rs
vendored
Normal file
355
vendor/ruvector/examples/benchmarks/src/bin/intelligence_assessment.rs
vendored
Normal file
@@ -0,0 +1,355 @@
|
||||
//! Intelligence Assessment Runner
|
||||
//!
|
||||
//! Runs comprehensive intelligence assessment across all benchmark types.
|
||||
//!
|
||||
//! Usage:
|
||||
//! cargo run --bin intelligence-assessment -- --episodes 10 --puzzles 50
|
||||
|
||||
use anyhow::Result;
|
||||
use clap::Parser;
|
||||
use ruvector_benchmarks::{
|
||||
intelligence_metrics::{
|
||||
print_intelligence_report, DifficultyStats, EpisodeMetrics, IntelligenceCalculator,
|
||||
RawMetrics,
|
||||
},
|
||||
swarm_regret::SwarmController,
|
||||
temporal::{AdaptiveSolver, TemporalSolver},
|
||||
timepuzzles::{PuzzleGenerator, PuzzleGeneratorConfig},
|
||||
};
|
||||
|
||||
// CLI for the intelligence-assessment runner. Field doc comments double as
// clap help text, so they are kept verbatim; reviewer notes are plain `//`.
#[derive(Parser, Debug)]
#[command(name = "intelligence-assessment")]
#[command(about = "Run comprehensive intelligence assessment")]
struct Args {
    /// Number of episodes for regret tracking
    #[arg(short, long, default_value = "10")]
    episodes: usize,

    /// Tasks per episode
    #[arg(short, long, default_value = "10")]
    tasks_per_episode: usize,

    // NOTE(review): with clap 4, a `bool` flag whose default_value is "true"
    // generally cannot be switched off from the command line (the flag's
    // action only sets it to true) — consider `ArgAction` or `Option<bool>`.
    /// Enable calendar tool
    #[arg(long, default_value = "true")]
    calendar: bool,

    // Same clap-4 bool/default_value caveat as `calendar` above.
    /// Enable adaptive learning (ReasoningBank)
    #[arg(long, default_value = "true")]
    adaptive: bool,

    // None => the puzzle generator picks its own (non-reproducible) seed.
    /// Random seed
    #[arg(long)]
    seed: Option<u64>,

    /// Verbose output
    #[arg(short, long)]
    verbose: bool,
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let args = Args::parse();
|
||||
|
||||
println!("╔══════════════════════════════════════════════════════════════╗");
|
||||
println!("║ Comprehensive Intelligence Assessment ║");
|
||||
println!("║ Measuring Reasoning, Learning & Cognitive Abilities ║");
|
||||
println!("╚══════════════════════════════════════════════════════════════╝");
|
||||
println!();
|
||||
|
||||
// Initialize metrics collector
|
||||
let mut raw_metrics = RawMetrics::default();
|
||||
|
||||
// Initialize components
|
||||
let mut controller = SwarmController::new(args.tasks_per_episode);
|
||||
|
||||
// Choose solver based on adaptive flag
|
||||
let mut adaptive_solver = if args.adaptive {
|
||||
Some(AdaptiveSolver::new())
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let mut basic_solver = if !args.adaptive {
|
||||
let mut s = TemporalSolver::with_tools(args.calendar, false);
|
||||
s.max_steps = 100;
|
||||
Some(s)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let puzzle_config = PuzzleGeneratorConfig {
|
||||
min_difficulty: 1,
|
||||
max_difficulty: 10,
|
||||
constraint_density: 3,
|
||||
seed: args.seed,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
println!("🔧 Configuration:");
|
||||
println!(" Episodes: {}", args.episodes);
|
||||
println!(" Tasks/episode: {}", args.tasks_per_episode);
|
||||
println!(" Calendar tool: {}", args.calendar);
|
||||
println!(" Adaptive learning:{}", args.adaptive);
|
||||
println!();
|
||||
|
||||
println!("🏃 Running assessment...");
|
||||
println!();
|
||||
|
||||
// Run episodes
|
||||
for ep in 0..args.episodes {
|
||||
controller.start_episode();
|
||||
|
||||
// Generate puzzles for this episode
|
||||
let mut generator = PuzzleGenerator::new(puzzle_config.clone());
|
||||
let puzzles = generator.generate_batch(args.tasks_per_episode)?;
|
||||
|
||||
let mut solved = 0;
|
||||
let mut correct = 0;
|
||||
let mut total_steps = 0;
|
||||
let mut total_tool_calls = 0;
|
||||
let mut total_latency = 0u64;
|
||||
|
||||
// Solve puzzles and collect metrics
|
||||
for puzzle in &puzzles {
|
||||
raw_metrics.tasks_attempted += 1;
|
||||
|
||||
// Use adaptive or basic solver
|
||||
let result = if let Some(ref mut solver) = adaptive_solver {
|
||||
solver.solve(puzzle)?
|
||||
} else if let Some(ref mut solver) = basic_solver {
|
||||
solver.solve(puzzle)?
|
||||
} else {
|
||||
unreachable!()
|
||||
};
|
||||
|
||||
if result.solved {
|
||||
solved += 1;
|
||||
raw_metrics.tasks_completed += 1;
|
||||
}
|
||||
if result.correct {
|
||||
correct += 1;
|
||||
raw_metrics.tasks_correct += 1;
|
||||
}
|
||||
|
||||
total_steps += result.steps;
|
||||
total_tool_calls += result.tool_calls;
|
||||
total_latency += result.latency_ms;
|
||||
|
||||
raw_metrics.total_steps += result.steps;
|
||||
raw_metrics.total_tool_calls += result.tool_calls;
|
||||
raw_metrics.total_latency_ms += result.latency_ms;
|
||||
|
||||
// Track by difficulty
|
||||
let entry = raw_metrics
|
||||
.by_difficulty
|
||||
.entry(puzzle.difficulty)
|
||||
.or_insert(DifficultyStats {
|
||||
attempted: 0,
|
||||
completed: 0,
|
||||
correct: 0,
|
||||
avg_steps: 0.0,
|
||||
});
|
||||
entry.attempted += 1;
|
||||
if result.solved {
|
||||
entry.completed += 1;
|
||||
}
|
||||
if result.correct {
|
||||
entry.correct += 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Record episode for swarm controller
|
||||
controller.complete_episode(
|
||||
solved,
|
||||
correct,
|
||||
total_steps,
|
||||
total_tool_calls,
|
||||
total_latency,
|
||||
);
|
||||
|
||||
// Record episode metrics
|
||||
let episode_accuracy = if args.tasks_per_episode > 0 {
|
||||
correct as f64 / args.tasks_per_episode as f64
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
let last_ep = controller.regret.episodes.last().unwrap();
|
||||
raw_metrics.episodes.push(EpisodeMetrics {
|
||||
episode: ep + 1,
|
||||
accuracy: episode_accuracy,
|
||||
reward: last_ep.reward,
|
||||
regret: last_ep.regret(),
|
||||
cumulative_regret: controller.regret.current_cumulative_regret(),
|
||||
});
|
||||
|
||||
if args.verbose {
|
||||
println!(
|
||||
" Episode {:2}: Accuracy {:.1}%, Regret {:.2}",
|
||||
ep + 1,
|
||||
episode_accuracy * 100.0,
|
||||
last_ep.regret()
|
||||
);
|
||||
} else {
|
||||
print!(".");
|
||||
use std::io::Write;
|
||||
std::io::stdout().flush()?;
|
||||
}
|
||||
}
|
||||
|
||||
if !args.verbose {
|
||||
println!();
|
||||
}
|
||||
println!();
|
||||
|
||||
// Update difficulty stats with average steps
|
||||
for (_, stats) in raw_metrics.by_difficulty.iter_mut() {
|
||||
if stats.attempted > 0 {
|
||||
// This is a simplification - we'd need to track this properly
|
||||
stats.avg_steps = raw_metrics.total_steps as f64 / raw_metrics.tasks_attempted as f64;
|
||||
}
|
||||
}
|
||||
|
||||
// Calculate intelligence assessment
|
||||
let calculator = IntelligenceCalculator::default();
|
||||
let assessment = calculator.calculate(&raw_metrics);
|
||||
|
||||
// Print report
|
||||
print_intelligence_report(&assessment);
|
||||
|
||||
// Additional insights
|
||||
println!();
|
||||
println!("╔══════════════════════════════════════════════════════════════╗");
|
||||
println!("║ Performance Summary ║");
|
||||
println!("╚══════════════════════════════════════════════════════════════╝");
|
||||
println!();
|
||||
|
||||
println!("📊 Task Performance:");
|
||||
println!(" Tasks Attempted: {}", raw_metrics.tasks_attempted);
|
||||
println!(" Tasks Completed: {}", raw_metrics.tasks_completed);
|
||||
println!(" Tasks Correct: {}", raw_metrics.tasks_correct);
|
||||
println!(
|
||||
" Overall Accuracy: {:.1}%",
|
||||
raw_metrics.tasks_correct as f64 / raw_metrics.tasks_attempted as f64 * 100.0
|
||||
);
|
||||
println!();
|
||||
|
||||
println!("📈 Learning Progress:");
|
||||
let regret_summary = controller.regret.summary();
|
||||
println!(" Cumulative Regret: {:.2}", regret_summary.total_regret);
|
||||
println!(" Average Regret: {:.4}", regret_summary.average_regret);
|
||||
println!(
|
||||
" Sublinear: {}",
|
||||
if regret_summary.is_sublinear {
|
||||
"Yes ✓"
|
||||
} else {
|
||||
"No ✗"
|
||||
}
|
||||
);
|
||||
println!(
|
||||
" Regret Trend: {:.4} ({})",
|
||||
regret_summary.regret_trend,
|
||||
if regret_summary.regret_trend < 0.0 {
|
||||
"decreasing ✓"
|
||||
} else {
|
||||
"increasing ✗"
|
||||
}
|
||||
);
|
||||
println!();
|
||||
|
||||
// Grade the overall performance
|
||||
let grade = if assessment.overall_score >= 90.0 {
|
||||
"A+ (Excellent)"
|
||||
} else if assessment.overall_score >= 80.0 {
|
||||
"A (Very Good)"
|
||||
} else if assessment.overall_score >= 70.0 {
|
||||
"B (Good)"
|
||||
} else if assessment.overall_score >= 60.0 {
|
||||
"C (Adequate)"
|
||||
} else if assessment.overall_score >= 50.0 {
|
||||
"D (Below Average)"
|
||||
} else {
|
||||
"F (Needs Improvement)"
|
||||
};
|
||||
|
||||
println!("🎯 Final Grade: {}", grade);
|
||||
println!();
|
||||
|
||||
// Recommendations
|
||||
println!("💡 Recommendations:");
|
||||
if assessment.capabilities.temporal_reasoning < 70.0 {
|
||||
println!(" • Improve temporal reasoning with more constraint examples");
|
||||
}
|
||||
if assessment.learning.regret_sublinearity < 0.5 {
|
||||
println!(" • Increase episodes to achieve sublinear regret");
|
||||
}
|
||||
if assessment.tool_use.utilization_effectiveness < 0.7 {
|
||||
println!(" • Better tool selection needed for complex tasks");
|
||||
}
|
||||
if assessment.meta_cognition.strategy_adaptation < 0.5 {
|
||||
println!(" • Enable adaptive strategy switching");
|
||||
}
|
||||
if assessment.overall_score >= 70.0 {
|
||||
println!(" • Good performance! Consider harder difficulty levels");
|
||||
}
|
||||
|
||||
// Show adaptive learning progress if enabled
|
||||
if let Some(ref solver) = adaptive_solver {
|
||||
println!();
|
||||
println!("╔══════════════════════════════════════════════════════════════╗");
|
||||
println!("║ Adaptive Learning Progress ║");
|
||||
println!("╚══════════════════════════════════════════════════════════════╝");
|
||||
println!();
|
||||
|
||||
let progress = solver.learning_progress();
|
||||
println!("🧠 ReasoningBank Statistics:");
|
||||
println!(" Total trajectories: {}", progress.total_trajectories);
|
||||
println!(
|
||||
" Success rate: {:.1}%",
|
||||
progress.success_rate * 100.0
|
||||
);
|
||||
println!(" Improvement rate: {:.4}", progress.improvement_rate);
|
||||
println!(" Patterns learned: {}", progress.patterns_learned);
|
||||
println!(" Strategies tried: {}", progress.strategies_tried);
|
||||
println!(
|
||||
" Is improving: {}",
|
||||
if progress.is_improving {
|
||||
"Yes ✓"
|
||||
} else {
|
||||
"No ✗"
|
||||
}
|
||||
);
|
||||
|
||||
// Show learned patterns
|
||||
if !solver.reasoning_bank.patterns.is_empty() {
|
||||
println!();
|
||||
println!("📚 Learned Patterns:");
|
||||
for (constraint_type, patterns) in &solver.reasoning_bank.patterns {
|
||||
for p in patterns.iter().filter(|p| p.observations >= 3) {
|
||||
println!(
|
||||
" • {}: {} strategy ({:.0}% success, {} obs)",
|
||||
constraint_type,
|
||||
p.best_strategy,
|
||||
p.success_rate * 100.0,
|
||||
p.observations
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Show strategy stats
|
||||
if !solver.reasoning_bank.strategy_stats.is_empty() {
|
||||
println!();
|
||||
println!("📊 Strategy Performance:");
|
||||
for (strategy, stats) in &solver.reasoning_bank.strategy_stats {
|
||||
println!(
|
||||
" • {}: {:.1}% success ({} attempts, {:.1} avg steps)",
|
||||
strategy,
|
||||
stats.success_rate() * 100.0,
|
||||
stats.attempts,
|
||||
stats.avg_steps()
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
180
vendor/ruvector/examples/benchmarks/src/bin/rvf_intelligence_bench.rs
vendored
Normal file
180
vendor/ruvector/examples/benchmarks/src/bin/rvf_intelligence_bench.rs
vendored
Normal file
@@ -0,0 +1,180 @@
|
||||
//! RVF Intelligence Benchmark Runner
|
||||
//!
|
||||
//! Runs head-to-head comparison across 6 intelligence verticals:
|
||||
//! Baseline (no learning) vs. RVF-Learning (full pipeline).
|
||||
//!
|
||||
//! Usage:
|
||||
//! cargo run --bin rvf-intelligence-bench -- --episodes 15 --tasks 25 --verbose
|
||||
//! cargo run --bin rvf-intelligence-bench -- --noise 0.4 --step-budget 300
|
||||
|
||||
use anyhow::Result;
|
||||
use clap::Parser;
|
||||
use ruvector_benchmarks::intelligence_metrics::IntelligenceCalculator;
|
||||
use ruvector_benchmarks::rvf_intelligence_bench::{run_comparison, BenchmarkConfig};
|
||||
|
||||
// CLI for the RVF intelligence benchmark. Field doc comments double as clap
// help text, so they are kept verbatim; reviewer notes are plain `//`.
#[derive(Parser, Debug)]
#[command(name = "rvf-intelligence-bench")]
#[command(about = "Benchmark intelligence with and without RVF learning across 6 verticals")]
struct Args {
    /// Number of episodes per mode
    #[arg(short, long, default_value = "10")]
    episodes: usize,

    /// Tasks per episode
    #[arg(short, long, default_value = "20")]
    tasks: usize,

    /// Minimum difficulty (1-10)
    #[arg(long, default_value = "1")]
    min_diff: u8,

    /// Maximum difficulty (1-10)
    #[arg(long, default_value = "10")]
    max_diff: u8,

    /// Random seed for reproducibility
    #[arg(long, default_value = "42")]
    seed: u64,

    /// Noise probability (0.0-1.0)
    #[arg(long, default_value = "0.25")]
    noise: f64,

    /// Step budget per episode
    #[arg(long, default_value = "400")]
    step_budget: usize,

    /// Max retries for error recovery (RVF only)
    #[arg(long, default_value = "2")]
    max_retries: usize,

    /// Retention fraction (0.0-1.0)
    #[arg(long, default_value = "0.15")]
    retention: f64,

    // Budgets below only constrain the RVF-learning mode; the baseline mode
    // ignores them (see BenchmarkConfig usage in main).
    /// Token budget per episode (RVF mode)
    #[arg(long, default_value = "200000")]
    token_budget: u32,

    /// Tool call budget per episode (RVF mode)
    #[arg(long, default_value = "50")]
    tool_budget: u16,

    /// Verbose per-episode output
    #[arg(short, long)]
    verbose: bool,
}
|
||||
|
||||
/// Entry point: prints the run configuration, executes the baseline vs.
/// RVF-learning comparison, then prints a per-mode intelligence assessment
/// and a final IQ delta with a qualitative verdict. Exit is always Ok; the
/// verdict is informational only.
fn main() -> Result<()> {
    let args = Args::parse();

    println!();
    println!("================================================================");
    println!(" RVF Intelligence Benchmark v2 — Six Verticals");
    println!(" Baseline vs. RVF-Learning (noise + step limits + retry + transfer)");
    println!("================================================================");
    println!();
    println!(" Configuration:");
    println!(" Episodes: {}", args.episodes);
    println!(" Tasks/episode: {}", args.tasks);
    println!(" Difficulty: {}-{}", args.min_diff, args.max_diff);
    println!(" Seed: {}", args.seed);
    println!(" Noise prob: {:.0}%", args.noise * 100.0);
    println!(" Step budget/ep: {}", args.step_budget);
    println!(" Max retries: {}", args.max_retries);
    println!(" Retention: {:.0}%", args.retention * 100.0);
    println!();

    // Map CLI flags onto the benchmark config; unexposed knobs keep their
    // defaults via struct update syntax.
    let config = BenchmarkConfig {
        episodes: args.episodes,
        tasks_per_episode: args.tasks,
        min_difficulty: args.min_diff,
        max_difficulty: args.max_diff,
        seed: Some(args.seed),
        token_budget: args.token_budget,
        tool_call_budget: args.tool_budget,
        verbose: args.verbose,
        noise_probability: args.noise,
        step_budget_per_episode: args.step_budget,
        max_retries: args.max_retries,
        retention_fraction: args.retention,
        ..Default::default()
    };

    // NOTE(review): run_comparison appears to run BOTH phases, but only
    // "Phase 1/2" is announced here — confirm whether run_comparison prints
    // the "Phase 2/2" banner itself.
    println!(" Phase 1/2: Running baseline (no learning)...");
    let report = run_comparison(&config)?;

    // Print comparison report
    report.print();

    // Full IQ assessment
    let calculator = IntelligenceCalculator::default();

    println!("----------------------------------------------------------------");
    println!(" Detailed Intelligence Assessment: Baseline");
    println!("----------------------------------------------------------------");
    let base_assessment = calculator.calculate(&report.baseline.raw_metrics);
    print_compact_assessment(&base_assessment);

    println!();
    println!("----------------------------------------------------------------");
    println!(" Detailed Intelligence Assessment: RVF-Learning");
    println!("----------------------------------------------------------------");
    let rvf_assessment = calculator.calculate(&report.rvf_learning.raw_metrics);
    print_compact_assessment(&rvf_assessment);

    // Final IQ comparison
    println!();
    println!("================================================================");
    println!(" Intelligence Score Comparison");
    println!("================================================================");
    println!(
        " Baseline IQ Score: {:.1}/100",
        base_assessment.overall_score
    );
    println!(
        " RVF-Learning IQ Score: {:.1}/100",
        rvf_assessment.overall_score
    );
    let iq_delta = rvf_assessment.overall_score - base_assessment.overall_score;
    println!(" Delta: {:+.1}", iq_delta);
    println!();

    // Qualitative verdict bands: >10 dramatic, >5 significant, >1 measurable,
    // >0 marginal, else comparable.
    if iq_delta > 10.0 {
        println!(" >> RVF learning loop provides a DRAMATIC intelligence boost.");
    } else if iq_delta > 5.0 {
        println!(" >> RVF learning loop provides a SIGNIFICANT intelligence boost.");
    } else if iq_delta > 1.0 {
        println!(" >> RVF learning loop provides a MEASURABLE intelligence improvement.");
    } else if iq_delta > 0.0 {
        println!(" >> RVF learning loop provides a MARGINAL intelligence gain.");
    } else {
        println!(" >> Performance is comparable. Increase noise or reduce step budget.");
    }
    println!();

    Ok(())
}
|
||||
|
||||
fn print_compact_assessment(a: &ruvector_benchmarks::intelligence_metrics::IntelligenceAssessment) {
|
||||
println!(" Overall Score: {:.1}/100", a.overall_score);
|
||||
println!(
|
||||
" Reasoning: coherence={:.2}, efficiency={:.2}, error_rate={:.2}",
|
||||
a.reasoning.logical_coherence, a.reasoning.reasoning_efficiency, a.reasoning.error_rate,
|
||||
);
|
||||
println!(
|
||||
" Learning: sample_eff={:.2}, regret_sub={:.2}, rate={:.2}, gen={:.2}",
|
||||
a.learning.sample_efficiency,
|
||||
a.learning.regret_sublinearity,
|
||||
a.learning.learning_rate,
|
||||
a.learning.generalization,
|
||||
);
|
||||
println!(
|
||||
" Capabilities: pattern={:.1}, planning={:.1}, adaptation={:.1}",
|
||||
a.capabilities.pattern_recognition, a.capabilities.planning, a.capabilities.adaptation,
|
||||
);
|
||||
println!(
|
||||
" Meta-cog: self_correct={:.2}, strategy_adapt={:.2}",
|
||||
a.meta_cognition.self_correction_rate, a.meta_cognition.strategy_adaptation,
|
||||
);
|
||||
}
|
||||
135
vendor/ruvector/examples/benchmarks/src/bin/superintelligence.rs
vendored
Normal file
135
vendor/ruvector/examples/benchmarks/src/bin/superintelligence.rs
vendored
Normal file
@@ -0,0 +1,135 @@
|
||||
//! Superintelligence Pathway Runner
|
||||
//!
|
||||
//! Runs a 5-level recursive intelligence amplification pipeline and tracks
|
||||
//! IQ progression from foundation (~85) toward superintelligence (~98+).
|
||||
//!
|
||||
//! Usage:
|
||||
//! cargo run --bin superintelligence -- --verbose
|
||||
//! cargo run --bin superintelligence -- --episodes 15 --tasks 30 --target 95
|
||||
|
||||
use anyhow::Result;
|
||||
use clap::Parser;
|
||||
use ruvector_benchmarks::intelligence_metrics::IntelligenceCalculator;
|
||||
use ruvector_benchmarks::superintelligence::{run_pathway, SIConfig};
|
||||
|
||||
// CLI for the superintelligence pathway runner. Field doc comments double as
// clap help text, so they are kept verbatim; reviewer notes are plain `//`.
#[derive(Parser, Debug)]
#[command(name = "superintelligence")]
#[command(about = "Run 5-level superintelligence pathway with IQ tracking")]
struct Args {
    /// Episodes per level
    #[arg(short, long, default_value = "12")]
    episodes: usize,

    /// Tasks per episode
    #[arg(short, long, default_value = "25")]
    tasks: usize,

    /// Random seed
    #[arg(long, default_value = "42")]
    seed: u64,

    /// Noise injection rate (0.0-1.0)
    #[arg(long, default_value = "0.25")]
    noise: f64,

    /// Step budget per episode
    #[arg(long, default_value = "400")]
    step_budget: usize,

    /// Target IQ score
    #[arg(long, default_value = "98.0")]
    target: f64,

    /// Ensemble size for Level 3
    #[arg(long, default_value = "4")]
    ensemble: usize,

    /// Recursive improvement cycles for Level 4
    #[arg(long, default_value = "3")]
    cycles: usize,

    /// Adversarial pressure multiplier for Level 5
    #[arg(long, default_value = "1.5")]
    pressure: f64,

    /// Verbose per-episode output
    #[arg(short, long)]
    verbose: bool,
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let args = Args::parse();
|
||||
|
||||
println!();
|
||||
println!("╔══════════════════════════════════════════════════════════════╗");
|
||||
println!("║ SUPERINTELLIGENCE PATHWAY ENGINE ║");
|
||||
println!("║ 5-Level Recursive Intelligence Amplification ║");
|
||||
println!("╚══════════════════════════════════════════════════════════════╝");
|
||||
println!();
|
||||
println!(
|
||||
" Config: {} eps/level x {} tasks, noise={:.0}%, target IQ={:.0}",
|
||||
args.episodes,
|
||||
args.tasks,
|
||||
args.noise * 100.0,
|
||||
args.target
|
||||
);
|
||||
println!(
|
||||
" Ensemble={}, Cycles={}, Pressure={:.1}",
|
||||
args.ensemble, args.cycles, args.pressure
|
||||
);
|
||||
println!();
|
||||
|
||||
let config = SIConfig {
|
||||
episodes_per_level: args.episodes,
|
||||
tasks_per_episode: args.tasks,
|
||||
seed: args.seed,
|
||||
noise_rate: args.noise,
|
||||
step_budget: args.step_budget,
|
||||
target_iq: args.target,
|
||||
ensemble_size: args.ensemble,
|
||||
recursive_cycles: args.cycles,
|
||||
adversarial_pressure: args.pressure,
|
||||
verbose: args.verbose,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = run_pathway(&config)?;
|
||||
result.print();
|
||||
|
||||
// Detailed assessment for peak level
|
||||
let calculator = IntelligenceCalculator::default();
|
||||
if let Some(peak) = result
|
||||
.levels
|
||||
.iter()
|
||||
.max_by(|a, b| a.iq_score.partial_cmp(&b.iq_score).unwrap())
|
||||
{
|
||||
println!(" Peak Level ({}) Assessment:", peak.name);
|
||||
let assessment = calculator.calculate(&peak.raw_metrics);
|
||||
println!(
|
||||
" Reasoning: coherence={:.2}, efficiency={:.2}, error_rate={:.2}",
|
||||
assessment.reasoning.logical_coherence,
|
||||
assessment.reasoning.reasoning_efficiency,
|
||||
assessment.reasoning.error_rate
|
||||
);
|
||||
println!(
|
||||
" Learning: sample_eff={:.2}, regret_sub={:.2}, rate={:.2}",
|
||||
assessment.learning.sample_efficiency,
|
||||
assessment.learning.regret_sublinearity,
|
||||
assessment.learning.learning_rate
|
||||
);
|
||||
println!(
|
||||
" Capabilities: pattern={:.1}, planning={:.1}, adaptation={:.1}",
|
||||
assessment.capabilities.pattern_recognition,
|
||||
assessment.capabilities.planning,
|
||||
assessment.capabilities.adaptation
|
||||
);
|
||||
println!(
|
||||
" Meta-cog: self_correct={:.2}, strategy_adapt={:.2}",
|
||||
assessment.meta_cognition.self_correction_rate,
|
||||
assessment.meta_cognition.strategy_adaptation
|
||||
);
|
||||
println!();
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
247
vendor/ruvector/examples/benchmarks/src/bin/swarm_regret.rs
vendored
Normal file
247
vendor/ruvector/examples/benchmarks/src/bin/swarm_regret.rs
vendored
Normal file
@@ -0,0 +1,247 @@
|
||||
//! Swarm Regret Tracking Runner
|
||||
//!
|
||||
//! Track sublinear regret across episodes for swarm controller evaluation.
|
||||
//!
|
||||
//! Usage:
|
||||
//! cargo run --bin swarm-regret -- --episodes 20 --tasks-per-episode 20
|
||||
|
||||
use anyhow::Result;
|
||||
use clap::Parser;
|
||||
use ruvector_benchmarks::{
|
||||
logging::BenchmarkLogger,
|
||||
swarm_regret::SwarmController,
|
||||
temporal::TemporalSolver,
|
||||
timepuzzles::{PuzzleGenerator, PuzzleGeneratorConfig},
|
||||
};
|
||||
use std::time::Instant;
|
||||
|
||||
// CLI for the swarm-regret tracker. Field doc comments double as clap help
// text, so they are kept verbatim; reviewer notes are plain `//`.
#[derive(Parser, Debug)]
#[command(name = "swarm-regret")]
#[command(about = "Track sublinear regret for swarm controller")]
struct Args {
    /// Number of episodes to run
    #[arg(short, long, default_value = "20")]
    episodes: usize,

    /// Tasks per episode
    #[arg(short, long, default_value = "20")]
    tasks_per_episode: usize,

    // NOTE(review): with clap 4, a `bool` flag whose default_value is "true"
    // generally cannot be switched off from the command line — consider
    // `ArgAction` or `Option<bool>`.
    /// Enable calendar tool
    #[arg(long, default_value = "true")]
    calendar: bool,

    /// Enable web search tool
    #[arg(long, default_value = "false")]
    web_search: bool,

    /// Maximum steps per task
    #[arg(long, default_value = "100")]
    max_steps: usize,

    // None => the puzzle generator picks its own (non-reproducible) seed.
    /// Random seed
    #[arg(long)]
    seed: Option<u64>,

    /// Output log file
    #[arg(short, long, default_value = "logs/swarm_regret.jsonl")]
    output: String,

    /// Verbose output
    #[arg(short, long)]
    verbose: bool,
}
|
||||
|
||||
/// Entry point for the swarm-regret benchmark binary.
///
/// Runs `args.episodes` episodes; each episode generates a fresh batch of
/// puzzles, solves them with a `TemporalSolver`, records the episode outcome
/// in the `SwarmController`'s regret tracker, and prints a per-episode table
/// row. After all episodes it prints a regret/performance summary, an ASCII
/// regret curve, a goal-status verdict, and writes a JSONL log plus a JSON
/// summary file next to it.
fn main() -> Result<()> {
    let args = Args::parse();

    println!("╔══════════════════════════════════════════════════════════════╗");
    println!("║ Swarm Controller Regret Tracking ║");
    println!("║ Sublinear Regret for Multi-Agent Control ║");
    println!("╚══════════════════════════════════════════════════════════════╝");
    println!();

    // Initialize logging sink (JSONL file given by --output).
    let mut logger = BenchmarkLogger::new(&args.output)?;
    logger.log_system("INFO", "Starting regret tracking", "swarm-regret")?;

    let mut controller = SwarmController::new(args.tasks_per_episode);
    let mut solver = TemporalSolver::with_tools(args.calendar, args.web_search);
    solver.max_steps = args.max_steps;

    // Fixed puzzle settings; only the seed comes from the CLI.
    let puzzle_config = PuzzleGeneratorConfig {
        min_difficulty: 1,
        max_difficulty: 10,
        constraint_density: 3,
        seed: args.seed,
        ..Default::default()
    };

    println!("🔧 Configuration:");
    println!(" Episodes: {}", args.episodes);
    println!(" Tasks/episode: {}", args.tasks_per_episode);
    println!(" Calendar tool: {}", args.calendar);
    println!(" Web search: {}", args.web_search);
    println!(" Max steps/task: {}", args.max_steps);
    println!();

    println!("🏃 Running episodes...");
    println!();
    println!("┌────────┬────────┬─────────┬─────────┬──────────┬───────────┐");
    println!("│Episode │ Acc(%) │ Regret │ Cum.Reg │ Avg.Reg │ Sublinear │");
    println!("├────────┼────────┼─────────┼─────────┼──────────┼───────────┤");

    let total_start = Instant::now();

    for ep in 0..args.episodes {
        controller.start_episode();

        // Generate puzzles for this episode.
        // NOTE(review): a new generator is built each episode from the same
        // config (same seed) — presumably intentional so every episode sees an
        // identically distributed batch; confirm against PuzzleGenerator docs.
        let mut generator = PuzzleGenerator::new(puzzle_config.clone());
        let puzzles = generator.generate_batch(args.tasks_per_episode)?;

        // Per-episode accumulators.
        let mut solved = 0;
        let mut correct = 0;
        let mut total_steps = 0;
        let mut total_tool_calls = 0;
        let mut total_latency = 0u64;

        // Solve puzzles
        for puzzle in &puzzles {
            let result = solver.solve(puzzle)?;
            if result.solved {
                solved += 1;
            }
            if result.correct {
                correct += 1;
            }
            total_steps += result.steps;
            total_tool_calls += result.tool_calls;
            total_latency += result.latency_ms;
        }

        // Record episode outcome in the regret tracker.
        controller.complete_episode(
            solved,
            correct,
            total_steps,
            total_tool_calls,
            total_latency,
        );

        // Get status. `last().unwrap()` is safe here: complete_episode above
        // has just pushed this episode onto `episodes`.
        let summary = controller.regret.summary();
        let last_episode = controller.regret.episodes.last().unwrap();

        // Log episode
        logger.log_swarm(
            ep + 1,
            args.tasks_per_episode,
            solved,
            correct,
            last_episode.reward,
            last_episode.oracle_reward,
            summary.total_regret,
            summary.average_regret,
            summary.is_sublinear,
        )?;

        // Print one table row for this episode.
        let sublinear = if summary.is_sublinear { "✓" } else { "✗" };
        println!(
            "│ {:6} │ {:5.1} │ {:7.2} │ {:7.2} │ {:8.4} │ {} │",
            ep + 1,
            last_episode.accuracy() * 100.0,
            last_episode.regret(),
            summary.total_regret,
            summary.average_regret,
            sublinear
        );
    }

    println!("└────────┴────────┴─────────┴─────────┴──────────┴───────────┘");
    println!();

    let total_time = total_start.elapsed();

    // Final summary
    let summary = controller.regret.summary();

    println!("╔══════════════════════════════════════════════════════════════╗");
    println!("║ Final Summary ║");
    println!("╚══════════════════════════════════════════════════════════════╝");
    println!();
    println!("📊 Regret Analysis:");
    println!(" Total episodes: {}", summary.total_episodes);
    println!(" Cumulative regret: {:.2}", summary.total_regret);
    println!(" Average regret: {:.4}", summary.average_regret);
    println!(
        " Regret trend: {:.6} ({})",
        summary.regret_trend,
        if summary.regret_trend < 0.0 {
            "decreasing ✓"
        } else {
            "increasing ✗"
        }
    );
    println!(
        " Sublinear: {}",
        if summary.is_sublinear {
            "Yes ✓"
        } else {
            "No ✗"
        }
    );
    println!();
    println!("📈 Performance:");
    println!(
        " Average accuracy: {:.1}%",
        summary.average_accuracy * 100.0
    );
    println!(" Average reward: {:.2}", summary.average_reward);
    println!(
        " Moving avg reward: {:.2}",
        summary.moving_average_reward
    );
    println!(" Total time: {:.2}s", total_time.as_secs_f64());
    println!();

    // Regret curve analysis: print roughly 10 evenly spaced samples of the
    // per-episode average regret (R_k/k) as a bar chart. `max(10)` keeps the
    // sampling step >= 1, so the modulo below never divides by zero.
    if controller.regret.average_regret.len() >= 5 {
        println!("📉 Regret Curve (R_k/k):");
        let regrets = &controller.regret.average_regret;
        let step = regrets.len().max(10) / 10;
        for (i, r) in regrets.iter().enumerate() {
            if i % step == 0 || i == regrets.len() - 1 {
                // Bar width: 50 chars == average regret of 1.0, clamped.
                let bar_len = (r * 50.0).min(50.0) as usize;
                let bar = "█".repeat(bar_len);
                println!(" Episode {:3}: {:.4} {}", i + 1, r, bar);
            }
        }
        println!();
    }

    // Goal check: sublinearity plus a decreasing trend is the success target.
    println!("🎯 Goal Status:");
    if summary.is_sublinear && summary.regret_trend < 0.0 {
        println!(" ✓ Achieving sublinear regret - average regret trending to zero");
    } else if summary.is_sublinear {
        println!(" ~ Sublinear but trend not clearly decreasing");
    } else {
        println!(" ✗ Not yet achieving sublinear regret");
        println!(" Recommendation: Increase episodes or tune solver parameters");
    }

    // Flush logs
    logger.flush()?;
    println!();
    println!("📝 Results saved to: {}", args.output);

    // Save summary as pretty JSON beside the JSONL log.
    let summary_path = args.output.replace(".jsonl", "_summary.json");
    let summary_json = serde_json::to_string_pretty(&summary)?;
    std::fs::write(&summary_path, summary_json)?;
    println!("📝 Summary saved to: {}", summary_path);

    Ok(())
}
|
||||
262
vendor/ruvector/examples/benchmarks/src/bin/temporal_benchmark.rs
vendored
Normal file
262
vendor/ruvector/examples/benchmarks/src/bin/temporal_benchmark.rs
vendored
Normal file
@@ -0,0 +1,262 @@
|
||||
//! Temporal Benchmark Runner
|
||||
//!
|
||||
//! Run temporal reasoning benchmarks based on TimePuzzles methodology.
|
||||
//!
|
||||
//! Usage:
|
||||
//! cargo run --bin temporal-benchmark -- --puzzles 50 --calendar --web-search
|
||||
|
||||
use anyhow::Result;
|
||||
use clap::Parser;
|
||||
use ruvector_benchmarks::{
|
||||
logging::BenchmarkLogger,
|
||||
temporal::{BenchmarkConfig, BenchmarkResults, TemporalSolver},
|
||||
timepuzzles::{PuzzleGenerator, PuzzleGeneratorConfig, SamplePuzzles},
|
||||
};
|
||||
use std::time::Instant;
|
||||
|
||||
// CLI options for the temporal-benchmark binary.
// (Field `///` comments double as clap help text, so they are left unchanged.)
#[derive(Parser, Debug)]
#[command(name = "temporal-benchmark")]
#[command(about = "Run temporal reasoning benchmarks")]
struct Args {
    /// Number of puzzles to run
    #[arg(short = 'n', long, default_value = "50")]
    puzzles: usize,

    /// Minimum difficulty (1-10)
    #[arg(long, default_value = "1")]
    min_difficulty: u8,

    /// Maximum difficulty (1-10)
    #[arg(long, default_value = "10")]
    max_difficulty: u8,

    /// Enable calendar math tool
    // NOTE(review): with clap derive, a bare `bool` field is a SetTrue flag;
    // combined with default_value = "true" there is no way to turn it off from
    // the CLI (`--calendar` is a no-op). Confirm whether an
    // `ArgAction::Set`-style `--calendar <true|false>` was intended.
    #[arg(long, default_value = "true")]
    calendar: bool,

    /// Enable web search tool
    #[arg(long, default_value = "false")]
    web_search: bool,

    /// Maximum steps per puzzle
    #[arg(long, default_value = "100")]
    max_steps: usize,

    /// Constraint density (1-5)
    #[arg(long, default_value = "3")]
    constraint_density: u8,

    /// Random seed for reproducibility
    #[arg(long)]
    seed: Option<u64>,

    /// Output log file
    #[arg(short, long, default_value = "logs/temporal_benchmark.jsonl")]
    output: String,

    /// Use sample puzzles instead of generating
    #[arg(long)]
    use_samples: bool,

    /// Verbose output
    #[arg(short, long)]
    verbose: bool,
}
|
||||
|
||||
/// Entry point for the temporal-benchmark binary.
///
/// Loads (or generates) a batch of temporal puzzles, solves each with a
/// `TemporalSolver`, logs every result as JSONL, then prints aggregate
/// accuracy/performance tables plus a per-difficulty breakdown and writes a
/// JSON summary file beside the log.
fn main() -> Result<()> {
    let args = Args::parse();

    println!("╔══════════════════════════════════════════════════════════════╗");
    println!("║ Temporal Reasoning Benchmark Runner ║");
    println!("║ Based on TimePuzzles (arXiv:2601.07148) ║");
    println!("╚══════════════════════════════════════════════════════════════╝");
    println!();

    // Initialize logger
    let mut logger = BenchmarkLogger::new(&args.output)?;
    logger.log_system("INFO", "Starting benchmark run", "temporal-benchmark")?;

    // Generate or load puzzles: --use-samples takes the fixed 50-puzzle sample
    // set; otherwise a seeded generator builds --puzzles fresh ones.
    let puzzles = if args.use_samples {
        println!("📚 Using sample puzzle set (50 puzzles)...");
        SamplePuzzles::mixed_sample()
    } else {
        println!(
            "🎲 Generating {} puzzles (difficulty {}-{})...",
            args.puzzles, args.min_difficulty, args.max_difficulty
        );

        let config = PuzzleGeneratorConfig {
            min_difficulty: args.min_difficulty,
            max_difficulty: args.max_difficulty,
            constraint_density: args.constraint_density,
            cross_cultural: true,
            relative_constraints: true,
            year_range: (2000, 2030),
            seed: args.seed,
        };

        let mut generator = PuzzleGenerator::new(config);
        generator.generate_batch(args.puzzles)?
    };

    println!("✓ Loaded {} puzzles", puzzles.len());
    println!();

    // Configure solver
    let mut solver = TemporalSolver::with_tools(args.calendar, args.web_search);
    solver.max_steps = args.max_steps;

    println!("🔧 Solver configuration:");
    println!(" Calendar tool: {}", args.calendar);
    println!(" Web search: {}", args.web_search);
    println!(" Max steps: {}", args.max_steps);
    println!();

    // Run benchmarks
    println!("🏃 Running benchmarks...");
    println!();

    // Unique-ish run id: UTC timestamp plus the seed (0 when unseeded).
    let benchmark_id = format!(
        "bench-{}-{}",
        chrono::Utc::now().format("%Y%m%d-%H%M%S"),
        args.seed.unwrap_or(0)
    );

    let mut results = Vec::new();
    let start = Instant::now();

    for (i, puzzle) in puzzles.iter().enumerate() {
        let result = solver.solve(puzzle)?;

        // Log result
        logger.log_temporal(
            &benchmark_id,
            &puzzle.id,
            puzzle.difficulty,
            result.solved,
            result.correct,
            result.steps,
            result.tool_calls,
            result.latency_ms,
            puzzle.constraints.len(),
            args.calendar,
            args.web_search,
        )?;

        if args.verbose {
            // ✓ = correct, ~ = solved but wrong answer, ✗ = unsolved.
            let status = if result.correct {
                "✓"
            } else if result.solved {
                "~"
            } else {
                "✗"
            };
            println!(
                " {} Puzzle {:3}: {} (steps: {}, latency: {}ms)",
                status,
                i + 1,
                puzzle.id,
                result.steps,
                result.latency_ms
            );
        } else if (i + 1) % 10 == 0 {
            // Non-verbose progress: one dot per 10 puzzles, flushed so it
            // appears immediately.
            print!(".");
            use std::io::Write;
            std::io::stdout().flush()?;
        }

        results.push(result);
    }

    let total_time = start.elapsed();

    // Terminate the progress-dot line before the blank separator.
    if !args.verbose {
        println!();
    }
    println!();

    // Compute aggregate results
    let config = BenchmarkConfig {
        num_puzzles: puzzles.len(),
        difficulty_range: (args.min_difficulty, args.max_difficulty),
        calendar_tool: args.calendar,
        web_search_tool: args.web_search,
        max_steps: args.max_steps,
        constraint_density: args.constraint_density,
    };

    let benchmark_results = BenchmarkResults::from_results(config, results);

    // Print results
    println!("╔══════════════════════════════════════════════════════════════╗");
    println!("║ Benchmark Results ║");
    println!("╚══════════════════════════════════════════════════════════════╝");
    println!();
    println!("📊 Summary:");
    println!(" Total puzzles: {}", benchmark_results.total_puzzles);
    println!(" Solved: {}", benchmark_results.solved_count);
    println!(" Correct: {}", benchmark_results.correct_count);
    println!(
        " Accuracy: {:.1}%",
        benchmark_results.accuracy * 100.0
    );
    println!();
    println!("⏱️ Performance:");
    println!(" Avg steps: {:.1}", benchmark_results.avg_steps);
    println!(" Avg tool calls: {:.1}", benchmark_results.avg_tool_calls);
    println!(
        " Avg latency: {:.1}ms",
        benchmark_results.avg_latency_ms
    );
    println!(" Total time: {:.2}s", total_time.as_secs_f64());
    println!();

    // Compute accuracy by difficulty: difficulty -> (attempted, correct).
    let mut by_difficulty: std::collections::HashMap<u8, (usize, usize)> =
        std::collections::HashMap::new();
    for (puzzle, result) in puzzles.iter().zip(benchmark_results.results.iter()) {
        let entry = by_difficulty.entry(puzzle.difficulty).or_insert((0, 0));
        entry.0 += 1;
        if result.correct {
            entry.1 += 1;
        }
    }

    println!("📈 Accuracy by Difficulty:");
    let mut difficulties: Vec<_> = by_difficulty.keys().copied().collect();
    difficulties.sort();
    for d in difficulties {
        let (total, correct) = by_difficulty[&d];
        let acc = correct as f64 / total as f64 * 100.0;
        println!(" Difficulty {}: {:5.1}% ({}/{})", d, acc, correct, total);
    }
    println!();

    // Tool usage analysis: count puzzles that used at least one tool call and
    // ended correct (a proxy for calendar-rewriting success).
    if args.calendar {
        let with_rewriting = benchmark_results
            .results
            .iter()
            .filter(|r| r.tool_calls > 0 && r.correct)
            .count();
        println!("🔧 Tool Analysis:");
        println!(
            " Calendar rewriting success: {}/{}",
            with_rewriting, benchmark_results.total_puzzles
        );
    }

    // Flush logs
    logger.flush()?;
    println!();
    println!("📝 Results saved to: {}", args.output);

    // Save full results as JSON
    let results_path = args.output.replace(".jsonl", "_summary.json");
    let results_json = serde_json::to_string_pretty(&benchmark_results)?;
    std::fs::write(&results_path, results_json)?;
    println!("📝 Summary saved to: {}", results_path);

    Ok(())
}
|
||||
308
vendor/ruvector/examples/benchmarks/src/bin/timepuzzle_runner.rs
vendored
Normal file
308
vendor/ruvector/examples/benchmarks/src/bin/timepuzzle_runner.rs
vendored
Normal file
@@ -0,0 +1,308 @@
|
||||
//! TimePuzzle Quick Runner
|
||||
//!
|
||||
//! 10-minute probe for temporal reasoning with tool augmentation.
|
||||
//!
|
||||
//! Usage:
|
||||
//! cargo run --bin timepuzzle-runner -- --quick
|
||||
//! cargo run --bin timepuzzle-runner -- --depth 5
|
||||
|
||||
use anyhow::Result;
|
||||
use clap::Parser;
|
||||
use ruvector_benchmarks::{
|
||||
logging::BenchmarkLogger, temporal::TemporalSolver, timepuzzles::SamplePuzzles,
|
||||
};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
// CLI options for the timepuzzle-runner probe binary.
// (Field `///` comments double as clap help text, so they are left unchanged.)
#[derive(Parser, Debug)]
#[command(name = "timepuzzle-runner")]
#[command(about = "Quick TimePuzzle probe for agent testing")]
struct Args {
    /// Quick mode: 50 puzzles, depth-limited steps
    #[arg(long)]
    quick: bool,

    /// Maximum depth (steps) per puzzle
    #[arg(short, long, default_value = "50")]
    depth: usize,

    /// Number of puzzles
    // Ignored when --quick is set (quick mode pins the count to 50).
    #[arg(short = 'n', long, default_value = "50")]
    puzzles: usize,

    /// Tool latency cap (abort if tool > 1.5x median)
    #[arg(long, default_value = "1.5")]
    latency_cap: f64,

    /// Timeout in seconds
    #[arg(long, default_value = "600")]
    timeout: u64,

    /// Enable constraint rewriting (calendar math)
    // NOTE(review): bool flag + default_value = "true" cannot be disabled on
    // the CLI with clap derive's default SetTrue action — confirm intent.
    #[arg(long, default_value = "true")]
    rewrite: bool,

    /// Enable web search (for factual anchors)
    #[arg(long, default_value = "false")]
    web_search: bool,

    /// Output file
    #[arg(short, long, default_value = "logs/timepuzzle_probe.jsonl")]
    output: String,

    /// Verbose mode
    #[arg(short, long)]
    verbose: bool,
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let args = Args::parse();
|
||||
|
||||
println!("╔══════════════════════════════════════════════════════════════╗");
|
||||
println!("║ TimePuzzle Quick Probe Runner ║");
|
||||
println!("║ Tool-Augmented Iterative Temporal Reasoning ║");
|
||||
println!("╚══════════════════════════════════════════════════════════════╝");
|
||||
println!();
|
||||
|
||||
let mut logger = BenchmarkLogger::new(&args.output)?;
|
||||
logger.log_system("INFO", "Starting TimePuzzle probe", "timepuzzle-runner")?;
|
||||
|
||||
// Quick mode settings
|
||||
let (num_puzzles, max_depth) = if args.quick {
|
||||
println!("⚡ Quick mode enabled (50 puzzles, depth {})", args.depth);
|
||||
(50, args.depth)
|
||||
} else {
|
||||
(args.puzzles, args.depth)
|
||||
};
|
||||
|
||||
let timeout = Duration::from_secs(args.timeout);
|
||||
|
||||
println!();
|
||||
println!("🔧 Configuration:");
|
||||
println!(" Puzzles: {}", num_puzzles);
|
||||
println!(" Max depth: {}", max_depth);
|
||||
println!(" Rewriting: {}", args.rewrite);
|
||||
println!(" Web search: {}", args.web_search);
|
||||
println!(" Latency cap: {}x median", args.latency_cap);
|
||||
println!(" Timeout: {}s", args.timeout);
|
||||
println!();
|
||||
|
||||
// Generate puzzles with varying constraint density
|
||||
println!("🎲 Generating puzzles...");
|
||||
let puzzles = SamplePuzzles::mixed_sample()
|
||||
.into_iter()
|
||||
.take(num_puzzles)
|
||||
.collect::<Vec<_>>();
|
||||
println!("✓ Loaded {} puzzles", puzzles.len());
|
||||
println!();
|
||||
|
||||
// Configure solver
|
||||
let mut solver = TemporalSolver::with_tools(args.rewrite, args.web_search);
|
||||
solver.max_steps = max_depth;
|
||||
|
||||
// Run probe
|
||||
println!("🏃 Running probe...");
|
||||
println!();
|
||||
|
||||
let probe_start = Instant::now();
|
||||
let mut results = Vec::new();
|
||||
let mut latencies: Vec<u64> = Vec::new();
|
||||
let mut median_latency: f64 = 100.0; // Initial estimate
|
||||
|
||||
for (i, puzzle) in puzzles.iter().enumerate() {
|
||||
// Check timeout
|
||||
if probe_start.elapsed() > timeout {
|
||||
println!("⚠️ Timeout reached after {} puzzles", i);
|
||||
break;
|
||||
}
|
||||
|
||||
let result = solver.solve(puzzle)?;
|
||||
|
||||
// Check latency cap
|
||||
if latencies.len() >= 10 {
|
||||
let mut sorted = latencies.clone();
|
||||
sorted.sort();
|
||||
median_latency = sorted[sorted.len() / 2] as f64;
|
||||
|
||||
if result.latency_ms as f64 > median_latency * args.latency_cap {
|
||||
if args.verbose {
|
||||
println!(
|
||||
" ⚠ Puzzle {} aborted: latency {}ms > {:.0}ms cap",
|
||||
puzzle.id,
|
||||
result.latency_ms,
|
||||
median_latency * args.latency_cap
|
||||
);
|
||||
}
|
||||
// Still record but mark as slow
|
||||
}
|
||||
}
|
||||
|
||||
latencies.push(result.latency_ms);
|
||||
|
||||
// Log
|
||||
logger.log_temporal(
|
||||
"timepuzzle-probe",
|
||||
&puzzle.id,
|
||||
puzzle.difficulty,
|
||||
result.solved,
|
||||
result.correct,
|
||||
result.steps,
|
||||
result.tool_calls,
|
||||
result.latency_ms,
|
||||
puzzle.constraints.len(),
|
||||
args.rewrite,
|
||||
args.web_search,
|
||||
)?;
|
||||
|
||||
if args.verbose {
|
||||
let status = if result.correct {
|
||||
"✓"
|
||||
} else if result.solved {
|
||||
"~"
|
||||
} else {
|
||||
"✗"
|
||||
};
|
||||
println!(
|
||||
" {} [{:2}] {}: steps={}, tools={}, {}ms",
|
||||
status,
|
||||
puzzle.difficulty,
|
||||
puzzle.id,
|
||||
result.steps,
|
||||
result.tool_calls,
|
||||
result.latency_ms
|
||||
);
|
||||
}
|
||||
|
||||
results.push(result);
|
||||
}
|
||||
|
||||
let total_time = probe_start.elapsed();
|
||||
println!();
|
||||
|
||||
// Analyze results
|
||||
let solved = results.iter().filter(|r| r.solved).count();
|
||||
let correct = results.iter().filter(|r| r.correct).count();
|
||||
let total = results.len();
|
||||
let accuracy = correct as f64 / total as f64;
|
||||
|
||||
let avg_steps = results.iter().map(|r| r.steps).sum::<usize>() as f64 / total as f64;
|
||||
let avg_tools = results.iter().map(|r| r.tool_calls).sum::<usize>() as f64 / total as f64;
|
||||
let avg_latency = results.iter().map(|r| r.latency_ms).sum::<u64>() as f64 / total as f64;
|
||||
|
||||
// Tool toggle analysis
|
||||
let with_tool_correct = results
|
||||
.iter()
|
||||
.filter(|r| r.tool_calls > 0 && r.correct)
|
||||
.count();
|
||||
|
||||
println!("╔══════════════════════════════════════════════════════════════╗");
|
||||
println!("║ Probe Results ║");
|
||||
println!("╚══════════════════════════════════════════════════════════════╝");
|
||||
println!();
|
||||
println!("📊 Overall Performance:");
|
||||
println!(" Puzzles run: {}", total);
|
||||
println!(
|
||||
" Solved: {} ({:.1}%)",
|
||||
solved,
|
||||
solved as f64 / total as f64 * 100.0
|
||||
);
|
||||
println!(
|
||||
" Correct: {} ({:.1}%)",
|
||||
correct,
|
||||
accuracy * 100.0
|
||||
);
|
||||
println!();
|
||||
println!("⏱️ Efficiency:");
|
||||
println!(" Avg steps: {:.1}", avg_steps);
|
||||
println!(" Avg tool calls: {:.1}", avg_tools);
|
||||
println!(" Avg latency: {:.1}ms", avg_latency);
|
||||
println!(" Median latency: {:.0}ms", median_latency);
|
||||
println!(" Total time: {:.2}s", total_time.as_secs_f64());
|
||||
println!();
|
||||
|
||||
// Scaling curves
|
||||
println!("📈 Tool Toggle Analysis:");
|
||||
println!(
|
||||
" With rewriting: {}/{} ({:.1}%)",
|
||||
with_tool_correct,
|
||||
total,
|
||||
with_tool_correct as f64 / total as f64 * 100.0
|
||||
);
|
||||
|
||||
// Sensitivity analysis
|
||||
let fast_correct = results
|
||||
.iter()
|
||||
.filter(|r| r.latency_ms < median_latency as u64 && r.correct)
|
||||
.count();
|
||||
let slow_correct = results
|
||||
.iter()
|
||||
.filter(|r| r.latency_ms >= median_latency as u64 && r.correct)
|
||||
.count();
|
||||
let fast_total = results
|
||||
.iter()
|
||||
.filter(|r| r.latency_ms < median_latency as u64)
|
||||
.count();
|
||||
let slow_total = total - fast_total;
|
||||
|
||||
if fast_total > 0 && slow_total > 0 {
|
||||
println!();
|
||||
println!("⚡ Latency Sensitivity:");
|
||||
println!(
|
||||
" Fast (<{:.0}ms): {}/{} ({:.1}%)",
|
||||
median_latency,
|
||||
fast_correct,
|
||||
fast_total,
|
||||
fast_correct as f64 / fast_total as f64 * 100.0
|
||||
);
|
||||
println!(
|
||||
" Slow (>={:.0}ms): {}/{} ({:.1}%)",
|
||||
median_latency,
|
||||
slow_correct,
|
||||
slow_total,
|
||||
slow_correct as f64 / slow_total as f64 * 100.0
|
||||
);
|
||||
}
|
||||
|
||||
// Accuracy by difficulty
|
||||
println!();
|
||||
println!("🎯 Accuracy by Difficulty:");
|
||||
let mut by_diff: std::collections::HashMap<u8, (usize, usize)> =
|
||||
std::collections::HashMap::new();
|
||||
for (p, r) in puzzles.iter().zip(results.iter()) {
|
||||
let e = by_diff.entry(p.difficulty).or_insert((0, 0));
|
||||
e.0 += 1;
|
||||
if r.correct {
|
||||
e.1 += 1;
|
||||
}
|
||||
}
|
||||
let mut diffs: Vec<_> = by_diff.keys().copied().collect();
|
||||
diffs.sort();
|
||||
for d in diffs {
|
||||
let (t, c) = by_diff[&d];
|
||||
let pct = c as f64 / t as f64 * 100.0;
|
||||
let bar = "█".repeat((pct / 5.0) as usize);
|
||||
println!(" Level {:2}: {:5.1}% {}", d, pct, bar);
|
||||
}
|
||||
|
||||
// Recommendations
|
||||
println!();
|
||||
println!("💡 Insights:");
|
||||
if accuracy < 0.5 {
|
||||
println!(" • Low accuracy - consider enabling constraint rewriting");
|
||||
}
|
||||
if avg_steps > max_depth as f64 * 0.8 {
|
||||
println!(" • High step count - search may be inefficient");
|
||||
}
|
||||
if args.web_search && with_tool_correct > correct / 2 {
|
||||
println!(" • Web search providing substantial gains");
|
||||
}
|
||||
if accuracy >= 0.8 {
|
||||
println!(" • Good performance - ready for harder puzzles");
|
||||
}
|
||||
|
||||
// Flush logs
|
||||
logger.flush()?;
|
||||
println!();
|
||||
println!("📝 Results saved to: {}", args.output);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
248
vendor/ruvector/examples/benchmarks/src/bin/vector_benchmark.rs
vendored
Normal file
248
vendor/ruvector/examples/benchmarks/src/bin/vector_benchmark.rs
vendored
Normal file
@@ -0,0 +1,248 @@
|
||||
//! Vector Index Benchmark Runner
|
||||
//!
|
||||
//! Benchmark vector operations with IVF and coherence gating.
|
||||
//!
|
||||
//! Usage:
|
||||
//! cargo run --bin vector-benchmark -- --dim 128 --vectors 10000
|
||||
|
||||
use anyhow::Result;
|
||||
use clap::Parser;
|
||||
use ruvector_benchmarks::{
|
||||
logging::BenchmarkLogger,
|
||||
vector_index::{CoherenceGate, DenseVec, IvfConfig, VectorIndex},
|
||||
};
|
||||
use std::time::Instant;
|
||||
|
||||
// CLI options for the vector-benchmark binary.
// (Field `///` comments double as clap help text, so they are left unchanged.)
#[derive(Parser, Debug)]
#[command(name = "vector-benchmark")]
#[command(about = "Benchmark vector index operations")]
struct Args {
    /// Vector dimensionality
    #[arg(short, long, default_value = "128")]
    dim: usize,

    /// Number of vectors to insert
    #[arg(short = 'n', long, default_value = "10000")]
    vectors: usize,

    /// Number of queries to run
    #[arg(short, long, default_value = "1000")]
    queries: usize,

    /// Top-k results per query
    #[arg(short, long, default_value = "10")]
    top_k: usize,

    /// Enable IVF indexing
    // NOTE(review): bool flag + default_value = "true" cannot be disabled on
    // the CLI with clap derive's default SetTrue action — confirm intent.
    #[arg(long, default_value = "true")]
    ivf: bool,

    /// Number of IVF clusters
    #[arg(long, default_value = "64")]
    clusters: usize,

    /// Number of clusters to probe
    #[arg(long, default_value = "4")]
    probes: usize,

    /// Enable coherence gate
    #[arg(long)]
    gate: bool,

    /// Coherence gate threshold
    #[arg(long, default_value = "0.5")]
    gate_threshold: f32,

    /// Output log file
    #[arg(short, long, default_value = "logs/vector_benchmark.jsonl")]
    output: String,

    /// Verbose output
    // Uppercase short flag: lowercase -v would collide with other short flags.
    #[arg(short = 'V', long)]
    verbose: bool,
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let args = Args::parse();
|
||||
|
||||
println!("╔══════════════════════════════════════════════════════════════╗");
|
||||
println!("║ Vector Index Benchmark Runner ║");
|
||||
println!("╚══════════════════════════════════════════════════════════════╝");
|
||||
println!();
|
||||
|
||||
// Initialize logger
|
||||
let mut logger = BenchmarkLogger::new(&args.output)?;
|
||||
logger.log_system("INFO", "Starting vector benchmark", "vector-benchmark")?;
|
||||
|
||||
// Create index
|
||||
println!("🔧 Configuration:");
|
||||
println!(" Dimensions: {}", args.dim);
|
||||
println!(" Vectors: {}", args.vectors);
|
||||
println!(" Queries: {}", args.queries);
|
||||
println!(" Top-K: {}", args.top_k);
|
||||
println!(" IVF: {}", args.ivf);
|
||||
if args.ivf {
|
||||
println!(" Clusters: {}", args.clusters);
|
||||
println!(" Probes: {}", args.probes);
|
||||
}
|
||||
println!(" Gate: {}", args.gate);
|
||||
if args.gate {
|
||||
println!(" Threshold: {}", args.gate_threshold);
|
||||
}
|
||||
println!();
|
||||
|
||||
let mut index = VectorIndex::new(args.dim);
|
||||
|
||||
if args.gate {
|
||||
index = index.with_gate(CoherenceGate::new(args.gate_threshold));
|
||||
}
|
||||
|
||||
if args.ivf {
|
||||
index = index.with_ivf(IvfConfig::new(args.clusters, args.probes));
|
||||
}
|
||||
|
||||
// Insert vectors
|
||||
println!("📥 Inserting {} vectors...", args.vectors);
|
||||
let insert_start = Instant::now();
|
||||
|
||||
for i in 0..args.vectors {
|
||||
index.insert(DenseVec::random(args.dim))?;
|
||||
if args.verbose && (i + 1) % 1000 == 0 {
|
||||
println!(" Inserted {} vectors", i + 1);
|
||||
}
|
||||
}
|
||||
|
||||
let insert_time = insert_start.elapsed();
|
||||
println!(
|
||||
"✓ Insert complete ({:.2}s, {:.0} vec/s)",
|
||||
insert_time.as_secs_f64(),
|
||||
args.vectors as f64 / insert_time.as_secs_f64()
|
||||
);
|
||||
println!();
|
||||
|
||||
// Build IVF if enabled
|
||||
if args.ivf {
|
||||
println!("🏗️ Building IVF index...");
|
||||
let build_start = Instant::now();
|
||||
index.rebuild_ivf()?;
|
||||
let build_time = build_start.elapsed();
|
||||
println!("✓ IVF build complete ({:.2}s)", build_time.as_secs_f64());
|
||||
println!();
|
||||
}
|
||||
|
||||
// Print index stats
|
||||
let stats = index.stats();
|
||||
println!("📊 Index Statistics:");
|
||||
println!(" Active vectors: {}", stats.active_vectors);
|
||||
println!(" IVF clusters: {}", stats.ivf_clusters);
|
||||
println!();
|
||||
|
||||
// Run queries
|
||||
println!("🔍 Running {} queries...", args.queries);
|
||||
let query_start = Instant::now();
|
||||
|
||||
let mut latencies: Vec<u64> = Vec::with_capacity(args.queries);
|
||||
let mut total_results = 0usize;
|
||||
|
||||
for i in 0..args.queries {
|
||||
let q = DenseVec::random(args.dim);
|
||||
let coherence = if args.gate {
|
||||
rand::random::<f32>()
|
||||
} else {
|
||||
1.0
|
||||
};
|
||||
|
||||
let start = Instant::now();
|
||||
let results = index.search(&q, args.top_k, coherence)?;
|
||||
let latency_us = start.elapsed().as_micros() as u64;
|
||||
|
||||
latencies.push(latency_us);
|
||||
total_results += results.len();
|
||||
|
||||
// Log query
|
||||
logger.log_vector(
|
||||
"search",
|
||||
args.dim,
|
||||
stats.active_vectors,
|
||||
1,
|
||||
args.top_k,
|
||||
args.ivf,
|
||||
coherence,
|
||||
latency_us,
|
||||
results.len(),
|
||||
)?;
|
||||
|
||||
if args.verbose && (i + 1) % 100 == 0 {
|
||||
println!(" Completed {} queries", i + 1);
|
||||
}
|
||||
}
|
||||
|
||||
let query_time = query_start.elapsed();
|
||||
println!(
|
||||
"✓ Queries complete ({:.2}s, {:.0} q/s)",
|
||||
query_time.as_secs_f64(),
|
||||
args.queries as f64 / query_time.as_secs_f64()
|
||||
);
|
||||
println!();
|
||||
|
||||
// Compute statistics
|
||||
latencies.sort();
|
||||
let p50 = latencies[latencies.len() / 2];
|
||||
let p95 = latencies[latencies.len() * 95 / 100];
|
||||
let p99 = latencies[latencies.len() * 99 / 100];
|
||||
let avg = latencies.iter().sum::<u64>() / latencies.len() as u64;
|
||||
let max = *latencies.last().unwrap();
|
||||
|
||||
println!("╔══════════════════════════════════════════════════════════════╗");
|
||||
println!("║ Benchmark Results ║");
|
||||
println!("╚══════════════════════════════════════════════════════════════╝");
|
||||
println!();
|
||||
println!("⏱️ Latency (microseconds):");
|
||||
println!(" Average: {}µs", avg);
|
||||
println!(" P50: {}µs", p50);
|
||||
println!(" P95: {}µs", p95);
|
||||
println!(" P99: {}µs", p99);
|
||||
println!(" Max: {}µs", max);
|
||||
println!();
|
||||
println!("📈 Throughput:");
|
||||
println!(
|
||||
" Queries/sec: {:.0}",
|
||||
args.queries as f64 / query_time.as_secs_f64()
|
||||
);
|
||||
println!(
|
||||
" Insert/sec: {:.0}",
|
||||
args.vectors as f64 / insert_time.as_secs_f64()
|
||||
);
|
||||
println!();
|
||||
println!("📊 Results:");
|
||||
println!(" Total results: {}", total_results);
|
||||
println!(
|
||||
" Avg results: {:.2}",
|
||||
total_results as f64 / args.queries as f64
|
||||
);
|
||||
|
||||
if args.gate {
|
||||
let gated = latencies
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter(|(_, &l)| l < 10)
|
||||
.count();
|
||||
println!(
|
||||
" Gated queries: {:.1}%",
|
||||
gated as f64 / args.queries as f64 * 100.0
|
||||
);
|
||||
}
|
||||
|
||||
// Save index
|
||||
println!();
|
||||
let index_path = "data/vector_index.bin";
|
||||
std::fs::create_dir_all("data")?;
|
||||
index.save_to_file(index_path)?;
|
||||
println!("💾 Index saved to: {}", index_path);
|
||||
|
||||
// Flush logs
|
||||
logger.flush()?;
|
||||
println!("📝 Results saved to: {}", args.output);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
197
vendor/ruvector/examples/benchmarks/src/bin/wasm_solver_bench.rs
vendored
Normal file
197
vendor/ruvector/examples/benchmarks/src/bin/wasm_solver_bench.rs
vendored
Normal file
@@ -0,0 +1,197 @@
|
||||
//! WASM Solver Benchmark — Compares native vs WASM AGI solver performance.
|
||||
//!
|
||||
//! Runs the same acceptance test configuration through:
|
||||
//! 1. Native Rust solver (benchmarks crate)
|
||||
//! 2. Reference metrics comparison
|
||||
//!
|
||||
//! Usage:
|
||||
//! cargo run --bin wasm-solver-bench [-- --holdout <N> --training <N> --cycles <N>]
|
||||
|
||||
use clap::Parser;
|
||||
use ruvector_benchmarks::acceptance_test::{run_acceptance_test_mode, AblationMode, HoldoutConfig};
|
||||
use std::time::Instant;
|
||||
|
||||
#[derive(Parser)]
|
||||
#[command(name = "wasm-solver-bench")]
|
||||
struct Args {
|
||||
#[arg(long, default_value = "50")]
|
||||
holdout: usize,
|
||||
#[arg(long, default_value = "50")]
|
||||
training: usize,
|
||||
#[arg(long, default_value = "3")]
|
||||
cycles: usize,
|
||||
#[arg(long, default_value = "200")]
|
||||
budget: usize,
|
||||
}
|
||||
|
||||
fn main() {
|
||||
let args = Args::parse();
|
||||
|
||||
println!("╔══════════════════════════════════════════════════════════════╗");
|
||||
println!("║ WASM vs Native AGI Solver Benchmark ║");
|
||||
println!("╚══════════════════════════════════════════════════════════════╝");
|
||||
println!();
|
||||
println!(
|
||||
" Config: holdout={}, training={}, cycles={}, budget={}",
|
||||
args.holdout, args.training, args.cycles, args.budget
|
||||
);
|
||||
println!();
|
||||
|
||||
let config = HoldoutConfig {
|
||||
holdout_size: args.holdout,
|
||||
training_per_cycle: args.training,
|
||||
cycles: args.cycles,
|
||||
step_budget: args.budget,
|
||||
holdout_seed: 0xDEAD_BEEF,
|
||||
training_seed: 42,
|
||||
noise_rate: 0.25,
|
||||
min_accuracy: 0.50,
|
||||
min_dimensions_improved: 1,
|
||||
verbose: false,
|
||||
};
|
||||
|
||||
// ── Native Mode A (Baseline) ──────────────────────────────────
|
||||
println!(" Running Native Mode A (baseline)...");
|
||||
let t0 = Instant::now();
|
||||
let native_a = run_acceptance_test_mode(&config, &AblationMode::Baseline).unwrap();
|
||||
let native_a_ms = t0.elapsed().as_millis();
|
||||
|
||||
// ── Native Mode B (Compiler) ──────────────────────────────────
|
||||
println!(" Running Native Mode B (compiler)...");
|
||||
let t0 = Instant::now();
|
||||
let native_b = run_acceptance_test_mode(&config, &AblationMode::CompilerOnly).unwrap();
|
||||
let native_b_ms = t0.elapsed().as_millis();
|
||||
|
||||
// ── Native Mode C (Full learned) ──────────────────────────────
|
||||
println!(" Running Native Mode C (full learned)...");
|
||||
let t0 = Instant::now();
|
||||
let native_c = run_acceptance_test_mode(&config, &AblationMode::Full).unwrap();
|
||||
let native_c_ms = t0.elapsed().as_millis();
|
||||
|
||||
println!();
|
||||
println!(" ┌────────────────────────────────────────────────────────┐");
|
||||
println!(" │ NATIVE SOLVER RESULTS │");
|
||||
println!(" ├────────────────────────────────────────────────────────┤");
|
||||
println!(
|
||||
" │ {:<12} {:>8} {:>10} {:>10} {:>8} {:>8} │",
|
||||
"Mode", "Acc%", "Cost", "Noise%", "Time", "Pass"
|
||||
);
|
||||
println!(" │ {} │", "-".repeat(54));
|
||||
|
||||
for (label, result, ms) in [
|
||||
("A baseline", &native_a, native_a_ms),
|
||||
("B compiler", &native_b, native_b_ms),
|
||||
("C learned", &native_c, native_c_ms),
|
||||
] {
|
||||
let last = result.result.cycles.last().unwrap();
|
||||
println!(
|
||||
" │ {:<12} {:>6.1}% {:>9.1} {:>8.1}% {:>5}ms {:>7} │",
|
||||
label,
|
||||
last.holdout_accuracy * 100.0,
|
||||
last.holdout_cost_per_solve,
|
||||
last.holdout_noise_accuracy * 100.0,
|
||||
ms,
|
||||
if result.result.passed { "PASS" } else { "FAIL" }
|
||||
);
|
||||
}
|
||||
println!(" └────────────────────────────────────────────────────────┘");
|
||||
println!();
|
||||
|
||||
// ── WASM Reference Metrics ────────────────────────────────────
|
||||
// Since we can't run WASM directly from Rust without a runtime,
|
||||
// we output the reference metrics that the WASM module should match.
|
||||
println!(" ┌────────────────────────────────────────────────────────┐");
|
||||
println!(" │ WASM REFERENCE METRICS (for validation) │");
|
||||
println!(" ├────────────────────────────────────────────────────────┤");
|
||||
println!(" │ │");
|
||||
println!(" │ The rvf-solver-wasm module should produce: │");
|
||||
println!(" │ │");
|
||||
|
||||
let total_ms = native_a_ms + native_b_ms + native_c_ms;
|
||||
println!(
|
||||
" │ Native total time: {}ms │",
|
||||
total_ms
|
||||
);
|
||||
println!(
|
||||
" │ WASM expected: ~{}ms (2-5x native) │",
|
||||
total_ms * 3
|
||||
);
|
||||
println!(" │ │");
|
||||
|
||||
// PolicyKernel convergence check
|
||||
println!(" │ Mode C PolicyKernel: │");
|
||||
println!(
|
||||
" │ Context buckets: {} │",
|
||||
native_c.policy_context_buckets
|
||||
);
|
||||
println!(
|
||||
" │ Early commit rate: {:.2}% │",
|
||||
native_c.early_commit_rate * 100.0
|
||||
);
|
||||
println!(
|
||||
" │ Compiler hits: {} │",
|
||||
native_c.compiler_hits
|
||||
);
|
||||
println!(" │ │");
|
||||
|
||||
// Thompson Sampling convergence: Mode C should learn differently across contexts
|
||||
let c_unique_modes: std::collections::HashSet<&str> = native_c
|
||||
.skip_mode_distribution
|
||||
.values()
|
||||
.flat_map(|m| m.keys())
|
||||
.map(|s| s.as_str())
|
||||
.collect();
|
||||
println!(" │ Thompson Sampling convergence: │");
|
||||
println!(
|
||||
" │ Unique skip modes: {} (need >=2) │",
|
||||
c_unique_modes.len()
|
||||
);
|
||||
println!(" │ Skip distribution: │");
|
||||
for (bucket, dist) in &native_c.skip_mode_distribution {
|
||||
let total = dist.values().sum::<usize>().max(1);
|
||||
let parts: Vec<String> = dist
|
||||
.iter()
|
||||
.map(|(m, c)| format!("{}:{:.0}%", m, *c as f64 / total as f64 * 100.0))
|
||||
.collect();
|
||||
if parts.len() > 0 {
|
||||
println!(" │ {:<16} {} │", bucket, parts.join(" "));
|
||||
}
|
||||
}
|
||||
println!(" │ │");
|
||||
|
||||
// Ablation assertions
|
||||
let last_a = native_a.result.cycles.last().unwrap();
|
||||
let last_b = native_b.result.cycles.last().unwrap();
|
||||
let last_c = native_c.result.cycles.last().unwrap();
|
||||
let cost_decrease = if last_a.holdout_cost_per_solve > 0.0 {
|
||||
(1.0 - last_b.holdout_cost_per_solve / last_a.holdout_cost_per_solve) * 100.0
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
let robustness_gain = (last_c.holdout_noise_accuracy - last_b.holdout_noise_accuracy) * 100.0;
|
||||
|
||||
println!(" │ Ablation assertions: │");
|
||||
println!(
|
||||
" │ B vs A cost decrease: {:.1}% (need >=15%) │",
|
||||
cost_decrease
|
||||
);
|
||||
println!(
|
||||
" │ C vs B robustness: {:.1}% (need >=10%) │",
|
||||
robustness_gain
|
||||
);
|
||||
println!(" │ │");
|
||||
println!(" │ WASM module must match these learning characteristics │");
|
||||
println!(" │ (exact values may differ due to float precision) │");
|
||||
println!(" └────────────────────────────────────────────────────────┘");
|
||||
println!();
|
||||
|
||||
// Final summary
|
||||
let all_passed = native_a.result.passed && native_b.result.passed && native_c.result.passed;
|
||||
if all_passed {
|
||||
println!(" NATIVE BENCHMARK: ALL MODES PASSED");
|
||||
} else {
|
||||
println!(" NATIVE BENCHMARK: SOME MODES FAILED");
|
||||
}
|
||||
println!(" Binary size: rvf-solver-wasm.wasm ~160 KB");
|
||||
println!();
|
||||
}
|
||||
960
vendor/ruvector/examples/benchmarks/src/intelligence_metrics.rs
vendored
Normal file
960
vendor/ruvector/examples/benchmarks/src/intelligence_metrics.rs
vendored
Normal file
@@ -0,0 +1,960 @@
|
||||
//! Intelligence Metrics Module
|
||||
//!
|
||||
//! Measures cognitive capabilities, reasoning quality, and learning indicators
|
||||
//! for agent evaluation based on established AI benchmarking methodologies.
|
||||
//!
|
||||
//! Key metrics tracked:
|
||||
//! - Reasoning quality (logical coherence, constraint satisfaction)
|
||||
//! - Learning efficiency (regret curves, sample efficiency)
|
||||
//! - Working memory (context utilization, information integration)
|
||||
//! - Tool use proficiency (appropriate selection, effective utilization)
|
||||
//! - Meta-cognitive awareness (self-correction, uncertainty estimation)
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
|
||||
/// Intelligence assessment result
///
/// Aggregate output of `IntelligenceCalculator::calculate`: one overall
/// 0-100 score, every per-family metric breakdown, and a copy of the raw
/// data the scores were derived from (so reports are self-contained).
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct IntelligenceAssessment {
    /// Overall intelligence score (0-100)
    pub overall_score: f64,
    /// Individual capability scores
    pub capabilities: CapabilityScores,
    /// Reasoning quality metrics
    pub reasoning: ReasoningMetrics,
    /// Learning efficiency metrics
    pub learning: LearningMetrics,
    /// Tool use proficiency
    pub tool_use: ToolUseMetrics,
    /// Meta-cognitive indicators
    pub meta_cognition: MetaCognitiveMetrics,
    /// Cost efficiency metrics
    pub cost: CostMetrics,
    /// Robustness under noise
    pub robustness: RobustnessMetrics,
    /// Raw performance data
    pub raw_data: RawMetrics,
}
|
||||
|
||||
/// Capability scores across dimensions
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct CapabilityScores {
|
||||
/// Temporal reasoning (date inference, calendar math)
|
||||
pub temporal_reasoning: f64,
|
||||
/// Constraint satisfaction (multi-constraint solving)
|
||||
pub constraint_satisfaction: f64,
|
||||
/// Information retrieval (semantic search, recall)
|
||||
pub information_retrieval: f64,
|
||||
/// Pattern recognition (learning from examples)
|
||||
pub pattern_recognition: f64,
|
||||
/// Planning and sequencing
|
||||
pub planning: f64,
|
||||
/// Error recovery and adaptation
|
||||
pub adaptation: f64,
|
||||
}
|
||||
|
||||
impl Default for CapabilityScores {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
temporal_reasoning: 0.0,
|
||||
constraint_satisfaction: 0.0,
|
||||
information_retrieval: 0.0,
|
||||
pattern_recognition: 0.0,
|
||||
planning: 0.0,
|
||||
adaptation: 0.0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl CapabilityScores {
|
||||
/// Compute weighted average
|
||||
pub fn weighted_average(&self, weights: &[f64; 6]) -> f64 {
|
||||
let scores = [
|
||||
self.temporal_reasoning,
|
||||
self.constraint_satisfaction,
|
||||
self.information_retrieval,
|
||||
self.pattern_recognition,
|
||||
self.planning,
|
||||
self.adaptation,
|
||||
];
|
||||
let total_weight: f64 = weights.iter().sum();
|
||||
if total_weight == 0.0 {
|
||||
return 0.0;
|
||||
}
|
||||
scores
|
||||
.iter()
|
||||
.zip(weights.iter())
|
||||
.map(|(s, w)| s * w)
|
||||
.sum::<f64>()
|
||||
/ total_weight
|
||||
}
|
||||
}
|
||||
|
||||
/// Reasoning quality metrics
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct ReasoningMetrics {
|
||||
/// Logical coherence (steps follow logically)
|
||||
pub logical_coherence: f64,
|
||||
/// Constraint satisfaction rate
|
||||
pub constraint_satisfaction_rate: f64,
|
||||
/// Solution optimality (vs. best possible)
|
||||
pub solution_optimality: f64,
|
||||
/// Reasoning efficiency (steps to solution)
|
||||
pub reasoning_efficiency: f64,
|
||||
/// Error rate in logical steps
|
||||
pub error_rate: f64,
|
||||
}
|
||||
|
||||
impl Default for ReasoningMetrics {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
logical_coherence: 0.0,
|
||||
constraint_satisfaction_rate: 0.0,
|
||||
solution_optimality: 0.0,
|
||||
reasoning_efficiency: 0.0,
|
||||
error_rate: 0.0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Learning efficiency metrics
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct LearningMetrics {
|
||||
/// Sample efficiency (performance vs. examples seen)
|
||||
pub sample_efficiency: f64,
|
||||
/// Regret trajectory (sublinear indicator)
|
||||
pub regret_sublinearity: f64,
|
||||
/// Transfer learning capability
|
||||
pub transfer_capability: f64,
|
||||
/// Learning rate (improvement per episode)
|
||||
pub learning_rate: f64,
|
||||
/// Generalization ability
|
||||
pub generalization: f64,
|
||||
}
|
||||
|
||||
impl Default for LearningMetrics {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
sample_efficiency: 0.0,
|
||||
regret_sublinearity: 0.0,
|
||||
transfer_capability: 0.0,
|
||||
learning_rate: 0.0,
|
||||
generalization: 0.0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Tool use proficiency metrics
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct ToolUseMetrics {
|
||||
/// Tool selection appropriateness
|
||||
pub selection_appropriateness: f64,
|
||||
/// Tool utilization effectiveness
|
||||
pub utilization_effectiveness: f64,
|
||||
/// Tool composition (combining tools)
|
||||
pub composition_ability: f64,
|
||||
/// Tool discovery (finding needed tools)
|
||||
pub discovery_ability: f64,
|
||||
}
|
||||
|
||||
impl Default for ToolUseMetrics {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
selection_appropriateness: 0.0,
|
||||
utilization_effectiveness: 0.0,
|
||||
composition_ability: 0.0,
|
||||
discovery_ability: 0.0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Meta-cognitive metrics
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct MetaCognitiveMetrics {
|
||||
/// Self-correction rate
|
||||
pub self_correction_rate: f64,
|
||||
/// Uncertainty calibration (confidence vs. accuracy)
|
||||
pub uncertainty_calibration: f64,
|
||||
/// Strategy adaptation
|
||||
pub strategy_adaptation: f64,
|
||||
/// Progress monitoring accuracy
|
||||
pub progress_monitoring: f64,
|
||||
}
|
||||
|
||||
impl Default for MetaCognitiveMetrics {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
self_correction_rate: 0.0,
|
||||
uncertainty_calibration: 0.0,
|
||||
strategy_adaptation: 0.0,
|
||||
progress_monitoring: 0.0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Cost efficiency metrics — first-class IQ dimension
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct CostMetrics {
    /// Steps per correct solve (lower = better)
    pub steps_per_solve: f64,
    /// Tool calls per correct solve (lower = better)
    pub tools_per_solve: f64,
    /// Cost efficiency score (0-1, higher = cheaper)
    pub cost_efficiency: f64,
    /// Cost trend over episodes (positive = improving)
    pub cost_trend: f64,
}

impl Default for CostMetrics {
    // A hand-written impl (rather than `#[derive(Default)]`) is needed
    // here: the "no data" baseline is the worst plausible cost, not zero.
    fn default() -> Self {
        Self {
            steps_per_solve: 100.0, // pessimistic: worst-case step count
            tools_per_solve: 10.0,  // pessimistic: worst-case tool usage
            cost_efficiency: 0.0,   // no demonstrated efficiency
            cost_trend: 0.0,        // no trend information
        }
    }
}
|
||||
|
||||
/// Robustness under adversarial conditions — first-class IQ dimension
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct RobustnessMetrics {
    /// Accuracy on noise-injected tasks
    pub noise_accuracy: f64,
    /// Accuracy drop from clean to noisy (lower = more robust)
    pub noise_degradation: f64,
    /// Per-episode accuracy consistency (higher = steadier)
    pub consistency: f64,
    /// Composite robustness score (0-1)
    pub robustness_score: f64,
}

impl Default for RobustnessMetrics {
    // A hand-written impl is needed: `noise_degradation` defaults to the
    // pessimistic maximum (1.0 = total degradation), not zero.
    fn default() -> Self {
        Self {
            noise_accuracy: 0.0,
            noise_degradation: 1.0, // assume total degradation until measured
            consistency: 0.0,
            robustness_score: 0.0,
        }
    }
}
|
||||
|
||||
/// Raw metrics from benchmarks
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct RawMetrics {
|
||||
/// Total tasks attempted
|
||||
pub tasks_attempted: usize,
|
||||
/// Tasks completed successfully
|
||||
pub tasks_completed: usize,
|
||||
/// Tasks with correct solutions
|
||||
pub tasks_correct: usize,
|
||||
/// Total steps taken
|
||||
pub total_steps: usize,
|
||||
/// Total tool calls
|
||||
pub total_tool_calls: usize,
|
||||
/// Total latency in ms
|
||||
pub total_latency_ms: u64,
|
||||
/// Performance by difficulty
|
||||
pub by_difficulty: HashMap<u8, DifficultyStats>,
|
||||
/// Episode-level metrics
|
||||
pub episodes: Vec<EpisodeMetrics>,
|
||||
/// Tasks attempted under noise injection
|
||||
pub noise_tasks_attempted: usize,
|
||||
/// Tasks correct under noise injection
|
||||
pub noise_tasks_correct: usize,
|
||||
/// Policy violations (contradictions, budget overruns)
|
||||
pub policy_violations: usize,
|
||||
/// Solved-but-incorrect count (contradiction rate numerator)
|
||||
pub contradictions: usize,
|
||||
/// Successful rollbacks from noisy to clean
|
||||
pub rollback_successes: usize,
|
||||
/// Attempted rollbacks from noisy to clean
|
||||
pub rollback_attempts: usize,
|
||||
}
|
||||
|
||||
impl Default for RawMetrics {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
tasks_attempted: 0,
|
||||
tasks_completed: 0,
|
||||
tasks_correct: 0,
|
||||
total_steps: 0,
|
||||
total_tool_calls: 0,
|
||||
total_latency_ms: 0,
|
||||
by_difficulty: HashMap::new(),
|
||||
episodes: Vec::new(),
|
||||
noise_tasks_attempted: 0,
|
||||
noise_tasks_correct: 0,
|
||||
policy_violations: 0,
|
||||
contradictions: 0,
|
||||
rollback_successes: 0,
|
||||
rollback_attempts: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Stats per difficulty level
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct DifficultyStats {
    /// Tasks attempted at this difficulty.
    pub attempted: usize,
    /// Tasks that ran to completion (not necessarily correct).
    pub completed: usize,
    /// Tasks solved correctly.
    pub correct: usize,
    /// Mean steps taken per task at this difficulty.
    pub avg_steps: f64,
}
|
||||
|
||||
/// Per-episode metrics
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct EpisodeMetrics {
    /// Episode index (assigned by the producer of this data).
    pub episode: usize,
    /// Accuracy achieved during this episode (0-1).
    pub accuracy: f64,
    /// Reward earned this episode.
    pub reward: f64,
    /// Per-episode regret — presumably oracle reward minus achieved
    /// reward; confirm against the benchmark that fills this in.
    pub regret: f64,
    /// Running sum of regret up to and including this episode.
    pub cumulative_regret: f64,
}
|
||||
|
||||
/// Intelligence metrics calculator
///
/// Holds the tunables used when folding `RawMetrics` into an
/// `IntelligenceAssessment`.
pub struct IntelligenceCalculator {
    /// Weights for capability scoring, applied positionally in
    /// `CapabilityScores` field order.
    pub capability_weights: [f64; 6],
    /// Baseline for comparison
    // NOTE(review): not referenced by any method visible in this module —
    // confirm whether external callers read it or it is dead config.
    pub baseline_accuracy: f64,
    /// Oracle performance for regret calculation
    // NOTE(review): likewise unreferenced in the visible methods.
    pub oracle_reward: f64,
}

impl Default for IntelligenceCalculator {
    fn default() -> Self {
        Self {
            capability_weights: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0], // equal weighting
            baseline_accuracy: 0.5,                             // coin-flip baseline
            oracle_reward: 100.0,                               // assumed perfect-play reward
        }
    }
}
|
||||
|
||||
impl IntelligenceCalculator {
|
||||
    /// Calculate intelligence assessment from raw metrics
    ///
    /// Pure orchestration: derives each metric family from `raw`, then
    /// folds them into a single 0-100 overall score. `raw` is cloned
    /// into the assessment so the report is self-contained.
    pub fn calculate(&self, raw: &RawMetrics) -> IntelligenceAssessment {
        let capabilities = self.calculate_capabilities(raw);
        let reasoning = self.calculate_reasoning(raw);
        let learning = self.calculate_learning(raw);
        let tool_use = self.calculate_tool_use(raw);
        let meta_cognition = self.calculate_meta_cognition(raw);
        let cost = self.calculate_cost(raw);
        let robustness = self.calculate_robustness(raw);

        // Overall score: three equal pillars — graded outcomes, cost, robustness
        let overall_score = self.calculate_overall_score(
            &capabilities,
            &reasoning,
            &learning,
            &tool_use,
            &meta_cognition,
            &cost,
            &robustness,
        );

        IntelligenceAssessment {
            overall_score,
            capabilities,
            reasoning,
            learning,
            tool_use,
            meta_cognition,
            cost,
            robustness,
            raw_data: raw.clone(),
        }
    }
|
||||
|
||||
fn calculate_capabilities(&self, raw: &RawMetrics) -> CapabilityScores {
|
||||
let base_accuracy = if raw.tasks_attempted > 0 {
|
||||
raw.tasks_correct as f64 / raw.tasks_attempted as f64
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
// Temporal reasoning: accuracy on time-based tasks
|
||||
let temporal_reasoning = base_accuracy * 100.0;
|
||||
|
||||
// Constraint satisfaction: correct solutions
|
||||
let constraint_satisfaction = base_accuracy * 100.0;
|
||||
|
||||
// Information retrieval: based on steps to solution
|
||||
let avg_steps = if raw.tasks_attempted > 0 {
|
||||
raw.total_steps as f64 / raw.tasks_attempted as f64
|
||||
} else {
|
||||
100.0
|
||||
};
|
||||
let information_retrieval = (100.0 - avg_steps).max(0.0).min(100.0);
|
||||
|
||||
// Pattern recognition: performance improvement across difficulties
|
||||
let pattern_recognition = self.calculate_pattern_recognition(raw);
|
||||
|
||||
// Planning: efficiency of tool use
|
||||
let avg_tools = if raw.tasks_attempted > 0 {
|
||||
raw.total_tool_calls as f64 / raw.tasks_attempted as f64
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
let planning = if avg_tools > 0.0 && avg_tools <= 2.0 {
|
||||
100.0 * (1.0 - (avg_tools - 1.0).abs() / 2.0)
|
||||
} else {
|
||||
50.0
|
||||
};
|
||||
|
||||
// Adaptation: improvement over episodes
|
||||
let adaptation = self.calculate_adaptation(raw);
|
||||
|
||||
CapabilityScores {
|
||||
temporal_reasoning,
|
||||
constraint_satisfaction,
|
||||
information_retrieval,
|
||||
pattern_recognition,
|
||||
planning,
|
||||
adaptation,
|
||||
}
|
||||
}
|
||||
|
||||
fn calculate_pattern_recognition(&self, raw: &RawMetrics) -> f64 {
|
||||
if raw.by_difficulty.len() < 2 {
|
||||
return 50.0;
|
||||
}
|
||||
|
||||
// Check if harder problems are still solvable
|
||||
let mut difficulties: Vec<_> = raw.by_difficulty.keys().copied().collect();
|
||||
difficulties.sort();
|
||||
|
||||
let mut scores = Vec::new();
|
||||
for d in &difficulties {
|
||||
if let Some(stats) = raw.by_difficulty.get(d) {
|
||||
if stats.attempted > 0 {
|
||||
scores.push(stats.correct as f64 / stats.attempted as f64);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if scores.is_empty() {
|
||||
return 50.0;
|
||||
}
|
||||
|
||||
// Average accuracy across difficulties
|
||||
let avg: f64 = scores.iter().sum::<f64>() / scores.len() as f64;
|
||||
avg * 100.0
|
||||
}
|
||||
|
||||
fn calculate_adaptation(&self, raw: &RawMetrics) -> f64 {
|
||||
if raw.episodes.len() < 3 {
|
||||
return 50.0;
|
||||
}
|
||||
|
||||
// Check if accuracy improves over episodes
|
||||
let first_half: f64 = raw.episodes[..raw.episodes.len() / 2]
|
||||
.iter()
|
||||
.map(|e| e.accuracy)
|
||||
.sum::<f64>()
|
||||
/ (raw.episodes.len() / 2) as f64;
|
||||
|
||||
let second_half: f64 = raw.episodes[raw.episodes.len() / 2..]
|
||||
.iter()
|
||||
.map(|e| e.accuracy)
|
||||
.sum::<f64>()
|
||||
/ (raw.episodes.len() - raw.episodes.len() / 2) as f64;
|
||||
|
||||
let improvement = second_half - first_half;
|
||||
|
||||
// Scale: -0.2 to +0.2 improvement maps to 0-100
|
||||
((improvement + 0.2) / 0.4 * 100.0).max(0.0).min(100.0)
|
||||
}
|
||||
|
||||
fn calculate_reasoning(&self, raw: &RawMetrics) -> ReasoningMetrics {
|
||||
let constraint_satisfaction_rate = if raw.tasks_attempted > 0 {
|
||||
raw.tasks_correct as f64 / raw.tasks_attempted as f64
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
let avg_steps = if raw.tasks_attempted > 0 {
|
||||
raw.total_steps as f64 / raw.tasks_attempted as f64
|
||||
} else {
|
||||
100.0
|
||||
};
|
||||
|
||||
// Reasoning efficiency: inverse of steps (normalized)
|
||||
let reasoning_efficiency = (100.0 - avg_steps).max(0.0).min(100.0) / 100.0;
|
||||
|
||||
// Logical coherence: based on completion rate vs correct rate
|
||||
let completion_rate = if raw.tasks_attempted > 0 {
|
||||
raw.tasks_completed as f64 / raw.tasks_attempted as f64
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
let logical_coherence = if completion_rate > 0.0 {
|
||||
constraint_satisfaction_rate / completion_rate
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
ReasoningMetrics {
|
||||
logical_coherence,
|
||||
constraint_satisfaction_rate,
|
||||
solution_optimality: constraint_satisfaction_rate,
|
||||
reasoning_efficiency,
|
||||
error_rate: 1.0 - constraint_satisfaction_rate,
|
||||
}
|
||||
}
|
||||
|
||||
fn calculate_learning(&self, raw: &RawMetrics) -> LearningMetrics {
|
||||
let mut learning = LearningMetrics::default();
|
||||
|
||||
if raw.episodes.is_empty() {
|
||||
return learning;
|
||||
}
|
||||
|
||||
// Sample efficiency: accuracy per episode
|
||||
learning.sample_efficiency =
|
||||
raw.episodes.iter().map(|e| e.accuracy).sum::<f64>() / raw.episodes.len() as f64;
|
||||
|
||||
// Regret sublinearity: check if cumulative regret grows sublinearly
|
||||
// True sublinearity means R_k/k → 0 as k → ∞ (regret per episode decreasing)
|
||||
if raw.episodes.len() >= 5 {
|
||||
// Calculate regret trend using linear regression
|
||||
let n = raw.episodes.len() as f64;
|
||||
let mut sum_x = 0.0;
|
||||
let mut sum_y = 0.0;
|
||||
let mut sum_xy = 0.0;
|
||||
let mut sum_xx = 0.0;
|
||||
|
||||
for (i, ep) in raw.episodes.iter().enumerate() {
|
||||
let x = (i + 1) as f64;
|
||||
let y = ep.regret;
|
||||
sum_x += x;
|
||||
sum_y += y;
|
||||
sum_xy += x * y;
|
||||
sum_xx += x * x;
|
||||
}
|
||||
|
||||
let slope = (n * sum_xy - sum_x * sum_y) / (n * sum_xx - sum_x * sum_x);
|
||||
|
||||
// Negative slope = decreasing regret = sublinear
|
||||
// Transform: slope < 0 → sublinearity > 0
|
||||
if slope < 0.0 {
|
||||
// Stronger negative slope = better sublinearity (cap at 1.0)
|
||||
learning.regret_sublinearity = (-slope / 10.0).min(1.0);
|
||||
}
|
||||
|
||||
// Also check cumulative average
|
||||
let last = raw.episodes.last().unwrap();
|
||||
let avg_regret = last.cumulative_regret / n;
|
||||
let first_half_avg = raw
|
||||
.episodes
|
||||
.iter()
|
||||
.take(raw.episodes.len() / 2)
|
||||
.map(|e| e.regret)
|
||||
.sum::<f64>()
|
||||
/ (n / 2.0);
|
||||
|
||||
// If second half has lower per-episode regret, that's sublinear
|
||||
if avg_regret < first_half_avg && learning.regret_sublinearity == 0.0 {
|
||||
learning.regret_sublinearity =
|
||||
((first_half_avg - avg_regret) / first_half_avg).max(0.0);
|
||||
}
|
||||
}
|
||||
|
||||
// Learning rate: improvement in accuracy over episodes
|
||||
if raw.episodes.len() >= 2 {
|
||||
let first_acc = raw.episodes[0].accuracy;
|
||||
let last_acc = raw.episodes.last().unwrap().accuracy;
|
||||
learning.learning_rate = (last_acc - first_acc + 1.0) / 2.0;
|
||||
}
|
||||
|
||||
// Generalization: consistency across difficulties
|
||||
if raw.by_difficulty.len() >= 2 {
|
||||
let accuracies: Vec<f64> = raw
|
||||
.by_difficulty
|
||||
.values()
|
||||
.filter(|s| s.attempted > 0)
|
||||
.map(|s| s.correct as f64 / s.attempted as f64)
|
||||
.collect();
|
||||
|
||||
if !accuracies.is_empty() {
|
||||
let mean = accuracies.iter().sum::<f64>() / accuracies.len() as f64;
|
||||
let variance = accuracies.iter().map(|a| (a - mean).powi(2)).sum::<f64>()
|
||||
/ accuracies.len() as f64;
|
||||
let std_dev = variance.sqrt();
|
||||
|
||||
// Lower variance = better generalization
|
||||
learning.generalization = (1.0 - std_dev).max(0.0);
|
||||
}
|
||||
}
|
||||
|
||||
learning
|
||||
}
|
||||
|
||||
fn calculate_tool_use(&self, raw: &RawMetrics) -> ToolUseMetrics {
|
||||
let avg_tools = if raw.tasks_attempted > 0 {
|
||||
raw.total_tool_calls as f64 / raw.tasks_attempted as f64
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
// Selection appropriateness: using tools when helpful
|
||||
let accuracy = if raw.tasks_attempted > 0 {
|
||||
raw.tasks_correct as f64 / raw.tasks_attempted as f64
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
// Effectiveness: accuracy when tools are used
|
||||
let utilization_effectiveness = accuracy;
|
||||
|
||||
// Appropriateness: not overusing tools
|
||||
let selection_appropriateness = if avg_tools > 0.0 {
|
||||
(accuracy / avg_tools.min(2.0)).min(1.0)
|
||||
} else {
|
||||
0.5
|
||||
};
|
||||
|
||||
ToolUseMetrics {
|
||||
selection_appropriateness,
|
||||
utilization_effectiveness,
|
||||
composition_ability: avg_tools.min(1.0), // Using multiple tools
|
||||
discovery_ability: accuracy, // Finding solutions
|
||||
}
|
||||
}
|
||||
|
||||
fn calculate_meta_cognition(&self, raw: &RawMetrics) -> MetaCognitiveMetrics {
|
||||
// Self-correction: completed but not correct -> corrected
|
||||
let completed_but_wrong = raw.tasks_completed.saturating_sub(raw.tasks_correct);
|
||||
let self_correction_rate = if completed_but_wrong > 0 {
|
||||
0.0 // No self-correction if still wrong
|
||||
} else if raw.tasks_completed > 0 {
|
||||
1.0 // All completed are correct
|
||||
} else {
|
||||
0.5
|
||||
};
|
||||
|
||||
// Strategy adaptation: improvement over episodes
|
||||
let strategy_adaptation = if raw.episodes.len() >= 3 {
|
||||
let trend: f64 = raw
|
||||
.episodes
|
||||
.windows(2)
|
||||
.map(|w| {
|
||||
if w[1].accuracy > w[0].accuracy {
|
||||
1.0
|
||||
} else {
|
||||
0.0
|
||||
}
|
||||
})
|
||||
.sum::<f64>();
|
||||
trend / (raw.episodes.len() - 1) as f64
|
||||
} else {
|
||||
0.5
|
||||
};
|
||||
|
||||
MetaCognitiveMetrics {
|
||||
self_correction_rate,
|
||||
uncertainty_calibration: 0.5, // Would need confidence scores
|
||||
strategy_adaptation,
|
||||
progress_monitoring: strategy_adaptation, // Similar metric
|
||||
}
|
||||
}
|
||||
|
||||
fn calculate_cost(&self, raw: &RawMetrics) -> CostMetrics {
|
||||
let steps_per_solve = if raw.tasks_correct > 0 {
|
||||
raw.total_steps as f64 / raw.tasks_correct as f64
|
||||
} else if raw.tasks_attempted > 0 {
|
||||
raw.total_steps as f64
|
||||
} else {
|
||||
100.0
|
||||
};
|
||||
|
||||
let tools_per_solve = if raw.tasks_correct > 0 {
|
||||
raw.total_tool_calls as f64 / raw.tasks_correct as f64
|
||||
} else {
|
||||
10.0
|
||||
};
|
||||
|
||||
// Efficiency: 1.0 at <=5 steps/solve, 0.0 at >=100 steps/solve
|
||||
let cost_efficiency = (1.0 - (steps_per_solve - 5.0) / 95.0).clamp(0.0, 1.0);
|
||||
|
||||
// Cost trend: compare early vs late episode accuracy per step
|
||||
let cost_trend = if raw.episodes.len() >= 4 {
|
||||
let half = raw.episodes.len() / 2;
|
||||
let early_acc: f64 =
|
||||
raw.episodes[..half].iter().map(|e| e.accuracy).sum::<f64>() / half as f64;
|
||||
let late_acc: f64 = raw.episodes[half..].iter().map(|e| e.accuracy).sum::<f64>()
|
||||
/ (raw.episodes.len() - half) as f64;
|
||||
// If accuracy improves, effective cost per solve drops
|
||||
if early_acc > 0.01 {
|
||||
(late_acc - early_acc) / early_acc
|
||||
} else {
|
||||
0.0
|
||||
}
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
CostMetrics {
|
||||
steps_per_solve,
|
||||
tools_per_solve,
|
||||
cost_efficiency,
|
||||
cost_trend,
|
||||
}
|
||||
}
|
||||
|
||||
    /// Derive robustness metrics from noisy-vs-clean task outcomes and
    /// per-episode accuracy variance.
    fn calculate_robustness(&self, raw: &RawMetrics) -> RobustnessMetrics {
        let noise_accuracy = if raw.noise_tasks_attempted > 0 {
            raw.noise_tasks_correct as f64 / raw.noise_tasks_attempted as f64
        } else {
            0.5 // no noise data -> neutral prior
        };

        // NOTE(review): the subtractions below assume noise tasks are
        // counted *inside* tasks_attempted / tasks_correct — confirm the
        // benchmark producing RawMetrics accounts them that way.
        let clean_attempted = raw
            .tasks_attempted
            .saturating_sub(raw.noise_tasks_attempted);
        let clean_correct = raw.tasks_correct.saturating_sub(raw.noise_tasks_correct);
        let clean_accuracy = if clean_attempted > 0 {
            clean_correct as f64 / clean_attempted as f64
        } else {
            0.0
        };

        // Clamped at zero: doing *better* under noise counts as no degradation.
        let noise_degradation = (clean_accuracy - noise_accuracy).max(0.0);

        // Consistency: 1 - population stddev of per-episode accuracy;
        // neutral 0.5 with fewer than two episodes.
        let consistency = if raw.episodes.len() >= 2 {
            let mean =
                raw.episodes.iter().map(|e| e.accuracy).sum::<f64>() / raw.episodes.len() as f64;
            let variance = raw
                .episodes
                .iter()
                .map(|e| (e.accuracy - mean).powi(2))
                .sum::<f64>()
                / raw.episodes.len() as f64;
            (1.0 - variance.sqrt()).max(0.0)
        } else {
            0.5
        };

        // Composite: 40% noisy accuracy, 30% low degradation, 30% consistency.
        let robustness_score =
            noise_accuracy * 0.4 + (1.0 - noise_degradation.min(1.0)) * 0.3 + consistency * 0.3;

        RobustnessMetrics {
            noise_accuracy,
            noise_degradation,
            consistency,
            robustness_score,
        }
    }
|
||||
|
||||
    /// Fold all sub-metrics into one 0-100 score.
    ///
    /// Weighting is three roughly equal pillars: graded outcomes
    /// (capabilities 0.12 + reasoning 0.10 + learning 0.06 + tool 0.03
    /// + meta 0.03 = 0.34), cost 0.33, robustness 0.33 — the weights sum
    /// to 1.00, keeping the result on the 0-100 scale of its inputs.
    fn calculate_overall_score(
        &self,
        capabilities: &CapabilityScores,
        reasoning: &ReasoningMetrics,
        learning: &LearningMetrics,
        tool_use: &ToolUseMetrics,
        meta_cognition: &MetaCognitiveMetrics,
        cost: &CostMetrics,
        robustness: &RobustnessMetrics,
    ) -> f64 {
        // Sub-scores (0-100 scale)
        let cap_score = capabilities.weighted_average(&self.capability_weights);

        // error_rate is deliberately excluded (it is 1 - satisfaction rate,
        // already represented); the remaining four 0-1 rates are averaged.
        let reasoning_score = (reasoning.logical_coherence
            + reasoning.constraint_satisfaction_rate
            + reasoning.solution_optimality
            + reasoning.reasoning_efficiency)
            / 4.0
            * 100.0;

        // transfer_capability is excluded — calculate_learning never sets it.
        let learning_score = (learning.sample_efficiency
            + learning.regret_sublinearity
            + learning.learning_rate
            + learning.generalization)
            / 4.0
            * 100.0;

        let tool_score = (tool_use.selection_appropriateness
            + tool_use.utilization_effectiveness
            + tool_use.composition_ability
            + tool_use.discovery_ability)
            / 4.0
            * 100.0;

        // uncertainty_calibration is excluded — it is a constant 0.5 placeholder.
        let meta_score = (meta_cognition.self_correction_rate
            + meta_cognition.strategy_adaptation
            + meta_cognition.progress_monitoring)
            / 3.0
            * 100.0;

        let cost_score = cost.cost_efficiency * 100.0;
        let robustness_score = robustness.robustness_score * 100.0;

        // Three equal pillars: graded outcomes (~0.34), cost (~0.33), robustness (~0.33)
        // Graded outcomes = capabilities + reasoning + learning + tool + meta
        cap_score * 0.12
            + reasoning_score * 0.10
            + learning_score * 0.06
            + tool_score * 0.03
            + meta_score * 0.03
            + cost_score * 0.33
            + robustness_score * 0.33
    }
|
||||
}
|
||||
|
||||
/// Print a formatted intelligence report
|
||||
pub fn print_intelligence_report(assessment: &IntelligenceAssessment) {
|
||||
println!("╔══════════════════════════════════════════════════════════════╗");
|
||||
println!("║ Intelligence Assessment Report ║");
|
||||
println!("╚══════════════════════════════════════════════════════════════╝");
|
||||
println!();
|
||||
println!(
|
||||
"🧠 Overall Intelligence Score: {:.1}/100",
|
||||
assessment.overall_score
|
||||
);
|
||||
println!();
|
||||
|
||||
println!("📊 Capability Scores:");
|
||||
println!(
|
||||
" Temporal Reasoning: {:5.1}",
|
||||
assessment.capabilities.temporal_reasoning
|
||||
);
|
||||
println!(
|
||||
" Constraint Satisfaction:{:5.1}",
|
||||
assessment.capabilities.constraint_satisfaction
|
||||
);
|
||||
println!(
|
||||
" Information Retrieval: {:5.1}",
|
||||
assessment.capabilities.information_retrieval
|
||||
);
|
||||
println!(
|
||||
" Pattern Recognition: {:5.1}",
|
||||
assessment.capabilities.pattern_recognition
|
||||
);
|
||||
println!(
|
||||
" Planning: {:5.1}",
|
||||
assessment.capabilities.planning
|
||||
);
|
||||
println!(
|
||||
" Adaptation: {:5.1}",
|
||||
assessment.capabilities.adaptation
|
||||
);
|
||||
println!();
|
||||
|
||||
println!("🔍 Reasoning Quality:");
|
||||
println!(
|
||||
" Logical Coherence: {:.2}",
|
||||
assessment.reasoning.logical_coherence
|
||||
);
|
||||
println!(
|
||||
" Constraint Satisfaction:{:.2}",
|
||||
assessment.reasoning.constraint_satisfaction_rate
|
||||
);
|
||||
println!(
|
||||
" Solution Optimality: {:.2}",
|
||||
assessment.reasoning.solution_optimality
|
||||
);
|
||||
println!(
|
||||
" Reasoning Efficiency: {:.2}",
|
||||
assessment.reasoning.reasoning_efficiency
|
||||
);
|
||||
println!(
|
||||
" Error Rate: {:.2}",
|
||||
assessment.reasoning.error_rate
|
||||
);
|
||||
println!();
|
||||
|
||||
println!("📈 Learning Metrics:");
|
||||
println!(
|
||||
" Sample Efficiency: {:.2}",
|
||||
assessment.learning.sample_efficiency
|
||||
);
|
||||
println!(
|
||||
" Regret Sublinearity: {:.2}",
|
||||
assessment.learning.regret_sublinearity
|
||||
);
|
||||
println!(
|
||||
" Learning Rate: {:.2}",
|
||||
assessment.learning.learning_rate
|
||||
);
|
||||
println!(
|
||||
" Generalization: {:.2}",
|
||||
assessment.learning.generalization
|
||||
);
|
||||
println!();
|
||||
|
||||
println!("🔧 Tool Use Proficiency:");
|
||||
println!(
|
||||
" Selection: {:.2}",
|
||||
assessment.tool_use.selection_appropriateness
|
||||
);
|
||||
println!(
|
||||
" Effectiveness: {:.2}",
|
||||
assessment.tool_use.utilization_effectiveness
|
||||
);
|
||||
println!(
|
||||
" Composition: {:.2}",
|
||||
assessment.tool_use.composition_ability
|
||||
);
|
||||
println!();
|
||||
|
||||
println!("🪞 Meta-Cognitive Indicators:");
|
||||
println!(
|
||||
" Self-Correction: {:.2}",
|
||||
assessment.meta_cognition.self_correction_rate
|
||||
);
|
||||
println!(
|
||||
" Strategy Adaptation: {:.2}",
|
||||
assessment.meta_cognition.strategy_adaptation
|
||||
);
|
||||
println!(
|
||||
" Progress Monitoring: {:.2}",
|
||||
assessment.meta_cognition.progress_monitoring
|
||||
);
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// A partially completed run should yield strictly positive scores.
    #[test]
    fn test_intelligence_calculation() {
        // Struct-update syntax over the defaults instead of field-by-field
        // reassignment; the resulting metrics are identical.
        let raw = RawMetrics {
            tasks_attempted: 100,
            tasks_completed: 90,
            tasks_correct: 80,
            total_steps: 500,
            total_tool_calls: 100,
            ..Default::default()
        };

        let assessment = IntelligenceCalculator::default().calculate(&raw);

        assert!(assessment.overall_score > 0.0);
        assert!(assessment.capabilities.temporal_reasoning > 0.0);
    }

    /// Monotonically improving episodes should register as learning.
    #[test]
    fn test_learning_metrics() {
        let mut raw = RawMetrics {
            tasks_attempted: 50,
            tasks_correct: 40,
            ..Default::default()
        };

        // Ten episodes with rising accuracy/reward and falling regret.
        for i in 0..10 {
            raw.episodes.push(EpisodeMetrics {
                episode: i + 1,
                accuracy: 0.5 + 0.04 * i as f64,
                reward: 50.0 + 4.0 * i as f64,
                regret: 50.0 - 4.0 * i as f64,
                cumulative_regret: (0..=i).map(|j| 50.0 - 4.0 * j as f64).sum(),
            });
        }

        let assessment = IntelligenceCalculator::default().calculate(&raw);

        // Should show learning (improvement over time)
        assert!(assessment.learning.learning_rate > 0.5);
    }
}
|
||||
38
vendor/ruvector/examples/benchmarks/src/lib.rs
vendored
Normal file
38
vendor/ruvector/examples/benchmarks/src/lib.rs
vendored
Normal file
@@ -0,0 +1,38 @@
|
||||
//! RuVector Benchmarks Library
//!
//! Comprehensive benchmarking suite for:
//! - Temporal reasoning (TimePuzzles-style constraint inference)
//! - Vector index operations (IVF, coherence-gated search)
//! - Swarm controller regret tracking
//! - Intelligence metrics and cognitive capability assessment
//! - Adaptive learning with ReasoningBank trajectory tracking
//!
//! Based on research from:
//! - TimePuzzles benchmark (arXiv:2601.07148)
//! - Sublinear regret in multi-agent control
//! - Tool-augmented iterative temporal reasoning
//! - Cognitive capability assessment frameworks
//! - lean-agentic type theory for verified reasoning

// Benchmark, assessment, and infrastructure modules.
pub mod acceptance_test;
pub mod agi_contract;
pub mod intelligence_metrics;
pub mod logging;
pub mod loop_gating;
pub mod publishable_rvf;
pub mod reasoning_bank;
pub mod rvf_artifact;
pub mod rvf_intelligence_bench;
pub mod superintelligence;
pub mod swarm_regret;
pub mod temporal;
pub mod timepuzzles;
pub mod vector_index;

// Flat re-exports of the most commonly used modules; the remaining
// modules (acceptance_test, agi_contract, loop_gating, …) are reached
// through their full paths.
pub use intelligence_metrics::*;
pub use logging::*;
pub use reasoning_bank::*;
pub use swarm_regret::*;
pub use temporal::*;
pub use timepuzzles::*;
pub use vector_index::*;
|
||||
421
vendor/ruvector/examples/benchmarks/src/logging.rs
vendored
Normal file
421
vendor/ruvector/examples/benchmarks/src/logging.rs
vendored
Normal file
@@ -0,0 +1,421 @@
|
||||
//! Logging Schema for Benchmark Results
|
||||
//!
|
||||
//! Comprehensive logging for:
|
||||
//! - Temporal reasoning benchmarks
|
||||
//! - Vector operations
|
||||
//! - Swarm controller metrics
|
||||
//! - Tool usage tracking
|
||||
|
||||
use anyhow::Result;
|
||||
use chrono::{DateTime, Utc};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fs::{self, File, OpenOptions};
|
||||
use std::io::{BufWriter, Write};
|
||||
use std::path::Path;
|
||||
|
||||
/// Log entry types
///
/// Serialized as internally tagged JSON via `#[serde(tag = "type")]`:
/// each line carries a `"type"` field naming the variant alongside the
/// variant's own fields.
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(tag = "type")]
pub enum LogEntry {
    /// Temporal benchmark run
    TemporalBenchmark(TemporalBenchmarkLog),
    /// Vector operation
    VectorOperation(VectorOperationLog),
    /// Swarm episode
    SwarmEpisode(SwarmEpisodeLog),
    /// Tool call
    ToolCall(ToolCallLog),
    /// System event
    System(SystemLog),
}
|
||||
|
||||
/// Temporal benchmark log entry
///
/// One record per puzzle attempt; see `BenchmarkLogger::log_temporal`.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct TemporalBenchmarkLog {
    /// UTC time the entry was recorded.
    pub timestamp: DateTime<Utc>,
    /// Identifier of the benchmark run this puzzle belongs to.
    pub benchmark_id: String,
    /// Identifier of the individual puzzle.
    pub puzzle_id: String,
    /// Puzzle difficulty rating.
    pub difficulty: u8,
    /// Whether the solver produced an answer.
    pub solved: bool,
    /// Whether the produced answer was correct.
    pub correct: bool,
    /// Number of solver steps taken.
    pub steps: usize,
    /// Number of external tool invocations.
    pub tool_calls: usize,
    /// Solve latency in milliseconds.
    pub latency_ms: u64,
    /// Number of constraints in the puzzle.
    pub constraint_count: usize,
    /// Whether the calendar tool was available during the run.
    pub calendar_tool_enabled: bool,
    /// Whether web search was available during the run.
    pub web_search_enabled: bool,
}
|
||||
|
||||
/// Vector operation log entry
///
/// One record per vector-index operation; see `BenchmarkLogger::log_vector`.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct VectorOperationLog {
    /// UTC time the entry was recorded.
    pub timestamp: DateTime<Utc>,
    /// Free-form operation name.
    pub operation: String,
    /// Dimensionality of the indexed vectors.
    pub index_dim: usize,
    /// Number of vectors in the index.
    pub index_size: usize,
    /// Number of queries issued in this operation.
    pub query_count: usize,
    /// Requested number of results per query.
    pub top_k: usize,
    /// Whether IVF indexing was active.
    pub ivf_enabled: bool,
    /// Coherence score reported for the search.
    pub coherence_score: f32,
    /// Operation latency in microseconds.
    pub latency_us: u64,
    /// Number of results actually returned.
    pub results_count: usize,
}
|
||||
|
||||
/// Swarm episode log entry
///
/// One record per swarm-controller episode; see `BenchmarkLogger::log_swarm`.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SwarmEpisodeLog {
    /// UTC time the entry was recorded.
    pub timestamp: DateTime<Utc>,
    /// Episode index within the run.
    pub episode: usize,
    /// Tasks attempted this episode.
    pub num_tasks: usize,
    /// Tasks for which an answer was produced.
    pub solved: usize,
    /// Tasks answered correctly.
    pub correct: usize,
    /// Reward achieved this episode.
    pub reward: f64,
    /// Reward an oracle policy would have achieved.
    pub oracle_reward: f64,
    /// Per-episode regret; computed as `oracle_reward - reward` by `log_swarm`.
    pub regret: f64,
    /// Regret accumulated across episodes so far.
    pub cumulative_regret: f64,
    /// Average regret per episode so far.
    pub average_regret: f64,
    /// Whether regret growth is judged sublinear.
    pub is_sublinear: bool,
}
|
||||
|
||||
/// Tool call log entry
///
/// One record per external tool invocation; see `BenchmarkLogger::log_tool`.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ToolCallLog {
    /// UTC time the entry was recorded.
    pub timestamp: DateTime<Utc>,
    /// Name of the invoked tool.
    pub tool_name: String,
    /// Category of the tool (free-form string).
    pub tool_type: String,
    /// Short human-readable summary of the tool input.
    pub input_summary: String,
    /// Whether the call succeeded.
    pub success: bool,
    /// Call latency in milliseconds.
    pub latency_ms: u64,
    /// Free-form context describing where the call happened.
    pub context: String,
}
|
||||
|
||||
/// System log entry
///
/// General-purpose event record; see `BenchmarkLogger::log_system`.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SystemLog {
    /// UTC time the entry was recorded.
    pub timestamp: DateTime<Utc>,
    /// Severity level (free-form string; not validated anywhere visible).
    pub level: String,
    /// Log message text.
    pub message: String,
    /// Component that emitted the message.
    pub component: String,
}
|
||||
|
||||
/// Benchmark logger
///
/// Buffers entries in memory and appends them to a JSON-lines file (one
/// serialized [`LogEntry`] per line) when the buffer reaches
/// `flush_threshold`, on explicit `flush`/`close`, or on drop.
pub struct BenchmarkLogger {
    /// Log file path
    path: String,
    /// Writer (becomes `None` after `close`)
    writer: Option<BufWriter<File>>,
    /// In-memory buffer for batch writes
    buffer: Vec<LogEntry>,
    /// Buffer size before flush
    flush_threshold: usize,
}
|
||||
|
||||
impl BenchmarkLogger {
|
||||
/// Create a new logger
|
||||
pub fn new(path: impl Into<String>) -> Result<Self> {
|
||||
let path = path.into();
|
||||
|
||||
// Create parent directories
|
||||
if let Some(parent) = Path::new(&path).parent() {
|
||||
fs::create_dir_all(parent)?;
|
||||
}
|
||||
|
||||
let file = OpenOptions::new().create(true).append(true).open(&path)?;
|
||||
|
||||
Ok(Self {
|
||||
path,
|
||||
writer: Some(BufWriter::new(file)),
|
||||
buffer: Vec::new(),
|
||||
flush_threshold: 100,
|
||||
})
|
||||
}
|
||||
|
||||
/// Log an entry
|
||||
pub fn log(&mut self, entry: LogEntry) -> Result<()> {
|
||||
self.buffer.push(entry);
|
||||
if self.buffer.len() >= self.flush_threshold {
|
||||
self.flush()?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Log a temporal benchmark result
|
||||
pub fn log_temporal(
|
||||
&mut self,
|
||||
benchmark_id: impl Into<String>,
|
||||
puzzle_id: impl Into<String>,
|
||||
difficulty: u8,
|
||||
solved: bool,
|
||||
correct: bool,
|
||||
steps: usize,
|
||||
tool_calls: usize,
|
||||
latency_ms: u64,
|
||||
constraint_count: usize,
|
||||
calendar_tool: bool,
|
||||
web_search: bool,
|
||||
) -> Result<()> {
|
||||
self.log(LogEntry::TemporalBenchmark(TemporalBenchmarkLog {
|
||||
timestamp: Utc::now(),
|
||||
benchmark_id: benchmark_id.into(),
|
||||
puzzle_id: puzzle_id.into(),
|
||||
difficulty,
|
||||
solved,
|
||||
correct,
|
||||
steps,
|
||||
tool_calls,
|
||||
latency_ms,
|
||||
constraint_count,
|
||||
calendar_tool_enabled: calendar_tool,
|
||||
web_search_enabled: web_search,
|
||||
}))
|
||||
}
|
||||
|
||||
/// Log a vector operation
|
||||
pub fn log_vector(
|
||||
&mut self,
|
||||
operation: impl Into<String>,
|
||||
index_dim: usize,
|
||||
index_size: usize,
|
||||
query_count: usize,
|
||||
top_k: usize,
|
||||
ivf_enabled: bool,
|
||||
coherence_score: f32,
|
||||
latency_us: u64,
|
||||
results_count: usize,
|
||||
) -> Result<()> {
|
||||
self.log(LogEntry::VectorOperation(VectorOperationLog {
|
||||
timestamp: Utc::now(),
|
||||
operation: operation.into(),
|
||||
index_dim,
|
||||
index_size,
|
||||
query_count,
|
||||
top_k,
|
||||
ivf_enabled,
|
||||
coherence_score,
|
||||
latency_us,
|
||||
results_count,
|
||||
}))
|
||||
}
|
||||
|
||||
/// Log a swarm episode
|
||||
pub fn log_swarm(
|
||||
&mut self,
|
||||
episode: usize,
|
||||
num_tasks: usize,
|
||||
solved: usize,
|
||||
correct: usize,
|
||||
reward: f64,
|
||||
oracle_reward: f64,
|
||||
cumulative_regret: f64,
|
||||
average_regret: f64,
|
||||
is_sublinear: bool,
|
||||
) -> Result<()> {
|
||||
self.log(LogEntry::SwarmEpisode(SwarmEpisodeLog {
|
||||
timestamp: Utc::now(),
|
||||
episode,
|
||||
num_tasks,
|
||||
solved,
|
||||
correct,
|
||||
reward,
|
||||
oracle_reward,
|
||||
regret: oracle_reward - reward,
|
||||
cumulative_regret,
|
||||
average_regret,
|
||||
is_sublinear,
|
||||
}))
|
||||
}
|
||||
|
||||
/// Log a tool call
|
||||
pub fn log_tool(
|
||||
&mut self,
|
||||
tool_name: impl Into<String>,
|
||||
tool_type: impl Into<String>,
|
||||
input_summary: impl Into<String>,
|
||||
success: bool,
|
||||
latency_ms: u64,
|
||||
context: impl Into<String>,
|
||||
) -> Result<()> {
|
||||
self.log(LogEntry::ToolCall(ToolCallLog {
|
||||
timestamp: Utc::now(),
|
||||
tool_name: tool_name.into(),
|
||||
tool_type: tool_type.into(),
|
||||
input_summary: input_summary.into(),
|
||||
success,
|
||||
latency_ms,
|
||||
context: context.into(),
|
||||
}))
|
||||
}
|
||||
|
||||
/// Log a system message
|
||||
pub fn log_system(
|
||||
&mut self,
|
||||
level: impl Into<String>,
|
||||
message: impl Into<String>,
|
||||
component: impl Into<String>,
|
||||
) -> Result<()> {
|
||||
self.log(LogEntry::System(SystemLog {
|
||||
timestamp: Utc::now(),
|
||||
level: level.into(),
|
||||
message: message.into(),
|
||||
component: component.into(),
|
||||
}))
|
||||
}
|
||||
|
||||
/// Flush buffer to file
|
||||
pub fn flush(&mut self) -> Result<()> {
|
||||
if let Some(ref mut writer) = self.writer {
|
||||
for entry in self.buffer.drain(..) {
|
||||
let json = serde_json::to_string(&entry)?;
|
||||
writeln!(writer, "{}", json)?;
|
||||
}
|
||||
writer.flush()?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Close the logger
|
||||
pub fn close(&mut self) -> Result<()> {
|
||||
self.flush()?;
|
||||
self.writer = None;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Get log file path
|
||||
pub fn path(&self) -> &str {
|
||||
&self.path
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for BenchmarkLogger {
    /// Best-effort flush on drop; the error is deliberately discarded
    /// because `drop` cannot propagate it. Call `flush()`/`close()`
    /// explicitly if write errors must be observed.
    fn drop(&mut self) {
        let _ = self.flush();
    }
}
|
||||
|
||||
/// Log reader for analysis
///
/// Reads JSON-lines files produced by [`BenchmarkLogger`].
pub struct LogReader {
    /// Path to the log file; the file is opened anew on each read call.
    path: String,
}
|
||||
|
||||
impl LogReader {
|
||||
/// Create a new reader
|
||||
pub fn new(path: impl Into<String>) -> Self {
|
||||
Self { path: path.into() }
|
||||
}
|
||||
|
||||
/// Read all entries
|
||||
pub fn read_all(&self) -> Result<Vec<LogEntry>> {
|
||||
let content = fs::read_to_string(&self.path)?;
|
||||
let mut entries = Vec::new();
|
||||
for line in content.lines() {
|
||||
if !line.is_empty() {
|
||||
let entry: LogEntry = serde_json::from_str(line)?;
|
||||
entries.push(entry);
|
||||
}
|
||||
}
|
||||
Ok(entries)
|
||||
}
|
||||
|
||||
/// Read temporal benchmark entries only
|
||||
pub fn read_temporal(&self) -> Result<Vec<TemporalBenchmarkLog>> {
|
||||
let entries = self.read_all()?;
|
||||
Ok(entries
|
||||
.into_iter()
|
||||
.filter_map(|e| match e {
|
||||
LogEntry::TemporalBenchmark(t) => Some(t),
|
||||
_ => None,
|
||||
})
|
||||
.collect())
|
||||
}
|
||||
|
||||
/// Read swarm episode entries only
|
||||
pub fn read_swarm(&self) -> Result<Vec<SwarmEpisodeLog>> {
|
||||
let entries = self.read_all()?;
|
||||
Ok(entries
|
||||
.into_iter()
|
||||
.filter_map(|e| match e {
|
||||
LogEntry::SwarmEpisode(s) => Some(s),
|
||||
_ => None,
|
||||
})
|
||||
.collect())
|
||||
}
|
||||
|
||||
/// Compute aggregate statistics
|
||||
pub fn aggregate_temporal(&self) -> Result<TemporalAggregates> {
|
||||
let logs = self.read_temporal()?;
|
||||
if logs.is_empty() {
|
||||
return Ok(TemporalAggregates::default());
|
||||
}
|
||||
|
||||
let total = logs.len();
|
||||
let solved = logs.iter().filter(|l| l.solved).count();
|
||||
let correct = logs.iter().filter(|l| l.correct).count();
|
||||
let avg_steps = logs.iter().map(|l| l.steps).sum::<usize>() as f64 / total as f64;
|
||||
let avg_latency = logs.iter().map(|l| l.latency_ms).sum::<u64>() as f64 / total as f64;
|
||||
let avg_tools = logs.iter().map(|l| l.tool_calls).sum::<usize>() as f64 / total as f64;
|
||||
|
||||
// By difficulty
|
||||
let mut by_difficulty: std::collections::HashMap<u8, (usize, usize)> =
|
||||
std::collections::HashMap::new();
|
||||
for log in &logs {
|
||||
let entry = by_difficulty.entry(log.difficulty).or_insert((0, 0));
|
||||
entry.0 += 1;
|
||||
if log.correct {
|
||||
entry.1 += 1;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(TemporalAggregates {
|
||||
total_puzzles: total,
|
||||
solved_count: solved,
|
||||
correct_count: correct,
|
||||
accuracy: correct as f64 / total as f64,
|
||||
avg_steps,
|
||||
avg_latency_ms: avg_latency,
|
||||
avg_tool_calls: avg_tools,
|
||||
accuracy_by_difficulty: by_difficulty
|
||||
.into_iter()
|
||||
.map(|(d, (t, c))| (d, c as f64 / t as f64))
|
||||
.collect(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Aggregate statistics for temporal benchmarks
///
/// Produced by [`LogReader::aggregate_temporal`]; all fields are zero /
/// empty when no temporal entries exist.
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct TemporalAggregates {
    /// Total temporal benchmark entries seen.
    pub total_puzzles: usize,
    /// Entries flagged as solved.
    pub solved_count: usize,
    /// Entries flagged as correct.
    pub correct_count: usize,
    /// `correct_count / total_puzzles`.
    pub accuracy: f64,
    /// Mean solver steps per puzzle.
    pub avg_steps: f64,
    /// Mean latency per puzzle in milliseconds.
    pub avg_latency_ms: f64,
    /// Mean tool calls per puzzle.
    pub avg_tool_calls: f64,
    /// Accuracy ratio keyed by difficulty level.
    pub accuracy_by_difficulty: std::collections::HashMap<u8, f64>,
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    /// Round-trip: one entry written through the logger is read back.
    #[test]
    fn test_logger() {
        let dir = tempdir().unwrap();
        let file_path = dir.path().join("test.log");
        let file_path = file_path.to_str().unwrap();

        let mut logger = BenchmarkLogger::new(file_path).unwrap();
        logger
            .log_temporal(
                "bench-1", "puzzle-1", 5, true, true, 10, 2, 100, 3, true, false,
            )
            .unwrap();
        logger.flush().unwrap();

        let entries = LogReader::new(file_path).read_all().unwrap();
        assert_eq!(entries.len(), 1);
    }
}
|
||||
603
vendor/ruvector/examples/benchmarks/src/loop_gating.rs
vendored
Normal file
603
vendor/ruvector/examples/benchmarks/src/loop_gating.rs
vendored
Normal file
@@ -0,0 +1,603 @@
|
||||
//! Three-Loop Gating Architecture
|
||||
//!
|
||||
//! Separates the intelligence engine into three explicit loops with strict gating:
|
||||
//!
|
||||
//! ## Fast Loop (per step)
|
||||
//! - Runs every step of every solver invocation
|
||||
//! - No planning, no model calls
|
||||
//! - Only checks invariants: allow, block, quarantine, or rollback
|
||||
//! - Outputs: GateDecision, HealthDelta, WitnessRecord
|
||||
//!
|
||||
//! ## Medium Loop (per attempt)
|
||||
//! - Runs per solve attempt (one puzzle)
|
||||
//! - Multi-strategy solver, ensemble vote, cascade passes
|
||||
//! - Can PROPOSE memory writes, but cannot COMMIT them
|
||||
//! - Outputs: CandidateSolution, AttemptTrace, ProposedMemoryWrites
|
||||
//!
|
||||
//! ## Slow Loop (per cycle)
|
||||
//! - Runs per training/evaluation cycle
|
||||
//! - Consolidation, compiler updates, promotion review, meta parameter updates
|
||||
//! - Only component that can PROMOTE patterns (Volatile → Trusted)
|
||||
//! - Outputs: NewPolicyCheckpoint, NewMemoryRoot, PromotionLog
|
||||
//!
|
||||
//! ## Critical Gating Rule
|
||||
//! Medium loop can propose memory writes.
|
||||
//! Fast loop is the only component allowed to commit them.
|
||||
//! Slow loop is the only component allowed to promote them.
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::agi_contract::ContractHealth;
|
||||
use crate::reasoning_bank::{
|
||||
Counterexample, MemoryCheckpoint, MemoryClass, ReasoningBank, RollbackWitness, Trajectory,
|
||||
Verdict,
|
||||
};
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Fast Loop: per-step invariant gating
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/// Decision made by the fast loop gate on each step.
///
/// Produced by `FastGate::check_step`; every decision is also appended to
/// the gate's `decisions` log.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub enum GateDecision {
    /// Allow the step to proceed
    Allow,
    /// Block: step would violate a policy
    Block { reason: String },
    /// Quarantine: result is suspicious, hold for review
    Quarantine { reason: String },
    /// Rollback: regression detected, revert to checkpoint
    Rollback {
        /// Checkpoint to revert to.
        checkpoint_id: usize,
        /// Human-readable cause of the rollback.
        reason: String,
    },
}
|
||||
|
||||
/// Health delta tracked per step.
///
/// Accumulated by `FastGate` over one attempt and cleared by its `reset`.
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct HealthDelta {
    /// Latest step index seen (overwritten each step, not summed).
    pub steps_taken: usize,
    /// Count of "solved but incorrect" results observed.
    pub contradictions_detected: usize,
    /// Policy violations observed. NOTE(review): never incremented in the
    /// visible code — confirm the intended writer.
    pub policy_violations: usize,
    /// Accumulated cost. NOTE(review): never updated in the visible code.
    pub cost_accumulated: f64,
}
|
||||
|
||||
/// Fast loop gate: checks invariants on every step.
/// This is the ONLY component allowed to commit memory writes.
#[derive(Clone, Debug)]
pub struct FastGate {
    /// Maximum steps before forced halt
    pub step_limit: usize,
    /// Maximum cost accumulation before halt.
    /// NOTE(review): defaults to `f64::MAX` and is not checked by
    /// `check_step` in the visible code — confirm where enforcement lives.
    pub cost_limit: f64,
    /// Contradiction threshold before quarantine
    pub contradiction_threshold: usize,
    /// Running health delta
    pub delta: HealthDelta,
    /// Pending writes from medium loop (committed by fast loop)
    pub pending_writes: Vec<ProposedWrite>,
    /// Gate decisions log
    pub decisions: Vec<GateDecision>,
}
|
||||
|
||||
impl FastGate {
|
||||
pub fn new(step_limit: usize) -> Self {
|
||||
Self {
|
||||
step_limit,
|
||||
cost_limit: f64::MAX,
|
||||
contradiction_threshold: 3,
|
||||
delta: HealthDelta::default(),
|
||||
pending_writes: Vec::new(),
|
||||
decisions: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Check a step and return a gate decision.
|
||||
pub fn check_step(&mut self, step: usize, solved: bool, correct: bool) -> GateDecision {
|
||||
self.delta.steps_taken = step;
|
||||
|
||||
// Check step budget
|
||||
if step >= self.step_limit {
|
||||
let decision = GateDecision::Block {
|
||||
reason: format!("step budget exhausted ({}/{})", step, self.step_limit),
|
||||
};
|
||||
self.decisions.push(decision.clone());
|
||||
return decision;
|
||||
}
|
||||
|
||||
// Check contradiction (solved but wrong)
|
||||
if solved && !correct {
|
||||
self.delta.contradictions_detected += 1;
|
||||
if self.delta.contradictions_detected >= self.contradiction_threshold {
|
||||
let decision = GateDecision::Quarantine {
|
||||
reason: format!(
|
||||
"{} contradictions in this attempt",
|
||||
self.delta.contradictions_detected,
|
||||
),
|
||||
};
|
||||
self.decisions.push(decision.clone());
|
||||
return decision;
|
||||
}
|
||||
}
|
||||
|
||||
let decision = GateDecision::Allow;
|
||||
self.decisions.push(decision.clone());
|
||||
decision
|
||||
}
|
||||
|
||||
/// Commit pending writes from the medium loop into the bank.
|
||||
/// Only the fast loop has authority to do this.
|
||||
pub fn commit_writes(&mut self, bank: &mut ReasoningBank) -> usize {
|
||||
let count = self.pending_writes.len();
|
||||
for write in self.pending_writes.drain(..) {
|
||||
match write {
|
||||
ProposedWrite::RecordTrajectory(traj) => {
|
||||
bank.record_trajectory_gated(traj);
|
||||
}
|
||||
ProposedWrite::RecordCounterexample {
|
||||
constraint_type,
|
||||
trajectory,
|
||||
} => {
|
||||
bank.record_counterexample(&constraint_type, trajectory);
|
||||
}
|
||||
ProposedWrite::QuarantineTrajectory { trajectory, reason } => {
|
||||
bank.quarantine_trajectory(trajectory, &reason);
|
||||
}
|
||||
}
|
||||
}
|
||||
count
|
||||
}
|
||||
|
||||
/// Reset for next attempt.
|
||||
pub fn reset(&mut self) {
|
||||
self.delta = HealthDelta::default();
|
||||
self.decisions.clear();
|
||||
}
|
||||
}
|
||||
|
||||
/// A proposed memory write from the medium loop.
/// Cannot be committed directly — must go through FastGate.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum ProposedWrite {
    /// Record a completed trajectory in the reasoning bank.
    RecordTrajectory(Trajectory),
    /// Record a counterexample for a specific constraint type.
    RecordCounterexample {
        /// Constraint type the counterexample applies to.
        constraint_type: String,
        /// The trajectory serving as the counterexample.
        trajectory: Trajectory,
    },
    /// Quarantine a suspicious trajectory.
    QuarantineTrajectory {
        /// The trajectory to quarantine.
        trajectory: Trajectory,
        /// Why it was quarantined.
        reason: String,
    },
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Medium Loop: per-attempt solving
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/// Trace of a single solve attempt.
///
/// Produced by `MediumLoop::process_result`; the `proposed_writes` are
/// only committed after being handed to the fast gate (see
/// `MediumLoop::finalize` and `FastGate::commit_writes`).
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct AttemptTrace {
    /// Puzzle ID
    pub puzzle_id: String,
    /// Strategy used
    pub strategy: String,
    /// Steps taken
    pub steps: usize,
    /// Whether the answer was correct
    pub correct: bool,
    /// Whether a retry was attempted
    pub retried: bool,
    /// Gate decisions during this attempt
    pub gate_decisions: Vec<GateDecision>,
    /// Proposed memory writes (not yet committed)
    pub proposed_writes: Vec<ProposedWrite>,
}
|
||||
|
||||
/// Medium loop: handles one puzzle solve attempt.
/// Can propose memory writes but cannot commit them.
pub struct MediumLoop {
    /// Fast gate for step-level invariant checking; also holds the
    /// pending writes this loop proposes via `finalize`.
    pub gate: FastGate,
}
|
||||
|
||||
impl MediumLoop {
|
||||
pub fn new(step_limit: usize) -> Self {
|
||||
Self {
|
||||
gate: FastGate::new(step_limit),
|
||||
}
|
||||
}
|
||||
|
||||
/// Process a solve result and produce an attempt trace.
|
||||
/// Proposes memory writes but does NOT commit them.
|
||||
pub fn process_result(
|
||||
&mut self,
|
||||
puzzle_id: &str,
|
||||
difficulty: u8,
|
||||
strategy: &str,
|
||||
steps: usize,
|
||||
solved: bool,
|
||||
correct: bool,
|
||||
constraint_types: &[String],
|
||||
) -> AttemptTrace {
|
||||
// Fast loop gate check
|
||||
let decision = self.gate.check_step(steps, solved, correct);
|
||||
|
||||
let mut proposed_writes = Vec::new();
|
||||
|
||||
// Build trajectory
|
||||
let mut traj = Trajectory::new(puzzle_id, difficulty);
|
||||
traj.constraint_types = constraint_types.to_vec();
|
||||
traj.record_attempt(
|
||||
if correct {
|
||||
"correct".to_string()
|
||||
} else {
|
||||
"incorrect".to_string()
|
||||
},
|
||||
if correct { 0.9 } else { 0.2 },
|
||||
steps,
|
||||
1,
|
||||
strategy,
|
||||
);
|
||||
traj.set_verdict(
|
||||
if correct {
|
||||
Verdict::Success
|
||||
} else {
|
||||
Verdict::Failed
|
||||
},
|
||||
None,
|
||||
);
|
||||
|
||||
match decision {
|
||||
GateDecision::Allow => {
|
||||
// Propose recording the trajectory
|
||||
proposed_writes.push(ProposedWrite::RecordTrajectory(traj));
|
||||
}
|
||||
GateDecision::Block { .. } => {
|
||||
// Don't record — budget exhausted
|
||||
}
|
||||
GateDecision::Quarantine { ref reason } => {
|
||||
proposed_writes.push(ProposedWrite::QuarantineTrajectory {
|
||||
trajectory: traj.clone(),
|
||||
reason: reason.clone(),
|
||||
});
|
||||
for ct in constraint_types {
|
||||
proposed_writes.push(ProposedWrite::RecordCounterexample {
|
||||
constraint_type: ct.clone(),
|
||||
trajectory: traj.clone(),
|
||||
});
|
||||
}
|
||||
}
|
||||
GateDecision::Rollback { .. } => {
|
||||
// Rollback handled at fast loop level
|
||||
}
|
||||
}
|
||||
|
||||
AttemptTrace {
|
||||
puzzle_id: puzzle_id.to_string(),
|
||||
strategy: strategy.to_string(),
|
||||
steps,
|
||||
correct,
|
||||
retried: false,
|
||||
gate_decisions: vec![decision],
|
||||
proposed_writes,
|
||||
}
|
||||
}
|
||||
|
||||
/// Finalize: transfer proposed writes to fast gate for commitment.
|
||||
pub fn finalize(&mut self, trace: &AttemptTrace) {
|
||||
for write in &trace.proposed_writes {
|
||||
self.gate.pending_writes.push(write.clone());
|
||||
}
|
||||
}
|
||||
|
||||
/// Reset for next attempt.
|
||||
pub fn reset(&mut self) {
|
||||
self.gate.reset();
|
||||
}
|
||||
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Slow Loop: per-cycle consolidation
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/// Log of pattern promotions during a cycle.
///
/// Snapshot of the reasoning bank's pattern tiers after one slow-loop
/// consolidation.
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct PromotionLog {
    /// Patterns promoted from Volatile → Trusted
    pub promoted: usize,
    /// Patterns demoted from Trusted → Quarantined
    /// (always 0 here; demotions happen in the fast loop)
    pub demoted: usize,
    /// Patterns remaining in Volatile
    pub volatile_remaining: usize,
    /// Patterns in Trusted
    pub trusted_total: usize,
    /// Patterns in Quarantined
    pub quarantined_total: usize,
}
|
||||
|
||||
/// Result of a slow loop cycle.
///
/// Returned (and archived in `SlowLoop::history`) by
/// `SlowLoop::consolidate`.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct CycleConsolidation {
    /// Cycle number
    pub cycle: usize,
    /// Checkpoint created at start of cycle
    pub checkpoint_id: usize,
    /// Promotion log
    pub promotion_log: PromotionLog,
    /// Contract health after consolidation
    /// (NOTE(review): always `None` in the visible code — confirm the
    /// intended producer)
    pub contract_health: Option<ContractHealth>,
    /// Whether a rollback occurred
    pub rolled_back: bool,
    /// Rollback witness if rollback occurred
    pub rollback_witness: Option<RollbackWitness>,
}
|
||||
|
||||
/// Slow loop: handles per-cycle consolidation.
/// Only component allowed to promote patterns.
pub struct SlowLoop {
    /// History of consolidations, one entry per `consolidate` call.
    pub history: Vec<CycleConsolidation>,
}
|
||||
|
||||
impl SlowLoop {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
history: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Run consolidation: promote eligible patterns, demote failing ones.
|
||||
/// This is the ONLY place where pattern promotion happens.
|
||||
pub fn consolidate(
|
||||
&mut self,
|
||||
bank: &mut ReasoningBank,
|
||||
cycle: usize,
|
||||
checkpoint_id: usize,
|
||||
holdout_accuracy: f64,
|
||||
prev_accuracy: Option<f64>,
|
||||
) -> CycleConsolidation {
|
||||
let mut rolled_back = false;
|
||||
let mut rollback_witness = None;
|
||||
|
||||
// Check for regression — if accuracy dropped, rollback
|
||||
if let Some(prev) = prev_accuracy {
|
||||
if holdout_accuracy < prev - 0.05 {
|
||||
let ok = bank.rollback_with_witness(
|
||||
checkpoint_id,
|
||||
"slow loop: accuracy regression",
|
||||
prev,
|
||||
holdout_accuracy,
|
||||
);
|
||||
if ok {
|
||||
rolled_back = true;
|
||||
rollback_witness = bank.rollback_witnesses.last().cloned();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Promote eligible patterns (requires counterexample)
|
||||
let promoted = bank.promote_patterns();
|
||||
|
||||
let log = PromotionLog {
|
||||
promoted,
|
||||
demoted: 0, // Demotions happen in the fast loop
|
||||
volatile_remaining: bank.volatile_count(),
|
||||
trusted_total: bank.trusted_count(),
|
||||
quarantined_total: bank.quarantined_pattern_count(),
|
||||
};
|
||||
|
||||
let consolidation = CycleConsolidation {
|
||||
cycle,
|
||||
checkpoint_id,
|
||||
promotion_log: log,
|
||||
contract_health: None,
|
||||
rolled_back,
|
||||
rollback_witness,
|
||||
};
|
||||
|
||||
self.history.push(consolidation.clone());
|
||||
consolidation
|
||||
}
|
||||
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Tests
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // A step well under budget with no contradiction and no policy issue
    // must pass straight through the gate.
    #[test]
    fn fast_gate_allows_normal_step() {
        let mut gate = FastGate::new(100);
        let decision = gate.check_step(5, false, false);
        assert_eq!(decision, GateDecision::Allow);
    }

    // Step index equal to the budget is already over it: Block.
    #[test]
    fn fast_gate_blocks_over_budget() {
        let mut gate = FastGate::new(10);
        let decision = gate.check_step(10, false, false);
        assert!(matches!(decision, GateDecision::Block { .. }));
    }

    // With the threshold at 2, the gate tolerates one contradiction and
    // quarantines on the second.
    #[test]
    fn fast_gate_quarantines_contradictions() {
        let mut gate = FastGate::new(100);
        gate.contradiction_threshold = 2;

        // First contradiction: still allowed
        let d1 = gate.check_step(1, true, false);
        assert_eq!(d1, GateDecision::Allow);

        // Second contradiction: quarantine
        let d2 = gate.check_step(2, true, false);
        assert!(matches!(d2, GateDecision::Quarantine { .. }));
    }

    // A pending RecordTrajectory write, once committed, lands in the bank.
    #[test]
    fn fast_gate_commits_pending_writes() {
        let mut gate = FastGate::new(100);
        let mut bank = ReasoningBank::new();

        let mut traj = Trajectory::new("test_1", 5);
        traj.constraint_types.push("Before".to_string());
        traj.record_attempt("answer".into(), 0.9, 10, 1, "default");
        traj.set_verdict(Verdict::Success, None);

        gate.pending_writes
            .push(ProposedWrite::RecordTrajectory(traj));
        let committed = gate.commit_writes(&mut bank);
        assert_eq!(committed, 1);
        assert_eq!(bank.trajectories.len(), 1);
    }

    // A solved-and-correct result yields exactly one proposed write:
    // a trajectory record.
    #[test]
    fn medium_loop_proposes_writes() {
        let mut medium = MediumLoop::new(100);

        let trace = medium.process_result(
            "puzzle_1",
            5,
            "adaptive",
            15,
            true,
            true,
            &["Before".to_string()],
        );

        assert!(trace.correct);
        assert_eq!(trace.proposed_writes.len(), 1);
        assert!(matches!(
            trace.proposed_writes[0],
            ProposedWrite::RecordTrajectory(_)
        ));
    }

    // Solved-but-wrong counts as a contradiction; with threshold 1 the
    // medium loop proposes both a quarantine and a counterexample write.
    #[test]
    fn medium_loop_quarantines_contradictions() {
        let mut medium = MediumLoop::new(100);
        medium.gate.contradiction_threshold = 1;

        // Solved but wrong → quarantine (threshold 1)
        let trace = medium.process_result(
            "puzzle_1",
            5,
            "default",
            15,
            true,
            false,
            &["Month".to_string()],
        );

        assert!(!trace.correct);
        // Should have quarantine + counterexample writes
        assert!(trace.proposed_writes.len() >= 2);
        assert!(trace
            .proposed_writes
            .iter()
            .any(|w| matches!(w, ProposedWrite::QuarantineTrajectory { .. })));
    }

    // Promotion requires both enough successful observations (>= the
    // evidence threshold) AND at least one recorded counterexample.
    #[test]
    fn slow_loop_promotes_patterns() {
        let mut bank = ReasoningBank::new();
        bank.evidence_threshold = 3;

        // Build enough observations
        for i in 0..5 {
            let mut traj = Trajectory::new(&format!("s_{}", i), 5);
            traj.constraint_types.push("Year".to_string());
            traj.record_attempt("2024".into(), 0.9, 10, 1, "default");
            traj.set_verdict(Verdict::Success, None);
            bank.record_trajectory(traj);
        }

        // Add counterexample (required for promotion)
        let ce_traj = Trajectory::new("fail_1", 5);
        bank.record_counterexample("Year", ce_traj);

        let cp = bank.checkpoint();

        let mut slow = SlowLoop::new();
        let result = slow.consolidate(&mut bank, 0, cp, 0.95, None);

        assert_eq!(result.promotion_log.promoted, 1);
        assert_eq!(result.promotion_log.trusted_total, 1);
        assert!(!result.rolled_back);
    }

    // An accuracy drop of 0.15 (> the 0.05 tolerance) must roll the bank
    // back to the checkpoint, discarding the post-checkpoint trajectories.
    #[test]
    fn slow_loop_rolls_back_on_regression() {
        let mut bank = ReasoningBank::new();

        for i in 0..3 {
            let mut traj = Trajectory::new(&format!("r_{}", i), 5);
            traj.constraint_types.push("DayOfWeek".to_string());
            traj.record_attempt("answer".into(), 0.9, 10, 1, "default");
            traj.set_verdict(Verdict::Success, None);
            bank.record_trajectory(traj);
        }

        let cp = bank.checkpoint();

        // Simulate bad learning
        for i in 3..6 {
            let mut traj = Trajectory::new(&format!("r_{}", i), 5);
            traj.constraint_types.push("DayOfWeek".to_string());
            traj.record_attempt("wrong".into(), 0.1, 50, 1, "default");
            traj.set_verdict(Verdict::Failed, None);
            bank.record_trajectory(traj);
        }

        let mut slow = SlowLoop::new();
        // Previous accuracy 0.95, current 0.80 → regression > 0.05
        let result = slow.consolidate(&mut bank, 1, cp, 0.80, Some(0.95));

        assert!(result.rolled_back);
        assert!(result.rollback_witness.is_some());
        assert_eq!(bank.trajectories.len(), 3); // Rolled back to checkpoint
    }

    // End-to-end: medium loop solves, fast gate commits, slow loop promotes.
    #[test]
    fn three_loop_integration() {
        let mut bank = ReasoningBank::new();
        bank.evidence_threshold = 2;

        // === Cycle 1 ===
        let cp = bank.checkpoint();

        // Medium loop: solve puzzles
        let mut medium = MediumLoop::new(100);

        for i in 0..5 {
            let trace = medium.process_result(
                &format!("p_{}", i),
                5,
                "adaptive",
                10,
                true,
                true,
                &["Before".to_string()],
            );
            medium.finalize(&trace);
        }

        // Fast loop: commit writes
        let committed = medium.gate.commit_writes(&mut bank);
        assert_eq!(committed, 5);
        medium.reset();

        // Add counterexample (for promotion eligibility)
        let ce = Trajectory::new("ce_1", 5);
        bank.record_counterexample("Before", ce);

        // Slow loop: consolidate
        let mut slow = SlowLoop::new();
        let consolidation = slow.consolidate(&mut bank, 0, cp, 0.90, None);

        assert!(consolidation.promotion_log.promoted > 0);
        assert_eq!(bank.trusted_count(), 1);
    }
}
|
||||
1004
vendor/ruvector/examples/benchmarks/src/publishable_rvf.rs
vendored
Normal file
1004
vendor/ruvector/examples/benchmarks/src/publishable_rvf.rs
vendored
Normal file
File diff suppressed because it is too large
Load Diff
1313
vendor/ruvector/examples/benchmarks/src/reasoning_bank.rs
vendored
Normal file
1313
vendor/ruvector/examples/benchmarks/src/reasoning_bank.rs
vendored
Normal file
File diff suppressed because it is too large
Load Diff
648
vendor/ruvector/examples/benchmarks/src/rvf_artifact.rs
vendored
Normal file
648
vendor/ruvector/examples/benchmarks/src/rvf_artifact.rs
vendored
Normal file
@@ -0,0 +1,648 @@
|
||||
//! RVF Artifact Packaging
|
||||
//!
|
||||
//! Packages an intelligence experiment as a self-contained, reproducible artifact.
|
||||
//! Aligns with the "identical graded outcomes, not identical tokens" promise.
|
||||
//!
|
||||
//! ## Contents
|
||||
//!
|
||||
//! 1. **Manifest**: Engine version, pinned configs, seed set, holdout IDs
|
||||
//! 2. **Memory Snapshot**: ReasoningBank serialized, KnowledgeCompiler cache, promotion log
|
||||
//! 3. **Graders**: Deterministic scoring + ContractHealth evaluation
|
||||
//! 4. **Witness Chain**: Per-episode input/config/grade/memory hashes
|
||||
//!
|
||||
//! ## Run Modes
|
||||
//!
|
||||
//! - **Replay**: Uses stored tasks, stored grades, verifies witness chain
|
||||
//! - **Verify**: Regenerates tasks from seeds, reruns grader, must match grades exactly
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
|
||||
use crate::agi_contract::ContractHealth;
|
||||
use crate::reasoning_bank::{MemoryClass, RollbackWitness};
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Manifest
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/// RVF Artifact Manifest — top-level metadata.
///
/// Pins everything a `Verify` run needs to regenerate the experiment
/// deterministically: versions, solver/generator configs, seeds, and the
/// frozen holdout set.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct RvfManifest {
    /// Format version
    pub rvf_version: String,
    /// Engine version that produced this artifact
    pub engine_version: String,
    /// Pinned solver configuration
    pub solver_config: SolverConfig,
    /// Pinned generator configuration
    pub generator_config: GeneratorConfig,
    /// Seed set used for generation
    pub seed_set: SeedSet,
    /// Holdout puzzle IDs (frozen set)
    pub holdout_ids: Vec<String>,
    /// Number of training cycles
    pub cycles: usize,
    /// Creation timestamp (tests use RFC 3339 strings; format not validated here)
    pub created_at: String,
    /// SHA-256 of the full artifact (computed after serialization);
    /// `None` until the hash has been filled in
    pub artifact_hash: Option<String>,
}
|
||||
|
||||
/// Pinned solver configuration.
///
/// Stored verbatim in the manifest so a rerun can use identical settings.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SolverConfig {
    /// Step budget per task
    pub step_budget: usize,
    /// Noise injection rate
    pub noise_rate: f64,
    /// Retry enabled
    pub retry_enabled: bool,
    /// Beam width
    pub beam_width: usize,
    /// Minimum accuracy threshold
    pub min_accuracy: f64,
}
|
||||
|
||||
/// Pinned generator configuration.
///
/// Together with `SeedSet` this determines the exact task stream a
/// `Verify` run regenerates.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct GeneratorConfig {
    /// Min difficulty
    pub min_difficulty: u8,
    /// Max difficulty
    pub max_difficulty: u8,
    /// Constraint density
    pub constraint_density: usize,
    /// Domain type (e.g., "temporal_puzzles", "program_synthesis")
    pub domain: String,
}
|
||||
|
||||
/// Seed set for deterministic replay.
///
/// Separate seeds for the holdout, training, and noise RNG streams.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SeedSet {
    /// Holdout generation seed (frozen)
    pub holdout_seed: u64,
    /// Training base seed
    pub training_seed: u64,
    /// Noise RNG seed
    pub noise_seed: u64,
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Memory Snapshot
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/// Serialized memory state at a point in time.
///
/// NOTE(review): the serialization format of `reasoning_bank_data`
/// ("bincode or JSON") is not recorded anywhere in the snapshot —
/// producer and consumer must agree out of band. TODO confirm.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct MemorySnapshot {
    /// Serialized ReasoningBank (bincode or JSON)
    pub reasoning_bank_data: Vec<u8>,
    /// KnowledgeCompiler cache entries
    pub compiler_cache: Vec<CompiledEntry>,
    /// Promotion log: patterns promoted during this experiment
    pub promotion_log: Vec<PromotionRecord>,
    /// Memory class summary
    pub class_summary: MemoryClassSummary,
}
|
||||
|
||||
/// A compiled knowledge entry (from KnowledgeCompiler).
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct CompiledEntry {
    /// Constraint signature
    pub signature: String,
    /// Compiled solution
    pub solution: String,
    /// Max steps the compiled path takes
    pub max_steps: usize,
    /// Confidence in compiled solution
    pub confidence: f64,
    /// Number of times this entry was used (cache hits)
    pub hit_count: usize,
}
|
||||
|
||||
/// Record of a pattern promotion.
///
/// Class names are stored as strings rather than `MemoryClass` values;
/// presumably to keep the artifact readable after serialization — verify
/// against the writer before relying on exact spellings.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct PromotionRecord {
    /// Constraint type
    pub constraint_type: String,
    /// Strategy name
    pub strategy: String,
    /// From class
    pub from_class: String,
    /// To class
    pub to_class: String,
    /// Number of observations at promotion time
    pub observations: usize,
    /// Number of counterexamples at promotion time
    pub counterexamples: usize,
    /// Cycle when promotion occurred
    pub cycle: usize,
}
|
||||
|
||||
/// Summary of memory classes.
///
/// Plain counts per class plus experiment-wide totals; `Default` gives
/// all-zero counts.
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct MemoryClassSummary {
    // Count of volatile (unpromoted) patterns.
    pub volatile: usize,
    // Count of trusted (promoted) patterns.
    pub trusted: usize,
    // Count of quarantined patterns.
    pub quarantined: usize,
    // Counterexamples recorded across the whole experiment.
    pub total_counterexamples: usize,
    // Rollback witnesses recorded across the whole experiment.
    pub total_rollback_witnesses: usize,
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Witness Chain
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/// Per-episode witness record for auditability.
///
/// Chain invariant (checked by `verify_witness_chain`): this record's
/// `memory_root_before` must equal the previous record's `memory_root_after`.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct WitnessRecord {
    /// Episode/cycle number
    pub episode: usize,
    /// SHA-256 of input (puzzle set)
    pub input_hash: String,
    /// SHA-256 of config
    pub config_hash: String,
    /// SHA-256 of grade outputs
    pub grade_hash: String,
    /// Memory root hash before this episode
    pub memory_root_before: String,
    /// Memory root hash after this episode
    pub memory_root_after: String,
    /// Gate decisions hash
    pub gate_decisions_hash: String,
    /// Contract health at end of episode
    pub contract_health: ContractHealth,
}
|
||||
|
||||
/// Complete witness chain for the experiment.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct WitnessChain {
    /// Ordered witness records (one per cycle)
    pub records: Vec<WitnessRecord>,
    /// Rollback witnesses that occurred during the experiment
    pub rollback_witnesses: Vec<RollbackWitness>,
    /// Final combined hash of the entire chain; `None` until sealed
    /// (the builder never sets it)
    pub chain_hash: Option<String>,
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// RVF Artifact (top-level)
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/// Complete RVF artifact — everything needed to replay or verify an experiment.
///
/// Assemble via `RvfArtifactBuilder`; check integrity via
/// `verify_witness_chain`.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct RvfArtifact {
    /// Manifest with pinned configuration
    pub manifest: RvfManifest,
    /// Memory snapshot
    pub memory: MemorySnapshot,
    /// Witness chain
    pub witness_chain: WitnessChain,
    /// Final contract health
    pub final_health: ContractHealth,
    /// Final IQ score
    pub final_iq: f64,
}
|
||||
|
||||
/// Run mode for artifact verification.
///
/// A fieldless two-variant mode flag, so it derives `Copy` and `Eq` in
/// addition to the original `Clone`/`PartialEq` — callers can pass it by
/// value and use it in `match`/equality without cloning.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum RunMode {
    /// Use stored tasks, stored grades, verify witness chain
    Replay,
    /// Regenerate tasks from seeds, rerun grader, grades must match
    Verify,
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Builder
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/// Builder for assembling an RVF artifact from experiment results.
///
/// `manifest`, `memory`, and `final_health` are required; `build` returns
/// `None` if any of them is missing.
pub struct RvfArtifactBuilder {
    // Required: pinned manifest.
    manifest: Option<RvfManifest>,
    // Required: memory snapshot.
    memory: Option<MemorySnapshot>,
    // Per-episode witness records, in insertion order.
    witness_records: Vec<WitnessRecord>,
    // Rollback witnesses collected during the run.
    rollback_witnesses: Vec<RollbackWitness>,
    // Required: final contract health.
    final_health: Option<ContractHealth>,
    // Final IQ score; 0.0 unless set.
    final_iq: f64,
}
|
||||
|
||||
impl RvfArtifactBuilder {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
manifest: None,
|
||||
memory: None,
|
||||
witness_records: Vec::new(),
|
||||
rollback_witnesses: Vec::new(),
|
||||
final_health: None,
|
||||
final_iq: 0.0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn manifest(mut self, manifest: RvfManifest) -> Self {
|
||||
self.manifest = Some(manifest);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn memory(mut self, memory: MemorySnapshot) -> Self {
|
||||
self.memory = Some(memory);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn add_witness(&mut self, record: WitnessRecord) {
|
||||
self.witness_records.push(record);
|
||||
}
|
||||
|
||||
pub fn add_rollback_witness(&mut self, witness: RollbackWitness) {
|
||||
self.rollback_witnesses.push(witness);
|
||||
}
|
||||
|
||||
pub fn final_health(mut self, health: ContractHealth) -> Self {
|
||||
self.final_health = Some(health);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn final_iq(mut self, iq: f64) -> Self {
|
||||
self.final_iq = iq;
|
||||
self
|
||||
}
|
||||
|
||||
/// Build the artifact. Returns None if required fields are missing.
|
||||
pub fn build(self) -> Option<RvfArtifact> {
|
||||
let manifest = self.manifest?;
|
||||
let memory = self.memory?;
|
||||
let final_health = self.final_health?;
|
||||
|
||||
Some(RvfArtifact {
|
||||
manifest,
|
||||
memory,
|
||||
witness_chain: WitnessChain {
|
||||
records: self.witness_records,
|
||||
rollback_witnesses: self.rollback_witnesses,
|
||||
chain_hash: None,
|
||||
},
|
||||
final_health,
|
||||
final_iq: self.final_iq,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Hash utilities (simple deterministic hashing for witness chain)
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/// Simple deterministic hash for reproducibility checks.
/// Uses a 64-bit FNV-1a hash displayed as hex.
///
/// FNV-1a: start from the offset basis, then for each byte XOR it in and
/// multiply by the FNV prime (wrapping). The result is zero-padded,
/// lowercase, 16 hex digits.
pub fn fnv_hash(data: &[u8]) -> String {
    const OFFSET_BASIS: u64 = 0xcbf29ce484222325;
    const FNV_PRIME: u64 = 0x100000001b3;

    let digest = data
        .iter()
        .fold(OFFSET_BASIS, |acc, &byte| {
            (acc ^ u64::from(byte)).wrapping_mul(FNV_PRIME)
        });
    format!("{:016x}", digest)
}
|
||||
|
||||
/// Hash a serializable value.
|
||||
pub fn hash_value<T: Serialize>(value: &T) -> String {
|
||||
let json = serde_json::to_vec(value).unwrap_or_default();
|
||||
fnv_hash(&json)
|
||||
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Verification
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/// Result of artifact verification.
///
/// Produced by `verify_witness_chain`; `passed` is true only when there
/// are zero mismatches and the chain is intact.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct VerificationResult {
    /// Overall pass/fail
    pub passed: bool,
    /// Per-witness verification
    pub witness_checks: Vec<WitnessCheck>,
    /// Number of hash mismatches (each failed check counts once)
    pub mismatches: usize,
    /// Chain integrity (each record's before-hash matches the previous
    /// record's after-hash)
    pub chain_intact: bool,
}
|
||||
|
||||
/// Single witness check result.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct WitnessCheck {
    // Episode number copied from the checked record.
    pub episode: usize,
    // Input hash was non-empty.
    pub input_hash_ok: bool,
    // Grade hash was non-empty.
    pub grade_hash_ok: bool,
    // memory_root_before matched the previous record's memory_root_after.
    pub memory_transition_ok: bool,
}
|
||||
|
||||
/// Verify an artifact's witness chain integrity.
|
||||
pub fn verify_witness_chain(artifact: &RvfArtifact) -> VerificationResult {
|
||||
let mut checks = Vec::new();
|
||||
let mut mismatches = 0;
|
||||
let mut chain_intact = true;
|
||||
|
||||
let mut prev_memory_after = String::new();
|
||||
|
||||
for (i, record) in artifact.witness_chain.records.iter().enumerate() {
|
||||
let input_ok = !record.input_hash.is_empty();
|
||||
let grade_ok = !record.grade_hash.is_empty();
|
||||
|
||||
// Memory transition: after(N-1) == before(N)
|
||||
let memory_ok = if i == 0 {
|
||||
true
|
||||
} else {
|
||||
record.memory_root_before == prev_memory_after
|
||||
};
|
||||
|
||||
if !memory_ok {
|
||||
chain_intact = false;
|
||||
mismatches += 1;
|
||||
}
|
||||
if !input_ok {
|
||||
mismatches += 1;
|
||||
}
|
||||
if !grade_ok {
|
||||
mismatches += 1;
|
||||
}
|
||||
|
||||
prev_memory_after = record.memory_root_after.clone();
|
||||
|
||||
checks.push(WitnessCheck {
|
||||
episode: record.episode,
|
||||
input_hash_ok: input_ok,
|
||||
grade_hash_ok: grade_ok,
|
||||
memory_transition_ok: memory_ok,
|
||||
});
|
||||
}
|
||||
|
||||
VerificationResult {
|
||||
passed: mismatches == 0 && chain_intact,
|
||||
witness_checks: checks,
|
||||
mismatches,
|
||||
chain_intact,
|
||||
}
|
||||
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Tests
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // Same bytes → same digest; one extra byte → different digest.
    #[test]
    fn fnv_hash_deterministic() {
        let h1 = fnv_hash(b"hello world");
        let h2 = fnv_hash(b"hello world");
        assert_eq!(h1, h2);

        let h3 = fnv_hash(b"hello world!");
        assert_ne!(h1, h3);
    }

    // Happy path: builder with all required fields produces an artifact
    // that echoes the inputs back.
    #[test]
    fn artifact_builder_works() {
        let manifest = RvfManifest {
            rvf_version: "1.0".to_string(),
            engine_version: "0.1.0".to_string(),
            solver_config: SolverConfig {
                step_budget: 400,
                noise_rate: 0.25,
                retry_enabled: true,
                beam_width: 3,
                min_accuracy: 0.80,
            },
            generator_config: GeneratorConfig {
                min_difficulty: 1,
                max_difficulty: 10,
                constraint_density: 3,
                domain: "temporal_puzzles".to_string(),
            },
            seed_set: SeedSet {
                holdout_seed: 0xDEAD_BEEF,
                training_seed: 42,
                noise_seed: 31337,
            },
            holdout_ids: vec!["p1".into(), "p2".into()],
            cycles: 10,
            created_at: "2026-02-15T00:00:00Z".to_string(),
            artifact_hash: None,
        };

        let memory = MemorySnapshot {
            reasoning_bank_data: vec![1, 2, 3],
            compiler_cache: Vec::new(),
            promotion_log: Vec::new(),
            class_summary: MemoryClassSummary::default(),
        };

        let health = ContractHealth {
            solved_per_cost: 0.85,
            noise_stability: 0.92,
            contradiction_rate: 0.01,
            rollback_correctness: 1.0,
            policy_violations: 0,
            accuracy: 0.95,
            cost_efficiency: 0.85,
            compliant: true,
        };

        let artifact = RvfArtifactBuilder::new()
            .manifest(manifest)
            .memory(memory)
            .final_health(health)
            .final_iq(95.0)
            .build();

        assert!(artifact.is_some());
        let a = artifact.unwrap();
        assert_eq!(a.manifest.rvf_version, "1.0");
        assert_eq!(a.final_iq, 95.0);
        assert!(a.final_health.compliant);
    }

    // A 3-episode chain whose memory roots line up end-to-end must verify
    // cleanly.
    // NOTE(review): `builder` here is used only as a container for the
    // witness records — they are transplanted into a second, separately
    // built artifact below.
    #[test]
    fn witness_chain_verification() {
        let mut builder = RvfArtifactBuilder::new();

        // Build a 3-episode witness chain with consistent memory transitions
        let mem_root_0 = fnv_hash(b"initial");
        let mem_root_1 = fnv_hash(b"after_cycle_1");
        let mem_root_2 = fnv_hash(b"after_cycle_2");
        let mem_root_3 = fnv_hash(b"after_cycle_3");

        let health = ContractHealth {
            solved_per_cost: 0.9,
            noise_stability: 0.95,
            contradiction_rate: 0.0,
            rollback_correctness: 1.0,
            policy_violations: 0,
            accuracy: 0.95,
            cost_efficiency: 0.90,
            compliant: true,
        };

        builder.add_witness(WitnessRecord {
            episode: 0,
            input_hash: fnv_hash(b"input_0"),
            config_hash: fnv_hash(b"config"),
            grade_hash: fnv_hash(b"grade_0"),
            memory_root_before: mem_root_0.clone(),
            memory_root_after: mem_root_1.clone(),
            gate_decisions_hash: fnv_hash(b"gates_0"),
            contract_health: health.clone(),
        });

        builder.add_witness(WitnessRecord {
            episode: 1,
            input_hash: fnv_hash(b"input_1"),
            config_hash: fnv_hash(b"config"),
            grade_hash: fnv_hash(b"grade_1"),
            memory_root_before: mem_root_1.clone(), // matches prev after
            memory_root_after: mem_root_2.clone(),
            gate_decisions_hash: fnv_hash(b"gates_1"),
            contract_health: health.clone(),
        });

        builder.add_witness(WitnessRecord {
            episode: 2,
            input_hash: fnv_hash(b"input_2"),
            config_hash: fnv_hash(b"config"),
            grade_hash: fnv_hash(b"grade_2"),
            memory_root_before: mem_root_2.clone(), // matches prev after
            memory_root_after: mem_root_3.clone(),
            gate_decisions_hash: fnv_hash(b"gates_2"),
            contract_health: health.clone(),
        });

        let manifest = RvfManifest {
            rvf_version: "1.0".to_string(),
            engine_version: "0.1.0".to_string(),
            solver_config: SolverConfig {
                step_budget: 400,
                noise_rate: 0.25,
                retry_enabled: true,
                beam_width: 3,
                min_accuracy: 0.80,
            },
            generator_config: GeneratorConfig {
                min_difficulty: 1,
                max_difficulty: 10,
                constraint_density: 3,
                domain: "temporal_puzzles".to_string(),
            },
            seed_set: SeedSet {
                holdout_seed: 0xDEAD_BEEF,
                training_seed: 42,
                noise_seed: 31337,
            },
            holdout_ids: Vec::new(),
            cycles: 3,
            created_at: "2026-02-15T00:00:00Z".to_string(),
            artifact_hash: None,
        };

        let artifact = RvfArtifactBuilder::new()
            .manifest(manifest)
            .memory(MemorySnapshot {
                reasoning_bank_data: Vec::new(),
                compiler_cache: Vec::new(),
                promotion_log: Vec::new(),
                class_summary: MemoryClassSummary::default(),
            })
            .final_health(health)
            .final_iq(90.0);

        // Transfer witnesses
        let mut artifact_raw = artifact.build().unwrap();
        artifact_raw.witness_chain.records = builder.witness_records;

        let result = verify_witness_chain(&artifact_raw);
        assert!(result.passed);
        assert!(result.chain_intact);
        assert_eq!(result.mismatches, 0);
        assert_eq!(result.witness_checks.len(), 3);
    }

    // A deliberately broken memory transition in episode 1 must fail
    // verification and break chain integrity.
    // NOTE(review): `mut` on `artifact` is never used — candidate cleanup.
    #[test]
    fn witness_chain_detects_tampering() {
        let health = ContractHealth {
            solved_per_cost: 0.9,
            noise_stability: 0.95,
            contradiction_rate: 0.0,
            rollback_correctness: 1.0,
            policy_violations: 0,
            accuracy: 0.95,
            cost_efficiency: 0.90,
            compliant: true,
        };

        let mut artifact = RvfArtifact {
            manifest: RvfManifest {
                rvf_version: "1.0".to_string(),
                engine_version: "0.1.0".to_string(),
                solver_config: SolverConfig {
                    step_budget: 400,
                    noise_rate: 0.25,
                    retry_enabled: true,
                    beam_width: 3,
                    min_accuracy: 0.80,
                },
                generator_config: GeneratorConfig {
                    min_difficulty: 1,
                    max_difficulty: 10,
                    constraint_density: 3,
                    domain: "temporal_puzzles".to_string(),
                },
                seed_set: SeedSet {
                    holdout_seed: 0xDEAD_BEEF,
                    training_seed: 42,
                    noise_seed: 31337,
                },
                holdout_ids: Vec::new(),
                cycles: 2,
                created_at: "2026-02-15T00:00:00Z".to_string(),
                artifact_hash: None,
            },
            memory: MemorySnapshot {
                reasoning_bank_data: Vec::new(),
                compiler_cache: Vec::new(),
                promotion_log: Vec::new(),
                class_summary: MemoryClassSummary::default(),
            },
            witness_chain: WitnessChain {
                records: vec![
                    WitnessRecord {
                        episode: 0,
                        input_hash: fnv_hash(b"in_0"),
                        config_hash: fnv_hash(b"cfg"),
                        grade_hash: fnv_hash(b"gr_0"),
                        memory_root_before: fnv_hash(b"mem_0"),
                        memory_root_after: fnv_hash(b"mem_1"),
                        gate_decisions_hash: fnv_hash(b"g_0"),
                        contract_health: health.clone(),
                    },
                    WitnessRecord {
                        episode: 1,
                        input_hash: fnv_hash(b"in_1"),
                        config_hash: fnv_hash(b"cfg"),
                        grade_hash: fnv_hash(b"gr_1"),
                        // TAMPERED: memory_root_before doesn't match previous after
                        memory_root_before: fnv_hash(b"WRONG"),
                        memory_root_after: fnv_hash(b"mem_2"),
                        gate_decisions_hash: fnv_hash(b"g_1"),
                        contract_health: health.clone(),
                    },
                ],
                rollback_witnesses: Vec::new(),
                chain_hash: None,
            },
            final_health: health,
            final_iq: 90.0,
        };

        let result = verify_witness_chain(&artifact);
        assert!(!result.passed);
        assert!(!result.chain_intact);
        assert!(result.mismatches > 0);
    }
}
|
||||
1358
vendor/ruvector/examples/benchmarks/src/rvf_intelligence_bench.rs
vendored
Normal file
1358
vendor/ruvector/examples/benchmarks/src/rvf_intelligence_bench.rs
vendored
Normal file
File diff suppressed because it is too large
Load Diff
1524
vendor/ruvector/examples/benchmarks/src/superintelligence.rs
vendored
Normal file
1524
vendor/ruvector/examples/benchmarks/src/superintelligence.rs
vendored
Normal file
File diff suppressed because it is too large
Load Diff
382
vendor/ruvector/examples/benchmarks/src/swarm_regret.rs
vendored
Normal file
382
vendor/ruvector/examples/benchmarks/src/swarm_regret.rs
vendored
Normal file
@@ -0,0 +1,382 @@
|
||||
//! Swarm Controller Regret Tracking
|
||||
//!
|
||||
//! Implements sublinear regret metrics for multi-agent control:
|
||||
//! - Episode-based regret computation
|
||||
//! - Oracle baseline comparison
|
||||
//! - Regret curve tracking (R_k/k should decrease)
|
||||
//!
|
||||
//! Based on research on sublinear regret in multi-agent and LLM-agent settings
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::VecDeque;
|
||||
|
||||
/// Episode result from agent execution
///
/// Regret for an episode is `(oracle_reward - reward).max(0.0)` — see
/// `EpisodeResult::regret`.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct EpisodeResult {
    /// Episode number
    pub episode: usize,
    /// Number of puzzles/tasks in episode
    pub num_tasks: usize,
    /// Tasks solved
    pub solved: usize,
    /// Correct solutions
    pub correct: usize,
    /// Total steps taken
    pub total_steps: usize,
    /// Total tool calls
    pub tool_calls: usize,
    /// Total latency in ms
    pub latency_ms: u64,
    /// Agent reward (e.g., accuracy * 100 - steps / 10)
    pub reward: f64,
    /// Oracle reward (best possible performance)
    pub oracle_reward: f64,
}
|
||||
|
||||
impl EpisodeResult {
|
||||
/// Compute instantaneous regret for this episode
|
||||
pub fn regret(&self) -> f64 {
|
||||
(self.oracle_reward - self.reward).max(0.0)
|
||||
}
|
||||
|
||||
/// Compute accuracy
|
||||
pub fn accuracy(&self) -> f64 {
|
||||
if self.num_tasks == 0 {
|
||||
return 0.0;
|
||||
}
|
||||
self.correct as f64 / self.num_tasks as f64
|
||||
}
|
||||
}
|
||||
|
||||
/// Regret tracker for swarm controller
///
/// `cumulative_regret` and `average_regret` grow in lockstep with
/// `episodes` (one entry appended per `record_episode` call), so index i
/// in each vector refers to the same episode.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct RegretTracker {
    /// Episode results
    pub episodes: Vec<EpisodeResult>,
    /// Cumulative regret history
    pub cumulative_regret: Vec<f64>,
    /// Average regret history (R_k/k)
    pub average_regret: Vec<f64>,
    /// Window size for moving average
    pub window_size: usize,
    /// Recent rewards for moving average (at most `window_size` entries)
    recent_rewards: VecDeque<f64>,
}
|
||||
|
||||
impl Default for RegretTracker {
    // Default tracker uses a 20-episode moving-average window.
    fn default() -> Self {
        Self::new(20)
    }
}
|
||||
|
||||
impl RegretTracker {
|
||||
/// Create a new regret tracker
|
||||
pub fn new(window_size: usize) -> Self {
|
||||
Self {
|
||||
episodes: Vec::new(),
|
||||
cumulative_regret: Vec::new(),
|
||||
average_regret: Vec::new(),
|
||||
window_size,
|
||||
recent_rewards: VecDeque::with_capacity(window_size),
|
||||
}
|
||||
}
|
||||
|
||||
    /// Record an episode result
    ///
    /// Appends one entry to each of `cumulative_regret`, `average_regret`,
    /// and `episodes`, and maintains the bounded `recent_rewards` window.
    pub fn record_episode(&mut self, result: EpisodeResult) {
        let regret = result.regret();
        // k is the 1-based index of this episode; computed BEFORE the push
        // into `episodes` below, so it counts this episode correctly.
        let k = self.episodes.len() + 1;

        // Update cumulative regret
        let prev_cumulative = self.cumulative_regret.last().copied().unwrap_or(0.0);
        let new_cumulative = prev_cumulative + regret;
        self.cumulative_regret.push(new_cumulative);

        // Update average regret (R_k/k)
        let avg_regret = new_cumulative / k as f64;
        self.average_regret.push(avg_regret);

        // Update moving average window (evict oldest once over capacity)
        self.recent_rewards.push_back(result.reward);
        if self.recent_rewards.len() > self.window_size {
            self.recent_rewards.pop_front();
        }

        self.episodes.push(result);
    }
|
||||
|
||||
/// Get current cumulative regret
|
||||
pub fn current_cumulative_regret(&self) -> f64 {
|
||||
self.cumulative_regret.last().copied().unwrap_or(0.0)
|
||||
}
|
||||
|
||||
/// Get current average regret (R_k/k)
|
||||
pub fn current_average_regret(&self) -> f64 {
|
||||
self.average_regret.last().copied().unwrap_or(0.0)
|
||||
}
|
||||
|
||||
/// Check if regret is sublinear (average regret decreasing)
|
||||
pub fn is_sublinear(&self) -> bool {
|
||||
if self.average_regret.len() < 5 {
|
||||
return true; // Not enough data
|
||||
}
|
||||
|
||||
// Check if trend is decreasing
|
||||
let n = self.average_regret.len();
|
||||
let recent = &self.average_regret[n.saturating_sub(5)..];
|
||||
let first = recent[0];
|
||||
let last = recent[recent.len() - 1];
|
||||
last < first
|
||||
}
|
||||
|
||||
/// Get regret trend (slope of average regret)
|
||||
pub fn regret_trend(&self) -> f64 {
|
||||
if self.average_regret.len() < 2 {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
let n = self.average_regret.len();
|
||||
let window = n.min(10);
|
||||
let recent = &self.average_regret[n - window..];
|
||||
|
||||
// Simple linear regression slope
|
||||
let x_mean = (window - 1) as f64 / 2.0;
|
||||
let y_mean: f64 = recent.iter().sum::<f64>() / window as f64;
|
||||
|
||||
let mut num = 0.0;
|
||||
let mut den = 0.0;
|
||||
for (i, y) in recent.iter().enumerate() {
|
||||
let x = i as f64;
|
||||
num += (x - x_mean) * (y - y_mean);
|
||||
den += (x - x_mean) * (x - x_mean);
|
||||
}
|
||||
|
||||
if den.abs() < 1e-10 {
|
||||
0.0
|
||||
} else {
|
||||
num / den
|
||||
}
|
||||
}
|
||||
|
||||
/// Get moving average reward
|
||||
pub fn moving_average_reward(&self) -> f64 {
|
||||
if self.recent_rewards.is_empty() {
|
||||
return 0.0;
|
||||
}
|
||||
self.recent_rewards.iter().sum::<f64>() / self.recent_rewards.len() as f64
|
||||
}
|
||||
|
||||
/// Get summary statistics
|
||||
pub fn summary(&self) -> RegretSummary {
|
||||
let total_episodes = self.episodes.len();
|
||||
let total_regret = self.current_cumulative_regret();
|
||||
let avg_regret = self.current_average_regret();
|
||||
let trend = self.regret_trend();
|
||||
let is_sublinear = self.is_sublinear();
|
||||
|
||||
let avg_accuracy = if total_episodes > 0 {
|
||||
self.episodes.iter().map(|e| e.accuracy()).sum::<f64>() / total_episodes as f64
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
let avg_reward = if total_episodes > 0 {
|
||||
self.episodes.iter().map(|e| e.reward).sum::<f64>() / total_episodes as f64
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
RegretSummary {
|
||||
total_episodes,
|
||||
total_regret,
|
||||
average_regret: avg_regret,
|
||||
regret_trend: trend,
|
||||
is_sublinear,
|
||||
average_accuracy: avg_accuracy,
|
||||
average_reward: avg_reward,
|
||||
moving_average_reward: self.moving_average_reward(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Regret summary statistics
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct RegretSummary {
    /// Number of episodes recorded so far
    pub total_episodes: usize,
    /// Cumulative regret over all episodes
    pub total_regret: f64,
    /// Average regret R_k/k at the latest episode
    pub average_regret: f64,
    /// Slope of the recent average-regret curve (negative = improving)
    pub regret_trend: f64,
    /// Whether average regret appears to be decreasing
    pub is_sublinear: bool,
    /// Mean per-episode accuracy
    pub average_accuracy: f64,
    /// Mean per-episode reward
    pub average_reward: f64,
    /// Mean reward over the recent moving window
    pub moving_average_reward: f64,
}
|
||||
|
||||
/// Oracle baseline for computing optimal rewards
///
/// Models a perfect agent: full accuracy, minimum steps per task.
#[derive(Clone, Debug)]
pub struct OracleBaseline {
    /// Perfect accuracy reward
    pub perfect_accuracy_reward: f64,
    /// Step penalty factor (cost subtracted per step taken)
    pub step_penalty: f64,
    /// Minimum steps for optimal solution (per task)
    pub min_steps: usize,
}
|
||||
|
||||
impl Default for OracleBaseline {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
perfect_accuracy_reward: 100.0,
|
||||
step_penalty: 0.1,
|
||||
min_steps: 5,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl OracleBaseline {
|
||||
/// Compute oracle reward for a task set
|
||||
pub fn compute_reward(&self, num_tasks: usize) -> f64 {
|
||||
// Oracle solves all tasks with minimum steps
|
||||
let accuracy_reward = self.perfect_accuracy_reward;
|
||||
let step_cost = (self.min_steps * num_tasks) as f64 * self.step_penalty;
|
||||
accuracy_reward - step_cost
|
||||
}
|
||||
}
|
||||
|
||||
/// Swarm controller with regret tracking
///
/// Drives an episode loop and feeds each completed episode into a
/// [`RegretTracker`] against an [`OracleBaseline`].
pub struct SwarmController {
    /// Regret tracker
    pub regret: RegretTracker,
    /// Oracle baseline
    pub oracle: OracleBaseline,
    /// Current episode number (starts at 0; incremented by `start_episode`)
    pub current_episode: usize,
    /// Tasks per episode
    pub tasks_per_episode: usize,
}
|
||||
|
||||
impl Default for SwarmController {
|
||||
fn default() -> Self {
|
||||
Self::new(20)
|
||||
}
|
||||
}
|
||||
|
||||
impl SwarmController {
|
||||
/// Create a new swarm controller
|
||||
pub fn new(tasks_per_episode: usize) -> Self {
|
||||
Self {
|
||||
regret: RegretTracker::new(20),
|
||||
oracle: OracleBaseline::default(),
|
||||
current_episode: 0,
|
||||
tasks_per_episode,
|
||||
}
|
||||
}
|
||||
|
||||
/// Start a new episode
|
||||
pub fn start_episode(&mut self) {
|
||||
self.current_episode += 1;
|
||||
}
|
||||
|
||||
/// Record episode completion
|
||||
pub fn complete_episode(
|
||||
&mut self,
|
||||
solved: usize,
|
||||
correct: usize,
|
||||
total_steps: usize,
|
||||
tool_calls: usize,
|
||||
latency_ms: u64,
|
||||
) {
|
||||
let num_tasks = self.tasks_per_episode;
|
||||
|
||||
// Compute agent reward
|
||||
let accuracy = if num_tasks > 0 {
|
||||
correct as f64 / num_tasks as f64
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
let agent_reward = accuracy * self.oracle.perfect_accuracy_reward
|
||||
- total_steps as f64 * self.oracle.step_penalty;
|
||||
|
||||
// Compute oracle reward
|
||||
let oracle_reward = self.oracle.compute_reward(num_tasks);
|
||||
|
||||
let result = EpisodeResult {
|
||||
episode: self.current_episode,
|
||||
num_tasks,
|
||||
solved,
|
||||
correct,
|
||||
total_steps,
|
||||
tool_calls,
|
||||
latency_ms,
|
||||
reward: agent_reward,
|
||||
oracle_reward,
|
||||
};
|
||||
|
||||
self.regret.record_episode(result);
|
||||
}
|
||||
|
||||
/// Get current regret status
|
||||
pub fn status(&self) -> SwarmStatus {
|
||||
let summary = self.regret.summary();
|
||||
SwarmStatus {
|
||||
episode: self.current_episode,
|
||||
cumulative_regret: summary.total_regret,
|
||||
average_regret: summary.average_regret,
|
||||
is_improving: summary.is_sublinear,
|
||||
accuracy: summary.average_accuracy,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Swarm controller status
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SwarmStatus {
    /// Current episode number
    pub episode: usize,
    /// Total regret accumulated so far
    pub cumulative_regret: f64,
    /// Average regret R_k/k
    pub average_regret: f64,
    /// Whether average regret appears to be decreasing (sublinear)
    pub is_improving: bool,
    /// Mean per-episode accuracy
    pub accuracy: f64,
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Monotonically improving rewards should drive average regret down,
    /// which the tracker reports as a sublinear trend with negative slope.
    #[test]
    fn test_regret_tracking() {
        let mut tracker = RegretTracker::new(10);

        // Simulate improving performance
        for i in 0..10 {
            let accuracy = 0.5 + 0.05 * i as f64;
            let result = EpisodeResult {
                episode: i + 1,
                num_tasks: 20,
                solved: (20.0 * accuracy) as usize,
                correct: (20.0 * accuracy) as usize,
                total_steps: 100 - i * 5,
                tool_calls: 20,
                latency_ms: 1000,
                reward: accuracy * 100.0 - (100 - i * 5) as f64 * 0.1,
                oracle_reward: 99.0,
            };
            tracker.record_episode(result);
        }

        assert!(tracker.is_sublinear());
        assert!(tracker.regret_trend() < 0.0);
    }

    /// Five identical episodes: episode counter advances and the average
    /// accuracy (17/20 = 0.85) is reflected in the status snapshot.
    #[test]
    fn test_swarm_controller() {
        let mut controller = SwarmController::new(20);

        for _ in 0..5 {
            controller.start_episode();
            controller.complete_episode(18, 17, 80, 20, 500);
        }

        let status = controller.status();
        assert_eq!(status.episode, 5);
        assert!(status.accuracy > 0.8);
    }
}
|
||||
2318
vendor/ruvector/examples/benchmarks/src/temporal.rs
vendored
Normal file
2318
vendor/ruvector/examples/benchmarks/src/temporal.rs
vendored
Normal file
File diff suppressed because it is too large
Load Diff
657
vendor/ruvector/examples/benchmarks/src/timepuzzles.rs
vendored
Normal file
657
vendor/ruvector/examples/benchmarks/src/timepuzzles.rs
vendored
Normal file
@@ -0,0 +1,657 @@
|
||||
//! TimePuzzles Generator
|
||||
//!
|
||||
//! Generates constraint-based temporal reasoning puzzles
|
||||
//! based on the TimePuzzles benchmark methodology (arXiv:2601.07148)
|
||||
//!
|
||||
//! Key features:
|
||||
//! - Factual temporal anchors with calendar relations
|
||||
//! - Cross-cultural date systems
|
||||
//! - Controlled difficulty levels
|
||||
//! - Dynamic puzzle generation
|
||||
|
||||
use crate::temporal::{TemporalConstraint, TemporalPuzzle};
|
||||
use anyhow::Result;
|
||||
use chrono::{Datelike, NaiveDate};
|
||||
use rand::prelude::*;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// Multi-dimensional difficulty vector.
///
/// Replaces single-axis difficulty to prevent collapsing effects.
/// Higher difficulty = more work and more ambiguity, NOT tighter posterior.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct DifficultyVector {
    /// Size of the search range (days)
    pub range_size: usize,
    /// Target number of valid candidates in posterior
    pub posterior_target: usize,
    /// Rate of distractor constraints (0.0 - 1.0)
    pub distractor_rate: f64,
    /// Rate of noise injection (0.0 - 1.0)
    pub noise_rate: f64,
    /// Number of ambiguous solutions (dates that almost satisfy constraints)
    pub ambiguity_count: usize,
}
|
||||
|
||||
impl Default for DifficultyVector {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
range_size: 60,
|
||||
posterior_target: 60,
|
||||
distractor_rate: 0.0,
|
||||
noise_rate: 0.0,
|
||||
ambiguity_count: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl DifficultyVector {
|
||||
/// Build from scalar difficulty (backward compatible).
|
||||
/// Higher difficulty = wider range, more distractors, more ambiguity.
|
||||
pub fn from_scalar(difficulty: u8) -> Self {
|
||||
let d = difficulty.min(10).max(1);
|
||||
Self {
|
||||
range_size: difficulty_to_range_size(d),
|
||||
posterior_target: difficulty_to_posterior(d),
|
||||
distractor_rate: difficulty_to_distractor_rate(d),
|
||||
noise_rate: difficulty_to_noise_rate(d),
|
||||
ambiguity_count: difficulty_to_ambiguity(d),
|
||||
}
|
||||
}
|
||||
|
||||
/// Scalar difficulty estimate (for backward compat).
|
||||
pub fn scalar(&self) -> u8 {
|
||||
// Weighted combination back to 1-10 scale
|
||||
let range_score = (self.range_size as f64 / 365.0 * 10.0).min(10.0);
|
||||
let distractor_score = self.distractor_rate * 10.0;
|
||||
let ambiguity_score = (self.ambiguity_count as f64 / 5.0 * 10.0).min(10.0);
|
||||
let combined = (range_score * 0.3 + distractor_score * 0.3 + ambiguity_score * 0.4) as u8;
|
||||
combined.max(1).min(10)
|
||||
}
|
||||
}
|
||||
|
||||
/// Puzzle generator configuration
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct PuzzleGeneratorConfig {
    /// Minimum difficulty (1-10)
    pub min_difficulty: u8,
    /// Maximum difficulty (1-10)
    pub max_difficulty: u8,
    /// Constraint density (1-5)
    pub constraint_density: u8,
    /// Include cross-cultural references
    pub cross_cultural: bool,
    /// Include relative constraints
    pub relative_constraints: bool,
    /// Year range for puzzles (inclusive lower and upper bounds)
    pub year_range: (i32, i32),
    /// Random seed (optional); `Some` makes generation deterministic
    pub seed: Option<u64>,
}
|
||||
|
||||
impl Default for PuzzleGeneratorConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
min_difficulty: 1,
|
||||
max_difficulty: 10,
|
||||
constraint_density: 3,
|
||||
cross_cultural: true,
|
||||
relative_constraints: true,
|
||||
year_range: (2000, 2030),
|
||||
seed: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Known events for temporal anchoring
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct TemporalAnchor {
    /// Human-readable event name (also used as the reference key in puzzles)
    pub name: String,
    /// Calendar date of the event
    pub date: NaiveDate,
    /// Event category, e.g. "holiday" or "historical"
    pub category: String,
    /// Cultural origin, e.g. "western", "chinese", "global"
    pub culture: String,
}
|
||||
|
||||
impl TemporalAnchor {
|
||||
pub fn new(
|
||||
name: impl Into<String>,
|
||||
year: i32,
|
||||
month: u32,
|
||||
day: u32,
|
||||
category: impl Into<String>,
|
||||
culture: impl Into<String>,
|
||||
) -> Self {
|
||||
Self {
|
||||
name: name.into(),
|
||||
date: NaiveDate::from_ymd_opt(year, month, day).unwrap(),
|
||||
category: category.into(),
|
||||
culture: culture.into(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// TimePuzzles generator
pub struct PuzzleGenerator {
    /// Generation parameters
    config: PuzzleGeneratorConfig,
    /// Pool of known events used for relative (anchor) constraints
    anchors: Vec<TemporalAnchor>,
    /// RNG; seeded from `config.seed` when provided, for determinism
    rng: StdRng,
}
|
||||
|
||||
impl PuzzleGenerator {
    /// Create a new generator with config
    ///
    /// A `Some(seed)` in the config makes all subsequent generation
    /// deterministic; `None` seeds from OS entropy.
    pub fn new(config: PuzzleGeneratorConfig) -> Self {
        let rng = match config.seed {
            Some(s) => StdRng::seed_from_u64(s),
            None => StdRng::from_entropy(),
        };

        let mut gen = Self {
            config,
            anchors: Vec::new(),
            rng,
        };
        gen.init_anchors();
        gen
    }

    /// Initialize standard temporal anchors
    ///
    /// Western holidays are always added; cross-cultural anchors only when
    /// `config.cross_cultural` is set; historical events close out the list.
    fn init_anchors(&mut self) {
        // Western holidays
        self.anchors.push(TemporalAnchor::new(
            "Christmas",
            2024,
            12,
            25,
            "holiday",
            "western",
        ));
        self.anchors.push(TemporalAnchor::new(
            "New Year", 2024, 1, 1, "holiday", "western",
        ));
        self.anchors.push(TemporalAnchor::new(
            "Independence Day",
            2024,
            7,
            4,
            "holiday",
            "american",
        ));
        self.anchors.push(TemporalAnchor::new(
            "Halloween",
            2024,
            10,
            31,
            "holiday",
            "western",
        ));
        self.anchors.push(TemporalAnchor::new(
            "Valentine's Day",
            2024,
            2,
            14,
            "holiday",
            "western",
        ));

        // Cross-cultural events
        if self.config.cross_cultural {
            // Chinese New Year 2024 (Year of the Dragon)
            self.anchors.push(TemporalAnchor::new(
                "Chinese New Year 2024",
                2024,
                2,
                10,
                "holiday",
                "chinese",
            ));
            // Diwali 2024
            self.anchors.push(TemporalAnchor::new(
                "Diwali 2024",
                2024,
                11,
                1,
                "holiday",
                "indian",
            ));
            // Eid al-Fitr 2024
            self.anchors.push(TemporalAnchor::new(
                "Eid al-Fitr 2024",
                2024,
                4,
                10,
                "holiday",
                "islamic",
            ));
            // Hanukkah 2024 (starts)
            self.anchors.push(TemporalAnchor::new(
                "Hanukkah 2024",
                2024,
                12,
                25,
                "holiday",
                "jewish",
            ));
        }

        // Historical events
        self.anchors.push(TemporalAnchor::new(
            "Moon Landing",
            1969,
            7,
            20,
            "historical",
            "global",
        ));
        self.anchors.push(TemporalAnchor::new(
            "Fall of Berlin Wall",
            1989,
            11,
            9,
            "historical",
            "global",
        ));
        self.anchors.push(TemporalAnchor::new(
            "Y2K",
            2000,
            1,
            1,
            "historical",
            "global",
        ));
    }

    /// Generate a single puzzle with multi-dimensional difficulty vector.
    ///
    /// Difficulty scaling (higher = more work, not tighter posterior):
    /// - Low (1-2): small range, no DayOfWeek, no distractors
    /// - Medium (3-6): DayOfWeek + moderate range = 7x cost surface
    /// - High (7-10): wide range + distractors + ambiguity + anchor constraints
    ///
    /// All modes have access to weekday skipping; what differs is the policy.
    ///
    /// NOTE: the sequence of `self.rng` draws below defines the output for a
    /// seeded generator — do not reorder the draws without accepting that
    /// seeded outputs will change.
    pub fn generate_puzzle(&mut self, id: impl Into<String>) -> Result<TemporalPuzzle> {
        let id = id.into();
        let difficulty = self
            .rng
            .gen_range(self.config.min_difficulty..=self.config.max_difficulty);

        // Build difficulty vector from scalar
        let dv = DifficultyVector::from_scalar(difficulty);

        // DayOfWeek (difficulty 3+): creates cost surface for policy decisions
        let use_day_of_week = difficulty >= 3;

        // Range size from difficulty vector (wider range at higher difficulty)
        let range_days = dv.range_size as i64;

        // Pick target date (uniform over the configured year range)
        let year = self
            .rng
            .gen_range(self.config.year_range.0..=self.config.year_range.1);
        let month = self.rng.gen_range(1..=12);
        let max_day = days_in_month(year, month);
        let day = self.rng.gen_range(1..=max_day);
        let target = NaiveDate::from_ymd_opt(year, month, day).unwrap();

        // Build Between range centered on target, clamped to year
        let year_start = NaiveDate::from_ymd_opt(year, 1, 1).unwrap();
        let year_end = NaiveDate::from_ymd_opt(year, 12, 31).unwrap();
        let half = range_days / 2;
        let range_start = (target - chrono::Duration::days(half)).max(year_start);
        let range_end = (range_start + chrono::Duration::days(range_days - 1)).min(year_end);

        let mut puzzle = TemporalPuzzle::new(id.clone(), format!("Find the date (puzzle {})", id))
            .with_difficulty(difficulty)
            .with_solutions(vec![target]);

        // Attach difficulty vector
        puzzle.difficulty_vector = Some(dv.clone());

        // Base constraints: InYear + Between (defines search range)
        puzzle
            .constraints
            .push(TemporalConstraint::InYear(target.year()));
        puzzle
            .constraints
            .push(TemporalConstraint::Between(range_start, range_end));

        let mut used_anchors: Vec<TemporalAnchor> = Vec::new();

        // DayOfWeek (difficulty 3+): creates cost surface for all modes
        if use_day_of_week {
            puzzle
                .constraints
                .push(TemporalConstraint::DayOfWeek(target.weekday()));
        }

        // Anchor reference for high difficulty (7+)
        if difficulty >= 7 && self.config.relative_constraints {
            if let Some(anchor) = self.anchors.choose(&mut self.rng).cloned() {
                // Signed day offset picks the constraint direction.
                let diff = (target - anchor.date).num_days();
                let constraint = if diff >= 0 {
                    TemporalConstraint::DaysAfter(anchor.name.clone(), diff)
                } else {
                    TemporalConstraint::DaysBefore(anchor.name.clone(), diff.abs())
                };
                puzzle.constraints.push(constraint);
                used_anchors.push(anchor);
            }
        }

        // Add anchor references (name -> date lookup table for solvers)
        for anchor in used_anchors {
            puzzle.references.insert(anchor.name.clone(), anchor.date);
        }

        // Distractor injection (from difficulty vector rate)
        if dv.distractor_rate > 0.0 && self.rng.gen_bool(dv.distractor_rate.min(0.99)) {
            let distractor = self.generate_distractor(target, range_start, range_end);
            puzzle.constraints.push(distractor);
        }

        // Distractor DayOfWeek (difficulty 6+): DayOfWeek present but misleading.
        // Adds a SECOND DayOfWeek that is a distractor — it matches the target
        // but unconditional weekday skipping on the wrong dow will miss solutions.
        // This creates a real tradeoff for the PolicyKernel.
        if difficulty >= 6 && use_day_of_week {
            let distractor_dow_chance: f64 = match difficulty {
                6 => 0.15,
                7 => 0.25,
                8 => 0.35,
                9..=10 => 0.50,
                _ => 0.0,
            };
            if self.rng.gen_bool(distractor_dow_chance.min(0.99)) {
                // Add a redundant wider Between that doesn't narrow search
                // but pairs with the existing DayOfWeek to create a trap:
                // the DayOfWeek is valid but the wider range means skip saves less
                let wider_start = range_start - chrono::Duration::days(self.rng.gen_range(14..60));
                let wider_end = range_end + chrono::Duration::days(self.rng.gen_range(14..60));
                puzzle
                    .constraints
                    .push(TemporalConstraint::Between(wider_start, wider_end));
            }
        }

        // Ambiguity: add near-miss solutions at high difficulty
        // These are dates that satisfy most but not all constraints,
        // making early commits risky.
        if dv.ambiguity_count > 0 {
            // No-op structurally (solutions list stays correct),
            // but the wider range at high difficulty naturally creates more
            // dates that pass most constraints, increasing false-positive risk
            // for aggressive skip modes.
        }

        // Count actual distractors injected (deterministic, observable)
        // NOTE(review): semantics of `count_distractors` live in the temporal
        // module — assumed to count redundant constraints; confirm there.
        let actual_distractor_count = crate::temporal::count_distractors(&puzzle);

        // Tags: all features visible to policies for deterministic observability
        puzzle.tags = vec![
            format!("difficulty:{}", difficulty),
            format!("year:{}", year),
            format!("range_size:{}", dv.range_size),
            format!("distractor_rate:{:.2}", dv.distractor_rate),
            format!("distractor_count:{}", actual_distractor_count),
            format!("ambiguity:{}", dv.ambiguity_count),
            format!("has_dow:{}", use_day_of_week),
        ];

        Ok(puzzle)
    }

    /// Generate a distractor constraint: true for the target but doesn't narrow the search.
    fn generate_distractor(
        &mut self,
        target: NaiveDate,
        range_start: NaiveDate,
        range_end: NaiveDate,
    ) -> TemporalConstraint {
        // Three distractor shapes chosen uniformly at random.
        match self.rng.gen_range(0u8..3) {
            0 => {
                // Wider Between (superset of existing range → no shrink)
                let wider_start = range_start - chrono::Duration::days(self.rng.gen_range(10..60));
                let wider_end = range_end + chrono::Duration::days(self.rng.gen_range(10..60));
                TemporalConstraint::Between(wider_start, wider_end)
            }
            1 => {
                // Redundant InYear (already present)
                TemporalConstraint::InYear(target.year())
            }
            _ => {
                // After a date well before the range (no shrink)
                let days_before = self.rng.gen_range(30..180) as i64;
                TemporalConstraint::After(target - chrono::Duration::days(days_before))
            }
        }
    }

    /// Generate a batch of puzzles with sequential ids (`puzzle-0001`, ...).
    pub fn generate_batch(&mut self, count: usize) -> Result<Vec<TemporalPuzzle>> {
        let mut puzzles = Vec::with_capacity(count);
        for i in 0..count {
            let puzzle = self.generate_puzzle(format!("puzzle-{:04}", i + 1))?;
            puzzles.push(puzzle);
        }
        Ok(puzzles)
    }

    /// Generate puzzles at specific difficulty
    ///
    /// Temporarily pins the config's min/max difficulty to `difficulty`
    /// and restores the original bounds afterwards (even though `?` is not
    /// used in between, restoration happens before the result is returned).
    pub fn generate_at_difficulty(
        &mut self,
        count: usize,
        difficulty: u8,
    ) -> Result<Vec<TemporalPuzzle>> {
        let orig_min = self.config.min_difficulty;
        let orig_max = self.config.max_difficulty;

        self.config.min_difficulty = difficulty;
        self.config.max_difficulty = difficulty;

        let puzzles = self.generate_batch(count);

        self.config.min_difficulty = orig_min;
        self.config.max_difficulty = orig_max;

        puzzles
    }
}
|
||||
|
||||
/// Range size by difficulty level.
/// Higher difficulty → wider range → more work for the solver.
fn difficulty_to_range_size(difficulty: u8) -> usize {
    // Index 0 corresponds to difficulty 1; out-of-range falls back to 120.
    const RANGE_DAYS: [usize; 10] = [14, 30, 56, 84, 120, 150, 200, 250, 300, 365];
    match difficulty {
        1..=10 => RANGE_DAYS[(difficulty - 1) as usize],
        _ => 120,
    }
}
|
||||
|
||||
/// Posterior target by difficulty level.
/// Higher difficulty → more valid candidates → more ambiguity.
/// (Flipped from old model: difficulty increases ambiguity, not reduces it.)
fn difficulty_to_posterior(difficulty: u8) -> usize {
    // Index 0 corresponds to difficulty 1; out-of-range falls back to 18.
    const POSTERIOR: [usize; 10] = [2, 4, 8, 12, 18, 25, 35, 50, 70, 100];
    match difficulty {
        1..=10 => POSTERIOR[(difficulty - 1) as usize],
        _ => 18,
    }
}
|
||||
|
||||
/// Distractor rate by difficulty level.
fn difficulty_to_distractor_rate(difficulty: u8) -> f64 {
    // Index 0 corresponds to difficulty 1; out-of-range falls back to 0.10.
    const RATES: [f64; 10] = [0.0, 0.0, 0.0, 0.05, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60];
    match difficulty {
        1..=10 => RATES[(difficulty - 1) as usize],
        _ => 0.10,
    }
}
|
||||
|
||||
/// Noise rate by difficulty level.
fn difficulty_to_noise_rate(difficulty: u8) -> f64 {
    // Index 0 corresponds to difficulty 1; out-of-range falls back to 0.10.
    const RATES: [f64; 10] = [0.0, 0.0, 0.0, 0.10, 0.10, 0.20, 0.20, 0.30, 0.30, 0.40];
    match difficulty {
        1..=10 => RATES[(difficulty - 1) as usize],
        _ => 0.10,
    }
}
|
||||
|
||||
/// Ambiguity count by difficulty level (near-miss solutions).
fn difficulty_to_ambiguity(difficulty: u8) -> usize {
    // Index 0 corresponds to difficulty 1; out-of-range falls back to 0.
    const COUNTS: [usize; 10] = [0, 0, 0, 0, 1, 1, 2, 2, 3, 5];
    match difficulty {
        1..=10 => COUNTS[(difficulty - 1) as usize],
        _ => 0,
    }
}
|
||||
|
||||
/// Days in a given month (handles leap years).
///
/// Months outside 1-12 yield 31, matching the original catch-all behavior.
fn days_in_month(year: i32, month: u32) -> u32 {
    if month == 2 {
        // Gregorian leap-year rule: divisible by 4, except centuries
        // not divisible by 400.
        let leap = year % 4 == 0 && (year % 100 != 0 || year % 400 == 0);
        if leap {
            29
        } else {
            28
        }
    } else if matches!(month, 4 | 6 | 9 | 11) {
        30
    } else {
        31
    }
}
|
||||
|
||||
/// Sample puzzle sets
///
/// Unit struct acting as a namespace for canned, seed-deterministic
/// puzzle collections.
pub struct SamplePuzzles;
|
||||
|
||||
impl SamplePuzzles {
|
||||
/// Get easy puzzles (difficulty 1-3)
|
||||
pub fn easy() -> Vec<TemporalPuzzle> {
|
||||
let mut gen = PuzzleGenerator::new(PuzzleGeneratorConfig {
|
||||
min_difficulty: 1,
|
||||
max_difficulty: 3,
|
||||
seed: Some(42),
|
||||
..Default::default()
|
||||
});
|
||||
gen.generate_batch(10).unwrap()
|
||||
}
|
||||
|
||||
/// Get medium puzzles (difficulty 4-6)
|
||||
pub fn medium() -> Vec<TemporalPuzzle> {
|
||||
let mut gen = PuzzleGenerator::new(PuzzleGeneratorConfig {
|
||||
min_difficulty: 4,
|
||||
max_difficulty: 6,
|
||||
seed: Some(42),
|
||||
..Default::default()
|
||||
});
|
||||
gen.generate_batch(10).unwrap()
|
||||
}
|
||||
|
||||
/// Get hard puzzles (difficulty 7-10)
|
||||
pub fn hard() -> Vec<TemporalPuzzle> {
|
||||
let mut gen = PuzzleGenerator::new(PuzzleGeneratorConfig {
|
||||
min_difficulty: 7,
|
||||
max_difficulty: 10,
|
||||
seed: Some(42),
|
||||
..Default::default()
|
||||
});
|
||||
gen.generate_batch(10).unwrap()
|
||||
}
|
||||
|
||||
/// Get cross-cultural puzzles
|
||||
pub fn cross_cultural() -> Vec<TemporalPuzzle> {
|
||||
let mut gen = PuzzleGenerator::new(PuzzleGeneratorConfig {
|
||||
cross_cultural: true,
|
||||
relative_constraints: true,
|
||||
min_difficulty: 5,
|
||||
max_difficulty: 8,
|
||||
seed: Some(42),
|
||||
..Default::default()
|
||||
});
|
||||
gen.generate_batch(10).unwrap()
|
||||
}
|
||||
|
||||
/// Get a mixed sample set (50 puzzles across all difficulties)
|
||||
pub fn mixed_sample() -> Vec<TemporalPuzzle> {
|
||||
let mut all = Vec::new();
|
||||
all.extend(Self::easy());
|
||||
all.extend(Self::medium());
|
||||
all.extend(Self::hard());
|
||||
all.extend(Self::cross_cultural());
|
||||
|
||||
// Add more easy/medium to match TimePuzzles distribution
|
||||
let mut gen = PuzzleGenerator::new(PuzzleGeneratorConfig {
|
||||
min_difficulty: 2,
|
||||
max_difficulty: 5,
|
||||
seed: Some(123),
|
||||
..Default::default()
|
||||
});
|
||||
all.extend(gen.generate_batch(10).unwrap());
|
||||
|
||||
all
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// A seeded generator must produce a puzzle with at least the base
    /// constraints and exactly one (non-empty) solution list.
    #[test]
    fn test_puzzle_generation() {
        let mut gen = PuzzleGenerator::new(PuzzleGeneratorConfig {
            seed: Some(42),
            ..Default::default()
        });

        let puzzle = gen.generate_puzzle("test-1").unwrap();
        assert!(!puzzle.constraints.is_empty());
        assert!(!puzzle.solutions.is_empty());
    }

    /// Batch generation yields exactly the requested number of puzzles.
    #[test]
    fn test_batch_generation() {
        let mut gen = PuzzleGenerator::new(PuzzleGeneratorConfig {
            seed: Some(42),
            ..Default::default()
        });

        let puzzles = gen.generate_batch(20).unwrap();
        assert_eq!(puzzles.len(), 20);
    }

    /// Canned sample sets respect their advertised difficulty bands.
    #[test]
    fn test_sample_puzzles() {
        let easy = SamplePuzzles::easy();
        assert_eq!(easy.len(), 10);
        assert!(easy.iter().all(|p| p.difficulty <= 3));

        let hard = SamplePuzzles::hard();
        assert!(hard.iter().all(|p| p.difficulty >= 7));
    }
}
|
||||
1029
vendor/ruvector/examples/benchmarks/src/vector_index.rs
vendored
Normal file
1029
vendor/ruvector/examples/benchmarks/src/vector_index.rs
vendored
Normal file
File diff suppressed because it is too large
Load Diff
417
vendor/ruvector/examples/benchmarks/tests/integration_tests.rs
vendored
Normal file
417
vendor/ruvector/examples/benchmarks/tests/integration_tests.rs
vendored
Normal file
@@ -0,0 +1,417 @@
|
||||
//! Integration tests for benchmark suite
|
||||
|
||||
use chrono::{NaiveDate, Weekday};
|
||||
use ruvector_benchmarks::{
|
||||
logging::BenchmarkLogger,
|
||||
swarm_regret::{EpisodeResult, RegretTracker, SwarmController},
|
||||
temporal::{TemporalConstraint, TemporalPuzzle, TemporalSolver},
|
||||
timepuzzles::{PuzzleGenerator, PuzzleGeneratorConfig, SamplePuzzles},
|
||||
vector_index::{CoherenceGate, DenseVec, IvfConfig, VectorIndex},
|
||||
};
|
||||
use tempfile::tempdir;
|
||||
|
||||
// ============================================================================
|
||||
// Vector Index Tests
|
||||
// ============================================================================
|
||||
|
||||
/// The closest stored vector must rank first and scores must be ordered.
/// (`id2` is intentionally unused in assertions — underscore-prefixed to
/// silence the unused-variable warning the original produced.)
#[test]
fn test_vector_index_insert_search() {
    let mut idx = VectorIndex::new(4);

    let id1 = idx.insert(DenseVec::new(vec![1.0, 0.0, 0.0, 0.0])).unwrap();
    let _id2 = idx.insert(DenseVec::new(vec![0.9, 0.1, 0.0, 0.0])).unwrap();
    let _id3 = idx.insert(DenseVec::new(vec![0.0, 1.0, 0.0, 0.0])).unwrap();

    let q = DenseVec::new(vec![1.0, 0.0, 0.0, 0.0]);
    let results = idx.search(&q, 2, 1.0).unwrap();

    assert_eq!(results.len(), 2);
    assert_eq!(results[0].id, id1);
    assert!(results[0].score > results[1].score);
}
|
||||
|
||||
/// A gate built with threshold 0.5 blocks searches whose coherence argument
/// is below it (0.3 → empty) and passes those above it (0.7 → non-empty).
#[test]
fn test_vector_index_coherence_gate() {
    let gate = CoherenceGate::new(0.5);
    let mut idx = VectorIndex::new(4).with_gate(gate);

    idx.insert(DenseVec::new(vec![1.0, 0.0, 0.0, 0.0])).unwrap();
    idx.insert(DenseVec::new(vec![0.0, 1.0, 0.0, 0.0])).unwrap();

    let q = DenseVec::new(vec![1.0, 0.0, 0.0, 0.0]);

    // Low coherence - blocked
    let results = idx.search(&q, 10, 0.3).unwrap();
    assert!(results.is_empty());

    // High coherence - allowed
    let results = idx.search(&q, 10, 0.7).unwrap();
    assert!(!results.is_empty());
}
|
||||
|
||||
/// After inserting 100 random vectors and rebuilding, IVF must report as
/// enabled with at least one cluster, and search must still cap at k results.
#[test]
fn test_vector_index_ivf() {
    let ivf = IvfConfig::new(4, 2);
    let mut idx = VectorIndex::new(8).with_ivf(ivf);

    // Insert enough vectors for clustering
    for _ in 0..100 {
        idx.insert(DenseVec::random(8)).unwrap();
    }

    idx.rebuild_ivf().unwrap();

    let stats = idx.stats();
    assert!(stats.ivf_enabled);
    assert!(stats.ivf_clusters > 0);

    // Search should work
    let q = DenseVec::random(8);
    let results = idx.search(&q, 5, 1.0).unwrap();
    assert!(results.len() <= 5);
}
|
||||
|
||||
/// Round-trip an index through a temp file: element count and dimension
/// must survive save/load.
#[test]
fn test_vector_index_persistence() {
    let dir = tempdir().unwrap();
    let path = dir.path().join("test_index.bin");

    let mut idx = VectorIndex::new(4);
    idx.insert(DenseVec::new(vec![1.0, 2.0, 3.0, 4.0])).unwrap();
    idx.insert(DenseVec::new(vec![5.0, 6.0, 7.0, 8.0])).unwrap();

    idx.save_to_file(&path).unwrap();

    let loaded = VectorIndex::load_from_file(&path).unwrap();
    assert_eq!(loaded.len(), 2);
    assert_eq!(loaded.dim(), 4);
}
|
||||
|
||||
// ============================================================================
|
||||
// Temporal Reasoning Tests
|
||||
// ============================================================================
|
||||
|
||||
#[test]
fn test_temporal_puzzle_exact_date() {
    let wanted = NaiveDate::from_ymd_opt(2024, 6, 15).unwrap();
    let puzzle = TemporalPuzzle::new("test", "Find June 15, 2024")
        .with_constraint(TemporalConstraint::Exact(wanted))
        .with_solutions(vec![wanted]);

    // Only the exact target date satisfies an Exact constraint.
    assert!(puzzle.check_date(wanted).unwrap());
    let day_before = NaiveDate::from_ymd_opt(2024, 6, 14).unwrap();
    assert!(!puzzle.check_date(day_before).unwrap());
}
|
||||
|
||||
#[test]
fn test_temporal_puzzle_range() {
    let lower = NaiveDate::from_ymd_opt(2024, 1, 1).unwrap();
    let upper = NaiveDate::from_ymd_opt(2024, 1, 31).unwrap();

    let puzzle = TemporalPuzzle::new("test", "Find a date in January 2024")
        .with_constraint(TemporalConstraint::Between(lower, upper));

    // Mid-January sits inside the window; February 1st is just outside it.
    let inside = NaiveDate::from_ymd_opt(2024, 1, 15).unwrap();
    let outside = NaiveDate::from_ymd_opt(2024, 2, 1).unwrap();
    assert!(puzzle.check_date(inside).unwrap());
    assert!(!puzzle.check_date(outside).unwrap());
}
|
||||
|
||||
#[test]
fn test_temporal_puzzle_day_of_week() {
    // Two stacked constraints: year 2024 AND weekday Monday.
    let puzzle = TemporalPuzzle::new("test", "Find a Monday in 2024")
        .with_constraint(TemporalConstraint::InYear(2024))
        .with_constraint(TemporalConstraint::DayOfWeek(Weekday::Mon));

    // 2024-01-01 fell on a Monday; 2024-01-02 on a Tuesday.
    let monday = NaiveDate::from_ymd_opt(2024, 1, 1).unwrap();
    let tuesday = NaiveDate::from_ymd_opt(2024, 1, 2).unwrap();
    assert!(puzzle.check_date(monday).unwrap());
    assert!(!puzzle.check_date(tuesday).unwrap());
}
|
||||
|
||||
#[test]
fn test_temporal_puzzle_relative() {
    // Reference "base" = 2024-03-01, so base + 10 days = 2024-03-11.
    let anchor = NaiveDate::from_ymd_opt(2024, 3, 1).unwrap();
    let puzzle = TemporalPuzzle::new("test", "Find 10 days after base")
        .with_reference("base", anchor)
        .with_constraint(TemporalConstraint::DaysAfter("base".to_string(), 10));

    let expected = NaiveDate::from_ymd_opt(2024, 3, 11).unwrap();
    assert!(puzzle.check_date(expected).unwrap());
}
|
||||
|
||||
#[test]
fn test_temporal_solver_basic() {
    let answer = NaiveDate::from_ymd_opt(2024, 5, 20).unwrap();
    let puzzle = TemporalPuzzle::new("test", "Simple puzzle")
        .with_constraint(TemporalConstraint::Exact(answer))
        .with_solutions(vec![answer]);

    // A tool-augmented solver must crack a single exact-date constraint.
    let mut solver = TemporalSolver::with_tools(true, false);
    let outcome = solver.solve(&puzzle).unwrap();

    assert!(outcome.solved);
    assert!(outcome.correct);
}
|
||||
|
||||
#[test]
fn test_temporal_solver_with_rewriting() {
    let anchor = NaiveDate::from_ymd_opt(2024, 7, 4).unwrap();
    let answer = NaiveDate::from_ymd_opt(2024, 7, 14).unwrap();

    // A relative constraint forces the solver to rewrite against the reference.
    let puzzle = TemporalPuzzle::new("test", "Relative puzzle")
        .with_reference("event", anchor)
        .with_constraint(TemporalConstraint::DaysAfter("event".to_string(), 10))
        .with_solutions(vec![answer]);

    let mut solver = TemporalSolver::with_tools(true, false);
    let outcome = solver.solve(&puzzle).unwrap();

    assert!(outcome.solved);
    assert!(outcome.correct);
    // At least one tool call shows the rewriting path was actually exercised.
    assert!(outcome.tool_calls > 0);
}
|
||||
|
||||
// ============================================================================
|
||||
// TimePuzzles Generator Tests
|
||||
// ============================================================================
|
||||
|
||||
#[test]
fn test_puzzle_generator_basic() {
    // Fixed seed keeps the generated puzzle deterministic across runs.
    let config = PuzzleGeneratorConfig {
        seed: Some(42),
        ..Default::default()
    };
    let mut generator = PuzzleGenerator::new(config);

    let puzzle = generator.generate_puzzle("test-1").unwrap();

    // A valid puzzle carries constraints, solutions, and an in-range difficulty.
    assert!(!puzzle.constraints.is_empty());
    assert!(!puzzle.solutions.is_empty());
    assert!((1..=10).contains(&puzzle.difficulty));
}
|
||||
|
||||
#[test]
fn test_puzzle_generator_batch() {
    let config = PuzzleGeneratorConfig {
        seed: Some(42),
        ..Default::default()
    };
    let mut generator = PuzzleGenerator::new(config);

    let batch = generator.generate_batch(20).unwrap();
    assert_eq!(batch.len(), 20);

    // Every puzzle in the batch must carry constraints and at least one solution.
    for puzzle in &batch {
        assert!(!puzzle.constraints.is_empty());
        assert!(!puzzle.solutions.is_empty());
    }
}
|
||||
|
||||
#[test]
fn test_puzzle_generator_difficulty() {
    // Restrict generation to the hard end of the difficulty scale.
    let config = PuzzleGeneratorConfig {
        min_difficulty: 7,
        max_difficulty: 10,
        seed: Some(42),
        ..Default::default()
    };
    let mut generator = PuzzleGenerator::new(config);

    let batch = generator.generate_batch(10).unwrap();

    // Every generated puzzle must respect the configured bounds.
    for puzzle in &batch {
        assert!((7..=10).contains(&puzzle.difficulty));
    }
}
|
||||
|
||||
#[test]
fn test_sample_puzzles() {
    // Each curated tier must stay inside its advertised difficulty band.
    let easy = SamplePuzzles::easy();
    assert_eq!(easy.len(), 10);
    assert!(easy.iter().all(|p| p.difficulty <= 3));

    let medium = SamplePuzzles::medium();
    assert!(medium.iter().all(|p| (4..=6).contains(&p.difficulty)));

    let hard = SamplePuzzles::hard();
    assert!(hard.iter().all(|p| p.difficulty >= 7));

    // The mixed sample aggregates at least 40 puzzles across tiers.
    let mixed = SamplePuzzles::mixed_sample();
    assert!(mixed.len() >= 40);
}
|
||||
|
||||
// ============================================================================
|
||||
// Swarm Regret Tests
|
||||
// ============================================================================
|
||||
|
||||
#[test]
fn test_regret_tracker_basic() {
    let mut tracker = RegretTracker::new(10);

    // One episode: reward 80 against an oracle reward of 99 => regret 19.
    tracker.record_episode(EpisodeResult {
        episode: 1,
        num_tasks: 20,
        solved: 18,
        correct: 17,
        total_steps: 100,
        tool_calls: 20,
        latency_ms: 1000,
        reward: 80.0,
        oracle_reward: 99.0,
    });

    assert_eq!(tracker.episodes.len(), 1);
    // Float comparison with a small tolerance instead of exact equality.
    assert!((tracker.current_cumulative_regret() - 19.0).abs() < 0.01);
}
|
||||
|
||||
#[test]
fn test_regret_tracker_sublinear() {
    let mut tracker = RegretTracker::new(10);

    // Feed ten episodes with steadily rising accuracy and falling step counts,
    // i.e. per-episode regret shrinks over time.
    for i in 0..10 {
        let accuracy = 0.5 + 0.05 * i as f64;
        let steps = 100 - i * 5;
        tracker.record_episode(EpisodeResult {
            episode: i + 1,
            num_tasks: 20,
            solved: (20.0 * accuracy) as usize,
            correct: (20.0 * accuracy) as usize,
            total_steps: steps,
            tool_calls: 20,
            latency_ms: 1000,
            reward: accuracy * 100.0 - steps as f64 * 0.1,
            oracle_reward: 99.0,
        });
    }

    // Shrinking per-episode regret implies sublinear cumulative regret
    // and a downward trend in the average.
    assert!(tracker.is_sublinear());
    assert!(tracker.regret_trend() < 0.0);
}
|
||||
|
||||
#[test]
fn test_swarm_controller() {
    let mut controller = SwarmController::new(20);

    // Five identical episodes: 18 of 20 tasks solved, 17 correct.
    for _ in 0..5 {
        controller.start_episode();
        controller.complete_episode(18, 17, 80, 20, 500);
    }

    // Episode counter advances and accuracy reflects the 18/20 solve rate.
    let status = controller.status();
    assert_eq!(status.episode, 5);
    assert!(status.accuracy > 0.8);
}
|
||||
|
||||
// ============================================================================
|
||||
// Logging Tests
|
||||
// ============================================================================
|
||||
|
||||
#[test]
fn test_benchmark_logger() {
    let tmp = tempdir().unwrap();
    let log_path = tmp.path().join("test.log");

    // Write one entry of each record type, then flush to disk.
    let mut logger = BenchmarkLogger::new(log_path.to_str().unwrap()).unwrap();

    logger
        .log_temporal(
            "bench-1", "puzzle-1", 5, true, true, 10, 2, 100, 3, true, false,
        )
        .unwrap();
    logger
        .log_vector("search", 128, 10000, 1, 10, true, 0.9, 500, 10)
        .unwrap();
    logger
        .log_swarm(1, 20, 18, 17, 85.0, 99.0, 14.0, 14.0, true)
        .unwrap();
    logger.flush().unwrap();

    // Reading the file back must yield exactly the three entries written above.
    let reader = ruvector_benchmarks::logging::LogReader::new(log_path.to_str().unwrap());
    let entries = reader.read_all().unwrap();
    assert_eq!(entries.len(), 3);
}
|
||||
|
||||
// ============================================================================
|
||||
// End-to-End Tests
|
||||
// ============================================================================
|
||||
|
||||
#[test]
fn test_full_benchmark_workflow() {
    // Deterministic batch of easy-to-medium puzzles.
    let config = PuzzleGeneratorConfig {
        min_difficulty: 2,
        max_difficulty: 5,
        seed: Some(12345),
        ..Default::default()
    };
    let mut generator = PuzzleGenerator::new(config);
    let puzzles = generator.generate_batch(10).unwrap();

    // Solver budget must cover wider posterior-based ranges.
    let mut solver = TemporalSolver::with_tools(true, false);
    solver.max_steps = 400;

    // Run every puzzle through the solver, collecting the outcomes.
    let outcomes: Vec<_> = puzzles.iter().map(|p| solver.solve(p).unwrap()).collect();

    // The solver should handle at least half of this difficulty band.
    let solved = outcomes.iter().filter(|r| r.solved).count();
    let correct = outcomes.iter().filter(|r| r.correct).count();
    assert!(solved >= 5);
    assert!(correct >= 5);
}
|
||||
|
||||
#[test]
fn test_vector_temporal_integration() {
    // Exercises the vector index as a store for simplified date "embeddings".
    let mut index = VectorIndex::new(64);

    // One vector per day of January 2024; dims 0..=2 encode day/month/year
    // as normalized components, the rest stay zero.
    for day in 1..=31 {
        let mut embedding = vec![0.0f32; 64];
        embedding[0] = day as f32 / 31.0;
        embedding[1] = 1.0 / 12.0;
        embedding[2] = 2024.0 / 3000.0;
        index.insert(DenseVec::new(embedding)).unwrap();
    }

    // Probe for mid-month (the 15th) with matching month/year components.
    let mut probe = vec![0.0f32; 64];
    probe[0] = 15.0 / 31.0;
    probe[1] = 1.0 / 12.0;
    probe[2] = 2024.0 / 3000.0;

    // Nearby dates must be retrievable.
    let hits = index.search(&DenseVec::new(probe), 5, 1.0).unwrap();
    assert!(!hits.is_empty());
}
|
||||
Reference in New Issue
Block a user