Squashed 'vendor/ruvector/' content from commit b64c2172
git-subtree-dir: vendor/ruvector git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
This commit is contained in:
627
examples/benchmarks/src/agi_contract.rs
Normal file
627
examples/benchmarks/src/agi_contract.rs
Normal file
@@ -0,0 +1,627 @@
|
||||
//! AGI Contract — Defines intelligence as a measurable, falsifiable contract.
|
||||
//!
|
||||
//! The AGI contract states: a system improves utility over time without violating
|
||||
//! policy, while maintaining structural health.
|
||||
//!
|
||||
//! ## Core Metrics (all deterministic, all auditable)
|
||||
//!
|
||||
//! - **Solved tasks per cost** — graded outcomes normalized by compute
|
||||
//! - **Stability under noise** — accuracy retention when inputs are corrupted
|
||||
//! - **Contradiction rate** — solved-but-wrong / total attempted
|
||||
//! - **Rollback correctness** — recovery rate when bad inputs are detected
|
||||
//! - **Policy violations** — budget overruns + contradictions (must be zero)
|
||||
//!
|
||||
//! ## Autonomy Ladder
|
||||
//!
|
||||
//! Each level requires sustained health metrics before advancement:
|
||||
//! 0. Read-only (observe only)
|
||||
//! 1. Write to memory (store episodes, no execution)
|
||||
//! 2. Execute tools (run solver, generate puzzles)
|
||||
//! 3. Write to external systems (publish results)
|
||||
//! 4. Deploy and operate (self-directed improvement)
|
||||
|
||||
use crate::intelligence_metrics::{IntelligenceAssessment, RawMetrics};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Contract Health Snapshot
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/// A single point-in-time health measurement against the AGI contract.
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct ContractHealth {
|
||||
/// Solved tasks per unit cost (tasks_correct / total_steps)
|
||||
pub solved_per_cost: f64,
|
||||
/// Accuracy on noise-injected tasks
|
||||
pub noise_stability: f64,
|
||||
/// Contradiction rate: solved-but-wrong / attempted
|
||||
pub contradiction_rate: f64,
|
||||
/// Rollback correctness: successful rollbacks / attempted rollbacks
|
||||
pub rollback_correctness: f64,
|
||||
/// Total policy violations (must be zero for contract compliance)
|
||||
pub policy_violations: usize,
|
||||
/// Clean accuracy (graded outcome baseline)
|
||||
pub accuracy: f64,
|
||||
/// Cost efficiency (0-1, higher = cheaper per solve)
|
||||
pub cost_efficiency: f64,
|
||||
/// Whether the contract is satisfied
|
||||
pub compliant: bool,
|
||||
}
|
||||
|
||||
impl ContractHealth {
|
||||
/// Evaluate contract health from raw metrics.
|
||||
pub fn from_raw(raw: &RawMetrics) -> Self {
|
||||
let accuracy = if raw.tasks_attempted > 0 {
|
||||
raw.tasks_correct as f64 / raw.tasks_attempted as f64
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
let solved_per_cost = if raw.total_steps > 0 {
|
||||
raw.tasks_correct as f64 / raw.total_steps as f64
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
let noise_stability = if raw.noise_tasks_attempted > 0 {
|
||||
raw.noise_tasks_correct as f64 / raw.noise_tasks_attempted as f64
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
let contradiction_rate = if raw.tasks_attempted > 0 {
|
||||
raw.contradictions as f64 / raw.tasks_attempted as f64
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
let rollback_correctness = if raw.rollback_attempts > 0 {
|
||||
raw.rollback_successes as f64 / raw.rollback_attempts as f64
|
||||
} else {
|
||||
1.0 // no rollbacks needed => perfect
|
||||
};
|
||||
|
||||
let cost_efficiency = (1.0 - {
|
||||
let sps = if raw.tasks_correct > 0 {
|
||||
raw.total_steps as f64 / raw.tasks_correct as f64
|
||||
} else {
|
||||
100.0
|
||||
};
|
||||
(sps - 5.0) / 95.0
|
||||
})
|
||||
.clamp(0.0, 1.0);
|
||||
|
||||
let compliant = raw.policy_violations == 0 && contradiction_rate < 0.01 && accuracy >= 0.90;
|
||||
|
||||
ContractHealth {
|
||||
solved_per_cost,
|
||||
noise_stability,
|
||||
contradiction_rate,
|
||||
rollback_correctness,
|
||||
policy_violations: raw.policy_violations,
|
||||
accuracy,
|
||||
cost_efficiency,
|
||||
compliant,
|
||||
}
|
||||
}
|
||||
|
||||
/// Evaluate contract health from an IntelligenceAssessment.
|
||||
pub fn from_assessment(assessment: &IntelligenceAssessment) -> Self {
|
||||
Self::from_raw(&assessment.raw_data)
|
||||
}
|
||||
|
||||
/// Print formatted contract health report.
|
||||
pub fn print(&self) {
|
||||
println!(" Contract Health:");
|
||||
println!(" Solved/Cost: {:.4}", self.solved_per_cost);
|
||||
println!(
|
||||
" Noise Stability: {:.2}%",
|
||||
self.noise_stability * 100.0
|
||||
);
|
||||
println!(
|
||||
" Contradiction Rate: {:.4}%",
|
||||
self.contradiction_rate * 100.0
|
||||
);
|
||||
println!(
|
||||
" Rollback Correct: {:.2}%",
|
||||
self.rollback_correctness * 100.0
|
||||
);
|
||||
println!(" Policy Violations: {}", self.policy_violations);
|
||||
println!(" Accuracy: {:.2}%", self.accuracy * 100.0);
|
||||
println!(
|
||||
" Cost Efficiency: {:.2}%",
|
||||
self.cost_efficiency * 100.0
|
||||
);
|
||||
println!(
|
||||
" Compliant: {}",
|
||||
if self.compliant { "YES" } else { "NO" }
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Contract Trend — compares two snapshots
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/// Tracks improvement across contract dimensions between two measurement points.
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct ContractDelta {
|
||||
/// Change in solved-per-cost (positive = improving)
|
||||
pub solved_per_cost_delta: f64,
|
||||
/// Change in noise stability (positive = more robust)
|
||||
pub noise_stability_delta: f64,
|
||||
/// Change in contradiction rate (negative = improving)
|
||||
pub contradiction_rate_delta: f64,
|
||||
/// Change in rollback correctness (positive = better recovery)
|
||||
pub rollback_delta: f64,
|
||||
/// Change in accuracy (positive = better)
|
||||
pub accuracy_delta: f64,
|
||||
/// Change in cost efficiency (positive = cheaper)
|
||||
pub cost_efficiency_delta: f64,
|
||||
/// Number of dimensions that improved
|
||||
pub dimensions_improved: usize,
|
||||
/// Number of dimensions that regressed
|
||||
pub dimensions_regressed: usize,
|
||||
}
|
||||
|
||||
impl ContractDelta {
|
||||
/// Compute delta between two health snapshots.
|
||||
pub fn between(before: &ContractHealth, after: &ContractHealth) -> Self {
|
||||
let solved_per_cost_delta = after.solved_per_cost - before.solved_per_cost;
|
||||
let noise_stability_delta = after.noise_stability - before.noise_stability;
|
||||
let contradiction_rate_delta = after.contradiction_rate - before.contradiction_rate;
|
||||
let rollback_delta = after.rollback_correctness - before.rollback_correctness;
|
||||
let accuracy_delta = after.accuracy - before.accuracy;
|
||||
let cost_efficiency_delta = after.cost_efficiency - before.cost_efficiency;
|
||||
|
||||
// Count improvements (positive is better for all except contradiction_rate)
|
||||
let deltas = [
|
||||
solved_per_cost_delta > 0.001,
|
||||
noise_stability_delta > 0.001,
|
||||
contradiction_rate_delta < -0.001, // decrease = improvement
|
||||
rollback_delta > 0.001,
|
||||
accuracy_delta > 0.001,
|
||||
cost_efficiency_delta > 0.001,
|
||||
];
|
||||
let regressions = [
|
||||
solved_per_cost_delta < -0.001,
|
||||
noise_stability_delta < -0.001,
|
||||
contradiction_rate_delta > 0.001,
|
||||
rollback_delta < -0.001,
|
||||
accuracy_delta < -0.01,
|
||||
cost_efficiency_delta < -0.001,
|
||||
];
|
||||
|
||||
ContractDelta {
|
||||
solved_per_cost_delta,
|
||||
noise_stability_delta,
|
||||
contradiction_rate_delta,
|
||||
rollback_delta,
|
||||
accuracy_delta,
|
||||
cost_efficiency_delta,
|
||||
dimensions_improved: deltas.iter().filter(|&&d| d).count(),
|
||||
dimensions_regressed: regressions.iter().filter(|&&r| r).count(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn print(&self) {
|
||||
let arrow = |v: f64, invert: bool| {
|
||||
let positive = if invert { v < 0.0 } else { v > 0.0 };
|
||||
if positive {
|
||||
"+"
|
||||
} else if v == 0.0 {
|
||||
"="
|
||||
} else {
|
||||
"-"
|
||||
}
|
||||
};
|
||||
println!(" Contract Delta:");
|
||||
println!(
|
||||
" Solved/Cost: {:>+.4} [{}]",
|
||||
self.solved_per_cost_delta,
|
||||
arrow(self.solved_per_cost_delta, false)
|
||||
);
|
||||
println!(
|
||||
" Noise Stability: {:>+.4} [{}]",
|
||||
self.noise_stability_delta,
|
||||
arrow(self.noise_stability_delta, false)
|
||||
);
|
||||
println!(
|
||||
" Contradiction: {:>+.4} [{}]",
|
||||
self.contradiction_rate_delta,
|
||||
arrow(self.contradiction_rate_delta, true)
|
||||
);
|
||||
println!(
|
||||
" Rollback: {:>+.4} [{}]",
|
||||
self.rollback_delta,
|
||||
arrow(self.rollback_delta, false)
|
||||
);
|
||||
println!(
|
||||
" Accuracy: {:>+.4} [{}]",
|
||||
self.accuracy_delta,
|
||||
arrow(self.accuracy_delta, false)
|
||||
);
|
||||
println!(
|
||||
" Cost Efficiency: {:>+.4} [{}]",
|
||||
self.cost_efficiency_delta,
|
||||
arrow(self.cost_efficiency_delta, false)
|
||||
);
|
||||
println!(" Dimensions improved: {}/6", self.dimensions_improved);
|
||||
println!(" Dimensions regressed: {}/6", self.dimensions_regressed);
|
||||
}
|
||||
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Autonomy Ladder
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/// Autonomy level gated by sustained contract health.
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
|
||||
pub enum AutonomyLevel {
|
||||
/// Level 0: Read-only observation
|
||||
ReadOnly = 0,
|
||||
/// Level 1: Write to memory (store episodes)
|
||||
WriteMemory = 1,
|
||||
/// Level 2: Execute tools (run solver)
|
||||
ExecuteTools = 2,
|
||||
/// Level 3: Write to external systems (publish results)
|
||||
WriteExternal = 3,
|
||||
/// Level 4: Deploy and operate (self-directed improvement)
|
||||
DeployOperate = 4,
|
||||
}
|
||||
|
||||
/// Thresholds for advancing autonomy levels.
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct AutonomyGates {
|
||||
/// Minimum consecutive compliant cycles to advance
|
||||
pub min_compliant_cycles: usize,
|
||||
/// Maximum allowed contradiction rate per level
|
||||
pub max_contradiction_rate: [f64; 5],
|
||||
/// Minimum accuracy per level
|
||||
pub min_accuracy: [f64; 5],
|
||||
/// Minimum cost efficiency per level
|
||||
pub min_cost_efficiency: [f64; 5],
|
||||
/// Minimum noise stability per level
|
||||
pub min_noise_stability: [f64; 5],
|
||||
/// Must have zero policy violations for levels >= 2
|
||||
pub zero_violations_above: AutonomyLevel,
|
||||
}
|
||||
|
||||
impl Default for AutonomyGates {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
min_compliant_cycles: 3,
|
||||
// L0 L1 L2 L3 L4
|
||||
max_contradiction_rate: [1.0, 0.05, 0.02, 0.01, 0.005],
|
||||
min_accuracy: [0.0, 0.70, 0.85, 0.92, 0.96],
|
||||
min_cost_efficiency: [0.0, 0.20, 0.40, 0.60, 0.75],
|
||||
min_noise_stability: [0.0, 0.50, 0.65, 0.80, 0.90],
|
||||
zero_violations_above: AutonomyLevel::ExecuteTools,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Evaluator that determines current autonomy level from contract history.
|
||||
pub struct AutonomyEvaluator {
|
||||
pub gates: AutonomyGates,
|
||||
}
|
||||
|
||||
impl Default for AutonomyEvaluator {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
gates: AutonomyGates::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AutonomyEvaluator {
|
||||
/// Determine the highest autonomy level supported by the health history.
|
||||
/// `history` is ordered oldest-first.
|
||||
pub fn evaluate(&self, history: &[ContractHealth]) -> AutonomyLevel {
|
||||
if history.is_empty() {
|
||||
return AutonomyLevel::ReadOnly;
|
||||
}
|
||||
|
||||
let mut level = AutonomyLevel::ReadOnly;
|
||||
let levels = [
|
||||
AutonomyLevel::WriteMemory,
|
||||
AutonomyLevel::ExecuteTools,
|
||||
AutonomyLevel::WriteExternal,
|
||||
AutonomyLevel::DeployOperate,
|
||||
];
|
||||
|
||||
for &candidate in &levels {
|
||||
let idx = candidate as usize;
|
||||
let required = self.gates.min_compliant_cycles;
|
||||
|
||||
// Need enough recent history
|
||||
if history.len() < required {
|
||||
break;
|
||||
}
|
||||
|
||||
let recent = &history[history.len().saturating_sub(required)..];
|
||||
let all_pass = recent.iter().all(|h| {
|
||||
h.accuracy >= self.gates.min_accuracy[idx]
|
||||
&& h.contradiction_rate <= self.gates.max_contradiction_rate[idx]
|
||||
&& h.cost_efficiency >= self.gates.min_cost_efficiency[idx]
|
||||
&& h.noise_stability >= self.gates.min_noise_stability[idx]
|
||||
&& (candidate < self.gates.zero_violations_above || h.policy_violations == 0)
|
||||
});
|
||||
|
||||
if all_pass {
|
||||
level = candidate;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
level
|
||||
}
|
||||
|
||||
pub fn print_status(&self, level: AutonomyLevel, health: &ContractHealth) {
|
||||
let labels = [
|
||||
"Read-Only",
|
||||
"Write Memory",
|
||||
"Execute Tools",
|
||||
"Write External",
|
||||
"Deploy & Operate",
|
||||
];
|
||||
println!(
|
||||
" Autonomy Level: {} ({})",
|
||||
level as usize, labels[level as usize]
|
||||
);
|
||||
println!(" Gates for next level:");
|
||||
let next = (level as usize + 1).min(4);
|
||||
println!(
|
||||
" Accuracy: {:.0}% (need {:.0}%)",
|
||||
health.accuracy * 100.0,
|
||||
self.gates.min_accuracy[next] * 100.0
|
||||
);
|
||||
println!(
|
||||
" Contradiction: {:.3}% (need <{:.3}%)",
|
||||
health.contradiction_rate * 100.0,
|
||||
self.gates.max_contradiction_rate[next] * 100.0
|
||||
);
|
||||
println!(
|
||||
" Cost Eff: {:.0}% (need {:.0}%)",
|
||||
health.cost_efficiency * 100.0,
|
||||
self.gates.min_cost_efficiency[next] * 100.0
|
||||
);
|
||||
println!(
|
||||
" Noise Stab: {:.0}% (need {:.0}%)",
|
||||
health.noise_stability * 100.0,
|
||||
self.gates.min_noise_stability[next] * 100.0
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Viability Checklist
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/// The 5 viability checks that determine if the system is on an AGI trajectory.
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct ViabilityChecklist {
|
||||
/// Can replay runs and get identical grades
|
||||
pub deterministic_replay: bool,
|
||||
/// Improves utility over time without raising policy violations
|
||||
pub improving_without_violations: bool,
|
||||
/// Can roll back bad learning reliably
|
||||
pub reliable_rollback: bool,
|
||||
/// Can generate infinite novel tasks with automatic grading
|
||||
pub infinite_gradeable_tasks: bool,
|
||||
/// Cost per solve trending down over weeks
|
||||
pub cost_trending_down: bool,
|
||||
}
|
||||
|
||||
impl ViabilityChecklist {
|
||||
/// Evaluate from contract health history.
|
||||
pub fn evaluate(history: &[ContractHealth]) -> Self {
|
||||
// Deterministic replay: verified externally (always true in our harness)
|
||||
let deterministic_replay = true;
|
||||
|
||||
// Improving without violations: later health better than earlier, zero violations
|
||||
let improving_without_violations = if history.len() >= 2 {
|
||||
let first = &history[0];
|
||||
let last = &history[history.len() - 1];
|
||||
last.accuracy >= first.accuracy
|
||||
&& last.policy_violations == 0
|
||||
&& history.iter().all(|h| h.policy_violations == 0)
|
||||
} else {
|
||||
false
|
||||
};
|
||||
|
||||
// Reliable rollback: rollback correctness >= 80% when attempted
|
||||
let reliable_rollback = history.iter().all(|h| h.rollback_correctness >= 0.8);
|
||||
|
||||
// Infinite gradeable tasks: always true (PuzzleGenerator is unbounded)
|
||||
let infinite_gradeable_tasks = true;
|
||||
|
||||
// Cost trending down: solved_per_cost increases over time
|
||||
let cost_trending_down = if history.len() >= 3 {
|
||||
let first_third: f64 = history[..history.len() / 3]
|
||||
.iter()
|
||||
.map(|h| h.solved_per_cost)
|
||||
.sum::<f64>()
|
||||
/ (history.len() / 3) as f64;
|
||||
let last_third: f64 = history[history.len() * 2 / 3..]
|
||||
.iter()
|
||||
.map(|h| h.solved_per_cost)
|
||||
.sum::<f64>()
|
||||
/ (history.len() - history.len() * 2 / 3) as f64;
|
||||
last_third > first_third
|
||||
} else {
|
||||
false
|
||||
};
|
||||
|
||||
ViabilityChecklist {
|
||||
deterministic_replay,
|
||||
improving_without_violations,
|
||||
reliable_rollback,
|
||||
infinite_gradeable_tasks,
|
||||
cost_trending_down,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn all_pass(&self) -> bool {
|
||||
self.deterministic_replay
|
||||
&& self.improving_without_violations
|
||||
&& self.reliable_rollback
|
||||
&& self.infinite_gradeable_tasks
|
||||
&& self.cost_trending_down
|
||||
}
|
||||
|
||||
pub fn print(&self) {
|
||||
let check = |b: bool| if b { "PASS" } else { "FAIL" };
|
||||
println!(" Viability Checklist:");
|
||||
println!(
|
||||
" 1. Deterministic replay: {}",
|
||||
check(self.deterministic_replay)
|
||||
);
|
||||
println!(
|
||||
" 2. Improving w/o violations: {}",
|
||||
check(self.improving_without_violations)
|
||||
);
|
||||
println!(
|
||||
" 3. Reliable rollback: {}",
|
||||
check(self.reliable_rollback)
|
||||
);
|
||||
println!(
|
||||
" 4. Infinite gradeable tasks: {}",
|
||||
check(self.infinite_gradeable_tasks)
|
||||
);
|
||||
println!(
|
||||
" 5. Cost trending down: {}",
|
||||
check(self.cost_trending_down)
|
||||
);
|
||||
println!(
|
||||
" Overall: {}",
|
||||
if self.all_pass() {
|
||||
"VIABLE AGI TRAJECTORY"
|
||||
} else {
|
||||
"NOT YET VIABLE"
|
||||
}
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Tests
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn contract_health_from_raw() {
|
||||
let mut raw = RawMetrics::default();
|
||||
raw.tasks_attempted = 100;
|
||||
raw.tasks_completed = 95;
|
||||
raw.tasks_correct = 92;
|
||||
raw.total_steps = 600;
|
||||
raw.noise_tasks_attempted = 30;
|
||||
raw.noise_tasks_correct = 25;
|
||||
raw.contradictions = 0; // zero contradictions for compliance
|
||||
raw.rollback_attempts = 5;
|
||||
raw.rollback_successes = 4;
|
||||
|
||||
let health = ContractHealth::from_raw(&raw);
|
||||
assert!((health.accuracy - 0.92).abs() < 0.01);
|
||||
assert!((health.solved_per_cost - 92.0 / 600.0).abs() < 0.01);
|
||||
assert!((health.noise_stability - 25.0 / 30.0).abs() < 0.01);
|
||||
assert!((health.contradiction_rate).abs() < 0.001);
|
||||
assert!((health.rollback_correctness - 0.8).abs() < 0.01);
|
||||
assert!(health.compliant); // 0 violations, 0% contradictions, >=90% accuracy
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn contract_delta_detects_improvement() {
|
||||
let before = ContractHealth {
|
||||
solved_per_cost: 0.10,
|
||||
noise_stability: 0.70,
|
||||
contradiction_rate: 0.03,
|
||||
rollback_correctness: 0.80,
|
||||
policy_violations: 0,
|
||||
accuracy: 0.85,
|
||||
cost_efficiency: 0.50,
|
||||
compliant: false,
|
||||
};
|
||||
let after = ContractHealth {
|
||||
solved_per_cost: 0.15,
|
||||
noise_stability: 0.85,
|
||||
contradiction_rate: 0.01,
|
||||
rollback_correctness: 0.90,
|
||||
policy_violations: 0,
|
||||
accuracy: 0.93,
|
||||
cost_efficiency: 0.70,
|
||||
compliant: true,
|
||||
};
|
||||
let delta = ContractDelta::between(&before, &after);
|
||||
assert_eq!(delta.dimensions_improved, 6);
|
||||
assert_eq!(delta.dimensions_regressed, 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn autonomy_ladder_advances() {
|
||||
let evaluator = AutonomyEvaluator::default();
|
||||
|
||||
// No history => ReadOnly
|
||||
assert_eq!(evaluator.evaluate(&[]), AutonomyLevel::ReadOnly);
|
||||
|
||||
// 3 compliant cycles at L1 level
|
||||
let h = ContractHealth {
|
||||
solved_per_cost: 0.15,
|
||||
noise_stability: 0.55,
|
||||
contradiction_rate: 0.04,
|
||||
rollback_correctness: 1.0,
|
||||
policy_violations: 0,
|
||||
accuracy: 0.75,
|
||||
cost_efficiency: 0.30,
|
||||
compliant: true,
|
||||
};
|
||||
let history = vec![h.clone(), h.clone(), h.clone()];
|
||||
assert_eq!(evaluator.evaluate(&history), AutonomyLevel::WriteMemory);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn viability_checklist_basic() {
|
||||
let h1 = ContractHealth {
|
||||
solved_per_cost: 0.10,
|
||||
noise_stability: 0.70,
|
||||
contradiction_rate: 0.01,
|
||||
rollback_correctness: 0.90,
|
||||
policy_violations: 0,
|
||||
accuracy: 0.85,
|
||||
cost_efficiency: 0.50,
|
||||
compliant: true,
|
||||
};
|
||||
let h2 = ContractHealth {
|
||||
solved_per_cost: 0.12,
|
||||
noise_stability: 0.80,
|
||||
contradiction_rate: 0.005,
|
||||
rollback_correctness: 0.95,
|
||||
policy_violations: 0,
|
||||
accuracy: 0.90,
|
||||
cost_efficiency: 0.60,
|
||||
compliant: true,
|
||||
};
|
||||
let h3 = ContractHealth {
|
||||
solved_per_cost: 0.15,
|
||||
noise_stability: 0.85,
|
||||
contradiction_rate: 0.002,
|
||||
rollback_correctness: 0.95,
|
||||
policy_violations: 0,
|
||||
accuracy: 0.93,
|
||||
cost_efficiency: 0.70,
|
||||
compliant: true,
|
||||
};
|
||||
let viability = ViabilityChecklist::evaluate(&[h1, h2, h3]);
|
||||
assert!(viability.deterministic_replay);
|
||||
assert!(viability.improving_without_violations);
|
||||
assert!(viability.reliable_rollback);
|
||||
assert!(viability.infinite_gradeable_tasks);
|
||||
assert!(viability.cost_trending_down);
|
||||
assert!(viability.all_pass());
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user