Files
wifi-densepose/examples/benchmarks/src/agi_contract.rs
ruv d803bfe2b1 Squashed 'vendor/ruvector/' content from commit b64c2172
git-subtree-dir: vendor/ruvector
git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
2026-02-28 14:39:40 -05:00

628 lines
23 KiB
Rust

//! AGI Contract — Defines intelligence as a measurable, falsifiable contract.
//!
//! The AGI contract states: a system improves utility over time without violating
//! policy, while maintaining structural health.
//!
//! ## Core Metrics (all deterministic, all auditable)
//!
//! - **Solved tasks per cost** — graded outcomes normalized by compute
//! - **Stability under noise** — accuracy retention when inputs are corrupted
//! - **Contradiction rate** — solved-but-wrong / total attempted
//! - **Rollback correctness** — recovery rate when bad inputs are detected
//! - **Policy violations** — budget overruns + contradictions (must be zero)
//!
//! ## Autonomy Ladder
//!
//! Each level requires sustained health metrics before advancement:
//! 0. Read-only (observe only)
//! 1. Write to memory (store episodes, no execution)
//! 2. Execute tools (run solver, generate puzzles)
//! 3. Write to external systems (publish results)
//! 4. Deploy and operate (self-directed improvement)
use crate::intelligence_metrics::{IntelligenceAssessment, RawMetrics};
use serde::{Deserialize, Serialize};
// ═══════════════════════════════════════════════════════════════════════════
// Contract Health Snapshot
// ═══════════════════════════════════════════════════════════════════════════
/// A single point-in-time health measurement against the AGI contract.
///
/// Snapshots are normally produced by `ContractHealth::from_raw`; the notes on
/// each field describe the value that constructor computes. Every field is
/// public, so a snapshot can also be assembled by hand (the unit tests do so).
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ContractHealth {
/// Solved tasks per unit cost (tasks_correct / total_steps).
/// `0.0` when no steps were recorded.
pub solved_per_cost: f64,
/// Accuracy on noise-injected tasks
/// (noise_tasks_correct / noise_tasks_attempted).
/// `0.0` when no noise tasks were attempted.
pub noise_stability: f64,
/// Contradiction rate: solved-but-wrong / attempted.
/// `0.0` when no tasks were attempted.
pub contradiction_rate: f64,
/// Rollback correctness: successful rollbacks / attempted rollbacks.
/// `1.0` when no rollback was attempted (nothing to recover from).
pub rollback_correctness: f64,
/// Total policy violations (must be zero for contract compliance)
pub policy_violations: usize,
/// Clean accuracy (graded outcome baseline): tasks_correct / tasks_attempted.
/// `0.0` when no tasks were attempted.
pub accuracy: f64,
/// Cost efficiency (0-1, higher = cheaper per solve).
/// Linear in steps per correct solve and clamped to `[0, 1]`:
/// 5 steps per solve scores `1.0`, 100 or more scores `0.0`.
pub cost_efficiency: f64,
/// Whether the contract is satisfied: zero policy violations,
/// contradiction rate below 1%, and accuracy of at least 90%.
pub compliant: bool,
}
impl ContractHealth {
/// Evaluate contract health from raw metrics.
pub fn from_raw(raw: &RawMetrics) -> Self {
let accuracy = if raw.tasks_attempted > 0 {
raw.tasks_correct as f64 / raw.tasks_attempted as f64
} else {
0.0
};
let solved_per_cost = if raw.total_steps > 0 {
raw.tasks_correct as f64 / raw.total_steps as f64
} else {
0.0
};
let noise_stability = if raw.noise_tasks_attempted > 0 {
raw.noise_tasks_correct as f64 / raw.noise_tasks_attempted as f64
} else {
0.0
};
let contradiction_rate = if raw.tasks_attempted > 0 {
raw.contradictions as f64 / raw.tasks_attempted as f64
} else {
0.0
};
let rollback_correctness = if raw.rollback_attempts > 0 {
raw.rollback_successes as f64 / raw.rollback_attempts as f64
} else {
1.0 // no rollbacks needed => perfect
};
let cost_efficiency = (1.0 - {
let sps = if raw.tasks_correct > 0 {
raw.total_steps as f64 / raw.tasks_correct as f64
} else {
100.0
};
(sps - 5.0) / 95.0
})
.clamp(0.0, 1.0);
let compliant = raw.policy_violations == 0 && contradiction_rate < 0.01 && accuracy >= 0.90;
ContractHealth {
solved_per_cost,
noise_stability,
contradiction_rate,
rollback_correctness,
policy_violations: raw.policy_violations,
accuracy,
cost_efficiency,
compliant,
}
}
/// Evaluate contract health from an IntelligenceAssessment.
pub fn from_assessment(assessment: &IntelligenceAssessment) -> Self {
Self::from_raw(&assessment.raw_data)
}
/// Print formatted contract health report.
pub fn print(&self) {
println!(" Contract Health:");
println!(" Solved/Cost: {:.4}", self.solved_per_cost);
println!(
" Noise Stability: {:.2}%",
self.noise_stability * 100.0
);
println!(
" Contradiction Rate: {:.4}%",
self.contradiction_rate * 100.0
);
println!(
" Rollback Correct: {:.2}%",
self.rollback_correctness * 100.0
);
println!(" Policy Violations: {}", self.policy_violations);
println!(" Accuracy: {:.2}%", self.accuracy * 100.0);
println!(
" Cost Efficiency: {:.2}%",
self.cost_efficiency * 100.0
);
println!(
" Compliant: {}",
if self.compliant { "YES" } else { "NO" }
);
}
}
// ═══════════════════════════════════════════════════════════════════════════
// Contract Trend — compares two snapshots
// ═══════════════════════════════════════════════════════════════════════════
/// Tracks improvement across contract dimensions between two measurement points.
///
/// Every `*_delta` field is `after - before` for the corresponding
/// `ContractHealth` field, as computed by `ContractDelta::between`. Six
/// dimensions are tracked; `policy_violations` and `compliant` are not part of
/// the delta.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ContractDelta {
/// Change in solved-per-cost (positive = improving)
pub solved_per_cost_delta: f64,
/// Change in noise stability (positive = more robust)
pub noise_stability_delta: f64,
/// Change in contradiction rate (negative = improving)
pub contradiction_rate_delta: f64,
/// Change in rollback correctness (positive = better recovery)
pub rollback_delta: f64,
/// Change in accuracy (positive = better)
pub accuracy_delta: f64,
/// Change in cost efficiency (positive = cheaper)
pub cost_efficiency_delta: f64,
/// Number of dimensions (out of six) whose change exceeded the
/// improvement threshold applied by `ContractDelta::between`
pub dimensions_improved: usize,
/// Number of dimensions (out of six) whose change exceeded the
/// regression threshold applied by `ContractDelta::between`
pub dimensions_regressed: usize,
}
impl ContractDelta {
    /// Compute the per-dimension change from `before` to `after`.
    ///
    /// Each delta is `after - before`. A dimension counts as improved or
    /// regressed only when it moves by more than `0.001` in the relevant
    /// direction; smaller movements are treated as noise and counted as
    /// neither. Contradiction rate is the one dimension where a decrease is
    /// the improvement.
    ///
    /// The same dead band applies to every dimension in both directions, so a
    /// gain and a loss of equal size are always classified symmetrically.
    pub fn between(before: &ContractHealth, after: &ContractHealth) -> Self {
        // Changes no larger than this (in absolute value) are ignored.
        const THRESHOLD: f64 = 0.001;

        let solved_per_cost_delta = after.solved_per_cost - before.solved_per_cost;
        let noise_stability_delta = after.noise_stability - before.noise_stability;
        let contradiction_rate_delta = after.contradiction_rate - before.contradiction_rate;
        let rollback_delta = after.rollback_correctness - before.rollback_correctness;
        let accuracy_delta = after.accuracy - before.accuracy;
        let cost_efficiency_delta = after.cost_efficiency - before.cost_efficiency;

        // Orient every delta so that "larger is better". Contradiction rate is
        // negated because a falling rate is the desirable direction.
        let gains = [
            solved_per_cost_delta,
            noise_stability_delta,
            -contradiction_rate_delta,
            rollback_delta,
            accuracy_delta,
            cost_efficiency_delta,
        ];
        let dimensions_improved = gains.iter().filter(|&&gain| gain > THRESHOLD).count();
        let dimensions_regressed = gains.iter().filter(|&&gain| gain < -THRESHOLD).count();

        ContractDelta {
            solved_per_cost_delta,
            noise_stability_delta,
            contradiction_rate_delta,
            rollback_delta,
            accuracy_delta,
            cost_efficiency_delta,
            dimensions_improved,
            dimensions_regressed,
        }
    }

    /// Write a human-readable delta report to stdout.
    ///
    /// Each line shows the signed delta followed by a bracketed marker: `+`
    /// for a change in the desirable direction, `=` for exactly zero, and `-`
    /// otherwise. The marker reflects the raw sign of the delta and does not
    /// apply the noise threshold used for the improved/regressed counts.
    pub fn print(&self) {
        // `invert` flips the desirable direction (used for contradiction rate).
        let arrow = |v: f64, invert: bool| {
            let positive = if invert { v < 0.0 } else { v > 0.0 };
            if positive {
                "+"
            } else if v == 0.0 {
                "="
            } else {
                "-"
            }
        };
        println!(" Contract Delta:");
        println!(
            " Solved/Cost: {:>+.4} [{}]",
            self.solved_per_cost_delta,
            arrow(self.solved_per_cost_delta, false)
        );
        println!(
            " Noise Stability: {:>+.4} [{}]",
            self.noise_stability_delta,
            arrow(self.noise_stability_delta, false)
        );
        println!(
            " Contradiction: {:>+.4} [{}]",
            self.contradiction_rate_delta,
            arrow(self.contradiction_rate_delta, true)
        );
        println!(
            " Rollback: {:>+.4} [{}]",
            self.rollback_delta,
            arrow(self.rollback_delta, false)
        );
        println!(
            " Accuracy: {:>+.4} [{}]",
            self.accuracy_delta,
            arrow(self.accuracy_delta, false)
        );
        println!(
            " Cost Efficiency: {:>+.4} [{}]",
            self.cost_efficiency_delta,
            arrow(self.cost_efficiency_delta, false)
        );
        println!(" Dimensions improved: {}/6", self.dimensions_improved);
        println!(" Dimensions regressed: {}/6", self.dimensions_regressed);
    }
}
// ═══════════════════════════════════════════════════════════════════════════
// Autonomy Ladder
// ═══════════════════════════════════════════════════════════════════════════
/// Autonomy level gated by sustained contract health.
///
/// Variants are declared in ascending order of privilege, so the derived
/// ordering ranks `ReadOnly` lowest and `DeployOperate` highest. The explicit
/// discriminants (0-4) double as indices into the per-level threshold arrays
/// of `AutonomyGates`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub enum AutonomyLevel {
/// Level 0: Read-only observation
ReadOnly = 0,
/// Level 1: Write to memory (store episodes)
WriteMemory = 1,
/// Level 2: Execute tools (run solver)
ExecuteTools = 2,
/// Level 3: Write to external systems (publish results)
WriteExternal = 3,
/// Level 4: Deploy and operate (self-directed improvement)
DeployOperate = 4,
}
/// Thresholds for advancing autonomy levels.
///
/// Each `[f64; 5]` table holds one threshold per level, indexed by the
/// `AutonomyLevel` discriminant (slot 0 = `ReadOnly` ... slot 4 =
/// `DeployOperate`). `AutonomyEvaluator::evaluate` only consults slots 1-4,
/// because `ReadOnly` is the starting level and is never a candidate.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct AutonomyGates {
/// Minimum consecutive compliant cycles to advance: the number of most
/// recent history entries that must all satisfy a level's thresholds.
/// A history shorter than this never advances past `ReadOnly`.
pub min_compliant_cycles: usize,
/// Maximum allowed contradiction rate per level (inclusive upper bound)
pub max_contradiction_rate: [f64; 5],
/// Minimum accuracy per level (inclusive lower bound)
pub min_accuracy: [f64; 5],
/// Minimum cost efficiency per level (inclusive lower bound)
pub min_cost_efficiency: [f64; 5],
/// Minimum noise stability per level (inclusive lower bound)
pub min_noise_stability: [f64; 5],
/// Lowest level that demands zero policy violations. The bound is
/// inclusive despite the "above" in the name: this level and every level
/// above it require `policy_violations == 0`. With the default gates that
/// means levels >= 2.
pub zero_violations_above: AutonomyLevel,
}
impl Default for AutonomyGates {
fn default() -> Self {
Self {
min_compliant_cycles: 3,
// L0 L1 L2 L3 L4
max_contradiction_rate: [1.0, 0.05, 0.02, 0.01, 0.005],
min_accuracy: [0.0, 0.70, 0.85, 0.92, 0.96],
min_cost_efficiency: [0.0, 0.20, 0.40, 0.60, 0.75],
min_noise_stability: [0.0, 0.50, 0.65, 0.80, 0.90],
zero_violations_above: AutonomyLevel::ExecuteTools,
}
}
}
/// Evaluator that determines current autonomy level from contract history.
pub struct AutonomyEvaluator {
pub gates: AutonomyGates,
}
impl Default for AutonomyEvaluator {
fn default() -> Self {
Self {
gates: AutonomyGates::default(),
}
}
}
impl AutonomyEvaluator {
    /// Return the highest autonomy level that the health history supports.
    ///
    /// `history` is ordered oldest-first. Only the most recent
    /// `gates.min_compliant_cycles` snapshots are inspected. Levels are tried
    /// in ascending order and the climb stops at the first level whose
    /// thresholds are not met by every inspected snapshot, so a level is never
    /// granted unless all lower levels were granted too.
    ///
    /// Returns `ReadOnly` when the history is empty or shorter than the
    /// required window.
    pub fn evaluate(&self, history: &[ContractHealth]) -> AutonomyLevel {
        let gates = &self.gates;
        let window = gates.min_compliant_cycles;

        // Too little evidence to justify anything beyond observation.
        if history.is_empty() || history.len() < window {
            return AutonomyLevel::ReadOnly;
        }
        let recent = &history[history.len() - window..];

        // Does every recent snapshot satisfy the thresholds of `candidate`?
        let clears = |candidate: AutonomyLevel| {
            let slot = candidate as usize;
            let violations_matter = candidate >= gates.zero_violations_above;
            recent.iter().all(|snapshot| {
                snapshot.accuracy >= gates.min_accuracy[slot]
                    && snapshot.contradiction_rate <= gates.max_contradiction_rate[slot]
                    && snapshot.cost_efficiency >= gates.min_cost_efficiency[slot]
                    && snapshot.noise_stability >= gates.min_noise_stability[slot]
                    && !(violations_matter && snapshot.policy_violations != 0)
            })
        };

        let ladder = [
            AutonomyLevel::WriteMemory,
            AutonomyLevel::ExecuteTools,
            AutonomyLevel::WriteExternal,
            AutonomyLevel::DeployOperate,
        ];
        // Last rung reached before the first failure, if any rung was reached.
        let reached = ladder
            .iter()
            .copied()
            .take_while(|&candidate| clears(candidate))
            .last();
        reached.unwrap_or(AutonomyLevel::ReadOnly)
    }

    /// Write the current level and the thresholds of the next level to stdout,
    /// alongside the corresponding values from `health`.
    ///
    /// At the top level the "next" gates shown are those of the top level
    /// itself, because the index is capped at 4.
    pub fn print_status(&self, level: AutonomyLevel, health: &ContractHealth) {
        let label = match level {
            AutonomyLevel::ReadOnly => "Read-Only",
            AutonomyLevel::WriteMemory => "Write Memory",
            AutonomyLevel::ExecuteTools => "Execute Tools",
            AutonomyLevel::WriteExternal => "Write External",
            AutonomyLevel::DeployOperate => "Deploy & Operate",
        };
        let gates = &self.gates;
        let next = (level as usize + 1).min(4);

        println!(" Autonomy Level: {} ({})", level as usize, label);
        println!(" Gates for next level:");
        println!(
            " Accuracy: {:.0}% (need {:.0}%)",
            health.accuracy * 100.0,
            gates.min_accuracy[next] * 100.0
        );
        println!(
            " Contradiction: {:.3}% (need <{:.3}%)",
            health.contradiction_rate * 100.0,
            gates.max_contradiction_rate[next] * 100.0
        );
        println!(
            " Cost Eff: {:.0}% (need {:.0}%)",
            health.cost_efficiency * 100.0,
            gates.min_cost_efficiency[next] * 100.0
        );
        println!(
            " Noise Stab: {:.0}% (need {:.0}%)",
            health.noise_stability * 100.0,
            gates.min_noise_stability[next] * 100.0
        );
    }
}
// ═══════════════════════════════════════════════════════════════════════════
// Viability Checklist
// ═══════════════════════════════════════════════════════════════════════════
/// The 5 viability checks that determine if the system is on an AGI trajectory.
///
/// The per-field notes describe how `ViabilityChecklist::evaluate` derives each
/// flag from a history of `ContractHealth` snapshots.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ViabilityChecklist {
/// Can replay runs and get identical grades.
/// `evaluate` always sets this to `true`; it does not check replay itself.
pub deterministic_replay: bool,
/// Improves utility over time without raising policy violations:
/// needs at least two snapshots, the last snapshot's accuracy must be at
/// least the first's, and every snapshot must have zero policy violations.
pub improving_without_violations: bool,
/// Can roll back bad learning reliably: every snapshot has a rollback
/// correctness of at least 0.8 (trivially true for an empty history).
pub reliable_rollback: bool,
/// Can generate infinite novel tasks with automatic grading.
/// `evaluate` always sets this to `true`.
pub infinite_gradeable_tasks: bool,
/// Cost per solve trending down: with at least three snapshots, the mean
/// `solved_per_cost` of the last third of the history exceeds the mean of
/// the first third.
pub cost_trending_down: bool,
}
impl ViabilityChecklist {
/// Evaluate from contract health history.
pub fn evaluate(history: &[ContractHealth]) -> Self {
// Deterministic replay: verified externally (always true in our harness)
let deterministic_replay = true;
// Improving without violations: later health better than earlier, zero violations
let improving_without_violations = if history.len() >= 2 {
let first = &history[0];
let last = &history[history.len() - 1];
last.accuracy >= first.accuracy
&& last.policy_violations == 0
&& history.iter().all(|h| h.policy_violations == 0)
} else {
false
};
// Reliable rollback: rollback correctness >= 80% when attempted
let reliable_rollback = history.iter().all(|h| h.rollback_correctness >= 0.8);
// Infinite gradeable tasks: always true (PuzzleGenerator is unbounded)
let infinite_gradeable_tasks = true;
// Cost trending down: solved_per_cost increases over time
let cost_trending_down = if history.len() >= 3 {
let first_third: f64 = history[..history.len() / 3]
.iter()
.map(|h| h.solved_per_cost)
.sum::<f64>()
/ (history.len() / 3) as f64;
let last_third: f64 = history[history.len() * 2 / 3..]
.iter()
.map(|h| h.solved_per_cost)
.sum::<f64>()
/ (history.len() - history.len() * 2 / 3) as f64;
last_third > first_third
} else {
false
};
ViabilityChecklist {
deterministic_replay,
improving_without_violations,
reliable_rollback,
infinite_gradeable_tasks,
cost_trending_down,
}
}
pub fn all_pass(&self) -> bool {
self.deterministic_replay
&& self.improving_without_violations
&& self.reliable_rollback
&& self.infinite_gradeable_tasks
&& self.cost_trending_down
}
pub fn print(&self) {
let check = |b: bool| if b { "PASS" } else { "FAIL" };
println!(" Viability Checklist:");
println!(
" 1. Deterministic replay: {}",
check(self.deterministic_replay)
);
println!(
" 2. Improving w/o violations: {}",
check(self.improving_without_violations)
);
println!(
" 3. Reliable rollback: {}",
check(self.reliable_rollback)
);
println!(
" 4. Infinite gradeable tasks: {}",
check(self.infinite_gradeable_tasks)
);
println!(
" 5. Cost trending down: {}",
check(self.cost_trending_down)
);
println!(
" Overall: {}",
if self.all_pass() {
"VIABLE AGI TRAJECTORY"
} else {
"NOT YET VIABLE"
}
);
}
}
// ═══════════════════════════════════════════════════════════════════════════
// Tests
// ═══════════════════════════════════════════════════════════════════════════
#[cfg(test)]
mod tests {
    use super::*;

    /// True when `actual` lies strictly within `tolerance` of `expected`.
    fn close(actual: f64, expected: f64, tolerance: f64) -> bool {
        (actual - expected).abs() < tolerance
    }

    /// `from_raw` turns the raw counters into the expected ratios and marks a
    /// clean, accurate run as compliant.
    #[test]
    fn contract_health_from_raw() {
        let mut raw = RawMetrics::default();
        raw.tasks_attempted = 100;
        raw.tasks_completed = 95;
        raw.tasks_correct = 92;
        raw.total_steps = 600;
        raw.noise_tasks_attempted = 30;
        raw.noise_tasks_correct = 25;
        raw.contradictions = 0; // zero contradictions for compliance
        raw.rollback_attempts = 5;
        raw.rollback_successes = 4;

        let health = ContractHealth::from_raw(&raw);

        assert!(close(health.accuracy, 0.92, 0.01));
        assert!(close(health.solved_per_cost, 92.0 / 600.0, 0.01));
        assert!(close(health.noise_stability, 25.0 / 30.0, 0.01));
        assert!(close(health.contradiction_rate, 0.0, 0.001));
        assert!(close(health.rollback_correctness, 0.8, 0.01));
        // 0 violations, 0% contradictions, >=90% accuracy
        assert!(health.compliant);
    }

    /// A snapshot that is better on every dimension reports six improvements
    /// and no regressions.
    #[test]
    fn contract_delta_detects_improvement() {
        let before = ContractHealth {
            solved_per_cost: 0.10,
            noise_stability: 0.70,
            contradiction_rate: 0.03,
            rollback_correctness: 0.80,
            policy_violations: 0,
            accuracy: 0.85,
            cost_efficiency: 0.50,
            compliant: false,
        };
        // Same violation count as `before`; everything else is overridden.
        let after = ContractHealth {
            solved_per_cost: 0.15,
            noise_stability: 0.85,
            contradiction_rate: 0.01,
            rollback_correctness: 0.90,
            accuracy: 0.93,
            cost_efficiency: 0.70,
            compliant: true,
            ..before.clone()
        };

        let delta = ContractDelta::between(&before, &after);

        assert_eq!(delta.dimensions_improved, 6);
        assert_eq!(delta.dimensions_regressed, 0);
    }

    /// The ladder starts at read-only and grants level 1 after three cycles
    /// that meet the level-1 gates but not the level-2 gates.
    #[test]
    fn autonomy_ladder_advances() {
        let evaluator = AutonomyEvaluator::default();

        // No history => ReadOnly
        assert_eq!(evaluator.evaluate(&[]), AutonomyLevel::ReadOnly);

        // 3 compliant cycles at L1 level
        let l1_cycle = ContractHealth {
            solved_per_cost: 0.15,
            noise_stability: 0.55,
            contradiction_rate: 0.04,
            rollback_correctness: 1.0,
            policy_violations: 0,
            accuracy: 0.75,
            cost_efficiency: 0.30,
            compliant: true,
        };
        let history = vec![l1_cycle; 3];
        assert_eq!(evaluator.evaluate(&history), AutonomyLevel::WriteMemory);
    }

    /// Three steadily improving, violation-free snapshots satisfy every
    /// viability check.
    #[test]
    fn viability_checklist_basic() {
        let first = ContractHealth {
            solved_per_cost: 0.10,
            noise_stability: 0.70,
            contradiction_rate: 0.01,
            rollback_correctness: 0.90,
            policy_violations: 0,
            accuracy: 0.85,
            cost_efficiency: 0.50,
            compliant: true,
        };
        // Later snapshots keep `policy_violations: 0` and `compliant: true`.
        let second = ContractHealth {
            solved_per_cost: 0.12,
            noise_stability: 0.80,
            contradiction_rate: 0.005,
            rollback_correctness: 0.95,
            accuracy: 0.90,
            cost_efficiency: 0.60,
            ..first.clone()
        };
        let third = ContractHealth {
            solved_per_cost: 0.15,
            noise_stability: 0.85,
            contradiction_rate: 0.002,
            rollback_correctness: 0.95,
            accuracy: 0.93,
            cost_efficiency: 0.70,
            ..first.clone()
        };

        let viability = ViabilityChecklist::evaluate(&[first, second, third]);

        assert!(viability.deterministic_replay);
        assert!(viability.improving_without_violations);
        assert!(viability.reliable_rollback);
        assert!(viability.infinite_gradeable_tasks);
        assert!(viability.cost_trending_down);
        assert!(viability.all_pass());
    }
}