wifi-densepose/examples/benchmarks/src/agi_contract.rs

//! AGI Contract — Defines intelligence as a measurable, falsifiable contract.
//!
//! The AGI contract states: a system improves utility over time without violating
//! policy, while maintaining structural health.
//!
//! ## Core Metrics (all deterministic, all auditable)
//!
//! - **Solved tasks per cost** — graded outcomes normalized by compute
//! - **Stability under noise** — accuracy retention when inputs are corrupted
//! - **Contradiction rate** — solved-but-wrong / total attempted
//! - **Rollback correctness** — recovery rate when bad inputs are detected
//! - **Policy violations** — budget overruns + contradictions (must be zero)
//!
//! ## Autonomy Ladder
//!
//! Each level requires sustained health metrics before advancement:
//! 0. Read-only (observe only)
//! 1. Write to memory (store episodes, no execution)
//! 2. Execute tools (run solver, generate puzzles)
//! 3. Write to external systems (publish results)
//! 4. Deploy and operate (self-directed improvement)

use crate::intelligence_metrics::{IntelligenceAssessment, RawMetrics};
use serde::{Deserialize, Serialize};

// ═══════════════════════════════════════════════════════════════════════════
// Contract Health Snapshot
// ═══════════════════════════════════════════════════════════════════════════

/// A single point-in-time health measurement against the AGI contract.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ContractHealth {
    /// Solved tasks per unit cost (tasks_correct / total_steps)
    pub solved_per_cost: f64,
    /// Accuracy on noise-injected tasks
    pub noise_stability: f64,
    /// Contradiction rate: solved-but-wrong / attempted
    pub contradiction_rate: f64,
    /// Rollback correctness: successful rollbacks / attempted rollbacks
    pub rollback_correctness: f64,
    /// Total policy violations (must be zero for contract compliance)
    pub policy_violations: usize,
    /// Clean accuracy (graded outcome baseline)
    pub accuracy: f64,
    /// Cost efficiency (0-1, higher = cheaper per solve)
    pub cost_efficiency: f64,
    /// Whether the contract is satisfied
    pub compliant: bool,
}

impl ContractHealth {
    /// Evaluate contract health from raw metrics.
    pub fn from_raw(raw: &RawMetrics) -> Self {
        let accuracy = if raw.tasks_attempted > 0 {
            raw.tasks_correct as f64 / raw.tasks_attempted as f64
        } else {
            0.0
        };

        let solved_per_cost = if raw.total_steps > 0 {
            raw.tasks_correct as f64 / raw.total_steps as f64
        } else {
            0.0
        };

        let noise_stability = if raw.noise_tasks_attempted > 0 {
            raw.noise_tasks_correct as f64 / raw.noise_tasks_attempted as f64
        } else {
            0.0
        };

        let contradiction_rate = if raw.tasks_attempted > 0 {
            raw.contradictions as f64 / raw.tasks_attempted as f64
        } else {
            0.0
        };

        let rollback_correctness = if raw.rollback_attempts > 0 {
            raw.rollback_successes as f64 / raw.rollback_attempts as f64
        } else {
            1.0 // no rollbacks needed => perfect
        };

        let cost_efficiency = (1.0 - {
            let sps = if raw.tasks_correct > 0 {
                raw.total_steps as f64 / raw.tasks_correct as f64
            } else {
                100.0
            };
            (sps - 5.0) / 95.0
        })
        .clamp(0.0, 1.0);

        let compliant = raw.policy_violations == 0 && contradiction_rate < 0.01 && accuracy >= 0.90;

        ContractHealth {
            solved_per_cost,
            noise_stability,
            contradiction_rate,
            rollback_correctness,
            policy_violations: raw.policy_violations,
            accuracy,
            cost_efficiency,
            compliant,
        }
    }

    /// Evaluate contract health from an IntelligenceAssessment.
    pub fn from_assessment(assessment: &IntelligenceAssessment) -> Self {
        Self::from_raw(&assessment.raw_data)
    }

    /// Print formatted contract health report.
    pub fn print(&self) {
        println!("  Contract Health:");
        println!("    Solved/Cost:        {:.4}", self.solved_per_cost);
        println!(
            "    Noise Stability:    {:.2}%",
            self.noise_stability * 100.0
        );
        println!(
            "    Contradiction Rate: {:.4}%",
            self.contradiction_rate * 100.0
        );
        println!(
            "    Rollback Correct:   {:.2}%",
            self.rollback_correctness * 100.0
        );
        println!("    Policy Violations:  {}", self.policy_violations);
        println!("    Accuracy:           {:.2}%", self.accuracy * 100.0);
        println!(
            "    Cost Efficiency:    {:.2}%",
            self.cost_efficiency * 100.0
        );
        println!(
            "    Compliant:          {}",
            if self.compliant { "YES" } else { "NO" }
        );
    }
}

// ═══════════════════════════════════════════════════════════════════════════
// Contract Trend — compares two snapshots
// ═══════════════════════════════════════════════════════════════════════════

/// Tracks improvement across contract dimensions between two measurement points.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ContractDelta {
    /// Change in solved-per-cost (positive = improving)
    pub solved_per_cost_delta: f64,
    /// Change in noise stability (positive = more robust)
    pub noise_stability_delta: f64,
    /// Change in contradiction rate (negative = improving)
    pub contradiction_rate_delta: f64,
    /// Change in rollback correctness (positive = better recovery)
    pub rollback_delta: f64,
    /// Change in accuracy (positive = better)
    pub accuracy_delta: f64,
    /// Change in cost efficiency (positive = cheaper)
    pub cost_efficiency_delta: f64,
    /// Number of dimensions that improved
    pub dimensions_improved: usize,
    /// Number of dimensions that regressed
    pub dimensions_regressed: usize,
}

impl ContractDelta {
    /// Compute delta between two health snapshots.
    pub fn between(before: &ContractHealth, after: &ContractHealth) -> Self {
        let solved_per_cost_delta = after.solved_per_cost - before.solved_per_cost;
        let noise_stability_delta = after.noise_stability - before.noise_stability;
        let contradiction_rate_delta = after.contradiction_rate - before.contradiction_rate;
        let rollback_delta = after.rollback_correctness - before.rollback_correctness;
        let accuracy_delta = after.accuracy - before.accuracy;
        let cost_efficiency_delta = after.cost_efficiency - before.cost_efficiency;

        // Count improvements (positive is better for all except contradiction_rate)
        let deltas = [
            solved_per_cost_delta > 0.001,
            noise_stability_delta > 0.001,
            contradiction_rate_delta < -0.001, // decrease = improvement
            rollback_delta > 0.001,
            accuracy_delta > 0.001,
            cost_efficiency_delta > 0.001,
        ];
        let regressions = [
            solved_per_cost_delta < -0.001,
            noise_stability_delta < -0.001,
            contradiction_rate_delta > 0.001,
            rollback_delta < -0.001,
            accuracy_delta < -0.01,
            cost_efficiency_delta < -0.001,
        ];

        ContractDelta {
            solved_per_cost_delta,
            noise_stability_delta,
            contradiction_rate_delta,
            rollback_delta,
            accuracy_delta,
            cost_efficiency_delta,
            dimensions_improved: deltas.iter().filter(|&&d| d).count(),
            dimensions_regressed: regressions.iter().filter(|&&r| r).count(),
        }
    }

    pub fn print(&self) {
        let arrow = |v: f64, invert: bool| {
            let positive = if invert { v < 0.0 } else { v > 0.0 };
            if positive {
                "+"
            } else if v == 0.0 {
                "="
            } else {
                "-"
            }
        };
        println!("  Contract Delta:");
        println!(
            "    Solved/Cost:     {:>+.4} [{}]",
            self.solved_per_cost_delta,
            arrow(self.solved_per_cost_delta, false)
        );
        println!(
            "    Noise Stability: {:>+.4} [{}]",
            self.noise_stability_delta,
            arrow(self.noise_stability_delta, false)
        );
        println!(
            "    Contradiction:   {:>+.4} [{}]",
            self.contradiction_rate_delta,
            arrow(self.contradiction_rate_delta, true)
        );
        println!(
            "    Rollback:        {:>+.4} [{}]",
            self.rollback_delta,
            arrow(self.rollback_delta, false)
        );
        println!(
            "    Accuracy:        {:>+.4} [{}]",
            self.accuracy_delta,
            arrow(self.accuracy_delta, false)
        );
        println!(
            "    Cost Efficiency: {:>+.4} [{}]",
            self.cost_efficiency_delta,
            arrow(self.cost_efficiency_delta, false)
        );
        println!("    Dimensions improved:  {}/6", self.dimensions_improved);
        println!("    Dimensions regressed: {}/6", self.dimensions_regressed);
    }
}

// ═══════════════════════════════════════════════════════════════════════════
// Autonomy Ladder
// ═══════════════════════════════════════════════════════════════════════════

/// Autonomy level gated by sustained contract health.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub enum AutonomyLevel {
    /// Level 0: Read-only observation
    ReadOnly = 0,
    /// Level 1: Write to memory (store episodes)
    WriteMemory = 1,
    /// Level 2: Execute tools (run solver)
    ExecuteTools = 2,
    /// Level 3: Write to external systems (publish results)
    WriteExternal = 3,
    /// Level 4: Deploy and operate (self-directed improvement)
    DeployOperate = 4,
}

/// Thresholds for advancing autonomy levels.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct AutonomyGates {
    /// Minimum consecutive compliant cycles to advance
    pub min_compliant_cycles: usize,
    /// Maximum allowed contradiction rate per level
    pub max_contradiction_rate: [f64; 5],
    /// Minimum accuracy per level
    pub min_accuracy: [f64; 5],
    /// Minimum cost efficiency per level
    pub min_cost_efficiency: [f64; 5],
    /// Minimum noise stability per level
    pub min_noise_stability: [f64; 5],
    /// Must have zero policy violations for levels >= 2
    pub zero_violations_above: AutonomyLevel,
}

impl Default for AutonomyGates {
    fn default() -> Self {
        Self {
            min_compliant_cycles: 3,
            //                          L0    L1    L2    L3    L4
            max_contradiction_rate: [1.0, 0.05, 0.02, 0.01, 0.005],
            min_accuracy: [0.0, 0.70, 0.85, 0.92, 0.96],
            min_cost_efficiency: [0.0, 0.20, 0.40, 0.60, 0.75],
            min_noise_stability: [0.0, 0.50, 0.65, 0.80, 0.90],
            zero_violations_above: AutonomyLevel::ExecuteTools,
        }
    }
}

/// Evaluator that determines current autonomy level from contract history.
pub struct AutonomyEvaluator {
    pub gates: AutonomyGates,
}

impl Default for AutonomyEvaluator {
    fn default() -> Self {
        Self {
            gates: AutonomyGates::default(),
        }
    }
}

impl AutonomyEvaluator {
    /// Determine the highest autonomy level supported by the health history.
    /// `history` is ordered oldest-first.
    pub fn evaluate(&self, history: &[ContractHealth]) -> AutonomyLevel {
        if history.is_empty() {
            return AutonomyLevel::ReadOnly;
        }

        let mut level = AutonomyLevel::ReadOnly;
        let levels = [
            AutonomyLevel::WriteMemory,
            AutonomyLevel::ExecuteTools,
            AutonomyLevel::WriteExternal,
            AutonomyLevel::DeployOperate,
        ];

        for &candidate in &levels {
            let idx = candidate as usize;
            let required = self.gates.min_compliant_cycles;

            // Need enough recent history
            if history.len() < required {
                break;
            }

            let recent = &history[history.len().saturating_sub(required)..];
            let all_pass = recent.iter().all(|h| {
                h.accuracy >= self.gates.min_accuracy[idx]
                    && h.contradiction_rate <= self.gates.max_contradiction_rate[idx]
                    && h.cost_efficiency >= self.gates.min_cost_efficiency[idx]
                    && h.noise_stability >= self.gates.min_noise_stability[idx]
                    && (candidate < self.gates.zero_violations_above || h.policy_violations == 0)
            });

            if all_pass {
                level = candidate;
            } else {
                break;
            }
        }

        level
    }

    pub fn print_status(&self, level: AutonomyLevel, health: &ContractHealth) {
        let labels = [
            "Read-Only",
            "Write Memory",
            "Execute Tools",
            "Write External",
            "Deploy & Operate",
        ];
        println!(
            "  Autonomy Level: {} ({})",
            level as usize, labels[level as usize]
        );
        println!("  Gates for next level:");
        let next = (level as usize + 1).min(4);
        println!(
            "    Accuracy:       {:.0}% (need {:.0}%)",
            health.accuracy * 100.0,
            self.gates.min_accuracy[next] * 100.0
        );
        println!(
            "    Contradiction:  {:.3}% (need <{:.3}%)",
            health.contradiction_rate * 100.0,
            self.gates.max_contradiction_rate[next] * 100.0
        );
        println!(
            "    Cost Eff:       {:.0}% (need {:.0}%)",
            health.cost_efficiency * 100.0,
            self.gates.min_cost_efficiency[next] * 100.0
        );
        println!(
            "    Noise Stab:     {:.0}% (need {:.0}%)",
            health.noise_stability * 100.0,
            self.gates.min_noise_stability[next] * 100.0
        );
    }
}

// ═══════════════════════════════════════════════════════════════════════════
// Viability Checklist
// ═══════════════════════════════════════════════════════════════════════════

/// The 5 viability checks that determine if the system is on an AGI trajectory.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ViabilityChecklist {
    /// Can replay runs and get identical grades
    pub deterministic_replay: bool,
    /// Improves utility over time without raising policy violations
    pub improving_without_violations: bool,
    /// Can roll back bad learning reliably
    pub reliable_rollback: bool,
    /// Can generate infinite novel tasks with automatic grading
    pub infinite_gradeable_tasks: bool,
    /// Cost per solve trending down over weeks
    pub cost_trending_down: bool,
}

impl ViabilityChecklist {
    /// Evaluate from contract health history.
    pub fn evaluate(history: &[ContractHealth]) -> Self {
        // Deterministic replay: verified externally (always true in our harness)
        let deterministic_replay = true;

        // Improving without violations: later health better than earlier, zero violations
        let improving_without_violations = if history.len() >= 2 {
            let first = &history[0];
            let last = &history[history.len() - 1];
            last.accuracy >= first.accuracy
                && last.policy_violations == 0
                && history.iter().all(|h| h.policy_violations == 0)
        } else {
            false
        };

        // Reliable rollback: rollback correctness >= 80% when attempted
        let reliable_rollback = history.iter().all(|h| h.rollback_correctness >= 0.8);

        // Infinite gradeable tasks: always true (PuzzleGenerator is unbounded)
        let infinite_gradeable_tasks = true;

        // Cost trending down: solved_per_cost increases over time
        let cost_trending_down = if history.len() >= 3 {
            let first_third: f64 = history[..history.len() / 3]
                .iter()
                .map(|h| h.solved_per_cost)
                .sum::<f64>()
                / (history.len() / 3) as f64;
            let last_third: f64 = history[history.len() * 2 / 3..]
                .iter()
                .map(|h| h.solved_per_cost)
                .sum::<f64>()
                / (history.len() - history.len() * 2 / 3) as f64;
            last_third > first_third
        } else {
            false
        };

        ViabilityChecklist {
            deterministic_replay,
            improving_without_violations,
            reliable_rollback,
            infinite_gradeable_tasks,
            cost_trending_down,
        }
    }

    pub fn all_pass(&self) -> bool {
        self.deterministic_replay
            && self.improving_without_violations
            && self.reliable_rollback
            && self.infinite_gradeable_tasks
            && self.cost_trending_down
    }

    pub fn print(&self) {
        let check = |b: bool| if b { "PASS" } else { "FAIL" };
        println!("  Viability Checklist:");
        println!(
            "    1. Deterministic replay:       {}",
            check(self.deterministic_replay)
        );
        println!(
            "    2. Improving w/o violations:    {}",
            check(self.improving_without_violations)
        );
        println!(
            "    3. Reliable rollback:           {}",
            check(self.reliable_rollback)
        );
        println!(
            "    4. Infinite gradeable tasks:    {}",
            check(self.infinite_gradeable_tasks)
        );
        println!(
            "    5. Cost trending down:          {}",
            check(self.cost_trending_down)
        );
        println!(
            "    Overall: {}",
            if self.all_pass() {
                "VIABLE AGI TRAJECTORY"
            } else {
                "NOT YET VIABLE"
            }
        );
    }
}

// ═══════════════════════════════════════════════════════════════════════════
// Tests
// ═══════════════════════════════════════════════════════════════════════════

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn contract_health_from_raw() {
        let mut raw = RawMetrics::default();
        raw.tasks_attempted = 100;
        raw.tasks_completed = 95;
        raw.tasks_correct = 92;
        raw.total_steps = 600;
        raw.noise_tasks_attempted = 30;
        raw.noise_tasks_correct = 25;
        raw.contradictions = 0; // zero contradictions for compliance
        raw.rollback_attempts = 5;
        raw.rollback_successes = 4;

        let health = ContractHealth::from_raw(&raw);
        assert!((health.accuracy - 0.92).abs() < 0.01);
        assert!((health.solved_per_cost - 92.0 / 600.0).abs() < 0.01);
        assert!((health.noise_stability - 25.0 / 30.0).abs() < 0.01);
        assert!((health.contradiction_rate).abs() < 0.001);
        assert!((health.rollback_correctness - 0.8).abs() < 0.01);
        assert!(health.compliant); // 0 violations, 0% contradictions, >=90% accuracy
    }

    #[test]
    fn contract_delta_detects_improvement() {
        let before = ContractHealth {
            solved_per_cost: 0.10,
            noise_stability: 0.70,
            contradiction_rate: 0.03,
            rollback_correctness: 0.80,
            policy_violations: 0,
            accuracy: 0.85,
            cost_efficiency: 0.50,
            compliant: false,
        };
        let after = ContractHealth {
            solved_per_cost: 0.15,
            noise_stability: 0.85,
            contradiction_rate: 0.01,
            rollback_correctness: 0.90,
            policy_violations: 0,
            accuracy: 0.93,
            cost_efficiency: 0.70,
            compliant: true,
        };
        let delta = ContractDelta::between(&before, &after);
        assert_eq!(delta.dimensions_improved, 6);
        assert_eq!(delta.dimensions_regressed, 0);
    }

    #[test]
    fn autonomy_ladder_advances() {
        let evaluator = AutonomyEvaluator::default();

        // No history => ReadOnly
        assert_eq!(evaluator.evaluate(&[]), AutonomyLevel::ReadOnly);

        // 3 compliant cycles at L1 level
        let h = ContractHealth {
            solved_per_cost: 0.15,
            noise_stability: 0.55,
            contradiction_rate: 0.04,
            rollback_correctness: 1.0,
            policy_violations: 0,
            accuracy: 0.75,
            cost_efficiency: 0.30,
            compliant: true,
        };
        let history = vec![h.clone(), h.clone(), h.clone()];
        assert_eq!(evaluator.evaluate(&history), AutonomyLevel::WriteMemory);
    }

    #[test]
    fn viability_checklist_basic() {
        let h1 = ContractHealth {
            solved_per_cost: 0.10,
            noise_stability: 0.70,
            contradiction_rate: 0.01,
            rollback_correctness: 0.90,
            policy_violations: 0,
            accuracy: 0.85,
            cost_efficiency: 0.50,
            compliant: true,
        };
        let h2 = ContractHealth {
            solved_per_cost: 0.12,
            noise_stability: 0.80,
            contradiction_rate: 0.005,
            rollback_correctness: 0.95,
            policy_violations: 0,
            accuracy: 0.90,
            cost_efficiency: 0.60,
            compliant: true,
        };
        let h3 = ContractHealth {
            solved_per_cost: 0.15,
            noise_stability: 0.85,
            contradiction_rate: 0.002,
            rollback_correctness: 0.95,
            policy_violations: 0,
            accuracy: 0.93,
            cost_efficiency: 0.70,
            compliant: true,
        };
        let viability = ViabilityChecklist::evaluate(&[h1, h2, h3]);
        assert!(viability.deterministic_replay);
        assert!(viability.improving_without_violations);
        assert!(viability.reliable_rollback);
        assert!(viability.infinite_gradeable_tasks);
        assert!(viability.cost_trending_down);
        assert!(viability.all_pass());
    }
}