Squashed 'vendor/ruvector/' content from commit b64c2172

git-subtree-dir: vendor/ruvector
git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
This commit is contained in:
ruv
2026-02-28 14:39:40 -05:00
commit d803bfe2b1
7854 changed files with 3522914 additions and 0 deletions

View File

@@ -0,0 +1,110 @@
# Cargo manifest for the ruvector benchmark suite (workspace-internal, never published).
[package]
name = "ruvector-benchmarks"
version = "0.1.0"
edition = "2021"
description = "Comprehensive benchmarks for temporal reasoning and vector operations"
publish = false
[dependencies]
# Core ruvector
ruvector-core = { path = "../../crates/ruvector-core", default-features = false, features = ["parallel"] }
# Serialization
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
# NOTE(review): 2.0.0-rc.3 is a pre-release pin — confirm this is intentional
# and bump to a stable 2.x when available.
bincode = { version = "2.0.0-rc.3", features = ["serde"] }
# Error handling
anyhow = "1.0"
thiserror = "2.0"
# Random and numerics
rand = "0.8"
rand_distr = "0.4"
# Parallel processing
rayon = "1.10"
# CLI and progress
clap = { version = "4.5", features = ["derive"] }
indicatif = "0.17"
console = "0.15"
# Async
tokio = { version = "1.41", features = ["rt-multi-thread", "sync", "macros", "time", "fs"] }
futures = "0.3"
# Time handling (critical for temporal benchmarks)
chrono = { version = "0.4", features = ["serde"] }
# Logging and tracing
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] }
# Crypto for witness chains
sha2 = "0.10"
# RVF native format integration
rvf-types = { path = "../../crates/rvf/rvf-types" }
rvf-crypto = { path = "../../crates/rvf/rvf-crypto" }
rvf-wire = { path = "../../crates/rvf/rvf-wire" }
# Statistics
statistical = "1.0"
hdrhistogram = "7.5"
# HTTP for tool-augmented tests
# NOTE(review): reqwest 0.11 predates the 0.12 line (hyper 1.x) — verify it is
# compatible with the pinned tokio 1.41 feature set.
reqwest = { version = "0.11", features = ["json"] }
# Visualization (only built with the `visualize` feature)
plotters = { version = "0.3", optional = true }
# Type theory for verified reasoning (lean-agentic)
lean-agentic = "0.1"
[dev-dependencies]
tempfile = "3.13"
[features]
default = []
visualize = ["plotters"]
# One [[bin]] entry per benchmark executable; names use kebab-case,
# source files use snake_case.
[[bin]]
name = "temporal-benchmark"
path = "src/bin/temporal_benchmark.rs"
[[bin]]
name = "vector-benchmark"
path = "src/bin/vector_benchmark.rs"
[[bin]]
name = "swarm-regret"
path = "src/bin/swarm_regret.rs"
[[bin]]
name = "timepuzzle-runner"
path = "src/bin/timepuzzle_runner.rs"
[[bin]]
name = "intelligence-assessment"
path = "src/bin/intelligence_assessment.rs"
[[bin]]
name = "rvf-intelligence-bench"
path = "src/bin/rvf_intelligence_bench.rs"
[[bin]]
name = "superintelligence"
path = "src/bin/superintelligence.rs"
[[bin]]
name = "agi-proof-harness"
path = "src/bin/agi_proof_harness.rs"
[[bin]]
name = "acceptance-rvf"
path = "src/bin/acceptance_rvf.rs"
[[bin]]
name = "wasm-solver-bench"
path = "src/bin/wasm_solver_bench.rs"

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,627 @@
//! AGI Contract — Defines intelligence as a measurable, falsifiable contract.
//!
//! The AGI contract states: a system improves utility over time without violating
//! policy, while maintaining structural health.
//!
//! ## Core Metrics (all deterministic, all auditable)
//!
//! - **Solved tasks per cost** — graded outcomes normalized by compute
//! - **Stability under noise** — accuracy retention when inputs are corrupted
//! - **Contradiction rate** — solved-but-wrong / total attempted
//! - **Rollback correctness** — recovery rate when bad inputs are detected
//! - **Policy violations** — budget overruns + contradictions (must be zero)
//!
//! ## Autonomy Ladder
//!
//! Each level requires sustained health metrics before advancement:
//! 0. Read-only (observe only)
//! 1. Write to memory (store episodes, no execution)
//! 2. Execute tools (run solver, generate puzzles)
//! 3. Write to external systems (publish results)
//! 4. Deploy and operate (self-directed improvement)
use crate::intelligence_metrics::{IntelligenceAssessment, RawMetrics};
use serde::{Deserialize, Serialize};
// ═══════════════════════════════════════════════════════════════════════════
// Contract Health Snapshot
// ═══════════════════════════════════════════════════════════════════════════
/// A single point-in-time health measurement against the AGI contract.
///
/// All rate fields are fractions in `[0.0, 1.0]`. Snapshots are produced
/// by [`ContractHealth::from_raw`] and compared over time by `ContractDelta`.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ContractHealth {
    /// Solved tasks per unit cost (tasks_correct / total_steps)
    pub solved_per_cost: f64,
    /// Accuracy on noise-injected tasks
    pub noise_stability: f64,
    /// Contradiction rate: solved-but-wrong / attempted
    pub contradiction_rate: f64,
    /// Rollback correctness: successful rollbacks / attempted rollbacks
    /// (reported as 1.0 when no rollback was ever attempted)
    pub rollback_correctness: f64,
    /// Total policy violations (must be zero for contract compliance)
    pub policy_violations: usize,
    /// Clean accuracy (graded outcome baseline)
    pub accuracy: f64,
    /// Cost efficiency (0-1, higher = cheaper per solve)
    pub cost_efficiency: f64,
    /// Whether the contract is satisfied: zero violations, contradiction
    /// rate below 1%, and accuracy of at least 90% (see `from_raw`)
    pub compliant: bool,
}
impl ContractHealth {
    /// Build a health snapshot from raw benchmark counters.
    ///
    /// Ratios with a zero denominator fall back to 0.0, except rollback
    /// correctness, which defaults to 1.0: if no rollback was ever needed,
    /// recovery is counted as perfect.
    pub fn from_raw(raw: &RawMetrics) -> Self {
        // Safe division: `fallback` is returned when the denominator is zero.
        let ratio = |num: f64, den: f64, fallback: f64| {
            if den > 0.0 {
                num / den
            } else {
                fallback
            }
        };

        let accuracy = ratio(raw.tasks_correct as f64, raw.tasks_attempted as f64, 0.0);
        let solved_per_cost = ratio(raw.tasks_correct as f64, raw.total_steps as f64, 0.0);
        let noise_stability = ratio(
            raw.noise_tasks_correct as f64,
            raw.noise_tasks_attempted as f64,
            0.0,
        );
        let contradiction_rate = ratio(raw.contradictions as f64, raw.tasks_attempted as f64, 0.0);
        let rollback_correctness = ratio(
            raw.rollback_successes as f64,
            raw.rollback_attempts as f64,
            1.0, // no rollbacks needed => perfect
        );

        // Steps needed per correct solve; pessimistic (100.0) when nothing
        // was solved, which clamps efficiency to zero below.
        let steps_per_solve = if raw.tasks_correct > 0 {
            raw.total_steps as f64 / raw.tasks_correct as f64
        } else {
            100.0
        };
        // Linear map: 5 steps/solve -> 1.0 efficiency, 100 steps/solve -> 0.0.
        let cost_efficiency = (1.0 - (steps_per_solve - 5.0) / 95.0).clamp(0.0, 1.0);

        // Contract compliance: no violations, <1% contradictions, >=90% accuracy.
        let compliant =
            raw.policy_violations == 0 && contradiction_rate < 0.01 && accuracy >= 0.90;

        ContractHealth {
            solved_per_cost,
            noise_stability,
            contradiction_rate,
            rollback_correctness,
            policy_violations: raw.policy_violations,
            accuracy,
            cost_efficiency,
            compliant,
        }
    }

    /// Evaluate contract health from an `IntelligenceAssessment`.
    pub fn from_assessment(assessment: &IntelligenceAssessment) -> Self {
        Self::from_raw(&assessment.raw_data)
    }

    /// Print a formatted contract health report to stdout.
    pub fn print(&self) {
        println!(" Contract Health:");
        println!(" Solved/Cost: {:.4}", self.solved_per_cost);
        println!(" Noise Stability: {:.2}%", self.noise_stability * 100.0);
        println!(" Contradiction Rate: {:.4}%", self.contradiction_rate * 100.0);
        println!(" Rollback Correct: {:.2}%", self.rollback_correctness * 100.0);
        println!(" Policy Violations: {}", self.policy_violations);
        println!(" Accuracy: {:.2}%", self.accuracy * 100.0);
        println!(" Cost Efficiency: {:.2}%", self.cost_efficiency * 100.0);
        println!(" Compliant: {}", if self.compliant { "YES" } else { "NO" });
    }
}
// ═══════════════════════════════════════════════════════════════════════════
// Contract Trend — compares two snapshots
// ═══════════════════════════════════════════════════════════════════════════
/// Tracks improvement across contract dimensions between two measurement points.
///
/// Produced by [`ContractDelta::between`]; deltas smaller than a small
/// epsilon are not counted as improvements or regressions.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ContractDelta {
    /// Change in solved-per-cost (positive = improving)
    pub solved_per_cost_delta: f64,
    /// Change in noise stability (positive = more robust)
    pub noise_stability_delta: f64,
    /// Change in contradiction rate (negative = improving)
    pub contradiction_rate_delta: f64,
    /// Change in rollback correctness (positive = better recovery)
    pub rollback_delta: f64,
    /// Change in accuracy (positive = better)
    pub accuracy_delta: f64,
    /// Change in cost efficiency (positive = cheaper)
    pub cost_efficiency_delta: f64,
    /// Number of dimensions (out of 6) that improved
    pub dimensions_improved: usize,
    /// Number of dimensions (out of 6) that regressed
    pub dimensions_regressed: usize,
}
impl ContractDelta {
    /// Threshold below which a change is treated as float noise rather than
    /// a genuine improvement or regression.
    const EPSILON: f64 = 0.001;

    /// Compute the delta between two health snapshots (`after - before`).
    pub fn between(before: &ContractHealth, after: &ContractHealth) -> Self {
        let solved_per_cost_delta = after.solved_per_cost - before.solved_per_cost;
        let noise_stability_delta = after.noise_stability - before.noise_stability;
        let contradiction_rate_delta = after.contradiction_rate - before.contradiction_rate;
        let rollback_delta = after.rollback_correctness - before.rollback_correctness;
        let accuracy_delta = after.accuracy - before.accuracy;
        let cost_efficiency_delta = after.cost_efficiency - before.cost_efficiency;
        // Count improvements (positive is better for all except contradiction_rate)
        let deltas = [
            solved_per_cost_delta > Self::EPSILON,
            noise_stability_delta > Self::EPSILON,
            contradiction_rate_delta < -Self::EPSILON, // decrease = improvement
            rollback_delta > Self::EPSILON,
            accuracy_delta > Self::EPSILON,
            cost_efficiency_delta > Self::EPSILON,
        ];
        // FIX: the accuracy regression threshold was -0.01 while every other
        // dimension used -0.001, so small accuracy drops were silently
        // ignored. All six dimensions now share the same epsilon.
        let regressions = [
            solved_per_cost_delta < -Self::EPSILON,
            noise_stability_delta < -Self::EPSILON,
            contradiction_rate_delta > Self::EPSILON, // increase = regression
            rollback_delta < -Self::EPSILON,
            accuracy_delta < -Self::EPSILON,
            cost_efficiency_delta < -Self::EPSILON,
        ];
        ContractDelta {
            solved_per_cost_delta,
            noise_stability_delta,
            contradiction_rate_delta,
            rollback_delta,
            accuracy_delta,
            cost_efficiency_delta,
            dimensions_improved: deltas.iter().filter(|&&d| d).count(),
            dimensions_regressed: regressions.iter().filter(|&&r| r).count(),
        }
    }

    /// Print a formatted delta report; each line carries a `+`/`=`/`-`
    /// trend marker (direction-aware for contradiction rate).
    pub fn print(&self) {
        // `invert` flips the sign convention for dimensions where a
        // decrease is the improvement (contradiction rate).
        let arrow = |v: f64, invert: bool| {
            let positive = if invert { v < 0.0 } else { v > 0.0 };
            if positive {
                "+"
            } else if v == 0.0 {
                "="
            } else {
                "-"
            }
        };
        println!(" Contract Delta:");
        println!(
            " Solved/Cost: {:>+.4} [{}]",
            self.solved_per_cost_delta,
            arrow(self.solved_per_cost_delta, false)
        );
        println!(
            " Noise Stability: {:>+.4} [{}]",
            self.noise_stability_delta,
            arrow(self.noise_stability_delta, false)
        );
        println!(
            " Contradiction: {:>+.4} [{}]",
            self.contradiction_rate_delta,
            arrow(self.contradiction_rate_delta, true)
        );
        println!(
            " Rollback: {:>+.4} [{}]",
            self.rollback_delta,
            arrow(self.rollback_delta, false)
        );
        println!(
            " Accuracy: {:>+.4} [{}]",
            self.accuracy_delta,
            arrow(self.accuracy_delta, false)
        );
        println!(
            " Cost Efficiency: {:>+.4} [{}]",
            self.cost_efficiency_delta,
            arrow(self.cost_efficiency_delta, false)
        );
        println!(" Dimensions improved: {}/6", self.dimensions_improved);
        println!(" Dimensions regressed: {}/6", self.dimensions_regressed);
    }
}
// ═══════════════════════════════════════════════════════════════════════════
// Autonomy Ladder
// ═══════════════════════════════════════════════════════════════════════════
/// Autonomy level gated by sustained contract health.
///
/// Discriminant values double as indices into the per-level threshold
/// arrays in [`AutonomyGates`]. `Ord` is derived so levels compare by
/// increasing autonomy.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub enum AutonomyLevel {
    /// Level 0: Read-only observation
    ReadOnly = 0,
    /// Level 1: Write to memory (store episodes)
    WriteMemory = 1,
    /// Level 2: Execute tools (run solver)
    ExecuteTools = 2,
    /// Level 3: Write to external systems (publish results)
    WriteExternal = 3,
    /// Level 4: Deploy and operate (self-directed improvement)
    DeployOperate = 4,
}
/// Thresholds for advancing autonomy levels.
///
/// Each `[f64; 5]` array is indexed by the `AutonomyLevel` discriminant
/// (0 = ReadOnly … 4 = DeployOperate).
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct AutonomyGates {
    /// Minimum consecutive compliant cycles to advance
    pub min_compliant_cycles: usize,
    /// Maximum allowed contradiction rate per level
    pub max_contradiction_rate: [f64; 5],
    /// Minimum accuracy per level
    pub min_accuracy: [f64; 5],
    /// Minimum cost efficiency per level
    pub min_cost_efficiency: [f64; 5],
    /// Minimum noise stability per level
    pub min_noise_stability: [f64; 5],
    /// Levels at or above this value additionally require zero policy
    /// violations (see `AutonomyEvaluator::evaluate`)
    pub zero_violations_above: AutonomyLevel,
}
impl Default for AutonomyGates {
    /// Default gate table: thresholds tighten monotonically with each
    /// autonomy level; level 0 (read-only) is effectively ungated.
    fn default() -> Self {
        Self {
            min_compliant_cycles: 3,
            // L0 L1 L2 L3 L4
            max_contradiction_rate: [1.0, 0.05, 0.02, 0.01, 0.005],
            min_accuracy: [0.0, 0.70, 0.85, 0.92, 0.96],
            min_cost_efficiency: [0.0, 0.20, 0.40, 0.60, 0.75],
            min_noise_stability: [0.0, 0.50, 0.65, 0.80, 0.90],
            zero_violations_above: AutonomyLevel::ExecuteTools,
        }
    }
}
/// Evaluator that determines current autonomy level from contract history.
///
/// Stateless apart from its gate configuration; see [`AutonomyGates`].
pub struct AutonomyEvaluator {
    /// Per-level thresholds used by `evaluate`.
    pub gates: AutonomyGates,
}
impl Default for AutonomyEvaluator {
fn default() -> Self {
Self {
gates: AutonomyGates::default(),
}
}
}
impl AutonomyEvaluator {
    /// Determine the highest autonomy level supported by the health history.
    /// `history` is ordered oldest-first.
    ///
    /// A level is granted only when every one of the last
    /// `min_compliant_cycles` snapshots satisfies that level's gates;
    /// evaluation stops at the first level whose gates fail, so levels
    /// cannot be skipped.
    pub fn evaluate(&self, history: &[ContractHealth]) -> AutonomyLevel {
        let required = self.gates.min_compliant_cycles;
        // Insufficient (or no) history: stay read-only. (The `required`
        // check was previously re-evaluated inside the loop even though
        // it is loop-invariant.)
        if history.is_empty() || history.len() < required {
            return AutonomyLevel::ReadOnly;
        }
        // The recent window is the same for every candidate level — hoisted.
        let recent = &history[history.len() - required..];
        let levels = [
            AutonomyLevel::WriteMemory,
            AutonomyLevel::ExecuteTools,
            AutonomyLevel::WriteExternal,
            AutonomyLevel::DeployOperate,
        ];
        let mut level = AutonomyLevel::ReadOnly;
        for &candidate in &levels {
            let idx = candidate as usize;
            let all_pass = recent.iter().all(|h| {
                h.accuracy >= self.gates.min_accuracy[idx]
                    && h.contradiction_rate <= self.gates.max_contradiction_rate[idx]
                    && h.cost_efficiency >= self.gates.min_cost_efficiency[idx]
                    && h.noise_stability >= self.gates.min_noise_stability[idx]
                    && (candidate < self.gates.zero_violations_above || h.policy_violations == 0)
            });
            if all_pass {
                level = candidate;
            } else {
                break; // gates for this level failed — cannot advance further
            }
        }
        level
    }

    /// Print the current autonomy level and the gate values required for
    /// the next level. At the top level (4) the "next level" gates shown
    /// are level 4's own gates, since the index clamps at 4.
    pub fn print_status(&self, level: AutonomyLevel, health: &ContractHealth) {
        let labels = [
            "Read-Only",
            "Write Memory",
            "Execute Tools",
            "Write External",
            "Deploy & Operate",
        ];
        println!(
            " Autonomy Level: {} ({})",
            level as usize, labels[level as usize]
        );
        println!(" Gates for next level:");
        let next = (level as usize + 1).min(4);
        println!(
            " Accuracy: {:.0}% (need {:.0}%)",
            health.accuracy * 100.0,
            self.gates.min_accuracy[next] * 100.0
        );
        println!(
            " Contradiction: {:.3}% (need <{:.3}%)",
            health.contradiction_rate * 100.0,
            self.gates.max_contradiction_rate[next] * 100.0
        );
        println!(
            " Cost Eff: {:.0}% (need {:.0}%)",
            health.cost_efficiency * 100.0,
            self.gates.min_cost_efficiency[next] * 100.0
        );
        println!(
            " Noise Stab: {:.0}% (need {:.0}%)",
            health.noise_stability * 100.0,
            self.gates.min_noise_stability[next] * 100.0
        );
    }
}
// ═══════════════════════════════════════════════════════════════════════════
// Viability Checklist
// ═══════════════════════════════════════════════════════════════════════════
/// The 5 viability checks that determine if the system is on an AGI trajectory.
///
/// Evaluated from a contract health history via [`ViabilityChecklist::evaluate`];
/// all five must hold for [`ViabilityChecklist::all_pass`] to return true.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ViabilityChecklist {
    /// Can replay runs and get identical grades
    pub deterministic_replay: bool,
    /// Improves utility over time without raising policy violations
    pub improving_without_violations: bool,
    /// Can roll back bad learning reliably
    pub reliable_rollback: bool,
    /// Can generate infinite novel tasks with automatic grading
    pub infinite_gradeable_tasks: bool,
    /// Cost per solve trending down over weeks
    pub cost_trending_down: bool,
}
impl ViabilityChecklist {
    /// Evaluate the checklist from a contract health history
    /// (ordered oldest-first).
    pub fn evaluate(history: &[ContractHealth]) -> Self {
        // Deterministic replay: verified externally (always true in our harness)
        let deterministic_replay = true;
        // Improving without violations: final accuracy is at least the
        // initial accuracy, and no snapshot ever recorded a violation.
        // (The former explicit `last.policy_violations == 0` check was
        // redundant — the `all` over the whole history covers the last
        // element — and has been removed.)
        let improving_without_violations = if history.len() >= 2 {
            let first = &history[0];
            let last = &history[history.len() - 1];
            last.accuracy >= first.accuracy
                && history.iter().all(|h| h.policy_violations == 0)
        } else {
            false
        };
        // Reliable rollback: rollback correctness >= 80% in every snapshot.
        let reliable_rollback = history.iter().all(|h| h.rollback_correctness >= 0.8);
        // Infinite gradeable tasks: always true (PuzzleGenerator is unbounded)
        let infinite_gradeable_tasks = true;
        // Cost trending down: mean solved-per-cost over the last third of
        // the history exceeds the mean over the first third. `len >= 3`
        // guarantees both windows are non-empty, so neither mean divides
        // by zero.
        let cost_trending_down = if history.len() >= 3 {
            let first_len = history.len() / 3;
            let first_third: f64 = history[..first_len]
                .iter()
                .map(|h| h.solved_per_cost)
                .sum::<f64>()
                / first_len as f64;
            let last_start = history.len() * 2 / 3;
            let last_third: f64 = history[last_start..]
                .iter()
                .map(|h| h.solved_per_cost)
                .sum::<f64>()
                / (history.len() - last_start) as f64;
            last_third > first_third
        } else {
            false
        };
        ViabilityChecklist {
            deterministic_replay,
            improving_without_violations,
            reliable_rollback,
            infinite_gradeable_tasks,
            cost_trending_down,
        }
    }

    /// True only when every one of the five checks passes.
    pub fn all_pass(&self) -> bool {
        self.deterministic_replay
            && self.improving_without_violations
            && self.reliable_rollback
            && self.infinite_gradeable_tasks
            && self.cost_trending_down
    }

    /// Print a PASS/FAIL line per check plus the overall verdict.
    pub fn print(&self) {
        let check = |b: bool| if b { "PASS" } else { "FAIL" };
        println!(" Viability Checklist:");
        println!(
            " 1. Deterministic replay: {}",
            check(self.deterministic_replay)
        );
        println!(
            " 2. Improving w/o violations: {}",
            check(self.improving_without_violations)
        );
        println!(
            " 3. Reliable rollback: {}",
            check(self.reliable_rollback)
        );
        println!(
            " 4. Infinite gradeable tasks: {}",
            check(self.infinite_gradeable_tasks)
        );
        println!(
            " 5. Cost trending down: {}",
            check(self.cost_trending_down)
        );
        println!(
            " Overall: {}",
            if self.all_pass() {
                "VIABLE AGI TRAJECTORY"
            } else {
                "NOT YET VIABLE"
            }
        );
    }
}
// ═══════════════════════════════════════════════════════════════════════════
// Tests
// ═══════════════════════════════════════════════════════════════════════════
#[cfg(test)]
mod tests {
    use super::*;

    /// `from_raw` computes every ratio from the counters and flags a
    /// compliant snapshot when violations are zero, contradictions are
    /// below 1%, and accuracy is at least 90%.
    #[test]
    fn contract_health_from_raw() {
        let mut raw = RawMetrics::default();
        raw.tasks_attempted = 100;
        raw.tasks_completed = 95;
        raw.tasks_correct = 92;
        raw.total_steps = 600;
        raw.noise_tasks_attempted = 30;
        raw.noise_tasks_correct = 25;
        raw.contradictions = 0; // zero contradictions for compliance
        raw.rollback_attempts = 5;
        raw.rollback_successes = 4;
        let health = ContractHealth::from_raw(&raw);
        assert!((health.accuracy - 0.92).abs() < 0.01);
        assert!((health.solved_per_cost - 92.0 / 600.0).abs() < 0.01);
        assert!((health.noise_stability - 25.0 / 30.0).abs() < 0.01);
        assert!((health.contradiction_rate).abs() < 0.001);
        assert!((health.rollback_correctness - 0.8).abs() < 0.01);
        assert!(health.compliant); // 0 violations, 0% contradictions, >=90% accuracy
    }

    /// When every dimension moves in its "better" direction between two
    /// snapshots, all six count as improved and none as regressed.
    #[test]
    fn contract_delta_detects_improvement() {
        let before = ContractHealth {
            solved_per_cost: 0.10,
            noise_stability: 0.70,
            contradiction_rate: 0.03,
            rollback_correctness: 0.80,
            policy_violations: 0,
            accuracy: 0.85,
            cost_efficiency: 0.50,
            compliant: false,
        };
        let after = ContractHealth {
            solved_per_cost: 0.15,
            noise_stability: 0.85,
            contradiction_rate: 0.01,
            rollback_correctness: 0.90,
            policy_violations: 0,
            accuracy: 0.93,
            cost_efficiency: 0.70,
            compliant: true,
        };
        let delta = ContractDelta::between(&before, &after);
        assert_eq!(delta.dimensions_improved, 6);
        assert_eq!(delta.dimensions_regressed, 0);
    }

    /// Empty history stays at ReadOnly; three consecutive snapshots that
    /// clear the L1 gates (but not L2's) advance exactly one level.
    #[test]
    fn autonomy_ladder_advances() {
        let evaluator = AutonomyEvaluator::default();
        // No history => ReadOnly
        assert_eq!(evaluator.evaluate(&[]), AutonomyLevel::ReadOnly);
        // 3 compliant cycles at L1 level
        let h = ContractHealth {
            solved_per_cost: 0.15,
            noise_stability: 0.55,
            contradiction_rate: 0.04,
            rollback_correctness: 1.0,
            policy_violations: 0,
            accuracy: 0.75,
            cost_efficiency: 0.30,
            compliant: true,
        };
        let history = vec![h.clone(), h.clone(), h.clone()];
        assert_eq!(evaluator.evaluate(&history), AutonomyLevel::WriteMemory);
    }

    /// A monotonically improving, violation-free 3-snapshot history passes
    /// all five viability checks.
    #[test]
    fn viability_checklist_basic() {
        let h1 = ContractHealth {
            solved_per_cost: 0.10,
            noise_stability: 0.70,
            contradiction_rate: 0.01,
            rollback_correctness: 0.90,
            policy_violations: 0,
            accuracy: 0.85,
            cost_efficiency: 0.50,
            compliant: true,
        };
        let h2 = ContractHealth {
            solved_per_cost: 0.12,
            noise_stability: 0.80,
            contradiction_rate: 0.005,
            rollback_correctness: 0.95,
            policy_violations: 0,
            accuracy: 0.90,
            cost_efficiency: 0.60,
            compliant: true,
        };
        let h3 = ContractHealth {
            solved_per_cost: 0.15,
            noise_stability: 0.85,
            contradiction_rate: 0.002,
            rollback_correctness: 0.95,
            policy_violations: 0,
            accuracy: 0.93,
            cost_efficiency: 0.70,
            compliant: true,
        };
        let viability = ViabilityChecklist::evaluate(&[h1, h2, h3]);
        assert!(viability.deterministic_replay);
        assert!(viability.improving_without_violations);
        assert!(viability.reliable_rollback);
        assert!(viability.infinite_gradeable_tasks);
        assert!(viability.cost_trending_down);
        assert!(viability.all_pass());
    }
}

View File

@@ -0,0 +1,166 @@
//! Publishable RVF Acceptance Test — CLI entry point.
//!
//! Generates or verifies a deterministic acceptance test manifest with
//! SHAKE-256 witness chain (rvf-crypto native). Same seed → same outcomes
//! → same root hash.
//!
//! ```bash
//! # Generate manifest (JSON + .rvf binary)
//! cargo run --bin acceptance-rvf -- generate -o manifest.json
//!
//! # Generate with custom config
//! cargo run --bin acceptance-rvf -- generate -o manifest.json \
//! --holdout 200 --training 200 --cycles 5
//!
//! # Verify a manifest (re-runs and compares root hash)
//! cargo run --bin acceptance-rvf -- verify -i manifest.json
//!
//! # Verify the .rvf binary witness chain
//! cargo run --bin acceptance-rvf -- verify-rvf -i acceptance_manifest.rvf
//! ```
use clap::{Parser, Subcommand};
use ruvector_benchmarks::acceptance_test::HoldoutConfig;
use ruvector_benchmarks::publishable_rvf::{
generate_manifest_with_rvf, verify_manifest, verify_rvf_binary,
};
/// Top-level CLI: dispatches to one of the `Commands` subcommands.
#[derive(Parser)]
#[command(name = "acceptance-rvf")]
#[command(about = "Publishable RVF acceptance test with SHAKE-256 witness chain")]
struct Cli {
    /// Subcommand to run (generate / verify / verify-rvf).
    #[command(subcommand)]
    command: Commands,
}
/// Subcommands for the acceptance-rvf binary.
#[derive(Subcommand)]
enum Commands {
    /// Generate a new acceptance test manifest (JSON + .rvf binary)
    Generate {
        /// Output JSON file path
        #[arg(short, long, default_value = "acceptance_manifest.json")]
        output: String,
        /// Holdout set size
        #[arg(long, default_value_t = 200)]
        holdout: usize,
        /// Training puzzles per cycle
        #[arg(long, default_value_t = 200)]
        training: usize,
        /// Number of training cycles
        #[arg(long, default_value_t = 5)]
        cycles: usize,
        /// Step budget per puzzle
        #[arg(long, default_value_t = 400)]
        budget: usize,
        /// Verbose output
        #[arg(short, long)]
        verbose: bool,
    },
    /// Verify an existing manifest by replaying and comparing root hash
    Verify {
        /// Input JSON file path
        #[arg(short, long)]
        input: String,
    },
    /// Verify a native .rvf binary witness chain
    VerifyRvf {
        /// Input .rvf file path
        #[arg(short, long)]
        input: String,
    },
}
/// CLI entry point: generate a manifest, verify a JSON manifest by replay,
/// or verify a .rvf binary witness chain. Exits with status 0 on success
/// and 1 on any failed check.
fn main() -> anyhow::Result<()> {
    let cli = Cli::parse();
    match cli.command {
        Commands::Generate {
            output,
            holdout,
            training,
            cycles,
            budget,
            verbose,
        } => {
            let config = HoldoutConfig {
                holdout_size: holdout,
                training_per_cycle: training,
                cycles,
                step_budget: budget,
                min_accuracy: 0.50,
                min_dimensions_improved: 1,
                verbose,
                ..Default::default()
            };
            // Derive the .rvf path by swapping the output's file extension.
            // FIX: the previous `output.replace(".json", ".rvf")` rewrote
            // every occurrence of ".json" anywhere in the path, and when the
            // output had no ".json" suffix at all it produced the SAME path,
            // so the JSON write below would clobber the .rvf binary.
            let rvf_path = std::path::Path::new(&output)
                .with_extension("rvf")
                .to_string_lossy()
                .into_owned();
            println!("Generating acceptance test manifest...");
            println!(
                " holdout={}, training={}, cycles={}, budget={}",
                holdout, training, cycles, budget
            );
            println!();
            let manifest = generate_manifest_with_rvf(&config, Some(&rvf_path))?;
            manifest.print_summary();
            let json = serde_json::to_string_pretty(&manifest)?;
            std::fs::write(&output, &json)?;
            println!(" JSON manifest: {}", output);
            println!(" RVF binary: {}", rvf_path);
            println!(" Chain root hash: {}", manifest.chain_root_hash);
            println!();
            // Exit code signals pass/fail to CI.
            if manifest.all_passed {
                std::process::exit(0);
            } else {
                std::process::exit(1);
            }
        }
        Commands::Verify { input } => {
            println!("Loading manifest from: {}", input);
            let json = std::fs::read_to_string(&input)?;
            let manifest: ruvector_benchmarks::publishable_rvf::RvfManifest =
                serde_json::from_str(&json)?;
            println!(" Chain length: {}", manifest.chain_length);
            // Show only a hash prefix; guard against hashes shorter than 32 chars.
            println!(
                " Expected root: {}",
                &manifest.chain_root_hash[..32.min(manifest.chain_root_hash.len())]
            );
            println!();
            println!("Re-running acceptance test with same config...");
            let result = verify_manifest(&manifest)?;
            result.print();
            if result.passed() {
                println!(" VERIFICATION: PASSED — outcomes are identical");
                std::process::exit(0);
            } else {
                println!(" VERIFICATION: FAILED — outcomes differ");
                std::process::exit(1);
            }
        }
        Commands::VerifyRvf { input } => {
            println!("Verifying .rvf witness chain: {}", input);
            match verify_rvf_binary(&input) {
                Ok(count) => {
                    println!(" WITNESS_SEG verified: {} entries, chain intact", count);
                    std::process::exit(0);
                }
                Err(e) => {
                    println!(" VERIFICATION FAILED: {}", e);
                    std::process::exit(1);
                }
            }
        }
    }
}

View File

@@ -0,0 +1,204 @@
//! AGI Proof Harness — Nightly runner that publishes contract metrics.
//!
//! Publishes:
//! - Success rate
//! - Cost per solve
//! - Robustness under noise
//! - Policy compliance
//! - Contradiction rate
//! - Rollback correctness
//! - Viability checklist status
//! - Autonomy level
//!
//! Usage:
//! cargo run --bin agi-proof-harness
//! cargo run --bin agi-proof-harness -- --holdout 1000 --cycles 10 --verbose
//! cargo run --bin agi-proof-harness -- --full # 10K training, 1K holdout, 10 cycles
use anyhow::Result;
use clap::Parser;
use ruvector_benchmarks::acceptance_test::{
run_ablation_comparison, run_acceptance_test, HoldoutConfig,
};
use ruvector_benchmarks::agi_contract::{AutonomyEvaluator, ContractHealth, ViabilityChecklist};
use ruvector_benchmarks::intelligence_metrics::IntelligenceCalculator;
use ruvector_benchmarks::superintelligence::{run_pathway, SIConfig};
/// CLI arguments for the proof harness; `--full` overrides the sizing
/// flags with the 10K-training / 1K-holdout / 10-cycle configuration.
#[derive(Parser, Debug)]
#[command(name = "agi-proof-harness")]
#[command(about = "AGI contract proof harness — publishes nightly metrics")]
struct Args {
    /// Holdout evaluation set size
    #[arg(long, default_value = "200")]
    holdout: usize,
    /// Training tasks per cycle
    #[arg(long, default_value = "200")]
    training: usize,
    /// Number of improvement cycles
    #[arg(long, default_value = "5")]
    cycles: usize,
    /// Frozen holdout seed (default 3735928559 == 0xDEADBEEF)
    #[arg(long, default_value = "3735928559")]
    holdout_seed: u64,
    /// Training seed
    #[arg(long, default_value = "42")]
    training_seed: u64,
    /// Noise injection rate (fraction of tasks, 0.0-1.0)
    #[arg(long, default_value = "0.25")]
    noise: f64,
    /// Step budget per task
    #[arg(long, default_value = "400")]
    step_budget: usize,
    /// Full acceptance test (10K training, 1K holdout, 10 cycles)
    #[arg(long)]
    full: bool,
    /// Minimum accuracy threshold (ignored when --full is set, which uses 0.95)
    #[arg(long, default_value = "0.80")]
    min_accuracy: f64,
    /// Run three-mode ablation comparison (A/B/C)
    #[arg(long)]
    ablation: bool,
    /// Also run the 5-level SI pathway
    #[arg(long)]
    pathway: bool,
    /// Verbose output
    #[arg(short, long)]
    verbose: bool,
}
/// Harness entry point: runs the acceptance test, optionally the ablation
/// comparison and SI pathway, then prints contract health, autonomy level,
/// and the viability checklist for the final cycle.
fn main() -> Result<()> {
    let args = Args::parse();
    println!();
    println!("╔══════════════════════════════════════════════════════════════╗");
    println!("║ AGI PROOF HARNESS ║");
    println!("║ Contract-based intelligence measurement ║");
    println!("╚══════════════════════════════════════════════════════════════╝");
    println!();
    // `--full` pins the heavyweight configuration; otherwise the sizing
    // flags are taken from the CLI. Seeds and noise come from the CLI in
    // both modes.
    let config = if args.full {
        HoldoutConfig {
            holdout_size: 1000,
            training_per_cycle: 1000,
            cycles: 10,
            holdout_seed: args.holdout_seed,
            training_seed: args.training_seed,
            noise_rate: args.noise,
            step_budget: args.step_budget,
            min_accuracy: 0.95,
            min_dimensions_improved: 2,
            verbose: args.verbose,
        }
    } else {
        HoldoutConfig {
            holdout_size: args.holdout,
            training_per_cycle: args.training,
            cycles: args.cycles,
            holdout_seed: args.holdout_seed,
            training_seed: args.training_seed,
            noise_rate: args.noise,
            step_budget: args.step_budget,
            min_accuracy: args.min_accuracy,
            min_dimensions_improved: 2,
            verbose: args.verbose,
        }
    };
    println!(
        " Config: holdout={}, training/cycle={}, cycles={}, noise={:.0}%",
        config.holdout_size,
        config.training_per_cycle,
        config.cycles,
        config.noise_rate * 100.0
    );
    println!(
        " Seeds: holdout=0x{:X}, training={}",
        config.holdout_seed, config.training_seed
    );
    println!();
    // ─── Run Acceptance Test ─────────────────────────────────────────
    println!(" Running acceptance test...");
    let result = run_acceptance_test(&config)?;
    result.print();
    // ─── Ablation Comparison ─────────────────────────────────────────
    if args.ablation {
        println!(" Running ablation comparison (A / B / C)...");
        let comparison = run_ablation_comparison(&config)?;
        comparison.print();
    }
    // ─── Contract Health Summary ─────────────────────────────────────
    if let Some(last_cycle) = result.cycles.last() {
        println!();
        last_cycle.contract_health.print();
        // ─── Autonomy Level ──────────────────────────────────────────
        let health_history: Vec<ContractHealth> = result
            .cycles
            .iter()
            .map(|c| c.contract_health.clone())
            .collect();
        let evaluator = AutonomyEvaluator::default();
        let level = evaluator.evaluate(&health_history);
        println!();
        evaluator.print_status(level, &last_cycle.contract_health);
        // ─── Viability Checklist ─────────────────────────────────────
        let viability = ViabilityChecklist::evaluate(&health_history);
        println!();
        viability.print();
    }
    // ─── Optional: SI Pathway ────────────────────────────────────────
    if args.pathway {
        println!();
        println!(" Running 5-level SI pathway...");
        let si_config = SIConfig {
            episodes_per_level: 6,
            tasks_per_episode: 15,
            verbose: args.verbose,
            ..Default::default()
        };
        let pathway_result = run_pathway(&si_config)?;
        pathway_result.print();
        // Show contract health for peak level.
        // FIX: `partial_cmp(..).unwrap()` would panic if any iq_score were
        // NaN; `f64::total_cmp` gives a total order and cannot panic.
        if let Some(peak) = pathway_result
            .levels
            .iter()
            .max_by(|a, b| a.iq_score.total_cmp(&b.iq_score))
        {
            let health = ContractHealth::from_raw(&peak.raw_metrics);
            println!(" Peak Level ({}) Contract:", peak.name);
            health.print();
            let calculator = IntelligenceCalculator::default();
            let assessment = calculator.calculate(&peak.raw_metrics);
            println!(" Multi-dimensional IQ: {:.1}", assessment.overall_score);
            println!(
                " Cost efficiency: {:.2}",
                assessment.cost.cost_efficiency
            );
            println!(
                " Robustness score: {:.2}",
                assessment.robustness.robustness_score
            );
        }
    }
    println!();
    Ok(())
}

View File

@@ -0,0 +1,355 @@
//! Intelligence Assessment Runner
//!
//! Runs comprehensive intelligence assessment across all benchmark types.
//!
//! Usage:
//! cargo run --bin intelligence-assessment -- --episodes 10 --puzzles 50
use anyhow::Result;
use clap::Parser;
use ruvector_benchmarks::{
intelligence_metrics::{
print_intelligence_report, DifficultyStats, EpisodeMetrics, IntelligenceCalculator,
RawMetrics,
},
swarm_regret::SwarmController,
temporal::{AdaptiveSolver, TemporalSolver},
timepuzzles::{PuzzleGenerator, PuzzleGeneratorConfig},
};
/// CLI arguments for the intelligence assessment runner.
#[derive(Parser, Debug)]
#[command(name = "intelligence-assessment")]
#[command(about = "Run comprehensive intelligence assessment")]
struct Args {
    /// Number of episodes for regret tracking
    #[arg(short, long, default_value = "10")]
    episodes: usize,
    /// Tasks per episode
    #[arg(short, long, default_value = "10")]
    tasks_per_episode: usize,
    /// Enable calendar tool
    // NOTE(review): a bool arg with `default_value = "true"` — confirm how
    // clap 4 parses this (whether `--calendar false` is accepted, or the
    // flag can only re-assert true).
    #[arg(long, default_value = "true")]
    calendar: bool,
    /// Enable adaptive learning (ReasoningBank)
    // NOTE(review): same bool/default_value caveat as `calendar`.
    #[arg(long, default_value = "true")]
    adaptive: bool,
    /// Random seed (None => non-deterministic puzzle generation)
    #[arg(long)]
    seed: Option<u64>,
    /// Verbose output
    #[arg(short, long)]
    verbose: bool,
}
/// Entry point for the comprehensive intelligence assessment.
///
/// Runs `args.episodes` episodes of generated temporal puzzles through either
/// the adaptive (`AdaptiveSolver`) or the basic (`TemporalSolver`) pipeline,
/// accumulates `RawMetrics`, scores them with `IntelligenceCalculator`, and
/// prints a graded report with recommendations.
fn main() -> Result<()> {
    let args = Args::parse();
    println!("╔══════════════════════════════════════════════════════════════╗");
    println!("║            Comprehensive Intelligence Assessment             ║");
    println!("║     Measuring Reasoning, Learning & Cognitive Abilities      ║");
    println!("╚══════════════════════════════════════════════════════════════╝");
    println!();
    // Initialize metrics collector
    let mut raw_metrics = RawMetrics::default();
    // Initialize components
    let mut controller = SwarmController::new(args.tasks_per_episode);
    // Choose solver based on adaptive flag: exactly one of the two is Some.
    let mut adaptive_solver = if args.adaptive {
        Some(AdaptiveSolver::new())
    } else {
        None
    };
    let mut basic_solver = if !args.adaptive {
        let mut s = TemporalSolver::with_tools(args.calendar, false);
        s.max_steps = 100;
        Some(s)
    } else {
        None
    };
    let puzzle_config = PuzzleGeneratorConfig {
        min_difficulty: 1,
        max_difficulty: 10,
        constraint_density: 3,
        seed: args.seed,
        ..Default::default()
    };
    println!("🔧 Configuration:");
    println!("    Episodes:          {}", args.episodes);
    println!("    Tasks/episode:     {}", args.tasks_per_episode);
    println!("    Calendar tool:     {}", args.calendar);
    // BUG FIX: the original format string had no space after the colon
    // ("Adaptive learning:{}"), misaligning this value with the lines above.
    println!("    Adaptive learning: {}", args.adaptive);
    println!();
    println!("🏃 Running assessment...");
    println!();
    // Run episodes
    for ep in 0..args.episodes {
        controller.start_episode();
        // Generate a fresh batch of puzzles for this episode
        let mut generator = PuzzleGenerator::new(puzzle_config.clone());
        let puzzles = generator.generate_batch(args.tasks_per_episode)?;
        let mut solved = 0;
        let mut correct = 0;
        let mut total_steps = 0;
        let mut total_tool_calls = 0;
        let mut total_latency = 0u64;
        // Solve puzzles and collect metrics
        for puzzle in &puzzles {
            raw_metrics.tasks_attempted += 1;
            // Use adaptive or basic solver; one of them was built above.
            let result = if let Some(ref mut solver) = adaptive_solver {
                solver.solve(puzzle)?
            } else if let Some(ref mut solver) = basic_solver {
                solver.solve(puzzle)?
            } else {
                // Exactly one solver is constructed from `args.adaptive`.
                unreachable!()
            };
            if result.solved {
                solved += 1;
                raw_metrics.tasks_completed += 1;
            }
            if result.correct {
                correct += 1;
                raw_metrics.tasks_correct += 1;
            }
            total_steps += result.steps;
            total_tool_calls += result.tool_calls;
            total_latency += result.latency_ms;
            raw_metrics.total_steps += result.steps;
            raw_metrics.total_tool_calls += result.tool_calls;
            raw_metrics.total_latency_ms += result.latency_ms;
            // Track by difficulty
            let entry = raw_metrics
                .by_difficulty
                .entry(puzzle.difficulty)
                .or_insert(DifficultyStats {
                    attempted: 0,
                    completed: 0,
                    correct: 0,
                    avg_steps: 0.0,
                });
            entry.attempted += 1;
            if result.solved {
                entry.completed += 1;
            }
            if result.correct {
                entry.correct += 1;
            }
        }
        // Record episode for swarm controller
        controller.complete_episode(
            solved,
            correct,
            total_steps,
            total_tool_calls,
            total_latency,
        );
        // Record episode metrics
        let episode_accuracy = if args.tasks_per_episode > 0 {
            correct as f64 / args.tasks_per_episode as f64
        } else {
            0.0
        };
        let last_ep = controller
            .regret
            .episodes
            .last()
            .expect("complete_episode() records an episode");
        raw_metrics.episodes.push(EpisodeMetrics {
            episode: ep + 1,
            accuracy: episode_accuracy,
            reward: last_ep.reward,
            regret: last_ep.regret(),
            cumulative_regret: controller.regret.current_cumulative_regret(),
        });
        if args.verbose {
            println!(
                "  Episode {:2}: Accuracy {:.1}%, Regret {:.2}",
                ep + 1,
                episode_accuracy * 100.0,
                last_ep.regret()
            );
        } else {
            // Quiet mode: one progress dot per episode.
            print!(".");
            use std::io::Write;
            std::io::stdout().flush()?;
        }
    }
    if !args.verbose {
        println!();
    }
    println!();
    // Update difficulty stats with average steps
    for (_, stats) in raw_metrics.by_difficulty.iter_mut() {
        if stats.attempted > 0 {
            // This is a simplification - the global average is applied to
            // every bucket; exact figures would require per-difficulty
            // step tracking.
            stats.avg_steps = raw_metrics.total_steps as f64 / raw_metrics.tasks_attempted as f64;
        }
    }
    // Calculate intelligence assessment
    let calculator = IntelligenceCalculator::default();
    let assessment = calculator.calculate(&raw_metrics);
    // Print report
    print_intelligence_report(&assessment);
    // Additional insights
    println!();
    println!("╔══════════════════════════════════════════════════════════════╗");
    println!("║                     Performance Summary                      ║");
    println!("╚══════════════════════════════════════════════════════════════╝");
    println!();
    println!("📊 Task Performance:");
    println!("    Tasks Attempted:  {}", raw_metrics.tasks_attempted);
    println!("    Tasks Completed:  {}", raw_metrics.tasks_completed);
    println!("    Tasks Correct:    {}", raw_metrics.tasks_correct);
    // BUG FIX: guard the division so `--episodes 0` (or zero tasks) prints
    // 0.0% instead of NaN%.
    let overall_accuracy = if raw_metrics.tasks_attempted > 0 {
        raw_metrics.tasks_correct as f64 / raw_metrics.tasks_attempted as f64 * 100.0
    } else {
        0.0
    };
    println!("    Overall Accuracy: {:.1}%", overall_accuracy);
    println!();
    println!("📈 Learning Progress:");
    let regret_summary = controller.regret.summary();
    println!("    Cumulative Regret: {:.2}", regret_summary.total_regret);
    println!("    Average Regret:    {:.4}", regret_summary.average_regret);
    println!(
        "    Sublinear:         {}",
        if regret_summary.is_sublinear {
            "Yes ✓"
        } else {
            "No ✗"
        }
    );
    println!(
        "    Regret Trend:      {:.4} ({})",
        regret_summary.regret_trend,
        if regret_summary.regret_trend < 0.0 {
            "decreasing ✓"
        } else {
            "increasing ✗"
        }
    );
    println!();
    // Grade the overall performance on a simple letter scale.
    let grade = if assessment.overall_score >= 90.0 {
        "A+ (Excellent)"
    } else if assessment.overall_score >= 80.0 {
        "A (Very Good)"
    } else if assessment.overall_score >= 70.0 {
        "B (Good)"
    } else if assessment.overall_score >= 60.0 {
        "C (Adequate)"
    } else if assessment.overall_score >= 50.0 {
        "D (Below Average)"
    } else {
        "F (Needs Improvement)"
    };
    println!("🎯 Final Grade: {}", grade);
    println!();
    // Recommendations driven by the weakest sub-scores.
    println!("💡 Recommendations:");
    if assessment.capabilities.temporal_reasoning < 70.0 {
        println!("   • Improve temporal reasoning with more constraint examples");
    }
    if assessment.learning.regret_sublinearity < 0.5 {
        println!("   • Increase episodes to achieve sublinear regret");
    }
    if assessment.tool_use.utilization_effectiveness < 0.7 {
        println!("   • Better tool selection needed for complex tasks");
    }
    if assessment.meta_cognition.strategy_adaptation < 0.5 {
        println!("   • Enable adaptive strategy switching");
    }
    if assessment.overall_score >= 70.0 {
        println!("   • Good performance! Consider harder difficulty levels");
    }
    // Show adaptive learning progress if enabled
    if let Some(ref solver) = adaptive_solver {
        println!();
        println!("╔══════════════════════════════════════════════════════════════╗");
        println!("║                  Adaptive Learning Progress                  ║");
        println!("╚══════════════════════════════════════════════════════════════╝");
        println!();
        let progress = solver.learning_progress();
        println!("🧠 ReasoningBank Statistics:");
        println!("    Total trajectories: {}", progress.total_trajectories);
        println!(
            "    Success rate:       {:.1}%",
            progress.success_rate * 100.0
        );
        println!("    Improvement rate:   {:.4}", progress.improvement_rate);
        println!("    Patterns learned:   {}", progress.patterns_learned);
        println!("    Strategies tried:   {}", progress.strategies_tried);
        println!(
            "    Is improving:       {}",
            if progress.is_improving {
                "Yes ✓"
            } else {
                "No ✗"
            }
        );
        // Show learned patterns with enough observations to be meaningful.
        if !solver.reasoning_bank.patterns.is_empty() {
            println!();
            println!("📚 Learned Patterns:");
            for (constraint_type, patterns) in &solver.reasoning_bank.patterns {
                for p in patterns.iter().filter(|p| p.observations >= 3) {
                    println!(
                        "{}: {} strategy ({:.0}% success, {} obs)",
                        constraint_type,
                        p.best_strategy,
                        p.success_rate * 100.0,
                        p.observations
                    );
                }
            }
        }
        // Show per-strategy aggregate statistics.
        if !solver.reasoning_bank.strategy_stats.is_empty() {
            println!();
            println!("📊 Strategy Performance:");
            for (strategy, stats) in &solver.reasoning_bank.strategy_stats {
                println!(
                    "{}: {:.1}% success ({} attempts, {:.1} avg steps)",
                    strategy,
                    stats.success_rate() * 100.0,
                    stats.attempts,
                    stats.avg_steps()
                );
            }
        }
    }
    Ok(())
}

View File

@@ -0,0 +1,180 @@
//! RVF Intelligence Benchmark Runner
//!
//! Runs head-to-head comparison across 6 intelligence verticals:
//! Baseline (no learning) vs. RVF-Learning (full pipeline).
//!
//! Usage:
//! cargo run --bin rvf-intelligence-bench -- --episodes 15 --tasks 25 --verbose
//! cargo run --bin rvf-intelligence-bench -- --noise 0.4 --step-budget 300
use anyhow::Result;
use clap::Parser;
use ruvector_benchmarks::intelligence_metrics::IntelligenceCalculator;
use ruvector_benchmarks::rvf_intelligence_bench::{run_comparison, BenchmarkConfig};
#[derive(Parser, Debug)]
#[command(name = "rvf-intelligence-bench")]
#[command(about = "Benchmark intelligence with and without RVF learning across 6 verticals")]
// CLI arguments (clap derive). The `///` doc comments on the fields below are
// user-facing: clap renders them verbatim as the `--help` text for each flag.
// NOTE(review): numeric ranges stated in the help text (e.g. 1-10, 0.0-1.0)
// are not validated here — presumably checked downstream; confirm.
struct Args {
    /// Number of episodes per mode
    #[arg(short, long, default_value = "10")]
    episodes: usize,
    /// Tasks per episode
    #[arg(short, long, default_value = "20")]
    tasks: usize,
    /// Minimum difficulty (1-10)
    #[arg(long, default_value = "1")]
    min_diff: u8,
    /// Maximum difficulty (1-10)
    #[arg(long, default_value = "10")]
    max_diff: u8,
    /// Random seed for reproducibility
    #[arg(long, default_value = "42")]
    seed: u64,
    /// Noise probability (0.0-1.0)
    #[arg(long, default_value = "0.25")]
    noise: f64,
    /// Step budget per episode
    #[arg(long, default_value = "400")]
    step_budget: usize,
    /// Max retries for error recovery (RVF only)
    #[arg(long, default_value = "2")]
    max_retries: usize,
    /// Retention fraction (0.0-1.0)
    #[arg(long, default_value = "0.15")]
    retention: f64,
    /// Token budget per episode (RVF mode)
    #[arg(long, default_value = "200000")]
    token_budget: u32,
    /// Tool call budget per episode (RVF mode)
    #[arg(long, default_value = "50")]
    tool_budget: u16,
    /// Verbose per-episode output
    #[arg(short, long)]
    verbose: bool,
}
/// Entry point: parses CLI arguments, runs the baseline-vs-RVF comparison,
/// and prints the comparison report, both detailed IQ assessments, and a
/// final verdict on the IQ delta.
fn main() -> Result<()> {
    let cli = Args::parse();
    println!();
    println!("================================================================");
    println!("  RVF Intelligence Benchmark v2 — Six Verticals");
    println!("  Baseline vs. RVF-Learning (noise + step limits + retry + transfer)");
    println!("================================================================");
    println!();
    println!("  Configuration:");
    println!("    Episodes:       {}", cli.episodes);
    println!("    Tasks/episode:  {}", cli.tasks);
    println!("    Difficulty:     {}-{}", cli.min_diff, cli.max_diff);
    println!("    Seed:           {}", cli.seed);
    println!("    Noise prob:     {:.0}%", cli.noise * 100.0);
    println!("    Step budget/ep: {}", cli.step_budget);
    println!("    Max retries:    {}", cli.max_retries);
    println!("    Retention:      {:.0}%", cli.retention * 100.0);
    println!();
    // Map the CLI onto the benchmark configuration; everything not exposed
    // as a flag keeps its default.
    let config = BenchmarkConfig {
        episodes: cli.episodes,
        tasks_per_episode: cli.tasks,
        min_difficulty: cli.min_diff,
        max_difficulty: cli.max_diff,
        seed: Some(cli.seed),
        token_budget: cli.token_budget,
        tool_call_budget: cli.tool_budget,
        verbose: cli.verbose,
        noise_probability: cli.noise,
        step_budget_per_episode: cli.step_budget,
        max_retries: cli.max_retries,
        retention_fraction: cli.retention,
        ..Default::default()
    };
    println!("  Phase 1/2: Running baseline (no learning)...");
    let report = run_comparison(&config)?;
    // Print comparison report
    report.print();
    // Score both runs with the same calculator so the comparison is fair.
    let calculator = IntelligenceCalculator::default();
    println!("----------------------------------------------------------------");
    println!("  Detailed Intelligence Assessment: Baseline");
    println!("----------------------------------------------------------------");
    let baseline_iq = calculator.calculate(&report.baseline.raw_metrics);
    print_compact_assessment(&baseline_iq);
    println!();
    println!("----------------------------------------------------------------");
    println!("  Detailed Intelligence Assessment: RVF-Learning");
    println!("----------------------------------------------------------------");
    let rvf_iq = calculator.calculate(&report.rvf_learning.raw_metrics);
    print_compact_assessment(&rvf_iq);
    // Final IQ comparison
    println!();
    println!("================================================================");
    println!("  Intelligence Score Comparison");
    println!("================================================================");
    println!(
        "  Baseline IQ Score:       {:.1}/100",
        baseline_iq.overall_score
    );
    println!(
        "  RVF-Learning IQ Score:   {:.1}/100",
        rvf_iq.overall_score
    );
    let delta = rvf_iq.overall_score - baseline_iq.overall_score;
    println!("  Delta:                   {:+.1}", delta);
    println!();
    // Classify the improvement into a human-readable verdict.
    let verdict = if delta > 10.0 {
        "  >> RVF learning loop provides a DRAMATIC intelligence boost."
    } else if delta > 5.0 {
        "  >> RVF learning loop provides a SIGNIFICANT intelligence boost."
    } else if delta > 1.0 {
        "  >> RVF learning loop provides a MEASURABLE intelligence improvement."
    } else if delta > 0.0 {
        "  >> RVF learning loop provides a MARGINAL intelligence gain."
    } else {
        "  >> Performance is comparable. Increase noise or reduce step budget."
    };
    println!("{verdict}");
    println!();
    Ok(())
}
/// Print a condensed five-line view of an intelligence assessment: the
/// overall score followed by the reasoning, learning, capability, and
/// meta-cognition sub-scores.
fn print_compact_assessment(assessment: &ruvector_benchmarks::intelligence_metrics::IntelligenceAssessment) {
    println!("    Overall Score: {:.1}/100", assessment.overall_score);
    let r = &assessment.reasoning;
    println!(
        "    Reasoning: coherence={:.2}, efficiency={:.2}, error_rate={:.2}",
        r.logical_coherence, r.reasoning_efficiency, r.error_rate,
    );
    let l = &assessment.learning;
    println!(
        "    Learning: sample_eff={:.2}, regret_sub={:.2}, rate={:.2}, gen={:.2}",
        l.sample_efficiency, l.regret_sublinearity, l.learning_rate, l.generalization,
    );
    let c = &assessment.capabilities;
    println!(
        "    Capabilities: pattern={:.1}, planning={:.1}, adaptation={:.1}",
        c.pattern_recognition, c.planning, c.adaptation,
    );
    let m = &assessment.meta_cognition;
    println!(
        "    Meta-cog: self_correct={:.2}, strategy_adapt={:.2}",
        m.self_correction_rate, m.strategy_adaptation,
    );
}

View File

@@ -0,0 +1,135 @@
//! Superintelligence Pathway Runner
//!
//! Runs a 5-level recursive intelligence amplification pipeline and tracks
//! IQ progression from foundation (~85) toward superintelligence (~98+).
//!
//! Usage:
//! cargo run --bin superintelligence -- --verbose
//! cargo run --bin superintelligence -- --episodes 15 --tasks 30 --target 95
use anyhow::Result;
use clap::Parser;
use ruvector_benchmarks::intelligence_metrics::IntelligenceCalculator;
use ruvector_benchmarks::superintelligence::{run_pathway, SIConfig};
#[derive(Parser, Debug)]
#[command(name = "superintelligence")]
#[command(about = "Run 5-level superintelligence pathway with IQ tracking")]
// CLI arguments (clap derive). The `///` doc comments on the fields below are
// user-facing: clap renders them verbatim as the `--help` text for each flag.
struct Args {
    /// Episodes per level
    #[arg(short, long, default_value = "12")]
    episodes: usize,
    /// Tasks per episode
    #[arg(short, long, default_value = "25")]
    tasks: usize,
    /// Random seed
    #[arg(long, default_value = "42")]
    seed: u64,
    /// Noise injection rate (0.0-1.0)
    #[arg(long, default_value = "0.25")]
    noise: f64,
    /// Step budget per episode
    #[arg(long, default_value = "400")]
    step_budget: usize,
    /// Target IQ score
    #[arg(long, default_value = "98.0")]
    target: f64,
    /// Ensemble size for Level 3
    #[arg(long, default_value = "4")]
    ensemble: usize,
    /// Recursive improvement cycles for Level 4
    #[arg(long, default_value = "3")]
    cycles: usize,
    /// Adversarial pressure multiplier for Level 5
    #[arg(long, default_value = "1.5")]
    pressure: f64,
    /// Verbose per-episode output
    #[arg(short, long)]
    verbose: bool,
}
/// Entry point: runs the 5-level superintelligence pathway and prints a
/// detailed intelligence assessment for the level that reached the highest
/// IQ score.
fn main() -> Result<()> {
    let args = Args::parse();
    println!();
    println!("╔══════════════════════════════════════════════════════════════╗");
    println!("║              SUPERINTELLIGENCE PATHWAY ENGINE                ║");
    println!("║         5-Level Recursive Intelligence Amplification         ║");
    println!("╚══════════════════════════════════════════════════════════════╝");
    println!();
    println!(
        "  Config: {} eps/level x {} tasks, noise={:.0}%, target IQ={:.0}",
        args.episodes,
        args.tasks,
        args.noise * 100.0,
        args.target
    );
    println!(
        "  Ensemble={}, Cycles={}, Pressure={:.1}",
        args.ensemble, args.cycles, args.pressure
    );
    println!();
    // Map the CLI onto the pathway configuration; unexposed fields keep
    // their defaults.
    let config = SIConfig {
        episodes_per_level: args.episodes,
        tasks_per_episode: args.tasks,
        seed: args.seed,
        noise_rate: args.noise,
        step_budget: args.step_budget,
        target_iq: args.target,
        ensemble_size: args.ensemble,
        recursive_cycles: args.cycles,
        adversarial_pressure: args.pressure,
        verbose: args.verbose,
        ..Default::default()
    };
    let result = run_pathway(&config)?;
    result.print();
    // Detailed assessment for the peak level.
    let calculator = IntelligenceCalculator::default();
    // BUG FIX: `total_cmp` is a total order over floats, so a NaN iq_score
    // can no longer panic the comparator (the previous
    // `partial_cmp(..).unwrap()` would).
    if let Some(peak) = result
        .levels
        .iter()
        .max_by(|a, b| a.iq_score.total_cmp(&b.iq_score))
    {
        println!("  Peak Level ({}) Assessment:", peak.name);
        let assessment = calculator.calculate(&peak.raw_metrics);
        println!(
            "    Reasoning: coherence={:.2}, efficiency={:.2}, error_rate={:.2}",
            assessment.reasoning.logical_coherence,
            assessment.reasoning.reasoning_efficiency,
            assessment.reasoning.error_rate
        );
        println!(
            "    Learning: sample_eff={:.2}, regret_sub={:.2}, rate={:.2}",
            assessment.learning.sample_efficiency,
            assessment.learning.regret_sublinearity,
            assessment.learning.learning_rate
        );
        println!(
            "    Capabilities: pattern={:.1}, planning={:.1}, adaptation={:.1}",
            assessment.capabilities.pattern_recognition,
            assessment.capabilities.planning,
            assessment.capabilities.adaptation
        );
        println!(
            "    Meta-cog: self_correct={:.2}, strategy_adapt={:.2}",
            assessment.meta_cognition.self_correction_rate,
            assessment.meta_cognition.strategy_adaptation
        );
        println!();
    }
    Ok(())
}

View File

@@ -0,0 +1,247 @@
//! Swarm Regret Tracking Runner
//!
//! Track sublinear regret across episodes for swarm controller evaluation.
//!
//! Usage:
//! cargo run --bin swarm-regret -- --episodes 20 --tasks-per-episode 20
use anyhow::Result;
use clap::Parser;
use ruvector_benchmarks::{
logging::BenchmarkLogger,
swarm_regret::SwarmController,
temporal::TemporalSolver,
timepuzzles::{PuzzleGenerator, PuzzleGeneratorConfig},
};
use std::time::Instant;
#[derive(Parser, Debug)]
#[command(name = "swarm-regret")]
#[command(about = "Track sublinear regret for swarm controller")]
// CLI arguments (clap derive). The `///` doc comments on the fields below are
// user-facing: clap renders them verbatim as the `--help` text for each flag.
// NOTE(review): clap derives `bool` flags as SetTrue; with
// default_value = "true" the `--calendar` flag cannot be switched off from
// the CLI — confirm intended.
struct Args {
    /// Number of episodes to run
    #[arg(short, long, default_value = "20")]
    episodes: usize,
    /// Tasks per episode
    #[arg(short, long, default_value = "20")]
    tasks_per_episode: usize,
    /// Enable calendar tool
    #[arg(long, default_value = "true")]
    calendar: bool,
    /// Enable web search tool
    #[arg(long, default_value = "false")]
    web_search: bool,
    /// Maximum steps per task
    #[arg(long, default_value = "100")]
    max_steps: usize,
    /// Random seed
    // `None` means non-deterministic puzzle generation.
    #[arg(long)]
    seed: Option<u64>,
    /// Output log file
    #[arg(short, long, default_value = "logs/swarm_regret.jsonl")]
    output: String,
    /// Verbose output
    #[arg(short, long)]
    verbose: bool,
}
/// Entry point: runs `args.episodes` episodes of temporal puzzles through the
/// swarm controller, logs per-episode regret, prints a regret table and
/// summary, and writes the JSONL log plus a JSON summary file.
fn main() -> Result<()> {
    let args = Args::parse();
    println!("╔══════════════════════════════════════════════════════════════╗");
    println!("║               Swarm Controller Regret Tracking               ║");
    println!("║           Sublinear Regret for Multi-Agent Control           ║");
    println!("╚══════════════════════════════════════════════════════════════╝");
    println!();
    // Initialize
    let mut logger = BenchmarkLogger::new(&args.output)?;
    logger.log_system("INFO", "Starting regret tracking", "swarm-regret")?;
    let mut controller = SwarmController::new(args.tasks_per_episode);
    let mut solver = TemporalSolver::with_tools(args.calendar, args.web_search);
    solver.max_steps = args.max_steps;
    let puzzle_config = PuzzleGeneratorConfig {
        min_difficulty: 1,
        max_difficulty: 10,
        constraint_density: 3,
        seed: args.seed,
        ..Default::default()
    };
    println!("🔧 Configuration:");
    println!("    Episodes:        {}", args.episodes);
    println!("    Tasks/episode:   {}", args.tasks_per_episode);
    println!("    Calendar tool:   {}", args.calendar);
    println!("    Web search:      {}", args.web_search);
    println!("    Max steps/task:  {}", args.max_steps);
    println!();
    println!("🏃 Running episodes...");
    println!();
    println!("┌────────┬────────┬─────────┬─────────┬──────────┬───────────┐");
    println!("│Episode │ Acc(%) │ Regret  │ Cum.Reg │ Avg.Reg  │ Sublinear │");
    println!("├────────┼────────┼─────────┼─────────┼──────────┼───────────┤");
    let total_start = Instant::now();
    for ep in 0..args.episodes {
        controller.start_episode();
        // Generate puzzles for this episode
        let mut generator = PuzzleGenerator::new(puzzle_config.clone());
        let puzzles = generator.generate_batch(args.tasks_per_episode)?;
        let mut solved = 0;
        let mut correct = 0;
        let mut total_steps = 0;
        let mut total_tool_calls = 0;
        let mut total_latency = 0u64;
        // Solve puzzles
        for puzzle in &puzzles {
            let result = solver.solve(puzzle)?;
            if result.solved {
                solved += 1;
            }
            if result.correct {
                correct += 1;
            }
            total_steps += result.steps;
            total_tool_calls += result.tool_calls;
            total_latency += result.latency_ms;
        }
        // Record episode
        controller.complete_episode(
            solved,
            correct,
            total_steps,
            total_tool_calls,
            total_latency,
        );
        // Get status
        let summary = controller.regret.summary();
        let last_episode = controller
            .regret
            .episodes
            .last()
            .expect("complete_episode() records an episode");
        // Log episode
        logger.log_swarm(
            ep + 1,
            args.tasks_per_episode,
            solved,
            correct,
            last_episode.reward,
            last_episode.oracle_reward,
            summary.total_regret,
            summary.average_regret,
            summary.is_sublinear,
        )?;
        // Print row.
        // BUG FIX: the sublinear marker was the empty string in both
        // branches and the row format carried no `│` separators, so the
        // table body never lined up with the bordered header above.
        let sublinear = if summary.is_sublinear { "✓" } else { "✗" };
        println!(
            "│ {:>6} │ {:>6.1} │ {:>7.2} │ {:>7.2} │ {:>8.4} │ {:^9} │",
            ep + 1,
            last_episode.accuracy() * 100.0,
            last_episode.regret(),
            summary.total_regret,
            summary.average_regret,
            sublinear
        );
    }
    println!("└────────┴────────┴─────────┴─────────┴──────────┴───────────┘");
    println!();
    let total_time = total_start.elapsed();
    // Final summary
    let summary = controller.regret.summary();
    println!("╔══════════════════════════════════════════════════════════════╗");
    println!("║                        Final Summary                         ║");
    println!("╚══════════════════════════════════════════════════════════════╝");
    println!();
    println!("📊 Regret Analysis:");
    println!("    Total episodes:    {}", summary.total_episodes);
    println!("    Cumulative regret: {:.2}", summary.total_regret);
    println!("    Average regret:    {:.4}", summary.average_regret);
    println!(
        "    Regret trend:      {:.6} ({})",
        summary.regret_trend,
        if summary.regret_trend < 0.0 {
            "decreasing ✓"
        } else {
            "increasing ✗"
        }
    );
    println!(
        "    Sublinear:         {}",
        if summary.is_sublinear {
            "Yes ✓"
        } else {
            "No ✗"
        }
    );
    println!();
    println!("📈 Performance:");
    println!(
        "    Average accuracy:  {:.1}%",
        summary.average_accuracy * 100.0
    );
    println!("    Average reward:    {:.2}", summary.average_reward);
    println!(
        "    Moving avg reward: {:.2}",
        summary.moving_average_reward
    );
    println!("    Total time:        {:.2}s", total_time.as_secs_f64());
    println!();
    // Regret curve: plot average regret R_k/k as an ASCII bar chart,
    // sampling roughly ten evenly spaced episodes.
    if controller.regret.average_regret.len() >= 5 {
        println!("📉 Regret Curve (R_k/k):");
        let regrets = &controller.regret.average_regret;
        // `max(10)` keeps the sampling step at least 1 for short runs.
        let step = regrets.len().max(10) / 10;
        for (i, r) in regrets.iter().enumerate() {
            if i % step == 0 || i == regrets.len() - 1 {
                let bar_len = (r * 50.0).min(50.0) as usize;
                // BUG FIX: the bar glyph was an empty string, so the chart
                // rendered no bars at all.
                let bar = "█".repeat(bar_len);
                println!("    Episode {:3}: {:.4} {}", i + 1, r, bar);
            }
        }
        println!();
    }
    // Goal check
    println!("🎯 Goal Status:");
    if summary.is_sublinear && summary.regret_trend < 0.0 {
        println!("  ✓ Achieving sublinear regret - average regret trending to zero");
    } else if summary.is_sublinear {
        println!("  ~ Sublinear but trend not clearly decreasing");
    } else {
        println!("  ✗ Not yet achieving sublinear regret");
        println!("    Recommendation: Increase episodes or tune solver parameters");
    }
    // Flush logs
    logger.flush()?;
    println!();
    println!("📝 Results saved to: {}", args.output);
    // Save summary.
    // BUG FIX: derive the path with `strip_suffix` — the previous
    // `replace(".jsonl", ...)` rewrote a `.jsonl` occurring anywhere in the
    // path, and when the output had no `.jsonl` extension at all the summary
    // path equalled the log path and silently overwrote the log just written.
    let summary_path = match args.output.strip_suffix(".jsonl") {
        Some(stem) => format!("{stem}_summary.json"),
        None => format!("{}_summary.json", args.output),
    };
    let summary_json = serde_json::to_string_pretty(&summary)?;
    std::fs::write(&summary_path, summary_json)?;
    println!("📝 Summary saved to: {}", summary_path);
    Ok(())
}

View File

@@ -0,0 +1,262 @@
//! Temporal Benchmark Runner
//!
//! Run temporal reasoning benchmarks based on TimePuzzles methodology.
//!
//! Usage:
//! cargo run --bin temporal-benchmark -- --puzzles 50 --calendar --web-search
use anyhow::Result;
use clap::Parser;
use ruvector_benchmarks::{
logging::BenchmarkLogger,
temporal::{BenchmarkConfig, BenchmarkResults, TemporalSolver},
timepuzzles::{PuzzleGenerator, PuzzleGeneratorConfig, SamplePuzzles},
};
use std::time::Instant;
#[derive(Parser, Debug)]
#[command(name = "temporal-benchmark")]
#[command(about = "Run temporal reasoning benchmarks")]
// CLI arguments (clap derive). The `///` doc comments on the fields below are
// user-facing: clap renders them verbatim as the `--help` text for each flag.
// NOTE(review): clap derives `bool` flags as SetTrue; with
// default_value = "true" the `--calendar` flag cannot be switched off from
// the CLI — confirm intended.
struct Args {
    /// Number of puzzles to run
    #[arg(short = 'n', long, default_value = "50")]
    puzzles: usize,
    /// Minimum difficulty (1-10)
    #[arg(long, default_value = "1")]
    min_difficulty: u8,
    /// Maximum difficulty (1-10)
    #[arg(long, default_value = "10")]
    max_difficulty: u8,
    /// Enable calendar math tool
    #[arg(long, default_value = "true")]
    calendar: bool,
    /// Enable web search tool
    #[arg(long, default_value = "false")]
    web_search: bool,
    /// Maximum steps per puzzle
    #[arg(long, default_value = "100")]
    max_steps: usize,
    /// Constraint density (1-5)
    #[arg(long, default_value = "3")]
    constraint_density: u8,
    /// Random seed for reproducibility
    // `None` means non-deterministic puzzle generation.
    #[arg(long)]
    seed: Option<u64>,
    /// Output log file
    #[arg(short, long, default_value = "logs/temporal_benchmark.jsonl")]
    output: String,
    /// Use sample puzzles instead of generating
    #[arg(long)]
    use_samples: bool,
    /// Verbose output
    #[arg(short, long)]
    verbose: bool,
}
/// Entry point: generates (or loads sample) temporal puzzles, solves each one
/// with the configured `TemporalSolver`, logs per-puzzle results, and prints
/// plus saves aggregate benchmark statistics.
fn main() -> Result<()> {
    let args = Args::parse();
    println!("╔══════════════════════════════════════════════════════════════╗");
    println!("║             Temporal Reasoning Benchmark Runner              ║");
    println!("║           Based on TimePuzzles (arXiv:2601.07148)            ║");
    println!("╚══════════════════════════════════════════════════════════════╝");
    println!();
    // Initialize logger
    let mut logger = BenchmarkLogger::new(&args.output)?;
    logger.log_system("INFO", "Starting benchmark run", "temporal-benchmark")?;
    // Generate or load puzzles
    let puzzles = if args.use_samples {
        println!("📚 Using sample puzzle set (50 puzzles)...");
        SamplePuzzles::mixed_sample()
    } else {
        println!(
            "🎲 Generating {} puzzles (difficulty {}-{})...",
            args.puzzles, args.min_difficulty, args.max_difficulty
        );
        let config = PuzzleGeneratorConfig {
            min_difficulty: args.min_difficulty,
            max_difficulty: args.max_difficulty,
            constraint_density: args.constraint_density,
            cross_cultural: true,
            relative_constraints: true,
            year_range: (2000, 2030),
            seed: args.seed,
        };
        let mut generator = PuzzleGenerator::new(config);
        generator.generate_batch(args.puzzles)?
    };
    println!("✓ Loaded {} puzzles", puzzles.len());
    println!();
    // Configure solver
    let mut solver = TemporalSolver::with_tools(args.calendar, args.web_search);
    solver.max_steps = args.max_steps;
    println!("🔧 Solver configuration:");
    println!("    Calendar tool: {}", args.calendar);
    println!("    Web search:    {}", args.web_search);
    println!("    Max steps:     {}", args.max_steps);
    println!();
    // Run benchmarks
    println!("🏃 Running benchmarks...");
    println!();
    // Unique id so runs with the same seed remain distinguishable in logs.
    let benchmark_id = format!(
        "bench-{}-{}",
        chrono::Utc::now().format("%Y%m%d-%H%M%S"),
        args.seed.unwrap_or(0)
    );
    let mut results = Vec::new();
    let start = Instant::now();
    for (i, puzzle) in puzzles.iter().enumerate() {
        let result = solver.solve(puzzle)?;
        // Log result
        logger.log_temporal(
            &benchmark_id,
            &puzzle.id,
            puzzle.difficulty,
            result.solved,
            result.correct,
            result.steps,
            result.tool_calls,
            result.latency_ms,
            puzzle.constraints.len(),
            args.calendar,
            args.web_search,
        )?;
        if args.verbose {
            // BUG FIX: the correct/failed markers were both empty strings,
            // making verbose rows indistinguishable; use ✓ / ~ / ✗ for
            // correct / solved-but-wrong / unsolved.
            let status = if result.correct {
                "✓"
            } else if result.solved {
                "~"
            } else {
                "✗"
            };
            println!(
                "  {} Puzzle {:3}: {} (steps: {}, latency: {}ms)",
                status,
                i + 1,
                puzzle.id,
                result.steps,
                result.latency_ms
            );
        } else if (i + 1) % 10 == 0 {
            // Quiet mode: one progress dot every ten puzzles.
            print!(".");
            use std::io::Write;
            std::io::stdout().flush()?;
        }
        results.push(result);
    }
    let total_time = start.elapsed();
    if !args.verbose {
        println!();
    }
    println!();
    // Compute aggregate results
    let config = BenchmarkConfig {
        num_puzzles: puzzles.len(),
        difficulty_range: (args.min_difficulty, args.max_difficulty),
        calendar_tool: args.calendar,
        web_search_tool: args.web_search,
        max_steps: args.max_steps,
        constraint_density: args.constraint_density,
    };
    let benchmark_results = BenchmarkResults::from_results(config, results);
    // Print results
    println!("╔══════════════════════════════════════════════════════════════╗");
    println!("║                      Benchmark Results                       ║");
    println!("╚══════════════════════════════════════════════════════════════╝");
    println!();
    println!("📊 Summary:");
    println!("    Total puzzles: {}", benchmark_results.total_puzzles);
    println!("    Solved:        {}", benchmark_results.solved_count);
    println!("    Correct:       {}", benchmark_results.correct_count);
    println!(
        "    Accuracy:      {:.1}%",
        benchmark_results.accuracy * 100.0
    );
    println!();
    println!("⏱️ Performance:");
    println!("    Avg steps:      {:.1}", benchmark_results.avg_steps);
    println!("    Avg tool calls: {:.1}", benchmark_results.avg_tool_calls);
    println!(
        "    Avg latency:    {:.1}ms",
        benchmark_results.avg_latency_ms
    );
    println!("    Total time:     {:.2}s", total_time.as_secs_f64());
    println!();
    // Compute accuracy by difficulty: (attempted, correct) per level.
    let mut by_difficulty: std::collections::HashMap<u8, (usize, usize)> =
        std::collections::HashMap::new();
    for (puzzle, result) in puzzles.iter().zip(benchmark_results.results.iter()) {
        let entry = by_difficulty.entry(puzzle.difficulty).or_insert((0, 0));
        entry.0 += 1;
        if result.correct {
            entry.1 += 1;
        }
    }
    println!("📈 Accuracy by Difficulty:");
    let mut difficulties: Vec<_> = by_difficulty.keys().copied().collect();
    difficulties.sort_unstable();
    for d in difficulties {
        // `total` is at least 1 for every key present in the map.
        let (total, correct) = by_difficulty[&d];
        let acc = correct as f64 / total as f64 * 100.0;
        println!("    Difficulty {}: {:5.1}% ({}/{})", d, acc, correct, total);
    }
    println!();
    // Tool usage analysis
    if args.calendar {
        let with_rewriting = benchmark_results
            .results
            .iter()
            .filter(|r| r.tool_calls > 0 && r.correct)
            .count();
        println!("🔧 Tool Analysis:");
        println!(
            "    Calendar rewriting success: {}/{}",
            with_rewriting, benchmark_results.total_puzzles
        );
    }
    // Flush logs
    logger.flush()?;
    println!();
    println!("📝 Results saved to: {}", args.output);
    // Save full results as JSON.
    // BUG FIX: derive the path with `strip_suffix` — the previous
    // `replace(".jsonl", ...)` rewrote a `.jsonl` occurring anywhere in the
    // path, and when the output lacked the `.jsonl` extension the summary
    // path equalled the log path and silently overwrote the log just written.
    let results_path = match args.output.strip_suffix(".jsonl") {
        Some(stem) => format!("{stem}_summary.json"),
        None => format!("{}_summary.json", args.output),
    };
    let results_json = serde_json::to_string_pretty(&benchmark_results)?;
    std::fs::write(&results_path, results_json)?;
    println!("📝 Summary saved to: {}", results_path);
    Ok(())
}

View File

@@ -0,0 +1,308 @@
//! TimePuzzle Quick Runner
//!
//! 10-minute probe for temporal reasoning with tool augmentation.
//!
//! Usage:
//! cargo run --bin timepuzzle-runner -- --quick
//! cargo run --bin timepuzzle-runner -- --depth 5
use anyhow::Result;
use clap::Parser;
use ruvector_benchmarks::{
logging::BenchmarkLogger, temporal::TemporalSolver, timepuzzles::SamplePuzzles,
};
use std::time::{Duration, Instant};
#[derive(Parser, Debug)]
#[command(name = "timepuzzle-runner")]
#[command(about = "Quick TimePuzzle probe for agent testing")]
// CLI arguments (clap derive). The `///` doc comments on the fields below are
// user-facing: clap renders them verbatim as the `--help` text for each flag.
// NOTE(review): clap derives `bool` flags as SetTrue; with
// default_value = "true" the `--rewrite` flag cannot be switched off from
// the CLI — confirm intended.
struct Args {
    /// Quick mode: 50 puzzles, depth-limited steps
    #[arg(long)]
    quick: bool,
    /// Maximum depth (steps) per puzzle
    #[arg(short, long, default_value = "50")]
    depth: usize,
    /// Number of puzzles
    #[arg(short = 'n', long, default_value = "50")]
    puzzles: usize,
    /// Tool latency cap (abort if tool > 1.5x median)
    #[arg(long, default_value = "1.5")]
    latency_cap: f64,
    /// Timeout in seconds
    #[arg(long, default_value = "600")]
    timeout: u64,
    /// Enable constraint rewriting (calendar math)
    #[arg(long, default_value = "true")]
    rewrite: bool,
    /// Enable web search (for factual anchors)
    #[arg(long, default_value = "false")]
    web_search: bool,
    /// Output file
    #[arg(short, long, default_value = "logs/timepuzzle_probe.jsonl")]
    output: String,
    /// Verbose mode
    #[arg(short, long)]
    verbose: bool,
}
fn main() -> Result<()> {
let args = Args::parse();
println!("╔══════════════════════════════════════════════════════════════╗");
println!("║ TimePuzzle Quick Probe Runner ║");
println!("║ Tool-Augmented Iterative Temporal Reasoning ║");
println!("╚══════════════════════════════════════════════════════════════╝");
println!();
let mut logger = BenchmarkLogger::new(&args.output)?;
logger.log_system("INFO", "Starting TimePuzzle probe", "timepuzzle-runner")?;
// Quick mode settings
let (num_puzzles, max_depth) = if args.quick {
println!("⚡ Quick mode enabled (50 puzzles, depth {})", args.depth);
(50, args.depth)
} else {
(args.puzzles, args.depth)
};
let timeout = Duration::from_secs(args.timeout);
println!();
println!("🔧 Configuration:");
println!(" Puzzles: {}", num_puzzles);
println!(" Max depth: {}", max_depth);
println!(" Rewriting: {}", args.rewrite);
println!(" Web search: {}", args.web_search);
println!(" Latency cap: {}x median", args.latency_cap);
println!(" Timeout: {}s", args.timeout);
println!();
// Generate puzzles with varying constraint density
println!("🎲 Generating puzzles...");
let puzzles = SamplePuzzles::mixed_sample()
.into_iter()
.take(num_puzzles)
.collect::<Vec<_>>();
println!("✓ Loaded {} puzzles", puzzles.len());
println!();
// Configure solver
let mut solver = TemporalSolver::with_tools(args.rewrite, args.web_search);
solver.max_steps = max_depth;
// Run probe
println!("🏃 Running probe...");
println!();
let probe_start = Instant::now();
let mut results = Vec::new();
let mut latencies: Vec<u64> = Vec::new();
let mut median_latency: f64 = 100.0; // Initial estimate
for (i, puzzle) in puzzles.iter().enumerate() {
// Check timeout
if probe_start.elapsed() > timeout {
println!("⚠️ Timeout reached after {} puzzles", i);
break;
}
let result = solver.solve(puzzle)?;
// Check latency cap
if latencies.len() >= 10 {
let mut sorted = latencies.clone();
sorted.sort();
median_latency = sorted[sorted.len() / 2] as f64;
if result.latency_ms as f64 > median_latency * args.latency_cap {
if args.verbose {
println!(
" ⚠ Puzzle {} aborted: latency {}ms > {:.0}ms cap",
puzzle.id,
result.latency_ms,
median_latency * args.latency_cap
);
}
// Still record but mark as slow
}
}
latencies.push(result.latency_ms);
// Log
logger.log_temporal(
"timepuzzle-probe",
&puzzle.id,
puzzle.difficulty,
result.solved,
result.correct,
result.steps,
result.tool_calls,
result.latency_ms,
puzzle.constraints.len(),
args.rewrite,
args.web_search,
)?;
if args.verbose {
let status = if result.correct {
""
} else if result.solved {
"~"
} else {
""
};
println!(
" {} [{:2}] {}: steps={}, tools={}, {}ms",
status,
puzzle.difficulty,
puzzle.id,
result.steps,
result.tool_calls,
result.latency_ms
);
}
results.push(result);
}
let total_time = probe_start.elapsed();
println!();
// Analyze results
let solved = results.iter().filter(|r| r.solved).count();
let correct = results.iter().filter(|r| r.correct).count();
let total = results.len();
let accuracy = correct as f64 / total as f64;
let avg_steps = results.iter().map(|r| r.steps).sum::<usize>() as f64 / total as f64;
let avg_tools = results.iter().map(|r| r.tool_calls).sum::<usize>() as f64 / total as f64;
let avg_latency = results.iter().map(|r| r.latency_ms).sum::<u64>() as f64 / total as f64;
// Tool toggle analysis
let with_tool_correct = results
.iter()
.filter(|r| r.tool_calls > 0 && r.correct)
.count();
println!("╔══════════════════════════════════════════════════════════════╗");
println!("║ Probe Results ║");
println!("╚══════════════════════════════════════════════════════════════╝");
println!();
println!("📊 Overall Performance:");
println!(" Puzzles run: {}", total);
println!(
" Solved: {} ({:.1}%)",
solved,
solved as f64 / total as f64 * 100.0
);
println!(
" Correct: {} ({:.1}%)",
correct,
accuracy * 100.0
);
println!();
println!("⏱️ Efficiency:");
println!(" Avg steps: {:.1}", avg_steps);
println!(" Avg tool calls: {:.1}", avg_tools);
println!(" Avg latency: {:.1}ms", avg_latency);
println!(" Median latency: {:.0}ms", median_latency);
println!(" Total time: {:.2}s", total_time.as_secs_f64());
println!();
// Scaling curves
println!("📈 Tool Toggle Analysis:");
println!(
" With rewriting: {}/{} ({:.1}%)",
with_tool_correct,
total,
with_tool_correct as f64 / total as f64 * 100.0
);
// Sensitivity analysis
let fast_correct = results
.iter()
.filter(|r| r.latency_ms < median_latency as u64 && r.correct)
.count();
let slow_correct = results
.iter()
.filter(|r| r.latency_ms >= median_latency as u64 && r.correct)
.count();
let fast_total = results
.iter()
.filter(|r| r.latency_ms < median_latency as u64)
.count();
let slow_total = total - fast_total;
if fast_total > 0 && slow_total > 0 {
println!();
println!("⚡ Latency Sensitivity:");
println!(
" Fast (<{:.0}ms): {}/{} ({:.1}%)",
median_latency,
fast_correct,
fast_total,
fast_correct as f64 / fast_total as f64 * 100.0
);
println!(
" Slow (>={:.0}ms): {}/{} ({:.1}%)",
median_latency,
slow_correct,
slow_total,
slow_correct as f64 / slow_total as f64 * 100.0
);
}
// Accuracy by difficulty
println!();
println!("🎯 Accuracy by Difficulty:");
let mut by_diff: std::collections::HashMap<u8, (usize, usize)> =
std::collections::HashMap::new();
for (p, r) in puzzles.iter().zip(results.iter()) {
let e = by_diff.entry(p.difficulty).or_insert((0, 0));
e.0 += 1;
if r.correct {
e.1 += 1;
}
}
let mut diffs: Vec<_> = by_diff.keys().copied().collect();
diffs.sort();
for d in diffs {
let (t, c) = by_diff[&d];
let pct = c as f64 / t as f64 * 100.0;
let bar = "".repeat((pct / 5.0) as usize);
println!(" Level {:2}: {:5.1}% {}", d, pct, bar);
}
// Recommendations
println!();
println!("💡 Insights:");
if accuracy < 0.5 {
println!(" • Low accuracy - consider enabling constraint rewriting");
}
if avg_steps > max_depth as f64 * 0.8 {
println!(" • High step count - search may be inefficient");
}
if args.web_search && with_tool_correct > correct / 2 {
println!(" • Web search providing substantial gains");
}
if accuracy >= 0.8 {
println!(" • Good performance - ready for harder puzzles");
}
// Flush logs
logger.flush()?;
println!();
println!("📝 Results saved to: {}", args.output);
Ok(())
}

View File

@@ -0,0 +1,248 @@
//! Vector Index Benchmark Runner
//!
//! Benchmark vector operations with IVF and coherence gating.
//!
//! Usage:
//! cargo run --bin vector-benchmark -- --dim 128 --vectors 10000
use anyhow::Result;
use clap::Parser;
use ruvector_benchmarks::{
logging::BenchmarkLogger,
vector_index::{CoherenceGate, DenseVec, IvfConfig, VectorIndex},
};
use std::time::Instant;
#[derive(Parser, Debug)]
#[command(name = "vector-benchmark")]
#[command(about = "Benchmark vector index operations")]
// CLI flags for the vector index benchmark binary.
// NOTE(review): the `///` doc comments below double as clap help text, i.e.
// they are runtime-visible strings — deliberately left untouched.
struct Args {
    /// Vector dimensionality
    #[arg(short, long, default_value = "128")]
    dim: usize,
    /// Number of vectors to insert
    #[arg(short = 'n', long, default_value = "10000")]
    vectors: usize,
    /// Number of queries to run
    #[arg(short, long, default_value = "1000")]
    queries: usize,
    /// Top-k results per query
    #[arg(short, long, default_value = "10")]
    top_k: usize,
    /// Enable IVF indexing
    #[arg(long, default_value = "true")]
    ivf: bool,
    /// Number of IVF clusters
    #[arg(long, default_value = "64")]
    clusters: usize,
    /// Number of clusters to probe
    #[arg(long, default_value = "4")]
    probes: usize,
    /// Enable coherence gate
    #[arg(long)]
    gate: bool,
    /// Coherence gate threshold
    #[arg(long, default_value = "0.5")]
    gate_threshold: f32,
    /// Output log file
    #[arg(short, long, default_value = "logs/vector_benchmark.jsonl")]
    output: String,
    /// Verbose output
    // NOTE(review): capital 'V' short flag — presumably chosen to avoid a
    // clash with another short flag; confirm it does not collide with clap's
    // auto-generated version flag.
    #[arg(short = 'V', long)]
    verbose: bool,
}
/// Entry point: build a `VectorIndex` from the CLI flags, bulk-insert random
/// vectors, optionally build the IVF index, run timed queries, report
/// latency/throughput statistics, then persist the index and JSONL logs.
///
/// # Errors
/// Propagates logger, index, and filesystem errors via `anyhow::Result`.
fn main() -> Result<()> {
    let args = Args::parse();
    println!("╔══════════════════════════════════════════════════════════════╗");
    println!("║ Vector Index Benchmark Runner ║");
    println!("╚══════════════════════════════════════════════════════════════╝");
    println!();
    // Initialize logger
    let mut logger = BenchmarkLogger::new(&args.output)?;
    logger.log_system("INFO", "Starting vector benchmark", "vector-benchmark")?;
    // Echo the effective configuration before doing any work.
    println!("🔧 Configuration:");
    println!(" Dimensions: {}", args.dim);
    println!(" Vectors: {}", args.vectors);
    println!(" Queries: {}", args.queries);
    println!(" Top-K: {}", args.top_k);
    println!(" IVF: {}", args.ivf);
    if args.ivf {
        println!(" Clusters: {}", args.clusters);
        println!(" Probes: {}", args.probes);
    }
    println!(" Gate: {}", args.gate);
    if args.gate {
        println!(" Threshold: {}", args.gate_threshold);
    }
    println!();
    // Create index, layering on optional gate/IVF behavior.
    let mut index = VectorIndex::new(args.dim);
    if args.gate {
        index = index.with_gate(CoherenceGate::new(args.gate_threshold));
    }
    if args.ivf {
        index = index.with_ivf(IvfConfig::new(args.clusters, args.probes));
    }
    // Insert vectors
    println!("📥 Inserting {} vectors...", args.vectors);
    let insert_start = Instant::now();
    for i in 0..args.vectors {
        index.insert(DenseVec::random(args.dim))?;
        if args.verbose && (i + 1) % 1000 == 0 {
            println!(" Inserted {} vectors", i + 1);
        }
    }
    let insert_time = insert_start.elapsed();
    println!(
        "✓ Insert complete ({:.2}s, {:.0} vec/s)",
        insert_time.as_secs_f64(),
        args.vectors as f64 / insert_time.as_secs_f64()
    );
    println!();
    // Build IVF if enabled (clustering happens after bulk insert).
    if args.ivf {
        println!("🏗️ Building IVF index...");
        let build_start = Instant::now();
        index.rebuild_ivf()?;
        let build_time = build_start.elapsed();
        println!("✓ IVF build complete ({:.2}s)", build_time.as_secs_f64());
        println!();
    }
    // Print index stats
    let stats = index.stats();
    println!("📊 Index Statistics:");
    println!(" Active vectors: {}", stats.active_vectors);
    println!(" IVF clusters: {}", stats.ivf_clusters);
    println!();
    // Run queries
    println!("🔍 Running {} queries...", args.queries);
    let query_start = Instant::now();
    let mut latencies: Vec<u64> = Vec::with_capacity(args.queries);
    let mut total_results = 0usize;
    for i in 0..args.queries {
        let q = DenseVec::random(args.dim);
        // With gating enabled, exercise the gate with a random coherence
        // value; otherwise pass 1.0 so every query goes through.
        let coherence = if args.gate {
            rand::random::<f32>()
        } else {
            1.0
        };
        let start = Instant::now();
        let results = index.search(&q, args.top_k, coherence)?;
        let latency_us = start.elapsed().as_micros() as u64;
        latencies.push(latency_us);
        total_results += results.len();
        // Log query
        logger.log_vector(
            "search",
            args.dim,
            stats.active_vectors,
            1,
            args.top_k,
            args.ivf,
            coherence,
            latency_us,
            results.len(),
        )?;
        if args.verbose && (i + 1) % 100 == 0 {
            println!(" Completed {} queries", i + 1);
        }
    }
    let query_time = query_start.elapsed();
    println!(
        "✓ Queries complete ({:.2}s, {:.0} q/s)",
        query_time.as_secs_f64(),
        args.queries as f64 / query_time.as_secs_f64()
    );
    println!();
    // Compute statistics. Guarded: with `--queries 0` the percentile indexing
    // below would previously panic on an empty latency vector.
    if latencies.is_empty() {
        println!("⚠️ No queries executed; skipping latency statistics.");
    } else {
        // u64 latencies have a total order and duplicates are
        // indistinguishable, so the faster unstable sort is safe here.
        latencies.sort_unstable();
        let p50 = latencies[latencies.len() / 2];
        let p95 = latencies[latencies.len() * 95 / 100];
        let p99 = latencies[latencies.len() * 99 / 100];
        let avg = latencies.iter().sum::<u64>() / latencies.len() as u64;
        let max = *latencies.last().unwrap();
        println!("╔══════════════════════════════════════════════════════════════╗");
        println!("║ Benchmark Results ║");
        println!("╚══════════════════════════════════════════════════════════════╝");
        println!();
        println!("⏱️ Latency (microseconds):");
        println!(" Average: {}µs", avg);
        println!(" P50: {}µs", p50);
        println!(" P95: {}µs", p95);
        println!(" P99: {}µs", p99);
        println!(" Max: {}µs", max);
        println!();
        println!("📈 Throughput:");
        println!(
            " Queries/sec: {:.0}",
            args.queries as f64 / query_time.as_secs_f64()
        );
        println!(
            " Insert/sec: {:.0}",
            args.vectors as f64 / insert_time.as_secs_f64()
        );
        println!();
        println!("📊 Results:");
        println!(" Total results: {}", total_results);
        println!(
            " Avg results: {:.2}",
            total_results as f64 / args.queries as f64
        );
        if args.gate {
            // NOTE(review): heuristic — a sub-10µs query is counted as gated
            // (assumed early-rejected); confirm against CoherenceGate's
            // actual behavior.
            let gated = latencies.iter().filter(|&&l| l < 10).count();
            println!(
                " Gated queries: {:.1}%",
                gated as f64 / args.queries as f64 * 100.0
            );
        }
    }
    // Save index
    println!();
    let index_path = "data/vector_index.bin";
    std::fs::create_dir_all("data")?;
    index.save_to_file(index_path)?;
    println!("💾 Index saved to: {}", index_path);
    // Flush logs
    logger.flush()?;
    println!("📝 Results saved to: {}", args.output);
    Ok(())
}

View File

@@ -0,0 +1,197 @@
//! WASM Solver Benchmark — Compares native vs WASM AGI solver performance.
//!
//! Runs the same acceptance test configuration through:
//! 1. Native Rust solver (benchmarks crate)
//! 2. Reference metrics comparison
//!
//! Usage:
//! cargo run --bin wasm-solver-bench [-- --holdout <N> --training <N> --cycles <N>]
use clap::Parser;
use ruvector_benchmarks::acceptance_test::{run_acceptance_test_mode, AblationMode, HoldoutConfig};
use std::time::Instant;
#[derive(Parser)]
#[command(name = "wasm-solver-bench")]
// CLI flags for the WASM-vs-native solver benchmark.
// NOTE(review): fields deliberately carry `//` comments, not `///` doc
// comments — the latter would become clap help text and change --help output.
struct Args {
    // Number of holdout puzzles used for evaluation.
    #[arg(long, default_value = "50")]
    holdout: usize,
    // Number of training puzzles per learning cycle.
    #[arg(long, default_value = "50")]
    training: usize,
    // Number of train/evaluate cycles to run.
    #[arg(long, default_value = "3")]
    cycles: usize,
    // Per-puzzle solver step budget.
    #[arg(long, default_value = "200")]
    budget: usize,
}
/// Entry point: run the native AGI-solver acceptance test in all three
/// ablation modes (baseline, compiler-only, full learned), print a results
/// table, then emit the reference metrics a WASM build is expected to match.
fn main() {
    let args = Args::parse();
    println!("╔══════════════════════════════════════════════════════════════╗");
    println!("║ WASM vs Native AGI Solver Benchmark ║");
    println!("╚══════════════════════════════════════════════════════════════╝");
    println!();
    println!(
        " Config: holdout={}, training={}, cycles={}, budget={}",
        args.holdout, args.training, args.cycles, args.budget
    );
    println!();
    // Shared configuration: fixed seeds keep the holdout/training sets
    // identical across the three modes so results are directly comparable.
    let config = HoldoutConfig {
        holdout_size: args.holdout,
        training_per_cycle: args.training,
        cycles: args.cycles,
        step_budget: args.budget,
        holdout_seed: 0xDEAD_BEEF,
        training_seed: 42,
        noise_rate: 0.25,
        min_accuracy: 0.50,
        min_dimensions_improved: 1,
        verbose: false,
    };
    // ── Native Mode A (Baseline) ──────────────────────────────────
    println!(" Running Native Mode A (baseline)...");
    let t0 = Instant::now();
    // `expect` instead of bare `unwrap`: if a run errors out, the panic
    // message now says which mode failed.
    let native_a = run_acceptance_test_mode(&config, &AblationMode::Baseline)
        .expect("Mode A (baseline) acceptance test failed to run");
    let native_a_ms = t0.elapsed().as_millis();
    // ── Native Mode B (Compiler) ──────────────────────────────────
    println!(" Running Native Mode B (compiler)...");
    let t0 = Instant::now();
    let native_b = run_acceptance_test_mode(&config, &AblationMode::CompilerOnly)
        .expect("Mode B (compiler) acceptance test failed to run");
    let native_b_ms = t0.elapsed().as_millis();
    // ── Native Mode C (Full learned) ──────────────────────────────
    println!(" Running Native Mode C (full learned)...");
    let t0 = Instant::now();
    let native_c = run_acceptance_test_mode(&config, &AblationMode::Full)
        .expect("Mode C (full learned) acceptance test failed to run");
    let native_c_ms = t0.elapsed().as_millis();
    println!();
    println!(" ┌────────────────────────────────────────────────────────┐");
    println!(" │ NATIVE SOLVER RESULTS │");
    println!(" ├────────────────────────────────────────────────────────┤");
    println!(
        "{:<12} {:>8} {:>10} {:>10} {:>8} {:>8}",
        "Mode", "Acc%", "Cost", "Noise%", "Time", "Pass"
    );
    println!("{}", "-".repeat(54));
    // One table row per mode, taken from each run's final cycle.
    for (label, result, ms) in [
        ("A baseline", &native_a, native_a_ms),
        ("B compiler", &native_b, native_b_ms),
        ("C learned", &native_c, native_c_ms),
    ] {
        let last = result
            .result
            .cycles
            .last()
            .expect("acceptance test produced no cycles");
        println!(
            "{:<12} {:>6.1}% {:>9.1} {:>8.1}% {:>5}ms {:>7}",
            label,
            last.holdout_accuracy * 100.0,
            last.holdout_cost_per_solve,
            last.holdout_noise_accuracy * 100.0,
            ms,
            if result.result.passed { "PASS" } else { "FAIL" }
        );
    }
    println!(" └────────────────────────────────────────────────────────┘");
    println!();
    // ── WASM Reference Metrics ────────────────────────────────────
    // Since we can't run WASM directly from Rust without a runtime,
    // we output the reference metrics that the WASM module should match.
    println!(" ┌────────────────────────────────────────────────────────┐");
    println!(" │ WASM REFERENCE METRICS (for validation) │");
    println!(" ├────────────────────────────────────────────────────────┤");
    println!(" │ │");
    println!(" │ The rvf-solver-wasm module should produce: │");
    println!(" │ │");
    let total_ms = native_a_ms + native_b_ms + native_c_ms;
    println!(
        " │ Native total time: {}ms │",
        total_ms
    );
    println!(
        " │ WASM expected: ~{}ms (2-5x native) │",
        total_ms * 3
    );
    println!(" │ │");
    // PolicyKernel convergence check
    println!(" │ Mode C PolicyKernel: │");
    println!(
        " │ Context buckets: {}",
        native_c.policy_context_buckets
    );
    println!(
        " │ Early commit rate: {:.2}% │",
        native_c.early_commit_rate * 100.0
    );
    println!(
        " │ Compiler hits: {}",
        native_c.compiler_hits
    );
    println!(" │ │");
    // Thompson Sampling convergence: Mode C should learn differently across contexts
    let c_unique_modes: std::collections::HashSet<&str> = native_c
        .skip_mode_distribution
        .values()
        .flat_map(|m| m.keys())
        .map(|s| s.as_str())
        .collect();
    println!(" │ Thompson Sampling convergence: │");
    println!(
        " │ Unique skip modes: {} (need >=2) │",
        c_unique_modes.len()
    );
    println!(" │ Skip distribution: │");
    for (bucket, dist) in &native_c.skip_mode_distribution {
        // `.max(1)` guards the percentage division against an empty bucket.
        let total = dist.values().sum::<usize>().max(1);
        let parts: Vec<String> = dist
            .iter()
            .map(|(m, c)| format!("{}:{:.0}%", m, *c as f64 / total as f64 * 100.0))
            .collect();
        if !parts.is_empty() {
            println!("{:<16} {}", bucket, parts.join(" "));
        }
    }
    println!(" │ │");
    // Ablation assertions
    let last_a = native_a
        .result
        .cycles
        .last()
        .expect("mode A produced no cycles");
    let last_b = native_b
        .result
        .cycles
        .last()
        .expect("mode B produced no cycles");
    let last_c = native_c
        .result
        .cycles
        .last()
        .expect("mode C produced no cycles");
    // Guard the relative-cost division when mode A reports zero cost.
    let cost_decrease = if last_a.holdout_cost_per_solve > 0.0 {
        (1.0 - last_b.holdout_cost_per_solve / last_a.holdout_cost_per_solve) * 100.0
    } else {
        0.0
    };
    let robustness_gain = (last_c.holdout_noise_accuracy - last_b.holdout_noise_accuracy) * 100.0;
    println!(" │ Ablation assertions: │");
    println!(
        " │ B vs A cost decrease: {:.1}% (need >=15%) │",
        cost_decrease
    );
    println!(
        " │ C vs B robustness: {:.1}% (need >=10%) │",
        robustness_gain
    );
    println!(" │ │");
    println!(" │ WASM module must match these learning characteristics │");
    println!(" │ (exact values may differ due to float precision) │");
    println!(" └────────────────────────────────────────────────────────┘");
    println!();
    // Final summary
    let all_passed = native_a.result.passed && native_b.result.passed && native_c.result.passed;
    if all_passed {
        println!(" NATIVE BENCHMARK: ALL MODES PASSED");
    } else {
        println!(" NATIVE BENCHMARK: SOME MODES FAILED");
    }
    println!(" Binary size: rvf-solver-wasm.wasm ~160 KB");
    println!();
}

View File

@@ -0,0 +1,960 @@
//! Intelligence Metrics Module
//!
//! Measures cognitive capabilities, reasoning quality, and learning indicators
//! for agent evaluation based on established AI benchmarking methodologies.
//!
//! Key metrics tracked:
//! - Reasoning quality (logical coherence, constraint satisfaction)
//! - Learning efficiency (regret curves, sample efficiency)
//! - Working memory (context utilization, information integration)
//! - Tool use proficiency (appropriate selection, effective utilization)
//! - Meta-cognitive awareness (self-correction, uncertainty estimation)
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
/// Intelligence assessment result
///
/// Aggregates every metric family produced by
/// `IntelligenceCalculator::calculate`, plus the raw counters it was derived
/// from so downstream reports can audit or recompute scores.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct IntelligenceAssessment {
    /// Overall intelligence score (0-100)
    pub overall_score: f64,
    /// Individual capability scores
    pub capabilities: CapabilityScores,
    /// Reasoning quality metrics
    pub reasoning: ReasoningMetrics,
    /// Learning efficiency metrics
    pub learning: LearningMetrics,
    /// Tool use proficiency
    pub tool_use: ToolUseMetrics,
    /// Meta-cognitive indicators
    pub meta_cognition: MetaCognitiveMetrics,
    /// Cost efficiency metrics
    pub cost: CostMetrics,
    /// Robustness under noise
    pub robustness: RobustnessMetrics,
    /// Raw performance data
    pub raw_data: RawMetrics,
}
/// Capability scores across dimensions
///
/// Each score is on a 0-100 scale. `Default` (derived) yields all zeros —
/// identical to the previous hand-written impl, which was redundant since
/// every field is an `f64` whose type default is `0.0`.
#[derive(Clone, Debug, Serialize, Deserialize, Default)]
pub struct CapabilityScores {
    /// Temporal reasoning (date inference, calendar math)
    pub temporal_reasoning: f64,
    /// Constraint satisfaction (multi-constraint solving)
    pub constraint_satisfaction: f64,
    /// Information retrieval (semantic search, recall)
    pub information_retrieval: f64,
    /// Pattern recognition (learning from examples)
    pub pattern_recognition: f64,
    /// Planning and sequencing
    pub planning: f64,
    /// Error recovery and adaptation
    pub adaptation: f64,
}
impl CapabilityScores {
    /// Weighted average of the six capability scores.
    ///
    /// `weights` pairs positionally with [temporal, constraint, retrieval,
    /// pattern, planning, adaptation]. Returns 0.0 when all weights are zero
    /// to avoid dividing by zero.
    pub fn weighted_average(&self, weights: &[f64; 6]) -> f64 {
        let denom: f64 = weights.iter().sum();
        if denom == 0.0 {
            return 0.0;
        }
        let values = [
            self.temporal_reasoning,
            self.constraint_satisfaction,
            self.information_retrieval,
            self.pattern_recognition,
            self.planning,
            self.adaptation,
        ];
        let mut weighted = 0.0;
        for (value, weight) in values.iter().zip(weights.iter()) {
            weighted += value * weight;
        }
        weighted / denom
    }
}
/// Reasoning quality metrics
///
/// All values are fractions in [0, 1]. `Default` (derived) is all zeros —
/// identical to the previous hand-written impl, which was redundant since
/// every field is an `f64` whose type default is `0.0`.
#[derive(Clone, Debug, Serialize, Deserialize, Default)]
pub struct ReasoningMetrics {
    /// Logical coherence (steps follow logically)
    pub logical_coherence: f64,
    /// Constraint satisfaction rate
    pub constraint_satisfaction_rate: f64,
    /// Solution optimality (vs. best possible)
    pub solution_optimality: f64,
    /// Reasoning efficiency (steps to solution)
    pub reasoning_efficiency: f64,
    /// Error rate in logical steps
    pub error_rate: f64,
}
/// Learning efficiency metrics
///
/// `Default` (derived) is all zeros — identical to the previous
/// hand-written impl, which was redundant since every field is an `f64`
/// whose type default is `0.0`.
#[derive(Clone, Debug, Serialize, Deserialize, Default)]
pub struct LearningMetrics {
    /// Sample efficiency (performance vs. examples seen)
    pub sample_efficiency: f64,
    /// Regret trajectory (sublinear indicator)
    pub regret_sublinearity: f64,
    /// Transfer learning capability
    pub transfer_capability: f64,
    /// Learning rate (improvement per episode)
    pub learning_rate: f64,
    /// Generalization ability
    pub generalization: f64,
}
/// Tool use proficiency metrics
///
/// `Default` (derived) is all zeros — identical to the previous
/// hand-written impl, which was redundant since every field is an `f64`
/// whose type default is `0.0`.
#[derive(Clone, Debug, Serialize, Deserialize, Default)]
pub struct ToolUseMetrics {
    /// Tool selection appropriateness
    pub selection_appropriateness: f64,
    /// Tool utilization effectiveness
    pub utilization_effectiveness: f64,
    /// Tool composition (combining tools)
    pub composition_ability: f64,
    /// Tool discovery (finding needed tools)
    pub discovery_ability: f64,
}
/// Meta-cognitive metrics
///
/// `Default` (derived) is all zeros — identical to the previous
/// hand-written impl, which was redundant since every field is an `f64`
/// whose type default is `0.0`.
#[derive(Clone, Debug, Serialize, Deserialize, Default)]
pub struct MetaCognitiveMetrics {
    /// Self-correction rate
    pub self_correction_rate: f64,
    /// Uncertainty calibration (confidence vs. accuracy)
    pub uncertainty_calibration: f64,
    /// Strategy adaptation
    pub strategy_adaptation: f64,
    /// Progress monitoring accuracy
    pub progress_monitoring: f64,
}
/// Cost efficiency metrics — first-class IQ dimension
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct CostMetrics {
    /// Steps per correct solve (lower = better)
    pub steps_per_solve: f64,
    /// Tool calls per correct solve (lower = better)
    pub tools_per_solve: f64,
    /// Cost efficiency score (0-1, higher = cheaper)
    pub cost_efficiency: f64,
    /// Cost trend over episodes (positive = improving)
    pub cost_trend: f64,
}
impl Default for CostMetrics {
    // Defaults are deliberately pessimistic sentinels rather than zeros:
    // with no data, cost reads as expensive (100 steps / 10 tool calls per
    // solve) so an empty run cannot score as "efficient". This is why the
    // impl stays hand-written instead of derived.
    fn default() -> Self {
        Self {
            steps_per_solve: 100.0,
            tools_per_solve: 10.0,
            cost_efficiency: 0.0,
            cost_trend: 0.0,
        }
    }
}
/// Robustness under adversarial conditions — first-class IQ dimension
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct RobustnessMetrics {
    /// Accuracy on noise-injected tasks
    pub noise_accuracy: f64,
    /// Accuracy drop from clean to noisy (lower = more robust)
    pub noise_degradation: f64,
    /// Per-episode accuracy consistency (higher = steadier)
    pub consistency: f64,
    /// Composite robustness score (0-1)
    pub robustness_score: f64,
}
impl Default for RobustnessMetrics {
    // Worst-case prior rather than all zeros: with no data, degradation
    // defaults to the maximum (1.0), so an empty run cannot read as robust.
    // This non-zero default is why the impl stays hand-written.
    fn default() -> Self {
        Self {
            noise_accuracy: 0.0,
            noise_degradation: 1.0,
            consistency: 0.0,
            robustness_score: 0.0,
        }
    }
}
/// Raw metrics from benchmarks
///
/// `Default` (derived) produces zeroed counters and empty collections —
/// identical to the previous hand-written impl (`0` for integers,
/// `HashMap::new()` / `Vec::new()` for containers), which was therefore
/// redundant.
#[derive(Clone, Debug, Serialize, Deserialize, Default)]
pub struct RawMetrics {
    /// Total tasks attempted
    pub tasks_attempted: usize,
    /// Tasks completed successfully
    pub tasks_completed: usize,
    /// Tasks with correct solutions
    pub tasks_correct: usize,
    /// Total steps taken
    pub total_steps: usize,
    /// Total tool calls
    pub total_tool_calls: usize,
    /// Total latency in ms
    pub total_latency_ms: u64,
    /// Performance by difficulty
    pub by_difficulty: HashMap<u8, DifficultyStats>,
    /// Episode-level metrics
    pub episodes: Vec<EpisodeMetrics>,
    /// Tasks attempted under noise injection
    pub noise_tasks_attempted: usize,
    /// Tasks correct under noise injection
    pub noise_tasks_correct: usize,
    /// Policy violations (contradictions, budget overruns)
    pub policy_violations: usize,
    /// Solved-but-incorrect count (contradiction rate numerator)
    pub contradictions: usize,
    /// Successful rollbacks from noisy to clean
    pub rollback_successes: usize,
    /// Attempted rollbacks from noisy to clean
    pub rollback_attempts: usize,
}
/// Stats per difficulty level
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct DifficultyStats {
    /// Tasks attempted at this difficulty level.
    pub attempted: usize,
    /// Tasks that produced a solution (correct or not — see `correct`).
    pub completed: usize,
    /// Tasks solved correctly.
    pub correct: usize,
    /// Mean steps per attempted task at this difficulty.
    pub avg_steps: f64,
}
/// Per-episode metrics
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct EpisodeMetrics {
    /// Episode index (assigned by the caller).
    pub episode: usize,
    /// Fraction of tasks answered correctly in this episode.
    pub accuracy: f64,
    /// Reward obtained this episode.
    pub reward: f64,
    // NOTE(review): presumably oracle reward minus obtained reward (see
    // `IntelligenceCalculator::oracle_reward`) — confirm against the caller.
    /// Regret for this episode.
    pub regret: f64,
    /// Running total of `regret` up to and including this episode.
    pub cumulative_regret: f64,
}
/// Intelligence metrics calculator
///
/// Stateless scorer: `calculate` turns a `RawMetrics` snapshot into a full
/// `IntelligenceAssessment`.
pub struct IntelligenceCalculator {
    /// Weights for capability scoring
    // Order matches the six CapabilityScores fields: temporal, constraint,
    // retrieval, pattern, planning, adaptation (see `weighted_average`).
    pub capability_weights: [f64; 6],
    /// Baseline for comparison
    pub baseline_accuracy: f64,
    /// Oracle performance for regret calculation
    pub oracle_reward: f64,
}
impl Default for IntelligenceCalculator {
    // Equal capability weights, a coin-flip (0.5) accuracy baseline, and a
    // 100-point oracle reward.
    fn default() -> Self {
        Self {
            capability_weights: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
            baseline_accuracy: 0.5,
            oracle_reward: 100.0,
        }
    }
}
impl IntelligenceCalculator {
    /// Calculate intelligence assessment from raw metrics
    ///
    /// Computes each metric family independently, combines them through
    /// `calculate_overall_score`, and returns the assessment together with a
    /// clone of the raw counters for later audit.
    pub fn calculate(&self, raw: &RawMetrics) -> IntelligenceAssessment {
        let capabilities = self.calculate_capabilities(raw);
        let reasoning = self.calculate_reasoning(raw);
        let learning = self.calculate_learning(raw);
        let tool_use = self.calculate_tool_use(raw);
        let meta_cognition = self.calculate_meta_cognition(raw);
        let cost = self.calculate_cost(raw);
        let robustness = self.calculate_robustness(raw);
        // Overall score: three equal pillars — graded outcomes, cost, robustness
        let overall_score = self.calculate_overall_score(
            &capabilities,
            &reasoning,
            &learning,
            &tool_use,
            &meta_cognition,
            &cost,
            &robustness,
        );
        IntelligenceAssessment {
            overall_score,
            capabilities,
            reasoning,
            learning,
            tool_use,
            meta_cognition,
            cost,
            robustness,
            raw_data: raw.clone(),
        }
    }
fn calculate_capabilities(&self, raw: &RawMetrics) -> CapabilityScores {
let base_accuracy = if raw.tasks_attempted > 0 {
raw.tasks_correct as f64 / raw.tasks_attempted as f64
} else {
0.0
};
// Temporal reasoning: accuracy on time-based tasks
let temporal_reasoning = base_accuracy * 100.0;
// Constraint satisfaction: correct solutions
let constraint_satisfaction = base_accuracy * 100.0;
// Information retrieval: based on steps to solution
let avg_steps = if raw.tasks_attempted > 0 {
raw.total_steps as f64 / raw.tasks_attempted as f64
} else {
100.0
};
let information_retrieval = (100.0 - avg_steps).max(0.0).min(100.0);
// Pattern recognition: performance improvement across difficulties
let pattern_recognition = self.calculate_pattern_recognition(raw);
// Planning: efficiency of tool use
let avg_tools = if raw.tasks_attempted > 0 {
raw.total_tool_calls as f64 / raw.tasks_attempted as f64
} else {
0.0
};
let planning = if avg_tools > 0.0 && avg_tools <= 2.0 {
100.0 * (1.0 - (avg_tools - 1.0).abs() / 2.0)
} else {
50.0
};
// Adaptation: improvement over episodes
let adaptation = self.calculate_adaptation(raw);
CapabilityScores {
temporal_reasoning,
constraint_satisfaction,
information_retrieval,
pattern_recognition,
planning,
adaptation,
}
}
fn calculate_pattern_recognition(&self, raw: &RawMetrics) -> f64 {
if raw.by_difficulty.len() < 2 {
return 50.0;
}
// Check if harder problems are still solvable
let mut difficulties: Vec<_> = raw.by_difficulty.keys().copied().collect();
difficulties.sort();
let mut scores = Vec::new();
for d in &difficulties {
if let Some(stats) = raw.by_difficulty.get(d) {
if stats.attempted > 0 {
scores.push(stats.correct as f64 / stats.attempted as f64);
}
}
}
if scores.is_empty() {
return 50.0;
}
// Average accuracy across difficulties
let avg: f64 = scores.iter().sum::<f64>() / scores.len() as f64;
avg * 100.0
}
fn calculate_adaptation(&self, raw: &RawMetrics) -> f64 {
if raw.episodes.len() < 3 {
return 50.0;
}
// Check if accuracy improves over episodes
let first_half: f64 = raw.episodes[..raw.episodes.len() / 2]
.iter()
.map(|e| e.accuracy)
.sum::<f64>()
/ (raw.episodes.len() / 2) as f64;
let second_half: f64 = raw.episodes[raw.episodes.len() / 2..]
.iter()
.map(|e| e.accuracy)
.sum::<f64>()
/ (raw.episodes.len() - raw.episodes.len() / 2) as f64;
let improvement = second_half - first_half;
// Scale: -0.2 to +0.2 improvement maps to 0-100
((improvement + 0.2) / 0.4 * 100.0).max(0.0).min(100.0)
}
fn calculate_reasoning(&self, raw: &RawMetrics) -> ReasoningMetrics {
let constraint_satisfaction_rate = if raw.tasks_attempted > 0 {
raw.tasks_correct as f64 / raw.tasks_attempted as f64
} else {
0.0
};
let avg_steps = if raw.tasks_attempted > 0 {
raw.total_steps as f64 / raw.tasks_attempted as f64
} else {
100.0
};
// Reasoning efficiency: inverse of steps (normalized)
let reasoning_efficiency = (100.0 - avg_steps).max(0.0).min(100.0) / 100.0;
// Logical coherence: based on completion rate vs correct rate
let completion_rate = if raw.tasks_attempted > 0 {
raw.tasks_completed as f64 / raw.tasks_attempted as f64
} else {
0.0
};
let logical_coherence = if completion_rate > 0.0 {
constraint_satisfaction_rate / completion_rate
} else {
0.0
};
ReasoningMetrics {
logical_coherence,
constraint_satisfaction_rate,
solution_optimality: constraint_satisfaction_rate,
reasoning_efficiency,
error_rate: 1.0 - constraint_satisfaction_rate,
}
}
fn calculate_learning(&self, raw: &RawMetrics) -> LearningMetrics {
let mut learning = LearningMetrics::default();
if raw.episodes.is_empty() {
return learning;
}
// Sample efficiency: accuracy per episode
learning.sample_efficiency =
raw.episodes.iter().map(|e| e.accuracy).sum::<f64>() / raw.episodes.len() as f64;
// Regret sublinearity: check if cumulative regret grows sublinearly
// True sublinearity means R_k/k → 0 as k → ∞ (regret per episode decreasing)
if raw.episodes.len() >= 5 {
// Calculate regret trend using linear regression
let n = raw.episodes.len() as f64;
let mut sum_x = 0.0;
let mut sum_y = 0.0;
let mut sum_xy = 0.0;
let mut sum_xx = 0.0;
for (i, ep) in raw.episodes.iter().enumerate() {
let x = (i + 1) as f64;
let y = ep.regret;
sum_x += x;
sum_y += y;
sum_xy += x * y;
sum_xx += x * x;
}
let slope = (n * sum_xy - sum_x * sum_y) / (n * sum_xx - sum_x * sum_x);
// Negative slope = decreasing regret = sublinear
// Transform: slope < 0 → sublinearity > 0
if slope < 0.0 {
// Stronger negative slope = better sublinearity (cap at 1.0)
learning.regret_sublinearity = (-slope / 10.0).min(1.0);
}
// Also check cumulative average
let last = raw.episodes.last().unwrap();
let avg_regret = last.cumulative_regret / n;
let first_half_avg = raw
.episodes
.iter()
.take(raw.episodes.len() / 2)
.map(|e| e.regret)
.sum::<f64>()
/ (n / 2.0);
// If second half has lower per-episode regret, that's sublinear
if avg_regret < first_half_avg && learning.regret_sublinearity == 0.0 {
learning.regret_sublinearity =
((first_half_avg - avg_regret) / first_half_avg).max(0.0);
}
}
// Learning rate: improvement in accuracy over episodes
if raw.episodes.len() >= 2 {
let first_acc = raw.episodes[0].accuracy;
let last_acc = raw.episodes.last().unwrap().accuracy;
learning.learning_rate = (last_acc - first_acc + 1.0) / 2.0;
}
// Generalization: consistency across difficulties
if raw.by_difficulty.len() >= 2 {
let accuracies: Vec<f64> = raw
.by_difficulty
.values()
.filter(|s| s.attempted > 0)
.map(|s| s.correct as f64 / s.attempted as f64)
.collect();
if !accuracies.is_empty() {
let mean = accuracies.iter().sum::<f64>() / accuracies.len() as f64;
let variance = accuracies.iter().map(|a| (a - mean).powi(2)).sum::<f64>()
/ accuracies.len() as f64;
let std_dev = variance.sqrt();
// Lower variance = better generalization
learning.generalization = (1.0 - std_dev).max(0.0);
}
}
learning
}
fn calculate_tool_use(&self, raw: &RawMetrics) -> ToolUseMetrics {
let avg_tools = if raw.tasks_attempted > 0 {
raw.total_tool_calls as f64 / raw.tasks_attempted as f64
} else {
0.0
};
// Selection appropriateness: using tools when helpful
let accuracy = if raw.tasks_attempted > 0 {
raw.tasks_correct as f64 / raw.tasks_attempted as f64
} else {
0.0
};
// Effectiveness: accuracy when tools are used
let utilization_effectiveness = accuracy;
// Appropriateness: not overusing tools
let selection_appropriateness = if avg_tools > 0.0 {
(accuracy / avg_tools.min(2.0)).min(1.0)
} else {
0.5
};
ToolUseMetrics {
selection_appropriateness,
utilization_effectiveness,
composition_ability: avg_tools.min(1.0), // Using multiple tools
discovery_ability: accuracy, // Finding solutions
}
}
fn calculate_meta_cognition(&self, raw: &RawMetrics) -> MetaCognitiveMetrics {
// Self-correction: completed but not correct -> corrected
let completed_but_wrong = raw.tasks_completed.saturating_sub(raw.tasks_correct);
let self_correction_rate = if completed_but_wrong > 0 {
0.0 // No self-correction if still wrong
} else if raw.tasks_completed > 0 {
1.0 // All completed are correct
} else {
0.5
};
// Strategy adaptation: improvement over episodes
let strategy_adaptation = if raw.episodes.len() >= 3 {
let trend: f64 = raw
.episodes
.windows(2)
.map(|w| {
if w[1].accuracy > w[0].accuracy {
1.0
} else {
0.0
}
})
.sum::<f64>();
trend / (raw.episodes.len() - 1) as f64
} else {
0.5
};
MetaCognitiveMetrics {
self_correction_rate,
uncertainty_calibration: 0.5, // Would need confidence scores
strategy_adaptation,
progress_monitoring: strategy_adaptation, // Similar metric
}
}
/// Derive cost metrics: steps/tools per solved task plus an efficiency score
/// and a cost trend over the episode history.
fn calculate_cost(&self, raw: &RawMetrics) -> CostMetrics {
    // Steps per correct solve; fall back to the raw step total when nothing
    // was solved, and a pessimistic constant when nothing was attempted.
    let steps_per_solve = if raw.tasks_correct > 0 {
        raw.total_steps as f64 / raw.tasks_correct as f64
    } else if raw.tasks_attempted > 0 {
        raw.total_steps as f64
    } else {
        100.0
    };
    let tools_per_solve = match raw.tasks_correct {
        0 => 10.0,
        n => raw.total_tool_calls as f64 / n as f64,
    };
    // Efficiency: 1.0 at <=5 steps/solve, 0.0 at >=100 steps/solve
    let cost_efficiency = (1.0 - (steps_per_solve - 5.0) / 95.0).clamp(0.0, 1.0);
    // Cost trend: relative accuracy change between the first and second half
    // of the episode history; improving accuracy means effective cost per
    // solve is dropping. Requires at least 4 episodes.
    let cost_trend = if raw.episodes.len() < 4 {
        0.0
    } else {
        let (early, late) = raw.episodes.split_at(raw.episodes.len() / 2);
        let early_acc = early.iter().map(|e| e.accuracy).sum::<f64>() / early.len() as f64;
        let late_acc = late.iter().map(|e| e.accuracy).sum::<f64>() / late.len() as f64;
        if early_acc > 0.01 {
            (late_acc - early_acc) / early_acc
        } else {
            0.0
        }
    };
    CostMetrics {
        steps_per_solve,
        tools_per_solve,
        cost_efficiency,
        cost_trend,
    }
}
/// Derive robustness metrics by comparing accuracy on noise-injected tasks
/// against accuracy on clean tasks, plus episode-to-episode consistency.
fn calculate_robustness(&self, raw: &RawMetrics) -> RobustnessMetrics {
    let noise_accuracy = match raw.noise_tasks_attempted {
        0 => 0.5, // no noise data -> neutral prior
        n => raw.noise_tasks_correct as f64 / n as f64,
    };
    // Clean-task counters are the totals minus the noise-task counters.
    let clean_attempted = raw
        .tasks_attempted
        .saturating_sub(raw.noise_tasks_attempted);
    let clean_correct = raw.tasks_correct.saturating_sub(raw.noise_tasks_correct);
    let clean_accuracy = match clean_attempted {
        0 => 0.0,
        n => clean_correct as f64 / n as f64,
    };
    // Only penalize when noise actually reduces accuracy.
    let noise_degradation = (clean_accuracy - noise_accuracy).max(0.0);
    // Consistency: 1 minus the standard deviation of per-episode accuracy,
    // floored at 0; neutral 0.5 prior with fewer than 2 episodes.
    let consistency = if raw.episodes.len() < 2 {
        0.5
    } else {
        let n = raw.episodes.len() as f64;
        let mean = raw.episodes.iter().map(|e| e.accuracy).sum::<f64>() / n;
        let variance = raw
            .episodes
            .iter()
            .map(|e| (e.accuracy - mean).powi(2))
            .sum::<f64>()
            / n;
        (1.0 - variance.sqrt()).max(0.0)
    };
    // Weighted blend of noisy accuracy, degradation resistance, consistency.
    let robustness_score =
        noise_accuracy * 0.4 + (1.0 - noise_degradation.min(1.0)) * 0.3 + consistency * 0.3;
    RobustnessMetrics {
        noise_accuracy,
        noise_degradation,
        consistency,
        robustness_score,
    }
}
/// Blend every sub-metric group into the single overall score (0-100 scale).
///
/// The weights below form three roughly equal pillars: graded outcomes
/// (~0.34 = capabilities 0.12 + reasoning 0.10 + learning 0.06 + tool use
/// 0.03 + meta-cognition 0.03), cost 0.33, and robustness 0.33.
fn calculate_overall_score(
    &self,
    capabilities: &CapabilityScores,
    reasoning: &ReasoningMetrics,
    learning: &LearningMetrics,
    tool_use: &ToolUseMetrics,
    meta_cognition: &MetaCognitiveMetrics,
    cost: &CostMetrics,
    robustness: &RobustnessMetrics,
) -> f64 {
    // Sub-scores (0-100 scale)
    let cap_score = capabilities.weighted_average(&self.capability_weights);
    // Each group is the unweighted mean of its unit-interval components x100.
    // NOTE(review): reasoning.error_rate is deliberately not folded in here —
    // confirm it is meant to be reporting-only.
    let reasoning_score = (reasoning.logical_coherence
        + reasoning.constraint_satisfaction_rate
        + reasoning.solution_optimality
        + reasoning.reasoning_efficiency)
        / 4.0
        * 100.0;
    let learning_score = (learning.sample_efficiency
        + learning.regret_sublinearity
        + learning.learning_rate
        + learning.generalization)
        / 4.0
        * 100.0;
    let tool_score = (tool_use.selection_appropriateness
        + tool_use.utilization_effectiveness
        + tool_use.composition_ability
        + tool_use.discovery_ability)
        / 4.0
        * 100.0;
    // uncertainty_calibration is excluded (it is a fixed 0.5 placeholder).
    let meta_score = (meta_cognition.self_correction_rate
        + meta_cognition.strategy_adaptation
        + meta_cognition.progress_monitoring)
        / 3.0
        * 100.0;
    let cost_score = cost.cost_efficiency * 100.0;
    let robustness_score = robustness.robustness_score * 100.0;
    // Three equal pillars: graded outcomes (~0.34), cost (~0.33), robustness (~0.33)
    // Graded outcomes = capabilities + reasoning + learning + tool + meta
    cap_score * 0.12
        + reasoning_score * 0.10
        + learning_score * 0.06
        + tool_score * 0.03
        + meta_score * 0.03
        + cost_score * 0.33
        + robustness_score * 0.33
}
}
/// Print a formatted intelligence report
pub fn print_intelligence_report(assessment: &IntelligenceAssessment) {
println!("╔══════════════════════════════════════════════════════════════╗");
println!("║ Intelligence Assessment Report ║");
println!("╚══════════════════════════════════════════════════════════════╝");
println!();
println!(
"🧠 Overall Intelligence Score: {:.1}/100",
assessment.overall_score
);
println!();
println!("📊 Capability Scores:");
println!(
" Temporal Reasoning: {:5.1}",
assessment.capabilities.temporal_reasoning
);
println!(
" Constraint Satisfaction:{:5.1}",
assessment.capabilities.constraint_satisfaction
);
println!(
" Information Retrieval: {:5.1}",
assessment.capabilities.information_retrieval
);
println!(
" Pattern Recognition: {:5.1}",
assessment.capabilities.pattern_recognition
);
println!(
" Planning: {:5.1}",
assessment.capabilities.planning
);
println!(
" Adaptation: {:5.1}",
assessment.capabilities.adaptation
);
println!();
println!("🔍 Reasoning Quality:");
println!(
" Logical Coherence: {:.2}",
assessment.reasoning.logical_coherence
);
println!(
" Constraint Satisfaction:{:.2}",
assessment.reasoning.constraint_satisfaction_rate
);
println!(
" Solution Optimality: {:.2}",
assessment.reasoning.solution_optimality
);
println!(
" Reasoning Efficiency: {:.2}",
assessment.reasoning.reasoning_efficiency
);
println!(
" Error Rate: {:.2}",
assessment.reasoning.error_rate
);
println!();
println!("📈 Learning Metrics:");
println!(
" Sample Efficiency: {:.2}",
assessment.learning.sample_efficiency
);
println!(
" Regret Sublinearity: {:.2}",
assessment.learning.regret_sublinearity
);
println!(
" Learning Rate: {:.2}",
assessment.learning.learning_rate
);
println!(
" Generalization: {:.2}",
assessment.learning.generalization
);
println!();
println!("🔧 Tool Use Proficiency:");
println!(
" Selection: {:.2}",
assessment.tool_use.selection_appropriateness
);
println!(
" Effectiveness: {:.2}",
assessment.tool_use.utilization_effectiveness
);
println!(
" Composition: {:.2}",
assessment.tool_use.composition_ability
);
println!();
println!("🪞 Meta-Cognitive Indicators:");
println!(
" Self-Correction: {:.2}",
assessment.meta_cognition.self_correction_rate
);
println!(
" Strategy Adaptation: {:.2}",
assessment.meta_cognition.strategy_adaptation
);
println!(
" Progress Monitoring: {:.2}",
assessment.meta_cognition.progress_monitoring
);
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: a mostly-successful run must yield positive scores.
    ///
    /// Uses struct-update syntax instead of mutating a `Default` value
    /// field-by-field (clippy::field_reassign_with_default).
    #[test]
    fn test_intelligence_calculation() {
        let raw = RawMetrics {
            tasks_attempted: 100,
            tasks_completed: 90,
            tasks_correct: 80,
            total_steps: 500,
            total_tool_calls: 100,
            ..Default::default()
        };
        let calculator = IntelligenceCalculator::default();
        let assessment = calculator.calculate(&raw);
        assert!(assessment.overall_score > 0.0);
        assert!(assessment.capabilities.temporal_reasoning > 0.0);
    }

    /// Monotonically improving episodes must register as learning.
    #[test]
    fn test_learning_metrics() {
        let mut raw = RawMetrics {
            tasks_attempted: 50,
            tasks_correct: 40,
            ..Default::default()
        };
        // Add episodes showing improvement: accuracy rises 0.50 -> 0.86,
        // regret falls symmetrically.
        for i in 0..10 {
            raw.episodes.push(EpisodeMetrics {
                episode: i + 1,
                accuracy: 0.5 + 0.04 * i as f64,
                reward: 50.0 + 4.0 * i as f64,
                regret: 50.0 - 4.0 * i as f64,
                cumulative_regret: (0..=i).map(|j| 50.0 - 4.0 * j as f64).sum(),
            });
        }
        let calculator = IntelligenceCalculator::default();
        let assessment = calculator.calculate(&raw);
        // Should show learning (improvement over time)
        assert!(assessment.learning.learning_rate > 0.5);
    }
}

View File

@@ -0,0 +1,38 @@
//! RuVector Benchmarks Library
//!
//! Comprehensive benchmarking suite for:
//! - Temporal reasoning (TimePuzzles-style constraint inference)
//! - Vector index operations (IVF, coherence-gated search)
//! - Swarm controller regret tracking
//! - Intelligence metrics and cognitive capability assessment
//! - Adaptive learning with ReasoningBank trajectory tracking
//!
//! Based on research from:
//! - TimePuzzles benchmark (arXiv:2601.07148)
//! - Sublinear regret in multi-agent control
//! - Tool-augmented iterative temporal reasoning
//! - Cognitive capability assessment frameworks
//! - lean-agentic type theory for verified reasoning
pub mod acceptance_test;
pub mod agi_contract;
pub mod intelligence_metrics;
pub mod logging;
pub mod loop_gating;
pub mod publishable_rvf;
pub mod reasoning_bank;
pub mod rvf_artifact;
pub mod rvf_intelligence_bench;
pub mod superintelligence;
pub mod swarm_regret;
pub mod temporal;
pub mod timepuzzles;
pub mod vector_index;
pub use intelligence_metrics::*;
pub use logging::*;
pub use reasoning_bank::*;
pub use swarm_regret::*;
pub use temporal::*;
pub use timepuzzles::*;
pub use vector_index::*;

View File

@@ -0,0 +1,421 @@
//! Logging Schema for Benchmark Results
//!
//! Comprehensive logging for:
//! - Temporal reasoning benchmarks
//! - Vector operations
//! - Swarm controller metrics
//! - Tool usage tracking
use anyhow::Result;
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use std::fs::{self, File, OpenOptions};
use std::io::{BufWriter, Write};
use std::path::Path;
/// Log entry types
///
/// Serialized via `#[serde(tag = "type")]` as internally-tagged JSON
/// (`{"type": "<Variant>", ...}`), one object per line in the log file.
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(tag = "type")]
pub enum LogEntry {
    /// Temporal benchmark run
    TemporalBenchmark(TemporalBenchmarkLog),
    /// Vector operation
    VectorOperation(VectorOperationLog),
    /// Swarm episode
    SwarmEpisode(SwarmEpisodeLog),
    /// Tool call
    ToolCall(ToolCallLog),
    /// System event
    System(SystemLog),
}
/// Temporal benchmark log entry
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct TemporalBenchmarkLog {
    /// When the entry was recorded (UTC; set to `Utc::now()` at log time).
    pub timestamp: DateTime<Utc>,
    /// Identifier of the benchmark run this entry belongs to.
    pub benchmark_id: String,
    /// Identifier of the individual puzzle.
    pub puzzle_id: String,
    /// Puzzle difficulty rating.
    pub difficulty: u8,
    /// Whether the solver produced an answer.
    pub solved: bool,
    /// Whether the produced answer was correct.
    pub correct: bool,
    /// Number of solver steps taken.
    pub steps: usize,
    /// Number of tool invocations made during the solve.
    pub tool_calls: usize,
    /// End-to-end solve latency in milliseconds.
    pub latency_ms: u64,
    /// Number of constraints in the puzzle.
    pub constraint_count: usize,
    /// Whether the calendar tool was enabled for this run.
    pub calendar_tool_enabled: bool,
    /// Whether web search was enabled for this run.
    pub web_search_enabled: bool,
}
/// Vector operation log entry
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct VectorOperationLog {
    /// When the entry was recorded (UTC; set to `Utc::now()` at log time).
    pub timestamp: DateTime<Utc>,
    /// Name of the operation performed (e.g. insert, search).
    pub operation: String,
    /// Dimensionality of the vectors in the index.
    pub index_dim: usize,
    /// Number of vectors in the index.
    pub index_size: usize,
    /// Number of queries executed in this operation.
    pub query_count: usize,
    /// Requested number of nearest neighbors per query.
    pub top_k: usize,
    /// Whether IVF (inverted-file) indexing was enabled.
    pub ivf_enabled: bool,
    /// Coherence score reported by the index — semantics defined by the producer.
    pub coherence_score: f32,
    /// Operation latency in microseconds.
    pub latency_us: u64,
    /// Number of results returned.
    pub results_count: usize,
}
/// Swarm episode log entry
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SwarmEpisodeLog {
    /// When the entry was recorded (UTC; set to `Utc::now()` at log time).
    pub timestamp: DateTime<Utc>,
    /// Episode index.
    pub episode: usize,
    /// Number of tasks in the episode.
    pub num_tasks: usize,
    /// Tasks for which an answer was produced.
    pub solved: usize,
    /// Tasks answered correctly.
    pub correct: usize,
    /// Reward achieved by the controller this episode.
    pub reward: f64,
    /// Reward an oracle controller would have achieved.
    pub oracle_reward: f64,
    /// Per-episode regret; computed as `oracle_reward - reward` at log time.
    pub regret: f64,
    /// Regret accumulated across all episodes so far.
    pub cumulative_regret: f64,
    /// Cumulative regret averaged per episode.
    pub average_regret: f64,
    /// Whether regret growth is judged sublinear by the caller.
    pub is_sublinear: bool,
}
/// Tool call log entry
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ToolCallLog {
    /// When the entry was recorded (UTC; set to `Utc::now()` at log time).
    pub timestamp: DateTime<Utc>,
    /// Name of the tool that was invoked.
    pub tool_name: String,
    /// Category/type of the tool.
    pub tool_type: String,
    /// Short human-readable summary of the tool input.
    pub input_summary: String,
    /// Whether the call succeeded.
    pub success: bool,
    /// Call latency in milliseconds.
    pub latency_ms: u64,
    /// Free-form context describing where the call happened.
    pub context: String,
}
/// System log entry
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SystemLog {
    /// When the entry was recorded (UTC; set to `Utc::now()` at log time).
    pub timestamp: DateTime<Utc>,
    /// Severity level as a free-form string (e.g. "info", "error").
    pub level: String,
    /// Log message text.
    pub message: String,
    /// Component that emitted the message.
    pub component: String,
}
/// Benchmark logger
///
/// Buffers entries in memory and appends them to a JSON Lines file,
/// flushing automatically once `flush_threshold` entries accumulate
/// (and on `flush`, `close`, or `Drop`).
pub struct BenchmarkLogger {
    /// Log file path
    path: String,
    /// Writer over the append-mode log file; `None` once `close()` has run.
    writer: Option<BufWriter<File>>,
    /// In-memory buffer for batch writes
    buffer: Vec<LogEntry>,
    /// Buffer size before flush
    flush_threshold: usize,
}
impl BenchmarkLogger {
    /// Create a new logger appending to `path`, creating any missing parent
    /// directories. The file is opened in create+append mode.
    pub fn new(path: impl Into<String>) -> Result<Self> {
        let path = path.into();
        // Create parent directories
        if let Some(parent) = Path::new(&path).parent() {
            fs::create_dir_all(parent)?;
        }
        let file = OpenOptions::new().create(true).append(true).open(&path)?;
        Ok(Self {
            path,
            writer: Some(BufWriter::new(file)),
            buffer: Vec::new(),
            flush_threshold: 100,
        })
    }
    /// Buffer an entry; flushes to disk once `flush_threshold` entries accumulate.
    pub fn log(&mut self, entry: LogEntry) -> Result<()> {
        self.buffer.push(entry);
        if self.buffer.len() >= self.flush_threshold {
            self.flush()?;
        }
        Ok(())
    }
    /// Log a temporal benchmark result
    pub fn log_temporal(
        &mut self,
        benchmark_id: impl Into<String>,
        puzzle_id: impl Into<String>,
        difficulty: u8,
        solved: bool,
        correct: bool,
        steps: usize,
        tool_calls: usize,
        latency_ms: u64,
        constraint_count: usize,
        calendar_tool: bool,
        web_search: bool,
    ) -> Result<()> {
        self.log(LogEntry::TemporalBenchmark(TemporalBenchmarkLog {
            timestamp: Utc::now(),
            benchmark_id: benchmark_id.into(),
            puzzle_id: puzzle_id.into(),
            difficulty,
            solved,
            correct,
            steps,
            tool_calls,
            latency_ms,
            constraint_count,
            calendar_tool_enabled: calendar_tool,
            web_search_enabled: web_search,
        }))
    }
    /// Log a vector operation
    pub fn log_vector(
        &mut self,
        operation: impl Into<String>,
        index_dim: usize,
        index_size: usize,
        query_count: usize,
        top_k: usize,
        ivf_enabled: bool,
        coherence_score: f32,
        latency_us: u64,
        results_count: usize,
    ) -> Result<()> {
        self.log(LogEntry::VectorOperation(VectorOperationLog {
            timestamp: Utc::now(),
            operation: operation.into(),
            index_dim,
            index_size,
            query_count,
            top_k,
            ivf_enabled,
            coherence_score,
            latency_us,
            results_count,
        }))
    }
    /// Log a swarm episode. `regret` is derived as `oracle_reward - reward`.
    pub fn log_swarm(
        &mut self,
        episode: usize,
        num_tasks: usize,
        solved: usize,
        correct: usize,
        reward: f64,
        oracle_reward: f64,
        cumulative_regret: f64,
        average_regret: f64,
        is_sublinear: bool,
    ) -> Result<()> {
        self.log(LogEntry::SwarmEpisode(SwarmEpisodeLog {
            timestamp: Utc::now(),
            episode,
            num_tasks,
            solved,
            correct,
            reward,
            oracle_reward,
            regret: oracle_reward - reward,
            cumulative_regret,
            average_regret,
            is_sublinear,
        }))
    }
    /// Log a tool call
    pub fn log_tool(
        &mut self,
        tool_name: impl Into<String>,
        tool_type: impl Into<String>,
        input_summary: impl Into<String>,
        success: bool,
        latency_ms: u64,
        context: impl Into<String>,
    ) -> Result<()> {
        self.log(LogEntry::ToolCall(ToolCallLog {
            timestamp: Utc::now(),
            tool_name: tool_name.into(),
            tool_type: tool_type.into(),
            input_summary: input_summary.into(),
            success,
            latency_ms,
            context: context.into(),
        }))
    }
    /// Log a system message
    pub fn log_system(
        &mut self,
        level: impl Into<String>,
        message: impl Into<String>,
        component: impl Into<String>,
    ) -> Result<()> {
        self.log(LogEntry::System(SystemLog {
            timestamp: Utc::now(),
            level: level.into(),
            message: message.into(),
            component: component.into(),
        }))
    }
    /// Flush buffered entries to the log file as JSON Lines.
    ///
    /// The buffer is cleared only after the whole batch has been serialized
    /// and written: an error no longer silently drops the entries that were
    /// still pending (the previous implementation drained the buffer while
    /// writing, losing unwritten entries on failure). A retry after a partial
    /// write may duplicate lines that were written before the error.
    pub fn flush(&mut self) -> Result<()> {
        if let Some(ref mut writer) = self.writer {
            for entry in &self.buffer {
                let json = serde_json::to_string(entry)?;
                writeln!(writer, "{}", json)?;
            }
            self.buffer.clear();
            writer.flush()?;
        }
        Ok(())
    }
    /// Close the logger: flush remaining entries and drop the file handle.
    /// Subsequent `flush` calls become no-ops.
    pub fn close(&mut self) -> Result<()> {
        self.flush()?;
        self.writer = None;
        Ok(())
    }
    /// Get log file path
    pub fn path(&self) -> &str {
        &self.path
    }
}
impl Drop for BenchmarkLogger {
    fn drop(&mut self) {
        // Best-effort flush on drop; errors cannot propagate out of Drop,
        // so callers that must observe I/O failures should call `close()`.
        let _ = self.flush();
    }
}
/// Log reader for analysis
///
/// Parses a JSON Lines file produced by `BenchmarkLogger`.
pub struct LogReader {
    /// Path to the log file to read.
    path: String,
}
impl LogReader {
/// Create a new reader
pub fn new(path: impl Into<String>) -> Self {
Self { path: path.into() }
}
/// Read all entries
pub fn read_all(&self) -> Result<Vec<LogEntry>> {
let content = fs::read_to_string(&self.path)?;
let mut entries = Vec::new();
for line in content.lines() {
if !line.is_empty() {
let entry: LogEntry = serde_json::from_str(line)?;
entries.push(entry);
}
}
Ok(entries)
}
/// Read temporal benchmark entries only
pub fn read_temporal(&self) -> Result<Vec<TemporalBenchmarkLog>> {
let entries = self.read_all()?;
Ok(entries
.into_iter()
.filter_map(|e| match e {
LogEntry::TemporalBenchmark(t) => Some(t),
_ => None,
})
.collect())
}
/// Read swarm episode entries only
pub fn read_swarm(&self) -> Result<Vec<SwarmEpisodeLog>> {
let entries = self.read_all()?;
Ok(entries
.into_iter()
.filter_map(|e| match e {
LogEntry::SwarmEpisode(s) => Some(s),
_ => None,
})
.collect())
}
/// Compute aggregate statistics
pub fn aggregate_temporal(&self) -> Result<TemporalAggregates> {
let logs = self.read_temporal()?;
if logs.is_empty() {
return Ok(TemporalAggregates::default());
}
let total = logs.len();
let solved = logs.iter().filter(|l| l.solved).count();
let correct = logs.iter().filter(|l| l.correct).count();
let avg_steps = logs.iter().map(|l| l.steps).sum::<usize>() as f64 / total as f64;
let avg_latency = logs.iter().map(|l| l.latency_ms).sum::<u64>() as f64 / total as f64;
let avg_tools = logs.iter().map(|l| l.tool_calls).sum::<usize>() as f64 / total as f64;
// By difficulty
let mut by_difficulty: std::collections::HashMap<u8, (usize, usize)> =
std::collections::HashMap::new();
for log in &logs {
let entry = by_difficulty.entry(log.difficulty).or_insert((0, 0));
entry.0 += 1;
if log.correct {
entry.1 += 1;
}
}
Ok(TemporalAggregates {
total_puzzles: total,
solved_count: solved,
correct_count: correct,
accuracy: correct as f64 / total as f64,
avg_steps,
avg_latency_ms: avg_latency,
avg_tool_calls: avg_tools,
accuracy_by_difficulty: by_difficulty
.into_iter()
.map(|(d, (t, c))| (d, c as f64 / t as f64))
.collect(),
})
}
}
/// Aggregate statistics for temporal benchmarks
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct TemporalAggregates {
    /// Total number of temporal benchmark entries aggregated.
    pub total_puzzles: usize,
    /// Entries flagged as solved.
    pub solved_count: usize,
    /// Entries flagged as correct.
    pub correct_count: usize,
    /// `correct_count / total_puzzles`.
    pub accuracy: f64,
    /// Mean solver steps per puzzle.
    pub avg_steps: f64,
    /// Mean latency per puzzle in milliseconds.
    pub avg_latency_ms: f64,
    /// Mean tool calls per puzzle.
    pub avg_tool_calls: f64,
    /// Accuracy keyed by puzzle difficulty level.
    pub accuracy_by_difficulty: std::collections::HashMap<u8, f64>,
}
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    /// Round-trip: one logged entry must come back from the reader.
    #[test]
    fn test_logger() {
        let dir = tempdir().unwrap();
        let path = dir.path().join("test.log");
        let path_str = path.to_str().unwrap();

        let mut logger = BenchmarkLogger::new(path_str).unwrap();
        logger
            .log_temporal(
                "bench-1", "puzzle-1", 5, true, true, 10, 2, 100, 3, true, false,
            )
            .unwrap();
        logger.flush().unwrap();

        let entries = LogReader::new(path_str).read_all().unwrap();
        assert_eq!(entries.len(), 1);
    }
}

View File

@@ -0,0 +1,603 @@
//! Three-Loop Gating Architecture
//!
//! Separates the intelligence engine into three explicit loops with strict gating:
//!
//! ## Fast Loop (per step)
//! - Runs every step of every solver invocation
//! - No planning, no model calls
//! - Only checks invariants: allow, block, quarantine, or rollback
//! - Outputs: GateDecision, HealthDelta, WitnessRecord
//!
//! ## Medium Loop (per attempt)
//! - Runs per solve attempt (one puzzle)
//! - Multi-strategy solver, ensemble vote, cascade passes
//! - Can PROPOSE memory writes, but cannot COMMIT them
//! - Outputs: CandidateSolution, AttemptTrace, ProposedMemoryWrites
//!
//! ## Slow Loop (per cycle)
//! - Runs per training/evaluation cycle
//! - Consolidation, compiler updates, promotion review, meta parameter updates
//! - Only component that can PROMOTE patterns (Volatile → Trusted)
//! - Outputs: NewPolicyCheckpoint, NewMemoryRoot, PromotionLog
//!
//! ## Critical Gating Rule
//! Medium loop can propose memory writes.
//! Fast loop is the only component allowed to commit them.
//! Slow loop is the only component allowed to promote them.
use serde::{Deserialize, Serialize};
use crate::agi_contract::ContractHealth;
use crate::reasoning_bank::{
Counterexample, MemoryCheckpoint, MemoryClass, ReasoningBank, RollbackWitness, Trajectory,
Verdict,
};
// ═══════════════════════════════════════════════════════════════════════════
// Fast Loop: per-step invariant gating
// ═══════════════════════════════════════════════════════════════════════════
/// Decision made by the fast loop gate on each step.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub enum GateDecision {
    /// Allow the step to proceed
    Allow,
    /// Block: step would violate a policy (e.g. the step budget is exhausted)
    Block { reason: String },
    /// Quarantine: result is suspicious, hold for review
    Quarantine { reason: String },
    /// Rollback: regression detected, revert to checkpoint
    Rollback {
        /// Identifier of the memory checkpoint to revert to.
        checkpoint_id: usize,
        /// Human-readable explanation for the rollback.
        reason: String,
    },
}
/// Health delta tracked per step.
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct HealthDelta {
    /// Latest step index seen by the gate this attempt (not a cumulative count).
    pub steps_taken: usize,
    /// Number of solved-but-incorrect results observed this attempt.
    pub contradictions_detected: usize,
    /// Policy violation count.
    /// NOTE(review): never incremented by `FastGate::check_step` — confirm producer.
    pub policy_violations: usize,
    /// Accumulated cost.
    /// NOTE(review): never incremented by `FastGate::check_step` — confirm producer.
    pub cost_accumulated: f64,
}
/// Fast loop gate: checks invariants on every step.
/// This is the ONLY component allowed to commit memory writes.
#[derive(Clone, Debug)]
pub struct FastGate {
    /// Maximum steps before forced halt
    pub step_limit: usize,
    /// Maximum cost accumulation before halt
    /// NOTE(review): not currently enforced by `check_step` — confirm intent.
    pub cost_limit: f64,
    /// Contradiction threshold before quarantine
    pub contradiction_threshold: usize,
    /// Running health delta
    pub delta: HealthDelta,
    /// Pending writes from medium loop (committed by fast loop)
    pub pending_writes: Vec<ProposedWrite>,
    /// Gate decisions log
    pub decisions: Vec<GateDecision>,
}
impl FastGate {
    /// Build a gate with the given per-attempt step budget and permissive
    /// defaults: unbounded cost and 3 contradictions before quarantine.
    pub fn new(step_limit: usize) -> Self {
        Self {
            step_limit,
            cost_limit: f64::MAX,
            contradiction_threshold: 3,
            delta: HealthDelta::default(),
            pending_writes: Vec::new(),
            decisions: Vec::new(),
        }
    }

    /// Check a step and return a gate decision, logging it in `decisions`.
    pub fn check_step(&mut self, step: usize, solved: bool, correct: bool) -> GateDecision {
        self.delta.steps_taken = step;
        let decision = if step >= self.step_limit {
            // Step budget exhausted: hard block.
            GateDecision::Block {
                reason: format!("step budget exhausted ({}/{})", step, self.step_limit),
            }
        } else if solved && !correct {
            // Solved-but-wrong is a contradiction; quarantine once the
            // threshold is crossed, otherwise keep allowing.
            self.delta.contradictions_detected += 1;
            if self.delta.contradictions_detected >= self.contradiction_threshold {
                GateDecision::Quarantine {
                    reason: format!(
                        "{} contradictions in this attempt",
                        self.delta.contradictions_detected,
                    ),
                }
            } else {
                GateDecision::Allow
            }
        } else {
            GateDecision::Allow
        };
        self.decisions.push(decision.clone());
        decision
    }

    /// Commit pending writes from the medium loop into the bank, returning
    /// how many were committed. Only the fast loop has authority to do this.
    pub fn commit_writes(&mut self, bank: &mut ReasoningBank) -> usize {
        let count = self.pending_writes.len();
        for write in self.pending_writes.drain(..) {
            match write {
                ProposedWrite::RecordTrajectory(traj) => {
                    bank.record_trajectory_gated(traj);
                }
                ProposedWrite::RecordCounterexample {
                    constraint_type,
                    trajectory,
                } => {
                    bank.record_counterexample(&constraint_type, trajectory);
                }
                ProposedWrite::QuarantineTrajectory { trajectory, reason } => {
                    bank.quarantine_trajectory(trajectory, &reason);
                }
            }
        }
        count
    }

    /// Reset per-attempt state (health delta and decision log) for the next attempt.
    pub fn reset(&mut self) {
        self.delta = HealthDelta::default();
        self.decisions.clear();
    }
}
/// A proposed memory write from the medium loop.
/// Cannot be committed directly — must go through FastGate.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum ProposedWrite {
    /// Record a completed trajectory in the bank (via the gated path).
    RecordTrajectory(Trajectory),
    /// Record a counterexample trajectory for a specific constraint type.
    RecordCounterexample {
        constraint_type: String,
        trajectory: Trajectory,
    },
    /// Quarantine a suspicious trajectory with a human-readable reason.
    QuarantineTrajectory {
        trajectory: Trajectory,
        reason: String,
    },
}
// ═══════════════════════════════════════════════════════════════════════════
// Medium Loop: per-attempt solving
// ═══════════════════════════════════════════════════════════════════════════
/// Trace of a single solve attempt.
///
/// Produced by `MediumLoop::process_result`; its `proposed_writes` are only
/// committed later by `FastGate::commit_writes`.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct AttemptTrace {
    /// Puzzle ID
    pub puzzle_id: String,
    /// Strategy used
    pub strategy: String,
    /// Steps taken
    pub steps: usize,
    /// Whether the answer was correct
    pub correct: bool,
    /// Whether a retry was attempted
    pub retried: bool,
    /// Gate decisions during this attempt
    pub gate_decisions: Vec<GateDecision>,
    /// Proposed memory writes (not yet committed)
    pub proposed_writes: Vec<ProposedWrite>,
}
/// Medium loop: handles one puzzle solve attempt.
/// Can propose memory writes but cannot commit them — commitment is the
/// exclusive authority of the embedded `FastGate`.
pub struct MediumLoop {
    /// Fast gate for step-level invariant checking
    pub gate: FastGate,
}
impl MediumLoop {
    /// Build a medium loop whose fast gate enforces `step_limit`.
    pub fn new(step_limit: usize) -> Self {
        Self {
            gate: FastGate::new(step_limit),
        }
    }

    /// Process a solve result and produce an attempt trace.
    /// Proposes memory writes but does NOT commit them.
    pub fn process_result(
        &mut self,
        puzzle_id: &str,
        difficulty: u8,
        strategy: &str,
        steps: usize,
        solved: bool,
        correct: bool,
        constraint_types: &[String],
    ) -> AttemptTrace {
        // Fast-loop invariant check for this attempt.
        let decision = self.gate.check_step(steps, solved, correct);

        // Build the trajectory describing this attempt.
        let mut traj = Trajectory::new(puzzle_id, difficulty);
        traj.constraint_types = constraint_types.to_vec();
        let (outcome, confidence) = if correct {
            ("correct".to_string(), 0.9)
        } else {
            ("incorrect".to_string(), 0.2)
        };
        traj.record_attempt(outcome, confidence, steps, 1, strategy);
        let verdict = if correct {
            Verdict::Success
        } else {
            Verdict::Failed
        };
        traj.set_verdict(verdict, None);

        // Translate the gate decision into proposed (uncommitted) writes.
        let mut proposed_writes = Vec::new();
        match decision {
            GateDecision::Allow => {
                // Propose recording the trajectory.
                proposed_writes.push(ProposedWrite::RecordTrajectory(traj));
            }
            GateDecision::Block { .. } => {
                // Don't record — budget exhausted.
            }
            GateDecision::Quarantine { ref reason } => {
                // Quarantine the trajectory and file a counterexample per
                // constraint type involved.
                proposed_writes.push(ProposedWrite::QuarantineTrajectory {
                    trajectory: traj.clone(),
                    reason: reason.clone(),
                });
                for ct in constraint_types {
                    proposed_writes.push(ProposedWrite::RecordCounterexample {
                        constraint_type: ct.clone(),
                        trajectory: traj.clone(),
                    });
                }
            }
            GateDecision::Rollback { .. } => {
                // Rollback handled at fast loop level
            }
        }

        AttemptTrace {
            puzzle_id: puzzle_id.to_string(),
            strategy: strategy.to_string(),
            steps,
            correct,
            retried: false,
            gate_decisions: vec![decision],
            proposed_writes,
        }
    }

    /// Finalize: transfer proposed writes to the fast gate for commitment.
    pub fn finalize(&mut self, trace: &AttemptTrace) {
        self.gate
            .pending_writes
            .extend(trace.proposed_writes.iter().cloned());
    }

    /// Reset per-attempt gate state for the next attempt.
    pub fn reset(&mut self) {
        self.gate.reset();
    }
}
// ═══════════════════════════════════════════════════════════════════════════
// Slow Loop: per-cycle consolidation
// ═══════════════════════════════════════════════════════════════════════════
/// Log of pattern promotions during a cycle.
///
/// Produced by `SlowLoop::consolidate` from the bank's post-promotion counts.
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct PromotionLog {
    /// Patterns promoted from Volatile → Trusted
    pub promoted: usize,
    /// Patterns demoted from Trusted → Quarantined
    pub demoted: usize,
    /// Patterns remaining in Volatile
    pub volatile_remaining: usize,
    /// Patterns in Trusted
    pub trusted_total: usize,
    /// Patterns in Quarantined
    pub quarantined_total: usize,
}
/// Result of a slow loop cycle.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct CycleConsolidation {
    /// Cycle number
    pub cycle: usize,
    /// Checkpoint created at start of cycle
    pub checkpoint_id: usize,
    /// Promotion log
    pub promotion_log: PromotionLog,
    /// Contract health after consolidation
    /// NOTE(review): always `None` in `consolidate` — confirm who populates it.
    pub contract_health: Option<ContractHealth>,
    /// Whether a rollback occurred
    pub rolled_back: bool,
    /// Rollback witness if rollback occurred
    pub rollback_witness: Option<RollbackWitness>,
}
/// Slow loop: handles per-cycle consolidation.
/// Only component allowed to promote patterns.
pub struct SlowLoop {
    /// History of consolidations, appended in cycle order by `consolidate`.
    pub history: Vec<CycleConsolidation>,
}
impl SlowLoop {
    /// Create an empty slow loop with no consolidation history.
    pub fn new() -> Self {
        Self {
            history: Vec::new(),
        }
    }
    /// Run consolidation: promote eligible patterns, demote failing ones.
    /// This is the ONLY place where pattern promotion happens.
    ///
    /// If `prev_accuracy` is set and holdout accuracy regressed by more than
    /// 0.05, the bank is rolled back to `checkpoint_id` (with a witness)
    /// before promotion runs.
    pub fn consolidate(
        &mut self,
        bank: &mut ReasoningBank,
        cycle: usize,
        checkpoint_id: usize,
        holdout_accuracy: f64,
        prev_accuracy: Option<f64>,
    ) -> CycleConsolidation {
        let mut rolled_back = false;
        let mut rollback_witness = None;
        // Check for regression — if accuracy dropped, rollback
        if let Some(prev) = prev_accuracy {
            if holdout_accuracy < prev - 0.05 {
                let ok = bank.rollback_with_witness(
                    checkpoint_id,
                    "slow loop: accuracy regression",
                    prev,
                    holdout_accuracy,
                );
                if ok {
                    rolled_back = true;
                    rollback_witness = bank.rollback_witnesses.last().cloned();
                }
            }
        }
        // Promote eligible patterns (requires counterexample)
        let promoted = bank.promote_patterns();
        let log = PromotionLog {
            promoted,
            demoted: 0, // Demotions happen in the fast loop
            volatile_remaining: bank.volatile_count(),
            trusted_total: bank.trusted_count(),
            quarantined_total: bank.quarantined_pattern_count(),
        };
        let consolidation = CycleConsolidation {
            cycle,
            checkpoint_id,
            promotion_log: log,
            contract_health: None,
            rolled_back,
            rollback_witness,
        };
        self.history.push(consolidation.clone());
        consolidation
    }
}

/// `Default` mirrors `new()` so `SlowLoop` composes with derived `Default`
/// containers and satisfies clippy's `new_without_default` lint.
impl Default for SlowLoop {
    fn default() -> Self {
        Self::new()
    }
}
// ═══════════════════════════════════════════════════════════════════════════
// Tests
// ═══════════════════════════════════════════════════════════════════════════
#[cfg(test)]
mod tests {
    use super::*;
    // A normal in-budget, non-contradictory step passes the gate.
    #[test]
    fn fast_gate_allows_normal_step() {
        let mut gate = FastGate::new(100);
        let decision = gate.check_step(5, false, false);
        assert_eq!(decision, GateDecision::Allow);
    }
    // Reaching the step limit (boundary is inclusive) must block.
    #[test]
    fn fast_gate_blocks_over_budget() {
        let mut gate = FastGate::new(10);
        let decision = gate.check_step(10, false, false);
        assert!(matches!(decision, GateDecision::Block { .. }));
    }
    // Solved-but-wrong results accumulate; the threshold-th one quarantines.
    #[test]
    fn fast_gate_quarantines_contradictions() {
        let mut gate = FastGate::new(100);
        gate.contradiction_threshold = 2;
        // First contradiction: still allowed
        let d1 = gate.check_step(1, true, false);
        assert_eq!(d1, GateDecision::Allow);
        // Second contradiction: quarantine
        let d2 = gate.check_step(2, true, false);
        assert!(matches!(d2, GateDecision::Quarantine { .. }));
    }
    // commit_writes drains pending writes into the bank and reports the count.
    #[test]
    fn fast_gate_commits_pending_writes() {
        let mut gate = FastGate::new(100);
        let mut bank = ReasoningBank::new();
        let mut traj = Trajectory::new("test_1", 5);
        traj.constraint_types.push("Before".to_string());
        traj.record_attempt("answer".into(), 0.9, 10, 1, "default");
        traj.set_verdict(Verdict::Success, None);
        gate.pending_writes
            .push(ProposedWrite::RecordTrajectory(traj));
        let committed = gate.commit_writes(&mut bank);
        assert_eq!(committed, 1);
        assert_eq!(bank.trajectories.len(), 1);
    }
    // A correct, in-budget result yields exactly one RecordTrajectory proposal.
    #[test]
    fn medium_loop_proposes_writes() {
        let mut medium = MediumLoop::new(100);
        let trace = medium.process_result(
            "puzzle_1",
            5,
            "adaptive",
            15,
            true,
            true,
            &["Before".to_string()],
        );
        assert!(trace.correct);
        assert_eq!(trace.proposed_writes.len(), 1);
        assert!(matches!(
            trace.proposed_writes[0],
            ProposedWrite::RecordTrajectory(_)
        ));
    }
    // A quarantined attempt proposes a quarantine write plus one
    // counterexample write per constraint type.
    #[test]
    fn medium_loop_quarantines_contradictions() {
        let mut medium = MediumLoop::new(100);
        medium.gate.contradiction_threshold = 1;
        // Solved but wrong → quarantine (threshold 1)
        let trace = medium.process_result(
            "puzzle_1",
            5,
            "default",
            15,
            true,
            false,
            &["Month".to_string()],
        );
        assert!(!trace.correct);
        // Should have quarantine + counterexample writes
        assert!(trace.proposed_writes.len() >= 2);
        assert!(trace
            .proposed_writes
            .iter()
            .any(|w| matches!(w, ProposedWrite::QuarantineTrajectory { .. })));
    }
    // With enough successful evidence AND a counterexample on record, the
    // slow loop promotes exactly one pattern to Trusted.
    #[test]
    fn slow_loop_promotes_patterns() {
        let mut bank = ReasoningBank::new();
        bank.evidence_threshold = 3;
        // Build enough observations
        for i in 0..5 {
            let mut traj = Trajectory::new(&format!("s_{}", i), 5);
            traj.constraint_types.push("Year".to_string());
            traj.record_attempt("2024".into(), 0.9, 10, 1, "default");
            traj.set_verdict(Verdict::Success, None);
            bank.record_trajectory(traj);
        }
        // Add counterexample (required for promotion)
        let ce_traj = Trajectory::new("fail_1", 5);
        bank.record_counterexample("Year", ce_traj);
        let cp = bank.checkpoint();
        let mut slow = SlowLoop::new();
        let result = slow.consolidate(&mut bank, 0, cp, 0.95, None);
        assert_eq!(result.promotion_log.promoted, 1);
        assert_eq!(result.promotion_log.trusted_total, 1);
        assert!(!result.rolled_back);
    }
    // An accuracy drop beyond 0.05 triggers a witnessed rollback to the
    // checkpoint, discarding the post-checkpoint trajectories.
    #[test]
    fn slow_loop_rolls_back_on_regression() {
        let mut bank = ReasoningBank::new();
        for i in 0..3 {
            let mut traj = Trajectory::new(&format!("r_{}", i), 5);
            traj.constraint_types.push("DayOfWeek".to_string());
            traj.record_attempt("answer".into(), 0.9, 10, 1, "default");
            traj.set_verdict(Verdict::Success, None);
            bank.record_trajectory(traj);
        }
        let cp = bank.checkpoint();
        // Simulate bad learning
        for i in 3..6 {
            let mut traj = Trajectory::new(&format!("r_{}", i), 5);
            traj.constraint_types.push("DayOfWeek".to_string());
            traj.record_attempt("wrong".into(), 0.1, 50, 1, "default");
            traj.set_verdict(Verdict::Failed, None);
            bank.record_trajectory(traj);
        }
        let mut slow = SlowLoop::new();
        // Previous accuracy 0.95, current 0.80 → regression > 0.05
        let result = slow.consolidate(&mut bank, 1, cp, 0.80, Some(0.95));
        assert!(result.rolled_back);
        assert!(result.rollback_witness.is_some());
        assert_eq!(bank.trajectories.len(), 3); // Rolled back to checkpoint
    }
    // End-to-end: medium loop proposes, fast loop commits, slow loop promotes.
    #[test]
    fn three_loop_integration() {
        let mut bank = ReasoningBank::new();
        bank.evidence_threshold = 2;
        // === Cycle 1 ===
        let cp = bank.checkpoint();
        // Medium loop: solve puzzles
        let mut medium = MediumLoop::new(100);
        for i in 0..5 {
            let trace = medium.process_result(
                &format!("p_{}", i),
                5,
                "adaptive",
                10,
                true,
                true,
                &["Before".to_string()],
            );
            medium.finalize(&trace);
        }
        // Fast loop: commit writes
        let committed = medium.gate.commit_writes(&mut bank);
        assert_eq!(committed, 5);
        medium.reset();
        // Add counterexample (for promotion eligibility)
        let ce = Trajectory::new("ce_1", 5);
        bank.record_counterexample("Before", ce);
        // Slow loop: consolidate
        let mut slow = SlowLoop::new();
        let consolidation = slow.consolidate(&mut bank, 0, cp, 0.90, None);
        assert!(consolidation.promotion_log.promoted > 0);
        assert_eq!(bank.trusted_count(), 1);
    }
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,648 @@
//! RVF Artifact Packaging
//!
//! Packages an intelligence experiment as a self-contained, reproducible artifact.
//! Aligns with the "identical graded outcomes, not identical tokens" promise.
//!
//! ## Contents
//!
//! 1. **Manifest**: Engine version, pinned configs, seed set, holdout IDs
//! 2. **Memory Snapshot**: ReasoningBank serialized, KnowledgeCompiler cache, promotion log
//! 3. **Graders**: Deterministic scoring + ContractHealth evaluation
//! 4. **Witness Chain**: Per-episode input/config/grade/memory hashes
//!
//! ## Run Modes
//!
//! - **Replay**: Uses stored tasks, stored grades, verifies witness chain
//! - **Verify**: Regenerates tasks from seeds, reruns grader, must match grades exactly
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use crate::agi_contract::ContractHealth;
use crate::reasoning_bank::{MemoryClass, RollbackWitness};
// ═══════════════════════════════════════════════════════════════════════════
// Manifest
// ═══════════════════════════════════════════════════════════════════════════
/// RVF Artifact Manifest — top-level metadata.
///
/// Pins every input needed to reproduce the experiment deterministically:
/// engine version, solver/generator configuration, RNG seeds, and the
/// frozen holdout set.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct RvfManifest {
    /// Format version
    pub rvf_version: String,
    /// Engine version that produced this artifact
    pub engine_version: String,
    /// Pinned solver configuration
    pub solver_config: SolverConfig,
    /// Pinned generator configuration
    pub generator_config: GeneratorConfig,
    /// Seed set used for generation
    pub seed_set: SeedSet,
    /// Holdout puzzle IDs (frozen set)
    pub holdout_ids: Vec<String>,
    /// Number of training cycles
    pub cycles: usize,
    /// Creation timestamp (tests use RFC 3339 strings — TODO confirm producers do too)
    pub created_at: String,
    /// SHA-256 of the full artifact (computed after serialization; `None` until set)
    pub artifact_hash: Option<String>,
}
/// Pinned solver configuration.
///
/// Stored in the manifest so a Verify run can rebuild the exact solver.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SolverConfig {
    /// Step budget per task
    pub step_budget: usize,
    /// Noise injection rate (presumably 0.0–1.0 — confirm)
    pub noise_rate: f64,
    /// Retry enabled
    pub retry_enabled: bool,
    /// Beam width
    pub beam_width: usize,
    /// Minimum accuracy threshold
    pub min_accuracy: f64,
}
/// Pinned generator configuration.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct GeneratorConfig {
    /// Min difficulty
    pub min_difficulty: u8,
    /// Max difficulty
    pub max_difficulty: u8,
    /// Constraint density
    /// NOTE(review): `usize` here while difficulty bounds are `u8` — confirm
    /// the intended integer width for density.
    pub constraint_density: usize,
    /// Domain type (e.g., "temporal_puzzles", "program_synthesis")
    pub domain: String,
}
/// Seed set for deterministic replay.
///
/// Three independent seeds keep holdout generation, training generation,
/// and noise injection decoupled from one another.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SeedSet {
    /// Holdout generation seed (frozen)
    pub holdout_seed: u64,
    /// Training base seed
    pub training_seed: u64,
    /// Noise RNG seed
    pub noise_seed: u64,
}
// ═══════════════════════════════════════════════════════════════════════════
// Memory Snapshot
// ═══════════════════════════════════════════════════════════════════════════
/// Serialized memory state at a point in time.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct MemorySnapshot {
    /// Serialized ReasoningBank (bincode or JSON — format chosen by the producer)
    pub reasoning_bank_data: Vec<u8>,
    /// KnowledgeCompiler cache entries
    pub compiler_cache: Vec<CompiledEntry>,
    /// Promotion log: patterns promoted during this experiment
    pub promotion_log: Vec<PromotionRecord>,
    /// Memory class summary
    pub class_summary: MemoryClassSummary,
}
/// A compiled knowledge entry (from KnowledgeCompiler).
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct CompiledEntry {
    /// Constraint signature
    pub signature: String,
    /// Compiled solution
    pub solution: String,
    /// Max steps the compiled path takes
    pub max_steps: usize,
    /// Confidence in compiled solution (presumably 0.0–1.0 — confirm)
    pub confidence: f64,
    /// Number of times this entry was used
    pub hit_count: usize,
}
/// Record of a pattern promotion.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct PromotionRecord {
    /// Constraint type
    pub constraint_type: String,
    /// Strategy name
    pub strategy: String,
    /// From class (presumably a stringified `MemoryClass` — confirm)
    pub from_class: String,
    /// To class (presumably a stringified `MemoryClass` — confirm)
    pub to_class: String,
    /// Number of observations at promotion time
    pub observations: usize,
    /// Number of counterexamples at promotion time
    pub counterexamples: usize,
    /// Cycle when promotion occurred
    pub cycle: usize,
}
/// Summary of memory classes.
///
/// Counters presumably mirror the `MemoryClass` variants — confirm against
/// `reasoning_bank`.
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct MemoryClassSummary {
    /// Count of volatile (not-yet-promoted) memories
    pub volatile: usize,
    /// Count of trusted (promoted) memories
    pub trusted: usize,
    /// Count of quarantined memories
    pub quarantined: usize,
    /// Total counterexamples recorded
    pub total_counterexamples: usize,
    /// Total rollback witnesses recorded
    pub total_rollback_witnesses: usize,
}
// ═══════════════════════════════════════════════════════════════════════════
// Witness Chain
// ═══════════════════════════════════════════════════════════════════════════
/// Per-episode witness record for auditability.
///
/// NOTE(review): field docs say SHA-256, but the in-module tests populate
/// these fields with `fnv_hash` (64-bit FNV-1a) — confirm which digest
/// producers are required to use.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct WitnessRecord {
    /// Episode/cycle number
    pub episode: usize,
    /// SHA-256 of input (puzzle set)
    pub input_hash: String,
    /// SHA-256 of config
    pub config_hash: String,
    /// SHA-256 of grade outputs
    pub grade_hash: String,
    /// Memory root hash before this episode
    pub memory_root_before: String,
    /// Memory root hash after this episode (must equal the next record's `memory_root_before`)
    pub memory_root_after: String,
    /// Gate decisions hash
    pub gate_decisions_hash: String,
    /// Contract health at end of episode
    pub contract_health: ContractHealth,
}
/// Complete witness chain for the experiment.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct WitnessChain {
    /// Ordered witness records (one per cycle)
    pub records: Vec<WitnessRecord>,
    /// Rollback witnesses that occurred during the experiment
    pub rollback_witnesses: Vec<RollbackWitness>,
    /// Final combined hash of the entire chain (`None` until computed)
    pub chain_hash: Option<String>,
}
// ═══════════════════════════════════════════════════════════════════════════
// RVF Artifact (top-level)
// ═══════════════════════════════════════════════════════════════════════════
/// Complete RVF artifact — everything needed to replay or verify an experiment.
///
/// Integrity of the embedded chain is checked by `verify_witness_chain`.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct RvfArtifact {
    /// Manifest with pinned configuration
    pub manifest: RvfManifest,
    /// Memory snapshot
    pub memory: MemorySnapshot,
    /// Witness chain
    pub witness_chain: WitnessChain,
    /// Final contract health
    pub final_health: ContractHealth,
    /// Final IQ score
    pub final_iq: f64,
}
/// Run mode for artifact verification.
#[derive(Clone, Debug, PartialEq)]
pub enum RunMode {
/// Use stored tasks, stored grades, verify witness chain
Replay,
/// Regenerate tasks from seeds, rerun grader, grades must match
Verify,
}
// ═══════════════════════════════════════════════════════════════════════════
// Builder
// ═══════════════════════════════════════════════════════════════════════════
/// Builder for assembling an RVF artifact from experiment results.
///
/// `manifest`, `memory`, and `final_health` are required for `build`;
/// witnesses are appended incrementally and `final_iq` defaults to 0.0.
pub struct RvfArtifactBuilder {
    /// Pinned manifest (required)
    manifest: Option<RvfManifest>,
    /// Memory snapshot (required)
    memory: Option<MemorySnapshot>,
    /// Ordered per-episode witness records
    witness_records: Vec<WitnessRecord>,
    /// Rollback witnesses gathered during the run
    rollback_witnesses: Vec<RollbackWitness>,
    /// Final contract health (required)
    final_health: Option<ContractHealth>,
    /// Final IQ score (0.0 unless `final_iq` is called)
    final_iq: f64,
}
impl RvfArtifactBuilder {
    /// Create an empty builder with no manifest, memory, or witnesses.
    pub fn new() -> Self {
        Self {
            manifest: None,
            memory: None,
            witness_records: Vec::new(),
            rollback_witnesses: Vec::new(),
            final_health: None,
            final_iq: 0.0,
        }
    }
    /// Set the pinned manifest (required for `build`).
    pub fn manifest(mut self, manifest: RvfManifest) -> Self {
        self.manifest = Some(manifest);
        self
    }
    /// Set the memory snapshot (required for `build`).
    pub fn memory(mut self, memory: MemorySnapshot) -> Self {
        self.memory = Some(memory);
        self
    }
    /// Append a per-episode witness record (insertion order defines the chain).
    pub fn add_witness(&mut self, record: WitnessRecord) {
        self.witness_records.push(record);
    }
    /// Append a rollback witness observed during the experiment.
    pub fn add_rollback_witness(&mut self, witness: RollbackWitness) {
        self.rollback_witnesses.push(witness);
    }
    /// Set the final contract health (required for `build`).
    pub fn final_health(mut self, health: ContractHealth) -> Self {
        self.final_health = Some(health);
        self
    }
    /// Set the final IQ score (defaults to 0.0 if never called).
    pub fn final_iq(mut self, iq: f64) -> Self {
        self.final_iq = iq;
        self
    }
    /// Build the artifact. Returns None if required fields are missing.
    ///
    /// The chain's `chain_hash` is left as `None`; callers compute it after
    /// serialization.
    pub fn build(self) -> Option<RvfArtifact> {
        let manifest = self.manifest?;
        let memory = self.memory?;
        let final_health = self.final_health?;
        Some(RvfArtifact {
            manifest,
            memory,
            witness_chain: WitnessChain {
                records: self.witness_records,
                rollback_witnesses: self.rollback_witnesses,
                chain_hash: None,
            },
            final_health,
            final_iq: self.final_iq,
        })
    }
}
/// `Default` mirrors `new()`; satisfies clippy's `new_without_default` and
/// lets the builder be used where a `Default` bound is expected.
impl Default for RvfArtifactBuilder {
    fn default() -> Self {
        Self::new()
    }
}
// ═══════════════════════════════════════════════════════════════════════════
// Hash utilities (simple deterministic hashing for witness chain)
// ═══════════════════════════════════════════════════════════════════════════
/// Simple deterministic hash for reproducibility checks.
///
/// 64-bit FNV-1a over the raw bytes, rendered as a fixed-width 16-character
/// lowercase hex string. Not cryptographic — used only for cheap
/// tamper-evidence in the witness chain.
pub fn fnv_hash(data: &[u8]) -> String {
    const FNV_OFFSET_BASIS: u64 = 0xcbf29ce484222325;
    const FNV_PRIME: u64 = 0x100000001b3;
    let digest = data
        .iter()
        .fold(FNV_OFFSET_BASIS, |acc, &byte| {
            (acc ^ u64::from(byte)).wrapping_mul(FNV_PRIME)
        });
    format!("{digest:016x}")
}
/// Hash a serializable value.
///
/// Serializes `value` to JSON and FNV-hashes the bytes. NOTE(review):
/// `unwrap_or_default` means a serialization failure silently hashes the
/// empty byte string, so all unserializable values collide on one hash —
/// confirm this fallback is intended.
pub fn hash_value<T: Serialize>(value: &T) -> String {
    let json = serde_json::to_vec(value).unwrap_or_default();
    fnv_hash(&json)
}
// ═══════════════════════════════════════════════════════════════════════════
// Verification
// ═══════════════════════════════════════════════════════════════════════════
/// Result of artifact verification.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct VerificationResult {
    /// Overall pass/fail (`mismatches == 0 && chain_intact`)
    pub passed: bool,
    /// Per-witness verification
    pub witness_checks: Vec<WitnessCheck>,
    /// Number of failed checks (empty hashes and broken memory transitions)
    pub mismatches: usize,
    /// Chain integrity (each record's before-root matches the previous after-root)
    pub chain_intact: bool,
}
/// Single witness check result.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct WitnessCheck {
    /// Episode number copied from the checked record
    pub episode: usize,
    /// Input hash was non-empty
    pub input_hash_ok: bool,
    /// Grade hash was non-empty
    pub grade_hash_ok: bool,
    /// `memory_root_before` matched the previous record's `memory_root_after`
    /// (always true for the first record)
    pub memory_transition_ok: bool,
}
/// Verify an artifact's witness chain integrity.
///
/// Per record, three checks are made: the input hash is non-empty, the grade
/// hash is non-empty, and `memory_root_before` equals the previous record's
/// `memory_root_after` (the first record is exempt). Every failed check adds
/// one mismatch; a broken memory transition additionally marks the chain as
/// not intact.
pub fn verify_witness_chain(artifact: &RvfArtifact) -> VerificationResult {
    let records = &artifact.witness_chain.records;
    let mut witness_checks = Vec::with_capacity(records.len());
    let mut mismatches = 0usize;
    let mut chain_intact = true;
    // `None` marks "no predecessor yet" for the first record.
    let mut expected_before: Option<String> = None;

    for record in records {
        let input_hash_ok = !record.input_hash.is_empty();
        let grade_hash_ok = !record.grade_hash.is_empty();
        // Memory transition: after(N-1) == before(N).
        let memory_transition_ok = match expected_before.as_deref() {
            None => true,
            Some(prev_after) => record.memory_root_before == prev_after,
        };

        mismatches += [input_hash_ok, grade_hash_ok, memory_transition_ok]
            .iter()
            .filter(|&&ok| !ok)
            .count();
        if !memory_transition_ok {
            chain_intact = false;
        }

        expected_before = Some(record.memory_root_after.clone());
        witness_checks.push(WitnessCheck {
            episode: record.episode,
            input_hash_ok,
            grade_hash_ok,
            memory_transition_ok,
        });
    }

    VerificationResult {
        passed: mismatches == 0 && chain_intact,
        witness_checks,
        mismatches,
        chain_intact,
    }
}
// ═══════════════════════════════════════════════════════════════════════════
// Tests
// ═══════════════════════════════════════════════════════════════════════════
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn fnv_hash_deterministic() {
    // Identical bytes must always produce the identical digest…
    assert_eq!(fnv_hash(b"hello world"), fnv_hash(b"hello world"));
    // …while a single-byte difference must change it.
    assert_ne!(fnv_hash(b"hello world"), fnv_hash(b"hello world!"));
}
#[test]
fn artifact_builder_works() {
    // Fully-populated manifest with pinned configs and seeds.
    let manifest = RvfManifest {
        rvf_version: "1.0".to_string(),
        engine_version: "0.1.0".to_string(),
        solver_config: SolverConfig {
            step_budget: 400,
            noise_rate: 0.25,
            retry_enabled: true,
            beam_width: 3,
            min_accuracy: 0.80,
        },
        generator_config: GeneratorConfig {
            min_difficulty: 1,
            max_difficulty: 10,
            constraint_density: 3,
            domain: "temporal_puzzles".to_string(),
        },
        seed_set: SeedSet {
            holdout_seed: 0xDEAD_BEEF,
            training_seed: 42,
            noise_seed: 31337,
        },
        holdout_ids: vec!["p1".into(), "p2".into()],
        cycles: 10,
        created_at: "2026-02-15T00:00:00Z".to_string(),
        artifact_hash: None,
    };
    let memory = MemorySnapshot {
        reasoning_bank_data: vec![1, 2, 3],
        compiler_cache: Vec::new(),
        promotion_log: Vec::new(),
        class_summary: MemoryClassSummary::default(),
    };
    let health = ContractHealth {
        solved_per_cost: 0.85,
        noise_stability: 0.92,
        contradiction_rate: 0.01,
        rollback_correctness: 1.0,
        policy_violations: 0,
        accuracy: 0.95,
        cost_efficiency: 0.85,
        compliant: true,
    };
    // All three required builder fields supplied, so build() must succeed.
    let artifact = RvfArtifactBuilder::new()
        .manifest(manifest)
        .memory(memory)
        .final_health(health)
        .final_iq(95.0)
        .build();
    assert!(artifact.is_some());
    let a = artifact.unwrap();
    assert_eq!(a.manifest.rvf_version, "1.0");
    assert_eq!(a.final_iq, 95.0);
    assert!(a.final_health.compliant);
}
#[test]
fn witness_chain_verification() {
    let mut builder = RvfArtifactBuilder::new();
    // Build a 3-episode witness chain with consistent memory transitions
    let mem_root_0 = fnv_hash(b"initial");
    let mem_root_1 = fnv_hash(b"after_cycle_1");
    let mem_root_2 = fnv_hash(b"after_cycle_2");
    let mem_root_3 = fnv_hash(b"after_cycle_3");
    let health = ContractHealth {
        solved_per_cost: 0.9,
        noise_stability: 0.95,
        contradiction_rate: 0.0,
        rollback_correctness: 1.0,
        policy_violations: 0,
        accuracy: 0.95,
        cost_efficiency: 0.90,
        compliant: true,
    };
    builder.add_witness(WitnessRecord {
        episode: 0,
        input_hash: fnv_hash(b"input_0"),
        config_hash: fnv_hash(b"config"),
        grade_hash: fnv_hash(b"grade_0"),
        memory_root_before: mem_root_0.clone(),
        memory_root_after: mem_root_1.clone(),
        gate_decisions_hash: fnv_hash(b"gates_0"),
        contract_health: health.clone(),
    });
    builder.add_witness(WitnessRecord {
        episode: 1,
        input_hash: fnv_hash(b"input_1"),
        config_hash: fnv_hash(b"config"),
        grade_hash: fnv_hash(b"grade_1"),
        memory_root_before: mem_root_1.clone(), // matches prev after
        memory_root_after: mem_root_2.clone(),
        gate_decisions_hash: fnv_hash(b"gates_1"),
        contract_health: health.clone(),
    });
    builder.add_witness(WitnessRecord {
        episode: 2,
        input_hash: fnv_hash(b"input_2"),
        config_hash: fnv_hash(b"config"),
        grade_hash: fnv_hash(b"grade_2"),
        memory_root_before: mem_root_2.clone(), // matches prev after
        memory_root_after: mem_root_3.clone(),
        gate_decisions_hash: fnv_hash(b"gates_2"),
        contract_health: health.clone(),
    });
    let manifest = RvfManifest {
        rvf_version: "1.0".to_string(),
        engine_version: "0.1.0".to_string(),
        solver_config: SolverConfig {
            step_budget: 400,
            noise_rate: 0.25,
            retry_enabled: true,
            beam_width: 3,
            min_accuracy: 0.80,
        },
        generator_config: GeneratorConfig {
            min_difficulty: 1,
            max_difficulty: 10,
            constraint_density: 3,
            domain: "temporal_puzzles".to_string(),
        },
        seed_set: SeedSet {
            holdout_seed: 0xDEAD_BEEF,
            training_seed: 42,
            noise_seed: 31337,
        },
        holdout_ids: Vec::new(),
        cycles: 3,
        created_at: "2026-02-15T00:00:00Z".to_string(),
        artifact_hash: None,
    };
    let artifact = RvfArtifactBuilder::new()
        .manifest(manifest)
        .memory(MemorySnapshot {
            reasoning_bank_data: Vec::new(),
            compiler_cache: Vec::new(),
            promotion_log: Vec::new(),
            class_summary: MemoryClassSummary::default(),
        })
        .final_health(health)
        .final_iq(90.0);
    // Transfer witnesses
    // (the chained builder above is a second instance; the records were
    // accumulated on `builder`, so splice them in after build()).
    let mut artifact_raw = artifact.build().unwrap();
    artifact_raw.witness_chain.records = builder.witness_records;
    let result = verify_witness_chain(&artifact_raw);
    // Well-formed chain: all hashes present, every transition consistent.
    assert!(result.passed);
    assert!(result.chain_intact);
    assert_eq!(result.mismatches, 0);
    assert_eq!(result.witness_checks.len(), 3);
}
#[test]
fn witness_chain_detects_tampering() {
    let health = ContractHealth {
        solved_per_cost: 0.9,
        noise_stability: 0.95,
        contradiction_rate: 0.0,
        rollback_correctness: 1.0,
        policy_violations: 0,
        accuracy: 0.95,
        cost_efficiency: 0.90,
        compliant: true,
    };
    // Artifact constructed directly (no builder) with a deliberately broken
    // memory transition between episode 0 and episode 1.
    let mut artifact = RvfArtifact {
        manifest: RvfManifest {
            rvf_version: "1.0".to_string(),
            engine_version: "0.1.0".to_string(),
            solver_config: SolverConfig {
                step_budget: 400,
                noise_rate: 0.25,
                retry_enabled: true,
                beam_width: 3,
                min_accuracy: 0.80,
            },
            generator_config: GeneratorConfig {
                min_difficulty: 1,
                max_difficulty: 10,
                constraint_density: 3,
                domain: "temporal_puzzles".to_string(),
            },
            seed_set: SeedSet {
                holdout_seed: 0xDEAD_BEEF,
                training_seed: 42,
                noise_seed: 31337,
            },
            holdout_ids: Vec::new(),
            cycles: 2,
            created_at: "2026-02-15T00:00:00Z".to_string(),
            artifact_hash: None,
        },
        memory: MemorySnapshot {
            reasoning_bank_data: Vec::new(),
            compiler_cache: Vec::new(),
            promotion_log: Vec::new(),
            class_summary: MemoryClassSummary::default(),
        },
        witness_chain: WitnessChain {
            records: vec![
                WitnessRecord {
                    episode: 0,
                    input_hash: fnv_hash(b"in_0"),
                    config_hash: fnv_hash(b"cfg"),
                    grade_hash: fnv_hash(b"gr_0"),
                    memory_root_before: fnv_hash(b"mem_0"),
                    memory_root_after: fnv_hash(b"mem_1"),
                    gate_decisions_hash: fnv_hash(b"g_0"),
                    contract_health: health.clone(),
                },
                WitnessRecord {
                    episode: 1,
                    input_hash: fnv_hash(b"in_1"),
                    config_hash: fnv_hash(b"cfg"),
                    grade_hash: fnv_hash(b"gr_1"),
                    // TAMPERED: memory_root_before doesn't match previous after
                    memory_root_before: fnv_hash(b"WRONG"),
                    memory_root_after: fnv_hash(b"mem_2"),
                    gate_decisions_hash: fnv_hash(b"g_1"),
                    contract_health: health.clone(),
                },
            ],
            rollback_witnesses: Vec::new(),
            chain_hash: None,
        },
        final_health: health,
        final_iq: 90.0,
    };
    let result = verify_witness_chain(&artifact);
    // The broken transition must fail the chain and count as a mismatch.
    assert!(!result.passed);
    assert!(!result.chain_intact);
    assert!(result.mismatches > 0);
}
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,382 @@
//! Swarm Controller Regret Tracking
//!
//! Implements sublinear regret metrics for multi-agent control:
//! - Episode-based regret computation
//! - Oracle baseline comparison
//! - Regret curve tracking (R_k/k should decrease)
//!
//! Based on research on sublinear regret in multi-agent and LLM-agent settings
use serde::{Deserialize, Serialize};
use std::collections::VecDeque;
/// Episode result from agent execution
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct EpisodeResult {
    /// Episode number
    pub episode: usize,
    /// Number of puzzles/tasks in episode
    pub num_tasks: usize,
    /// Tasks solved
    pub solved: usize,
    /// Correct solutions (numerator of `accuracy()`)
    pub correct: usize,
    /// Total steps taken
    pub total_steps: usize,
    /// Total tool calls
    pub tool_calls: usize,
    /// Total latency in ms
    pub latency_ms: u64,
    /// Agent reward (e.g., accuracy * 100 - steps / 10)
    pub reward: f64,
    /// Oracle reward (best possible performance); regret is the gap to this
    pub oracle_reward: f64,
}
impl EpisodeResult {
    /// Instantaneous regret: how far the agent fell short of the oracle,
    /// floored at zero (beating the oracle earns no negative regret).
    pub fn regret(&self) -> f64 {
        let shortfall = self.oracle_reward - self.reward;
        if shortfall > 0.0 { shortfall } else { 0.0 }
    }
    /// Fraction of tasks answered correctly; 0.0 for an empty episode.
    pub fn accuracy(&self) -> f64 {
        match self.num_tasks {
            0 => 0.0,
            n => self.correct as f64 / n as f64,
        }
    }
}
/// Regret tracker for swarm controller
///
/// Maintains parallel per-episode curves (cumulative regret and average
/// regret R_k/k) plus a bounded window of recent rewards.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct RegretTracker {
    /// Episode results
    pub episodes: Vec<EpisodeResult>,
    /// Cumulative regret history (one entry per recorded episode)
    pub cumulative_regret: Vec<f64>,
    /// Average regret history (R_k/k, one entry per recorded episode)
    pub average_regret: Vec<f64>,
    /// Window size for moving average
    pub window_size: usize,
    /// Recent rewards for moving average (length bounded by `window_size`)
    recent_rewards: VecDeque<f64>,
}
impl Default for RegretTracker {
    /// Default tracker uses a 20-episode moving-average window.
    fn default() -> Self {
        Self::new(20)
    }
}
impl RegretTracker {
    /// Create a new tracker with the given moving-average window size.
    pub fn new(window_size: usize) -> Self {
        Self {
            episodes: Vec::new(),
            cumulative_regret: Vec::new(),
            average_regret: Vec::new(),
            window_size,
            recent_rewards: VecDeque::with_capacity(window_size),
        }
    }
    /// Record an episode: extend both regret curves and fold the reward
    /// into the sliding window.
    pub fn record_episode(&mut self, result: EpisodeResult) {
        let cumulative = self.current_cumulative_regret() + result.regret();
        self.cumulative_regret.push(cumulative);
        // Average regret R_k/k, where k counts episodes including this one.
        let k = (self.episodes.len() + 1) as f64;
        self.average_regret.push(cumulative / k);
        // Sliding reward window: evict the oldest entry once over capacity.
        self.recent_rewards.push_back(result.reward);
        while self.recent_rewards.len() > self.window_size {
            self.recent_rewards.pop_front();
        }
        self.episodes.push(result);
    }
    /// Total regret accumulated so far (0.0 before any episode).
    pub fn current_cumulative_regret(&self) -> f64 {
        self.cumulative_regret.last().copied().unwrap_or(0.0)
    }
    /// Latest R_k/k value (0.0 before any episode).
    pub fn current_average_regret(&self) -> f64 {
        self.average_regret.last().copied().unwrap_or(0.0)
    }
    /// Heuristic sublinearity check: the newest average-regret value must
    /// be below the one from five episodes ago. With fewer than five data
    /// points this optimistically reports `true`.
    pub fn is_sublinear(&self) -> bool {
        let n = self.average_regret.len();
        if n < 5 {
            return true; // Not enough data
        }
        self.average_regret[n - 1] < self.average_regret[n - 5]
    }
    /// Slope of a least-squares line through the last (up to 10)
    /// average-regret values; negative means regret is shrinking.
    pub fn regret_trend(&self) -> f64 {
        let n = self.average_regret.len();
        if n < 2 {
            return 0.0;
        }
        let window = n.min(10);
        let recent = &self.average_regret[n - window..];
        let x_mean = (window - 1) as f64 / 2.0;
        let y_mean = recent.iter().sum::<f64>() / window as f64;
        // Accumulate the regression numerator and denominator in one pass.
        let (num, den) = recent
            .iter()
            .enumerate()
            .fold((0.0, 0.0), |(num, den), (i, y)| {
                let dx = i as f64 - x_mean;
                (num + dx * (y - y_mean), den + dx * dx)
            });
        if den.abs() < 1e-10 {
            0.0
        } else {
            num / den
        }
    }
    /// Mean reward over the sliding window (0.0 when empty).
    pub fn moving_average_reward(&self) -> f64 {
        let count = self.recent_rewards.len();
        if count == 0 {
            return 0.0;
        }
        self.recent_rewards.iter().sum::<f64>() / count as f64
    }
    /// Snapshot of all summary statistics.
    pub fn summary(&self) -> RegretSummary {
        let total_episodes = self.episodes.len();
        // Mean of an arbitrary per-episode metric; 0.0 for an empty tracker.
        let mean_of = |metric: &dyn Fn(&EpisodeResult) -> f64| -> f64 {
            if total_episodes == 0 {
                0.0
            } else {
                self.episodes.iter().map(|e| metric(e)).sum::<f64>() / total_episodes as f64
            }
        };
        RegretSummary {
            total_episodes,
            total_regret: self.current_cumulative_regret(),
            average_regret: self.current_average_regret(),
            regret_trend: self.regret_trend(),
            is_sublinear: self.is_sublinear(),
            average_accuracy: mean_of(&|e| e.accuracy()),
            average_reward: mean_of(&|e| e.reward),
            moving_average_reward: self.moving_average_reward(),
        }
    }
}
/// Regret summary statistics
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct RegretSummary {
    /// Number of recorded episodes
    pub total_episodes: usize,
    /// Cumulative regret over all episodes
    pub total_regret: f64,
    /// Latest average regret (R_k/k)
    pub average_regret: f64,
    /// Least-squares slope of recent average regret (negative = improving)
    pub regret_trend: f64,
    /// Whether average regret is trending down (sublinear regret)
    pub is_sublinear: bool,
    /// Mean per-episode accuracy
    pub average_accuracy: f64,
    /// Mean per-episode reward
    pub average_reward: f64,
    /// Mean reward over the sliding window
    pub moving_average_reward: f64,
}
/// Oracle baseline for computing optimal rewards
///
/// NOTE(review): unlike the neighboring types this one does not derive the
/// serde traits — confirm it never needs to be persisted.
#[derive(Clone, Debug)]
pub struct OracleBaseline {
    /// Perfect accuracy reward
    pub perfect_accuracy_reward: f64,
    /// Step penalty factor (cost per step)
    pub step_penalty: f64,
    /// Minimum steps for optimal solution
    pub min_steps: usize,
}
impl Default for OracleBaseline {
    /// Defaults: 100-point accuracy payoff, 0.1 cost per step, and an
    /// assumed optimum of 5 steps per task.
    fn default() -> Self {
        Self {
            perfect_accuracy_reward: 100.0,
            step_penalty: 0.1,
            min_steps: 5,
        }
    }
}
impl OracleBaseline {
    /// Reward an oracle would earn on `num_tasks`: the full accuracy payoff
    /// minus the step penalty for solving every task in `min_steps`.
    pub fn compute_reward(&self, num_tasks: usize) -> f64 {
        let optimal_steps = (self.min_steps * num_tasks) as f64;
        self.perfect_accuracy_reward - optimal_steps * self.step_penalty
    }
}
/// Swarm controller with regret tracking
pub struct SwarmController {
    /// Regret tracker
    pub regret: RegretTracker,
    /// Oracle baseline
    pub oracle: OracleBaseline,
    /// Current episode number (1-based after the first `start_episode`)
    pub current_episode: usize,
    /// Tasks per episode
    pub tasks_per_episode: usize,
}
impl Default for SwarmController {
    /// Default controller runs 20 tasks per episode.
    fn default() -> Self {
        Self::new(20)
    }
}
impl SwarmController {
    /// Create a controller expecting `tasks_per_episode` tasks per episode,
    /// with a 20-episode regret window and the default oracle.
    pub fn new(tasks_per_episode: usize) -> Self {
        Self {
            regret: RegretTracker::new(20),
            oracle: OracleBaseline::default(),
            current_episode: 0,
            tasks_per_episode,
        }
    }
    /// Advance the episode counter.
    pub fn start_episode(&mut self) {
        self.current_episode += 1;
    }
    /// Record a finished episode: derive the agent and oracle rewards and
    /// hand the result to the regret tracker.
    pub fn complete_episode(
        &mut self,
        solved: usize,
        correct: usize,
        total_steps: usize,
        tool_calls: usize,
        latency_ms: u64,
    ) {
        let num_tasks = self.tasks_per_episode;
        let accuracy = match num_tasks {
            0 => 0.0,
            n => correct as f64 / n as f64,
        };
        // Agent reward: accuracy payoff minus a per-step cost.
        let accuracy_payoff = accuracy * self.oracle.perfect_accuracy_reward;
        let step_cost = total_steps as f64 * self.oracle.step_penalty;
        self.regret.record_episode(EpisodeResult {
            episode: self.current_episode,
            num_tasks,
            solved,
            correct,
            total_steps,
            tool_calls,
            latency_ms,
            reward: accuracy_payoff - step_cost,
            oracle_reward: self.oracle.compute_reward(num_tasks),
        });
    }
    /// Condensed status view for dashboards/logging.
    pub fn status(&self) -> SwarmStatus {
        let summary = self.regret.summary();
        SwarmStatus {
            episode: self.current_episode,
            cumulative_regret: summary.total_regret,
            average_regret: summary.average_regret,
            is_improving: summary.is_sublinear,
            accuracy: summary.average_accuracy,
        }
    }
}
/// Swarm controller status
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SwarmStatus {
    /// Current episode number
    pub episode: usize,
    /// Total regret accumulated so far
    pub cumulative_regret: f64,
    /// Latest average regret (R_k/k)
    pub average_regret: f64,
    /// True when average regret is trending down (sublinear regret)
    pub is_improving: bool,
    /// Mean per-episode accuracy
    pub accuracy: f64,
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_regret_tracking() {
    let mut tracker = RegretTracker::new(10);
    // Simulate improving performance: accuracy climbs 0.50 -> 0.95 while the
    // step count falls, so per-episode regret against the fixed 99.0 oracle
    // reward shrinks every episode.
    for i in 0..10 {
        let accuracy = 0.5 + 0.05 * i as f64;
        let result = EpisodeResult {
            episode: i + 1,
            num_tasks: 20,
            solved: (20.0 * accuracy) as usize,
            correct: (20.0 * accuracy) as usize,
            total_steps: 100 - i * 5,
            tool_calls: 20,
            latency_ms: 1000,
            reward: accuracy * 100.0 - (100 - i * 5) as f64 * 0.1,
            oracle_reward: 99.0,
        };
        tracker.record_episode(result);
    }
    // Shrinking regret must register as sublinear with a negative trend slope.
    assert!(tracker.is_sublinear());
    assert!(tracker.regret_trend() < 0.0);
}
#[test]
fn test_swarm_controller() {
    let mut controller = SwarmController::new(20);
    // Run five identical, high-accuracy episodes (17/20 correct).
    (0..5).for_each(|_| {
        controller.start_episode();
        controller.complete_episode(18, 17, 80, 20, 500);
    });
    let SwarmStatus { episode, accuracy, .. } = controller.status();
    assert_eq!(episode, 5);
    assert!(accuracy > 0.8);
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,657 @@
//! TimePuzzles Generator
//!
//! Generates constraint-based temporal reasoning puzzles
//! based on the TimePuzzles benchmark methodology (arXiv:2601.07148)
//!
//! Key features:
//! - Factual temporal anchors with calendar relations
//! - Cross-cultural date systems
//! - Controlled difficulty levels
//! - Dynamic puzzle generation
use crate::temporal::{TemporalConstraint, TemporalPuzzle};
use anyhow::Result;
use chrono::{Datelike, NaiveDate};
use rand::prelude::*;
use serde::{Deserialize, Serialize};
/// Multi-dimensional difficulty vector.
///
/// Replaces single-axis difficulty to prevent collapsing effects.
/// Higher difficulty = more work and more ambiguity, NOT tighter posterior.
/// Use `DifficultyVector::from_scalar` for the legacy 1-10 mapping.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct DifficultyVector {
    /// Size of the search range (days)
    pub range_size: usize,
    /// Target number of valid candidates in posterior
    pub posterior_target: usize,
    /// Rate of distractor constraints (0.0 - 1.0)
    pub distractor_rate: f64,
    /// Rate of noise injection (0.0 - 1.0)
    pub noise_rate: f64,
    /// Number of ambiguous solutions (dates that almost satisfy constraints)
    pub ambiguity_count: usize,
}
impl Default for DifficultyVector {
    /// Baseline difficulty: a 60-day range whose posterior target equals the
    /// full range (no pruning pressure), with no distractors, noise, or
    /// ambiguity.
    fn default() -> Self {
        Self {
            range_size: 60,
            posterior_target: 60,
            distractor_rate: 0.0,
            noise_rate: 0.0,
            ambiguity_count: 0,
        }
    }
}
impl DifficultyVector {
    /// Build from scalar difficulty (backward compatible).
    ///
    /// Higher difficulty = wider range, more distractors, more ambiguity.
    /// The input is clamped onto the supported 1..=10 scale before mapping.
    pub fn from_scalar(difficulty: u8) -> Self {
        // `clamp` replaces the old `min(10).max(1)` chain — identical result,
        // clearer intent.
        let d = difficulty.clamp(1, 10);
        Self {
            range_size: difficulty_to_range_size(d),
            posterior_target: difficulty_to_posterior(d),
            distractor_rate: difficulty_to_distractor_rate(d),
            noise_rate: difficulty_to_noise_rate(d),
            ambiguity_count: difficulty_to_ambiguity(d),
        }
    }
    /// Scalar difficulty estimate (for backward compat).
    ///
    /// Weighted combination of range, distractor, and ambiguity scores,
    /// truncated to `u8` and clamped back onto the 1..=10 scale.
    pub fn scalar(&self) -> u8 {
        let range_score = (self.range_size as f64 / 365.0 * 10.0).min(10.0);
        let distractor_score = self.distractor_rate * 10.0;
        let ambiguity_score = (self.ambiguity_count as f64 / 5.0 * 10.0).min(10.0);
        // `as u8` truncates toward zero; with the documented 0.0-1.0
        // distractor rate and the capped scores the weighted sum stays <= 10,
        // so the cast cannot overflow.
        let combined = (range_score * 0.3 + distractor_score * 0.3 + ambiguity_score * 0.4) as u8;
        combined.clamp(1, 10)
    }
}
/// Puzzle generator configuration
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct PuzzleGeneratorConfig {
    /// Minimum difficulty (1-10)
    pub min_difficulty: u8,
    /// Maximum difficulty (1-10)
    pub max_difficulty: u8,
    /// Constraint density (1-5)
    pub constraint_density: u8,
    /// Include cross-cultural references
    pub cross_cultural: bool,
    /// Include relative constraints (anchor-based DaysAfter/DaysBefore)
    pub relative_constraints: bool,
    /// Year range for puzzles (inclusive on both ends)
    pub year_range: (i32, i32),
    /// Random seed (optional; `None` draws the RNG from OS entropy)
    pub seed: Option<u64>,
}
impl Default for PuzzleGeneratorConfig {
    /// Full 1-10 difficulty span, density 3, all constraint families
    /// enabled, years 2000-2030, entropy-seeded RNG.
    fn default() -> Self {
        Self {
            min_difficulty: 1,
            max_difficulty: 10,
            constraint_density: 3,
            cross_cultural: true,
            relative_constraints: true,
            year_range: (2000, 2030),
            seed: None,
        }
    }
}
/// Known events for temporal anchoring
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct TemporalAnchor {
    /// Human-readable event name (also referenced by relative constraints)
    pub name: String,
    /// Calendar date of the event
    pub date: NaiveDate,
    /// Event category (e.g. "holiday", "historical")
    pub category: String,
    /// Cultural origin (e.g. "western", "chinese", "global")
    pub culture: String,
}
impl TemporalAnchor {
    /// Construct an anchor at a fixed calendar date.
    ///
    /// # Panics
    ///
    /// Panics if `year`/`month`/`day` do not form a valid calendar date
    /// (e.g. February 30). All call sites in this module pass hard-coded,
    /// valid dates; the `expect` message makes a future bad literal easy
    /// to diagnose (the old bare `unwrap` gave no context).
    pub fn new(
        name: impl Into<String>,
        year: i32,
        month: u32,
        day: u32,
        category: impl Into<String>,
        culture: impl Into<String>,
    ) -> Self {
        let date = NaiveDate::from_ymd_opt(year, month, day)
            .expect("TemporalAnchor::new requires a valid calendar date");
        Self {
            name: name.into(),
            date,
            category: category.into(),
            culture: culture.into(),
        }
    }
}
/// TimePuzzles generator
pub struct PuzzleGenerator {
    /// Generation parameters (difficulty bounds, year range, seed, ...)
    config: PuzzleGeneratorConfig,
    /// Anchor events available for relative constraints
    anchors: Vec<TemporalAnchor>,
    /// RNG; seeded from `config.seed` when provided, for deterministic replay
    rng: StdRng,
}
impl PuzzleGenerator {
/// Create a new generator with config
///
/// A seeded generator is fully deterministic; without a seed the RNG is
/// drawn from OS entropy.
pub fn new(config: PuzzleGeneratorConfig) -> Self {
    let rng = config
        .seed
        .map_or_else(StdRng::from_entropy, StdRng::seed_from_u64);
    let mut generator = Self {
        config,
        anchors: Vec::new(),
        rng,
    };
    generator.init_anchors();
    generator
}
/// Initialize standard temporal anchors
///
/// Populates the anchor table in a fixed order — western holidays, then
/// (optionally) cross-cultural holidays, then historical events — so that
/// seeded anchor selection stays deterministic.
fn init_anchors(&mut self) {
    type AnchorSpec = (&'static str, i32, u32, u32, &'static str, &'static str);
    // Western holidays
    const WESTERN: [AnchorSpec; 5] = [
        ("Christmas", 2024, 12, 25, "holiday", "western"),
        ("New Year", 2024, 1, 1, "holiday", "western"),
        ("Independence Day", 2024, 7, 4, "holiday", "american"),
        ("Halloween", 2024, 10, 31, "holiday", "western"),
        ("Valentine's Day", 2024, 2, 14, "holiday", "western"),
    ];
    // Cross-cultural events (Chinese New Year 2024 = Year of the Dragon;
    // Hanukkah 2024 date is its first night)
    const CROSS_CULTURAL: [AnchorSpec; 4] = [
        ("Chinese New Year 2024", 2024, 2, 10, "holiday", "chinese"),
        ("Diwali 2024", 2024, 11, 1, "holiday", "indian"),
        ("Eid al-Fitr 2024", 2024, 4, 10, "holiday", "islamic"),
        ("Hanukkah 2024", 2024, 12, 25, "holiday", "jewish"),
    ];
    // Historical events
    const HISTORICAL: [AnchorSpec; 3] = [
        ("Moon Landing", 1969, 7, 20, "historical", "global"),
        ("Fall of Berlin Wall", 1989, 11, 9, "historical", "global"),
        ("Y2K", 2000, 1, 1, "historical", "global"),
    ];

    let include_cross_cultural = self.config.cross_cultural;
    let anchors = &mut self.anchors;
    let mut push_all = |entries: &[AnchorSpec]| {
        for &(name, year, month, day, category, culture) in entries {
            anchors.push(TemporalAnchor::new(name, year, month, day, category, culture));
        }
    };
    push_all(&WESTERN);
    if include_cross_cultural {
        push_all(&CROSS_CULTURAL);
    }
    push_all(&HISTORICAL);
}
/// Generate a single puzzle with multi-dimensional difficulty vector.
///
/// Difficulty scaling (higher = more work, not tighter posterior):
/// - Low (1-2): small range, no DayOfWeek, no distractors
/// - Medium (3-6): DayOfWeek + moderate range = 7x cost surface
/// - High (7-10): wide range + distractors + ambiguity + anchor constraints
///
/// All modes have access to weekday skipping; what differs is the policy.
///
/// The body has no failure path today (no `?`, always ends in `Ok`); the
/// `Result` return keeps the signature open for fallible generation later.
///
/// NOTE(review): the order of `self.rng` draws below is load-bearing for
/// seeded reproducibility — do not reorder the sampling statements.
pub fn generate_puzzle(&mut self, id: impl Into<String>) -> Result<TemporalPuzzle> {
    let id = id.into();
    // Draw 1: scalar difficulty within the configured bounds.
    let difficulty = self
        .rng
        .gen_range(self.config.min_difficulty..=self.config.max_difficulty);
    // Build difficulty vector from scalar
    let dv = DifficultyVector::from_scalar(difficulty);
    // DayOfWeek (difficulty 3+): creates cost surface for policy decisions
    let use_day_of_week = difficulty >= 3;
    // Range size from difficulty vector (wider range at higher difficulty)
    let range_days = dv.range_size as i64;
    // Pick target date (draws 2-4: year, month, day)
    let year = self
        .rng
        .gen_range(self.config.year_range.0..=self.config.year_range.1);
    let month = self.rng.gen_range(1..=12);
    let max_day = days_in_month(year, month);
    let day = self.rng.gen_range(1..=max_day);
    // Unwrap is safe: day was sampled within days_in_month(year, month).
    let target = NaiveDate::from_ymd_opt(year, month, day).unwrap();
    // Build Between range centered on target, clamped to year
    let year_start = NaiveDate::from_ymd_opt(year, 1, 1).unwrap();
    let year_end = NaiveDate::from_ymd_opt(year, 12, 31).unwrap();
    // Integer division: for even range_days the window sits one day
    // asymmetric around the target before clamping to the year.
    let half = range_days / 2;
    let range_start = (target - chrono::Duration::days(half)).max(year_start);
    let range_end = (range_start + chrono::Duration::days(range_days - 1)).min(year_end);
    let mut puzzle = TemporalPuzzle::new(id.clone(), format!("Find the date (puzzle {})", id))
        .with_difficulty(difficulty)
        .with_solutions(vec![target]);
    // Attach difficulty vector
    puzzle.difficulty_vector = Some(dv.clone());
    // Base constraints: InYear + Between (defines search range)
    puzzle
        .constraints
        .push(TemporalConstraint::InYear(target.year()));
    puzzle
        .constraints
        .push(TemporalConstraint::Between(range_start, range_end));
    let mut used_anchors: Vec<TemporalAnchor> = Vec::new();
    // DayOfWeek (difficulty 3+): creates cost surface for all modes
    if use_day_of_week {
        puzzle
            .constraints
            .push(TemporalConstraint::DayOfWeek(target.weekday()));
    }
    // Anchor reference for high difficulty (7+)
    if difficulty >= 7 && self.config.relative_constraints {
        if let Some(anchor) = self.anchors.choose(&mut self.rng).cloned() {
            // Express the target relative to the chosen anchor; the sign of
            // the day delta selects DaysAfter vs DaysBefore.
            let diff = (target - anchor.date).num_days();
            let constraint = if diff >= 0 {
                TemporalConstraint::DaysAfter(anchor.name.clone(), diff)
            } else {
                TemporalConstraint::DaysBefore(anchor.name.clone(), diff.abs())
            };
            puzzle.constraints.push(constraint);
            used_anchors.push(anchor);
        }
    }
    // Add anchor references so the solver can resolve the anchor names.
    for anchor in used_anchors {
        puzzle.references.insert(anchor.name.clone(), anchor.date);
    }
    // Distractor injection (from difficulty vector rate; capped at 0.99 so
    // injection is never a certainty)
    if dv.distractor_rate > 0.0 && self.rng.gen_bool(dv.distractor_rate.min(0.99)) {
        let distractor = self.generate_distractor(target, range_start, range_end);
        puzzle.constraints.push(distractor);
    }
    // Distractor DayOfWeek (difficulty 6+): DayOfWeek present but misleading.
    // Adds a SECOND DayOfWeek that is a distractor — it matches the target
    // but unconditional weekday skipping on the wrong dow will miss solutions.
    // This creates a real tradeoff for the PolicyKernel.
    if difficulty >= 6 && use_day_of_week {
        // Chance scales with difficulty; `_ => 0.0` is unreachable given the
        // `difficulty >= 6` guard but keeps the match total.
        let distractor_dow_chance: f64 = match difficulty {
            6 => 0.15,
            7 => 0.25,
            8 => 0.35,
            9..=10 => 0.50,
            _ => 0.0,
        };
        if self.rng.gen_bool(distractor_dow_chance.min(0.99)) {
            // Add a redundant wider Between that doesn't narrow search
            // but pairs with the existing DayOfWeek to create a trap:
            // the DayOfWeek is valid but the wider range means skip saves less
            let wider_start = range_start - chrono::Duration::days(self.rng.gen_range(14..60));
            let wider_end = range_end + chrono::Duration::days(self.rng.gen_range(14..60));
            puzzle
                .constraints
                .push(TemporalConstraint::Between(wider_start, wider_end));
        }
    }
    // Ambiguity: add near-miss solutions at high difficulty
    // These are dates that satisfy most but not all constraints,
    // making early commits risky.
    if dv.ambiguity_count > 0 {
        // No-op structurally (solutions list stays correct),
        // but the wider range at high difficulty naturally creates more
        // dates that pass most constraints, increasing false-positive risk
        // for aggressive skip modes.
    }
    // Count actual distractors injected (deterministic, observable)
    let actual_distractor_count = crate::temporal::count_distractors(&puzzle);
    // Tags: all features visible to policies for deterministic observability
    puzzle.tags = vec![
        format!("difficulty:{}", difficulty),
        format!("year:{}", year),
        format!("range_size:{}", dv.range_size),
        format!("distractor_rate:{:.2}", dv.distractor_rate),
        format!("distractor_count:{}", actual_distractor_count),
        format!("ambiguity:{}", dv.ambiguity_count),
        format!("has_dow:{}", use_day_of_week),
    ];
    Ok(puzzle)
}
/// Generate a distractor constraint: true for the target but doesn't narrow the search.
///
/// Three variants, chosen uniformly:
/// 0. a `Between` that is a strict superset of the existing range,
/// 1. a redundant `InYear` (already present on every generated puzzle),
/// 2. an `After` anchored strictly before `range_start`.
fn generate_distractor(
    &mut self,
    target: NaiveDate,
    range_start: NaiveDate,
    range_end: NaiveDate,
) -> TemporalConstraint {
    match self.rng.gen_range(0u8..3) {
        0 => {
            // Wider Between (superset of existing range → no shrink)
            let wider_start = range_start - chrono::Duration::days(self.rng.gen_range(10..60));
            let wider_end = range_end + chrono::Duration::days(self.rng.gen_range(10..60));
            TemporalConstraint::Between(wider_start, wider_end)
        }
        1 => {
            // Redundant InYear (already present)
            TemporalConstraint::InYear(target.year())
        }
        _ => {
            // After a date strictly before the range. Anchoring on
            // `range_start` (not `target`) guarantees no shrink: at high
            // difficulty the range half-width can reach ~182 days, which
            // exceeds the sampled 30..180 offset, so a target-relative
            // cutoff could land inside the range and narrow the search.
            let days_before = self.rng.gen_range(30..180) as i64;
            TemporalConstraint::After(range_start - chrono::Duration::days(days_before))
        }
    }
}
/// Generate a batch of `count` puzzles with sequential zero-padded ids
/// (`puzzle-0001`, `puzzle-0002`, ...). Stops at the first error.
pub fn generate_batch(&mut self, count: usize) -> Result<Vec<TemporalPuzzle>> {
    (1..=count)
        .map(|n| self.generate_puzzle(format!("puzzle-{:04}", n)))
        .collect()
}
/// Generate `count` puzzles pinned to a single difficulty level.
///
/// Temporarily narrows the generator's difficulty bounds to exactly
/// `difficulty`, then restores the original bounds before returning —
/// on both the success and error paths.
pub fn generate_at_difficulty(
    &mut self,
    count: usize,
    difficulty: u8,
) -> Result<Vec<TemporalPuzzle>> {
    let (saved_min, saved_max) = (self.config.min_difficulty, self.config.max_difficulty);
    self.config.min_difficulty = difficulty;
    self.config.max_difficulty = difficulty;
    let result = self.generate_batch(count);
    self.config.min_difficulty = saved_min;
    self.config.max_difficulty = saved_max;
    result
}
}
/// Range size by difficulty level.
/// Higher difficulty → wider range → more work for the solver.
fn difficulty_to_range_size(difficulty: u8) -> usize {
    // Indexed by difficulty 1..=10 (3 → 56 days = 8 weeks, 4 → 84 = 12
    // weeks, 10 → a full year). Slot 0 mirrors the out-of-range fallback.
    const SIZES: [usize; 11] = [120, 14, 30, 56, 84, 120, 150, 200, 250, 300, 365];
    *SIZES.get(difficulty as usize).unwrap_or(&120)
}
/// Posterior target by difficulty level.
/// Higher difficulty → more valid candidates → more ambiguity.
/// (Flipped from old model: difficulty increases ambiguity, not reduces it.)
fn difficulty_to_posterior(difficulty: u8) -> usize {
    // Indexed by difficulty 1..=10; slot 0 mirrors the fallback (18).
    const POSTERIOR: [usize; 11] = [18, 2, 4, 8, 12, 18, 25, 35, 50, 70, 100];
    *POSTERIOR.get(difficulty as usize).unwrap_or(&18)
}
/// Distractor rate by difficulty level.
/// No distractors through difficulty 3, ramping to 60% at the top end.
fn difficulty_to_distractor_rate(difficulty: u8) -> f64 {
    // Indexed by difficulty 1..=10; slot 0 mirrors the fallback (0.10).
    const RATES: [f64; 11] = [0.10, 0.0, 0.0, 0.0, 0.05, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60];
    *RATES.get(difficulty as usize).unwrap_or(&0.10)
}
/// Noise rate by difficulty level.
/// Steps up in two-level bands: 0 through difficulty 3, 0.40 at 10.
fn difficulty_to_noise_rate(difficulty: u8) -> f64 {
    // Indexed by difficulty 1..=10; slot 0 mirrors the fallback (0.10).
    const NOISE: [f64; 11] = [0.10, 0.0, 0.0, 0.0, 0.10, 0.10, 0.20, 0.20, 0.30, 0.30, 0.40];
    *NOISE.get(difficulty as usize).unwrap_or(&0.10)
}
/// Ambiguity count by difficulty level (near-miss solutions).
/// Zero through difficulty 4, then ramping to 5 near-misses at 10.
fn difficulty_to_ambiguity(difficulty: u8) -> usize {
    // Indexed by difficulty 1..=10; slot 0 mirrors the fallback (0).
    const AMBIGUITY: [usize; 11] = [0, 0, 0, 0, 0, 1, 1, 2, 2, 3, 5];
    *AMBIGUITY.get(difficulty as usize).unwrap_or(&0)
}
/// Days in a given month (handles Gregorian leap years).
///
/// `month` must be in `1..=12`; this is guarded with a `debug_assert!` so
/// invalid months surface in debug builds instead of silently yielding 31
/// (the previous behavior for e.g. month 0 or 13).
fn days_in_month(year: i32, month: u32) -> u32 {
    debug_assert!((1..=12).contains(&month), "month out of range: {month}");
    match month {
        4 | 6 | 9 | 11 => 30,
        2 => {
            // Gregorian rule: divisible by 4, except centuries, unless
            // divisible by 400 (1900 → 28, 2000 → 29).
            if year % 4 == 0 && (year % 100 != 0 || year % 400 == 0) {
                29
            } else {
                28
            }
        }
        _ => 31,
    }
}
/// Sample puzzle sets
pub struct SamplePuzzles;
impl SamplePuzzles {
/// Get easy puzzles (difficulty 1-3)
pub fn easy() -> Vec<TemporalPuzzle> {
let mut gen = PuzzleGenerator::new(PuzzleGeneratorConfig {
min_difficulty: 1,
max_difficulty: 3,
seed: Some(42),
..Default::default()
});
gen.generate_batch(10).unwrap()
}
/// Get medium puzzles (difficulty 4-6)
pub fn medium() -> Vec<TemporalPuzzle> {
let mut gen = PuzzleGenerator::new(PuzzleGeneratorConfig {
min_difficulty: 4,
max_difficulty: 6,
seed: Some(42),
..Default::default()
});
gen.generate_batch(10).unwrap()
}
/// Get hard puzzles (difficulty 7-10)
pub fn hard() -> Vec<TemporalPuzzle> {
let mut gen = PuzzleGenerator::new(PuzzleGeneratorConfig {
min_difficulty: 7,
max_difficulty: 10,
seed: Some(42),
..Default::default()
});
gen.generate_batch(10).unwrap()
}
/// Get cross-cultural puzzles
pub fn cross_cultural() -> Vec<TemporalPuzzle> {
let mut gen = PuzzleGenerator::new(PuzzleGeneratorConfig {
cross_cultural: true,
relative_constraints: true,
min_difficulty: 5,
max_difficulty: 8,
seed: Some(42),
..Default::default()
});
gen.generate_batch(10).unwrap()
}
/// Get a mixed sample set (50 puzzles across all difficulties)
pub fn mixed_sample() -> Vec<TemporalPuzzle> {
let mut all = Vec::new();
all.extend(Self::easy());
all.extend(Self::medium());
all.extend(Self::hard());
all.extend(Self::cross_cultural());
// Add more easy/medium to match TimePuzzles distribution
let mut gen = PuzzleGenerator::new(PuzzleGeneratorConfig {
min_difficulty: 2,
max_difficulty: 5,
seed: Some(123),
..Default::default()
});
all.extend(gen.generate_batch(10).unwrap());
all
}
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_puzzle_generation() {
        let mut generator = PuzzleGenerator::new(PuzzleGeneratorConfig {
            seed: Some(42),
            ..Default::default()
        });
        // A generated puzzle always carries constraints and solutions.
        let puzzle = generator.generate_puzzle("test-1").unwrap();
        assert!(!puzzle.constraints.is_empty());
        assert!(!puzzle.solutions.is_empty());
    }

    #[test]
    fn test_batch_generation() {
        let mut generator = PuzzleGenerator::new(PuzzleGeneratorConfig {
            seed: Some(42),
            ..Default::default()
        });
        // Batch size is honored exactly.
        assert_eq!(generator.generate_batch(20).unwrap().len(), 20);
    }

    #[test]
    fn test_sample_puzzles() {
        // Canned sets respect their advertised difficulty bands.
        let easy = SamplePuzzles::easy();
        assert_eq!(easy.len(), 10);
        assert!(easy.iter().all(|p| p.difficulty <= 3));
        assert!(SamplePuzzles::hard().iter().all(|p| p.difficulty >= 7));
    }
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,417 @@
//! Integration tests for benchmark suite
use chrono::{NaiveDate, Weekday};
use ruvector_benchmarks::{
logging::BenchmarkLogger,
swarm_regret::{EpisodeResult, RegretTracker, SwarmController},
temporal::{TemporalConstraint, TemporalPuzzle, TemporalSolver},
timepuzzles::{PuzzleGenerator, PuzzleGeneratorConfig, SamplePuzzles},
vector_index::{CoherenceGate, DenseVec, IvfConfig, VectorIndex},
};
use tempfile::tempdir;
// ============================================================================
// Vector Index Tests
// ============================================================================
#[test]
fn test_vector_index_insert_search() {
    let mut index = VectorIndex::new(4);
    let nearest = index.insert(DenseVec::new(vec![1.0, 0.0, 0.0, 0.0])).unwrap();
    let _close = index.insert(DenseVec::new(vec![0.9, 0.1, 0.0, 0.0])).unwrap();
    let _far = index.insert(DenseVec::new(vec![0.0, 1.0, 0.0, 0.0])).unwrap();
    // Query identical to the first vector: it must rank first with a
    // strictly higher score than the runner-up.
    let query = DenseVec::new(vec![1.0, 0.0, 0.0, 0.0]);
    let hits = index.search(&query, 2, 1.0).unwrap();
    assert_eq!(hits.len(), 2);
    assert_eq!(hits[0].id, nearest);
    assert!(hits[0].score > hits[1].score);
}
#[test]
fn test_vector_index_coherence_gate() {
    let mut index = VectorIndex::new(4).with_gate(CoherenceGate::new(0.5));
    index.insert(DenseVec::new(vec![1.0, 0.0, 0.0, 0.0])).unwrap();
    index.insert(DenseVec::new(vec![0.0, 1.0, 0.0, 0.0])).unwrap();
    let query = DenseVec::new(vec![1.0, 0.0, 0.0, 0.0]);
    // Below the 0.5 gate threshold every result is blocked...
    assert!(index.search(&query, 10, 0.3).unwrap().is_empty());
    // ...above it, results flow through.
    assert!(!index.search(&query, 10, 0.7).unwrap().is_empty());
}
#[test]
fn test_vector_index_ivf() {
    let mut index = VectorIndex::new(8).with_ivf(IvfConfig::new(4, 2));
    // Enough random vectors for clustering to be meaningful.
    for _ in 0..100 {
        index.insert(DenseVec::random(8)).unwrap();
    }
    index.rebuild_ivf().unwrap();
    // Rebuild must enable IVF and produce at least one cluster.
    let stats = index.stats();
    assert!(stats.ivf_enabled);
    assert!(stats.ivf_clusters > 0);
    // Search through the IVF path still returns at most k hits.
    let hits = index.search(&DenseVec::random(8), 5, 1.0).unwrap();
    assert!(hits.len() <= 5);
}
#[test]
fn test_vector_index_persistence() {
    let dir = tempdir().unwrap();
    let path = dir.path().join("test_index.bin");
    let mut original = VectorIndex::new(4);
    original.insert(DenseVec::new(vec![1.0, 2.0, 3.0, 4.0])).unwrap();
    original.insert(DenseVec::new(vec![5.0, 6.0, 7.0, 8.0])).unwrap();
    original.save_to_file(&path).unwrap();
    // Round-trip: element count and dimensionality survive reload.
    let restored = VectorIndex::load_from_file(&path).unwrap();
    assert_eq!(restored.len(), 2);
    assert_eq!(restored.dim(), 4);
}
// ============================================================================
// Temporal Reasoning Tests
// ============================================================================
#[test]
fn test_temporal_puzzle_exact_date() {
    let target = NaiveDate::from_ymd_opt(2024, 6, 15).unwrap();
    let day_before = NaiveDate::from_ymd_opt(2024, 6, 14).unwrap();
    let puzzle = TemporalPuzzle::new("test", "Find June 15, 2024")
        .with_constraint(TemporalConstraint::Exact(target))
        .with_solutions(vec![target]);
    // Only the exact date satisfies an Exact constraint.
    assert!(puzzle.check_date(target).unwrap());
    assert!(!puzzle.check_date(day_before).unwrap());
}
#[test]
fn test_temporal_puzzle_range() {
    let january = TemporalPuzzle::new("test", "Find a date in January 2024").with_constraint(
        TemporalConstraint::Between(
            NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
            NaiveDate::from_ymd_opt(2024, 1, 31).unwrap(),
        ),
    );
    // Inside the window passes; the first day outside fails.
    assert!(january
        .check_date(NaiveDate::from_ymd_opt(2024, 1, 15).unwrap())
        .unwrap());
    assert!(!january
        .check_date(NaiveDate::from_ymd_opt(2024, 2, 1).unwrap())
        .unwrap());
}
#[test]
fn test_temporal_puzzle_day_of_week() {
    let mondays_2024 = TemporalPuzzle::new("test", "Find a Monday in 2024")
        .with_constraint(TemporalConstraint::InYear(2024))
        .with_constraint(TemporalConstraint::DayOfWeek(Weekday::Mon));
    let monday = NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(); // Jan 1, 2024 is a Monday
    let tuesday = NaiveDate::from_ymd_opt(2024, 1, 2).unwrap(); // Jan 2, 2024 is a Tuesday
    assert!(mondays_2024.check_date(monday).unwrap());
    assert!(!mondays_2024.check_date(tuesday).unwrap());
}
#[test]
fn test_temporal_puzzle_relative() {
    let base = NaiveDate::from_ymd_opt(2024, 3, 1).unwrap();
    let expected = NaiveDate::from_ymd_opt(2024, 3, 11).unwrap();
    // DaysAfter resolves against the named "base" reference.
    let puzzle = TemporalPuzzle::new("test", "Find 10 days after base")
        .with_reference("base", base)
        .with_constraint(TemporalConstraint::DaysAfter("base".to_string(), 10));
    assert!(puzzle.check_date(expected).unwrap());
}
#[test]
fn test_temporal_solver_basic() {
    let target = NaiveDate::from_ymd_opt(2024, 5, 20).unwrap();
    let puzzle = TemporalPuzzle::new("test", "Simple puzzle")
        .with_constraint(TemporalConstraint::Exact(target))
        .with_solutions(vec![target]);
    // An Exact constraint should be trivially solvable and correct.
    let mut solver = TemporalSolver::with_tools(true, false);
    let outcome = solver.solve(&puzzle).unwrap();
    assert!(outcome.solved);
    assert!(outcome.correct);
}
#[test]
fn test_temporal_solver_with_rewriting() {
    let base = NaiveDate::from_ymd_opt(2024, 7, 4).unwrap();
    let expected = NaiveDate::from_ymd_opt(2024, 7, 14).unwrap();
    let puzzle = TemporalPuzzle::new("test", "Relative puzzle")
        .with_reference("event", base)
        .with_constraint(TemporalConstraint::DaysAfter("event".to_string(), 10))
        .with_solutions(vec![expected]);
    let mut solver = TemporalSolver::with_tools(true, false);
    let outcome = solver.solve(&puzzle).unwrap();
    assert!(outcome.solved);
    assert!(outcome.correct);
    // Resolving the relative constraint requires at least one tool call.
    assert!(outcome.tool_calls > 0);
}
// ============================================================================
// TimePuzzles Generator Tests
// ============================================================================
#[test]
fn test_puzzle_generator_basic() {
    let mut generator = PuzzleGenerator::new(PuzzleGeneratorConfig {
        seed: Some(42),
        ..Default::default()
    });
    let puzzle = generator.generate_puzzle("test-1").unwrap();
    // Every generated puzzle has constraints, solutions, and a
    // difficulty on the 1..=10 scale.
    assert!(!puzzle.constraints.is_empty());
    assert!(!puzzle.solutions.is_empty());
    assert!((1u8..=10).contains(&puzzle.difficulty));
}
#[test]
fn test_puzzle_generator_batch() {
    let mut generator = PuzzleGenerator::new(PuzzleGeneratorConfig {
        seed: Some(42),
        ..Default::default()
    });
    let puzzles = generator.generate_batch(20).unwrap();
    assert_eq!(puzzles.len(), 20);
    // Every puzzle in the batch is well-formed.
    assert!(puzzles
        .iter()
        .all(|p| !p.constraints.is_empty() && !p.solutions.is_empty()));
}
#[test]
fn test_puzzle_generator_difficulty() {
    let mut generator = PuzzleGenerator::new(PuzzleGeneratorConfig {
        min_difficulty: 7,
        max_difficulty: 10,
        seed: Some(42),
        ..Default::default()
    });
    // Configured difficulty bounds are honored for every puzzle.
    let puzzles = generator.generate_batch(10).unwrap();
    assert!(puzzles
        .iter()
        .all(|p| (7u8..=10).contains(&p.difficulty)));
}
#[test]
fn test_sample_puzzles() {
    // Each canned set stays inside its advertised difficulty band.
    let easy = SamplePuzzles::easy();
    assert_eq!(easy.len(), 10);
    assert!(easy.iter().all(|p| p.difficulty <= 3));
    assert!(SamplePuzzles::medium()
        .iter()
        .all(|p| (4u8..=6).contains(&p.difficulty)));
    assert!(SamplePuzzles::hard().iter().all(|p| p.difficulty >= 7));
    assert!(SamplePuzzles::mixed_sample().len() >= 40);
}
// ============================================================================
// Swarm Regret Tests
// ============================================================================
#[test]
fn test_regret_tracker_basic() {
    let mut tracker = RegretTracker::new(10);
    tracker.record_episode(EpisodeResult {
        episode: 1,
        num_tasks: 20,
        solved: 18,
        correct: 17,
        total_steps: 100,
        tool_calls: 20,
        latency_ms: 1000,
        reward: 80.0,
        oracle_reward: 99.0,
    });
    assert_eq!(tracker.episodes.len(), 1);
    // Regret after one episode is the oracle/actual gap: 99 - 80 = 19.
    assert!((tracker.current_cumulative_regret() - 19.0).abs() < 0.01);
}
#[test]
fn test_regret_tracker_sublinear() {
    let mut tracker = RegretTracker::new(10);
    // Simulate improving performance (decreasing regret): accuracy climbs
    // from 0.50 toward 0.95 in 0.05 steps while the step count falls by 5
    // per episode, so per-episode reward approaches the fixed oracle (99.0)
    // and per-episode regret shrinks monotonically.
    for i in 0..10 {
        let accuracy = 0.5 + 0.05 * i as f64;
        let result = EpisodeResult {
            episode: i + 1,
            num_tasks: 20,
            solved: (20.0 * accuracy) as usize,
            correct: (20.0 * accuracy) as usize,
            total_steps: 100 - i * 5,
            tool_calls: 20,
            latency_ms: 1000,
            // Synthetic reward model: accuracy payoff minus 0.1 per step.
            reward: accuracy * 100.0 - (100 - i * 5) as f64 * 0.1,
            oracle_reward: 99.0,
        };
        tracker.record_episode(result);
    }
    // Average regret should be decreasing
    assert!(tracker.is_sublinear());
    assert!(tracker.regret_trend() < 0.0);
}
#[test]
fn test_swarm_controller() {
    let mut controller = SwarmController::new(20);
    // Five identical episodes at 18/20 solved, 17 correct.
    // NOTE(review): positional args presumably mirror EpisodeResult's
    // (solved, correct, steps, tool_calls, latency) — confirm in swarm_regret.
    for _ in 0..5 {
        controller.start_episode();
        controller.complete_episode(18, 17, 80, 20, 500);
    }
    let status = controller.status();
    assert_eq!(status.episode, 5);
    assert!(status.accuracy > 0.8);
}
// ============================================================================
// Logging Tests
// ============================================================================
#[test]
fn test_benchmark_logger() {
    let dir = tempdir().unwrap();
    let path = dir.path().join("test.log");
    let mut logger = BenchmarkLogger::new(path.to_str().unwrap()).unwrap();
    // One entry per log family. NOTE(review): the positional args presumably
    // mirror the corresponding log-entry fields (ids, difficulty,
    // solved/correct flags, step and tool-call counts, latency) — confirm
    // against the logging module's signatures.
    logger
        .log_temporal(
            "bench-1", "puzzle-1", 5, true, true, 10, 2, 100, 3, true, false,
        )
        .unwrap();
    logger
        .log_vector("search", 128, 10000, 1, 10, true, 0.9, 500, 10)
        .unwrap();
    logger
        .log_swarm(1, 20, 18, 17, 85.0, 99.0, 14.0, 14.0, true)
        .unwrap();
    logger.flush().unwrap();
    // Read back: all three entries must round-trip through the log file.
    let reader = ruvector_benchmarks::logging::LogReader::new(path.to_str().unwrap());
    let entries = reader.read_all().unwrap();
    assert_eq!(entries.len(), 3);
}
// ============================================================================
// End-to-End Tests
// ============================================================================
#[test]
fn test_full_benchmark_workflow() {
    // Generate a reproducible batch of easy-to-medium puzzles.
    let mut generator = PuzzleGenerator::new(PuzzleGeneratorConfig {
        min_difficulty: 2,
        max_difficulty: 5,
        seed: Some(12345),
        ..Default::default()
    });
    let puzzles = generator.generate_batch(10).unwrap();
    // Create solver (budget must cover wider posterior-based ranges).
    let mut solver = TemporalSolver::with_tools(true, false);
    solver.max_steps = 400;
    // Run every puzzle through the solver.
    let results: Vec<_> = puzzles.iter().map(|p| solver.solve(p).unwrap()).collect();
    // Most easy/medium puzzles should be solved correctly.
    assert!(results.iter().filter(|r| r.solved).count() >= 5);
    assert!(results.iter().filter(|r| r.correct).count() >= 5);
}
#[test]
fn test_vector_temporal_integration() {
    // Store simplified hand-built date "embeddings" in the vector index:
    // slot 0 = day, slot 1 = month, slot 2 = year, all scaled to [0, 1].
    let mut index = VectorIndex::new(64);
    for day in 1..=31 {
        let mut embedding = vec![0.0f32; 64];
        embedding[0] = day as f32 / 31.0; // day component
        embedding[1] = 1.0 / 12.0; // month component (January)
        embedding[2] = 2024.0 / 3000.0; // year component
        index.insert(DenseVec::new(embedding)).unwrap();
    }
    // Probe for mid-January 2024; nearby days should come back.
    let mut probe = vec![0.0f32; 64];
    probe[0] = 15.0 / 31.0;
    probe[1] = 1.0 / 12.0;
    probe[2] = 2024.0 / 3000.0;
    let hits = index.search(&DenseVec::new(probe), 5, 1.0).unwrap();
    assert!(!hits.is_empty());
}