Files
wifi-densepose/vendor/ruvector/examples/benchmarks/src/intelligence_metrics.rs

961 lines
30 KiB
Rust

//! Intelligence Metrics Module
//!
//! Measures cognitive capabilities, reasoning quality, and learning indicators
//! for agent evaluation based on established AI benchmarking methodologies.
//!
//! Key metrics tracked:
//! - Reasoning quality (logical coherence, constraint satisfaction)
//! - Learning efficiency (regret curves, sample efficiency)
//! - Working memory (context utilization, information integration)
//! - Tool use proficiency (appropriate selection, effective utilization)
//! - Meta-cognitive awareness (self-correction, uncertainty estimation)
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
/// Complete result of a single intelligence assessment.
///
/// Built by [`IntelligenceCalculator::calculate`], which derives every
/// sub-metric from one [`RawMetrics`] snapshot and keeps a clone of that
/// snapshot in `raw_data`.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct IntelligenceAssessment {
/// Overall intelligence score (nominally 0-100): a weighted blend of the
/// capability, reasoning, learning, tool-use, meta-cognition, cost and
/// robustness sub-scores
pub overall_score: f64,
/// Individual capability scores
pub capabilities: CapabilityScores,
/// Reasoning quality metrics
pub reasoning: ReasoningMetrics,
/// Learning efficiency metrics
pub learning: LearningMetrics,
/// Tool use proficiency
pub tool_use: ToolUseMetrics,
/// Meta-cognitive indicators
pub meta_cognition: MetaCognitiveMetrics,
/// Cost efficiency metrics
pub cost: CostMetrics,
/// Robustness under noise
pub robustness: RobustnessMetrics,
/// Raw performance data the assessment was computed from
pub raw_data: RawMetrics,
}
/// Capability scores across six dimensions.
///
/// The calculator in this file fills each field with a score on a 0-100
/// scale. The fields are declared in the same order in which
/// `weighted_average` pairs them with its weights.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct CapabilityScores {
/// Temporal reasoning (date inference, calendar math); the calculator
/// currently derives this from overall task accuracy
pub temporal_reasoning: f64,
/// Constraint satisfaction (multi-constraint solving); the calculator
/// currently derives this from overall task accuracy
pub constraint_satisfaction: f64,
/// Information retrieval (semantic search, recall); the calculator
/// currently derives this from the average number of steps per task
pub information_retrieval: f64,
/// Pattern recognition (learning from examples); the calculator uses the
/// mean of the per-difficulty accuracies
pub pattern_recognition: f64,
/// Planning and sequencing; the calculator derives this from the average
/// number of tool calls per task
pub planning: f64,
/// Error recovery and adaptation; the calculator derives this from the
/// accuracy change between the first and second half of the episodes
pub adaptation: f64,
}
impl Default for CapabilityScores {
    /// Returns a score set in which every capability is `0.0`.
    fn default() -> Self {
        // One zero per field, bound by name so the struct literal stays terse.
        let [temporal_reasoning, constraint_satisfaction, information_retrieval, pattern_recognition, planning, adaptation] =
            [0.0_f64; 6];
        Self {
            temporal_reasoning,
            constraint_satisfaction,
            information_retrieval,
            pattern_recognition,
            planning,
            adaptation,
        }
    }
}
impl CapabilityScores {
    /// Computes the weighted mean of the six capability scores.
    ///
    /// `weights` are applied in field-declaration order: temporal reasoning,
    /// constraint satisfaction, information retrieval, pattern recognition,
    /// planning, adaptation. Returns `0.0` when the weights sum to zero.
    pub fn weighted_average(&self, weights: &[f64; 6]) -> f64 {
        let total_weight = weights.iter().sum::<f64>();
        if total_weight == 0.0 {
            // Nothing to normalise by; avoid dividing by zero.
            return 0.0;
        }

        let ordered_scores = [
            self.temporal_reasoning,
            self.constraint_satisfaction,
            self.information_retrieval,
            self.pattern_recognition,
            self.planning,
            self.adaptation,
        ];
        let weighted_sum = ordered_scores
            .iter()
            .zip(weights.iter())
            .map(|(score, weight)| score * weight)
            .sum::<f64>();

        weighted_sum / total_weight
    }
}
/// Reasoning quality metrics.
///
/// The calculator in this file derives all of these from the task-level
/// counters in `RawMetrics`; each is a fraction rather than a 0-100 score.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ReasoningMetrics {
/// Logical coherence (steps follow logically); computed as the correct
/// rate divided by the completion rate
pub logical_coherence: f64,
/// Constraint satisfaction rate: correct tasks / attempted tasks
pub constraint_satisfaction_rate: f64,
/// Solution optimality (vs. best possible); currently a copy of
/// `constraint_satisfaction_rate`
pub solution_optimality: f64,
/// Reasoning efficiency (steps to solution): fewer average steps per task
/// gives a value closer to 1.0
pub reasoning_efficiency: f64,
/// Error rate in logical steps; computed as
/// `1.0 - constraint_satisfaction_rate`
pub error_rate: f64,
}
impl Default for ReasoningMetrics {
    /// Produces the empty baseline: every reasoning metric starts at `0.0`.
    fn default() -> Self {
        const UNMEASURED: f64 = 0.0;
        Self {
            logical_coherence: UNMEASURED,
            constraint_satisfaction_rate: UNMEASURED,
            solution_optimality: UNMEASURED,
            reasoning_efficiency: UNMEASURED,
            error_rate: UNMEASURED,
        }
    }
}
/// Learning efficiency metrics.
///
/// The calculator in this file leaves every field at its default when no
/// episodes were recorded.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct LearningMetrics {
/// Sample efficiency (performance vs. examples seen); computed as the mean
/// accuracy across all episodes
pub sample_efficiency: f64,
/// Regret trajectory (sublinear indicator); only computed when at least
/// five episodes exist, `0.0` means no evidence of decreasing regret
pub regret_sublinearity: f64,
/// Transfer learning capability; not set by the calculator in this file,
/// so it keeps its default value
pub transfer_capability: f64,
/// Learning rate (improvement per episode); computed as
/// `(last accuracy - first accuracy + 1.0) / 2.0`, so `0.5` means no change
pub learning_rate: f64,
/// Generalization ability; `1.0` minus the standard deviation of the
/// per-difficulty accuracies, floored at `0.0`
pub generalization: f64,
}
impl Default for LearningMetrics {
    /// Returns learning metrics with nothing learned yet: all fields `0.0`.
    fn default() -> Self {
        let zero = f64::default();
        Self {
            sample_efficiency: zero,
            regret_sublinearity: zero,
            transfer_capability: zero,
            learning_rate: zero,
            generalization: zero,
        }
    }
}
/// Tool use proficiency metrics.
///
/// The calculator in this file derives these from overall accuracy and the
/// average number of tool calls per attempted task.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ToolUseMetrics {
/// Tool selection appropriateness; accuracy divided by the average tool
/// calls per task (that divisor is capped at 2.0), result capped at 1.0,
/// or `0.5` when no tools were used
pub selection_appropriateness: f64,
/// Tool utilization effectiveness; currently equal to overall accuracy
pub utilization_effectiveness: f64,
/// Tool composition (combining tools); average tool calls per task,
/// capped at 1.0
pub composition_ability: f64,
/// Tool discovery (finding needed tools); currently equal to overall
/// accuracy
pub discovery_ability: f64,
}
impl Default for ToolUseMetrics {
    /// Returns tool-use metrics with every proficiency at `0.0`.
    fn default() -> Self {
        let [selection_appropriateness, utilization_effectiveness, composition_ability, discovery_ability] =
            [0.0_f64; 4];
        Self {
            selection_appropriateness,
            utilization_effectiveness,
            composition_ability,
            discovery_ability,
        }
    }
}
/// Meta-cognitive metrics.
///
/// All fields are fractions; `0.5` is used by the calculator in this file as
/// the neutral value when there is too little data.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct MetaCognitiveMetrics {
/// Self-correction rate; `1.0` when every completed task was correct,
/// `0.0` when any completed task was wrong, `0.5` when nothing completed
pub self_correction_rate: f64,
/// Uncertainty calibration (confidence vs. accuracy); the calculator
/// currently stores a fixed `0.5` because no confidence scores are recorded
pub uncertainty_calibration: f64,
/// Strategy adaptation; the fraction of consecutive episode pairs in which
/// accuracy increased (needs at least three episodes)
pub strategy_adaptation: f64,
/// Progress monitoring accuracy; currently a copy of `strategy_adaptation`
pub progress_monitoring: f64,
}
impl Default for MetaCognitiveMetrics {
    /// Returns meta-cognitive metrics with every indicator at `0.0`.
    fn default() -> Self {
        const NOT_OBSERVED: f64 = 0.0;
        Self {
            self_correction_rate: NOT_OBSERVED,
            uncertainty_calibration: NOT_OBSERVED,
            strategy_adaptation: NOT_OBSERVED,
            progress_monitoring: NOT_OBSERVED,
        }
    }
}
/// Cost efficiency metrics — first-class IQ dimension.
///
/// `cost_efficiency` is the only field that feeds the overall score; the
/// other fields are reported alongside it.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct CostMetrics {
/// Steps per correct solve (lower = better)
pub steps_per_solve: f64,
/// Tool calls per correct solve (lower = better)
pub tools_per_solve: f64,
/// Cost efficiency score (0-1, higher = cheaper); `1.0` at five or fewer
/// steps per solve, falling linearly to `0.0` at 100 or more
pub cost_efficiency: f64,
/// Cost trend over episodes (positive = improving); relative change in
/// mean accuracy from the first half of the episodes to the second
pub cost_trend: f64,
}
impl Default for CostMetrics {
    /// Returns the pessimistic starting point: expensive solves, zero
    /// efficiency and a flat trend.
    fn default() -> Self {
        let steps_per_solve = 100.0;
        let tools_per_solve = 10.0;
        Self {
            steps_per_solve,
            tools_per_solve,
            cost_efficiency: 0.0,
            cost_trend: 0.0,
        }
    }
}
/// Robustness under adversarial conditions — first-class IQ dimension.
///
/// `robustness_score` is the field that feeds the overall score; it is
/// composed from the other three.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct RobustnessMetrics {
/// Accuracy on noise-injected tasks; the calculator uses `0.5` when no
/// noise tasks were attempted
pub noise_accuracy: f64,
/// Accuracy drop from clean to noisy (lower = more robust); never negative
pub noise_degradation: f64,
/// Per-episode accuracy consistency (higher = steadier); `1.0` minus the
/// standard deviation of episode accuracy, floored at `0.0`
pub consistency: f64,
/// Composite robustness score (0-1): 40% noise accuracy, 30% lack of
/// degradation, 30% consistency
pub robustness_score: f64,
}
impl Default for RobustnessMetrics {
    /// Returns the worst-case starting point: no noisy accuracy, full
    /// degradation, no consistency and a zero composite score.
    fn default() -> Self {
        let noise_degradation = 1.0;
        let nothing = 0.0;
        Self {
            noise_accuracy: nothing,
            noise_degradation,
            consistency: nothing,
            robustness_score: nothing,
        }
    }
}
/// Raw metrics from benchmarks.
///
/// This is the sole input of `IntelligenceCalculator::calculate`; every
/// derived metric is computed from these counters and per-episode records.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct RawMetrics {
/// Total tasks attempted; the robustness calculation subtracts
/// `noise_tasks_attempted` from this to obtain the clean-task count, i.e.
/// it treats this total as including noise-injected tasks
pub tasks_attempted: usize,
/// Tasks completed successfully
pub tasks_completed: usize,
/// Tasks with correct solutions; treated as including noise-injected
/// tasks in the same way as `tasks_attempted`
pub tasks_correct: usize,
/// Total steps taken
pub total_steps: usize,
/// Total tool calls
pub total_tool_calls: usize,
/// Total latency in ms
pub total_latency_ms: u64,
/// Performance by difficulty, keyed by difficulty level
pub by_difficulty: HashMap<u8, DifficultyStats>,
/// Episode-level metrics, read by the calculator in stored order
pub episodes: Vec<EpisodeMetrics>,
/// Tasks attempted under noise injection
pub noise_tasks_attempted: usize,
/// Tasks correct under noise injection
pub noise_tasks_correct: usize,
/// Policy violations (contradictions, budget overruns)
pub policy_violations: usize,
/// Solved-but-incorrect count (contradiction rate numerator)
pub contradictions: usize,
/// Successful rollbacks from noisy to clean
pub rollback_successes: usize,
/// Attempted rollbacks from noisy to clean
pub rollback_attempts: usize,
}
impl Default for RawMetrics {
    /// Returns an empty record: all counters zero, no per-difficulty stats
    /// and no episodes.
    fn default() -> Self {
        const NO_COUNT: usize = 0;
        Self {
            // Task-level counters.
            tasks_attempted: NO_COUNT,
            tasks_completed: NO_COUNT,
            tasks_correct: NO_COUNT,
            total_steps: NO_COUNT,
            total_tool_calls: NO_COUNT,
            total_latency_ms: 0,
            // Breakdown collections start empty.
            by_difficulty: HashMap::new(),
            episodes: Vec::new(),
            // Noise, policy and rollback counters.
            noise_tasks_attempted: NO_COUNT,
            noise_tasks_correct: NO_COUNT,
            policy_violations: NO_COUNT,
            contradictions: NO_COUNT,
            rollback_successes: NO_COUNT,
            rollback_attempts: NO_COUNT,
        }
    }
}
/// Stats per difficulty level.
///
/// Stored in `RawMetrics::by_difficulty`, keyed by the difficulty level.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct DifficultyStats {
/// Tasks attempted at this level; levels with zero attempts are skipped
/// when per-difficulty accuracy is computed
pub attempted: usize,
/// Tasks completed at this level
pub completed: usize,
/// Tasks solved correctly at this level (accuracy numerator)
pub correct: usize,
/// Presumably the average number of steps per task at this level —
/// NOTE(review): confirm with the code that populates it
pub avg_steps: f64,
}
/// Per-episode metrics.
///
/// The calculator reads episodes in the order they are stored in
/// `RawMetrics::episodes` and treats that order as chronological.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct EpisodeMetrics {
/// Episode number (the tests in this file number episodes from 1)
pub episode: usize,
/// Accuracy achieved in this episode; the calculator treats it as a
/// fraction (e.g. an improvement of 0.2 maps to the top adaptation score)
pub accuracy: f64,
/// Reward obtained in this episode
pub reward: f64,
/// Regret incurred in this episode
pub regret: f64,
/// Running total of regret up to and including this episode
pub cumulative_regret: f64,
}
/// Intelligence metrics calculator.
///
/// Turns a `RawMetrics` record into an `IntelligenceAssessment` via
/// `calculate`.
pub struct IntelligenceCalculator {
/// Weights for capability scoring, in `CapabilityScores` field order
/// (temporal reasoning, constraint satisfaction, information retrieval,
/// pattern recognition, planning, adaptation)
pub capability_weights: [f64; 6],
/// Baseline for comparison.
/// NOTE(review): none of the calculations shown here read this field —
/// confirm whether it is used elsewhere
pub baseline_accuracy: f64,
/// Oracle performance for regret calculation.
/// NOTE(review): none of the calculations shown here read this field —
/// confirm whether it is used elsewhere
pub oracle_reward: f64,
}
impl Default for IntelligenceCalculator {
    /// Returns a calculator with equal capability weights, a 0.5 baseline
    /// accuracy and an oracle reward of 100.
    fn default() -> Self {
        // Every capability counts the same unless the caller overrides it.
        let capability_weights = [1.0; 6];
        Self {
            capability_weights,
            baseline_accuracy: 0.5,
            oracle_reward: 100.0,
        }
    }
}
impl IntelligenceCalculator {
/// Calculate intelligence assessment from raw metrics
pub fn calculate(&self, raw: &RawMetrics) -> IntelligenceAssessment {
let capabilities = self.calculate_capabilities(raw);
let reasoning = self.calculate_reasoning(raw);
let learning = self.calculate_learning(raw);
let tool_use = self.calculate_tool_use(raw);
let meta_cognition = self.calculate_meta_cognition(raw);
let cost = self.calculate_cost(raw);
let robustness = self.calculate_robustness(raw);
// Overall score: three equal pillars — graded outcomes, cost, robustness
let overall_score = self.calculate_overall_score(
&capabilities,
&reasoning,
&learning,
&tool_use,
&meta_cognition,
&cost,
&robustness,
);
IntelligenceAssessment {
overall_score,
capabilities,
reasoning,
learning,
tool_use,
meta_cognition,
cost,
robustness,
raw_data: raw.clone(),
}
}
fn calculate_capabilities(&self, raw: &RawMetrics) -> CapabilityScores {
let base_accuracy = if raw.tasks_attempted > 0 {
raw.tasks_correct as f64 / raw.tasks_attempted as f64
} else {
0.0
};
// Temporal reasoning: accuracy on time-based tasks
let temporal_reasoning = base_accuracy * 100.0;
// Constraint satisfaction: correct solutions
let constraint_satisfaction = base_accuracy * 100.0;
// Information retrieval: based on steps to solution
let avg_steps = if raw.tasks_attempted > 0 {
raw.total_steps as f64 / raw.tasks_attempted as f64
} else {
100.0
};
let information_retrieval = (100.0 - avg_steps).max(0.0).min(100.0);
// Pattern recognition: performance improvement across difficulties
let pattern_recognition = self.calculate_pattern_recognition(raw);
// Planning: efficiency of tool use
let avg_tools = if raw.tasks_attempted > 0 {
raw.total_tool_calls as f64 / raw.tasks_attempted as f64
} else {
0.0
};
let planning = if avg_tools > 0.0 && avg_tools <= 2.0 {
100.0 * (1.0 - (avg_tools - 1.0).abs() / 2.0)
} else {
50.0
};
// Adaptation: improvement over episodes
let adaptation = self.calculate_adaptation(raw);
CapabilityScores {
temporal_reasoning,
constraint_satisfaction,
information_retrieval,
pattern_recognition,
planning,
adaptation,
}
}
fn calculate_pattern_recognition(&self, raw: &RawMetrics) -> f64 {
if raw.by_difficulty.len() < 2 {
return 50.0;
}
// Check if harder problems are still solvable
let mut difficulties: Vec<_> = raw.by_difficulty.keys().copied().collect();
difficulties.sort();
let mut scores = Vec::new();
for d in &difficulties {
if let Some(stats) = raw.by_difficulty.get(d) {
if stats.attempted > 0 {
scores.push(stats.correct as f64 / stats.attempted as f64);
}
}
}
if scores.is_empty() {
return 50.0;
}
// Average accuracy across difficulties
let avg: f64 = scores.iter().sum::<f64>() / scores.len() as f64;
avg * 100.0
}
fn calculate_adaptation(&self, raw: &RawMetrics) -> f64 {
if raw.episodes.len() < 3 {
return 50.0;
}
// Check if accuracy improves over episodes
let first_half: f64 = raw.episodes[..raw.episodes.len() / 2]
.iter()
.map(|e| e.accuracy)
.sum::<f64>()
/ (raw.episodes.len() / 2) as f64;
let second_half: f64 = raw.episodes[raw.episodes.len() / 2..]
.iter()
.map(|e| e.accuracy)
.sum::<f64>()
/ (raw.episodes.len() - raw.episodes.len() / 2) as f64;
let improvement = second_half - first_half;
// Scale: -0.2 to +0.2 improvement maps to 0-100
((improvement + 0.2) / 0.4 * 100.0).max(0.0).min(100.0)
}
fn calculate_reasoning(&self, raw: &RawMetrics) -> ReasoningMetrics {
let constraint_satisfaction_rate = if raw.tasks_attempted > 0 {
raw.tasks_correct as f64 / raw.tasks_attempted as f64
} else {
0.0
};
let avg_steps = if raw.tasks_attempted > 0 {
raw.total_steps as f64 / raw.tasks_attempted as f64
} else {
100.0
};
// Reasoning efficiency: inverse of steps (normalized)
let reasoning_efficiency = (100.0 - avg_steps).max(0.0).min(100.0) / 100.0;
// Logical coherence: based on completion rate vs correct rate
let completion_rate = if raw.tasks_attempted > 0 {
raw.tasks_completed as f64 / raw.tasks_attempted as f64
} else {
0.0
};
let logical_coherence = if completion_rate > 0.0 {
constraint_satisfaction_rate / completion_rate
} else {
0.0
};
ReasoningMetrics {
logical_coherence,
constraint_satisfaction_rate,
solution_optimality: constraint_satisfaction_rate,
reasoning_efficiency,
error_rate: 1.0 - constraint_satisfaction_rate,
}
}
fn calculate_learning(&self, raw: &RawMetrics) -> LearningMetrics {
let mut learning = LearningMetrics::default();
if raw.episodes.is_empty() {
return learning;
}
// Sample efficiency: accuracy per episode
learning.sample_efficiency =
raw.episodes.iter().map(|e| e.accuracy).sum::<f64>() / raw.episodes.len() as f64;
// Regret sublinearity: check if cumulative regret grows sublinearly
// True sublinearity means R_k/k → 0 as k → ∞ (regret per episode decreasing)
if raw.episodes.len() >= 5 {
// Calculate regret trend using linear regression
let n = raw.episodes.len() as f64;
let mut sum_x = 0.0;
let mut sum_y = 0.0;
let mut sum_xy = 0.0;
let mut sum_xx = 0.0;
for (i, ep) in raw.episodes.iter().enumerate() {
let x = (i + 1) as f64;
let y = ep.regret;
sum_x += x;
sum_y += y;
sum_xy += x * y;
sum_xx += x * x;
}
let slope = (n * sum_xy - sum_x * sum_y) / (n * sum_xx - sum_x * sum_x);
// Negative slope = decreasing regret = sublinear
// Transform: slope < 0 → sublinearity > 0
if slope < 0.0 {
// Stronger negative slope = better sublinearity (cap at 1.0)
learning.regret_sublinearity = (-slope / 10.0).min(1.0);
}
// Also check cumulative average
let last = raw.episodes.last().unwrap();
let avg_regret = last.cumulative_regret / n;
let first_half_avg = raw
.episodes
.iter()
.take(raw.episodes.len() / 2)
.map(|e| e.regret)
.sum::<f64>()
/ (n / 2.0);
// If second half has lower per-episode regret, that's sublinear
if avg_regret < first_half_avg && learning.regret_sublinearity == 0.0 {
learning.regret_sublinearity =
((first_half_avg - avg_regret) / first_half_avg).max(0.0);
}
}
// Learning rate: improvement in accuracy over episodes
if raw.episodes.len() >= 2 {
let first_acc = raw.episodes[0].accuracy;
let last_acc = raw.episodes.last().unwrap().accuracy;
learning.learning_rate = (last_acc - first_acc + 1.0) / 2.0;
}
// Generalization: consistency across difficulties
if raw.by_difficulty.len() >= 2 {
let accuracies: Vec<f64> = raw
.by_difficulty
.values()
.filter(|s| s.attempted > 0)
.map(|s| s.correct as f64 / s.attempted as f64)
.collect();
if !accuracies.is_empty() {
let mean = accuracies.iter().sum::<f64>() / accuracies.len() as f64;
let variance = accuracies.iter().map(|a| (a - mean).powi(2)).sum::<f64>()
/ accuracies.len() as f64;
let std_dev = variance.sqrt();
// Lower variance = better generalization
learning.generalization = (1.0 - std_dev).max(0.0);
}
}
learning
}
fn calculate_tool_use(&self, raw: &RawMetrics) -> ToolUseMetrics {
let avg_tools = if raw.tasks_attempted > 0 {
raw.total_tool_calls as f64 / raw.tasks_attempted as f64
} else {
0.0
};
// Selection appropriateness: using tools when helpful
let accuracy = if raw.tasks_attempted > 0 {
raw.tasks_correct as f64 / raw.tasks_attempted as f64
} else {
0.0
};
// Effectiveness: accuracy when tools are used
let utilization_effectiveness = accuracy;
// Appropriateness: not overusing tools
let selection_appropriateness = if avg_tools > 0.0 {
(accuracy / avg_tools.min(2.0)).min(1.0)
} else {
0.5
};
ToolUseMetrics {
selection_appropriateness,
utilization_effectiveness,
composition_ability: avg_tools.min(1.0), // Using multiple tools
discovery_ability: accuracy, // Finding solutions
}
}
fn calculate_meta_cognition(&self, raw: &RawMetrics) -> MetaCognitiveMetrics {
// Self-correction: completed but not correct -> corrected
let completed_but_wrong = raw.tasks_completed.saturating_sub(raw.tasks_correct);
let self_correction_rate = if completed_but_wrong > 0 {
0.0 // No self-correction if still wrong
} else if raw.tasks_completed > 0 {
1.0 // All completed are correct
} else {
0.5
};
// Strategy adaptation: improvement over episodes
let strategy_adaptation = if raw.episodes.len() >= 3 {
let trend: f64 = raw
.episodes
.windows(2)
.map(|w| {
if w[1].accuracy > w[0].accuracy {
1.0
} else {
0.0
}
})
.sum::<f64>();
trend / (raw.episodes.len() - 1) as f64
} else {
0.5
};
MetaCognitiveMetrics {
self_correction_rate,
uncertainty_calibration: 0.5, // Would need confidence scores
strategy_adaptation,
progress_monitoring: strategy_adaptation, // Similar metric
}
}
fn calculate_cost(&self, raw: &RawMetrics) -> CostMetrics {
let steps_per_solve = if raw.tasks_correct > 0 {
raw.total_steps as f64 / raw.tasks_correct as f64
} else if raw.tasks_attempted > 0 {
raw.total_steps as f64
} else {
100.0
};
let tools_per_solve = if raw.tasks_correct > 0 {
raw.total_tool_calls as f64 / raw.tasks_correct as f64
} else {
10.0
};
// Efficiency: 1.0 at <=5 steps/solve, 0.0 at >=100 steps/solve
let cost_efficiency = (1.0 - (steps_per_solve - 5.0) / 95.0).clamp(0.0, 1.0);
// Cost trend: compare early vs late episode accuracy per step
let cost_trend = if raw.episodes.len() >= 4 {
let half = raw.episodes.len() / 2;
let early_acc: f64 =
raw.episodes[..half].iter().map(|e| e.accuracy).sum::<f64>() / half as f64;
let late_acc: f64 = raw.episodes[half..].iter().map(|e| e.accuracy).sum::<f64>()
/ (raw.episodes.len() - half) as f64;
// If accuracy improves, effective cost per solve drops
if early_acc > 0.01 {
(late_acc - early_acc) / early_acc
} else {
0.0
}
} else {
0.0
};
CostMetrics {
steps_per_solve,
tools_per_solve,
cost_efficiency,
cost_trend,
}
}
fn calculate_robustness(&self, raw: &RawMetrics) -> RobustnessMetrics {
let noise_accuracy = if raw.noise_tasks_attempted > 0 {
raw.noise_tasks_correct as f64 / raw.noise_tasks_attempted as f64
} else {
0.5 // no noise data -> neutral prior
};
let clean_attempted = raw
.tasks_attempted
.saturating_sub(raw.noise_tasks_attempted);
let clean_correct = raw.tasks_correct.saturating_sub(raw.noise_tasks_correct);
let clean_accuracy = if clean_attempted > 0 {
clean_correct as f64 / clean_attempted as f64
} else {
0.0
};
let noise_degradation = (clean_accuracy - noise_accuracy).max(0.0);
let consistency = if raw.episodes.len() >= 2 {
let mean =
raw.episodes.iter().map(|e| e.accuracy).sum::<f64>() / raw.episodes.len() as f64;
let variance = raw
.episodes
.iter()
.map(|e| (e.accuracy - mean).powi(2))
.sum::<f64>()
/ raw.episodes.len() as f64;
(1.0 - variance.sqrt()).max(0.0)
} else {
0.5
};
let robustness_score =
noise_accuracy * 0.4 + (1.0 - noise_degradation.min(1.0)) * 0.3 + consistency * 0.3;
RobustnessMetrics {
noise_accuracy,
noise_degradation,
consistency,
robustness_score,
}
}
fn calculate_overall_score(
&self,
capabilities: &CapabilityScores,
reasoning: &ReasoningMetrics,
learning: &LearningMetrics,
tool_use: &ToolUseMetrics,
meta_cognition: &MetaCognitiveMetrics,
cost: &CostMetrics,
robustness: &RobustnessMetrics,
) -> f64 {
// Sub-scores (0-100 scale)
let cap_score = capabilities.weighted_average(&self.capability_weights);
let reasoning_score = (reasoning.logical_coherence
+ reasoning.constraint_satisfaction_rate
+ reasoning.solution_optimality
+ reasoning.reasoning_efficiency)
/ 4.0
* 100.0;
let learning_score = (learning.sample_efficiency
+ learning.regret_sublinearity
+ learning.learning_rate
+ learning.generalization)
/ 4.0
* 100.0;
let tool_score = (tool_use.selection_appropriateness
+ tool_use.utilization_effectiveness
+ tool_use.composition_ability
+ tool_use.discovery_ability)
/ 4.0
* 100.0;
let meta_score = (meta_cognition.self_correction_rate
+ meta_cognition.strategy_adaptation
+ meta_cognition.progress_monitoring)
/ 3.0
* 100.0;
let cost_score = cost.cost_efficiency * 100.0;
let robustness_score = robustness.robustness_score * 100.0;
// Three equal pillars: graded outcomes (~0.34), cost (~0.33), robustness (~0.33)
// Graded outcomes = capabilities + reasoning + learning + tool + meta
cap_score * 0.12
+ reasoning_score * 0.10
+ learning_score * 0.06
+ tool_score * 0.03
+ meta_score * 0.03
+ cost_score * 0.33
+ robustness_score * 0.33
}
}
/// Prints a formatted intelligence report to standard output.
///
/// The report shows the overall score followed by one section per metric
/// group: capabilities, reasoning, learning, tool use, meta-cognition, cost
/// and robustness. Cost and robustness are included because each contributes
/// about a third of the overall score.
pub fn print_intelligence_report(assessment: &IntelligenceAssessment) {
    let caps = &assessment.capabilities;
    let reasoning = &assessment.reasoning;
    let learning = &assessment.learning;
    let tools = &assessment.tool_use;
    let meta = &assessment.meta_cognition;
    let cost = &assessment.cost;
    let robustness = &assessment.robustness;

    println!("╔══════════════════════════════════════════════════════════════╗");
    println!("║ Intelligence Assessment Report ║");
    println!("╚══════════════════════════════════════════════════════════════╝");
    println!();
    println!(
        "🧠 Overall Intelligence Score: {:.1}/100",
        assessment.overall_score
    );
    println!();

    // Capability scores are on a 0-100 scale, hence the wider format.
    println!("📊 Capability Scores:");
    println!(" Temporal Reasoning: {:5.1}", caps.temporal_reasoning);
    println!(" Constraint Satisfaction:{:5.1}", caps.constraint_satisfaction);
    println!(" Information Retrieval: {:5.1}", caps.information_retrieval);
    println!(" Pattern Recognition: {:5.1}", caps.pattern_recognition);
    println!(" Planning: {:5.1}", caps.planning);
    println!(" Adaptation: {:5.1}", caps.adaptation);
    println!();

    println!("🔍 Reasoning Quality:");
    println!(" Logical Coherence: {:.2}", reasoning.logical_coherence);
    println!(
        " Constraint Satisfaction:{:.2}",
        reasoning.constraint_satisfaction_rate
    );
    println!(" Solution Optimality: {:.2}", reasoning.solution_optimality);
    println!(" Reasoning Efficiency: {:.2}", reasoning.reasoning_efficiency);
    println!(" Error Rate: {:.2}", reasoning.error_rate);
    println!();

    println!("📈 Learning Metrics:");
    println!(" Sample Efficiency: {:.2}", learning.sample_efficiency);
    println!(" Regret Sublinearity: {:.2}", learning.regret_sublinearity);
    println!(" Learning Rate: {:.2}", learning.learning_rate);
    println!(" Generalization: {:.2}", learning.generalization);
    println!();

    println!("🔧 Tool Use Proficiency:");
    println!(" Selection: {:.2}", tools.selection_appropriateness);
    println!(" Effectiveness: {:.2}", tools.utilization_effectiveness);
    println!(" Composition: {:.2}", tools.composition_ability);
    println!(" Discovery: {:.2}", tools.discovery_ability);
    println!();

    println!("🪞 Meta-Cognitive Indicators:");
    println!(" Self-Correction: {:.2}", meta.self_correction_rate);
    println!(" Strategy Adaptation: {:.2}", meta.strategy_adaptation);
    println!(" Progress Monitoring: {:.2}", meta.progress_monitoring);
    println!(" Uncertainty Calibration:{:.2}", meta.uncertainty_calibration);
    println!();

    println!("💰 Cost Efficiency:");
    println!(" Steps per Solve: {:.2}", cost.steps_per_solve);
    println!(" Tools per Solve: {:.2}", cost.tools_per_solve);
    println!(" Cost Efficiency: {:.2}", cost.cost_efficiency);
    println!(" Cost Trend: {:.2}", cost.cost_trend);
    println!();

    println!("🛡 Robustness:");
    println!(" Noise Accuracy: {:.2}", robustness.noise_accuracy);
    println!(" Noise Degradation: {:.2}", robustness.noise_degradation);
    println!(" Consistency: {:.2}", robustness.consistency);
    println!(" Robustness Score: {:.2}", robustness.robustness_score);
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: a plausible set of counters yields positive scores.
    #[test]
    fn test_intelligence_calculation() {
        let raw = RawMetrics {
            tasks_attempted: 100,
            tasks_completed: 90,
            tasks_correct: 80,
            total_steps: 500,
            total_tool_calls: 100,
            ..RawMetrics::default()
        };

        let assessment = IntelligenceCalculator::default().calculate(&raw);

        assert!(assessment.overall_score > 0.0);
        assert!(assessment.capabilities.temporal_reasoning > 0.0);
    }

    /// Steadily improving episodes must register as a learning rate above
    /// the neutral 0.5.
    #[test]
    fn test_learning_metrics() {
        // Ten episodes: accuracy and reward climb while regret falls.
        let episodes = (0..10)
            .map(|i| EpisodeMetrics {
                episode: i + 1,
                accuracy: 0.5 + 0.04 * i as f64,
                reward: 50.0 + 4.0 * i as f64,
                regret: 50.0 - 4.0 * i as f64,
                cumulative_regret: (0..=i).map(|j| 50.0 - 4.0 * j as f64).sum(),
            })
            .collect();
        let raw = RawMetrics {
            tasks_attempted: 50,
            tasks_correct: 40,
            episodes,
            ..RawMetrics::default()
        };

        let assessment = IntelligenceCalculator::default().calculate(&raw);

        // Should show learning (improvement over time)
        assert!(assessment.learning.learning_rate > 0.5);
    }
}