Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'

This commit is contained in:
ruv
2026-02-28 14:39:40 -05:00
7854 changed files with 3522914 additions and 0 deletions

View File

@@ -0,0 +1,825 @@
//! Problem Detection for Self-Healing Engine
//!
//! Implements continuous monitoring and problem classification:
//! - IndexDegradation: Index performance has degraded
//! - ReplicaLag: Replica is falling behind primary
//! - StorageExhaustion: Storage space is running low
//! - QueryTimeout: Queries are timing out excessively
//! - IntegrityViolation: Graph integrity has been compromised
use std::collections::HashMap;
use std::sync::atomic::{AtomicU64, Ordering};
use std::time::SystemTime;
use parking_lot::RwLock;
use serde::{Deserialize, Serialize};
// ============================================================================
// Problem Types
// ============================================================================
/// Types of problems that can be detected.
///
/// The snake_case form used in JSON payloads is produced by the `Display`
/// impl and accepted back by `FromStr`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum ProblemType {
    /// Index performance has degraded (fragmentation, poor connectivity)
    IndexDegradation,
    /// Replica is lagging behind primary
    ReplicaLag,
    /// Storage space is running low
    StorageExhaustion,
    /// Queries are timing out excessively
    QueryTimeout,
    /// Graph integrity has been violated (mincut below threshold)
    IntegrityViolation,
    /// Memory pressure is high
    MemoryPressure,
    /// Connection pool exhaustion
    ConnectionExhaustion,
    /// Hot partition detected (uneven load distribution)
    HotPartition,
}
impl ProblemType {
/// Get human-readable description
pub fn description(&self) -> &'static str {
match self {
ProblemType::IndexDegradation => "Index performance degradation detected",
ProblemType::ReplicaLag => "Replica lag exceeds threshold",
ProblemType::StorageExhaustion => "Storage space running low",
ProblemType::QueryTimeout => "Excessive query timeouts",
ProblemType::IntegrityViolation => "Graph integrity violation",
ProblemType::MemoryPressure => "Memory pressure detected",
ProblemType::ConnectionExhaustion => "Connection pool exhausted",
ProblemType::HotPartition => "Hot partition detected",
}
}
/// Get all problem types
pub fn all() -> Vec<ProblemType> {
vec![
ProblemType::IndexDegradation,
ProblemType::ReplicaLag,
ProblemType::StorageExhaustion,
ProblemType::QueryTimeout,
ProblemType::IntegrityViolation,
ProblemType::MemoryPressure,
ProblemType::ConnectionExhaustion,
ProblemType::HotPartition,
]
}
}
impl std::fmt::Display for ProblemType {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
ProblemType::IndexDegradation => write!(f, "index_degradation"),
ProblemType::ReplicaLag => write!(f, "replica_lag"),
ProblemType::StorageExhaustion => write!(f, "storage_exhaustion"),
ProblemType::QueryTimeout => write!(f, "query_timeout"),
ProblemType::IntegrityViolation => write!(f, "integrity_violation"),
ProblemType::MemoryPressure => write!(f, "memory_pressure"),
ProblemType::ConnectionExhaustion => write!(f, "connection_exhaustion"),
ProblemType::HotPartition => write!(f, "hot_partition"),
}
}
}
impl std::str::FromStr for ProblemType {
type Err = String;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s.to_lowercase().as_str() {
"index_degradation" | "indexdegradation" => Ok(ProblemType::IndexDegradation),
"replica_lag" | "replicalag" => Ok(ProblemType::ReplicaLag),
"storage_exhaustion" | "storageexhaustion" => Ok(ProblemType::StorageExhaustion),
"query_timeout" | "querytimeout" => Ok(ProblemType::QueryTimeout),
"integrity_violation" | "integrityviolation" => Ok(ProblemType::IntegrityViolation),
"memory_pressure" | "memorypressure" => Ok(ProblemType::MemoryPressure),
"connection_exhaustion" | "connectionexhaustion" => {
Ok(ProblemType::ConnectionExhaustion)
}
"hot_partition" | "hotpartition" => Ok(ProblemType::HotPartition),
_ => Err(format!("Unknown problem type: {}", s)),
}
}
}
// ============================================================================
// Severity Levels
// ============================================================================
/// Problem severity levels.
///
/// The derived `PartialOrd`/`Ord` follow declaration order, so
/// `Info < Low < Medium < High < Critical`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
pub enum Severity {
    /// Informational, no action required
    Info,
    /// Low severity, can be addressed during maintenance
    Low,
    /// Medium severity, should be addressed soon
    Medium,
    /// High severity, requires prompt attention
    High,
    /// Critical severity, immediate action required
    Critical,
}
impl Severity {
/// Get numeric value for comparison
pub fn value(&self) -> u8 {
match self {
Severity::Info => 0,
Severity::Low => 1,
Severity::Medium => 2,
Severity::High => 3,
Severity::Critical => 4,
}
}
}
// ============================================================================
// Problem Definition
// ============================================================================
/// A detected problem with full context.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Problem {
    /// Type of problem
    pub problem_type: ProblemType,
    /// Severity level
    pub severity: Severity,
    /// When the problem was detected (serialized as unix seconds via
    /// `system_time_serde`)
    #[serde(with = "system_time_serde")]
    pub detected_at: SystemTime,
    /// Additional free-form details about the problem (shape varies per
    /// problem type; see `ProblemDetector::detect_problems`)
    pub details: serde_json::Value,
    /// Affected partition IDs (if applicable; empty when not partition-scoped)
    pub affected_partitions: Vec<i64>,
}
impl Problem {
    /// Create a new problem of the given type and severity, timestamped now,
    /// with empty details and no affected partitions.
    pub fn new(problem_type: ProblemType, severity: Severity) -> Self {
        Self {
            problem_type,
            severity,
            detected_at: SystemTime::now(),
            details: serde_json::json!({}),
            affected_partitions: vec![],
        }
    }

    /// Builder-style setter for the details payload.
    pub fn with_details(mut self, details: serde_json::Value) -> Self {
        self.details = details;
        self
    }

    /// Builder-style setter for the affected partition IDs.
    pub fn with_partitions(mut self, partitions: Vec<i64>) -> Self {
        self.affected_partitions = partitions;
        self
    }

    /// Convert to a JSON object with `detected_at` as unix seconds.
    pub fn to_json(&self) -> serde_json::Value {
        let detected_ts = self
            .detected_at
            .duration_since(std::time::UNIX_EPOCH)
            // A clock set before the epoch previously panicked here; fall
            // back to 0 seconds instead.
            .unwrap_or_default()
            .as_secs();
        serde_json::json!({
            "problem_type": self.problem_type.to_string(),
            "severity": format!("{:?}", self.severity).to_lowercase(),
            "detected_at": detected_ts,
            "details": self.details,
            "affected_partitions": self.affected_partitions,
        })
    }
}
// Custom serde for SystemTime
mod system_time_serde {
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use std::time::{Duration, SystemTime, UNIX_EPOCH};
pub fn serialize<S>(time: &SystemTime, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
let duration = time.duration_since(UNIX_EPOCH).unwrap();
duration.as_secs().serialize(serializer)
}
pub fn deserialize<'de, D>(deserializer: D) -> Result<SystemTime, D::Error>
where
D: Deserializer<'de>,
{
let secs = u64::deserialize(deserializer)?;
Ok(UNIX_EPOCH + Duration::from_secs(secs))
}
}
// ============================================================================
// Detection Thresholds
// ============================================================================
/// Configurable thresholds for problem detection.
///
/// A metric crossing its threshold causes `ProblemDetector::detect_problems`
/// to emit a `Problem`; severity escalation points are hard-coded in the
/// detector itself.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DetectionThresholds {
    /// Index fragmentation percentage threshold (0-100)
    pub index_fragmentation_pct: f32,
    /// Replica lag in seconds threshold
    pub replica_lag_seconds: f32,
    /// Storage usage percentage threshold (0-100)
    pub storage_usage_pct: f32,
    /// Query timeout rate threshold (0-1)
    pub query_timeout_rate: f32,
    /// Minimum lambda (mincut) value for integrity
    pub min_integrity_lambda: f32,
    /// Memory usage percentage threshold (0-100)
    pub memory_usage_pct: f32,
    /// Connection pool usage percentage threshold (0-100)
    pub connection_usage_pct: f32,
    /// Partition load ratio threshold (vs average load across partitions)
    pub partition_load_ratio: f32,
}
impl Default for DetectionThresholds {
fn default() -> Self {
Self {
index_fragmentation_pct: 30.0,
replica_lag_seconds: 5.0,
storage_usage_pct: 85.0,
query_timeout_rate: 0.05, // 5% timeout rate
min_integrity_lambda: 0.5,
memory_usage_pct: 85.0,
connection_usage_pct: 90.0,
partition_load_ratio: 3.0, // 3x average load
}
}
}
// ============================================================================
// System Metrics
// ============================================================================
/// System metrics collected for problem detection.
///
/// All percentage fields are 0-100; rates are 0-1. `Default` yields an
/// all-healthy snapshot (note: a defaulted `integrity_lambda` of 0.0 is
/// treated as "not collected" by the detector, not as a violation).
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct SystemMetrics {
    /// Index fragmentation percentage per index (keyed by index name)
    pub index_fragmentation: HashMap<String, f32>,
    /// Replica lag in seconds per replica (keyed by replica id)
    pub replica_lag: HashMap<String, f32>,
    /// Storage usage percentage
    pub storage_usage_pct: f32,
    /// Query timeout rate (0-1)
    pub query_timeout_rate: f32,
    /// Current integrity lambda value (0.0 means "not collected")
    pub integrity_lambda: f32,
    /// Memory usage percentage
    pub memory_usage_pct: f32,
    /// Connection pool usage percentage
    pub connection_usage_pct: f32,
    /// Load per partition (keyed by partition id)
    pub partition_loads: HashMap<i64, f64>,
    /// Witness edges from mincut computation
    pub witness_edges: Vec<WitnessEdge>,
    /// Maintenance queue depth
    pub maintenance_queue_depth: usize,
    /// Top memory consumers as (name, bytes) pairs
    /// (assumed units — TODO confirm against the producer)
    pub top_memory_consumers: Vec<(String, usize)>,
    /// Fragmented index IDs
    pub fragmented_indexes: Vec<i64>,
    /// Timestamp of metrics collection (unix seconds)
    pub collected_at: u64,
}
impl SystemMetrics {
    /// Create a new, empty (all-healthy) metrics snapshot timestamped now.
    pub fn new() -> Self {
        Self {
            collected_at: SystemTime::now()
                .duration_since(std::time::UNIX_EPOCH)
                // A clock set before the epoch previously panicked here;
                // record 0 instead.
                .unwrap_or_default()
                .as_secs(),
            ..Default::default()
        }
    }

    /// Convert to a JSON summary. Witness edges are reported as a count
    /// only; per-edge detail is omitted.
    pub fn to_json(&self) -> serde_json::Value {
        serde_json::json!({
            "index_fragmentation": self.index_fragmentation,
            "replica_lag": self.replica_lag,
            "storage_usage_pct": self.storage_usage_pct,
            "query_timeout_rate": self.query_timeout_rate,
            "integrity_lambda": self.integrity_lambda,
            "memory_usage_pct": self.memory_usage_pct,
            "connection_usage_pct": self.connection_usage_pct,
            "partition_loads": self.partition_loads,
            "witness_edge_count": self.witness_edges.len(),
            "maintenance_queue_depth": self.maintenance_queue_depth,
            "collected_at": self.collected_at,
        })
    }
}
/// Witness edge from mincut computation.
///
/// A witness edge identifies a graph edge that participates in the minimum
/// cut; see `SystemMetrics::witness_edges`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WitnessEdge {
    /// Source node ID
    pub from: i64,
    /// Target node ID
    pub to: i64,
    /// Edge type (e.g., "partition_link", "replication", "dependency")
    pub edge_type: String,
    /// Edge weight/capacity
    pub weight: f32,
}
// ============================================================================
// Problem Detector
// ============================================================================
/// Problem detector with configurable thresholds.
///
/// Thread-safe: thresholds live behind a `parking_lot::RwLock` and the
/// counters are atomics, so `&self` methods may be called concurrently.
pub struct ProblemDetector {
    /// Detection thresholds (hot-swappable via `update_thresholds`)
    thresholds: RwLock<DetectionThresholds>,
    /// Cumulative number of problems detected
    problems_detected: AtomicU64,
    /// Last detection timestamp (unix seconds; 0 = never)
    last_detection: AtomicU64,
}
impl ProblemDetector {
    /// Create a new problem detector with default thresholds.
    pub fn new() -> Self {
        Self {
            thresholds: RwLock::new(DetectionThresholds::default()),
            problems_detected: AtomicU64::new(0),
            last_detection: AtomicU64::new(0),
        }
    }

    /// Create a detector with caller-supplied thresholds.
    pub fn with_thresholds(thresholds: DetectionThresholds) -> Self {
        Self {
            thresholds: RwLock::new(thresholds),
            problems_detected: AtomicU64::new(0),
            last_detection: AtomicU64::new(0),
        }
    }

    /// Replace the detection thresholds.
    pub fn update_thresholds(&self, thresholds: DetectionThresholds) {
        *self.thresholds.write() = thresholds;
    }

    /// Get a snapshot of the current thresholds.
    pub fn get_thresholds(&self) -> DetectionThresholds {
        self.thresholds.read().clone()
    }

    /// Collect current system metrics.
    ///
    /// In production the private helpers below would query PostgreSQL system
    /// catalogs (pg_stat_user_indexes, pg_stat_replication, pg_tablespace,
    /// pg_stat_statements, pg_stat_activity) and the integrity control plane.
    /// The current implementations return healthy defaults.
    pub fn collect_metrics(&self) -> SystemMetrics {
        let mut metrics = SystemMetrics::new();
        metrics.index_fragmentation = self.collect_index_fragmentation();
        metrics.replica_lag = self.collect_replica_lag();
        metrics.storage_usage_pct = self.collect_storage_usage();
        metrics.query_timeout_rate = self.collect_query_timeout_rate();
        metrics.integrity_lambda = self.collect_integrity_lambda();
        metrics.memory_usage_pct = self.collect_memory_usage();
        metrics.connection_usage_pct = self.collect_connection_usage();
        metrics.partition_loads = self.collect_partition_loads();
        metrics.witness_edges = self.collect_witness_edges();
        metrics
    }

    /// Detect problems from collected metrics.
    ///
    /// Each metric is checked against the configured thresholds; severities
    /// escalate at hard-coded break points per problem type. Updates the
    /// detector's `problems_detected` / `last_detection` statistics as a
    /// side effect.
    pub fn detect_problems(&self, metrics: &SystemMetrics) -> Vec<Problem> {
        let thresholds = self.thresholds.read();
        let mut problems = Vec::new();

        // Check index fragmentation (per index).
        for (index_name, frag_pct) in &metrics.index_fragmentation {
            if *frag_pct > thresholds.index_fragmentation_pct {
                let severity = if *frag_pct > 60.0 {
                    Severity::High
                } else if *frag_pct > 45.0 {
                    Severity::Medium
                } else {
                    Severity::Low
                };
                problems.push(
                    Problem::new(ProblemType::IndexDegradation, severity).with_details(
                        serde_json::json!({
                            "index_name": index_name,
                            "fragmentation_pct": frag_pct,
                            "threshold": thresholds.index_fragmentation_pct,
                        }),
                    ),
                );
            }
        }

        // Check replica lag (per replica).
        for (replica_id, lag_seconds) in &metrics.replica_lag {
            if *lag_seconds > thresholds.replica_lag_seconds {
                let severity = if *lag_seconds > 30.0 {
                    Severity::Critical
                } else if *lag_seconds > 15.0 {
                    Severity::High
                } else if *lag_seconds > 10.0 {
                    Severity::Medium
                } else {
                    Severity::Low
                };
                problems.push(
                    Problem::new(ProblemType::ReplicaLag, severity).with_details(
                        serde_json::json!({
                            "replica_id": replica_id,
                            "lag_seconds": lag_seconds,
                            "threshold": thresholds.replica_lag_seconds,
                        }),
                    ),
                );
            }
        }

        // Check storage usage.
        if metrics.storage_usage_pct > thresholds.storage_usage_pct {
            let severity = if metrics.storage_usage_pct > 95.0 {
                Severity::Critical
            } else if metrics.storage_usage_pct > 90.0 {
                Severity::High
            } else {
                Severity::Medium
            };
            problems.push(
                Problem::new(ProblemType::StorageExhaustion, severity).with_details(
                    serde_json::json!({
                        "usage_pct": metrics.storage_usage_pct,
                        "threshold": thresholds.storage_usage_pct,
                    }),
                ),
            );
        }

        // Check query timeout rate.
        if metrics.query_timeout_rate > thresholds.query_timeout_rate {
            let severity = if metrics.query_timeout_rate > 0.20 {
                Severity::Critical
            } else if metrics.query_timeout_rate > 0.10 {
                Severity::High
            } else {
                Severity::Medium
            };
            problems.push(
                Problem::new(ProblemType::QueryTimeout, severity).with_details(serde_json::json!({
                    "timeout_rate": metrics.query_timeout_rate,
                    "threshold": thresholds.query_timeout_rate,
                })),
            );
        }

        // Check integrity lambda. A lambda of exactly 0.0 means "not
        // collected" (SystemMetrics::default), so it is deliberately excluded.
        if metrics.integrity_lambda < thresholds.min_integrity_lambda
            && metrics.integrity_lambda > 0.0
        {
            let severity = if metrics.integrity_lambda < 0.2 {
                Severity::Critical
            } else if metrics.integrity_lambda < 0.35 {
                Severity::High
            } else {
                Severity::Medium
            };
            problems.push(
                Problem::new(ProblemType::IntegrityViolation, severity).with_details(
                    serde_json::json!({
                        "lambda": metrics.integrity_lambda,
                        "threshold": thresholds.min_integrity_lambda,
                        "witness_edges": metrics.witness_edges.len(),
                    }),
                ),
            );
        }

        // Check memory pressure.
        if metrics.memory_usage_pct > thresholds.memory_usage_pct {
            let severity = if metrics.memory_usage_pct > 95.0 {
                Severity::Critical
            } else if metrics.memory_usage_pct > 90.0 {
                Severity::High
            } else {
                Severity::Medium
            };
            problems.push(
                Problem::new(ProblemType::MemoryPressure, severity).with_details(
                    serde_json::json!({
                        "usage_pct": metrics.memory_usage_pct,
                        "threshold": thresholds.memory_usage_pct,
                    }),
                ),
            );
        }

        // Check connection exhaustion.
        if metrics.connection_usage_pct > thresholds.connection_usage_pct {
            let severity = if metrics.connection_usage_pct > 98.0 {
                Severity::Critical
            } else if metrics.connection_usage_pct > 95.0 {
                Severity::High
            } else {
                Severity::Medium
            };
            problems.push(
                Problem::new(ProblemType::ConnectionExhaustion, severity).with_details(
                    serde_json::json!({
                        "usage_pct": metrics.connection_usage_pct,
                        "threshold": thresholds.connection_usage_pct,
                    }),
                ),
            );
        }

        // Check for hot partitions: any partition whose load exceeds
        // `partition_load_ratio` times the average load.
        if !metrics.partition_loads.is_empty() {
            let avg_load: f64 = metrics.partition_loads.values().sum::<f64>()
                / metrics.partition_loads.len() as f64;
            let hot_partitions: Vec<i64> = metrics
                .partition_loads
                .iter()
                .filter(|(_, load)| **load > avg_load * thresholds.partition_load_ratio as f64)
                .map(|(id, _)| *id)
                .collect();
            if !hot_partitions.is_empty() {
                // Severity scales with the worst offender's load-to-average ratio.
                let max_ratio = hot_partitions
                    .iter()
                    .filter_map(|id| metrics.partition_loads.get(id))
                    .map(|load| *load / avg_load)
                    .fold(0.0_f64, f64::max);
                let severity = if max_ratio > 10.0 {
                    Severity::High
                } else if max_ratio > 5.0 {
                    Severity::Medium
                } else {
                    Severity::Low
                };
                problems.push(
                    Problem::new(ProblemType::HotPartition, severity)
                        .with_details(serde_json::json!({
                            "avg_load": avg_load,
                            "max_ratio": max_ratio,
                            "threshold_ratio": thresholds.partition_load_ratio,
                        }))
                        .with_partitions(hot_partitions),
                );
            }
        }

        // Update statistics.
        self.problems_detected
            .fetch_add(problems.len() as u64, Ordering::SeqCst);
        self.last_detection.store(
            SystemTime::now()
                .duration_since(std::time::UNIX_EPOCH)
                // A clock set before the epoch previously panicked here;
                // store 0 instead.
                .unwrap_or_default()
                .as_secs(),
            Ordering::SeqCst,
        );
        problems
    }

    /// Get detection statistics (cumulative count + last run timestamp).
    pub fn get_stats(&self) -> DetectorStats {
        DetectorStats {
            problems_detected: self.problems_detected.load(Ordering::SeqCst),
            last_detection: self.last_detection.load(Ordering::SeqCst),
        }
    }

    // ========================================================================
    // Metric Collection Helpers (would use SPI in production)
    // ========================================================================

    /// Fragmentation percentage per index. Production: pg_stat_user_indexes.
    fn collect_index_fragmentation(&self) -> HashMap<String, f32> {
        // Empty map = healthy (no fragmented indexes).
        HashMap::new()
    }

    /// Lag in seconds per replica. Production: pg_stat_replication.
    fn collect_replica_lag(&self) -> HashMap<String, f32> {
        HashMap::new()
    }

    /// Storage usage percentage. Production: pg_tablespace sizes.
    fn collect_storage_usage(&self) -> f32 {
        0.0
    }

    /// Query timeout rate (0-1). Production: pg_stat_statements.
    fn collect_query_timeout_rate(&self) -> f32 {
        0.0
    }

    /// Integrity lambda. Production: integrity control plane.
    fn collect_integrity_lambda(&self) -> f32 {
        // 1.0 is the healthy default (well above any sane threshold).
        1.0
    }

    /// Memory usage percentage. Production: pg_shmem_allocations / OS metrics.
    fn collect_memory_usage(&self) -> f32 {
        0.0
    }

    /// Connection pool usage percentage. Production: pg_stat_activity vs
    /// max_connections.
    fn collect_connection_usage(&self) -> f32 {
        0.0
    }

    /// Load per partition. Production: partition statistics.
    fn collect_partition_loads(&self) -> HashMap<i64, f64> {
        HashMap::new()
    }

    /// Witness edges. Production: mincut computation.
    fn collect_witness_edges(&self) -> Vec<WitnessEdge> {
        Vec::new()
    }
}
impl Default for ProblemDetector {
fn default() -> Self {
Self::new()
}
}
/// Detector statistics, as returned by `ProblemDetector::get_stats`.
#[derive(Debug, Clone)]
pub struct DetectorStats {
    /// Cumulative number of problems detected since the detector was created
    pub problems_detected: u64,
    /// Unix-seconds timestamp of the last `detect_problems` run (0 = never)
    pub last_detection: u64,
}
#[cfg(test)]
mod tests {
    use super::*;

    // Display must emit the snake_case wire format.
    #[test]
    fn test_problem_type_display() {
        assert_eq!(
            ProblemType::IndexDegradation.to_string(),
            "index_degradation"
        );
        assert_eq!(ProblemType::ReplicaLag.to_string(), "replica_lag");
        assert_eq!(
            ProblemType::IntegrityViolation.to_string(),
            "integrity_violation"
        );
    }

    // FromStr must round-trip the snake_case spelling.
    #[test]
    fn test_problem_type_parse() {
        assert_eq!(
            "index_degradation".parse::<ProblemType>().unwrap(),
            ProblemType::IndexDegradation
        );
        assert_eq!(
            "replica_lag".parse::<ProblemType>().unwrap(),
            ProblemType::ReplicaLag
        );
    }

    // 50% fragmentation is above the default 30% threshold and falls in the
    // Medium band (45 < 50 <= 60).
    #[test]
    fn test_detect_index_degradation() {
        let detector = ProblemDetector::new();
        let mut metrics = SystemMetrics::new();
        metrics
            .index_fragmentation
            .insert("test_idx".to_string(), 50.0);
        let problems = detector.detect_problems(&metrics);
        assert_eq!(problems.len(), 1);
        assert_eq!(problems[0].problem_type, ProblemType::IndexDegradation);
        assert_eq!(problems[0].severity, Severity::Medium);
    }

    // 92% usage is above the default 85% threshold and in the High band
    // (90 < 92 <= 95).
    #[test]
    fn test_detect_storage_exhaustion() {
        let detector = ProblemDetector::new();
        let mut metrics = SystemMetrics::new();
        metrics.storage_usage_pct = 92.0;
        let problems = detector.detect_problems(&metrics);
        assert_eq!(problems.len(), 1);
        assert_eq!(problems[0].problem_type, ProblemType::StorageExhaustion);
        assert_eq!(problems[0].severity, Severity::High);
    }

    // Lambda 0.3 is below the default 0.5 minimum, in the High band
    // (0.2 <= 0.3 < 0.35).
    #[test]
    fn test_detect_integrity_violation() {
        let detector = ProblemDetector::new();
        let mut metrics = SystemMetrics::new();
        metrics.integrity_lambda = 0.3;
        let problems = detector.detect_problems(&metrics);
        assert_eq!(problems.len(), 1);
        assert_eq!(problems[0].problem_type, ProblemType::IntegrityViolation);
        assert_eq!(problems[0].severity, Severity::High);
    }

    // Partition 3 carries >3x the average load and must be flagged.
    #[test]
    fn test_detect_hot_partition() {
        let detector = ProblemDetector::new();
        let mut metrics = SystemMetrics::new();
        metrics.partition_loads.insert(1, 100.0);
        metrics.partition_loads.insert(2, 100.0);
        metrics.partition_loads.insert(3, 500.0); // Hot partition
        let problems = detector.detect_problems(&metrics);
        assert_eq!(problems.len(), 1);
        assert_eq!(problems[0].problem_type, ProblemType::HotPartition);
        assert!(problems[0].affected_partitions.contains(&3));
    }

    // Derived Ord follows variant declaration order.
    #[test]
    fn test_severity_ordering() {
        assert!(Severity::Critical > Severity::High);
        assert!(Severity::High > Severity::Medium);
        assert!(Severity::Medium > Severity::Low);
        assert!(Severity::Low > Severity::Info);
    }

    // A default (all-healthy) metrics snapshot yields no problems.
    #[test]
    fn test_healthy_metrics_no_problems() {
        let detector = ProblemDetector::new();
        let metrics = SystemMetrics::new();
        let problems = detector.detect_problems(&metrics);
        assert!(problems.is_empty());
    }

    // Lowering a threshold makes the detector more sensitive.
    #[test]
    fn test_custom_thresholds() {
        let thresholds = DetectionThresholds {
            index_fragmentation_pct: 10.0, // More sensitive
            ..Default::default()
        };
        let detector = ProblemDetector::with_thresholds(thresholds);
        let mut metrics = SystemMetrics::new();
        metrics
            .index_fragmentation
            .insert("test_idx".to_string(), 15.0);
        let problems = detector.detect_problems(&metrics);
        assert_eq!(problems.len(), 1);
        assert_eq!(problems[0].problem_type, ProblemType::IndexDegradation);
    }
}

View File

@@ -0,0 +1,788 @@
//! Remediation Engine for Self-Healing System
//!
//! Orchestrates remediation execution with:
//! - Strategy selection based on problem type and weights
//! - Execution with timeout and rollback capability
//! - Outcome verification
//! - Cooldown periods to prevent thrashing
use std::collections::{HashMap, VecDeque};
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use parking_lot::RwLock;
use serde::{Deserialize, Serialize};
use super::detector::{Problem, ProblemType, SystemMetrics};
use super::learning::OutcomeTracker;
use super::strategies::{
RemediationResult, RemediationStrategy, StrategyContext, StrategyRegistry,
};
// ============================================================================
// Healing Configuration
// ============================================================================
/// Configuration for the healing engine.
///
/// Consumed by `RemediationEngine`; replaceable at runtime via
/// `RemediationEngine::update_config`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealingConfig {
    /// Minimum time between healing attempts for same problem type
    pub min_healing_interval: Duration,
    /// Maximum attempts per time window
    pub max_attempts_per_window: usize,
    /// Time window for attempt counting
    pub attempt_window: Duration,
    /// Maximum impact level for auto-healing (0-1); higher-impact strategies
    /// are not selected automatically
    pub max_auto_heal_impact: f32,
    /// Problem types that require human approval
    pub require_approval: Vec<ProblemType>,
    /// Strategy names that require human approval (matched against
    /// `RemediationStrategy::name`)
    pub require_approval_strategies: Vec<String>,
    /// Enable learning from outcomes (weight updates + outcome tracking)
    pub learning_enabled: bool,
    /// Cooldown after failed remediation
    pub failure_cooldown: Duration,
    /// Whether to verify improvement after remediation
    pub verify_improvement: bool,
    /// Minimum improvement percentage to consider success
    pub min_improvement_pct: f32,
    /// Maximum concurrent remediations
    pub max_concurrent_remediations: usize,
}
impl Default for HealingConfig {
fn default() -> Self {
Self {
min_healing_interval: Duration::from_secs(300), // 5 minutes
max_attempts_per_window: 3,
attempt_window: Duration::from_secs(3600), // 1 hour
max_auto_heal_impact: 0.5,
require_approval: vec![],
require_approval_strategies: vec!["promote_replica".to_string()],
learning_enabled: true,
failure_cooldown: Duration::from_secs(600), // 10 minutes
verify_improvement: true,
min_improvement_pct: 5.0,
max_concurrent_remediations: 2,
}
}
}
// ============================================================================
// Healing Outcome
// ============================================================================
/// Outcome of a healing attempt, as returned by `RemediationEngine::heal`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum HealingOutcome {
    /// Healing completed (may or may not have succeeded — inspect `result`
    /// and `verified`)
    Completed {
        problem_type: ProblemType,
        strategy: String,
        result: RemediationResult,
        verified: bool,
    },
    /// Healing was deferred (needs approval or cooldown)
    Deferred {
        reason: String,
        problem_type: ProblemType,
    },
    /// No suitable strategy found in the registry
    NoStrategy { problem_type: ProblemType },
    /// Healing is disabled engine-wide
    Disabled,
    /// Already at maximum concurrent remediations
    MaxConcurrent,
}
impl HealingOutcome {
    /// Serialize this outcome to a JSON object. Every variant carries a
    /// `"status"` discriminator string.
    pub fn to_json(&self) -> serde_json::Value {
        match self {
            Self::Completed {
                problem_type,
                strategy,
                result,
                verified,
            } => serde_json::json!({
                "status": "completed",
                "problem_type": problem_type.to_string(),
                "strategy": strategy,
                "result": result.to_json(),
                "verified": verified,
            }),
            Self::Deferred {
                reason,
                problem_type,
            } => serde_json::json!({
                "status": "deferred",
                "reason": reason,
                "problem_type": problem_type.to_string(),
            }),
            Self::NoStrategy { problem_type } => serde_json::json!({
                "status": "no_strategy",
                "problem_type": problem_type.to_string(),
            }),
            Self::Disabled => serde_json::json!({
                "status": "disabled",
            }),
            Self::MaxConcurrent => serde_json::json!({
                "status": "max_concurrent",
            }),
        }
    }
}
// ============================================================================
// Active Remediation
// ============================================================================
/// An active remediation in progress (tracked in `RemediationEngine::active`
/// for the duration of a `heal` call).
#[derive(Debug, Clone)]
pub struct ActiveRemediation {
    /// Unique ID (assigned from `RemediationEngine::next_id`)
    pub id: u64,
    /// Problem being remediated
    pub problem: Problem,
    /// Strategy being used
    pub strategy_name: String,
    /// When remediation started
    pub started_at: SystemTime,
    /// Expected completion time (start + strategy's estimated duration)
    pub expected_completion: SystemTime,
}
impl ActiveRemediation {
    /// Convert to a JSON object with timestamps as unix seconds.
    pub fn to_json(&self) -> serde_json::Value {
        // A clock set before the epoch previously panicked in both
        // conversions below; fall back to 0 seconds instead.
        let started_ts = self
            .started_at
            .duration_since(UNIX_EPOCH)
            .unwrap_or_default()
            .as_secs();
        let expected_ts = self
            .expected_completion
            .duration_since(UNIX_EPOCH)
            .unwrap_or_default()
            .as_secs();
        serde_json::json!({
            "id": self.id,
            "problem_type": self.problem.problem_type.to_string(),
            "strategy": self.strategy_name,
            "started_at": started_ts,
            "expected_completion": expected_ts,
        })
    }
}
// ============================================================================
// Remediation Context
// ============================================================================
/// Full context for remediation execution.
///
/// Built from a `Problem` plus a `SystemMetrics` snapshot; converted into a
/// leaner `StrategyContext` via `to_strategy_context` when a strategy runs.
#[derive(Debug, Clone)]
pub struct RemediationContext {
    /// The problem being remediated
    pub problem: Problem,
    /// Collection/table being remediated (0 = unset)
    pub collection_id: i64,
    /// Tenant ID (for multi-tenant)
    pub tenant_id: Option<String>,
    /// Initial integrity lambda (taken from the metrics snapshot)
    pub initial_lambda: f32,
    /// Target integrity lambda
    pub target_lambda: f32,
    /// System metrics at start
    pub initial_metrics: SystemMetrics,
    /// When context was created
    pub created_at: SystemTime,
    /// Maximum impact allowed (0-1)
    pub max_impact: f32,
    /// Timeout for remediation
    pub timeout: Duration,
    /// Healing attempts in current window
    pub attempts_in_window: usize,
    /// Last healing attempt time
    pub last_attempt: Option<SystemTime>,
}
impl RemediationContext {
/// Create a new remediation context
pub fn new(problem: Problem, metrics: SystemMetrics) -> Self {
Self {
problem,
collection_id: 0,
tenant_id: None,
initial_lambda: metrics.integrity_lambda,
target_lambda: 0.8,
initial_metrics: metrics,
created_at: SystemTime::now(),
max_impact: 0.5,
timeout: Duration::from_secs(300),
attempts_in_window: 0,
last_attempt: None,
}
}
/// Set collection ID
pub fn with_collection(mut self, collection_id: i64) -> Self {
self.collection_id = collection_id;
self
}
/// Set tenant ID
pub fn with_tenant(mut self, tenant_id: String) -> Self {
self.tenant_id = Some(tenant_id);
self
}
/// Create strategy context
pub fn to_strategy_context(&self) -> StrategyContext {
StrategyContext {
problem: self.problem.clone(),
collection_id: self.collection_id,
initial_lambda: self.initial_lambda,
target_lambda: self.target_lambda,
max_impact: self.max_impact,
timeout: self.timeout,
start_time: SystemTime::now(),
dry_run: false,
}
}
}
// ============================================================================
// Remediation Engine
// ============================================================================
/// The main remediation engine.
///
/// Thread-safe: mutable state is behind `parking_lot::RwLock`s and atomics,
/// so `heal` and the accessors may be called concurrently via `&self`.
pub struct RemediationEngine {
    /// Strategy registry (public so callers can register strategies)
    pub registry: StrategyRegistry,
    /// Configuration (hot-swappable via `update_config`)
    config: RwLock<HealingConfig>,
    /// Outcome tracker for learning
    tracker: OutcomeTracker,
    /// Active remediations (entries live for the duration of a `heal` call)
    active: RwLock<Vec<ActiveRemediation>>,
    /// Next remediation ID (monotonically increasing)
    next_id: AtomicU64,
    /// Healing attempt history (problem_type -> timestamps)
    attempt_history: RwLock<HashMap<ProblemType, VecDeque<SystemTime>>>,
    /// Whether engine is enabled
    enabled: AtomicBool,
    /// Total healings attempted
    total_healings: AtomicU64,
    /// Successful (verified) healings
    successful_healings: AtomicU64,
}
impl RemediationEngine {
/// Create a new remediation engine.
///
/// Starts enabled, with no active remediations, an empty attempt history,
/// IDs beginning at 1, and zeroed counters.
pub fn new(registry: StrategyRegistry, config: HealingConfig, tracker: OutcomeTracker) -> Self {
    Self {
        registry,
        config: RwLock::new(config),
        tracker,
        active: RwLock::new(Vec::new()),
        next_id: AtomicU64::new(1),
        attempt_history: RwLock::new(HashMap::new()),
        enabled: AtomicBool::new(true),
        total_healings: AtomicU64::new(0),
        successful_healings: AtomicU64::new(0),
    }
}
/// Enable or disable the engine. While disabled, `heal` returns
/// `HealingOutcome::Disabled` without doing any work.
pub fn set_enabled(&self, enabled: bool) {
    self.enabled.store(enabled, Ordering::SeqCst);
}

/// Check if engine is enabled.
pub fn is_enabled(&self) -> bool {
    self.enabled.load(Ordering::SeqCst)
}

/// Replace the configuration; takes effect on the next `heal` call.
pub fn update_config(&self, config: HealingConfig) {
    *self.config.write() = config;
}

/// Get a snapshot of the current configuration.
pub fn get_config(&self) -> HealingConfig {
    self.config.read().clone()
}

/// Get a snapshot of the currently active remediations.
pub fn active_remediations(&self) -> Vec<ActiveRemediation> {
    self.active.read().clone()
}
/// Main healing method: select, execute, verify, and learn from a
/// remediation for `problem`.
///
/// Pipeline (any gate failing short-circuits with the matching outcome):
/// enabled check -> concurrency limit -> auto-heal policy -> strategy
/// selection -> approval gate -> execute -> verify -> optional rollback ->
/// optional learning update. Always returns an outcome; never panics on
/// policy failures.
pub fn heal(&self, problem: &Problem) -> HealingOutcome {
    // Check if enabled
    if !self.is_enabled() {
        return HealingOutcome::Disabled;
    }
    // Clone the config so we don't hold the read lock across execution.
    let config = self.config.read().clone();
    // Check concurrent limit
    if self.active.read().len() >= config.max_concurrent_remediations {
        return HealingOutcome::MaxConcurrent;
    }
    // Check if we should auto-heal (cooldowns, attempt windows, approval policy)
    if !self.should_auto_heal(problem, &config) {
        return HealingOutcome::Deferred {
            reason: self.get_defer_reason(problem, &config),
            problem_type: problem.problem_type,
        };
    }
    // Select strategy from the registry, capped by the allowed impact level
    let strategy = match self.registry.select(problem, config.max_auto_heal_impact) {
        Some(s) => s,
        None => {
            return HealingOutcome::NoStrategy {
                problem_type: problem.problem_type,
            };
        }
    };
    // Check if strategy requires approval
    if config
        .require_approval_strategies
        .contains(&strategy.name().to_string())
    {
        return HealingOutcome::Deferred {
            reason: format!("Strategy '{}' requires human approval", strategy.name()),
            problem_type: problem.problem_type,
        };
    }
    // Record attempt (counts toward the attempt window even if execution fails)
    self.record_attempt(problem.problem_type);
    self.total_healings.fetch_add(1, Ordering::SeqCst);
    // Start active remediation (visible via `active_remediations` until done)
    let remediation_id = self.next_id.fetch_add(1, Ordering::SeqCst);
    let active_rem = ActiveRemediation {
        id: remediation_id,
        problem: problem.clone(),
        strategy_name: strategy.name().to_string(),
        started_at: SystemTime::now(),
        expected_completion: SystemTime::now() + strategy.estimated_duration(),
    };
    self.active.write().push(active_rem);
    // Execute strategy. NOTE(review): collection_id/initial_lambda are
    // hard-coded here (0 / 1.0) rather than taken from a RemediationContext
    // — confirm this is intentional.
    let context = StrategyContext {
        problem: problem.clone(),
        collection_id: 0,
        initial_lambda: 1.0,
        target_lambda: 0.8,
        max_impact: config.max_auto_heal_impact,
        timeout: strategy.estimated_duration() * 2,
        start_time: SystemTime::now(),
        dry_run: false,
    };
    let result = self.execute_with_safeguards(&*strategy, &context);
    // Remove from active
    self.active.write().retain(|r| r.id != remediation_id);
    // Verify improvement (only when configured and the strategy reported success)
    let verified = if config.verify_improvement && result.is_success() {
        self.verify_improvement(&result, config.min_improvement_pct)
    } else {
        result.is_success()
    };
    // Rollback if not verified and reversible; rollback failure is logged,
    // not propagated.
    if !verified && strategy.reversible() {
        pgrx::log!(
            "Remediation not verified, rolling back: {}",
            strategy.name()
        );
        if let Err(e) = strategy.rollback(&context, &result) {
            pgrx::warning!("Rollback failed: {}", e);
        }
    }
    // Update learning: strategy weight and outcome history
    if config.learning_enabled {
        self.registry
            .update_weight(strategy.name(), verified, result.improvement_pct);
        self.tracker
            .record(problem, strategy.name(), &result, verified);
    }
    if verified {
        self.successful_healings.fetch_add(1, Ordering::SeqCst);
    }
    HealingOutcome::Completed {
        problem_type: problem.problem_type,
        strategy: strategy.name().to_string(),
        result,
        verified,
    }
}
/// Execute strategy with safeguards (timeout, panic catching)
fn execute_with_safeguards(
&self,
strategy: &dyn RemediationStrategy,
context: &StrategyContext,
) -> RemediationResult {
// In production, wrap in timeout and panic handling
// For now, execute directly
let start = std::time::Instant::now();
let mut result = strategy.execute(context);
result.duration_ms = start.elapsed().as_millis() as u64;
result
}
/// Check if we should auto-heal this problem
fn should_auto_heal(&self, problem: &Problem, config: &HealingConfig) -> bool {
// Check if problem type requires approval
if config.require_approval.contains(&problem.problem_type) {
return false;
}
// Check cooldown
if !self.is_past_cooldown(problem.problem_type, config) {
return false;
}
// Check attempt limit
if self.attempts_in_window(problem.problem_type, &config.attempt_window)
>= config.max_attempts_per_window
{
return false;
}
true
}
/// Get reason for deferring
fn get_defer_reason(&self, problem: &Problem, config: &HealingConfig) -> String {
if config.require_approval.contains(&problem.problem_type) {
return format!(
"Problem type '{:?}' requires human approval",
problem.problem_type
);
}
if !self.is_past_cooldown(problem.problem_type, config) {
return "In cooldown period after recent healing attempt".to_string();
}
if self.attempts_in_window(problem.problem_type, &config.attempt_window)
>= config.max_attempts_per_window
{
return format!(
"Exceeded maximum {} attempts per {:?}",
config.max_attempts_per_window, config.attempt_window
);
}
"Unknown reason".to_string()
}
/// Check if past cooldown period
fn is_past_cooldown(&self, problem_type: ProblemType, config: &HealingConfig) -> bool {
let history = self.attempt_history.read();
if let Some(attempts) = history.get(&problem_type) {
if let Some(last) = attempts.back() {
if let Ok(elapsed) = last.elapsed() {
return elapsed >= config.min_healing_interval;
}
}
}
true
}
/// Count attempts in window
fn attempts_in_window(&self, problem_type: ProblemType, window: &Duration) -> usize {
let history = self.attempt_history.read();
if let Some(attempts) = history.get(&problem_type) {
let cutoff = SystemTime::now() - *window;
attempts.iter().filter(|t| **t > cutoff).count()
} else {
0
}
}
/// Record an attempt
fn record_attempt(&self, problem_type: ProblemType) {
let mut history = self.attempt_history.write();
let attempts = history.entry(problem_type).or_insert_with(VecDeque::new);
attempts.push_back(SystemTime::now());
// Keep only recent attempts
let cutoff = SystemTime::now() - Duration::from_secs(86400); // 24 hours
while let Some(front) = attempts.front() {
if *front < cutoff {
attempts.pop_front();
} else {
break;
}
}
}
    /// Verify improvement after remediation
    ///
    /// A remediation counts as verified when the strategy-reported
    /// improvement meets the configured minimum percentage.
    fn verify_improvement(&self, result: &RemediationResult, min_pct: f32) -> bool {
        result.improvement_pct >= min_pct
    }
/// Get engine statistics
pub fn get_stats(&self) -> EngineStats {
let total = self.total_healings.load(Ordering::SeqCst);
let successful = self.successful_healings.load(Ordering::SeqCst);
EngineStats {
enabled: self.is_enabled(),
total_healings: total,
successful_healings: successful,
success_rate: if total > 0 {
successful as f32 / total as f32
} else {
0.0
},
active_remediations: self.active.read().len(),
strategy_weights: self.registry.get_all_weights(),
}
}
/// Execute a specific strategy manually
pub fn execute_strategy(
&self,
strategy_name: &str,
problem: &Problem,
dry_run: bool,
) -> Option<HealingOutcome> {
let strategy = self.registry.get_by_name(strategy_name)?;
let _config = self.config.read().clone();
let context = StrategyContext {
problem: problem.clone(),
collection_id: 0,
initial_lambda: 1.0,
target_lambda: 0.8,
max_impact: 1.0, // Manual execution allows higher impact
timeout: strategy.estimated_duration() * 2,
start_time: SystemTime::now(),
dry_run,
};
let result = strategy.execute(&context);
Some(HealingOutcome::Completed {
problem_type: problem.problem_type,
strategy: strategy_name.to_string(),
result,
verified: !dry_run,
})
}
}
/// Engine statistics
///
/// Point-in-time snapshot produced by `RemediationEngine::get_stats`.
#[derive(Debug, Clone)]
pub struct EngineStats {
    /// Whether auto-healing is currently enabled
    pub enabled: bool,
    /// Total healing attempts started
    pub total_healings: u64,
    /// Attempts whose improvement was verified
    pub successful_healings: u64,
    /// successful / total (0.0 when no healings have run)
    pub success_rate: f32,
    /// Number of remediations currently in flight
    pub active_remediations: usize,
    /// Learned per-strategy selection weights, keyed by strategy name
    pub strategy_weights: HashMap<String, f32>,
}
impl EngineStats {
    /// Convert to JSON
    ///
    /// Keys mirror the struct fields one-to-one; consumed by the SQL-facing
    /// statistics functions.
    pub fn to_json(&self) -> serde_json::Value {
        serde_json::json!({
            "enabled": self.enabled,
            "total_healings": self.total_healings,
            "successful_healings": self.successful_healings,
            "success_rate": self.success_rate,
            "active_remediations": self.active_remediations,
            "strategy_weights": self.strategy_weights,
        })
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::healing::detector::Severity;
    /// Build an engine with default strategies, config, and tracker.
    fn create_engine() -> RemediationEngine {
        let registry = StrategyRegistry::new_with_defaults();
        let config = HealingConfig::default();
        let tracker = OutcomeTracker::new();
        RemediationEngine::new(registry, config, tracker)
    }
    #[test]
    fn test_engine_creation() {
        let engine = create_engine();
        assert!(engine.is_enabled());
        assert!(engine.active_remediations().is_empty());
    }
    #[test]
    fn test_engine_enable_disable() {
        let engine = create_engine();
        engine.set_enabled(false);
        assert!(!engine.is_enabled());
        // A disabled engine must refuse to heal.
        let problem = Problem::new(ProblemType::IndexDegradation, Severity::Medium);
        let outcome = engine.heal(&problem);
        assert!(matches!(outcome, HealingOutcome::Disabled));
        engine.set_enabled(true);
        assert!(engine.is_enabled());
    }
    #[test]
    fn test_heal_index_degradation() {
        let engine = create_engine();
        let problem = Problem::new(ProblemType::IndexDegradation, Severity::Medium);
        let outcome = engine.heal(&problem);
        match outcome {
            HealingOutcome::Completed { strategy, .. } => {
                assert!(strategy.contains("reindex") || strategy.contains("integrity"));
            }
            _ => panic!("Expected Completed outcome"),
        }
    }
    #[test]
    fn test_cooldown_enforcement() {
        let mut config = HealingConfig::default();
        config.min_healing_interval = Duration::from_secs(60);
        let registry = StrategyRegistry::new_with_defaults();
        let tracker = OutcomeTracker::new();
        let engine = RemediationEngine::new(registry, config, tracker);
        let problem = Problem::new(ProblemType::IndexDegradation, Severity::Medium);
        // First healing should succeed
        let outcome1 = engine.heal(&problem);
        assert!(matches!(outcome1, HealingOutcome::Completed { .. }));
        // Second should be deferred (in cooldown)
        let outcome2 = engine.heal(&problem);
        assert!(matches!(outcome2, HealingOutcome::Deferred { .. }));
    }
    #[test]
    fn test_max_attempts_enforcement() {
        let mut config = HealingConfig::default();
        config.max_attempts_per_window = 2;
        // Tiny cooldown so only the attempt budget gates the third heal;
        // the sleeps below step past the cooldown between attempts.
        config.min_healing_interval = Duration::from_millis(1);
        let registry = StrategyRegistry::new_with_defaults();
        let tracker = OutcomeTracker::new();
        let engine = RemediationEngine::new(registry, config, tracker);
        let problem = Problem::new(ProblemType::IndexDegradation, Severity::Medium);
        // First two should succeed
        engine.heal(&problem);
        std::thread::sleep(Duration::from_millis(2));
        engine.heal(&problem);
        std::thread::sleep(Duration::from_millis(2));
        // Third should be deferred
        let outcome = engine.heal(&problem);
        assert!(matches!(outcome, HealingOutcome::Deferred { .. }));
    }
    #[test]
    fn test_approval_requirement() {
        let mut config = HealingConfig::default();
        config.require_approval.push(ProblemType::ReplicaLag);
        let registry = StrategyRegistry::new_with_defaults();
        let tracker = OutcomeTracker::new();
        let engine = RemediationEngine::new(registry, config, tracker);
        let problem = Problem::new(ProblemType::ReplicaLag, Severity::High);
        let outcome = engine.heal(&problem);
        assert!(matches!(outcome, HealingOutcome::Deferred { .. }));
    }
    #[test]
    fn test_strategy_approval_requirement() {
        let mut config = HealingConfig::default();
        config
            .require_approval_strategies
            .push("promote_replica".to_string());
        config.max_auto_heal_impact = 1.0; // Allow high impact
        let registry = StrategyRegistry::new_with_defaults();
        let tracker = OutcomeTracker::new();
        let engine = RemediationEngine::new(registry, config, tracker);
        let problem = Problem::new(ProblemType::ReplicaLag, Severity::High);
        let outcome = engine.heal(&problem);
        // Should be deferred because promote_replica requires approval
        assert!(matches!(outcome, HealingOutcome::Deferred { .. }));
    }
    #[test]
    fn test_no_strategy() {
        let registry = StrategyRegistry::new(); // Empty registry
        let config = HealingConfig::default();
        let tracker = OutcomeTracker::new();
        let engine = RemediationEngine::new(registry, config, tracker);
        let problem = Problem::new(ProblemType::IndexDegradation, Severity::Medium);
        let outcome = engine.heal(&problem);
        assert!(matches!(outcome, HealingOutcome::NoStrategy { .. }));
    }
    #[test]
    fn test_manual_execution() {
        let engine = create_engine();
        let problem = Problem::new(ProblemType::IndexDegradation, Severity::Medium);
        // Dry run should be flagged in the result metadata.
        let outcome = engine.execute_strategy("reindex_partition", &problem, true);
        assert!(outcome.is_some());
        if let Some(HealingOutcome::Completed { result, .. }) = outcome {
            assert!(result.metadata.get("dry_run") == Some(&serde_json::json!(true)));
        }
    }
    #[test]
    fn test_engine_stats() {
        let engine = create_engine();
        let stats = engine.get_stats();
        assert!(stats.enabled);
        assert_eq!(stats.total_healings, 0);
        assert_eq!(stats.active_remediations, 0);
    }
}

View File

@@ -0,0 +1,467 @@
//! SQL Functions for Self-Healing Engine
//!
//! Provides PostgreSQL-accessible functions for:
//! - Health status monitoring
//! - Healing history queries
//! - Manual healing triggers
//! - Configuration management
use pgrx::prelude::*;
use super::detector::ProblemType;
use super::{get_healing_engine, Problem};
// ============================================================================
// Health Status Functions
// ============================================================================
/// Get current health status of the RuVector system
///
/// Returns JSON with:
/// - healthy: whether system is healthy
/// - problem_count: number of detected problems
/// - active_remediation_count: ongoing remediations
/// - problems: list of current problems
/// - enabled: whether healing is enabled
#[pg_extern]
pub fn ruvector_health_status() -> pgrx::JsonB {
    let engine = get_healing_engine();
    let guard = engine.read();
    let snapshot = guard.health_status();
    pgrx::JsonB(snapshot.to_json())
}
/// Check if system is currently healthy (no detected problems)
#[pg_extern]
pub fn ruvector_is_healthy() -> bool {
    let engine = get_healing_engine();
    engine.read().health_status().healthy
}
/// Get system metrics used for problem detection
#[pg_extern]
pub fn ruvector_system_metrics() -> pgrx::JsonB {
    let engine = get_healing_engine();
    let metrics = engine.read().detector.collect_metrics();
    pgrx::JsonB(metrics.to_json())
}
// ============================================================================
// Healing History Functions
// ============================================================================
/// Get recent healing history, newest first.
///
/// # Arguments
/// * `limit` - Maximum number of records to return (default 20)
#[pg_extern]
pub fn ruvector_healing_history(limit: default!(i32, 20)) -> pgrx::JsonB {
    let engine = get_healing_engine();
    let guard = engine.read();
    let history: Vec<serde_json::Value> = guard
        .tracker
        .get_recent(limit as usize)
        .iter()
        .map(|record| record.to_json())
        .collect();
    pgrx::JsonB(serde_json::json!({
        "history": history,
        "count": history.len(),
    }))
}
/// Get healing history since a specific timestamp
///
/// # Arguments
/// * `since_timestamp` - Unix timestamp to filter from
#[pg_extern]
pub fn ruvector_healing_history_since(since_timestamp: i64) -> pgrx::JsonB {
    let engine = get_healing_engine();
    let guard = engine.read();
    // NOTE(review): a negative timestamp wraps via the `as u64` cast and
    // yields an empty result set — confirm callers never pass one.
    let history: Vec<serde_json::Value> = guard
        .tracker
        .get_since(since_timestamp as u64)
        .iter()
        .map(|record| record.to_json())
        .collect();
    pgrx::JsonB(serde_json::json!({
        "history": history,
        "count": history.len(),
        "since": since_timestamp,
    }))
}
/// Get healing history for a specific strategy, newest first.
#[pg_extern]
pub fn ruvector_healing_history_for_strategy(
    strategy_name: &str,
    limit: default!(i32, 20),
) -> pgrx::JsonB {
    let engine = get_healing_engine();
    let guard = engine.read();
    let history: Vec<serde_json::Value> = guard
        .tracker
        .get_for_strategy(strategy_name, limit as usize)
        .iter()
        .map(|record| record.to_json())
        .collect();
    pgrx::JsonB(serde_json::json!({
        "strategy": strategy_name,
        "history": history,
        "count": history.len(),
    }))
}
// ============================================================================
// Healing Trigger Functions
// ============================================================================
/// Manually trigger healing for a specific problem type
///
/// # Arguments
/// * `problem_type` - One of: index_degradation, replica_lag, storage_exhaustion,
///                    query_timeout, integrity_violation, memory_pressure,
///                    connection_exhaustion, hot_partition
#[pg_extern]
pub fn ruvector_healing_trigger(problem_type: &str) -> pgrx::JsonB {
    let engine = get_healing_engine();
    let guard = engine.read();
    // Parse problem type; unknown names yield a structured error response.
    let ptype = match problem_type.parse::<ProblemType>() {
        Ok(pt) => pt,
        Err(e) => {
            return pgrx::JsonB(serde_json::json!({
                "success": false,
                "error": e,
            }));
        }
    };
    // Trigger healing; `None` means the engine is disabled.
    let body = match guard.trigger_healing(ptype) {
        Some(outcome) => serde_json::json!({
            "success": true,
            "outcome": outcome.to_json(),
        }),
        None => serde_json::json!({
            "success": false,
            "error": "Healing is disabled",
        }),
    };
    pgrx::JsonB(body)
}
/// Execute a specific healing strategy manually
///
/// # Arguments
/// * `strategy_name` - Strategy to execute
/// * `problem_type` - Problem type for context
/// * `dry_run` - If true, don't actually execute (default false)
#[pg_extern]
pub fn ruvector_healing_execute(
    strategy_name: &str,
    problem_type: &str,
    dry_run: default!(bool, false),
) -> pgrx::JsonB {
    let engine = get_healing_engine();
    let guard = engine.read();
    // Parse problem type
    let ptype = match problem_type.parse::<ProblemType>() {
        Ok(pt) => pt,
        Err(e) => {
            return pgrx::JsonB(serde_json::json!({
                "success": false,
                "error": e,
            }));
        }
    };
    // Manual runs always use Medium severity for the synthetic problem.
    let problem = Problem::new(ptype, super::detector::Severity::Medium);
    let body = match guard
        .remediation
        .execute_strategy(strategy_name, &problem, dry_run)
    {
        Some(outcome) => serde_json::json!({
            "success": true,
            "dry_run": dry_run,
            "outcome": outcome.to_json(),
        }),
        None => serde_json::json!({
            "success": false,
            "error": format!("Strategy '{}' not found", strategy_name),
        }),
    };
    pgrx::JsonB(body)
}
// ============================================================================
// Configuration Functions
// ============================================================================
/// Configure healing engine settings
///
/// # Arguments
/// * `config_json` - JSON configuration object with optional keys:
///   - min_healing_interval_secs
///   - max_attempts_per_window
///   - max_auto_heal_impact
///   - learning_enabled
///   - verify_improvement
///   - min_improvement_pct
///   - enabled (toggles the engine directly, not stored in the config)
#[pg_extern]
pub fn ruvector_healing_configure(config_json: pgrx::JsonB) -> pgrx::JsonB {
    let engine = get_healing_engine();
    let mut engine_lock = engine.write();
    let mut config = engine_lock.config.clone();
    let json = config_json.0;
    // Update configuration from JSON.
    // Missing keys keep their current values; out-of-range values are
    // silently ignored rather than rejected.
    if let Some(interval) = json
        .get("min_healing_interval_secs")
        .and_then(|v| v.as_i64())
    {
        if interval > 0 {
            config.min_healing_interval = std::time::Duration::from_secs(interval as u64);
        }
    }
    if let Some(attempts) = json.get("max_attempts_per_window").and_then(|v| v.as_i64()) {
        if attempts > 0 {
            config.max_attempts_per_window = attempts as usize;
        }
    }
    if let Some(impact) = json.get("max_auto_heal_impact").and_then(|v| v.as_f64()) {
        if impact >= 0.0 && impact <= 1.0 {
            config.max_auto_heal_impact = impact as f32;
        }
    }
    if let Some(learning) = json.get("learning_enabled").and_then(|v| v.as_bool()) {
        config.learning_enabled = learning;
    }
    if let Some(verify) = json.get("verify_improvement").and_then(|v| v.as_bool()) {
        config.verify_improvement = verify;
    }
    if let Some(min_pct) = json.get("min_improvement_pct").and_then(|v| v.as_f64()) {
        if min_pct >= 0.0 {
            config.min_improvement_pct = min_pct as f32;
        }
    }
    // "enabled" toggles the engine flag directly rather than via the config.
    if let Some(enabled) = json.get("enabled").and_then(|v| v.as_bool()) {
        engine_lock.set_enabled(enabled);
    }
    engine_lock.update_config(config.clone());
    // Echo back the effective configuration so callers can confirm it.
    pgrx::JsonB(serde_json::json!({
        "status": "updated",
        "config": {
            "min_healing_interval_secs": config.min_healing_interval.as_secs(),
            "max_attempts_per_window": config.max_attempts_per_window,
            "max_auto_heal_impact": config.max_auto_heal_impact,
            "learning_enabled": config.learning_enabled,
            "verify_improvement": config.verify_improvement,
            "min_improvement_pct": config.min_improvement_pct,
            "enabled": engine_lock.enabled,
        }
    }))
}
/// Get current healing configuration
///
/// Exposes a superset of the keys accepted by `ruvector_healing_configure`
/// (e.g. attempt_window_secs and failure_cooldown_secs are read-only here).
#[pg_extern]
pub fn ruvector_healing_get_config() -> pgrx::JsonB {
    let engine = get_healing_engine();
    let engine_lock = engine.read();
    let config = &engine_lock.config;
    pgrx::JsonB(serde_json::json!({
        "min_healing_interval_secs": config.min_healing_interval.as_secs(),
        "max_attempts_per_window": config.max_attempts_per_window,
        "attempt_window_secs": config.attempt_window.as_secs(),
        "max_auto_heal_impact": config.max_auto_heal_impact,
        "learning_enabled": config.learning_enabled,
        "failure_cooldown_secs": config.failure_cooldown.as_secs(),
        "verify_improvement": config.verify_improvement,
        "min_improvement_pct": config.min_improvement_pct,
        "max_concurrent_remediations": config.max_concurrent_remediations,
        "require_approval_strategies": config.require_approval_strategies,
        "enabled": engine_lock.enabled,
    }))
}
/// Enable or disable healing
///
/// Returns the engine's enabled flag after the update.
#[pg_extern]
pub fn ruvector_healing_enable(enabled: bool) -> bool {
    let engine = get_healing_engine();
    let mut guard = engine.write();
    guard.set_enabled(enabled);
    guard.enabled
}
// ============================================================================
// Strategy Functions
// ============================================================================
/// List all available healing strategies
///
/// Each entry carries the strategy's static metadata plus its current
/// learned selection weight.
#[pg_extern]
pub fn ruvector_healing_strategies() -> pgrx::JsonB {
    let engine = get_healing_engine();
    let guard = engine.read();
    let registry = &guard.remediation.registry;
    let strategies: Vec<serde_json::Value> = registry
        .all_strategies()
        .iter()
        .map(|strategy| {
            serde_json::json!({
                "name": strategy.name(),
                "description": strategy.description(),
                "handles": strategy
                    .handles()
                    .iter()
                    .map(|h| h.to_string())
                    .collect::<Vec<_>>(),
                "impact": strategy.impact(),
                "estimated_duration_secs": strategy.estimated_duration().as_secs(),
                "reversible": strategy.reversible(),
                "weight": registry.get_weight(strategy.name()),
            })
        })
        .collect();
    pgrx::JsonB(serde_json::json!({
        "strategies": strategies,
        "count": strategies.len(),
    }))
}
/// Get effectiveness report for all strategies
#[pg_extern]
pub fn ruvector_healing_effectiveness() -> pgrx::JsonB {
    let engine = get_healing_engine();
    let report = engine.read().tracker.effectiveness_report();
    pgrx::JsonB(report.to_json())
}
/// Get statistics for the healing engine
///
/// Combines the remediation-engine counters and the outcome-tracker stats
/// into a single JSON document.
#[pg_extern]
pub fn ruvector_healing_stats() -> pgrx::JsonB {
    let engine = get_healing_engine();
    let guard = engine.read();
    pgrx::JsonB(serde_json::json!({
        "engine": guard.remediation.get_stats().to_json(),
        "tracker": guard.tracker.get_stats().to_json(),
    }))
}
// ============================================================================
// Detection Threshold Functions
// ============================================================================
/// Get current detection thresholds
///
/// Keys match the ones accepted by `ruvector_healing_set_thresholds`.
#[pg_extern]
pub fn ruvector_healing_thresholds() -> pgrx::JsonB {
    let engine = get_healing_engine();
    let engine_lock = engine.read();
    let thresholds = engine_lock.detector.get_thresholds();
    pgrx::JsonB(serde_json::json!({
        "index_fragmentation_pct": thresholds.index_fragmentation_pct,
        "replica_lag_seconds": thresholds.replica_lag_seconds,
        "storage_usage_pct": thresholds.storage_usage_pct,
        "query_timeout_rate": thresholds.query_timeout_rate,
        "min_integrity_lambda": thresholds.min_integrity_lambda,
        "memory_usage_pct": thresholds.memory_usage_pct,
        "connection_usage_pct": thresholds.connection_usage_pct,
        "partition_load_ratio": thresholds.partition_load_ratio,
    }))
}
/// Update detection thresholds
///
/// Accepts a JSON object with any subset of the threshold keys; omitted
/// keys keep their current values. No range validation is performed here.
#[pg_extern]
pub fn ruvector_healing_set_thresholds(thresholds_json: pgrx::JsonB) -> pgrx::JsonB {
    let engine = get_healing_engine();
    // NOTE(review): the thresholds are written back through a read guard, so
    // `detector.update_thresholds` presumably relies on interior mutability —
    // confirm, otherwise this should take the write lock.
    let engine_lock = engine.read();
    let mut thresholds = engine_lock.detector.get_thresholds();
    let json = thresholds_json.0;
    if let Some(v) = json.get("index_fragmentation_pct").and_then(|v| v.as_f64()) {
        thresholds.index_fragmentation_pct = v as f32;
    }
    if let Some(v) = json.get("replica_lag_seconds").and_then(|v| v.as_f64()) {
        thresholds.replica_lag_seconds = v as f32;
    }
    if let Some(v) = json.get("storage_usage_pct").and_then(|v| v.as_f64()) {
        thresholds.storage_usage_pct = v as f32;
    }
    if let Some(v) = json.get("query_timeout_rate").and_then(|v| v.as_f64()) {
        thresholds.query_timeout_rate = v as f32;
    }
    if let Some(v) = json.get("min_integrity_lambda").and_then(|v| v.as_f64()) {
        thresholds.min_integrity_lambda = v as f32;
    }
    if let Some(v) = json.get("memory_usage_pct").and_then(|v| v.as_f64()) {
        thresholds.memory_usage_pct = v as f32;
    }
    if let Some(v) = json.get("connection_usage_pct").and_then(|v| v.as_f64()) {
        thresholds.connection_usage_pct = v as f32;
    }
    if let Some(v) = json.get("partition_load_ratio").and_then(|v| v.as_f64()) {
        thresholds.partition_load_ratio = v as f32;
    }
    engine_lock.detector.update_thresholds(thresholds.clone());
    // Echo the effective thresholds back to the caller.
    pgrx::JsonB(serde_json::json!({
        "status": "updated",
        "thresholds": {
            "index_fragmentation_pct": thresholds.index_fragmentation_pct,
            "replica_lag_seconds": thresholds.replica_lag_seconds,
            "storage_usage_pct": thresholds.storage_usage_pct,
            "query_timeout_rate": thresholds.query_timeout_rate,
            "min_integrity_lambda": thresholds.min_integrity_lambda,
            "memory_usage_pct": thresholds.memory_usage_pct,
            "connection_usage_pct": thresholds.connection_usage_pct,
            "partition_load_ratio": thresholds.partition_load_ratio,
        }
    }))
}
// ============================================================================
// Problem Type Reference
// ============================================================================
/// List all supported problem types
///
/// Purely informational; does not touch the healing engine.
#[pg_extern]
pub fn ruvector_healing_problem_types() -> pgrx::JsonB {
    let types: Vec<serde_json::Value> = ProblemType::all()
        .iter()
        .map(|problem_type| {
            serde_json::json!({
                "name": problem_type.to_string(),
                "description": problem_type.description(),
            })
        })
        .collect();
    pgrx::JsonB(serde_json::json!({
        "problem_types": types,
        "count": types.len(),
    }))
}
#[cfg(test)]
mod tests {
    // These tests would run in a PostgreSQL context with pg_test
    // For now, they verify the function signatures compile correctly
    // TODO(review): add #[pg_test] coverage once a test-database harness exists.
}

View File

@@ -0,0 +1,669 @@
//! Learning System for Self-Healing Engine
//!
//! Tracks remediation outcomes and adjusts strategy selection:
//! - Outcome recording with full context
//! - Strategy weight updates based on success/failure
//! - Confidence scoring for strategies
//! - Effectiveness reporting
use std::collections::{HashMap, VecDeque};
use std::sync::atomic::{AtomicU64, Ordering};
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use parking_lot::RwLock;
use serde::{Deserialize, Serialize};
use super::detector::{Problem, ProblemType, Severity};
use super::strategies::RemediationResult;
// ============================================================================
// Outcome Record
// ============================================================================
/// A recorded remediation outcome
///
/// One row in the tracker's bounded history; serialized for SQL queries via
/// `to_json` (which omits `metadata`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OutcomeRecord {
    /// Unique ID (monotonically assigned by the tracker)
    pub id: u64,
    /// Problem type
    pub problem_type: ProblemType,
    /// Problem severity
    pub severity: Severity,
    /// Strategy used
    pub strategy_name: String,
    /// Whether remediation succeeded
    pub success: bool,
    /// Whether improvement was verified
    pub verified: bool,
    /// Actions taken
    pub actions_taken: usize,
    /// Improvement percentage
    pub improvement_pct: f32,
    /// Duration in milliseconds
    pub duration_ms: u64,
    /// Error message if failed
    pub error_message: Option<String>,
    /// Timestamp (unix seconds)
    pub timestamp: u64,
    /// Human feedback score (if provided, clamped to 0-1 by `add_feedback`)
    pub feedback_score: Option<f32>,
    /// Additional metadata (copied from the remediation result)
    pub metadata: serde_json::Value,
}
impl OutcomeRecord {
    /// Build a record from the healed problem and the strategy's result.
    ///
    /// `feedback_score` starts out empty; human feedback is attached later
    /// through the tracker.
    pub fn from_result(
        id: u64,
        problem: &Problem,
        strategy_name: &str,
        result: &RemediationResult,
        verified: bool,
    ) -> Self {
        let timestamp = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap()
            .as_secs();
        Self {
            id,
            problem_type: problem.problem_type,
            severity: problem.severity,
            strategy_name: strategy_name.to_string(),
            success: result.is_success(),
            verified,
            actions_taken: result.actions_taken,
            improvement_pct: result.improvement_pct,
            duration_ms: result.duration_ms,
            error_message: result.error_message.clone(),
            timestamp,
            feedback_score: None,
            metadata: result.metadata.clone(),
        }
    }
    /// Serialize for SQL-facing history queries (`metadata` intentionally omitted).
    pub fn to_json(&self) -> serde_json::Value {
        serde_json::json!({
            "id": self.id,
            "problem_type": self.problem_type.to_string(),
            "severity": format!("{:?}", self.severity).to_lowercase(),
            "strategy_name": self.strategy_name,
            "success": self.success,
            "verified": self.verified,
            "actions_taken": self.actions_taken,
            "improvement_pct": self.improvement_pct,
            "duration_ms": self.duration_ms,
            "error_message": self.error_message,
            "timestamp": self.timestamp,
            "feedback_score": self.feedback_score,
        })
    }
}
// ============================================================================
// Strategy Weight
// ============================================================================
/// Strategy weight with confidence metrics
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StrategyWeight {
    /// Strategy name
    pub strategy_name: String,
    /// Current weight (1.0 = baseline; clamped to [0.1, 2.0] by `update`)
    pub weight: f32,
    /// Confidence in weight (0-1, asymptotic in the observation count)
    pub confidence: f32,
    /// Number of observations
    pub observations: usize,
    /// Success count
    pub successes: usize,
    /// Running average of reported improvement across ALL observations
    /// (failures included — see `update`)
    pub avg_improvement: f32,
    /// Average duration in milliseconds (integer running average)
    pub avg_duration_ms: u64,
    /// Last update timestamp (unix seconds)
    pub last_updated: u64,
}
impl StrategyWeight {
    /// Create a baseline weight (1.0) with zero confidence for a strategy.
    pub fn new(strategy_name: &str) -> Self {
        Self {
            strategy_name: strategy_name.to_string(),
            weight: 1.0,
            confidence: 0.0,
            observations: 0,
            successes: 0,
            avg_improvement: 0.0,
            avg_duration_ms: 0,
            last_updated: SystemTime::now()
                .duration_since(UNIX_EPOCH)
                .unwrap()
                .as_secs(),
        }
    }
    /// Fold a new observation into the running averages, weight, and confidence.
    pub fn update(&mut self, success: bool, improvement_pct: f32, duration_ms: u64) {
        self.observations += 1;
        self.successes += usize::from(success);
        // Update running averages (improvement over ALL observations).
        let count = self.observations as f32;
        self.avg_improvement = ((count - 1.0) * self.avg_improvement + improvement_pct) / count;
        let count_u64 = self.observations as u64;
        self.avg_duration_ms =
            ((count_u64 - 1) * self.avg_duration_ms + duration_ms) / count_u64;
        // Calculate success rate
        let success_rate = self.successes as f32 / self.observations as f32;
        // Weight = success_rate * (1 + avg_improvement/100), bounded to [0.1, 2.0]
        self.weight = (success_rate * (1.0 + self.avg_improvement / 100.0))
            .max(0.1)
            .min(2.0);
        // Confidence increases with observations (asymptotic to 1.0)
        self.confidence = 1.0 - 1.0 / (1.0 + (self.observations as f32 / 10.0));
        self.last_updated = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap()
            .as_secs();
    }
    /// Success rate over all observations (0.0 before any observation).
    pub fn success_rate(&self) -> f32 {
        match self.observations {
            0 => 0.0,
            n => self.successes as f32 / n as f32,
        }
    }
    /// Serialize for SQL-facing effectiveness/weight reports.
    pub fn to_json(&self) -> serde_json::Value {
        serde_json::json!({
            "strategy_name": self.strategy_name,
            "weight": self.weight,
            "confidence": self.confidence,
            "observations": self.observations,
            "successes": self.successes,
            "success_rate": self.success_rate(),
            "avg_improvement": self.avg_improvement,
            "avg_duration_ms": self.avg_duration_ms,
            "last_updated": self.last_updated,
        })
    }
}
// ============================================================================
// Outcome Tracker
// ============================================================================
/// Tracks remediation outcomes for learning
///
/// All state lives behind `Arc`s, so cloning the tracker yields a handle to
/// the same shared history and weights.
#[derive(Clone)]
pub struct OutcomeTracker {
    /// Outcome history (bounded; oldest records dropped past `max_history`)
    history: std::sync::Arc<RwLock<VecDeque<OutcomeRecord>>>,
    /// Strategy weights, keyed by strategy name
    weights: std::sync::Arc<RwLock<HashMap<String, StrategyWeight>>>,
    /// Maximum history size
    max_history: usize,
    /// Next record ID
    next_id: std::sync::Arc<AtomicU64>,
}
impl OutcomeTracker {
/// Create new tracker
pub fn new() -> Self {
Self {
history: std::sync::Arc::new(RwLock::new(VecDeque::new())),
weights: std::sync::Arc::new(RwLock::new(HashMap::new())),
max_history: 10000,
next_id: std::sync::Arc::new(AtomicU64::new(1)),
}
}
    /// Create with custom history size
    ///
    /// `max_history` bounds the outcome ring; weights are unbounded (one
    /// entry per strategy name). Record IDs start at 1.
    pub fn with_max_history(max_history: usize) -> Self {
        Self {
            history: std::sync::Arc::new(RwLock::new(VecDeque::new())),
            weights: std::sync::Arc::new(RwLock::new(HashMap::new())),
            max_history,
            next_id: std::sync::Arc::new(AtomicU64::new(1)),
        }
    }
/// Record a remediation outcome
pub fn record(
&self,
problem: &Problem,
strategy_name: &str,
result: &RemediationResult,
verified: bool,
) {
let id = self.next_id.fetch_add(1, Ordering::SeqCst);
let record = OutcomeRecord::from_result(id, problem, strategy_name, result, verified);
// Add to history
let mut history = self.history.write();
history.push_back(record.clone());
while history.len() > self.max_history {
history.pop_front();
}
// Update strategy weight
let mut weights = self.weights.write();
let weight = weights
.entry(strategy_name.to_string())
.or_insert_with(|| StrategyWeight::new(strategy_name));
weight.update(verified, result.improvement_pct, result.duration_ms);
}
    /// Get recent outcomes
    ///
    /// Returns up to `limit` records, newest first.
    pub fn get_recent(&self, limit: usize) -> Vec<OutcomeRecord> {
        let history = self.history.read();
        history.iter().rev().take(limit).cloned().collect()
    }
    /// Get outcomes since timestamp
    ///
    /// `since` is compared (inclusive) against the records' unix-seconds
    /// timestamps; results are in insertion (oldest-first) order.
    pub fn get_since(&self, since: u64) -> Vec<OutcomeRecord> {
        let history = self.history.read();
        history
            .iter()
            .filter(|r| r.timestamp >= since)
            .cloned()
            .collect()
    }
    /// Get outcomes for a specific strategy, newest first, capped at `limit`.
    pub fn get_for_strategy(&self, strategy_name: &str, limit: usize) -> Vec<OutcomeRecord> {
        let history = self.history.read();
        history
            .iter()
            .rev()
            .filter(|r| r.strategy_name == strategy_name)
            .take(limit)
            .cloned()
            .collect()
    }
    /// Get outcomes for a specific problem type, newest first, capped at `limit`.
    pub fn get_for_problem_type(
        &self,
        problem_type: ProblemType,
        limit: usize,
    ) -> Vec<OutcomeRecord> {
        let history = self.history.read();
        history
            .iter()
            .rev()
            .filter(|r| r.problem_type == problem_type)
            .take(limit)
            .cloned()
            .collect()
    }
    /// Get strategy weight (cloned snapshot), if the strategy has been observed.
    pub fn get_weight(&self, strategy_name: &str) -> Option<StrategyWeight> {
        self.weights.read().get(strategy_name).cloned()
    }
    /// Get all strategy weights (cloned snapshots, unordered).
    pub fn get_all_weights(&self) -> Vec<StrategyWeight> {
        self.weights.read().values().cloned().collect()
    }
/// Add human feedback to an outcome
pub fn add_feedback(&self, outcome_id: u64, score: f32) -> bool {
let mut history = self.history.write();
for record in history.iter_mut() {
if record.id == outcome_id {
record.feedback_score = Some(score.max(0.0).min(1.0));
return true;
}
}
false
}
/// Get overall statistics
pub fn get_stats(&self) -> TrackerStats {
let history = self.history.read();
let weights = self.weights.read();
let total = history.len();
let successes = history.iter().filter(|r| r.success && r.verified).count();
let total_improvement: f32 = history.iter().map(|r| r.improvement_pct).sum();
let total_duration: u64 = history.iter().map(|r| r.duration_ms).sum();
TrackerStats {
total_outcomes: total,
successful_outcomes: successes,
success_rate: if total > 0 {
successes as f32 / total as f32
} else {
0.0
},
avg_improvement: if total > 0 {
total_improvement / total as f32
} else {
0.0
},
avg_duration_ms: if total > 0 {
total_duration / total as u64
} else {
0
},
tracked_strategies: weights.len(),
}
}
/// Generate effectiveness report
pub fn effectiveness_report(&self) -> EffectivenessReport {
let weights = self.get_all_weights();
let stats = self.get_stats();
let strategy_reports: Vec<StrategyEffectiveness> = weights
.iter()
.map(|w| {
let recent = self.get_for_strategy(&w.strategy_name, 10);
StrategyEffectiveness {
strategy_name: w.strategy_name.clone(),
weight: w.weight,
confidence: w.confidence,
success_rate: w.success_rate(),
avg_improvement: w.avg_improvement,
recent_outcomes: recent.len(),
}
})
.collect();
EffectivenessReport {
strategies: strategy_reports,
overall_success_rate: stats.success_rate,
avg_time_to_recovery_ms: stats.avg_duration_ms,
total_outcomes: stats.total_outcomes,
}
}
/// Update weights from historical data (for batch learning)
pub fn recalculate_weights(&self, lookback: Duration) {
let cutoff = SystemTime::now()
.duration_since(UNIX_EPOCH)
.unwrap()
.as_secs()
- lookback.as_secs();
let history = self.history.read();
let mut weights = self.weights.write();
// Group outcomes by strategy
let mut strategy_outcomes: HashMap<String, Vec<&OutcomeRecord>> = HashMap::new();
for record in history.iter().filter(|r| r.timestamp >= cutoff) {
strategy_outcomes
.entry(record.strategy_name.clone())
.or_default()
.push(record);
}
// Recalculate each strategy's weight
for (strategy_name, outcomes) in strategy_outcomes {
let weight = weights
.entry(strategy_name.clone())
.or_insert_with(|| StrategyWeight::new(&strategy_name));
// Reset counters
weight.observations = outcomes.len();
weight.successes = outcomes.iter().filter(|o| o.success && o.verified).count();
weight.avg_improvement =
outcomes.iter().map(|o| o.improvement_pct).sum::<f32>() / outcomes.len() as f32;
weight.avg_duration_ms =
outcomes.iter().map(|o| o.duration_ms).sum::<u64>() / outcomes.len() as u64;
// Recalculate weight
let success_rate = weight.success_rate();
weight.weight = success_rate * (1.0 + weight.avg_improvement / 100.0);
weight.weight = weight.weight.max(0.1).min(2.0);
weight.confidence = 1.0 - 1.0 / (1.0 + (weight.observations as f32 / 10.0));
weight.last_updated = SystemTime::now()
.duration_since(UNIX_EPOCH)
.unwrap()
.as_secs();
}
}
}
impl Default for OutcomeTracker {
fn default() -> Self {
Self::new()
}
}
/// Tracker statistics
///
/// Aggregate view over the full outcome history held by an `OutcomeTracker`.
#[derive(Debug, Clone)]
pub struct TrackerStats {
    /// Total number of recorded outcomes.
    pub total_outcomes: usize,
    /// Outcomes that both succeeded and were verified.
    pub successful_outcomes: usize,
    /// successful_outcomes / total_outcomes (0.0 when no outcomes exist).
    pub success_rate: f32,
    /// Mean improvement percentage across all outcomes.
    pub avg_improvement: f32,
    /// Mean remediation duration in milliseconds.
    pub avg_duration_ms: u64,
    /// Number of distinct strategies with learned weights.
    pub tracked_strategies: usize,
}
impl TrackerStats {
    /// Serialize the statistics as a JSON object.
    pub fn to_json(&self) -> serde_json::Value {
        let mut obj = serde_json::Map::new();
        obj.insert("total_outcomes".into(), self.total_outcomes.into());
        obj.insert("successful_outcomes".into(), self.successful_outcomes.into());
        obj.insert("success_rate".into(), self.success_rate.into());
        obj.insert("avg_improvement".into(), self.avg_improvement.into());
        obj.insert("avg_duration_ms".into(), self.avg_duration_ms.into());
        obj.insert("tracked_strategies".into(), self.tracked_strategies.into());
        serde_json::Value::Object(obj)
    }
}
/// Strategy effectiveness
///
/// Per-strategy entry in an `EffectivenessReport`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StrategyEffectiveness {
    /// Strategy identifier.
    pub strategy_name: String,
    /// Learned selection weight.
    pub weight: f32,
    /// Confidence in the weight (grows with observation count).
    pub confidence: f32,
    /// Fraction of verified successful outcomes.
    pub success_rate: f32,
    /// Mean improvement percentage.
    pub avg_improvement: f32,
    /// Number of recent outcomes sampled (at most 10).
    pub recent_outcomes: usize,
}
/// Effectiveness report
///
/// Produced by `OutcomeTracker::effectiveness_report`.
#[derive(Debug, Clone)]
pub struct EffectivenessReport {
    /// One entry per tracked strategy.
    pub strategies: Vec<StrategyEffectiveness>,
    /// Overall verified success rate across all outcomes.
    pub overall_success_rate: f32,
    /// Mean remediation duration in milliseconds.
    pub avg_time_to_recovery_ms: u64,
    /// Total number of outcomes in the history.
    pub total_outcomes: usize,
}
impl EffectivenessReport {
    /// Serialize the report as a JSON object.
    pub fn to_json(&self) -> serde_json::Value {
        let mut obj = serde_json::Map::new();
        obj.insert(
            "strategies".into(),
            serde_json::to_value(&self.strategies).expect("report entries serialize to JSON"),
        );
        obj.insert(
            "overall_success_rate".into(),
            self.overall_success_rate.into(),
        );
        obj.insert(
            "avg_time_to_recovery_ms".into(),
            self.avg_time_to_recovery_ms.into(),
        );
        obj.insert("total_outcomes".into(), self.total_outcomes.into());
        serde_json::Value::Object(obj)
    }
}
#[cfg(test)]
mod tests {
    //! Unit tests for `OutcomeTracker` and `StrategyWeight`.
    use super::*;
    /// Build a Medium-severity IndexDegradation problem for tests.
    fn create_problem() -> Problem {
        Problem::new(ProblemType::IndexDegradation, Severity::Medium)
    }
    /// Build a success/failure result with the given improvement percentage.
    fn create_result(success: bool, improvement: f32) -> RemediationResult {
        if success {
            RemediationResult::success(1, improvement).with_duration(1000)
        } else {
            RemediationResult::failure("test error").with_duration(500)
        }
    }
    #[test]
    fn test_record_outcome() {
        let tracker = OutcomeTracker::new();
        let problem = create_problem();
        let result = create_result(true, 15.0);
        tracker.record(&problem, "test_strategy", &result, true);
        let recent = tracker.get_recent(10);
        assert_eq!(recent.len(), 1);
        assert!(recent[0].success);
        assert!(recent[0].verified);
    }
    #[test]
    fn test_weight_updates() {
        let tracker = OutcomeTracker::new();
        let problem = create_problem();
        // Record successes
        for _ in 0..5 {
            let result = create_result(true, 20.0);
            tracker.record(&problem, "test_strategy", &result, true);
        }
        let weight = tracker.get_weight("test_strategy").unwrap();
        assert_eq!(weight.observations, 5);
        assert_eq!(weight.successes, 5);
        assert!(weight.weight > 1.0); // Should be elevated
        assert!(weight.confidence > 0.3); // Should have some confidence
    }
    #[test]
    fn test_mixed_outcomes() {
        let tracker = OutcomeTracker::new();
        let problem = create_problem();
        // 3 successes
        for _ in 0..3 {
            let result = create_result(true, 10.0);
            tracker.record(&problem, "test_strategy", &result, true);
        }
        // 2 failures
        for _ in 0..2 {
            let result = create_result(false, 0.0);
            tracker.record(&problem, "test_strategy", &result, false);
        }
        let weight = tracker.get_weight("test_strategy").unwrap();
        assert_eq!(weight.observations, 5);
        assert_eq!(weight.successes, 3);
        // 3 of 5 verified successes => 60% success rate.
        assert!((weight.success_rate() - 0.6).abs() < 0.01);
    }
    #[test]
    fn test_get_for_strategy() {
        let tracker = OutcomeTracker::new();
        let problem = create_problem();
        // Record for different strategies
        tracker.record(&problem, "strategy_a", &create_result(true, 10.0), true);
        tracker.record(&problem, "strategy_b", &create_result(true, 15.0), true);
        tracker.record(&problem, "strategy_a", &create_result(true, 20.0), true);
        let a_outcomes = tracker.get_for_strategy("strategy_a", 10);
        assert_eq!(a_outcomes.len(), 2);
        let b_outcomes = tracker.get_for_strategy("strategy_b", 10);
        assert_eq!(b_outcomes.len(), 1);
    }
    #[test]
    fn test_feedback() {
        let tracker = OutcomeTracker::new();
        let problem = create_problem();
        tracker.record(&problem, "test_strategy", &create_result(true, 10.0), true);
        let recent = tracker.get_recent(1);
        let id = recent[0].id;
        assert!(tracker.add_feedback(id, 0.9));
        let updated = tracker.get_recent(1);
        assert_eq!(updated[0].feedback_score, Some(0.9));
    }
    #[test]
    fn test_max_history() {
        let tracker = OutcomeTracker::with_max_history(5);
        let problem = create_problem();
        // Record 10 outcomes
        for i in 0..10 {
            tracker.record(
                &problem,
                "test_strategy",
                &create_result(true, i as f32),
                true,
            );
        }
        let history = tracker.get_recent(100);
        assert_eq!(history.len(), 5); // Should be capped at 5
    }
    #[test]
    fn test_effectiveness_report() {
        let tracker = OutcomeTracker::new();
        let problem = create_problem();
        for _ in 0..5 {
            tracker.record(&problem, "strategy_a", &create_result(true, 15.0), true);
        }
        for _ in 0..5 {
            tracker.record(&problem, "strategy_b", &create_result(true, 25.0), true);
        }
        let report = tracker.effectiveness_report();
        assert_eq!(report.strategies.len(), 2);
        assert_eq!(report.total_outcomes, 10);
        assert_eq!(report.overall_success_rate, 1.0);
    }
    #[test]
    fn test_strategy_weight_confidence() {
        let mut weight = StrategyWeight::new("test");
        // Few observations = low confidence
        weight.update(true, 10.0, 1000);
        assert!(weight.confidence < 0.5);
        // More observations = higher confidence
        for _ in 0..20 {
            weight.update(true, 10.0, 1000);
        }
        assert!(weight.confidence > 0.5);
    }
    #[test]
    fn test_tracker_stats() {
        let tracker = OutcomeTracker::new();
        let problem = create_problem();
        tracker.record(&problem, "strategy_a", &create_result(true, 10.0), true);
        tracker.record(&problem, "strategy_b", &create_result(false, 0.0), false);
        let stats = tracker.get_stats();
        assert_eq!(stats.total_outcomes, 2);
        assert_eq!(stats.successful_outcomes, 1);
        assert_eq!(stats.success_rate, 0.5);
    }
}

View File

@@ -0,0 +1,233 @@
//! Self-Healing Engine for RuVector Postgres v2
//!
//! This module provides automated problem detection and remediation capabilities:
//! - **Problem Detection**: Monitors system health and detects issues
//! - **Remediation Strategies**: Defines actions to fix detected problems
//! - **Remediation Engine**: Orchestrates strategy execution with rollback
//! - **Learning System**: Tracks outcomes and improves strategy selection
//! - **Background Worker**: Continuous health monitoring
//!
//! # Architecture
//!
//! ```text
//! +------------------------------------------------------------------+
//! | Integrity Monitor |
//! | - Detects state transitions (normal -> stress -> critical) |
//! +------------------------------------------------------------------+
//! |
//! v
//! +------------------------------------------------------------------+
//! | Problem Detector |
//! | - Classifies problem types from witness edges |
//! +------------------------------------------------------------------+
//! |
//! v
//! +------------------------------------------------------------------+
//! | Remediation Engine |
//! | - Selects strategy, executes with timeout/rollback |
//! +------------------------------------------------------------------+
//! |
//! v
//! +------------------------------------------------------------------+
//! | Learning System |
//! | - Records outcomes, updates strategy weights |
//! +------------------------------------------------------------------+
//! ```
pub mod detector;
pub mod engine;
pub mod functions;
pub mod learning;
pub mod strategies;
pub mod worker;
pub use detector::{Problem, ProblemDetector, ProblemType, SystemMetrics};
pub use engine::{HealingConfig, HealingOutcome, RemediationContext, RemediationEngine};
pub use learning::{OutcomeRecord, OutcomeTracker, StrategyWeight};
pub use strategies::{
IntegrityRecovery, PromoteReplica, QueryCircuitBreaker, ReindexPartition, RemediationOutcome,
RemediationResult, RemediationStrategy, StrategyRegistry, TierEviction,
};
pub use worker::{HealingWorker, HealingWorkerConfig, HealingWorkerState};
use parking_lot::RwLock;
use std::sync::Arc;
/// Global healing engine instance
static HEALING_ENGINE: std::sync::OnceLock<Arc<RwLock<HealingEngine>>> = std::sync::OnceLock::new();
/// Get or lazily initialize the global healing engine.
pub fn get_healing_engine() -> Arc<RwLock<HealingEngine>> {
    let engine = HEALING_ENGINE.get_or_init(|| Arc::new(RwLock::new(HealingEngine::new())));
    Arc::clone(engine)
}
/// Main healing engine combining all components
///
/// Held globally behind `Arc<RwLock<...>>`; obtain it via `get_healing_engine()`.
pub struct HealingEngine {
    /// Problem detector
    pub detector: ProblemDetector,
    /// Remediation engine (shares `tracker`, so outcomes it records are
    /// visible through `tracker` as well)
    pub remediation: RemediationEngine,
    /// Outcome tracker for learning
    pub tracker: OutcomeTracker,
    /// Background worker state
    pub worker_state: Arc<HealingWorkerState>,
    /// Configuration
    pub config: HealingConfig,
    /// Whether healing is enabled (checked by `trigger_healing`)
    pub enabled: bool,
}
impl HealingEngine {
    /// Create a new healing engine with default configuration.
    pub fn new() -> Self {
        // Delegate so construction logic lives in one place.
        Self::with_config(HealingConfig::default())
    }
    /// Create with custom configuration.
    pub fn with_config(config: HealingConfig) -> Self {
        let tracker = OutcomeTracker::new();
        let registry = StrategyRegistry::new_with_defaults();
        Self {
            detector: ProblemDetector::new(),
            // The remediation engine shares this tracker, so outcomes it
            // records are visible through `self.tracker` as well.
            remediation: RemediationEngine::new(registry, config.clone(), tracker.clone()),
            tracker,
            worker_state: Arc::new(HealingWorkerState::new(HealingWorkerConfig::default())),
            config,
            enabled: true,
        }
    }
    /// Check system health and return current status.
    ///
    /// Healthy means: no detected problems and no remediations in flight.
    pub fn health_status(&self) -> HealthStatus {
        let metrics = self.detector.collect_metrics();
        let problems = self.detector.detect_problems(&metrics);
        let active_remediations = self.remediation.active_remediations();
        HealthStatus {
            healthy: problems.is_empty() && active_remediations.is_empty(),
            problem_count: problems.len(),
            active_remediation_count: active_remediations.len(),
            problems,
            metrics,
            enabled: self.enabled,
            last_check: std::time::SystemTime::now()
                .duration_since(std::time::UNIX_EPOCH)
                .unwrap()
                .as_secs(),
        }
    }
    /// Enable or disable healing. When disabled, `trigger_healing` is a no-op.
    pub fn set_enabled(&mut self, enabled: bool) {
        self.enabled = enabled;
    }
    /// Update configuration, propagating it to the remediation engine.
    pub fn update_config(&mut self, config: HealingConfig) {
        self.config = config.clone();
        self.remediation.update_config(config);
    }
    /// Trigger manual healing for a specific problem type.
    ///
    /// Returns `None` when healing is disabled; otherwise the outcome of
    /// remediating a synthetic Medium-severity problem of `problem_type`.
    pub fn trigger_healing(&self, problem_type: ProblemType) -> Option<HealingOutcome> {
        if !self.enabled {
            return None;
        }
        let problem = Problem {
            problem_type,
            severity: detector::Severity::Medium,
            detected_at: std::time::SystemTime::now(),
            details: serde_json::json!({"source": "manual_trigger"}),
            affected_partitions: vec![],
        };
        Some(self.remediation.heal(&problem))
    }
}
impl Default for HealingEngine {
fn default() -> Self {
Self::new()
}
}
/// Health status summary
///
/// Snapshot returned by `HealingEngine::health_status`.
#[derive(Debug, Clone)]
pub struct HealthStatus {
    /// Whether the system is healthy (no problems and no active remediations)
    pub healthy: bool,
    /// Number of detected problems
    pub problem_count: usize,
    /// Number of active remediations
    pub active_remediation_count: usize,
    /// List of detected problems
    pub problems: Vec<Problem>,
    /// Current system metrics
    pub metrics: SystemMetrics,
    /// Whether healing is enabled
    pub enabled: bool,
    /// Timestamp of last health check (seconds since Unix epoch)
    pub last_check: u64,
}
impl HealthStatus {
    /// Serialize the status as a JSON object for SQL function output.
    pub fn to_json(&self) -> serde_json::Value {
        let mut obj = serde_json::Map::new();
        obj.insert("healthy".into(), self.healthy.into());
        obj.insert("problem_count".into(), self.problem_count.into());
        obj.insert(
            "active_remediation_count".into(),
            self.active_remediation_count.into(),
        );
        obj.insert(
            "problems".into(),
            serde_json::Value::Array(self.problems.iter().map(|p| p.to_json()).collect()),
        );
        obj.insert("metrics".into(), self.metrics.to_json());
        obj.insert("enabled".into(), self.enabled.into());
        obj.insert("last_check".into(), self.last_check.into());
        serde_json::Value::Object(obj)
    }
}
#[cfg(test)]
mod tests {
    //! Smoke tests for engine construction and the global singleton.
    use super::*;
    #[test]
    fn test_healing_engine_creation() {
        let engine = HealingEngine::new();
        assert!(engine.enabled);
        let status = engine.health_status();
        assert!(status.healthy);
    }
    #[test]
    fn test_healing_enable_disable() {
        let mut engine = HealingEngine::new();
        engine.set_enabled(false);
        assert!(!engine.enabled);
        engine.set_enabled(true);
        assert!(engine.enabled);
    }
    #[test]
    fn test_global_instance() {
        // Repeated calls must return the same Arc (pointer equality).
        let engine1 = get_healing_engine();
        let engine2 = get_healing_engine();
        assert!(Arc::ptr_eq(&engine1, &engine2));
    }
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,618 @@
//! Background Worker for Self-Healing Engine
//!
//! Provides continuous health monitoring and async remediation:
//! - Periodic health checks
//! - Automatic problem detection
//! - Async remediation execution
//! - Integration with integrity control plane
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
use std::sync::Arc;
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use parking_lot::RwLock;
use serde::{Deserialize, Serialize};
use super::detector::ProblemDetector;
use super::engine::HealingOutcome;
use super::get_healing_engine;
// ============================================================================
// Worker Configuration
// ============================================================================
/// Configuration for the healing background worker
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealingWorkerConfig {
    /// Health check interval (default: 60 seconds)
    pub check_interval: Duration,
    /// Whether to auto-remediate detected problems
    pub auto_remediate: bool,
    /// Minimum severity to auto-remediate
    pub min_auto_severity: u8, // 0=Info, 1=Low, 2=Medium, 3=High, 4=Critical
    /// Maximum concurrent remediations
    /// NOTE(review): not enforced anywhere visible in this file — confirm.
    pub max_concurrent: usize,
    /// Whether to log health status on each check
    pub log_status: bool,
    /// Enable metrics collection (metrics are attached to each check result)
    pub collect_metrics: bool,
}
impl Default for HealingWorkerConfig {
fn default() -> Self {
Self {
check_interval: Duration::from_secs(60),
auto_remediate: true,
min_auto_severity: 2, // Medium and above
max_concurrent: 2,
log_status: true,
collect_metrics: true,
}
}
}
// ============================================================================
// Worker State
// ============================================================================
/// State of the healing background worker
///
/// Shared between the worker loop and SQL control functions via `Arc`.
pub struct HealingWorkerState {
    /// Configuration
    config: RwLock<HealingWorkerConfig>,
    /// Whether worker is running
    running: AtomicBool,
    /// Last health check timestamp (seconds since Unix epoch; 0 if never)
    last_check: AtomicU64,
    /// Total health checks performed
    checks_completed: AtomicU64,
    /// Total problems detected
    problems_detected: AtomicU64,
    /// Total remediations triggered
    remediations_triggered: AtomicU64,
    /// Recent health statuses (bounded to the last 100 entries)
    recent_statuses: RwLock<Vec<HealthCheckResult>>,
}
impl HealingWorkerState {
    /// Maximum number of recent health-check results retained.
    const MAX_RECENT: usize = 100;
    /// Create new worker state from a configuration.
    pub fn new(config: HealingWorkerConfig) -> Self {
        Self {
            config: RwLock::new(config),
            running: AtomicBool::new(false),
            last_check: AtomicU64::new(0),
            checks_completed: AtomicU64::new(0),
            problems_detected: AtomicU64::new(0),
            remediations_triggered: AtomicU64::new(0),
            recent_statuses: RwLock::new(Vec::new()),
        }
    }
    /// Check if worker is running.
    pub fn is_running(&self) -> bool {
        self.running.load(Ordering::SeqCst)
    }
    /// Mark the worker as running.
    pub fn start(&self) {
        self.running.store(true, Ordering::SeqCst);
    }
    /// Request the worker to stop.
    pub fn stop(&self) {
        self.running.store(false, Ordering::SeqCst);
    }
    /// Get a copy of the current configuration.
    pub fn get_config(&self) -> HealingWorkerConfig {
        self.config.read().clone()
    }
    /// Replace the configuration.
    pub fn set_config(&self, config: HealingWorkerConfig) {
        *self.config.write() = config;
    }
    /// Record a completed health check: bump the counters and append the
    /// result to the bounded recent-status buffer.
    pub fn record_check(&self, result: HealthCheckResult) {
        let now = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap()
            .as_secs();
        self.last_check.store(now, Ordering::SeqCst);
        self.checks_completed.fetch_add(1, Ordering::SeqCst);
        self.problems_detected
            .fetch_add(result.problems_found as u64, Ordering::SeqCst);
        self.remediations_triggered
            .fetch_add(result.remediations_triggered as u64, Ordering::SeqCst);
        // Keep only the newest MAX_RECENT results. Trim the front in a
        // single drain instead of repeated O(n) `remove(0)` calls.
        let mut statuses = self.recent_statuses.write();
        statuses.push(result);
        if statuses.len() > Self::MAX_RECENT {
            let excess = statuses.len() - Self::MAX_RECENT;
            statuses.drain(..excess);
        }
    }
    /// Get a snapshot of the worker's counters.
    pub fn get_stats(&self) -> WorkerStats {
        WorkerStats {
            running: self.running.load(Ordering::SeqCst),
            last_check: self.last_check.load(Ordering::SeqCst),
            checks_completed: self.checks_completed.load(Ordering::SeqCst),
            problems_detected: self.problems_detected.load(Ordering::SeqCst),
            remediations_triggered: self.remediations_triggered.load(Ordering::SeqCst),
        }
    }
    /// Get up to `limit` recent health check results, newest first.
    pub fn get_recent_checks(&self, limit: usize) -> Vec<HealthCheckResult> {
        let statuses = self.recent_statuses.read();
        statuses.iter().rev().take(limit).cloned().collect()
    }
}
/// Worker statistics
///
/// Snapshot of the counters held in `HealingWorkerState`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WorkerStats {
    /// Whether the worker loop is currently running.
    pub running: bool,
    /// Timestamp of the last health check (seconds since Unix epoch; 0 if never).
    pub last_check: u64,
    /// Total health checks performed.
    pub checks_completed: u64,
    /// Total problems detected across all checks.
    pub problems_detected: u64,
    /// Total remediations triggered across all checks.
    pub remediations_triggered: u64,
}
impl WorkerStats {
    /// Serialize the statistics as a JSON object.
    pub fn to_json(&self) -> serde_json::Value {
        let mut obj = serde_json::Map::new();
        obj.insert("running".into(), self.running.into());
        obj.insert("last_check".into(), self.last_check.into());
        obj.insert("checks_completed".into(), self.checks_completed.into());
        obj.insert("problems_detected".into(), self.problems_detected.into());
        obj.insert(
            "remediations_triggered".into(),
            self.remediations_triggered.into(),
        );
        serde_json::Value::Object(obj)
    }
}
// ============================================================================
// Health Check Result
// ============================================================================
/// Result of a health check
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthCheckResult {
    /// Timestamp of check (seconds since Unix epoch)
    pub timestamp: u64,
    /// Whether system is healthy (no problems found)
    pub healthy: bool,
    /// Number of problems found
    pub problems_found: usize,
    /// Number of remediations triggered
    pub remediations_triggered: usize,
    /// Remediation outcomes (JSON form of each outcome)
    pub outcomes: Vec<serde_json::Value>,
    /// Metrics collected (None when metrics collection is disabled)
    pub metrics: Option<serde_json::Value>,
    /// Duration of check in milliseconds
    pub duration_ms: u64,
}
impl HealthCheckResult {
/// Create a healthy result
pub fn healthy() -> Self {
Self {
timestamp: SystemTime::now()
.duration_since(UNIX_EPOCH)
.unwrap()
.as_secs(),
healthy: true,
problems_found: 0,
remediations_triggered: 0,
outcomes: vec![],
metrics: None,
duration_ms: 0,
}
}
/// Convert to JSON
pub fn to_json(&self) -> serde_json::Value {
serde_json::json!({
"timestamp": self.timestamp,
"healthy": self.healthy,
"problems_found": self.problems_found,
"remediations_triggered": self.remediations_triggered,
"outcomes": self.outcomes,
"duration_ms": self.duration_ms,
})
}
}
// ============================================================================
// Healing Worker
// ============================================================================
/// Background worker for continuous health monitoring
///
/// Owns its own `ProblemDetector`; remediation itself goes through the
/// global healing engine.
pub struct HealingWorker {
    /// Worker state (shareable with SQL control functions)
    state: Arc<HealingWorkerState>,
    /// Problem detector
    detector: ProblemDetector,
}
impl HealingWorker {
    /// Granularity of the interruptible sleep in `wait_for_interval`.
    const SLEEP_SLICE: Duration = Duration::from_millis(500);
    /// Create new healing worker.
    pub fn new(config: HealingWorkerConfig) -> Self {
        Self {
            state: Arc::new(HealingWorkerState::new(config)),
            detector: ProblemDetector::new(),
        }
    }
    /// Create with shared state (e.g. the global engine's worker state).
    pub fn with_state(state: Arc<HealingWorkerState>) -> Self {
        Self {
            state,
            detector: ProblemDetector::new(),
        }
    }
    /// Get worker state.
    pub fn state(&self) -> &Arc<HealingWorkerState> {
        &self.state
    }
    /// Perform one health check cycle: collect metrics, detect problems,
    /// optionally auto-remediate, and record the result.
    pub fn check_health(&self) -> HealthCheckResult {
        let start = std::time::Instant::now();
        let config = self.state.get_config();
        // Collect metrics and classify problems.
        let metrics = self.detector.collect_metrics();
        let problems = self.detector.detect_problems(&metrics);
        let problems_found = problems.len();
        if config.log_status {
            if problems_found > 0 {
                pgrx::log!("Healing worker: {} problems detected", problems_found);
            } else {
                pgrx::debug1!("Healing worker: no problems detected");
            }
        }
        let mut remediations_triggered = 0;
        let mut outcomes = Vec::new();
        // Auto-remediate problems at or above the configured severity.
        if config.auto_remediate && problems_found > 0 {
            let engine = get_healing_engine();
            let engine_lock = engine.read();
            for problem in &problems {
                if problem.severity.value() < config.min_auto_severity {
                    continue;
                }
                let outcome = engine_lock.remediation.heal(problem);
                outcomes.push(outcome.to_json());
                // Only completed remediations count as triggered.
                if matches!(outcome, HealingOutcome::Completed { .. }) {
                    remediations_triggered += 1;
                }
            }
        }
        let duration_ms = start.elapsed().as_millis() as u64;
        let result = HealthCheckResult {
            timestamp: SystemTime::now()
                .duration_since(UNIX_EPOCH)
                .unwrap()
                .as_secs(),
            healthy: problems_found == 0,
            problems_found,
            remediations_triggered,
            outcomes,
            metrics: if config.collect_metrics {
                Some(metrics.to_json())
            } else {
                None
            },
            duration_ms,
        };
        self.state.record_check(result.clone());
        result
    }
    /// Run the worker loop (blocking) until `stop()` is called.
    pub fn run(&self) {
        self.state.start();
        pgrx::log!("Healing background worker started");
        while self.state.is_running() {
            // Perform health check
            let _result = self.check_health();
            // Sleep until next check
            let interval = self.state.get_config().check_interval;
            self.wait_for_interval(interval);
        }
        pgrx::log!("Healing background worker stopped");
    }
    /// Sleep for `interval`, waking every `SLEEP_SLICE` to observe `stop()`
    /// promptly instead of only after a full interval.
    ///
    /// In production as a full background worker, one would use
    /// PostgreSQL's WaitLatch for interruptible sleep.
    fn wait_for_interval(&self, interval: Duration) {
        let deadline = std::time::Instant::now() + interval;
        while self.state.is_running() {
            let now = std::time::Instant::now();
            if now >= deadline {
                break;
            }
            std::thread::sleep((deadline - now).min(Self::SLEEP_SLICE));
        }
    }
    /// Request the worker loop to stop after the current cycle.
    pub fn stop(&self) {
        self.state.stop();
    }
}
// ============================================================================
// Background Worker Entry Point
// ============================================================================
/// PostgreSQL background worker entry point
#[pgrx::pg_guard]
pub extern "C" fn healing_bgworker_main(_arg: pgrx::pg_sys::Datum) {
    pgrx::log!("RuVector healing background worker starting");
    // Run the worker loop with default configuration until stopped.
    HealingWorker::new(HealingWorkerConfig::default()).run();
}
/// Register the background worker with PostgreSQL
///
/// Currently a placeholder that only logs; the commented sketch below shows
/// where `pg_sys::RegisterBackgroundWorker` would be invoked.
pub fn register_healing_worker() {
    pgrx::log!("Registering RuVector healing background worker");
    // In production, use pg_sys::RegisterBackgroundWorker
    // This is a placeholder for now
    //
    // unsafe {
    //     let mut worker = pg_sys::BackgroundWorker::default();
    //     // Configure worker...
    //     pg_sys::RegisterBackgroundWorker(&mut worker);
    // }
}
// ============================================================================
// SQL Functions for Worker Control
// ============================================================================
use pgrx::prelude::*;
/// Start the healing background worker
#[pg_extern]
pub fn ruvector_healing_worker_start() -> bool {
    let engine = get_healing_engine();
    let guard = engine.read();
    let state = &guard.worker_state;
    if state.is_running() {
        pgrx::warning!("Healing worker is already running");
        false
    } else {
        // In production, would launch actual background worker
        state.start();
        pgrx::log!("Healing worker started");
        true
    }
}
/// Stop the healing background worker
#[pg_extern]
pub fn ruvector_healing_worker_stop() -> bool {
    let engine = get_healing_engine();
    let guard = engine.read();
    let state = &guard.worker_state;
    if state.is_running() {
        state.stop();
        pgrx::log!("Healing worker stopped");
        true
    } else {
        pgrx::warning!("Healing worker is not running");
        false
    }
}
/// Get healing worker status
#[pg_extern]
pub fn ruvector_healing_worker_status() -> pgrx::JsonB {
    let engine = get_healing_engine();
    let guard = engine.read();
    let stats = guard.worker_state.get_stats();
    let config = guard.worker_state.get_config();
    // Report the counters plus a summary of the active configuration.
    let config_json = serde_json::json!({
        "check_interval_secs": config.check_interval.as_secs(),
        "auto_remediate": config.auto_remediate,
        "min_auto_severity": config.min_auto_severity,
        "max_concurrent": config.max_concurrent,
    });
    pgrx::JsonB(serde_json::json!({
        "stats": stats.to_json(),
        "config": config_json,
    }))
}
/// Configure the healing worker
///
/// Any argument left NULL keeps the current value. Invalid values
/// (non-positive interval, severity outside 0..=4) are silently ignored.
/// Returns the resulting configuration.
#[pg_extern]
pub fn ruvector_healing_worker_config(
    check_interval_secs: Option<i32>,
    auto_remediate: Option<bool>,
    min_auto_severity: Option<i32>,
) -> pgrx::JsonB {
    let engine = get_healing_engine();
    let engine_lock = engine.read();
    let mut config = engine_lock.worker_state.get_config();
    if let Some(interval) = check_interval_secs {
        if interval > 0 {
            config.check_interval = Duration::from_secs(interval as u64);
        }
    }
    if let Some(auto_rem) = auto_remediate {
        config.auto_remediate = auto_rem;
    }
    if let Some(severity) = min_auto_severity {
        // Severity scale: 0=Info .. 4=Critical.
        if (0..=4).contains(&severity) {
            config.min_auto_severity = severity as u8;
        }
    }
    engine_lock.worker_state.set_config(config.clone());
    pgrx::JsonB(serde_json::json!({
        "status": "updated",
        "config": {
            "check_interval_secs": config.check_interval.as_secs(),
            "auto_remediate": config.auto_remediate,
            "min_auto_severity": config.min_auto_severity,
        }
    }))
}
/// Manually trigger a health check
///
/// Runs detection and remediation synchronously and returns the full
/// result (problems, outcomes, metrics, duration).
#[pg_extern]
pub fn ruvector_healing_check_now() -> pgrx::JsonB {
    let engine = get_healing_engine();
    let engine_lock = engine.read();
    let start = std::time::Instant::now();
    // Use the engine's own detector (as `HealingEngine::health_status` does)
    // rather than a fresh instance, so any detector state is honored.
    let metrics = engine_lock.detector.collect_metrics();
    let problems = engine_lock.detector.detect_problems(&metrics);
    let mut outcomes = Vec::new();
    for problem in &problems {
        let outcome = engine_lock.remediation.heal(problem);
        outcomes.push(outcome.to_json());
    }
    let result = serde_json::json!({
        "healthy": problems.is_empty(),
        "problems_found": problems.len(),
        "problems": problems.iter().map(|p| p.to_json()).collect::<Vec<_>>(),
        "outcomes": outcomes,
        "metrics": metrics.to_json(),
        "duration_ms": start.elapsed().as_millis() as u64,
    });
    pgrx::JsonB(result)
}
/// Get recent health check results
#[pg_extern]
pub fn ruvector_healing_recent_checks(limit: default!(i32, 10)) -> pgrx::JsonB {
    let engine = get_healing_engine();
    let guard = engine.read();
    let checks = guard.worker_state.get_recent_checks(limit as usize);
    let serialized: Vec<serde_json::Value> = checks.iter().map(|c| c.to_json()).collect();
    pgrx::JsonB(serde_json::json!({
        "checks": serialized,
        "count": checks.len(),
    }))
}
#[cfg(test)]
mod tests {
    //! Unit tests for worker state, configuration, and result recording.
    use super::*;
    #[test]
    fn test_worker_state() {
        let state = HealingWorkerState::new(HealingWorkerConfig::default());
        assert!(!state.is_running());
        state.start();
        assert!(state.is_running());
        state.stop();
        assert!(!state.is_running());
    }
    #[test]
    fn test_health_check_result() {
        let result = HealthCheckResult::healthy();
        assert!(result.healthy);
        assert_eq!(result.problems_found, 0);
    }
    #[test]
    fn test_worker_config() {
        let config = HealingWorkerConfig::default();
        assert!(config.auto_remediate);
        assert_eq!(config.min_auto_severity, 2);
    }
    #[test]
    fn test_state_recording() {
        let state = HealingWorkerState::new(HealingWorkerConfig::default());
        let result = HealthCheckResult {
            timestamp: 12345,
            healthy: false,
            problems_found: 2,
            remediations_triggered: 1,
            outcomes: vec![],
            metrics: None,
            duration_ms: 100,
        };
        state.record_check(result);
        let stats = state.get_stats();
        assert_eq!(stats.checks_completed, 1);
        assert_eq!(stats.problems_detected, 2);
        assert_eq!(stats.remediations_triggered, 1);
    }
    #[test]
    fn test_recent_checks() {
        let state = HealingWorkerState::new(HealingWorkerConfig::default());
        for i in 0..5 {
            state.record_check(HealthCheckResult {
                timestamp: i,
                healthy: true,
                problems_found: 0,
                remediations_triggered: 0,
                outcomes: vec![],
                metrics: None,
                duration_ms: 10,
            });
        }
        let recent = state.get_recent_checks(3);
        assert_eq!(recent.len(), 3);
        // Most recent first
        assert_eq!(recent[0].timestamp, 4);
    }
    #[test]
    fn test_worker_creation() {
        let worker = HealingWorker::new(HealingWorkerConfig::default());
        assert!(!worker.state().is_running());
    }
}