Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'
This commit is contained in:
825
vendor/ruvector/crates/ruvector-postgres/src/healing/detector.rs
vendored
Normal file
825
vendor/ruvector/crates/ruvector-postgres/src/healing/detector.rs
vendored
Normal file
@@ -0,0 +1,825 @@
|
||||
//! Problem Detection for Self-Healing Engine
|
||||
//!
|
||||
//! Implements continuous monitoring and problem classification:
|
||||
//! - IndexDegradation: Index performance has degraded
|
||||
//! - ReplicaLag: Replica is falling behind primary
|
||||
//! - StorageExhaustion: Storage space is running low
|
||||
//! - QueryTimeout: Queries are timing out excessively
|
||||
//! - IntegrityViolation: Graph integrity has been compromised
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use std::time::SystemTime;
|
||||
|
||||
use parking_lot::RwLock;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
// ============================================================================
|
||||
// Problem Types
|
||||
// ============================================================================
|
||||
|
||||
/// Types of problems that can be detected
///
/// NOTE(review): the serde derive serializes variants in UpperCamelCase
/// (e.g. "IndexDegradation") while the `Display`/`FromStr` impls below use
/// snake_case ("index_degradation") — confirm consumers expect both forms.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum ProblemType {
    /// Index performance has degraded (fragmentation, poor connectivity)
    IndexDegradation,
    /// Replica is lagging behind primary
    ReplicaLag,
    /// Storage space is running low
    StorageExhaustion,
    /// Queries are timing out excessively
    QueryTimeout,
    /// Graph integrity has been violated (mincut below threshold)
    IntegrityViolation,
    /// Memory pressure is high
    MemoryPressure,
    /// Connection pool exhaustion
    ConnectionExhaustion,
    /// Hot partition detected (uneven load distribution)
    HotPartition,
}
|
||||
|
||||
impl ProblemType {
|
||||
/// Get human-readable description
|
||||
pub fn description(&self) -> &'static str {
|
||||
match self {
|
||||
ProblemType::IndexDegradation => "Index performance degradation detected",
|
||||
ProblemType::ReplicaLag => "Replica lag exceeds threshold",
|
||||
ProblemType::StorageExhaustion => "Storage space running low",
|
||||
ProblemType::QueryTimeout => "Excessive query timeouts",
|
||||
ProblemType::IntegrityViolation => "Graph integrity violation",
|
||||
ProblemType::MemoryPressure => "Memory pressure detected",
|
||||
ProblemType::ConnectionExhaustion => "Connection pool exhausted",
|
||||
ProblemType::HotPartition => "Hot partition detected",
|
||||
}
|
||||
}
|
||||
|
||||
/// Get all problem types
|
||||
pub fn all() -> Vec<ProblemType> {
|
||||
vec![
|
||||
ProblemType::IndexDegradation,
|
||||
ProblemType::ReplicaLag,
|
||||
ProblemType::StorageExhaustion,
|
||||
ProblemType::QueryTimeout,
|
||||
ProblemType::IntegrityViolation,
|
||||
ProblemType::MemoryPressure,
|
||||
ProblemType::ConnectionExhaustion,
|
||||
ProblemType::HotPartition,
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for ProblemType {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
ProblemType::IndexDegradation => write!(f, "index_degradation"),
|
||||
ProblemType::ReplicaLag => write!(f, "replica_lag"),
|
||||
ProblemType::StorageExhaustion => write!(f, "storage_exhaustion"),
|
||||
ProblemType::QueryTimeout => write!(f, "query_timeout"),
|
||||
ProblemType::IntegrityViolation => write!(f, "integrity_violation"),
|
||||
ProblemType::MemoryPressure => write!(f, "memory_pressure"),
|
||||
ProblemType::ConnectionExhaustion => write!(f, "connection_exhaustion"),
|
||||
ProblemType::HotPartition => write!(f, "hot_partition"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::str::FromStr for ProblemType {
|
||||
type Err = String;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
match s.to_lowercase().as_str() {
|
||||
"index_degradation" | "indexdegradation" => Ok(ProblemType::IndexDegradation),
|
||||
"replica_lag" | "replicalag" => Ok(ProblemType::ReplicaLag),
|
||||
"storage_exhaustion" | "storageexhaustion" => Ok(ProblemType::StorageExhaustion),
|
||||
"query_timeout" | "querytimeout" => Ok(ProblemType::QueryTimeout),
|
||||
"integrity_violation" | "integrityviolation" => Ok(ProblemType::IntegrityViolation),
|
||||
"memory_pressure" | "memorypressure" => Ok(ProblemType::MemoryPressure),
|
||||
"connection_exhaustion" | "connectionexhaustion" => {
|
||||
Ok(ProblemType::ConnectionExhaustion)
|
||||
}
|
||||
"hot_partition" | "hotpartition" => Ok(ProblemType::HotPartition),
|
||||
_ => Err(format!("Unknown problem type: {}", s)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Severity Levels
|
||||
// ============================================================================
|
||||
|
||||
/// Problem severity levels
///
/// Variants are declared in ascending order of urgency; the derived
/// `PartialOrd`/`Ord` therefore give `Info < Low < Medium < High < Critical`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
pub enum Severity {
    /// Informational, no action required
    Info,
    /// Low severity, can be addressed during maintenance
    Low,
    /// Medium severity, should be addressed soon
    Medium,
    /// High severity, requires prompt attention
    High,
    /// Critical severity, immediate action required
    Critical,
}
|
||||
|
||||
impl Severity {
|
||||
/// Get numeric value for comparison
|
||||
pub fn value(&self) -> u8 {
|
||||
match self {
|
||||
Severity::Info => 0,
|
||||
Severity::Low => 1,
|
||||
Severity::Medium => 2,
|
||||
Severity::High => 3,
|
||||
Severity::Critical => 4,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Problem Definition
|
||||
// ============================================================================
|
||||
|
||||
/// A detected problem with full context
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Problem {
    /// Type of problem
    pub problem_type: ProblemType,
    /// Severity level
    pub severity: Severity,
    /// When the problem was detected
    // Serialized as whole seconds since the Unix epoch via `system_time_serde`.
    #[serde(with = "system_time_serde")]
    pub detected_at: SystemTime,
    /// Additional details about the problem (free-form JSON, shape varies
    /// per problem type — see the `check_*` sites that build it)
    pub details: serde_json::Value,
    /// Affected partition IDs (if applicable; empty when not partition-scoped)
    pub affected_partitions: Vec<i64>,
}
|
||||
|
||||
impl Problem {
|
||||
/// Create a new problem
|
||||
pub fn new(problem_type: ProblemType, severity: Severity) -> Self {
|
||||
Self {
|
||||
problem_type,
|
||||
severity,
|
||||
detected_at: SystemTime::now(),
|
||||
details: serde_json::json!({}),
|
||||
affected_partitions: vec![],
|
||||
}
|
||||
}
|
||||
|
||||
/// Add details to the problem
|
||||
pub fn with_details(mut self, details: serde_json::Value) -> Self {
|
||||
self.details = details;
|
||||
self
|
||||
}
|
||||
|
||||
/// Add affected partitions
|
||||
pub fn with_partitions(mut self, partitions: Vec<i64>) -> Self {
|
||||
self.affected_partitions = partitions;
|
||||
self
|
||||
}
|
||||
|
||||
/// Convert to JSON
|
||||
pub fn to_json(&self) -> serde_json::Value {
|
||||
let detected_ts = self
|
||||
.detected_at
|
||||
.duration_since(std::time::UNIX_EPOCH)
|
||||
.unwrap()
|
||||
.as_secs();
|
||||
|
||||
serde_json::json!({
|
||||
"problem_type": self.problem_type.to_string(),
|
||||
"severity": format!("{:?}", self.severity).to_lowercase(),
|
||||
"detected_at": detected_ts,
|
||||
"details": self.details,
|
||||
"affected_partitions": self.affected_partitions,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// Custom serde for SystemTime
|
||||
mod system_time_serde {
|
||||
use serde::{Deserialize, Deserializer, Serialize, Serializer};
|
||||
use std::time::{Duration, SystemTime, UNIX_EPOCH};
|
||||
|
||||
pub fn serialize<S>(time: &SystemTime, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: Serializer,
|
||||
{
|
||||
let duration = time.duration_since(UNIX_EPOCH).unwrap();
|
||||
duration.as_secs().serialize(serializer)
|
||||
}
|
||||
|
||||
pub fn deserialize<'de, D>(deserializer: D) -> Result<SystemTime, D::Error>
|
||||
where
|
||||
D: Deserializer<'de>,
|
||||
{
|
||||
let secs = u64::deserialize(deserializer)?;
|
||||
Ok(UNIX_EPOCH + Duration::from_secs(secs))
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Detection Thresholds
|
||||
// ============================================================================
|
||||
|
||||
/// Configurable thresholds for problem detection
///
/// All thresholds are upper bounds (a problem fires when the metric exceeds
/// the value) except `min_integrity_lambda`, which is a lower bound — see
/// `ProblemDetector::detect_problems`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DetectionThresholds {
    /// Index fragmentation percentage threshold (0-100)
    pub index_fragmentation_pct: f32,
    /// Replica lag in seconds threshold
    pub replica_lag_seconds: f32,
    /// Storage usage percentage threshold (0-100)
    pub storage_usage_pct: f32,
    /// Query timeout rate threshold (0-1)
    pub query_timeout_rate: f32,
    /// Minimum lambda (mincut) value for integrity
    pub min_integrity_lambda: f32,
    /// Memory usage percentage threshold (0-100)
    pub memory_usage_pct: f32,
    /// Connection pool usage percentage threshold (0-100)
    pub connection_usage_pct: f32,
    /// Partition load ratio threshold (vs average)
    pub partition_load_ratio: f32,
}
|
||||
|
||||
impl Default for DetectionThresholds {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
index_fragmentation_pct: 30.0,
|
||||
replica_lag_seconds: 5.0,
|
||||
storage_usage_pct: 85.0,
|
||||
query_timeout_rate: 0.05, // 5% timeout rate
|
||||
min_integrity_lambda: 0.5,
|
||||
memory_usage_pct: 85.0,
|
||||
connection_usage_pct: 90.0,
|
||||
partition_load_ratio: 3.0, // 3x average load
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// System Metrics
|
||||
// ============================================================================
|
||||
|
||||
/// System metrics collected for problem detection
///
/// `Default` yields empty maps and zeroed values, which the detector treats
/// as a fully healthy state (no thresholds exceeded).
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct SystemMetrics {
    /// Index fragmentation percentage per index
    pub index_fragmentation: HashMap<String, f32>,
    /// Replica lag in seconds per replica
    pub replica_lag: HashMap<String, f32>,
    /// Storage usage percentage
    pub storage_usage_pct: f32,
    /// Query timeout rate (0-1)
    pub query_timeout_rate: f32,
    /// Current integrity lambda value
    // 0.0 is treated as "unknown" by detect_problems and never flagged.
    pub integrity_lambda: f32,
    /// Memory usage percentage
    pub memory_usage_pct: f32,
    /// Connection pool usage percentage
    pub connection_usage_pct: f32,
    /// Load per partition
    pub partition_loads: HashMap<i64, f64>,
    /// Witness edges from mincut computation
    pub witness_edges: Vec<WitnessEdge>,
    /// Maintenance queue depth
    pub maintenance_queue_depth: usize,
    /// Top memory consumers
    // Not included in `to_json` output.
    pub top_memory_consumers: Vec<(String, usize)>,
    /// Fragmented index IDs
    // Not included in `to_json` output.
    pub fragmented_indexes: Vec<i64>,
    /// Timestamp of metrics collection (seconds since Unix epoch)
    pub collected_at: u64,
}
|
||||
|
||||
impl SystemMetrics {
|
||||
/// Create new empty metrics
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
collected_at: SystemTime::now()
|
||||
.duration_since(std::time::UNIX_EPOCH)
|
||||
.unwrap()
|
||||
.as_secs(),
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert to JSON
|
||||
pub fn to_json(&self) -> serde_json::Value {
|
||||
serde_json::json!({
|
||||
"index_fragmentation": self.index_fragmentation,
|
||||
"replica_lag": self.replica_lag,
|
||||
"storage_usage_pct": self.storage_usage_pct,
|
||||
"query_timeout_rate": self.query_timeout_rate,
|
||||
"integrity_lambda": self.integrity_lambda,
|
||||
"memory_usage_pct": self.memory_usage_pct,
|
||||
"connection_usage_pct": self.connection_usage_pct,
|
||||
"partition_loads": self.partition_loads,
|
||||
"witness_edge_count": self.witness_edges.len(),
|
||||
"maintenance_queue_depth": self.maintenance_queue_depth,
|
||||
"collected_at": self.collected_at,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Witness edge from mincut computation
///
/// A single edge participating in the minimum cut; the detector currently
/// only reports the count of these (see `SystemMetrics::to_json`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WitnessEdge {
    /// Source node ID
    pub from: i64,
    /// Target node ID
    pub to: i64,
    /// Edge type (e.g., "partition_link", "replication", "dependency")
    pub edge_type: String,
    /// Edge weight/capacity
    pub weight: f32,
}
|
||||
|
||||
// ============================================================================
|
||||
// Problem Detector
|
||||
// ============================================================================
|
||||
|
||||
/// Problem detector with configurable thresholds
///
/// All state is interior-mutable (`RwLock` + atomics), so detection can run
/// through a shared `&self` reference.
pub struct ProblemDetector {
    /// Detection thresholds
    thresholds: RwLock<DetectionThresholds>,
    /// Number of problems detected (cumulative over the detector's lifetime)
    problems_detected: AtomicU64,
    /// Last detection timestamp (seconds since Unix epoch; 0 = never run)
    last_detection: AtomicU64,
}
|
||||
|
||||
impl ProblemDetector {
|
||||
/// Create a new problem detector with default thresholds
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
thresholds: RwLock::new(DetectionThresholds::default()),
|
||||
problems_detected: AtomicU64::new(0),
|
||||
last_detection: AtomicU64::new(0),
|
||||
}
|
||||
}
|
||||
|
||||
/// Create with custom thresholds
|
||||
pub fn with_thresholds(thresholds: DetectionThresholds) -> Self {
|
||||
Self {
|
||||
thresholds: RwLock::new(thresholds),
|
||||
problems_detected: AtomicU64::new(0),
|
||||
last_detection: AtomicU64::new(0),
|
||||
}
|
||||
}
|
||||
|
||||
/// Update thresholds
|
||||
pub fn update_thresholds(&self, thresholds: DetectionThresholds) {
|
||||
*self.thresholds.write() = thresholds;
|
||||
}
|
||||
|
||||
/// Get current thresholds
|
||||
pub fn get_thresholds(&self) -> DetectionThresholds {
|
||||
self.thresholds.read().clone()
|
||||
}
|
||||
|
||||
/// Collect current system metrics
|
||||
pub fn collect_metrics(&self) -> SystemMetrics {
|
||||
let mut metrics = SystemMetrics::new();
|
||||
|
||||
// In production, these would query PostgreSQL system catalogs
|
||||
// and index statistics. For now, we simulate with reasonable defaults.
|
||||
|
||||
// Query pg_stat_user_indexes for fragmentation
|
||||
metrics.index_fragmentation = self.collect_index_fragmentation();
|
||||
|
||||
// Query pg_stat_replication for replica lag
|
||||
metrics.replica_lag = self.collect_replica_lag();
|
||||
|
||||
// Query pg_tablespace for storage usage
|
||||
metrics.storage_usage_pct = self.collect_storage_usage();
|
||||
|
||||
// Query pg_stat_statements for timeout rate
|
||||
metrics.query_timeout_rate = self.collect_query_timeout_rate();
|
||||
|
||||
// Get integrity lambda from mincut computation
|
||||
metrics.integrity_lambda = self.collect_integrity_lambda();
|
||||
|
||||
// Query memory usage
|
||||
metrics.memory_usage_pct = self.collect_memory_usage();
|
||||
|
||||
// Query connection pool usage
|
||||
metrics.connection_usage_pct = self.collect_connection_usage();
|
||||
|
||||
// Query partition loads
|
||||
metrics.partition_loads = self.collect_partition_loads();
|
||||
|
||||
// Get witness edges from mincut
|
||||
metrics.witness_edges = self.collect_witness_edges();
|
||||
|
||||
metrics
|
||||
}
|
||||
|
||||
/// Detect problems from collected metrics
|
||||
pub fn detect_problems(&self, metrics: &SystemMetrics) -> Vec<Problem> {
|
||||
let thresholds = self.thresholds.read();
|
||||
let mut problems = Vec::new();
|
||||
|
||||
// Check index fragmentation
|
||||
for (index_name, frag_pct) in &metrics.index_fragmentation {
|
||||
if *frag_pct > thresholds.index_fragmentation_pct {
|
||||
let severity = if *frag_pct > 60.0 {
|
||||
Severity::High
|
||||
} else if *frag_pct > 45.0 {
|
||||
Severity::Medium
|
||||
} else {
|
||||
Severity::Low
|
||||
};
|
||||
|
||||
problems.push(
|
||||
Problem::new(ProblemType::IndexDegradation, severity).with_details(
|
||||
serde_json::json!({
|
||||
"index_name": index_name,
|
||||
"fragmentation_pct": frag_pct,
|
||||
"threshold": thresholds.index_fragmentation_pct,
|
||||
}),
|
||||
),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Check replica lag
|
||||
for (replica_id, lag_seconds) in &metrics.replica_lag {
|
||||
if *lag_seconds > thresholds.replica_lag_seconds {
|
||||
let severity = if *lag_seconds > 30.0 {
|
||||
Severity::Critical
|
||||
} else if *lag_seconds > 15.0 {
|
||||
Severity::High
|
||||
} else if *lag_seconds > 10.0 {
|
||||
Severity::Medium
|
||||
} else {
|
||||
Severity::Low
|
||||
};
|
||||
|
||||
problems.push(
|
||||
Problem::new(ProblemType::ReplicaLag, severity).with_details(
|
||||
serde_json::json!({
|
||||
"replica_id": replica_id,
|
||||
"lag_seconds": lag_seconds,
|
||||
"threshold": thresholds.replica_lag_seconds,
|
||||
}),
|
||||
),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Check storage usage
|
||||
if metrics.storage_usage_pct > thresholds.storage_usage_pct {
|
||||
let severity = if metrics.storage_usage_pct > 95.0 {
|
||||
Severity::Critical
|
||||
} else if metrics.storage_usage_pct > 90.0 {
|
||||
Severity::High
|
||||
} else {
|
||||
Severity::Medium
|
||||
};
|
||||
|
||||
problems.push(
|
||||
Problem::new(ProblemType::StorageExhaustion, severity).with_details(
|
||||
serde_json::json!({
|
||||
"usage_pct": metrics.storage_usage_pct,
|
||||
"threshold": thresholds.storage_usage_pct,
|
||||
}),
|
||||
),
|
||||
);
|
||||
}
|
||||
|
||||
// Check query timeout rate
|
||||
if metrics.query_timeout_rate > thresholds.query_timeout_rate {
|
||||
let severity = if metrics.query_timeout_rate > 0.20 {
|
||||
Severity::Critical
|
||||
} else if metrics.query_timeout_rate > 0.10 {
|
||||
Severity::High
|
||||
} else {
|
||||
Severity::Medium
|
||||
};
|
||||
|
||||
problems.push(
|
||||
Problem::new(ProblemType::QueryTimeout, severity).with_details(serde_json::json!({
|
||||
"timeout_rate": metrics.query_timeout_rate,
|
||||
"threshold": thresholds.query_timeout_rate,
|
||||
})),
|
||||
);
|
||||
}
|
||||
|
||||
// Check integrity lambda
|
||||
if metrics.integrity_lambda < thresholds.min_integrity_lambda
|
||||
&& metrics.integrity_lambda > 0.0
|
||||
{
|
||||
let severity = if metrics.integrity_lambda < 0.2 {
|
||||
Severity::Critical
|
||||
} else if metrics.integrity_lambda < 0.35 {
|
||||
Severity::High
|
||||
} else {
|
||||
Severity::Medium
|
||||
};
|
||||
|
||||
problems.push(
|
||||
Problem::new(ProblemType::IntegrityViolation, severity).with_details(
|
||||
serde_json::json!({
|
||||
"lambda": metrics.integrity_lambda,
|
||||
"threshold": thresholds.min_integrity_lambda,
|
||||
"witness_edges": metrics.witness_edges.len(),
|
||||
}),
|
||||
),
|
||||
);
|
||||
}
|
||||
|
||||
// Check memory pressure
|
||||
if metrics.memory_usage_pct > thresholds.memory_usage_pct {
|
||||
let severity = if metrics.memory_usage_pct > 95.0 {
|
||||
Severity::Critical
|
||||
} else if metrics.memory_usage_pct > 90.0 {
|
||||
Severity::High
|
||||
} else {
|
||||
Severity::Medium
|
||||
};
|
||||
|
||||
problems.push(
|
||||
Problem::new(ProblemType::MemoryPressure, severity).with_details(
|
||||
serde_json::json!({
|
||||
"usage_pct": metrics.memory_usage_pct,
|
||||
"threshold": thresholds.memory_usage_pct,
|
||||
}),
|
||||
),
|
||||
);
|
||||
}
|
||||
|
||||
// Check connection exhaustion
|
||||
if metrics.connection_usage_pct > thresholds.connection_usage_pct {
|
||||
let severity = if metrics.connection_usage_pct > 98.0 {
|
||||
Severity::Critical
|
||||
} else if metrics.connection_usage_pct > 95.0 {
|
||||
Severity::High
|
||||
} else {
|
||||
Severity::Medium
|
||||
};
|
||||
|
||||
problems.push(
|
||||
Problem::new(ProblemType::ConnectionExhaustion, severity).with_details(
|
||||
serde_json::json!({
|
||||
"usage_pct": metrics.connection_usage_pct,
|
||||
"threshold": thresholds.connection_usage_pct,
|
||||
}),
|
||||
),
|
||||
);
|
||||
}
|
||||
|
||||
// Check for hot partitions
|
||||
if !metrics.partition_loads.is_empty() {
|
||||
let avg_load: f64 = metrics.partition_loads.values().sum::<f64>()
|
||||
/ metrics.partition_loads.len() as f64;
|
||||
|
||||
let hot_partitions: Vec<i64> = metrics
|
||||
.partition_loads
|
||||
.iter()
|
||||
.filter(|(_, load)| **load > avg_load * thresholds.partition_load_ratio as f64)
|
||||
.map(|(id, _)| *id)
|
||||
.collect();
|
||||
|
||||
if !hot_partitions.is_empty() {
|
||||
let max_ratio = hot_partitions
|
||||
.iter()
|
||||
.filter_map(|id| metrics.partition_loads.get(id))
|
||||
.map(|load| *load / avg_load)
|
||||
.fold(0.0_f64, f64::max);
|
||||
|
||||
let severity = if max_ratio > 10.0 {
|
||||
Severity::High
|
||||
} else if max_ratio > 5.0 {
|
||||
Severity::Medium
|
||||
} else {
|
||||
Severity::Low
|
||||
};
|
||||
|
||||
problems.push(
|
||||
Problem::new(ProblemType::HotPartition, severity)
|
||||
.with_details(serde_json::json!({
|
||||
"avg_load": avg_load,
|
||||
"max_ratio": max_ratio,
|
||||
"threshold_ratio": thresholds.partition_load_ratio,
|
||||
}))
|
||||
.with_partitions(hot_partitions),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Update statistics
|
||||
self.problems_detected
|
||||
.fetch_add(problems.len() as u64, Ordering::SeqCst);
|
||||
self.last_detection.store(
|
||||
SystemTime::now()
|
||||
.duration_since(std::time::UNIX_EPOCH)
|
||||
.unwrap()
|
||||
.as_secs(),
|
||||
Ordering::SeqCst,
|
||||
);
|
||||
|
||||
problems
|
||||
}
|
||||
|
||||
/// Get detection statistics
|
||||
pub fn get_stats(&self) -> DetectorStats {
|
||||
DetectorStats {
|
||||
problems_detected: self.problems_detected.load(Ordering::SeqCst),
|
||||
last_detection: self.last_detection.load(Ordering::SeqCst),
|
||||
}
|
||||
}
|
||||
|
||||
// ========================================================================
|
||||
// Metric Collection Helpers (would use SPI in production)
|
||||
// ========================================================================
|
||||
|
||||
fn collect_index_fragmentation(&self) -> HashMap<String, f32> {
|
||||
// In production: Query pg_stat_user_indexes and compute fragmentation
|
||||
// For now, return empty (healthy state)
|
||||
HashMap::new()
|
||||
}
|
||||
|
||||
fn collect_replica_lag(&self) -> HashMap<String, f32> {
|
||||
// In production: Query pg_stat_replication
|
||||
HashMap::new()
|
||||
}
|
||||
|
||||
fn collect_storage_usage(&self) -> f32 {
|
||||
// In production: Query pg_tablespace sizes
|
||||
0.0
|
||||
}
|
||||
|
||||
fn collect_query_timeout_rate(&self) -> f32 {
|
||||
// In production: Query pg_stat_statements for timeout metrics
|
||||
0.0
|
||||
}
|
||||
|
||||
fn collect_integrity_lambda(&self) -> f32 {
|
||||
// In production: Get from integrity control plane
|
||||
1.0 // Healthy default
|
||||
}
|
||||
|
||||
fn collect_memory_usage(&self) -> f32 {
|
||||
// In production: Query pg_shmem_allocations or OS metrics
|
||||
0.0
|
||||
}
|
||||
|
||||
fn collect_connection_usage(&self) -> f32 {
|
||||
// In production: Query pg_stat_activity vs max_connections
|
||||
0.0
|
||||
}
|
||||
|
||||
fn collect_partition_loads(&self) -> HashMap<i64, f64> {
|
||||
// In production: Query partition statistics
|
||||
HashMap::new()
|
||||
}
|
||||
|
||||
fn collect_witness_edges(&self) -> Vec<WitnessEdge> {
|
||||
// In production: Get from mincut computation
|
||||
Vec::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ProblemDetector {
    /// Equivalent to [`ProblemDetector::new`]: default thresholds, zeroed counters.
    fn default() -> Self {
        Self::new()
    }
}
|
||||
|
||||
/// Detector statistics
#[derive(Debug, Clone)]
pub struct DetectorStats {
    /// Cumulative number of problems detected over the detector's lifetime.
    pub problems_detected: u64,
    /// Unix timestamp (seconds) of the most recent detection run; 0 if never run.
    pub last_detection: u64,
}
|
||||
|
||||
// Unit tests: exercise Display/FromStr round-trips, each threshold check's
// severity grading, and the healthy-metrics (no problems) baseline.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_problem_type_display() {
        assert_eq!(
            ProblemType::IndexDegradation.to_string(),
            "index_degradation"
        );
        assert_eq!(ProblemType::ReplicaLag.to_string(), "replica_lag");
        assert_eq!(
            ProblemType::IntegrityViolation.to_string(),
            "integrity_violation"
        );
    }

    #[test]
    fn test_problem_type_parse() {
        assert_eq!(
            "index_degradation".parse::<ProblemType>().unwrap(),
            ProblemType::IndexDegradation
        );
        assert_eq!(
            "replica_lag".parse::<ProblemType>().unwrap(),
            ProblemType::ReplicaLag
        );
    }

    #[test]
    fn test_detect_index_degradation() {
        let detector = ProblemDetector::new();

        let mut metrics = SystemMetrics::new();
        // 50% fragmentation: above the 30% default threshold, in the
        // 45..=60 band that grades as Medium.
        metrics
            .index_fragmentation
            .insert("test_idx".to_string(), 50.0);

        let problems = detector.detect_problems(&metrics);

        assert_eq!(problems.len(), 1);
        assert_eq!(problems[0].problem_type, ProblemType::IndexDegradation);
        assert_eq!(problems[0].severity, Severity::Medium);
    }

    #[test]
    fn test_detect_storage_exhaustion() {
        let detector = ProblemDetector::new();

        let mut metrics = SystemMetrics::new();
        // 92% usage: above the 85% threshold and above 90% => High.
        metrics.storage_usage_pct = 92.0;

        let problems = detector.detect_problems(&metrics);

        assert_eq!(problems.len(), 1);
        assert_eq!(problems[0].problem_type, ProblemType::StorageExhaustion);
        assert_eq!(problems[0].severity, Severity::High);
    }

    #[test]
    fn test_detect_integrity_violation() {
        let detector = ProblemDetector::new();

        let mut metrics = SystemMetrics::new();
        // Lambda 0.3: below the 0.5 floor and below 0.35 => High.
        metrics.integrity_lambda = 0.3;

        let problems = detector.detect_problems(&metrics);

        assert_eq!(problems.len(), 1);
        assert_eq!(problems[0].problem_type, ProblemType::IntegrityViolation);
        assert_eq!(problems[0].severity, Severity::High);
    }

    #[test]
    fn test_detect_hot_partition() {
        let detector = ProblemDetector::new();

        let mut metrics = SystemMetrics::new();
        metrics.partition_loads.insert(1, 100.0);
        metrics.partition_loads.insert(2, 100.0);
        metrics.partition_loads.insert(3, 500.0); // Hot partition

        let problems = detector.detect_problems(&metrics);

        assert_eq!(problems.len(), 1);
        assert_eq!(problems[0].problem_type, ProblemType::HotPartition);
        assert!(problems[0].affected_partitions.contains(&3));
    }

    #[test]
    fn test_severity_ordering() {
        // Relies on the derived Ord following declaration order.
        assert!(Severity::Critical > Severity::High);
        assert!(Severity::High > Severity::Medium);
        assert!(Severity::Medium > Severity::Low);
        assert!(Severity::Low > Severity::Info);
    }

    #[test]
    fn test_healthy_metrics_no_problems() {
        let detector = ProblemDetector::new();
        let metrics = SystemMetrics::new();

        let problems = detector.detect_problems(&metrics);

        assert!(problems.is_empty());
    }

    #[test]
    fn test_custom_thresholds() {
        let thresholds = DetectionThresholds {
            index_fragmentation_pct: 10.0, // More sensitive
            ..Default::default()
        };
        let detector = ProblemDetector::with_thresholds(thresholds);

        let mut metrics = SystemMetrics::new();
        metrics
            .index_fragmentation
            .insert("test_idx".to_string(), 15.0);

        let problems = detector.detect_problems(&metrics);

        assert_eq!(problems.len(), 1);
        assert_eq!(problems[0].problem_type, ProblemType::IndexDegradation);
    }
}
|
||||
788
vendor/ruvector/crates/ruvector-postgres/src/healing/engine.rs
vendored
Normal file
788
vendor/ruvector/crates/ruvector-postgres/src/healing/engine.rs
vendored
Normal file
@@ -0,0 +1,788 @@
|
||||
//! Remediation Engine for Self-Healing System
|
||||
//!
|
||||
//! Orchestrates remediation execution with:
|
||||
//! - Strategy selection based on problem type and weights
|
||||
//! - Execution with timeout and rollback capability
|
||||
//! - Outcome verification
|
||||
//! - Cooldown periods to prevent thrashing
|
||||
|
||||
use std::collections::{HashMap, VecDeque};
|
||||
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
|
||||
use std::time::{Duration, SystemTime, UNIX_EPOCH};
|
||||
|
||||
use parking_lot::RwLock;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::detector::{Problem, ProblemType, SystemMetrics};
|
||||
use super::learning::OutcomeTracker;
|
||||
use super::strategies::{
|
||||
RemediationResult, RemediationStrategy, StrategyContext, StrategyRegistry,
|
||||
};
|
||||
|
||||
// ============================================================================
|
||||
// Healing Configuration
|
||||
// ============================================================================
|
||||
|
||||
/// Configuration for the healing engine
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealingConfig {
    /// Minimum time between healing attempts for same problem type
    pub min_healing_interval: Duration,
    /// Maximum attempts per time window
    pub max_attempts_per_window: usize,
    /// Time window for attempt counting
    pub attempt_window: Duration,
    /// Maximum impact level for auto-healing (0-1)
    pub max_auto_heal_impact: f32,
    /// Problem types that require human approval
    pub require_approval: Vec<ProblemType>,
    /// Strategy names that require human approval
    pub require_approval_strategies: Vec<String>,
    /// Enable learning from outcomes
    pub learning_enabled: bool,
    /// Cooldown after failed remediation
    pub failure_cooldown: Duration,
    /// Whether to verify improvement after remediation
    pub verify_improvement: bool,
    /// Minimum improvement percentage to consider success
    pub min_improvement_pct: f32,
    /// Maximum concurrent remediations
    pub max_concurrent_remediations: usize,
}
|
||||
|
||||
impl Default for HealingConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
min_healing_interval: Duration::from_secs(300), // 5 minutes
|
||||
max_attempts_per_window: 3,
|
||||
attempt_window: Duration::from_secs(3600), // 1 hour
|
||||
max_auto_heal_impact: 0.5,
|
||||
require_approval: vec![],
|
||||
require_approval_strategies: vec!["promote_replica".to_string()],
|
||||
learning_enabled: true,
|
||||
failure_cooldown: Duration::from_secs(600), // 10 minutes
|
||||
verify_improvement: true,
|
||||
min_improvement_pct: 5.0,
|
||||
max_concurrent_remediations: 2,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Healing Outcome
|
||||
// ============================================================================
|
||||
|
||||
/// Outcome of a healing attempt
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum HealingOutcome {
    /// Healing completed (may or may not have succeeded)
    Completed {
        problem_type: ProblemType,
        // Name of the strategy that ran.
        strategy: String,
        // Raw result reported by the strategy.
        result: RemediationResult,
        // Whether post-remediation verification confirmed improvement.
        verified: bool,
    },
    /// Healing was deferred (needs approval or cooldown)
    Deferred {
        reason: String,
        problem_type: ProblemType,
    },
    /// No suitable strategy found
    NoStrategy { problem_type: ProblemType },
    /// Healing is disabled
    Disabled,
    /// Already at maximum concurrent remediations
    MaxConcurrent,
}
|
||||
|
||||
impl HealingOutcome {
|
||||
/// Convert to JSON
|
||||
pub fn to_json(&self) -> serde_json::Value {
|
||||
match self {
|
||||
HealingOutcome::Completed {
|
||||
problem_type,
|
||||
strategy,
|
||||
result,
|
||||
verified,
|
||||
} => {
|
||||
serde_json::json!({
|
||||
"status": "completed",
|
||||
"problem_type": problem_type.to_string(),
|
||||
"strategy": strategy,
|
||||
"result": result.to_json(),
|
||||
"verified": verified,
|
||||
})
|
||||
}
|
||||
HealingOutcome::Deferred {
|
||||
reason,
|
||||
problem_type,
|
||||
} => {
|
||||
serde_json::json!({
|
||||
"status": "deferred",
|
||||
"reason": reason,
|
||||
"problem_type": problem_type.to_string(),
|
||||
})
|
||||
}
|
||||
HealingOutcome::NoStrategy { problem_type } => {
|
||||
serde_json::json!({
|
||||
"status": "no_strategy",
|
||||
"problem_type": problem_type.to_string(),
|
||||
})
|
||||
}
|
||||
HealingOutcome::Disabled => {
|
||||
serde_json::json!({
|
||||
"status": "disabled",
|
||||
})
|
||||
}
|
||||
HealingOutcome::MaxConcurrent => {
|
||||
serde_json::json!({
|
||||
"status": "max_concurrent",
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Active Remediation
|
||||
// ============================================================================
|
||||
|
||||
/// An active remediation in progress, tracked while a strategy executes so
/// that status queries can report in-flight work.
#[derive(Debug, Clone)]
pub struct ActiveRemediation {
    /// Unique ID (monotonically assigned from `RemediationEngine::next_id`)
    pub id: u64,
    /// Problem being remediated
    pub problem: Problem,
    /// Name of the strategy being used
    pub strategy_name: String,
    /// When remediation started
    pub started_at: SystemTime,
    /// Expected completion time (start + strategy's estimated duration)
    pub expected_completion: SystemTime,
}
|
||||
|
||||
impl ActiveRemediation {
|
||||
/// Convert to JSON
|
||||
pub fn to_json(&self) -> serde_json::Value {
|
||||
let started_ts = self
|
||||
.started_at
|
||||
.duration_since(UNIX_EPOCH)
|
||||
.unwrap()
|
||||
.as_secs();
|
||||
let expected_ts = self
|
||||
.expected_completion
|
||||
.duration_since(UNIX_EPOCH)
|
||||
.unwrap()
|
||||
.as_secs();
|
||||
|
||||
serde_json::json!({
|
||||
"id": self.id,
|
||||
"problem_type": self.problem.problem_type.to_string(),
|
||||
"strategy": self.strategy_name,
|
||||
"started_at": started_ts,
|
||||
"expected_completion": expected_ts,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Remediation Context
|
||||
// ============================================================================
|
||||
|
||||
/// Full context for remediation execution. Wider than [`StrategyContext`]:
/// it also carries tenant/attempt bookkeeping used by the engine itself.
#[derive(Debug, Clone)]
pub struct RemediationContext {
    /// The problem being remediated
    pub problem: Problem,
    /// Collection/table being remediated (0 when not scoped to one)
    pub collection_id: i64,
    /// Tenant ID (for multi-tenant deployments)
    pub tenant_id: Option<String>,
    /// Integrity lambda measured at context creation
    pub initial_lambda: f32,
    /// Integrity lambda the remediation is trying to reach
    pub target_lambda: f32,
    /// System metrics snapshot at start
    pub initial_metrics: SystemMetrics,
    /// When this context was created
    pub created_at: SystemTime,
    /// Maximum impact the remediation is allowed to have (0.0..=1.0)
    pub max_impact: f32,
    /// Timeout for the remediation
    pub timeout: Duration,
    /// Healing attempts already made in the current window
    pub attempts_in_window: usize,
    /// Time of the last healing attempt, if any
    pub last_attempt: Option<SystemTime>,
}
|
||||
|
||||
impl RemediationContext {
|
||||
/// Create a new remediation context
|
||||
pub fn new(problem: Problem, metrics: SystemMetrics) -> Self {
|
||||
Self {
|
||||
problem,
|
||||
collection_id: 0,
|
||||
tenant_id: None,
|
||||
initial_lambda: metrics.integrity_lambda,
|
||||
target_lambda: 0.8,
|
||||
initial_metrics: metrics,
|
||||
created_at: SystemTime::now(),
|
||||
max_impact: 0.5,
|
||||
timeout: Duration::from_secs(300),
|
||||
attempts_in_window: 0,
|
||||
last_attempt: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Set collection ID
|
||||
pub fn with_collection(mut self, collection_id: i64) -> Self {
|
||||
self.collection_id = collection_id;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set tenant ID
|
||||
pub fn with_tenant(mut self, tenant_id: String) -> Self {
|
||||
self.tenant_id = Some(tenant_id);
|
||||
self
|
||||
}
|
||||
|
||||
/// Create strategy context
|
||||
pub fn to_strategy_context(&self) -> StrategyContext {
|
||||
StrategyContext {
|
||||
problem: self.problem.clone(),
|
||||
collection_id: self.collection_id,
|
||||
initial_lambda: self.initial_lambda,
|
||||
target_lambda: self.target_lambda,
|
||||
max_impact: self.max_impact,
|
||||
timeout: self.timeout,
|
||||
start_time: SystemTime::now(),
|
||||
dry_run: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Remediation Engine
|
||||
// ============================================================================
|
||||
|
||||
/// The main remediation engine: selects and executes healing strategies,
/// enforces cooldowns/attempt limits, and records outcomes for learning.
///
/// Interior mutability throughout (`RwLock`/atomics) so a shared reference
/// can be used from concurrent callers.
pub struct RemediationEngine {
    /// Strategy registry (public so callers can execute strategies manually)
    pub registry: StrategyRegistry,
    /// Configuration (hot-swappable via `update_config`)
    config: RwLock<HealingConfig>,
    /// Outcome tracker used for learning/history
    tracker: OutcomeTracker,
    /// Remediations currently in flight
    active: RwLock<Vec<ActiveRemediation>>,
    /// Next remediation ID (monotonically increasing, starts at 1)
    next_id: AtomicU64,
    /// Healing attempt history (problem_type -> attempt timestamps,
    /// oldest first; pruned to the last 24h on insert)
    attempt_history: RwLock<HashMap<ProblemType, VecDeque<SystemTime>>>,
    /// Whether the engine is enabled
    enabled: AtomicBool,
    /// Total healings attempted (lifetime counter)
    total_healings: AtomicU64,
    /// Healings whose improvement was verified (lifetime counter)
    successful_healings: AtomicU64,
}
|
||||
|
||||
impl RemediationEngine {
    /// Create a new remediation engine. Starts enabled, with no active
    /// remediations and an empty attempt history.
    pub fn new(registry: StrategyRegistry, config: HealingConfig, tracker: OutcomeTracker) -> Self {
        Self {
            registry,
            config: RwLock::new(config),
            tracker,
            active: RwLock::new(Vec::new()),
            next_id: AtomicU64::new(1),
            attempt_history: RwLock::new(HashMap::new()),
            enabled: AtomicBool::new(true),
            total_healings: AtomicU64::new(0),
            successful_healings: AtomicU64::new(0),
        }
    }

    /// Enable or disable the engine. A disabled engine returns
    /// `HealingOutcome::Disabled` from `heal` without doing any work.
    pub fn set_enabled(&self, enabled: bool) {
        self.enabled.store(enabled, Ordering::SeqCst);
    }

    /// Check if engine is enabled.
    pub fn is_enabled(&self) -> bool {
        self.enabled.load(Ordering::SeqCst)
    }

    /// Replace the configuration atomically (takes effect on the next
    /// `heal` call; an in-flight heal keeps its cloned snapshot).
    pub fn update_config(&self, config: HealingConfig) {
        *self.config.write() = config;
    }

    /// Get a clone of the current configuration.
    pub fn get_config(&self) -> HealingConfig {
        self.config.read().clone()
    }

    /// Snapshot of remediations currently in flight.
    pub fn active_remediations(&self) -> Vec<ActiveRemediation> {
        self.active.read().clone()
    }

    /// Main healing entry point: gate on enablement/limits/approval, select
    /// a strategy, execute it, verify, roll back if needed, and record the
    /// outcome for learning.
    ///
    /// NOTE(review): the concurrent-limit check below and the later push
    /// onto `active` are separate lock acquisitions, so two racing callers
    /// can both pass the check and briefly exceed
    /// `max_concurrent_remediations` — confirm whether callers serialize
    /// `heal` externally.
    pub fn heal(&self, problem: &Problem) -> HealingOutcome {
        // Check if enabled
        if !self.is_enabled() {
            return HealingOutcome::Disabled;
        }

        // Snapshot config so the whole heal uses one consistent view.
        let config = self.config.read().clone();

        // Check concurrent limit
        if self.active.read().len() >= config.max_concurrent_remediations {
            return HealingOutcome::MaxConcurrent;
        }

        // Check approval / cooldown / attempt-window gates.
        if !self.should_auto_heal(problem, &config) {
            return HealingOutcome::Deferred {
                reason: self.get_defer_reason(problem, &config),
                problem_type: problem.problem_type,
            };
        }

        // Select strategy (weighted by learned success, bounded by impact).
        let strategy = match self.registry.select(problem, config.max_auto_heal_impact) {
            Some(s) => s,
            None => {
                return HealingOutcome::NoStrategy {
                    problem_type: problem.problem_type,
                };
            }
        };

        // Check if the selected strategy itself requires human approval.
        if config
            .require_approval_strategies
            .contains(&strategy.name().to_string())
        {
            return HealingOutcome::Deferred {
                reason: format!("Strategy '{}' requires human approval", strategy.name()),
                problem_type: problem.problem_type,
            };
        }

        // Record attempt (counts toward cooldown/window even if it fails).
        self.record_attempt(problem.problem_type);
        self.total_healings.fetch_add(1, Ordering::SeqCst);

        // Register the in-flight remediation so status queries can see it.
        let remediation_id = self.next_id.fetch_add(1, Ordering::SeqCst);
        let active_rem = ActiveRemediation {
            id: remediation_id,
            problem: problem.clone(),
            strategy_name: strategy.name().to_string(),
            started_at: SystemTime::now(),
            expected_completion: SystemTime::now() + strategy.estimated_duration(),
        };
        self.active.write().push(active_rem);

        // Execute strategy with a fresh context. Lambda values here are
        // placeholders (1.0 -> 0.8); real metrics are not wired in yet.
        let context = StrategyContext {
            problem: problem.clone(),
            collection_id: 0,
            initial_lambda: 1.0,
            target_lambda: 0.8,
            max_impact: config.max_auto_heal_impact,
            timeout: strategy.estimated_duration() * 2,
            start_time: SystemTime::now(),
            dry_run: false,
        };

        let result = self.execute_with_safeguards(&*strategy, &context);

        // Remove from active regardless of result.
        self.active.write().retain(|r| r.id != remediation_id);

        // Verify improvement: only consult the improvement threshold when
        // verification is enabled AND the strategy itself reported success.
        let verified = if config.verify_improvement && result.is_success() {
            self.verify_improvement(&result, config.min_improvement_pct)
        } else {
            result.is_success()
        };

        // Roll back unverified work when the strategy supports it.
        if !verified && strategy.reversible() {
            pgrx::log!(
                "Remediation not verified, rolling back: {}",
                strategy.name()
            );
            if let Err(e) = strategy.rollback(&context, &result) {
                pgrx::warning!("Rollback failed: {}", e);
            }
        }

        // Feed the outcome back into strategy weights and history.
        if config.learning_enabled {
            self.registry
                .update_weight(strategy.name(), verified, result.improvement_pct);
            self.tracker
                .record(problem, strategy.name(), &result, verified);
        }

        if verified {
            self.successful_healings.fetch_add(1, Ordering::SeqCst);
        }

        HealingOutcome::Completed {
            problem_type: problem.problem_type,
            strategy: strategy.name().to_string(),
            result,
            verified,
        }
    }

    /// Execute strategy with safeguards. Currently only records wall-clock
    /// duration; timeout enforcement and panic catching are still TODO
    /// (per the comment below), so a panicking strategy will unwind here.
    fn execute_with_safeguards(
        &self,
        strategy: &dyn RemediationStrategy,
        context: &StrategyContext,
    ) -> RemediationResult {
        // In production, wrap in timeout and panic handling
        // For now, execute directly
        let start = std::time::Instant::now();
        let mut result = strategy.execute(context);
        result.duration_ms = start.elapsed().as_millis() as u64;
        result
    }

    /// Check if this problem may be auto-healed right now: not on the
    /// approval list, past cooldown, and under the per-window attempt cap.
    fn should_auto_heal(&self, problem: &Problem, config: &HealingConfig) -> bool {
        // Check if problem type requires approval
        if config.require_approval.contains(&problem.problem_type) {
            return false;
        }

        // Check cooldown
        if !self.is_past_cooldown(problem.problem_type, config) {
            return false;
        }

        // Check attempt limit
        if self.attempts_in_window(problem.problem_type, &config.attempt_window)
            >= config.max_attempts_per_window
        {
            return false;
        }

        true
    }

    /// Human-readable reason mirroring `should_auto_heal`'s checks, in the
    /// same order (so the first failing gate is the one reported).
    fn get_defer_reason(&self, problem: &Problem, config: &HealingConfig) -> String {
        if config.require_approval.contains(&problem.problem_type) {
            return format!(
                "Problem type '{:?}' requires human approval",
                problem.problem_type
            );
        }

        if !self.is_past_cooldown(problem.problem_type, config) {
            return "In cooldown period after recent healing attempt".to_string();
        }

        if self.attempts_in_window(problem.problem_type, &config.attempt_window)
            >= config.max_attempts_per_window
        {
            return format!(
                "Exceeded maximum {} attempts per {:?}",
                config.max_attempts_per_window, config.attempt_window
            );
        }

        "Unknown reason".to_string()
    }

    /// True when `min_healing_interval` has elapsed since the most recent
    /// attempt for this problem type (or there is no prior attempt).
    /// A clock step backwards makes `elapsed()` fail, which falls through
    /// to `true` — i.e. cooldown is skipped in that case.
    fn is_past_cooldown(&self, problem_type: ProblemType, config: &HealingConfig) -> bool {
        let history = self.attempt_history.read();
        if let Some(attempts) = history.get(&problem_type) {
            if let Some(last) = attempts.back() {
                if let Ok(elapsed) = last.elapsed() {
                    return elapsed >= config.min_healing_interval;
                }
            }
        }
        true
    }

    /// Count attempts for `problem_type` newer than `now - window`.
    /// NOTE(review): `SystemTime::now() - *window` panics if the window
    /// exceeds the time since the epoch — only plausible with absurdly
    /// large configured windows; confirm config validation bounds this.
    fn attempts_in_window(&self, problem_type: ProblemType, window: &Duration) -> usize {
        let history = self.attempt_history.read();
        if let Some(attempts) = history.get(&problem_type) {
            let cutoff = SystemTime::now() - *window;
            attempts.iter().filter(|t| **t > cutoff).count()
        } else {
            0
        }
    }

    /// Record an attempt timestamp and prune entries older than 24h so the
    /// history cannot grow without bound.
    fn record_attempt(&self, problem_type: ProblemType) {
        let mut history = self.attempt_history.write();
        let attempts = history.entry(problem_type).or_insert_with(VecDeque::new);
        attempts.push_back(SystemTime::now());

        // Keep only recent attempts
        let cutoff = SystemTime::now() - Duration::from_secs(86400); // 24 hours
        while let Some(front) = attempts.front() {
            if *front < cutoff {
                attempts.pop_front();
            } else {
                break;
            }
        }
    }

    /// Verify improvement after remediation: the strategy-reported
    /// improvement percentage must meet the configured minimum.
    fn verify_improvement(&self, result: &RemediationResult, min_pct: f32) -> bool {
        result.improvement_pct >= min_pct
    }

    /// Snapshot engine-level statistics. The two counters are read
    /// separately, so the ratio can be momentarily inconsistent under
    /// concurrent heals — acceptable for monitoring output.
    pub fn get_stats(&self) -> EngineStats {
        let total = self.total_healings.load(Ordering::SeqCst);
        let successful = self.successful_healings.load(Ordering::SeqCst);

        EngineStats {
            enabled: self.is_enabled(),
            total_healings: total,
            successful_healings: successful,
            success_rate: if total > 0 {
                successful as f32 / total as f32
            } else {
                0.0
            },
            active_remediations: self.active.read().len(),
            strategy_weights: self.registry.get_all_weights(),
        }
    }

    /// Execute a specific strategy by name, bypassing gating (cooldowns,
    /// attempt limits, approval lists) and allowing full impact — intended
    /// for operator-driven manual runs. Returns `None` if the strategy
    /// name is unknown.
    ///
    /// NOTE(review): `verified: !dry_run` marks every non-dry real run as
    /// verified without checking improvement, and manual runs are not
    /// recorded in counters/history — confirm this asymmetry with `heal`
    /// is intentional.
    pub fn execute_strategy(
        &self,
        strategy_name: &str,
        problem: &Problem,
        dry_run: bool,
    ) -> Option<HealingOutcome> {
        let strategy = self.registry.get_by_name(strategy_name)?;
        let _config = self.config.read().clone();

        let context = StrategyContext {
            problem: problem.clone(),
            collection_id: 0,
            initial_lambda: 1.0,
            target_lambda: 0.8,
            max_impact: 1.0, // Manual execution allows higher impact
            timeout: strategy.estimated_duration() * 2,
            start_time: SystemTime::now(),
            dry_run,
        };

        let result = strategy.execute(&context);

        Some(HealingOutcome::Completed {
            problem_type: problem.problem_type,
            strategy: strategy_name.to_string(),
            result,
            verified: !dry_run,
        })
    }
}
|
||||
|
||||
/// Engine statistics snapshot produced by [`RemediationEngine::get_stats`].
#[derive(Debug, Clone)]
pub struct EngineStats {
    /// Whether the engine is currently enabled
    pub enabled: bool,
    /// Total healing attempts since engine creation
    pub total_healings: u64,
    /// Attempts whose improvement was verified
    pub successful_healings: u64,
    /// successful_healings / total_healings (0.0 when no attempts yet)
    pub success_rate: f32,
    /// Number of remediations currently in flight
    pub active_remediations: usize,
    /// Current learned weight per strategy name
    pub strategy_weights: HashMap<String, f32>,
}
|
||||
|
||||
impl EngineStats {
    /// Convert to JSON for SQL-facing reporting; field names map 1:1 to
    /// JSON keys.
    pub fn to_json(&self) -> serde_json::Value {
        serde_json::json!({
            "enabled": self.enabled,
            "total_healings": self.total_healings,
            "successful_healings": self.successful_healings,
            "success_rate": self.success_rate,
            "active_remediations": self.active_remediations,
            "strategy_weights": self.strategy_weights,
        })
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::healing::detector::Severity;

    /// Shared fixture: engine with the default strategy set, default
    /// config, and a fresh outcome tracker.
    fn create_engine() -> RemediationEngine {
        let registry = StrategyRegistry::new_with_defaults();
        let config = HealingConfig::default();
        let tracker = OutcomeTracker::new();
        RemediationEngine::new(registry, config, tracker)
    }

    /// A fresh engine starts enabled with nothing in flight.
    #[test]
    fn test_engine_creation() {
        let engine = create_engine();
        assert!(engine.is_enabled());
        assert!(engine.active_remediations().is_empty());
    }

    /// Disabling short-circuits heal() to `Disabled`; re-enabling works.
    #[test]
    fn test_engine_enable_disable() {
        let engine = create_engine();

        engine.set_enabled(false);
        assert!(!engine.is_enabled());

        let problem = Problem::new(ProblemType::IndexDegradation, Severity::Medium);
        let outcome = engine.heal(&problem);
        assert!(matches!(outcome, HealingOutcome::Disabled));

        engine.set_enabled(true);
        assert!(engine.is_enabled());
    }

    /// Index degradation should complete with one of the expected
    /// default strategies.
    #[test]
    fn test_heal_index_degradation() {
        let engine = create_engine();
        let problem = Problem::new(ProblemType::IndexDegradation, Severity::Medium);

        let outcome = engine.heal(&problem);
        match outcome {
            HealingOutcome::Completed { strategy, .. } => {
                assert!(strategy.contains("reindex") || strategy.contains("integrity"));
            }
            _ => panic!("Expected Completed outcome"),
        }
    }

    /// A second heal within `min_healing_interval` is deferred.
    #[test]
    fn test_cooldown_enforcement() {
        let mut config = HealingConfig::default();
        config.min_healing_interval = Duration::from_secs(60);

        let registry = StrategyRegistry::new_with_defaults();
        let tracker = OutcomeTracker::new();
        let engine = RemediationEngine::new(registry, config, tracker);

        let problem = Problem::new(ProblemType::IndexDegradation, Severity::Medium);

        // First healing should succeed
        let outcome1 = engine.heal(&problem);
        assert!(matches!(outcome1, HealingOutcome::Completed { .. }));

        // Second should be deferred (in cooldown)
        let outcome2 = engine.heal(&problem);
        assert!(matches!(outcome2, HealingOutcome::Deferred { .. }));
    }

    /// With a near-zero cooldown, the per-window attempt cap is the
    /// binding limit: the third heal in the window is deferred.
    #[test]
    fn test_max_attempts_enforcement() {
        let mut config = HealingConfig::default();
        config.max_attempts_per_window = 2;
        // Tiny cooldown so only the window cap gates the attempts.
        config.min_healing_interval = Duration::from_millis(1);

        let registry = StrategyRegistry::new_with_defaults();
        let tracker = OutcomeTracker::new();
        let engine = RemediationEngine::new(registry, config, tracker);

        let problem = Problem::new(ProblemType::IndexDegradation, Severity::Medium);

        // First two should succeed
        engine.heal(&problem);
        std::thread::sleep(Duration::from_millis(2));
        engine.heal(&problem);
        std::thread::sleep(Duration::from_millis(2));

        // Third should be deferred
        let outcome = engine.heal(&problem);
        assert!(matches!(outcome, HealingOutcome::Deferred { .. }));
    }

    /// A problem type on the approval list is never auto-healed.
    #[test]
    fn test_approval_requirement() {
        let mut config = HealingConfig::default();
        config.require_approval.push(ProblemType::ReplicaLag);

        let registry = StrategyRegistry::new_with_defaults();
        let tracker = OutcomeTracker::new();
        let engine = RemediationEngine::new(registry, config, tracker);

        let problem = Problem::new(ProblemType::ReplicaLag, Severity::High);
        let outcome = engine.heal(&problem);

        assert!(matches!(outcome, HealingOutcome::Deferred { .. }));
    }

    /// A strategy on the approval list defers even when the impact
    /// budget would otherwise allow it.
    #[test]
    fn test_strategy_approval_requirement() {
        let mut config = HealingConfig::default();
        config
            .require_approval_strategies
            .push("promote_replica".to_string());
        config.max_auto_heal_impact = 1.0; // Allow high impact

        let registry = StrategyRegistry::new_with_defaults();
        let tracker = OutcomeTracker::new();
        let engine = RemediationEngine::new(registry, config, tracker);

        let problem = Problem::new(ProblemType::ReplicaLag, Severity::High);
        let outcome = engine.heal(&problem);

        // Should be deferred because promote_replica requires approval
        assert!(matches!(outcome, HealingOutcome::Deferred { .. }));
    }

    /// An empty registry yields `NoStrategy`.
    #[test]
    fn test_no_strategy() {
        let registry = StrategyRegistry::new(); // Empty registry
        let config = HealingConfig::default();
        let tracker = OutcomeTracker::new();
        let engine = RemediationEngine::new(registry, config, tracker);

        let problem = Problem::new(ProblemType::IndexDegradation, Severity::Medium);
        let outcome = engine.heal(&problem);

        assert!(matches!(outcome, HealingOutcome::NoStrategy { .. }));
    }

    /// Manual execution honors dry_run and reports it in metadata.
    #[test]
    fn test_manual_execution() {
        let engine = create_engine();
        let problem = Problem::new(ProblemType::IndexDegradation, Severity::Medium);

        let outcome = engine.execute_strategy("reindex_partition", &problem, true);
        assert!(outcome.is_some());

        if let Some(HealingOutcome::Completed { result, .. }) = outcome {
            assert!(result.metadata.get("dry_run") == Some(&serde_json::json!(true)));
        }
    }

    /// Fresh engine reports zeroed counters.
    #[test]
    fn test_engine_stats() {
        let engine = create_engine();
        let stats = engine.get_stats();

        assert!(stats.enabled);
        assert_eq!(stats.total_healings, 0);
        assert_eq!(stats.active_remediations, 0);
    }
}
|
||||
467
vendor/ruvector/crates/ruvector-postgres/src/healing/functions.rs
vendored
Normal file
467
vendor/ruvector/crates/ruvector-postgres/src/healing/functions.rs
vendored
Normal file
@@ -0,0 +1,467 @@
|
||||
//! SQL Functions for Self-Healing Engine
|
||||
//!
|
||||
//! Provides PostgreSQL-accessible functions for:
|
||||
//! - Health status monitoring
|
||||
//! - Healing history queries
|
||||
//! - Manual healing triggers
|
||||
//! - Configuration management
|
||||
|
||||
use pgrx::prelude::*;
|
||||
|
||||
use super::detector::ProblemType;
|
||||
use super::{get_healing_engine, Problem};
|
||||
|
||||
// ============================================================================
|
||||
// Health Status Functions
|
||||
// ============================================================================
|
||||
|
||||
/// Get current health status of the RuVector system
|
||||
///
|
||||
/// Returns JSON with:
|
||||
/// - healthy: whether system is healthy
|
||||
/// - problem_count: number of detected problems
|
||||
/// - active_remediation_count: ongoing remediations
|
||||
/// - problems: list of current problems
|
||||
/// - enabled: whether healing is enabled
|
||||
#[pg_extern]
|
||||
pub fn ruvector_health_status() -> pgrx::JsonB {
|
||||
let engine = get_healing_engine();
|
||||
let engine_lock = engine.read();
|
||||
let status = engine_lock.health_status();
|
||||
pgrx::JsonB(status.to_json())
|
||||
}
|
||||
|
||||
/// Check if system is currently healthy (no detected problems)
|
||||
#[pg_extern]
|
||||
pub fn ruvector_is_healthy() -> bool {
|
||||
let engine = get_healing_engine();
|
||||
let engine_lock = engine.read();
|
||||
let status = engine_lock.health_status();
|
||||
status.healthy
|
||||
}
|
||||
|
||||
/// Get system metrics used for problem detection
|
||||
#[pg_extern]
|
||||
pub fn ruvector_system_metrics() -> pgrx::JsonB {
|
||||
let engine = get_healing_engine();
|
||||
let engine_lock = engine.read();
|
||||
let metrics = engine_lock.detector.collect_metrics();
|
||||
pgrx::JsonB(metrics.to_json())
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Healing History Functions
|
||||
// ============================================================================
|
||||
|
||||
/// Get recent healing history
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `limit` - Maximum number of records to return (default 20)
|
||||
#[pg_extern]
|
||||
pub fn ruvector_healing_history(limit: default!(i32, 20)) -> pgrx::JsonB {
|
||||
let engine = get_healing_engine();
|
||||
let engine_lock = engine.read();
|
||||
|
||||
let records = engine_lock.tracker.get_recent(limit as usize);
|
||||
let history: Vec<serde_json::Value> = records.iter().map(|r| r.to_json()).collect();
|
||||
|
||||
pgrx::JsonB(serde_json::json!({
|
||||
"history": history,
|
||||
"count": history.len(),
|
||||
}))
|
||||
}
|
||||
|
||||
/// Get healing history since a specific timestamp
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `since_timestamp` - Unix timestamp to filter from
|
||||
#[pg_extern]
|
||||
pub fn ruvector_healing_history_since(since_timestamp: i64) -> pgrx::JsonB {
|
||||
let engine = get_healing_engine();
|
||||
let engine_lock = engine.read();
|
||||
|
||||
let records = engine_lock.tracker.get_since(since_timestamp as u64);
|
||||
let history: Vec<serde_json::Value> = records.iter().map(|r| r.to_json()).collect();
|
||||
|
||||
pgrx::JsonB(serde_json::json!({
|
||||
"history": history,
|
||||
"count": history.len(),
|
||||
"since": since_timestamp,
|
||||
}))
|
||||
}
|
||||
|
||||
/// Get healing history for a specific strategy
|
||||
#[pg_extern]
|
||||
pub fn ruvector_healing_history_for_strategy(
|
||||
strategy_name: &str,
|
||||
limit: default!(i32, 20),
|
||||
) -> pgrx::JsonB {
|
||||
let engine = get_healing_engine();
|
||||
let engine_lock = engine.read();
|
||||
|
||||
let records = engine_lock
|
||||
.tracker
|
||||
.get_for_strategy(strategy_name, limit as usize);
|
||||
let history: Vec<serde_json::Value> = records.iter().map(|r| r.to_json()).collect();
|
||||
|
||||
pgrx::JsonB(serde_json::json!({
|
||||
"strategy": strategy_name,
|
||||
"history": history,
|
||||
"count": history.len(),
|
||||
}))
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Healing Trigger Functions
|
||||
// ============================================================================
|
||||
|
||||
/// Manually trigger healing for a specific problem type
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `problem_type` - One of: index_degradation, replica_lag, storage_exhaustion,
|
||||
/// query_timeout, integrity_violation, memory_pressure,
|
||||
/// connection_exhaustion, hot_partition
|
||||
#[pg_extern]
|
||||
pub fn ruvector_healing_trigger(problem_type: &str) -> pgrx::JsonB {
|
||||
let engine = get_healing_engine();
|
||||
let engine_lock = engine.read();
|
||||
|
||||
// Parse problem type
|
||||
let ptype = match problem_type.parse::<ProblemType>() {
|
||||
Ok(pt) => pt,
|
||||
Err(e) => {
|
||||
return pgrx::JsonB(serde_json::json!({
|
||||
"success": false,
|
||||
"error": e,
|
||||
}));
|
||||
}
|
||||
};
|
||||
|
||||
// Trigger healing
|
||||
match engine_lock.trigger_healing(ptype) {
|
||||
Some(outcome) => pgrx::JsonB(serde_json::json!({
|
||||
"success": true,
|
||||
"outcome": outcome.to_json(),
|
||||
})),
|
||||
None => pgrx::JsonB(serde_json::json!({
|
||||
"success": false,
|
||||
"error": "Healing is disabled",
|
||||
})),
|
||||
}
|
||||
}
|
||||
|
||||
/// Execute a specific healing strategy manually
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `strategy_name` - Strategy to execute
|
||||
/// * `problem_type` - Problem type for context
|
||||
/// * `dry_run` - If true, don't actually execute (default false)
|
||||
#[pg_extern]
|
||||
pub fn ruvector_healing_execute(
|
||||
strategy_name: &str,
|
||||
problem_type: &str,
|
||||
dry_run: default!(bool, false),
|
||||
) -> pgrx::JsonB {
|
||||
let engine = get_healing_engine();
|
||||
let engine_lock = engine.read();
|
||||
|
||||
// Parse problem type
|
||||
let ptype = match problem_type.parse::<ProblemType>() {
|
||||
Ok(pt) => pt,
|
||||
Err(e) => {
|
||||
return pgrx::JsonB(serde_json::json!({
|
||||
"success": false,
|
||||
"error": e,
|
||||
}));
|
||||
}
|
||||
};
|
||||
|
||||
let problem = Problem::new(ptype, super::detector::Severity::Medium);
|
||||
|
||||
match engine_lock
|
||||
.remediation
|
||||
.execute_strategy(strategy_name, &problem, dry_run)
|
||||
{
|
||||
Some(outcome) => pgrx::JsonB(serde_json::json!({
|
||||
"success": true,
|
||||
"dry_run": dry_run,
|
||||
"outcome": outcome.to_json(),
|
||||
})),
|
||||
None => pgrx::JsonB(serde_json::json!({
|
||||
"success": false,
|
||||
"error": format!("Strategy '{}' not found", strategy_name),
|
||||
})),
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Configuration Functions
|
||||
// ============================================================================
|
||||
|
||||
/// Configure healing engine settings
///
/// # Arguments
/// * `config_json` - JSON configuration object with optional keys:
///   - min_healing_interval_secs
///   - max_attempts_per_window
///   - max_auto_heal_impact
///   - learning_enabled
///   - verify_improvement
///   - min_improvement_pct
///   - enabled (toggles the engine itself, not part of `HealingConfig`)
///
/// Unknown keys are ignored. Out-of-range values (non-positive intervals/
/// counts, impact outside 0..=1, negative percentages) are silently
/// skipped rather than rejected — the call still reports "updated".
#[pg_extern]
pub fn ruvector_healing_configure(config_json: pgrx::JsonB) -> pgrx::JsonB {
    let engine = get_healing_engine();
    let mut engine_lock = engine.write();

    // Start from the current config and overlay any provided keys.
    let mut config = engine_lock.config.clone();
    let json = config_json.0;

    // Update configuration from JSON
    if let Some(interval) = json
        .get("min_healing_interval_secs")
        .and_then(|v| v.as_i64())
    {
        if interval > 0 {
            config.min_healing_interval = std::time::Duration::from_secs(interval as u64);
        }
    }

    if let Some(attempts) = json.get("max_attempts_per_window").and_then(|v| v.as_i64()) {
        if attempts > 0 {
            config.max_attempts_per_window = attempts as usize;
        }
    }

    if let Some(impact) = json.get("max_auto_heal_impact").and_then(|v| v.as_f64()) {
        if impact >= 0.0 && impact <= 1.0 {
            config.max_auto_heal_impact = impact as f32;
        }
    }

    if let Some(learning) = json.get("learning_enabled").and_then(|v| v.as_bool()) {
        config.learning_enabled = learning;
    }

    if let Some(verify) = json.get("verify_improvement").and_then(|v| v.as_bool()) {
        config.verify_improvement = verify;
    }

    if let Some(min_pct) = json.get("min_improvement_pct").and_then(|v| v.as_f64()) {
        if min_pct >= 0.0 {
            config.min_improvement_pct = min_pct as f32;
        }
    }

    // "enabled" toggles the whole engine, separately from HealingConfig.
    if let Some(enabled) = json.get("enabled").and_then(|v| v.as_bool()) {
        engine_lock.set_enabled(enabled);
    }

    engine_lock.update_config(config.clone());

    // Echo back the effective configuration so callers can confirm.
    pgrx::JsonB(serde_json::json!({
        "status": "updated",
        "config": {
            "min_healing_interval_secs": config.min_healing_interval.as_secs(),
            "max_attempts_per_window": config.max_attempts_per_window,
            "max_auto_heal_impact": config.max_auto_heal_impact,
            "learning_enabled": config.learning_enabled,
            "verify_improvement": config.verify_improvement,
            "min_improvement_pct": config.min_improvement_pct,
            "enabled": engine_lock.enabled,
        }
    }))
}
|
||||
|
||||
/// Get current healing configuration
|
||||
#[pg_extern]
|
||||
pub fn ruvector_healing_get_config() -> pgrx::JsonB {
|
||||
let engine = get_healing_engine();
|
||||
let engine_lock = engine.read();
|
||||
let config = &engine_lock.config;
|
||||
|
||||
pgrx::JsonB(serde_json::json!({
|
||||
"min_healing_interval_secs": config.min_healing_interval.as_secs(),
|
||||
"max_attempts_per_window": config.max_attempts_per_window,
|
||||
"attempt_window_secs": config.attempt_window.as_secs(),
|
||||
"max_auto_heal_impact": config.max_auto_heal_impact,
|
||||
"learning_enabled": config.learning_enabled,
|
||||
"failure_cooldown_secs": config.failure_cooldown.as_secs(),
|
||||
"verify_improvement": config.verify_improvement,
|
||||
"min_improvement_pct": config.min_improvement_pct,
|
||||
"max_concurrent_remediations": config.max_concurrent_remediations,
|
||||
"require_approval_strategies": config.require_approval_strategies,
|
||||
"enabled": engine_lock.enabled,
|
||||
}))
|
||||
}
|
||||
|
||||
/// Enable or disable healing
|
||||
#[pg_extern]
|
||||
pub fn ruvector_healing_enable(enabled: bool) -> bool {
|
||||
let engine = get_healing_engine();
|
||||
let mut engine_lock = engine.write();
|
||||
engine_lock.set_enabled(enabled);
|
||||
engine_lock.enabled
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Strategy Functions
|
||||
// ============================================================================
|
||||
|
||||
/// List all available healing strategies
|
||||
#[pg_extern]
|
||||
pub fn ruvector_healing_strategies() -> pgrx::JsonB {
|
||||
let engine = get_healing_engine();
|
||||
let engine_lock = engine.read();
|
||||
|
||||
let strategies: Vec<serde_json::Value> = engine_lock
|
||||
.remediation
|
||||
.registry
|
||||
.all_strategies()
|
||||
.iter()
|
||||
.map(|s| {
|
||||
serde_json::json!({
|
||||
"name": s.name(),
|
||||
"description": s.description(),
|
||||
"handles": s.handles().iter().map(|h| h.to_string()).collect::<Vec<_>>(),
|
||||
"impact": s.impact(),
|
||||
"estimated_duration_secs": s.estimated_duration().as_secs(),
|
||||
"reversible": s.reversible(),
|
||||
"weight": engine_lock.remediation.registry.get_weight(s.name()),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
|
||||
pgrx::JsonB(serde_json::json!({
|
||||
"strategies": strategies,
|
||||
"count": strategies.len(),
|
||||
}))
|
||||
}
|
||||
|
||||
/// Get effectiveness report for all strategies
|
||||
#[pg_extern]
|
||||
pub fn ruvector_healing_effectiveness() -> pgrx::JsonB {
|
||||
let engine = get_healing_engine();
|
||||
let engine_lock = engine.read();
|
||||
|
||||
let report = engine_lock.tracker.effectiveness_report();
|
||||
pgrx::JsonB(report.to_json())
|
||||
}
|
||||
|
||||
/// Get statistics for the healing engine
|
||||
#[pg_extern]
|
||||
pub fn ruvector_healing_stats() -> pgrx::JsonB {
|
||||
let engine = get_healing_engine();
|
||||
let engine_lock = engine.read();
|
||||
|
||||
let engine_stats = engine_lock.remediation.get_stats();
|
||||
let tracker_stats = engine_lock.tracker.get_stats();
|
||||
|
||||
pgrx::JsonB(serde_json::json!({
|
||||
"engine": engine_stats.to_json(),
|
||||
"tracker": tracker_stats.to_json(),
|
||||
}))
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Detection Threshold Functions
|
||||
// ============================================================================
|
||||
|
||||
/// Get current detection thresholds
|
||||
#[pg_extern]
|
||||
pub fn ruvector_healing_thresholds() -> pgrx::JsonB {
|
||||
let engine = get_healing_engine();
|
||||
let engine_lock = engine.read();
|
||||
|
||||
let thresholds = engine_lock.detector.get_thresholds();
|
||||
|
||||
pgrx::JsonB(serde_json::json!({
|
||||
"index_fragmentation_pct": thresholds.index_fragmentation_pct,
|
||||
"replica_lag_seconds": thresholds.replica_lag_seconds,
|
||||
"storage_usage_pct": thresholds.storage_usage_pct,
|
||||
"query_timeout_rate": thresholds.query_timeout_rate,
|
||||
"min_integrity_lambda": thresholds.min_integrity_lambda,
|
||||
"memory_usage_pct": thresholds.memory_usage_pct,
|
||||
"connection_usage_pct": thresholds.connection_usage_pct,
|
||||
"partition_load_ratio": thresholds.partition_load_ratio,
|
||||
}))
|
||||
}
|
||||
|
||||
/// Update detection thresholds
|
||||
#[pg_extern]
|
||||
pub fn ruvector_healing_set_thresholds(thresholds_json: pgrx::JsonB) -> pgrx::JsonB {
|
||||
let engine = get_healing_engine();
|
||||
let engine_lock = engine.read();
|
||||
|
||||
let mut thresholds = engine_lock.detector.get_thresholds();
|
||||
let json = thresholds_json.0;
|
||||
|
||||
if let Some(v) = json.get("index_fragmentation_pct").and_then(|v| v.as_f64()) {
|
||||
thresholds.index_fragmentation_pct = v as f32;
|
||||
}
|
||||
if let Some(v) = json.get("replica_lag_seconds").and_then(|v| v.as_f64()) {
|
||||
thresholds.replica_lag_seconds = v as f32;
|
||||
}
|
||||
if let Some(v) = json.get("storage_usage_pct").and_then(|v| v.as_f64()) {
|
||||
thresholds.storage_usage_pct = v as f32;
|
||||
}
|
||||
if let Some(v) = json.get("query_timeout_rate").and_then(|v| v.as_f64()) {
|
||||
thresholds.query_timeout_rate = v as f32;
|
||||
}
|
||||
if let Some(v) = json.get("min_integrity_lambda").and_then(|v| v.as_f64()) {
|
||||
thresholds.min_integrity_lambda = v as f32;
|
||||
}
|
||||
if let Some(v) = json.get("memory_usage_pct").and_then(|v| v.as_f64()) {
|
||||
thresholds.memory_usage_pct = v as f32;
|
||||
}
|
||||
if let Some(v) = json.get("connection_usage_pct").and_then(|v| v.as_f64()) {
|
||||
thresholds.connection_usage_pct = v as f32;
|
||||
}
|
||||
if let Some(v) = json.get("partition_load_ratio").and_then(|v| v.as_f64()) {
|
||||
thresholds.partition_load_ratio = v as f32;
|
||||
}
|
||||
|
||||
engine_lock.detector.update_thresholds(thresholds.clone());
|
||||
|
||||
pgrx::JsonB(serde_json::json!({
|
||||
"status": "updated",
|
||||
"thresholds": {
|
||||
"index_fragmentation_pct": thresholds.index_fragmentation_pct,
|
||||
"replica_lag_seconds": thresholds.replica_lag_seconds,
|
||||
"storage_usage_pct": thresholds.storage_usage_pct,
|
||||
"query_timeout_rate": thresholds.query_timeout_rate,
|
||||
"min_integrity_lambda": thresholds.min_integrity_lambda,
|
||||
"memory_usage_pct": thresholds.memory_usage_pct,
|
||||
"connection_usage_pct": thresholds.connection_usage_pct,
|
||||
"partition_load_ratio": thresholds.partition_load_ratio,
|
||||
}
|
||||
}))
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Problem Type Reference
|
||||
// ============================================================================
|
||||
|
||||
/// List all supported problem types
|
||||
#[pg_extern]
|
||||
pub fn ruvector_healing_problem_types() -> pgrx::JsonB {
|
||||
let types: Vec<serde_json::Value> = ProblemType::all()
|
||||
.iter()
|
||||
.map(|t| {
|
||||
serde_json::json!({
|
||||
"name": t.to_string(),
|
||||
"description": t.description(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
|
||||
pgrx::JsonB(serde_json::json!({
|
||||
"problem_types": types,
|
||||
"count": types.len(),
|
||||
}))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
// These tests would run in a PostgreSQL context with pg_test
|
||||
// For now, they verify the function signatures compile correctly
|
||||
}
|
||||
669
vendor/ruvector/crates/ruvector-postgres/src/healing/learning.rs
vendored
Normal file
669
vendor/ruvector/crates/ruvector-postgres/src/healing/learning.rs
vendored
Normal file
@@ -0,0 +1,669 @@
|
||||
//! Learning System for Self-Healing Engine
|
||||
//!
|
||||
//! Tracks remediation outcomes and adjusts strategy selection:
|
||||
//! - Outcome recording with full context
|
||||
//! - Strategy weight updates based on success/failure
|
||||
//! - Confidence scoring for strategies
|
||||
//! - Effectiveness reporting
|
||||
|
||||
use std::collections::{HashMap, VecDeque};
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use std::time::{Duration, SystemTime, UNIX_EPOCH};
|
||||
|
||||
use parking_lot::RwLock;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::detector::{Problem, ProblemType, Severity};
|
||||
use super::strategies::RemediationResult;
|
||||
|
||||
// ============================================================================
|
||||
// Outcome Record
|
||||
// ============================================================================
|
||||
|
||||
/// A recorded remediation outcome
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct OutcomeRecord {
|
||||
/// Unique ID
|
||||
pub id: u64,
|
||||
/// Problem type
|
||||
pub problem_type: ProblemType,
|
||||
/// Problem severity
|
||||
pub severity: Severity,
|
||||
/// Strategy used
|
||||
pub strategy_name: String,
|
||||
/// Whether remediation succeeded
|
||||
pub success: bool,
|
||||
/// Whether improvement was verified
|
||||
pub verified: bool,
|
||||
/// Actions taken
|
||||
pub actions_taken: usize,
|
||||
/// Improvement percentage
|
||||
pub improvement_pct: f32,
|
||||
/// Duration in milliseconds
|
||||
pub duration_ms: u64,
|
||||
/// Error message if failed
|
||||
pub error_message: Option<String>,
|
||||
/// Timestamp
|
||||
pub timestamp: u64,
|
||||
/// Human feedback score (if provided, 0-1)
|
||||
pub feedback_score: Option<f32>,
|
||||
/// Additional metadata
|
||||
pub metadata: serde_json::Value,
|
||||
}
|
||||
|
||||
impl OutcomeRecord {
|
||||
/// Create from a problem and result
|
||||
pub fn from_result(
|
||||
id: u64,
|
||||
problem: &Problem,
|
||||
strategy_name: &str,
|
||||
result: &RemediationResult,
|
||||
verified: bool,
|
||||
) -> Self {
|
||||
Self {
|
||||
id,
|
||||
problem_type: problem.problem_type,
|
||||
severity: problem.severity,
|
||||
strategy_name: strategy_name.to_string(),
|
||||
success: result.is_success(),
|
||||
verified,
|
||||
actions_taken: result.actions_taken,
|
||||
improvement_pct: result.improvement_pct,
|
||||
duration_ms: result.duration_ms,
|
||||
error_message: result.error_message.clone(),
|
||||
timestamp: SystemTime::now()
|
||||
.duration_since(UNIX_EPOCH)
|
||||
.unwrap()
|
||||
.as_secs(),
|
||||
feedback_score: None,
|
||||
metadata: result.metadata.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert to JSON
|
||||
pub fn to_json(&self) -> serde_json::Value {
|
||||
serde_json::json!({
|
||||
"id": self.id,
|
||||
"problem_type": self.problem_type.to_string(),
|
||||
"severity": format!("{:?}", self.severity).to_lowercase(),
|
||||
"strategy_name": self.strategy_name,
|
||||
"success": self.success,
|
||||
"verified": self.verified,
|
||||
"actions_taken": self.actions_taken,
|
||||
"improvement_pct": self.improvement_pct,
|
||||
"duration_ms": self.duration_ms,
|
||||
"error_message": self.error_message,
|
||||
"timestamp": self.timestamp,
|
||||
"feedback_score": self.feedback_score,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Strategy Weight
|
||||
// ============================================================================
|
||||
|
||||
/// Strategy weight with confidence metrics
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct StrategyWeight {
|
||||
/// Strategy name
|
||||
pub strategy_name: String,
|
||||
/// Current weight (1.0 = baseline)
|
||||
pub weight: f32,
|
||||
/// Confidence in weight (0-1)
|
||||
pub confidence: f32,
|
||||
/// Number of observations
|
||||
pub observations: usize,
|
||||
/// Success count
|
||||
pub successes: usize,
|
||||
/// Average improvement when successful
|
||||
pub avg_improvement: f32,
|
||||
/// Average duration in milliseconds
|
||||
pub avg_duration_ms: u64,
|
||||
/// Last update timestamp
|
||||
pub last_updated: u64,
|
||||
}
|
||||
|
||||
impl StrategyWeight {
|
||||
/// Create new weight for strategy
|
||||
pub fn new(strategy_name: &str) -> Self {
|
||||
Self {
|
||||
strategy_name: strategy_name.to_string(),
|
||||
weight: 1.0,
|
||||
confidence: 0.0,
|
||||
observations: 0,
|
||||
successes: 0,
|
||||
avg_improvement: 0.0,
|
||||
avg_duration_ms: 0,
|
||||
last_updated: SystemTime::now()
|
||||
.duration_since(UNIX_EPOCH)
|
||||
.unwrap()
|
||||
.as_secs(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Update with new observation
|
||||
pub fn update(&mut self, success: bool, improvement_pct: f32, duration_ms: u64) {
|
||||
self.observations += 1;
|
||||
if success {
|
||||
self.successes += 1;
|
||||
}
|
||||
|
||||
// Update running averages
|
||||
let n = self.observations as f32;
|
||||
self.avg_improvement = ((n - 1.0) * self.avg_improvement + improvement_pct) / n;
|
||||
self.avg_duration_ms = ((self.observations as u64 - 1) * self.avg_duration_ms
|
||||
+ duration_ms)
|
||||
/ self.observations as u64;
|
||||
|
||||
// Calculate success rate
|
||||
let success_rate = self.successes as f32 / self.observations as f32;
|
||||
|
||||
// Weight = success_rate * (1 + avg_improvement/100)
|
||||
self.weight = success_rate * (1.0 + self.avg_improvement / 100.0);
|
||||
self.weight = self.weight.max(0.1).min(2.0);
|
||||
|
||||
// Confidence increases with observations (asymptotic to 1.0)
|
||||
self.confidence = 1.0 - 1.0 / (1.0 + (self.observations as f32 / 10.0));
|
||||
|
||||
self.last_updated = SystemTime::now()
|
||||
.duration_since(UNIX_EPOCH)
|
||||
.unwrap()
|
||||
.as_secs();
|
||||
}
|
||||
|
||||
/// Get success rate
|
||||
pub fn success_rate(&self) -> f32 {
|
||||
if self.observations > 0 {
|
||||
self.successes as f32 / self.observations as f32
|
||||
} else {
|
||||
0.0
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert to JSON
|
||||
pub fn to_json(&self) -> serde_json::Value {
|
||||
serde_json::json!({
|
||||
"strategy_name": self.strategy_name,
|
||||
"weight": self.weight,
|
||||
"confidence": self.confidence,
|
||||
"observations": self.observations,
|
||||
"successes": self.successes,
|
||||
"success_rate": self.success_rate(),
|
||||
"avg_improvement": self.avg_improvement,
|
||||
"avg_duration_ms": self.avg_duration_ms,
|
||||
"last_updated": self.last_updated,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Outcome Tracker
|
||||
// ============================================================================
|
||||
|
||||
/// Tracks remediation outcomes for learning
|
||||
#[derive(Clone)]
|
||||
pub struct OutcomeTracker {
|
||||
/// Outcome history
|
||||
history: std::sync::Arc<RwLock<VecDeque<OutcomeRecord>>>,
|
||||
/// Strategy weights
|
||||
weights: std::sync::Arc<RwLock<HashMap<String, StrategyWeight>>>,
|
||||
/// Maximum history size
|
||||
max_history: usize,
|
||||
/// Next record ID
|
||||
next_id: std::sync::Arc<AtomicU64>,
|
||||
}
|
||||
|
||||
impl OutcomeTracker {
|
||||
/// Create new tracker
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
history: std::sync::Arc::new(RwLock::new(VecDeque::new())),
|
||||
weights: std::sync::Arc::new(RwLock::new(HashMap::new())),
|
||||
max_history: 10000,
|
||||
next_id: std::sync::Arc::new(AtomicU64::new(1)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Create with custom history size
|
||||
pub fn with_max_history(max_history: usize) -> Self {
|
||||
Self {
|
||||
history: std::sync::Arc::new(RwLock::new(VecDeque::new())),
|
||||
weights: std::sync::Arc::new(RwLock::new(HashMap::new())),
|
||||
max_history,
|
||||
next_id: std::sync::Arc::new(AtomicU64::new(1)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Record a remediation outcome
|
||||
pub fn record(
|
||||
&self,
|
||||
problem: &Problem,
|
||||
strategy_name: &str,
|
||||
result: &RemediationResult,
|
||||
verified: bool,
|
||||
) {
|
||||
let id = self.next_id.fetch_add(1, Ordering::SeqCst);
|
||||
let record = OutcomeRecord::from_result(id, problem, strategy_name, result, verified);
|
||||
|
||||
// Add to history
|
||||
let mut history = self.history.write();
|
||||
history.push_back(record.clone());
|
||||
while history.len() > self.max_history {
|
||||
history.pop_front();
|
||||
}
|
||||
|
||||
// Update strategy weight
|
||||
let mut weights = self.weights.write();
|
||||
let weight = weights
|
||||
.entry(strategy_name.to_string())
|
||||
.or_insert_with(|| StrategyWeight::new(strategy_name));
|
||||
weight.update(verified, result.improvement_pct, result.duration_ms);
|
||||
}
|
||||
|
||||
/// Get recent outcomes
|
||||
pub fn get_recent(&self, limit: usize) -> Vec<OutcomeRecord> {
|
||||
let history = self.history.read();
|
||||
history.iter().rev().take(limit).cloned().collect()
|
||||
}
|
||||
|
||||
/// Get outcomes since timestamp
|
||||
pub fn get_since(&self, since: u64) -> Vec<OutcomeRecord> {
|
||||
let history = self.history.read();
|
||||
history
|
||||
.iter()
|
||||
.filter(|r| r.timestamp >= since)
|
||||
.cloned()
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Get outcomes for a specific strategy
|
||||
pub fn get_for_strategy(&self, strategy_name: &str, limit: usize) -> Vec<OutcomeRecord> {
|
||||
let history = self.history.read();
|
||||
history
|
||||
.iter()
|
||||
.rev()
|
||||
.filter(|r| r.strategy_name == strategy_name)
|
||||
.take(limit)
|
||||
.cloned()
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Get outcomes for a specific problem type
|
||||
pub fn get_for_problem_type(
|
||||
&self,
|
||||
problem_type: ProblemType,
|
||||
limit: usize,
|
||||
) -> Vec<OutcomeRecord> {
|
||||
let history = self.history.read();
|
||||
history
|
||||
.iter()
|
||||
.rev()
|
||||
.filter(|r| r.problem_type == problem_type)
|
||||
.take(limit)
|
||||
.cloned()
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Get strategy weight
|
||||
pub fn get_weight(&self, strategy_name: &str) -> Option<StrategyWeight> {
|
||||
self.weights.read().get(strategy_name).cloned()
|
||||
}
|
||||
|
||||
/// Get all strategy weights
|
||||
pub fn get_all_weights(&self) -> Vec<StrategyWeight> {
|
||||
self.weights.read().values().cloned().collect()
|
||||
}
|
||||
|
||||
/// Add human feedback to an outcome
|
||||
pub fn add_feedback(&self, outcome_id: u64, score: f32) -> bool {
|
||||
let mut history = self.history.write();
|
||||
for record in history.iter_mut() {
|
||||
if record.id == outcome_id {
|
||||
record.feedback_score = Some(score.max(0.0).min(1.0));
|
||||
return true;
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
/// Get overall statistics
|
||||
pub fn get_stats(&self) -> TrackerStats {
|
||||
let history = self.history.read();
|
||||
let weights = self.weights.read();
|
||||
|
||||
let total = history.len();
|
||||
let successes = history.iter().filter(|r| r.success && r.verified).count();
|
||||
let total_improvement: f32 = history.iter().map(|r| r.improvement_pct).sum();
|
||||
let total_duration: u64 = history.iter().map(|r| r.duration_ms).sum();
|
||||
|
||||
TrackerStats {
|
||||
total_outcomes: total,
|
||||
successful_outcomes: successes,
|
||||
success_rate: if total > 0 {
|
||||
successes as f32 / total as f32
|
||||
} else {
|
||||
0.0
|
||||
},
|
||||
avg_improvement: if total > 0 {
|
||||
total_improvement / total as f32
|
||||
} else {
|
||||
0.0
|
||||
},
|
||||
avg_duration_ms: if total > 0 {
|
||||
total_duration / total as u64
|
||||
} else {
|
||||
0
|
||||
},
|
||||
tracked_strategies: weights.len(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Generate effectiveness report
|
||||
pub fn effectiveness_report(&self) -> EffectivenessReport {
|
||||
let weights = self.get_all_weights();
|
||||
let stats = self.get_stats();
|
||||
|
||||
let strategy_reports: Vec<StrategyEffectiveness> = weights
|
||||
.iter()
|
||||
.map(|w| {
|
||||
let recent = self.get_for_strategy(&w.strategy_name, 10);
|
||||
StrategyEffectiveness {
|
||||
strategy_name: w.strategy_name.clone(),
|
||||
weight: w.weight,
|
||||
confidence: w.confidence,
|
||||
success_rate: w.success_rate(),
|
||||
avg_improvement: w.avg_improvement,
|
||||
recent_outcomes: recent.len(),
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
EffectivenessReport {
|
||||
strategies: strategy_reports,
|
||||
overall_success_rate: stats.success_rate,
|
||||
avg_time_to_recovery_ms: stats.avg_duration_ms,
|
||||
total_outcomes: stats.total_outcomes,
|
||||
}
|
||||
}
|
||||
|
||||
/// Update weights from historical data (for batch learning)
|
||||
pub fn recalculate_weights(&self, lookback: Duration) {
|
||||
let cutoff = SystemTime::now()
|
||||
.duration_since(UNIX_EPOCH)
|
||||
.unwrap()
|
||||
.as_secs()
|
||||
- lookback.as_secs();
|
||||
|
||||
let history = self.history.read();
|
||||
let mut weights = self.weights.write();
|
||||
|
||||
// Group outcomes by strategy
|
||||
let mut strategy_outcomes: HashMap<String, Vec<&OutcomeRecord>> = HashMap::new();
|
||||
for record in history.iter().filter(|r| r.timestamp >= cutoff) {
|
||||
strategy_outcomes
|
||||
.entry(record.strategy_name.clone())
|
||||
.or_default()
|
||||
.push(record);
|
||||
}
|
||||
|
||||
// Recalculate each strategy's weight
|
||||
for (strategy_name, outcomes) in strategy_outcomes {
|
||||
let weight = weights
|
||||
.entry(strategy_name.clone())
|
||||
.or_insert_with(|| StrategyWeight::new(&strategy_name));
|
||||
|
||||
// Reset counters
|
||||
weight.observations = outcomes.len();
|
||||
weight.successes = outcomes.iter().filter(|o| o.success && o.verified).count();
|
||||
weight.avg_improvement =
|
||||
outcomes.iter().map(|o| o.improvement_pct).sum::<f32>() / outcomes.len() as f32;
|
||||
weight.avg_duration_ms =
|
||||
outcomes.iter().map(|o| o.duration_ms).sum::<u64>() / outcomes.len() as u64;
|
||||
|
||||
// Recalculate weight
|
||||
let success_rate = weight.success_rate();
|
||||
weight.weight = success_rate * (1.0 + weight.avg_improvement / 100.0);
|
||||
weight.weight = weight.weight.max(0.1).min(2.0);
|
||||
weight.confidence = 1.0 - 1.0 / (1.0 + (weight.observations as f32 / 10.0));
|
||||
weight.last_updated = SystemTime::now()
|
||||
.duration_since(UNIX_EPOCH)
|
||||
.unwrap()
|
||||
.as_secs();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for OutcomeTracker {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
/// Tracker statistics
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct TrackerStats {
|
||||
pub total_outcomes: usize,
|
||||
pub successful_outcomes: usize,
|
||||
pub success_rate: f32,
|
||||
pub avg_improvement: f32,
|
||||
pub avg_duration_ms: u64,
|
||||
pub tracked_strategies: usize,
|
||||
}
|
||||
|
||||
impl TrackerStats {
|
||||
/// Convert to JSON
|
||||
pub fn to_json(&self) -> serde_json::Value {
|
||||
serde_json::json!({
|
||||
"total_outcomes": self.total_outcomes,
|
||||
"successful_outcomes": self.successful_outcomes,
|
||||
"success_rate": self.success_rate,
|
||||
"avg_improvement": self.avg_improvement,
|
||||
"avg_duration_ms": self.avg_duration_ms,
|
||||
"tracked_strategies": self.tracked_strategies,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Strategy effectiveness
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct StrategyEffectiveness {
|
||||
pub strategy_name: String,
|
||||
pub weight: f32,
|
||||
pub confidence: f32,
|
||||
pub success_rate: f32,
|
||||
pub avg_improvement: f32,
|
||||
pub recent_outcomes: usize,
|
||||
}
|
||||
|
||||
/// Effectiveness report
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct EffectivenessReport {
|
||||
pub strategies: Vec<StrategyEffectiveness>,
|
||||
pub overall_success_rate: f32,
|
||||
pub avg_time_to_recovery_ms: u64,
|
||||
pub total_outcomes: usize,
|
||||
}
|
||||
|
||||
impl EffectivenessReport {
|
||||
/// Convert to JSON
|
||||
pub fn to_json(&self) -> serde_json::Value {
|
||||
serde_json::json!({
|
||||
"strategies": self.strategies,
|
||||
"overall_success_rate": self.overall_success_rate,
|
||||
"avg_time_to_recovery_ms": self.avg_time_to_recovery_ms,
|
||||
"total_outcomes": self.total_outcomes,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
fn create_problem() -> Problem {
|
||||
Problem::new(ProblemType::IndexDegradation, Severity::Medium)
|
||||
}
|
||||
|
||||
fn create_result(success: bool, improvement: f32) -> RemediationResult {
|
||||
if success {
|
||||
RemediationResult::success(1, improvement).with_duration(1000)
|
||||
} else {
|
||||
RemediationResult::failure("test error").with_duration(500)
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_record_outcome() {
|
||||
let tracker = OutcomeTracker::new();
|
||||
let problem = create_problem();
|
||||
let result = create_result(true, 15.0);
|
||||
|
||||
tracker.record(&problem, "test_strategy", &result, true);
|
||||
|
||||
let recent = tracker.get_recent(10);
|
||||
assert_eq!(recent.len(), 1);
|
||||
assert!(recent[0].success);
|
||||
assert!(recent[0].verified);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_weight_updates() {
|
||||
let tracker = OutcomeTracker::new();
|
||||
let problem = create_problem();
|
||||
|
||||
// Record successes
|
||||
for _ in 0..5 {
|
||||
let result = create_result(true, 20.0);
|
||||
tracker.record(&problem, "test_strategy", &result, true);
|
||||
}
|
||||
|
||||
let weight = tracker.get_weight("test_strategy").unwrap();
|
||||
assert_eq!(weight.observations, 5);
|
||||
assert_eq!(weight.successes, 5);
|
||||
assert!(weight.weight > 1.0); // Should be elevated
|
||||
assert!(weight.confidence > 0.3); // Should have some confidence
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_mixed_outcomes() {
|
||||
let tracker = OutcomeTracker::new();
|
||||
let problem = create_problem();
|
||||
|
||||
// 3 successes
|
||||
for _ in 0..3 {
|
||||
let result = create_result(true, 10.0);
|
||||
tracker.record(&problem, "test_strategy", &result, true);
|
||||
}
|
||||
|
||||
// 2 failures
|
||||
for _ in 0..2 {
|
||||
let result = create_result(false, 0.0);
|
||||
tracker.record(&problem, "test_strategy", &result, false);
|
||||
}
|
||||
|
||||
let weight = tracker.get_weight("test_strategy").unwrap();
|
||||
assert_eq!(weight.observations, 5);
|
||||
assert_eq!(weight.successes, 3);
|
||||
assert!((weight.success_rate() - 0.6).abs() < 0.01);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_get_for_strategy() {
|
||||
let tracker = OutcomeTracker::new();
|
||||
let problem = create_problem();
|
||||
|
||||
// Record for different strategies
|
||||
tracker.record(&problem, "strategy_a", &create_result(true, 10.0), true);
|
||||
tracker.record(&problem, "strategy_b", &create_result(true, 15.0), true);
|
||||
tracker.record(&problem, "strategy_a", &create_result(true, 20.0), true);
|
||||
|
||||
let a_outcomes = tracker.get_for_strategy("strategy_a", 10);
|
||||
assert_eq!(a_outcomes.len(), 2);
|
||||
|
||||
let b_outcomes = tracker.get_for_strategy("strategy_b", 10);
|
||||
assert_eq!(b_outcomes.len(), 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_feedback() {
|
||||
let tracker = OutcomeTracker::new();
|
||||
let problem = create_problem();
|
||||
|
||||
tracker.record(&problem, "test_strategy", &create_result(true, 10.0), true);
|
||||
|
||||
let recent = tracker.get_recent(1);
|
||||
let id = recent[0].id;
|
||||
|
||||
assert!(tracker.add_feedback(id, 0.9));
|
||||
|
||||
let updated = tracker.get_recent(1);
|
||||
assert_eq!(updated[0].feedback_score, Some(0.9));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_max_history() {
|
||||
let tracker = OutcomeTracker::with_max_history(5);
|
||||
let problem = create_problem();
|
||||
|
||||
// Record 10 outcomes
|
||||
for i in 0..10 {
|
||||
tracker.record(
|
||||
&problem,
|
||||
"test_strategy",
|
||||
&create_result(true, i as f32),
|
||||
true,
|
||||
);
|
||||
}
|
||||
|
||||
let history = tracker.get_recent(100);
|
||||
assert_eq!(history.len(), 5); // Should be capped at 5
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_effectiveness_report() {
|
||||
let tracker = OutcomeTracker::new();
|
||||
let problem = create_problem();
|
||||
|
||||
for _ in 0..5 {
|
||||
tracker.record(&problem, "strategy_a", &create_result(true, 15.0), true);
|
||||
}
|
||||
for _ in 0..5 {
|
||||
tracker.record(&problem, "strategy_b", &create_result(true, 25.0), true);
|
||||
}
|
||||
|
||||
let report = tracker.effectiveness_report();
|
||||
assert_eq!(report.strategies.len(), 2);
|
||||
assert_eq!(report.total_outcomes, 10);
|
||||
assert_eq!(report.overall_success_rate, 1.0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_strategy_weight_confidence() {
|
||||
let mut weight = StrategyWeight::new("test");
|
||||
|
||||
// Few observations = low confidence
|
||||
weight.update(true, 10.0, 1000);
|
||||
assert!(weight.confidence < 0.5);
|
||||
|
||||
// More observations = higher confidence
|
||||
for _ in 0..20 {
|
||||
weight.update(true, 10.0, 1000);
|
||||
}
|
||||
assert!(weight.confidence > 0.5);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_tracker_stats() {
|
||||
let tracker = OutcomeTracker::new();
|
||||
let problem = create_problem();
|
||||
|
||||
tracker.record(&problem, "strategy_a", &create_result(true, 10.0), true);
|
||||
tracker.record(&problem, "strategy_b", &create_result(false, 0.0), false);
|
||||
|
||||
let stats = tracker.get_stats();
|
||||
assert_eq!(stats.total_outcomes, 2);
|
||||
assert_eq!(stats.successful_outcomes, 1);
|
||||
assert_eq!(stats.success_rate, 0.5);
|
||||
}
|
||||
}
|
||||
233
vendor/ruvector/crates/ruvector-postgres/src/healing/mod.rs
vendored
Normal file
233
vendor/ruvector/crates/ruvector-postgres/src/healing/mod.rs
vendored
Normal file
@@ -0,0 +1,233 @@
|
||||
//! Self-Healing Engine for RuVector Postgres v2
|
||||
//!
|
||||
//! This module provides automated problem detection and remediation capabilities:
|
||||
//! - **Problem Detection**: Monitors system health and detects issues
|
||||
//! - **Remediation Strategies**: Defines actions to fix detected problems
|
||||
//! - **Remediation Engine**: Orchestrates strategy execution with rollback
|
||||
//! - **Learning System**: Tracks outcomes and improves strategy selection
|
||||
//! - **Background Worker**: Continuous health monitoring
|
||||
//!
|
||||
//! # Architecture
|
||||
//!
|
||||
//! ```text
|
||||
//! +------------------------------------------------------------------+
|
||||
//! | Integrity Monitor |
|
||||
//! | - Detects state transitions (normal -> stress -> critical) |
|
||||
//! +------------------------------------------------------------------+
|
||||
//! |
|
||||
//! v
|
||||
//! +------------------------------------------------------------------+
|
||||
//! | Problem Detector |
|
||||
//! | - Classifies problem types from witness edges |
|
||||
//! +------------------------------------------------------------------+
|
||||
//! |
|
||||
//! v
|
||||
//! +------------------------------------------------------------------+
|
||||
//! | Remediation Engine |
|
||||
//! | - Selects strategy, executes with timeout/rollback |
|
||||
//! +------------------------------------------------------------------+
|
||||
//! |
|
||||
//! v
|
||||
//! +------------------------------------------------------------------+
|
||||
//! | Learning System |
|
||||
//! | - Records outcomes, updates strategy weights |
|
||||
//! +------------------------------------------------------------------+
|
||||
//! ```
|
||||
|
||||
pub mod detector;
|
||||
pub mod engine;
|
||||
pub mod functions;
|
||||
pub mod learning;
|
||||
pub mod strategies;
|
||||
pub mod worker;
|
||||
|
||||
pub use detector::{Problem, ProblemDetector, ProblemType, SystemMetrics};
|
||||
pub use engine::{HealingConfig, HealingOutcome, RemediationContext, RemediationEngine};
|
||||
pub use learning::{OutcomeRecord, OutcomeTracker, StrategyWeight};
|
||||
pub use strategies::{
|
||||
IntegrityRecovery, PromoteReplica, QueryCircuitBreaker, ReindexPartition, RemediationOutcome,
|
||||
RemediationResult, RemediationStrategy, StrategyRegistry, TierEviction,
|
||||
};
|
||||
pub use worker::{HealingWorker, HealingWorkerConfig, HealingWorkerState};
|
||||
|
||||
use parking_lot::RwLock;
|
||||
use std::sync::Arc;
|
||||
|
||||
/// Global healing engine instance
|
||||
static HEALING_ENGINE: std::sync::OnceLock<Arc<RwLock<HealingEngine>>> = std::sync::OnceLock::new();
|
||||
|
||||
/// Get or initialize the global healing engine
|
||||
pub fn get_healing_engine() -> Arc<RwLock<HealingEngine>> {
|
||||
HEALING_ENGINE
|
||||
.get_or_init(|| Arc::new(RwLock::new(HealingEngine::new())))
|
||||
.clone()
|
||||
}
|
||||
|
||||
/// Main healing engine combining all components
///
/// Aggregates detection, remediation, learning, and background-worker state
/// behind one struct so the SQL-facing functions can share a single instance
/// through the global `RwLock` (see `get_healing_engine`). Fields are public
/// because those entry points access them directly.
pub struct HealingEngine {
    /// Problem detector (collects metrics, classifies problems)
    pub detector: ProblemDetector,
    /// Remediation engine (selects strategies and executes them)
    pub remediation: RemediationEngine,
    /// Outcome tracker for learning (a clone is also held by `remediation`)
    pub tracker: OutcomeTracker,
    /// Background worker state, shared via `Arc` with the worker thread
    pub worker_state: Arc<HealingWorkerState>,
    /// Configuration
    pub config: HealingConfig,
    /// Whether healing is enabled; `trigger_healing` returns `None` when false
    pub enabled: bool,
}
|
||||
|
||||
impl HealingEngine {
|
||||
/// Create a new healing engine with default configuration
|
||||
pub fn new() -> Self {
|
||||
let config = HealingConfig::default();
|
||||
let tracker = OutcomeTracker::new();
|
||||
let registry = StrategyRegistry::new_with_defaults();
|
||||
|
||||
Self {
|
||||
detector: ProblemDetector::new(),
|
||||
remediation: RemediationEngine::new(registry, config.clone(), tracker.clone()),
|
||||
tracker,
|
||||
worker_state: Arc::new(HealingWorkerState::new(HealingWorkerConfig::default())),
|
||||
config,
|
||||
enabled: true,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create with custom configuration
|
||||
pub fn with_config(config: HealingConfig) -> Self {
|
||||
let tracker = OutcomeTracker::new();
|
||||
let registry = StrategyRegistry::new_with_defaults();
|
||||
|
||||
Self {
|
||||
detector: ProblemDetector::new(),
|
||||
remediation: RemediationEngine::new(registry, config.clone(), tracker.clone()),
|
||||
tracker,
|
||||
worker_state: Arc::new(HealingWorkerState::new(HealingWorkerConfig::default())),
|
||||
config,
|
||||
enabled: true,
|
||||
}
|
||||
}
|
||||
|
||||
/// Check system health and return current status
|
||||
pub fn health_status(&self) -> HealthStatus {
|
||||
let metrics = self.detector.collect_metrics();
|
||||
let problems = self.detector.detect_problems(&metrics);
|
||||
let active_remediations = self.remediation.active_remediations();
|
||||
|
||||
HealthStatus {
|
||||
healthy: problems.is_empty() && active_remediations.is_empty(),
|
||||
problem_count: problems.len(),
|
||||
active_remediation_count: active_remediations.len(),
|
||||
problems,
|
||||
metrics,
|
||||
enabled: self.enabled,
|
||||
last_check: std::time::SystemTime::now()
|
||||
.duration_since(std::time::UNIX_EPOCH)
|
||||
.unwrap()
|
||||
.as_secs(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Enable or disable healing
|
||||
pub fn set_enabled(&mut self, enabled: bool) {
|
||||
self.enabled = enabled;
|
||||
}
|
||||
|
||||
/// Update configuration
|
||||
pub fn update_config(&mut self, config: HealingConfig) {
|
||||
self.config = config.clone();
|
||||
self.remediation.update_config(config);
|
||||
}
|
||||
|
||||
/// Trigger manual healing for a specific problem type
|
||||
pub fn trigger_healing(&self, problem_type: ProblemType) -> Option<HealingOutcome> {
|
||||
if !self.enabled {
|
||||
return None;
|
||||
}
|
||||
|
||||
let problem = Problem {
|
||||
problem_type,
|
||||
severity: detector::Severity::Medium,
|
||||
detected_at: std::time::SystemTime::now(),
|
||||
details: serde_json::json!({"source": "manual_trigger"}),
|
||||
affected_partitions: vec![],
|
||||
};
|
||||
|
||||
Some(self.remediation.heal(&problem))
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for HealingEngine {
    /// Equivalent to [`HealingEngine::new`] (default configuration, enabled).
    fn default() -> Self {
        Self::new()
    }
}
|
||||
|
||||
/// Health status summary
///
/// Snapshot produced by `HealingEngine::health_status`; counts are redundant
/// with the embedded lists but kept so JSON consumers avoid re-counting.
#[derive(Debug, Clone)]
pub struct HealthStatus {
    /// Whether the system is healthy (no problems AND no active remediations)
    pub healthy: bool,
    /// Number of detected problems (equals `problems.len()`)
    pub problem_count: usize,
    /// Number of remediations currently in progress
    pub active_remediation_count: usize,
    /// List of detected problems
    pub problems: Vec<Problem>,
    /// Current system metrics at the time of the check
    pub metrics: SystemMetrics,
    /// Whether healing is enabled
    pub enabled: bool,
    /// Timestamp of last health check (Unix seconds)
    pub last_check: u64,
}
|
||||
|
||||
impl HealthStatus {
|
||||
/// Convert to JSON for SQL function output
|
||||
pub fn to_json(&self) -> serde_json::Value {
|
||||
serde_json::json!({
|
||||
"healthy": self.healthy,
|
||||
"problem_count": self.problem_count,
|
||||
"active_remediation_count": self.active_remediation_count,
|
||||
"problems": self.problems.iter().map(|p| p.to_json()).collect::<Vec<_>>(),
|
||||
"metrics": self.metrics.to_json(),
|
||||
"enabled": self.enabled,
|
||||
"last_check": self.last_check,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // A freshly constructed engine is enabled and reports healthy.
    #[test]
    fn test_healing_engine_creation() {
        let engine = HealingEngine::new();
        assert!(engine.enabled);

        let status = engine.health_status();
        assert!(status.healthy);
    }

    // The enabled flag toggles through set_enabled.
    #[test]
    fn test_healing_enable_disable() {
        let mut engine = HealingEngine::new();

        engine.set_enabled(false);
        assert!(!engine.enabled);

        engine.set_enabled(true);
        assert!(engine.enabled);
    }

    // Both calls must hand back the same process-global Arc (OnceLock
    // guarantees single initialization).
    #[test]
    fn test_global_instance() {
        let engine1 = get_healing_engine();
        let engine2 = get_healing_engine();
        assert!(Arc::ptr_eq(&engine1, &engine2));
    }
}
|
||||
1165
vendor/ruvector/crates/ruvector-postgres/src/healing/strategies.rs
vendored
Normal file
1165
vendor/ruvector/crates/ruvector-postgres/src/healing/strategies.rs
vendored
Normal file
File diff suppressed because it is too large
Load Diff
618
vendor/ruvector/crates/ruvector-postgres/src/healing/worker.rs
vendored
Normal file
618
vendor/ruvector/crates/ruvector-postgres/src/healing/worker.rs
vendored
Normal file
@@ -0,0 +1,618 @@
|
||||
//! Background Worker for Self-Healing Engine
|
||||
//!
|
||||
//! Provides continuous health monitoring and async remediation:
|
||||
//! - Periodic health checks
|
||||
//! - Automatic problem detection
|
||||
//! - Async remediation execution
|
||||
//! - Integration with integrity control plane
|
||||
|
||||
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, SystemTime, UNIX_EPOCH};
|
||||
|
||||
use parking_lot::RwLock;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::detector::ProblemDetector;
|
||||
use super::engine::HealingOutcome;
|
||||
use super::get_healing_engine;
|
||||
|
||||
// ============================================================================
|
||||
// Worker Configuration
|
||||
// ============================================================================
|
||||
|
||||
/// Configuration for the healing background worker
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealingWorkerConfig {
    /// Health check interval (how long the worker sleeps between checks)
    pub check_interval: Duration,
    /// Whether to auto-remediate detected problems
    pub auto_remediate: bool,
    /// Minimum severity to auto-remediate
    pub min_auto_severity: u8, // 0=Info, 1=Low, 2=Medium, 3=High, 4=Critical
    /// Maximum concurrent remediations
    // NOTE(review): this limit is not read by HealingWorker::check_health in
    // this file — confirm where (or whether) it is enforced.
    pub max_concurrent: usize,
    /// Whether to log health status on each check
    pub log_status: bool,
    /// Enable metrics collection (attaches metrics JSON to each check result)
    pub collect_metrics: bool,
}
|
||||
|
||||
impl Default for HealingWorkerConfig {
    /// Conservative defaults: check once a minute, auto-remediate Medium and
    /// above, at most two concurrent remediations, with logging and metrics on.
    fn default() -> Self {
        Self {
            check_interval: Duration::from_secs(60),
            auto_remediate: true,
            min_auto_severity: 2, // Medium and above
            max_concurrent: 2,
            log_status: true,
            collect_metrics: true,
        }
    }
}
|
||||
|
||||
// ============================================================================
|
||||
// Worker State
|
||||
// ============================================================================
|
||||
|
||||
/// State of the healing background worker
///
/// Shared (via `Arc`) between the worker loop and the SQL control functions;
/// counters are atomics and the config/history sit behind `parking_lot`
/// locks, so all methods take `&self`.
pub struct HealingWorkerState {
    /// Configuration (hot-swappable via `set_config`)
    config: RwLock<HealingWorkerConfig>,
    /// Whether worker is running (the run loop polls this flag)
    running: AtomicBool,
    /// Last health check timestamp (Unix seconds)
    last_check: AtomicU64,
    /// Total health checks performed
    checks_completed: AtomicU64,
    /// Total problems detected (cumulative across checks)
    problems_detected: AtomicU64,
    /// Total remediations triggered (cumulative across checks)
    remediations_triggered: AtomicU64,
    /// Recent health statuses (bounded history, oldest first)
    recent_statuses: RwLock<Vec<HealthCheckResult>>,
}
|
||||
|
||||
impl HealingWorkerState {
|
||||
/// Create new worker state
|
||||
pub fn new(config: HealingWorkerConfig) -> Self {
|
||||
Self {
|
||||
config: RwLock::new(config),
|
||||
running: AtomicBool::new(false),
|
||||
last_check: AtomicU64::new(0),
|
||||
checks_completed: AtomicU64::new(0),
|
||||
problems_detected: AtomicU64::new(0),
|
||||
remediations_triggered: AtomicU64::new(0),
|
||||
recent_statuses: RwLock::new(Vec::new()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Check if worker is running
|
||||
pub fn is_running(&self) -> bool {
|
||||
self.running.load(Ordering::SeqCst)
|
||||
}
|
||||
|
||||
/// Start worker
|
||||
pub fn start(&self) {
|
||||
self.running.store(true, Ordering::SeqCst);
|
||||
}
|
||||
|
||||
/// Stop worker
|
||||
pub fn stop(&self) {
|
||||
self.running.store(false, Ordering::SeqCst);
|
||||
}
|
||||
|
||||
/// Get configuration
|
||||
pub fn get_config(&self) -> HealingWorkerConfig {
|
||||
self.config.read().clone()
|
||||
}
|
||||
|
||||
/// Update configuration
|
||||
pub fn set_config(&self, config: HealingWorkerConfig) {
|
||||
*self.config.write() = config;
|
||||
}
|
||||
|
||||
/// Record a health check
|
||||
pub fn record_check(&self, result: HealthCheckResult) {
|
||||
let now = SystemTime::now()
|
||||
.duration_since(UNIX_EPOCH)
|
||||
.unwrap()
|
||||
.as_secs();
|
||||
|
||||
self.last_check.store(now, Ordering::SeqCst);
|
||||
self.checks_completed.fetch_add(1, Ordering::SeqCst);
|
||||
self.problems_detected
|
||||
.fetch_add(result.problems_found as u64, Ordering::SeqCst);
|
||||
self.remediations_triggered
|
||||
.fetch_add(result.remediations_triggered as u64, Ordering::SeqCst);
|
||||
|
||||
// Keep last 100 statuses
|
||||
let mut statuses = self.recent_statuses.write();
|
||||
statuses.push(result);
|
||||
while statuses.len() > 100 {
|
||||
statuses.remove(0);
|
||||
}
|
||||
}
|
||||
|
||||
/// Get worker statistics
|
||||
pub fn get_stats(&self) -> WorkerStats {
|
||||
WorkerStats {
|
||||
running: self.running.load(Ordering::SeqCst),
|
||||
last_check: self.last_check.load(Ordering::SeqCst),
|
||||
checks_completed: self.checks_completed.load(Ordering::SeqCst),
|
||||
problems_detected: self.problems_detected.load(Ordering::SeqCst),
|
||||
remediations_triggered: self.remediations_triggered.load(Ordering::SeqCst),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get recent health check results
|
||||
pub fn get_recent_checks(&self, limit: usize) -> Vec<HealthCheckResult> {
|
||||
let statuses = self.recent_statuses.read();
|
||||
statuses.iter().rev().take(limit).cloned().collect()
|
||||
}
|
||||
}
|
||||
|
||||
/// Worker statistics
///
/// Point-in-time snapshot of the worker's atomic counters, produced by
/// `HealingWorkerState::get_stats`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WorkerStats {
    /// Whether the worker run flag is set
    pub running: bool,
    /// Unix timestamp (seconds) of the most recent health check
    pub last_check: u64,
    /// Total health checks performed since startup
    pub checks_completed: u64,
    /// Cumulative problems detected across all checks
    pub problems_detected: u64,
    /// Cumulative remediations triggered across all checks
    pub remediations_triggered: u64,
}
|
||||
|
||||
impl WorkerStats {
|
||||
/// Convert to JSON
|
||||
pub fn to_json(&self) -> serde_json::Value {
|
||||
serde_json::json!({
|
||||
"running": self.running,
|
||||
"last_check": self.last_check,
|
||||
"checks_completed": self.checks_completed,
|
||||
"problems_detected": self.problems_detected,
|
||||
"remediations_triggered": self.remediations_triggered,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Health Check Result
|
||||
// ============================================================================
|
||||
|
||||
/// Result of a health check
///
/// One entry in the worker's bounded history; also returned by the manual
/// check SQL function.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthCheckResult {
    /// Timestamp of check (Unix seconds)
    pub timestamp: u64,
    /// Whether system is healthy (no problems found during this check)
    pub healthy: bool,
    /// Number of problems found
    pub problems_found: usize,
    /// Number of remediations triggered
    pub remediations_triggered: usize,
    /// Remediation outcomes (JSON form of each `HealingOutcome`)
    pub outcomes: Vec<serde_json::Value>,
    /// Metrics collected (`None` when metrics collection is disabled)
    pub metrics: Option<serde_json::Value>,
    /// Duration of check in milliseconds
    pub duration_ms: u64,
}
|
||||
|
||||
impl HealthCheckResult {
|
||||
/// Create a healthy result
|
||||
pub fn healthy() -> Self {
|
||||
Self {
|
||||
timestamp: SystemTime::now()
|
||||
.duration_since(UNIX_EPOCH)
|
||||
.unwrap()
|
||||
.as_secs(),
|
||||
healthy: true,
|
||||
problems_found: 0,
|
||||
remediations_triggered: 0,
|
||||
outcomes: vec![],
|
||||
metrics: None,
|
||||
duration_ms: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert to JSON
|
||||
pub fn to_json(&self) -> serde_json::Value {
|
||||
serde_json::json!({
|
||||
"timestamp": self.timestamp,
|
||||
"healthy": self.healthy,
|
||||
"problems_found": self.problems_found,
|
||||
"remediations_triggered": self.remediations_triggered,
|
||||
"outcomes": self.outcomes,
|
||||
"duration_ms": self.duration_ms,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Healing Worker
|
||||
// ============================================================================
|
||||
|
||||
/// Background worker for continuous health monitoring
///
/// Owns its own `ProblemDetector`; state may be private or shared with the
/// global engine (see `with_state`).
pub struct HealingWorker {
    /// Worker state (run flag, counters, bounded history)
    state: Arc<HealingWorkerState>,
    /// Problem detector used by each check cycle
    detector: ProblemDetector,
}
|
||||
|
||||
impl HealingWorker {
    /// Create new healing worker with its own private state.
    pub fn new(config: HealingWorkerConfig) -> Self {
        Self {
            state: Arc::new(HealingWorkerState::new(config)),
            detector: ProblemDetector::new(),
        }
    }

    /// Create with shared state (e.g. the state held by the global engine),
    /// so external callers can observe and control this worker.
    pub fn with_state(state: Arc<HealingWorkerState>) -> Self {
        Self {
            state,
            detector: ProblemDetector::new(),
        }
    }

    /// Get worker state.
    pub fn state(&self) -> &Arc<HealingWorkerState> {
        &self.state
    }

    /// Perform one health check cycle: collect metrics, detect problems,
    /// optionally auto-remediate, and record the result in the history.
    pub fn check_health(&self) -> HealthCheckResult {
        let start = std::time::Instant::now();
        let config = self.state.get_config();

        // Collect metrics
        let metrics = self.detector.collect_metrics();

        // Detect problems
        let problems = self.detector.detect_problems(&metrics);
        let problems_found = problems.len();

        if config.log_status {
            if problems_found > 0 {
                pgrx::log!("Healing worker: {} problems detected", problems_found);
            } else {
                pgrx::debug1!("Healing worker: no problems detected");
            }
        }

        let mut remediations_triggered = 0;
        let mut outcomes = Vec::new();

        // Auto-remediate if enabled
        if config.auto_remediate && problems_found > 0 {
            // NOTE(review): remediations run sequentially while holding the
            // global engine's read lock for the whole loop, and
            // config.max_concurrent is not consulted here — confirm intended.
            let engine = get_healing_engine();
            let engine_lock = engine.read();

            for problem in &problems {
                // Check severity threshold; lower-severity problems are
                // reported but left alone.
                if problem.severity.value() < config.min_auto_severity {
                    continue;
                }

                // Attempt remediation
                let outcome = engine_lock.remediation.heal(problem);
                outcomes.push(outcome.to_json());

                // Only fully completed remediations count toward the total.
                if matches!(outcome, HealingOutcome::Completed { .. }) {
                    remediations_triggered += 1;
                }
            }
        }

        let duration_ms = start.elapsed().as_millis() as u64;

        let result = HealthCheckResult {
            timestamp: SystemTime::now()
                .duration_since(UNIX_EPOCH)
                .unwrap()
                .as_secs(),
            healthy: problems_found == 0,
            problems_found,
            remediations_triggered,
            outcomes,
            metrics: if config.collect_metrics {
                Some(metrics.to_json())
            } else {
                None
            },
            duration_ms,
        };

        // Persist into the shared state so SQL functions can report it.
        self.state.record_check(result.clone());

        result
    }

    /// Run the worker loop (blocking). Exits when the shared run flag is
    /// cleared via `stop()` (possibly from another handle to the same state).
    pub fn run(&self) {
        self.state.start();
        pgrx::log!("Healing background worker started");

        while self.state.is_running() {
            // Perform health check
            let _result = self.check_health();

            // Sleep until next check; config is re-read each cycle so
            // interval changes take effect without a restart.
            let interval = self.state.get_config().check_interval;

            // Use PostgreSQL's WaitLatch for interruptible sleep
            self.wait_for_interval(interval);
        }

        pgrx::log!("Healing background worker stopped");
    }

    /// Wait for interval with interruption support.
    ///
    /// NOTE(review): thread::sleep is NOT interruptible — a `stop()` issued
    /// mid-sleep is only observed after the full interval elapses.
    fn wait_for_interval(&self, interval: Duration) {
        // Use simple thread sleep which works in all contexts.
        // In production as a full background worker, one would use
        // PostgreSQL's WaitLatch for interruptible sleep.
        std::thread::sleep(interval);
    }

    /// Stop the worker (clears the run flag; the loop exits after its
    /// current sleep completes).
    pub fn stop(&self) {
        self.state.stop();
    }
}
|
||||
|
||||
// ============================================================================
|
||||
// Background Worker Entry Point
|
||||
// ============================================================================
|
||||
|
||||
/// PostgreSQL background worker entry point
///
/// Runs the blocking worker loop with default configuration; `pg_guard`
/// converts Rust panics into PostgreSQL error handling.
#[pgrx::pg_guard]
pub extern "C" fn healing_bgworker_main(_arg: pgrx::pg_sys::Datum) {
    pgrx::log!("RuVector healing background worker starting");

    let config = HealingWorkerConfig::default();
    let worker = HealingWorker::new(config);

    // Blocks until the worker's run flag is cleared.
    worker.run();
}
|
||||
|
||||
/// Register the background worker with PostgreSQL
///
/// Currently a logging placeholder: no worker is actually registered with
/// the postmaster yet (see the commented-out sketch below).
pub fn register_healing_worker() {
    pgrx::log!("Registering RuVector healing background worker");

    // In production, use pg_sys::RegisterBackgroundWorker
    // This is a placeholder for now
    //
    // unsafe {
    //     let mut worker = pg_sys::BackgroundWorker::default();
    //     // Configure worker...
    //     pg_sys::RegisterBackgroundWorker(&mut worker);
    // }
}
|
||||
|
||||
// ============================================================================
|
||||
// SQL Functions for Worker Control
|
||||
// ============================================================================
|
||||
|
||||
use pgrx::prelude::*;
|
||||
|
||||
/// Start the healing background worker
|
||||
#[pg_extern]
|
||||
pub fn ruvector_healing_worker_start() -> bool {
|
||||
let engine = get_healing_engine();
|
||||
let engine_lock = engine.read();
|
||||
|
||||
if engine_lock.worker_state.is_running() {
|
||||
pgrx::warning!("Healing worker is already running");
|
||||
return false;
|
||||
}
|
||||
|
||||
// In production, would launch actual background worker
|
||||
engine_lock.worker_state.start();
|
||||
pgrx::log!("Healing worker started");
|
||||
true
|
||||
}
|
||||
|
||||
/// Stop the healing background worker
|
||||
#[pg_extern]
|
||||
pub fn ruvector_healing_worker_stop() -> bool {
|
||||
let engine = get_healing_engine();
|
||||
let engine_lock = engine.read();
|
||||
|
||||
if !engine_lock.worker_state.is_running() {
|
||||
pgrx::warning!("Healing worker is not running");
|
||||
return false;
|
||||
}
|
||||
|
||||
engine_lock.worker_state.stop();
|
||||
pgrx::log!("Healing worker stopped");
|
||||
true
|
||||
}
|
||||
|
||||
/// Get healing worker status
|
||||
#[pg_extern]
|
||||
pub fn ruvector_healing_worker_status() -> pgrx::JsonB {
|
||||
let engine = get_healing_engine();
|
||||
let engine_lock = engine.read();
|
||||
|
||||
let stats = engine_lock.worker_state.get_stats();
|
||||
let config = engine_lock.worker_state.get_config();
|
||||
|
||||
let status = serde_json::json!({
|
||||
"stats": stats.to_json(),
|
||||
"config": {
|
||||
"check_interval_secs": config.check_interval.as_secs(),
|
||||
"auto_remediate": config.auto_remediate,
|
||||
"min_auto_severity": config.min_auto_severity,
|
||||
"max_concurrent": config.max_concurrent,
|
||||
}
|
||||
});
|
||||
|
||||
pgrx::JsonB(status)
|
||||
}
|
||||
|
||||
/// Configure the healing worker
|
||||
#[pg_extern]
|
||||
pub fn ruvector_healing_worker_config(
|
||||
check_interval_secs: Option<i32>,
|
||||
auto_remediate: Option<bool>,
|
||||
min_auto_severity: Option<i32>,
|
||||
) -> pgrx::JsonB {
|
||||
let engine = get_healing_engine();
|
||||
let engine_lock = engine.read();
|
||||
|
||||
let mut config = engine_lock.worker_state.get_config();
|
||||
|
||||
if let Some(interval) = check_interval_secs {
|
||||
if interval > 0 {
|
||||
config.check_interval = Duration::from_secs(interval as u64);
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(auto_rem) = auto_remediate {
|
||||
config.auto_remediate = auto_rem;
|
||||
}
|
||||
|
||||
if let Some(severity) = min_auto_severity {
|
||||
if severity >= 0 && severity <= 4 {
|
||||
config.min_auto_severity = severity as u8;
|
||||
}
|
||||
}
|
||||
|
||||
engine_lock.worker_state.set_config(config.clone());
|
||||
|
||||
pgrx::JsonB(serde_json::json!({
|
||||
"status": "updated",
|
||||
"config": {
|
||||
"check_interval_secs": config.check_interval.as_secs(),
|
||||
"auto_remediate": config.auto_remediate,
|
||||
"min_auto_severity": config.min_auto_severity,
|
||||
}
|
||||
}))
|
||||
}
|
||||
|
||||
/// Manually trigger a health check
|
||||
#[pg_extern]
|
||||
pub fn ruvector_healing_check_now() -> pgrx::JsonB {
|
||||
let engine = get_healing_engine();
|
||||
let engine_lock = engine.read();
|
||||
|
||||
let detector = ProblemDetector::new();
|
||||
let start = std::time::Instant::now();
|
||||
|
||||
let metrics = detector.collect_metrics();
|
||||
let problems = detector.detect_problems(&metrics);
|
||||
|
||||
let mut outcomes = Vec::new();
|
||||
for problem in &problems {
|
||||
let outcome = engine_lock.remediation.heal(problem);
|
||||
outcomes.push(outcome.to_json());
|
||||
}
|
||||
|
||||
let result = serde_json::json!({
|
||||
"healthy": problems.is_empty(),
|
||||
"problems_found": problems.len(),
|
||||
"problems": problems.iter().map(|p| p.to_json()).collect::<Vec<_>>(),
|
||||
"outcomes": outcomes,
|
||||
"metrics": metrics.to_json(),
|
||||
"duration_ms": start.elapsed().as_millis() as u64,
|
||||
});
|
||||
|
||||
pgrx::JsonB(result)
|
||||
}
|
||||
|
||||
/// Get recent health check results
|
||||
#[pg_extern]
|
||||
pub fn ruvector_healing_recent_checks(limit: default!(i32, 10)) -> pgrx::JsonB {
|
||||
let engine = get_healing_engine();
|
||||
let engine_lock = engine.read();
|
||||
|
||||
let checks = engine_lock.worker_state.get_recent_checks(limit as usize);
|
||||
|
||||
pgrx::JsonB(serde_json::json!({
|
||||
"checks": checks.iter().map(|c| c.to_json()).collect::<Vec<_>>(),
|
||||
"count": checks.len(),
|
||||
}))
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // The run flag toggles through start/stop and begins cleared.
    #[test]
    fn test_worker_state() {
        let state = HealingWorkerState::new(HealingWorkerConfig::default());

        assert!(!state.is_running());
        state.start();
        assert!(state.is_running());
        state.stop();
        assert!(!state.is_running());
    }

    // The healthy() constructor yields a clean, zeroed result.
    #[test]
    fn test_health_check_result() {
        let result = HealthCheckResult::healthy();
        assert!(result.healthy);
        assert_eq!(result.problems_found, 0);
    }

    // Default config auto-remediates at Medium (2) and above.
    #[test]
    fn test_worker_config() {
        let config = HealingWorkerConfig::default();
        assert!(config.auto_remediate);
        assert_eq!(config.min_auto_severity, 2);
    }

    // record_check accumulates per-check counts into the atomic totals.
    #[test]
    fn test_state_recording() {
        let state = HealingWorkerState::new(HealingWorkerConfig::default());

        let result = HealthCheckResult {
            timestamp: 12345,
            healthy: false,
            problems_found: 2,
            remediations_triggered: 1,
            outcomes: vec![],
            metrics: None,
            duration_ms: 100,
        };

        state.record_check(result);

        let stats = state.get_stats();
        assert_eq!(stats.checks_completed, 1);
        assert_eq!(stats.problems_detected, 2);
        assert_eq!(stats.remediations_triggered, 1);
    }

    // get_recent_checks returns at most `limit` entries, newest first.
    #[test]
    fn test_recent_checks() {
        let state = HealingWorkerState::new(HealingWorkerConfig::default());

        for i in 0..5 {
            state.record_check(HealthCheckResult {
                timestamp: i,
                healthy: true,
                problems_found: 0,
                remediations_triggered: 0,
                outcomes: vec![],
                metrics: None,
                duration_ms: 10,
            });
        }

        let recent = state.get_recent_checks(3);
        assert_eq!(recent.len(), 3);
        // Most recent first
        assert_eq!(recent[0].timestamp, 4);
    }

    // A fresh worker starts in the stopped state.
    #[test]
    fn test_worker_creation() {
        let worker = HealingWorker::new(HealingWorkerConfig::default());
        assert!(!worker.state().is_running());
    }
}
|
||||
Reference in New Issue
Block a user