//! Time series processing and vectorization for RuVector

use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};

use crate::ClimateObservation;

/// A vectorized time series for RuVector storage
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TimeSeriesVector {
    /// Series identifier
    pub id: String,
    /// Station/source ID
    pub station_id: String,
    /// Start time
    pub start_time: DateTime<Utc>,
    /// End time
    pub end_time: DateTime<Utc>,
    /// Temporal resolution (seconds)
    pub resolution_secs: i64,
    /// Feature vector for similarity search
    pub embedding: Vec<f32>,
    /// Statistical summary
    pub stats: SeriesStats,
    /// Raw values (optional, for debugging)
    pub raw_values: Option<Vec<f64>>,
}

/// Statistical summary of a time series
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SeriesStats {
    /// Number of observations
    pub count: usize,
    /// Mean value
    pub mean: f64,
    /// Standard deviation
    pub std_dev: f64,
    /// Minimum value
    pub min: f64,
    /// Maximum value
    pub max: f64,
    /// Trend (linear slope)
    pub trend: f64,
    /// Variance ratio (for stationarity check)
    pub variance_ratio: f64,
    /// Autocorrelation at lag 1
    pub autocorr_lag1: f64,
}

/// Seasonal decomposition result
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SeasonalDecomposition {
    /// Trend component
    pub trend: Vec<f64>,
    /// Seasonal component
    pub seasonal: Vec<f64>,
    /// Residual component
    pub residual: Vec<f64>,
    /// Period detected
    pub period: usize,
    /// Strength of seasonality (0-1)
    pub seasonal_strength: f64,
    /// Strength of trend (0-1)
    pub trend_strength: f64,
}

/// Time series processor
pub struct TimeSeriesProcessor {
    /// Configuration
    config: ProcessorConfig,
}

/// Processor configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProcessorConfig {
    /// Target embedding dimension
    pub embedding_dim: usize,
    /// Window size for rolling statistics
    pub window_size: usize,
    /// Enable seasonal decomposition
    pub decompose_seasonal: bool,
    /// Seasonal period (if known)
    pub seasonal_period: Option<usize>,
    /// Normalize embeddings
    pub normalize: bool,
}

impl Default for ProcessorConfig {
    fn default() -> Self {
        Self {
            embedding_dim: 128,
            window_size: 7,
            decompose_seasonal: true,
            seasonal_period: None,
            normalize: true,
        }
    }
}

impl TimeSeriesProcessor {
    /// Create a new processor
    pub fn new(config: ProcessorConfig) -> Self {
        Self { config }
    }

    /// Process observations into a time series vector
    pub fn process(&self, observations: &[ClimateObservation]) -> Option<TimeSeriesVector> {
        if observations.is_empty() {
            return None;
        }

        // Sort by time
        let mut sorted = observations.to_vec();
        sorted.sort_by_key(|o| o.timestamp);

        // Extract values and times
        let values: Vec<f64> = sorted.iter().map(|o| o.value).collect();
        let times: Vec<DateTime<Utc>> = sorted.iter().map(|o| o.timestamp).collect();

        let start_time = times.first().cloned()?;
        let end_time = times.last().cloned()?;
        let station_id = sorted.first()?.station_id.clone();

        // Compute resolution (mean spacing between consecutive observations)
        let resolution_secs = if times.len() >= 2 {
            let diffs: Vec<i64> = times
                .windows(2)
                .map(|w| (w[1] - w[0]).num_seconds())
                .collect();
            diffs.iter().sum::<i64>() / diffs.len() as i64
        } else {
            86400 // Default to daily
        };

        // Compute statistics
        let stats = self.compute_stats(&values);

        // Generate embedding
        let embedding = self.generate_embedding(&values, &stats);

        Some(TimeSeriesVector {
            id: format!("{}_{}", station_id, start_time.timestamp()),
            station_id,
            start_time,
            end_time,
            resolution_secs,
            embedding,
            stats,
            raw_values: Some(values),
        })
    }
    /// Compute statistical summary
    fn compute_stats(&self, values: &[f64]) -> SeriesStats {
        let n = values.len();
        if n == 0 {
            return SeriesStats {
                count: 0,
                mean: 0.0,
                std_dev: 0.0,
                min: 0.0,
                max: 0.0,
                trend: 0.0,
                variance_ratio: 1.0,
                autocorr_lag1: 0.0,
            };
        }

        let mean = values.iter().sum::<f64>() / n as f64;
        let variance = values.iter().map(|v| (v - mean).powi(2)).sum::<f64>() / n as f64;
        let std_dev = variance.sqrt();
        let min = values.iter().cloned().fold(f64::INFINITY, f64::min);
        let max = values.iter().cloned().fold(f64::NEG_INFINITY, f64::max);

        // Linear trend
        let trend = self.compute_trend(values);

        // Variance ratio between the two halves (for stationarity)
        let variance_ratio = if n > 10 {
            let mid = n / 2;
            let var1: f64 =
                values[..mid].iter().map(|v| (v - mean).powi(2)).sum::<f64>() / mid as f64;
            let var2: f64 =
                values[mid..].iter().map(|v| (v - mean).powi(2)).sum::<f64>() / (n - mid) as f64;
            if var1 > 0.0 {
                var2 / var1
            } else {
                1.0
            }
        } else {
            1.0
        };

        // Autocorrelation at lag 1
        let autocorr_lag1 = self.compute_autocorr(values, 1);

        SeriesStats {
            count: n,
            mean,
            std_dev,
            min,
            max,
            trend,
            variance_ratio,
            autocorr_lag1,
        }
    }

    /// Compute linear trend (least-squares slope over the sample index)
    fn compute_trend(&self, values: &[f64]) -> f64 {
        let n = values.len();
        if n < 2 {
            return 0.0;
        }

        let x_mean = (n - 1) as f64 / 2.0;
        let y_mean = values.iter().sum::<f64>() / n as f64;

        let mut num = 0.0;
        let mut denom = 0.0;
        for (i, &y) in values.iter().enumerate() {
            let x = i as f64;
            num += (x - x_mean) * (y - y_mean);
            denom += (x - x_mean).powi(2);
        }

        if denom > 0.0 {
            num / denom
        } else {
            0.0
        }
    }

    /// Compute autocorrelation at a given lag
    fn compute_autocorr(&self, values: &[f64], lag: usize) -> f64 {
        let n = values.len();
        if n <= lag {
            return 0.0;
        }

        let mean = values.iter().sum::<f64>() / n as f64;
        let variance: f64 = values.iter().map(|v| (v - mean).powi(2)).sum();
        if variance == 0.0 {
            return 0.0;
        }

        let mut cov = 0.0;
        for i in lag..n {
            cov += (values[i] - mean) * (values[i - lag] - mean);
        }
        cov / variance
    }

    /// Generate embedding vector for similarity search
    fn generate_embedding(&self, values: &[f64], stats: &SeriesStats) -> Vec<f32> {
        let mut embedding = Vec::with_capacity(self.config.embedding_dim);

        // Statistical features (first 16 dimensions)
        embedding.push(stats.mean as f32);
        embedding.push(stats.std_dev as f32);
        embedding.push(stats.min as f32);
        embedding.push(stats.max as f32);
        embedding.push(stats.trend as f32);
        embedding.push(stats.variance_ratio as f32);
        embedding.push(stats.autocorr_lag1 as f32);
        embedding.push((stats.max - stats.min) as f32); // Range

        // Quantile features
        let quantiles = self.compute_quantiles(values, &[0.1, 0.25, 0.5, 0.75, 0.9]);
        for q in quantiles {
            embedding.push(q as f32);
        }

        // Pad the statistical block to 16 dimensions
        while embedding.len() < 16 {
            embedding.push(0.0);
        }

        // Rolling window features (next 32 dimensions)
        if values.len() >= self.config.window_size {
            let rolling_means = self.rolling_mean(values, self.config.window_size);
            let rolling_stds = self.rolling_std(values, self.config.window_size);

            // Sample evenly from the rolling statistics
            let sample_count = 16;
            for i in 0..sample_count {
                let idx = i * rolling_means.len() / sample_count;
                if idx < rolling_means.len() {
                    embedding.push(rolling_means[idx] as f32);
                    embedding.push(rolling_stds[idx] as f32);
                }
            }
        }

        // Pad to target dimension
        while embedding.len() < self.config.embedding_dim {
            embedding.push(0.0);
        }

        // Truncate if needed
        embedding.truncate(self.config.embedding_dim);

        // Normalize to unit length
        if self.config.normalize {
            let norm: f32 = embedding.iter().map(|x| x * x).sum::<f32>().sqrt();
            if norm > 0.0 {
                for x in &mut embedding {
                    *x /= norm;
                }
            }
        }

        embedding
    }

    /// Compute quantiles (nearest-rank on the sorted values)
    fn compute_quantiles(&self, values: &[f64], quantiles: &[f64]) -> Vec<f64> {
        if values.is_empty() {
            return quantiles.iter().map(|_| 0.0).collect();
        }

        let mut sorted = values.to_vec();
        sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));

        quantiles
            .iter()
            .map(|q| {
                let idx = (q * (sorted.len() - 1) as f64).round() as usize;
                sorted[idx.min(sorted.len() - 1)]
            })
            .collect()
    }

    /// Rolling mean
    fn rolling_mean(&self, values: &[f64], window: usize) -> Vec<f64> {
        if values.len() < window {
            return vec![];
        }

        let mut result = Vec::with_capacity(values.len() - window + 1);
        let mut sum: f64 = values[..window].iter().sum();
        result.push(sum / window as f64);

        for i in window..values.len() {
            sum += values[i] - values[i - window];
            result.push(sum / window as f64);
        }
        result
    }

    /// Rolling standard deviation
    fn rolling_std(&self, values: &[f64], window: usize) -> Vec<f64> {
        if values.len() < window {
            return vec![];
        }

        let means = self.rolling_mean(values, window);
        means
            .iter()
            .enumerate()
            .map(|(i, &mean)| {
                let variance: f64 = values[i..i + window]
                    .iter()
                    .map(|v| (v - mean).powi(2))
                    .sum::<f64>()
                    / window as f64;
                variance.sqrt()
            })
            .collect()
    }

    /// Decompose a time series into trend, seasonal, and residual components
    pub fn decompose(&self, values: &[f64], period: usize) -> SeasonalDecomposition {
        let n = values.len();
        if n < period * 2 {
            return SeasonalDecomposition {
                trend: values.to_vec(),
                seasonal: vec![0.0; n],
                residual: vec![0.0; n],
                period,
                seasonal_strength: 0.0,
                trend_strength: 0.0,
            };
        }

        // Centered moving average for the trend (window of 2 * half_period + 1 points)
        let mut trend = vec![0.0; n];
        let half_period = period / 2;
        for i in half_period..(n - half_period) {
            let window = &values[(i - half_period)..=(i + half_period)];
            trend[i] = window.iter().sum::<f64>() / window.len() as f64;
        }

        // Fill edges with the nearest interior values
        for i in 0..half_period {
            trend[i] = trend[half_period];
        }
        for i in (n - half_period)..n {
            trend[i] = trend[n - half_period - 1];
        }

        // Detrended series
        let detrended: Vec<f64> = values.iter().zip(&trend).map(|(v, t)| v - t).collect();

        // Seasonal pattern: average the detrended values at each phase of the period
        let mut seasonal = vec![0.0; n];
        for i in 0..period {
            let indices: Vec<usize> = (i..n).step_by(period).collect();
            let seasonal_mean: f64 =
                indices.iter().map(|&j| detrended[j]).sum::<f64>() / indices.len() as f64;
            for &j in &indices {
                seasonal[j] = seasonal_mean;
            }
        }

        // Residual
        let residual: Vec<f64> = values
            .iter()
            .zip(&trend)
            .zip(&seasonal)
            .map(|((v, t), s)| v - t - s)
            .collect();

        // Strength measures: 1 - Var(residual) / Var(component-removed series), clamped at 0
        let residual_var: f64 = residual.iter().map(|r| r * r).sum::<f64>() / n as f64;
        let detrended_var: f64 = detrended.iter().map(|d| d * d).sum::<f64>() / n as f64;
        let deseasoned: Vec<f64> = values.iter().zip(&seasonal).map(|(v, s)| v - s).collect();
        let deseasoned_var: f64 = deseasoned.iter().map(|d| d * d).sum::<f64>() / n as f64;

        let seasonal_strength = if detrended_var > 0.0 {
            (1.0 - residual_var / detrended_var).max(0.0)
        } else {
            0.0
        };
        let trend_strength = if deseasoned_var > 0.0 {
            (1.0 - residual_var / deseasoned_var).max(0.0)
        } else {
            0.0
        };

        SeasonalDecomposition {
            trend,
            seasonal,
            residual,
            period,
            seasonal_strength,
            trend_strength,
        }
    }
}
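// Illustrative sketch (not part of the original module): how two embeddings produced by
// `TimeSeriesProcessor::process` might be compared for similarity search. Cosine similarity
// is assumed here; the metric actually used by RuVector may differ.

/// Cosine similarity between two embedding vectors (hypothetical helper for illustration).
///
/// Returns 0.0 if the lengths differ or either vector has zero norm.
pub fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() || a.is_empty() {
        return 0.0;
    }
    let dot: f32 = a.iter().zip(b).map(|(x, y)| x * y).sum();
    let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
    let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
    if norm_a > 0.0 && norm_b > 0.0 {
        dot / (norm_a * norm_b)
    } else {
        0.0
    }
}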
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_processor_creation() {
        let config = ProcessorConfig::default();
        let processor = TimeSeriesProcessor::new(config);
        assert_eq!(processor.config.embedding_dim, 128);
    }

    #[test]
    fn test_compute_stats() {
        let config = ProcessorConfig::default();
        let processor = TimeSeriesProcessor::new(config);
        let values = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let stats = processor.compute_stats(&values);
        assert_eq!(stats.count, 5);
        assert!((stats.mean - 3.0).abs() < 0.001);
        assert!((stats.min - 1.0).abs() < 0.001);
        assert!((stats.max - 5.0).abs() < 0.001);
    }
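    // Added checks (not in the original test suite): exercise the quantile, autocorrelation,
    // and embedding helpers directly. The expected values follow from the formulas in
    // compute_quantiles, compute_autocorr, and generate_embedding for the series 1..=5.
    #[test]
    fn test_quantiles_and_autocorrelation() {
        let processor = TimeSeriesProcessor::new(ProcessorConfig::default());
        let values = vec![1.0, 2.0, 3.0, 4.0, 5.0];

        // Nearest-rank quantiles of the sorted series
        let quantiles = processor.compute_quantiles(&values, &[0.25, 0.5, 0.75]);
        assert!((quantiles[0] - 2.0).abs() < 0.001);
        assert!((quantiles[1] - 3.0).abs() < 0.001);
        assert!((quantiles[2] - 4.0).abs() < 0.001);

        // Lag-1 autocorrelation: covariance 4 over variance sum 10
        let autocorr = processor.compute_autocorr(&values, 1);
        assert!((autocorr - 0.4).abs() < 0.001);
    }

    #[test]
    fn test_embedding_dimension_and_norm() {
        let processor = TimeSeriesProcessor::new(ProcessorConfig::default());
        let values = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let stats = processor.compute_stats(&values);
        let embedding = processor.generate_embedding(&values, &stats);

        // Padded/truncated to the configured dimension and normalized to unit length
        assert_eq!(embedding.len(), 128);
        let norm: f32 = embedding.iter().map(|x| x * x).sum::<f32>().sqrt();
        assert!((norm - 1.0).abs() < 0.001);
    }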
    #[test]
    fn test_trend_calculation() {
        let config = ProcessorConfig::default();
        let processor = TimeSeriesProcessor::new(config);
        let values = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let trend = processor.compute_trend(&values);
        assert!((trend - 1.0).abs() < 0.001); // Perfect linear trend
    }

    #[test]
    fn test_rolling_mean() {
        let config = ProcessorConfig::default();
        let processor = TimeSeriesProcessor::new(config);
        let values = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let rolling = processor.rolling_mean(&values, 3);
        assert_eq!(rolling.len(), 3);
        assert!((rolling[0] - 2.0).abs() < 0.001);
        assert!((rolling[1] - 3.0).abs() < 0.001);
        assert!((rolling[2] - 4.0).abs() < 0.001);
    }

    #[test]
    fn test_decomposition() {
        let config = ProcessorConfig::default();
        let processor = TimeSeriesProcessor::new(config);

        // Create synthetic data with trend and seasonality
        let n = 100;
        let period = 12;
        let mut values = Vec::with_capacity(n);
        for i in 0..n {
            let trend = 0.1 * i as f64;
            let seasonal = 5.0 * (2.0 * std::f64::consts::PI * i as f64 / period as f64).sin();
            values.push(trend + seasonal);
        }

        let decomp = processor.decompose(&values, period);
        assert_eq!(decomp.trend.len(), n);
        assert_eq!(decomp.seasonal.len(), n);
        assert_eq!(decomp.residual.len(), n);
        assert!(decomp.seasonal_strength > 0.5);
    }
}