Squashed 'vendor/ruvector/' content from commit b64c2172

git-subtree-dir: vendor/ruvector git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
2026-02-28 14:39:40 -05:00
commit d803bfe2b1
7854 changed files with 3522914 additions and 0 deletions
--- a/crates/ruvector-math/src/spectral/clustering.rs
+++ b/crates/ruvector-math/src/spectral/clustering.rs
@@ -0,0 +1,441 @@
+//! Spectral Clustering
+//!
+//! Graph partitioning using spectral methods.
+//! Efficient approximation via Chebyshev polynomials.
+
+use super::ScaledLaplacian;
+
+/// Spectral clustering configuration
+#[derive(Debug, Clone)]
+pub struct ClusteringConfig {
+    /// Number of clusters
+    pub k: usize,
+    /// Number of eigenvectors to use
+    pub num_eigenvectors: usize,
+    /// Power iteration steps for eigenvector approximation
+    pub power_iters: usize,
+    /// K-means iterations
+    pub kmeans_iters: usize,
+    /// Random seed
+    pub seed: u64,
+}
+
+impl Default for ClusteringConfig {
+    fn default() -> Self {
+        Self {
+            k: 2,
+            num_eigenvectors: 10,
+            power_iters: 50,
+            kmeans_iters: 20,
+            seed: 42,
+        }
+    }
+}
+
+/// Spectral clustering result
+#[derive(Debug, Clone)]
+pub struct ClusteringResult {
+    /// Cluster assignment for each vertex
+    pub assignments: Vec<usize>,
+    /// Eigenvector embedding (n × k)
+    pub embedding: Vec<Vec<f64>>,
+    /// Number of clusters
+    pub k: usize,
+}
+
+impl ClusteringResult {
+    /// Get vertices in cluster c
+    pub fn cluster(&self, c: usize) -> Vec<usize> {
+        self.assignments
+            .iter()
+            .enumerate()
+            .filter(|(_, &a)| a == c)
+            .map(|(i, _)| i)
+            .collect()
+    }
+
+    /// Cluster sizes
+    pub fn cluster_sizes(&self) -> Vec<usize> {
+        let mut sizes = vec![0; self.k];
+        for &a in &self.assignments {
+            if a < self.k {
+                sizes[a] += 1;
+            }
+        }
+        sizes
+    }
+}
+
+/// Spectral clustering
+#[derive(Debug, Clone)]
+pub struct SpectralClustering {
+    /// Configuration
+    config: ClusteringConfig,
+}
+
+impl SpectralClustering {
+    /// Create with configuration
+    pub fn new(config: ClusteringConfig) -> Self {
+        Self { config }
+    }
+
+    /// Create with just number of clusters
+    pub fn with_k(k: usize) -> Self {
+        Self::new(ClusteringConfig {
+            k,
+            num_eigenvectors: k,
+            ..Default::default()
+        })
+    }
+
+    /// Cluster graph using normalized Laplacian eigenvectors
+    pub fn cluster(&self, laplacian: &ScaledLaplacian) -> ClusteringResult {
+        let n = laplacian.n;
+        let k = self.config.k.min(n);
+        let num_eig = self.config.num_eigenvectors.min(n);
+
+        // Compute approximate eigenvectors of Laplacian
+        // We want the k smallest eigenvalues (smoothest eigenvectors)
+        // Use inverse power method on shifted Laplacian
+        let embedding = self.compute_embedding(laplacian, num_eig);
+
+        // Run k-means on embedding
+        let assignments = self.kmeans(&embedding, k);
+
+        ClusteringResult {
+            assignments,
+            embedding,
+            k,
+        }
+    }
+
+    /// Cluster using Fiedler vector (k=2)
+    pub fn bipartition(&self, laplacian: &ScaledLaplacian) -> ClusteringResult {
+        let n = laplacian.n;
+
+        // Compute Fiedler vector (second smallest eigenvector)
+        let fiedler = self.compute_fiedler(laplacian);
+
+        // Partition by sign
+        let assignments: Vec<usize> = fiedler
+            .iter()
+            .map(|&v| if v >= 0.0 { 0 } else { 1 })
+            .collect();
+
+        ClusteringResult {
+            assignments,
+            embedding: vec![fiedler],
+            k: 2,
+        }
+    }
+
+    /// Compute spectral embedding (k smallest non-trivial eigenvectors)
+    fn compute_embedding(&self, laplacian: &ScaledLaplacian, k: usize) -> Vec<Vec<f64>> {
+        let n = laplacian.n;
+        if k == 0 || n == 0 {
+            return vec![];
+        }
+
+        // Initialize random vectors
+        let mut vectors: Vec<Vec<f64>> = (0..k)
+            .map(|i| {
+                (0..n)
+                    .map(|j| {
+                        let x = ((j * 2654435769 + i * 1103515245 + self.config.seed as usize)
+                            as f64
+                            / 4294967296.0)
+                            * 2.0
+                            - 1.0;
+                        x
+                    })
+                    .collect()
+            })
+            .collect();
+
+        // Power iteration to find smallest eigenvectors
+        // We use (I - L_scaled) which has largest eigenvalue where L_scaled has smallest
+        for _ in 0..self.config.power_iters {
+            for i in 0..k {
+                // Apply (I - L_scaled) = (2I - L)/λ_max approximately
+                // Simpler: just use deflated power iteration on L for smallest
+                let mut y = vec![0.0; n];
+                let lx = laplacian.apply(&vectors[i]);
+
+                // We want small eigenvalues, so use (λ_max*I - L)
+                let shift = 2.0; // Approximate max eigenvalue of scaled Laplacian
+                for j in 0..n {
+                    y[j] = shift * vectors[i][j] - lx[j];
+                }
+
+                // Orthogonalize against previous vectors and constant vector
+                // First, remove constant component (eigenvalue 0)
+                let mean: f64 = y.iter().sum::<f64>() / n as f64;
+                for j in 0..n {
+                    y[j] -= mean;
+                }
+
+                // Then orthogonalize against previous eigenvectors
+                for prev in 0..i {
+                    let dot: f64 = y.iter().zip(vectors[prev].iter()).map(|(a, b)| a * b).sum();
+                    for j in 0..n {
+                        y[j] -= dot * vectors[prev][j];
+                    }
+                }
+
+                // Normalize
+                let norm: f64 = y.iter().map(|x| x * x).sum::<f64>().sqrt();
+                if norm > 1e-15 {
+                    for j in 0..n {
+                        y[j] /= norm;
+                    }
+                }
+
+                vectors[i] = y;
+            }
+        }
+
+        vectors
+    }
+
+    /// Compute Fiedler vector (second smallest eigenvector)
+    fn compute_fiedler(&self, laplacian: &ScaledLaplacian) -> Vec<f64> {
+        let embedding = self.compute_embedding(laplacian, 1);
+        if embedding.is_empty() {
+            return vec![0.0; laplacian.n];
+        }
+        embedding[0].clone()
+    }
+
+    /// K-means clustering on embedding
+    fn kmeans(&self, embedding: &[Vec<f64>], k: usize) -> Vec<usize> {
+        if embedding.is_empty() {
+            return vec![];
+        }
+
+        let n = embedding[0].len();
+        let dim = embedding.len();
+
+        if n == 0 || k == 0 {
+            return vec![];
+        }
+
+        // Initialize centroids (k-means++ style)
+        let mut centroids: Vec<Vec<f64>> = Vec::with_capacity(k);
+
+        // First centroid: random point
+        let first = (self.config.seed as usize) % n;
+        centroids.push((0..dim).map(|d| embedding[d][first]).collect());
+
+        // Remaining centroids: proportional to squared distance
+        for _ in 1..k {
+            let mut distances: Vec<f64> = (0..n)
+                .map(|i| {
+                    centroids
+                        .iter()
+                        .map(|c| {
+                            (0..dim)
+                                .map(|d| (embedding[d][i] - c[d]).powi(2))
+                                .sum::<f64>()
+                        })
+                        .fold(f64::INFINITY, f64::min)
+                })
+                .collect();
+
+            let total: f64 = distances.iter().sum();
+            if total > 0.0 {
+                let threshold = (self.config.seed as f64 / 4294967296.0) * total;
+                let mut cumsum = 0.0;
+                let mut chosen = 0;
+                for (i, &d) in distances.iter().enumerate() {
+                    cumsum += d;
+                    if cumsum >= threshold {
+                        chosen = i;
+                        break;
+                    }
+                }
+                centroids.push((0..dim).map(|d| embedding[d][chosen]).collect());
+            } else {
+                // Degenerate case
+                centroids.push(vec![0.0; dim]);
+            }
+        }
+
+        // K-means iterations
+        let mut assignments = vec![0; n];
+
+        for _ in 0..self.config.kmeans_iters {
+            // Assign points to nearest centroid
+            for i in 0..n {
+                let mut best_cluster = 0;
+                let mut best_dist = f64::INFINITY;
+
+                for (c, centroid) in centroids.iter().enumerate() {
+                    let dist: f64 = (0..dim)
+                        .map(|d| (embedding[d][i] - centroid[d]).powi(2))
+                        .sum();
+
+                    if dist < best_dist {
+                        best_dist = dist;
+                        best_cluster = c;
+                    }
+                }
+
+                assignments[i] = best_cluster;
+            }
+
+            // Update centroids
+            let mut counts = vec![0usize; k];
+            for centroid in centroids.iter_mut() {
+                for v in centroid.iter_mut() {
+                    *v = 0.0;
+                }
+            }
+
+            for (i, &c) in assignments.iter().enumerate() {
+                counts[c] += 1;
+                for d in 0..dim {
+                    centroids[c][d] += embedding[d][i];
+                }
+            }
+
+            for (c, centroid) in centroids.iter_mut().enumerate() {
+                if counts[c] > 0 {
+                    for v in centroid.iter_mut() {
+                        *v /= counts[c] as f64;
+                    }
+                }
+            }
+        }
+
+        assignments
+    }
+
+    /// Compute normalized cut value for a bipartition
+    pub fn normalized_cut(&self, laplacian: &ScaledLaplacian, partition: &[bool]) -> f64 {
+        let n = laplacian.n;
+        if n == 0 {
+            return 0.0;
+        }
+
+        // Compute cut and volumes
+        let mut cut = 0.0;
+        let mut vol_a = 0.0;
+        let mut vol_b = 0.0;
+
+        // For each entry in Laplacian
+        for &(i, j, v) in &laplacian.entries {
+            if i < n && j < n && i != j {
+                // This is an edge (negative Laplacian entry)
+                let w = -v; // Edge weight
+                if w > 0.0 && partition[i] != partition[j] {
+                    cut += w;
+                }
+            }
+            if i == j && i < n {
+                // Diagonal = degree
+                if partition[i] {
+                    vol_a += v;
+                } else {
+                    vol_b += v;
+                }
+            }
+        }
+
+        // NCut = cut/vol(A) + cut/vol(B)
+        let ncut = if vol_a > 0.0 { cut / vol_a } else { 0.0 }
+            + if vol_b > 0.0 { cut / vol_b } else { 0.0 };
+
+        ncut
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn two_cliques_graph() -> ScaledLaplacian {
+        // Two cliques of size 3 connected by one edge
+        let edges = vec![
+            // Clique 1
+            (0, 1, 1.0),
+            (0, 2, 1.0),
+            (1, 2, 1.0),
+            // Clique 2
+            (3, 4, 1.0),
+            (3, 5, 1.0),
+            (4, 5, 1.0),
+            // Bridge
+            (2, 3, 0.1),
+        ];
+        ScaledLaplacian::from_sparse_adjacency(&edges, 6)
+    }
+
+    #[test]
+    fn test_spectral_clustering() {
+        let laplacian = two_cliques_graph();
+        let clustering = SpectralClustering::with_k(2);
+
+        let result = clustering.cluster(&laplacian);
+
+        assert_eq!(result.assignments.len(), 6);
+        assert_eq!(result.k, 2);
+
+        // Should roughly separate the two cliques
+        let sizes = result.cluster_sizes();
+        assert_eq!(sizes.iter().sum::<usize>(), 6);
+    }
+
+    #[test]
+    fn test_bipartition() {
+        let laplacian = two_cliques_graph();
+        let clustering = SpectralClustering::with_k(2);
+
+        let result = clustering.bipartition(&laplacian);
+
+        assert_eq!(result.assignments.len(), 6);
+        assert_eq!(result.k, 2);
+    }
+
+    #[test]
+    fn test_cluster_extraction() {
+        let laplacian = two_cliques_graph();
+        let clustering = SpectralClustering::with_k(2);
+        let result = clustering.cluster(&laplacian);
+
+        let c0 = result.cluster(0);
+        let c1 = result.cluster(1);
+
+        // All vertices assigned
+        assert_eq!(c0.len() + c1.len(), 6);
+    }
+
+    #[test]
+    fn test_normalized_cut() {
+        let laplacian = two_cliques_graph();
+        let clustering = SpectralClustering::with_k(2);
+
+        // Good partition: separate cliques
+        let good_partition = vec![true, true, true, false, false, false];
+        let good_ncut = clustering.normalized_cut(&laplacian, &good_partition);
+
+        // Bad partition: mix cliques
+        let bad_partition = vec![true, false, true, false, true, false];
+        let bad_ncut = clustering.normalized_cut(&laplacian, &bad_partition);
+
+        // Good partition should have lower normalized cut
+        // (This is a heuristic test, actual values depend on graph structure)
+        assert!(good_ncut >= 0.0);
+        assert!(bad_ncut >= 0.0);
+    }
+
+    #[test]
+    fn test_single_node() {
+        let laplacian = ScaledLaplacian::from_sparse_adjacency(&[], 1);
+        let clustering = SpectralClustering::with_k(1);
+
+        let result = clustering.cluster(&laplacian);
+
+        assert_eq!(result.assignments.len(), 1);
+        assert_eq!(result.assignments[0], 0);
+    }
+}