//! K-mer Graph PageRank for DNA Sequence Ranking //! //! Builds a k-mer co-occurrence graph from DNA sequences and uses //! ruvector-solver's Forward Push Personalized PageRank (PPR) to rank //! sequences by structural centrality in the k-mer overlap network. //! //! This enables identifying the most "representative" sequences in a //! collection — those whose k-mer profiles are most connected to others. use ruvector_solver::forward_push::ForwardPushSolver; use ruvector_solver::types::CsrMatrix; /// Result of PageRank-based sequence ranking #[derive(Debug, Clone)] pub struct SequenceRank { /// Index of the sequence in the input collection pub index: usize, /// PageRank score (higher = more central) pub score: f64, } /// K-mer graph builder and PageRank ranker. /// /// Constructs a weighted graph where: /// - Nodes are sequences /// - Edge weight(i, j) = number of shared k-mers between sequences i and j /// /// Then uses Forward Push PPR to compute centrality scores. pub struct KmerGraphRanker { k: usize, hash_dimensions: usize, } impl KmerGraphRanker { /// Create a new ranker with the given k-mer length. /// /// # Arguments /// * `k` - K-mer length (typical: 11-31) /// * `hash_dimensions` - Number of hash buckets for k-mer fingerprints (default: 256) pub fn new(k: usize, hash_dimensions: usize) -> Self { Self { k, hash_dimensions } } /// Build a k-mer fingerprint vector for a DNA sequence. /// /// Uses FNV-1a hashing with canonical k-mers (min of forward/reverse-complement) /// to produce a fixed-size frequency vector. fn fingerprint(&self, seq: &[u8]) -> Vec { if seq.len() < self.k { return vec![0.0; self.hash_dimensions]; } let mut counts = vec![0u32; self.hash_dimensions]; for window in seq.windows(self.k) { let fwd = Self::fnv1a_hash(window); let rc = Self::fnv1a_hash_rc(window); let canonical = fwd.min(rc); counts[canonical % self.hash_dimensions] += 1; } // Normalize to probability distribution let total: u32 = counts.iter().sum(); if total == 0 { return vec![0.0; self.hash_dimensions]; } let inv = 1.0 / total as f64; counts.iter().map(|&c| c as f64 * inv).collect() } /// Compute cosine similarity between two fingerprint vectors. fn cosine_similarity(a: &[f64], b: &[f64]) -> f64 { let dot: f64 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum(); let norm_a: f64 = a.iter().map(|x| x * x).sum::().sqrt(); let norm_b: f64 = b.iter().map(|x| x * x).sum::().sqrt(); if norm_a < 1e-15 || norm_b < 1e-15 { return 0.0; } dot / (norm_a * norm_b) } /// Build the k-mer overlap graph as a column-stochastic transition matrix. /// /// Edge weights are cosine similarities between k-mer fingerprints, /// normalized to form a stochastic matrix (columns sum to 1). fn build_transition_matrix(&self, sequences: &[&[u8]], threshold: f64) -> CsrMatrix { let n = sequences.len(); let fingerprints: Vec> = sequences.iter().map(|seq| self.fingerprint(seq)).collect(); // Build weighted adjacency with thresholding let mut col_sums = vec![0.0f64; n]; let mut entries: Vec<(usize, usize, f64)> = Vec::new(); for i in 0..n { for j in 0..n { if i == j { continue; } let sim = Self::cosine_similarity(&fingerprints[i], &fingerprints[j]); if sim > threshold { entries.push((i, j, sim)); col_sums[j] += sim; } } } // Normalize columns to make stochastic // Also add self-loops for isolated nodes let mut normalized: Vec<(usize, usize, f64)> = entries .into_iter() .map(|(i, j, w)| { let norm = if col_sums[j] > 1e-15 { col_sums[j] } else { 1.0 }; (i, j, w / norm) }) .collect(); // Add self-loops for isolated nodes (dangling node handling) for j in 0..n { if col_sums[j] < 1e-15 { normalized.push((j, j, 1.0)); } } CsrMatrix::::from_coo(n, n, normalized) } /// Rank sequences by PageRank centrality in the k-mer overlap graph. /// /// Uses ruvector-solver's Forward Push algorithm for sublinear-time /// Personalized PageRank computation. /// /// # Arguments /// * `sequences` - Collection of DNA sequences (as byte slices) /// * `alpha` - Teleportation probability (default: 0.15) /// * `epsilon` - PPR approximation tolerance (default: 1e-6) /// * `similarity_threshold` - Minimum cosine similarity to create an edge (default: 0.1) /// /// # Returns /// Sequences ranked by descending PageRank score pub fn rank_sequences( &self, sequences: &[&[u8]], alpha: f64, epsilon: f64, similarity_threshold: f64, ) -> Vec { let n = sequences.len(); if n == 0 { return vec![]; } if n == 1 { return vec![SequenceRank { index: 0, score: 1.0, }]; } let matrix = self.build_transition_matrix(sequences, similarity_threshold); // Use Forward Push PPR from each node, accumulate global PageRank let solver = ForwardPushSolver::new(alpha, epsilon); let mut global_rank = vec![0.0f64; n]; // Compute PPR from each node (or a representative subset for large graphs) let num_seeds = n.min(50); // Limit seeds for large collections let step = if n > num_seeds { n / num_seeds } else { 1 }; for seed_idx in (0..n).step_by(step) { match solver.ppr_from_source(&matrix, seed_idx) { Ok(ppr_result) => { for (node, score) in ppr_result { if node < n { global_rank[node] += score; } } } Err(_) => { // If PPR fails for this seed, skip it continue; } } } // Normalize let total: f64 = global_rank.iter().sum(); if total > 1e-15 { let inv = 1.0 / total; for score in &mut global_rank { *score *= inv; } } // Build ranked results let mut results: Vec = global_rank .into_iter() .enumerate() .map(|(index, score)| SequenceRank { index, score }) .collect(); // Sort by score descending results.sort_by(|a, b| { b.score .partial_cmp(&a.score) .unwrap_or(std::cmp::Ordering::Equal) }); results } /// Compute pairwise PageRank similarity between two specific sequences /// within the context of a collection. /// /// Uses Forward Push PPR from the source sequence and returns the /// PPR score at the target sequence. pub fn pairwise_similarity( &self, sequences: &[&[u8]], source: usize, target: usize, alpha: f64, epsilon: f64, similarity_threshold: f64, ) -> f64 { if source >= sequences.len() || target >= sequences.len() { return 0.0; } let matrix = self.build_transition_matrix(sequences, similarity_threshold); let solver = ForwardPushSolver::new(alpha, epsilon); match solver.ppr_from_source(&matrix, source) { Ok(ppr_result) => ppr_result .into_iter() .find(|(node, _)| *node == target) .map(|(_, score)| score) .unwrap_or(0.0), Err(_) => 0.0, } } #[inline] fn fnv1a_hash(data: &[u8]) -> usize { const FNV_OFFSET: u64 = 14695981039346656037; const FNV_PRIME: u64 = 1099511628211; let mut hash = FNV_OFFSET; for &byte in data { hash ^= byte as u64; hash = hash.wrapping_mul(FNV_PRIME); } hash as usize } #[inline] fn fnv1a_hash_rc(data: &[u8]) -> usize { const FNV_OFFSET: u64 = 14695981039346656037; const FNV_PRIME: u64 = 1099511628211; let mut hash = FNV_OFFSET; for &byte in data.iter().rev() { let comp = match byte.to_ascii_uppercase() { b'A' => b'T', b'T' | b'U' => b'A', b'C' => b'G', b'G' => b'C', n => n, }; hash ^= comp as u64; hash = hash.wrapping_mul(FNV_PRIME); } hash as usize } } #[cfg(test)] mod tests { use super::*; #[test] fn test_fingerprint() { let ranker = KmerGraphRanker::new(3, 64); let seq = b"ATCGATCGATCG"; let fp = ranker.fingerprint(seq); assert_eq!(fp.len(), 64); // Should be a probability distribution (sums to ~1) let sum: f64 = fp.iter().sum(); assert!((sum - 1.0).abs() < 1e-10); } #[test] fn test_cosine_similarity_identical() { let a = vec![1.0, 2.0, 3.0]; let b = vec![1.0, 2.0, 3.0]; let sim = KmerGraphRanker::cosine_similarity(&a, &b); assert!((sim - 1.0).abs() < 1e-10); } #[test] fn test_cosine_similarity_orthogonal() { let a = vec![1.0, 0.0]; let b = vec![0.0, 1.0]; let sim = KmerGraphRanker::cosine_similarity(&a, &b); assert!(sim.abs() < 1e-10); } #[test] fn test_rank_sequences_basic() { let ranker = KmerGraphRanker::new(3, 64); let seq1 = b"ATCGATCGATCGATCG"; let seq2 = b"ATCGATCGATCGATCG"; // identical to seq1 let seq3 = b"GCTAGCTAGCTAGCTA"; // different let sequences: Vec<&[u8]> = vec![seq1, seq2, seq3]; let ranks = ranker.rank_sequences(&sequences, 0.15, 1e-4, 0.01); assert_eq!(ranks.len(), 3); // All ranks should sum to 1 let total: f64 = ranks.iter().map(|r| r.score).sum(); assert!((total - 1.0).abs() < 1e-5); // Identical sequences should have similar ranks let rank_0 = ranks.iter().find(|r| r.index == 0).unwrap().score; let rank_1 = ranks.iter().find(|r| r.index == 1).unwrap().score; assert!((rank_0 - rank_1).abs() < 0.3); // roughly similar } #[test] fn test_rank_empty() { let ranker = KmerGraphRanker::new(3, 64); let sequences: Vec<&[u8]> = vec![]; let ranks = ranker.rank_sequences(&sequences, 0.15, 1e-4, 0.1); assert!(ranks.is_empty()); } #[test] fn test_rank_single() { let ranker = KmerGraphRanker::new(3, 64); let sequences: Vec<&[u8]> = vec![b"ATCGATCG"]; let ranks = ranker.rank_sequences(&sequences, 0.15, 1e-4, 0.1); assert_eq!(ranks.len(), 1); assert!((ranks[0].score - 1.0).abs() < 1e-10); } #[test] fn test_pairwise_similarity() { let ranker = KmerGraphRanker::new(3, 64); let seq1 = b"ATCGATCGATCGATCG"; let seq2 = b"ATCGATCGATCGATCG"; let seq3 = b"NNNNNNNNNNNNNNNN"; // very different let sequences: Vec<&[u8]> = vec![seq1, seq2, seq3]; let sim_01 = ranker.pairwise_similarity(&sequences, 0, 1, 0.15, 1e-4, 0.01); let sim_02 = ranker.pairwise_similarity(&sequences, 0, 2, 0.15, 1e-4, 0.01); // Identical sequences should have higher similarity assert!(sim_01 >= sim_02); } }