// File: wifi-densepose/vendor/ruvector/examples/dna/src/kmer_pagerank.rs (Rust, 366 lines, 12 KiB)

//! K-mer Graph PageRank for DNA Sequence Ranking
//!
//! Builds a k-mer co-occurrence graph from DNA sequences and uses
//! ruvector-solver's Forward Push Personalized PageRank (PPR) to rank
//! sequences by structural centrality in the k-mer overlap network.
//!
//! This enables identifying the most "representative" sequences in a
//! collection — those whose k-mer profiles are most connected to others.
use ruvector_solver::forward_push::ForwardPushSolver;
use ruvector_solver::types::CsrMatrix;
/// Result of PageRank-based sequence ranking.
///
/// One value is produced per input sequence by
/// `KmerGraphRanker::rank_sequences`, which returns them sorted by
/// descending `score`.
#[derive(Debug, Clone)]
pub struct SequenceRank {
/// Index of the sequence in the input collection (its position in the
/// slice handed to the ranker, not its position in the sorted output).
pub index: usize,
/// PageRank score (higher = more central). `rank_sequences` rescales the
/// scores to sum to 1 whenever the accumulated total is non-negligible.
pub score: f64,
}
/// K-mer graph builder and PageRank ranker.
///
/// Models a collection of sequences as a weighted graph where:
/// - Nodes are sequences
/// - Edge weight(i, j) = cosine similarity between the hashed k-mer
///   frequency fingerprints of sequences i and j; pairs whose similarity
///   does not exceed the caller-supplied threshold get no edge
///
/// Forward Push PPR is then run on the column-normalized graph to compute
/// centrality scores.
#[derive(Debug, Clone)]
pub struct KmerGraphRanker {
    /// K-mer (window) length used when fingerprinting a sequence.
    k: usize,
    /// Number of hash buckets, i.e. the length of every fingerprint vector.
    hash_dimensions: usize,
}
impl KmerGraphRanker {
/// Create a new ranker with the given k-mer length.
///
/// # Arguments
/// * `k` - K-mer length (typical: 11-31)
/// * `hash_dimensions` - Number of hash buckets for k-mer fingerprints (default: 256)
pub fn new(k: usize, hash_dimensions: usize) -> Self {
Self { k, hash_dimensions }
}
/// Build a k-mer fingerprint vector for a DNA sequence.
///
/// Every window of `k` bytes is hashed twice with FNV-1a (as read, and as
/// its reverse complement); the smaller hash is the canonical value and
/// selects one of `hash_dimensions` buckets. Bucket counts are then divided
/// by the number of windows, so the entries of a non-degenerate fingerprint
/// sum to 1.
///
/// # Returns
/// A vector of length `hash_dimensions`. It is all zeros when no k-mer can
/// be extracted (`k == 0`, or `seq` shorter than `k`) and empty when
/// `hash_dimensions == 0`.
fn fingerprint(&self, seq: &[u8]) -> Vec<f64> {
    let buckets = self.hash_dimensions;

    // `windows(0)` panics and `% 0` is a division by zero, so degenerate
    // configurations are answered with the "no k-mers" result instead.
    if self.k == 0 || buckets == 0 || seq.len() < self.k {
        return vec![0.0; buckets];
    }

    // 64-bit counters so that very long sequences cannot overflow a bucket.
    let mut counts = vec![0u64; buckets];
    for window in seq.windows(self.k) {
        let canonical = Self::fnv1a_hash(window).min(Self::fnv1a_hash_rc(window));
        counts[canonical % buckets] += 1;
    }

    // Each window adds exactly one count, so the total is the window count,
    // which is at least 1 after the guard above.
    let window_count = seq.len() - self.k + 1;
    let inv = 1.0 / window_count as f64;
    counts.iter().map(|&c| c as f64 * inv).collect()
}
/// Cosine of the angle between two fingerprint vectors.
///
/// The dot product runs over the common prefix of `a` and `b`; each norm is
/// taken over the full slice. Returns `0.0` when either vector has a
/// (near-)zero norm, so empty or all-zero fingerprints never produce NaN.
fn cosine_similarity(a: &[f64], b: &[f64]) -> f64 {
    const TINY: f64 = 1e-15;

    let l2_norm = |v: &[f64]| v.iter().map(|c| c * c).sum::<f64>().sqrt();

    let inner: f64 = a.iter().zip(b).map(|(p, q)| p * q).sum();
    let len_a = l2_norm(a);
    let len_b = l2_norm(b);

    if len_a < TINY || len_b < TINY {
        0.0
    } else {
        inner / (len_a * len_b)
    }
}
/// Assemble the column-normalized similarity matrix for a set of sequences.
///
/// Each ordered pair `(i, j)` with `i != j` whose fingerprint cosine
/// similarity is strictly greater than `threshold` becomes an entry at row
/// `i`, column `j`. Every column is then divided by its sum so that it adds
/// up to 1. A column that received no weight gets a single self-loop of
/// weight 1 instead, so no node is left without an entry.
fn build_transition_matrix(&self, sequences: &[&[u8]], threshold: f64) -> CsrMatrix<f64> {
    let n = sequences.len();

    let mut profiles: Vec<Vec<f64>> = Vec::with_capacity(n);
    for seq in sequences {
        profiles.push(self.fingerprint(seq));
    }

    // Pass 1: collect edges above the threshold and the weight of each column.
    let mut column_mass = vec![0.0f64; n];
    let mut edges: Vec<(usize, usize, f64)> = Vec::new();
    for (row, row_profile) in profiles.iter().enumerate() {
        for (col, col_profile) in profiles.iter().enumerate() {
            if row != col {
                let weight = Self::cosine_similarity(row_profile, col_profile);
                if weight > threshold {
                    edges.push((row, col, weight));
                    column_mass[col] += weight;
                }
            }
        }
    }

    // Pass 2: scale every edge by its column total (guarding against a
    // vanishing total, which leaves the weight unscaled).
    let mut triplets: Vec<(usize, usize, f64)> = Vec::with_capacity(edges.len() + n);
    for (row, col, weight) in edges {
        let mass = column_mass[col];
        let divisor = if mass > 1e-15 { mass } else { 1.0 };
        triplets.push((row, col, weight / divisor));
    }

    // Pass 3: isolated (dangling) nodes keep all their mass on themselves.
    for (node, &mass) in column_mass.iter().enumerate() {
        if mass < 1e-15 {
            triplets.push((node, node, 1.0));
        }
    }

    CsrMatrix::<f64>::from_coo(n, n, triplets)
}
/// Rank sequences by PageRank centrality in the k-mer overlap graph.
///
/// Runs ruvector-solver's Forward Push Personalized PageRank from a set of
/// seed nodes and sums the per-seed scores into one global score per
/// sequence. At most 50 seeds are used; for larger collections they are
/// spread evenly over the index range.
///
/// # Arguments
/// * `sequences` - Collection of DNA sequences (as byte slices)
/// * `alpha` - Teleportation probability passed to the solver (typical: 0.15)
/// * `epsilon` - PPR approximation tolerance passed to the solver (typical: 1e-6)
/// * `similarity_threshold` - Minimum cosine similarity to create an edge (typical: 0.1)
///
/// # Returns
/// One [`SequenceRank`] per input sequence, sorted by descending score.
/// Scores are rescaled to sum to 1 unless the accumulated total is
/// negligible (for example, when the solver failed for every seed), in
/// which case they are returned unscaled. An empty input yields an empty
/// vector and a single sequence gets score 1.0 without running the solver.
pub fn rank_sequences(
    &self,
    sequences: &[&[u8]],
    alpha: f64,
    epsilon: f64,
    similarity_threshold: f64,
) -> Vec<SequenceRank> {
    // Upper bound on the number of PPR seed nodes.
    const MAX_SEEDS: usize = 50;

    let n = sequences.len();
    if n == 0 {
        return Vec::new();
    }
    if n == 1 {
        return vec![SequenceRank {
            index: 0,
            score: 1.0,
        }];
    }

    let matrix = self.build_transition_matrix(sequences, similarity_threshold);
    let solver = ForwardPushSolver::new(alpha, epsilon);
    let mut global_rank = vec![0.0f64; n];

    // Pick exactly `min(n, MAX_SEEDS)` distinct seeds, evenly spaced. A fixed
    // stride of `n / num_seeds` would exceed the cap whenever `n` is not a
    // multiple of it (e.g. n = 99 would use 99 seeds).
    let num_seeds = n.min(MAX_SEEDS);
    for seed_idx in (0..num_seeds).map(|s| s * n / num_seeds) {
        // Best effort: a seed for which the solver fails contributes nothing.
        if let Ok(ppr_result) = solver.ppr_from_source(&matrix, seed_idx) {
            for (node, score) in ppr_result {
                // Ignore any node index outside the collection.
                if let Some(slot) = global_rank.get_mut(node) {
                    *slot += score;
                }
            }
        }
    }

    // Rescale to a distribution when there is any mass to rescale.
    let total: f64 = global_rank.iter().sum();
    if total > 1e-15 {
        let inv = 1.0 / total;
        for score in &mut global_rank {
            *score *= inv;
        }
    }

    let mut results: Vec<SequenceRank> = global_rank
        .into_iter()
        .enumerate()
        .map(|(index, score)| SequenceRank { index, score })
        .collect();

    // Descending by score. `total_cmp` is a genuine total order even if a
    // score is NaN, which `sort_by` requires; the sort is stable, so ties
    // keep input order.
    results.sort_by(|a, b| b.score.total_cmp(&a.score));
    results
}
/// Personalized PageRank score of `target` as seen from `source`.
///
/// Builds the transition matrix for the whole collection, runs Forward Push
/// PPR seeded at `source`, and reports the score the solver assigned to
/// `target`.
///
/// Returns `0.0` when either index is out of range, when the solver
/// reports an error, or when `target` is absent from the solver's result.
pub fn pairwise_similarity(
    &self,
    sequences: &[&[u8]],
    source: usize,
    target: usize,
    alpha: f64,
    epsilon: f64,
    similarity_threshold: f64,
) -> f64 {
    let count = sequences.len();
    if source >= count || target >= count {
        return 0.0;
    }

    let transition = self.build_transition_matrix(sequences, similarity_threshold);
    let solver = ForwardPushSolver::new(alpha, epsilon);

    if let Ok(scores) = solver.ppr_from_source(&transition, source) {
        for (node, score) in scores {
            if node == target {
                return score;
            }
        }
    }
    0.0
}
/// FNV-1a (64-bit) hash of a k-mer as read on the forward strand.
///
/// Bases are normalized before hashing — ASCII upper-cased, with `U` folded
/// into `T` — so the forward hash uses the same alphabet as the
/// reverse-complement hash. Without this, lowercase (soft-masked) or RNA
/// input would hash differently on the two strands and the "minimum of both
/// strands" canonical value would no longer be strand-independent.
/// Upper-case `A`/`C`/`G`/`T` input hashes exactly as plain FNV-1a.
#[inline]
fn fnv1a_hash(data: &[u8]) -> usize {
    const FNV_OFFSET: u64 = 14695981039346656037;
    const FNV_PRIME: u64 = 1099511628211;

    let hash = data.iter().fold(FNV_OFFSET, |acc, &byte| {
        let base = match byte.to_ascii_uppercase() {
            b'U' => b'T',
            other => other,
        };
        (acc ^ u64::from(base)).wrapping_mul(FNV_PRIME)
    });

    // Deliberate narrowing: on targets where `usize` is smaller than 64 bits
    // the high bits are dropped.
    hash as usize
}
/// FNV-1a (64-bit) hash of the reverse complement of `data`.
///
/// The bytes are visited last-to-first and each one is upper-cased and
/// complemented (`A`<->`T`, `C`<->`G`, `U`->`A`) before being mixed in. Any
/// other byte is kept, in its upper-cased form.
#[inline]
fn fnv1a_hash_rc(data: &[u8]) -> usize {
    const FNV_OFFSET: u64 = 14695981039346656037;
    const FNV_PRIME: u64 = 1099511628211;

    // Complement of a single (case-insensitive) base.
    fn complement(base: u8) -> u8 {
        match base.to_ascii_uppercase() {
            b'A' => b'T',
            b'T' | b'U' => b'A',
            b'C' => b'G',
            b'G' => b'C',
            other => other,
        }
    }

    let digest = data
        .iter()
        .rev()
        .map(|&base| complement(base))
        .fold(FNV_OFFSET, |acc, comp| {
            (acc ^ u64::from(comp)).wrapping_mul(FNV_PRIME)
        });
    digest as usize
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Ranker shared by the tests: 3-mers hashed into 64 buckets.
    fn small_ranker() -> KmerGraphRanker {
        KmerGraphRanker::new(3, 64)
    }

    /// A fingerprint has one entry per bucket and sums to 1.
    #[test]
    fn test_fingerprint() {
        let fp = small_ranker().fingerprint(b"ATCGATCGATCG");
        assert_eq!(fp.len(), 64);

        let mass = fp.iter().sum::<f64>();
        assert!((mass - 1.0).abs() < 1e-10);
    }

    /// Identical vectors have cosine similarity 1.
    #[test]
    fn test_cosine_similarity_identical() {
        let v = [1.0, 2.0, 3.0];
        let sim = KmerGraphRanker::cosine_similarity(&v, &[1.0, 2.0, 3.0]);
        assert!((sim - 1.0).abs() < 1e-10);
    }

    /// Orthogonal vectors have cosine similarity 0.
    #[test]
    fn test_cosine_similarity_orthogonal() {
        let sim = KmerGraphRanker::cosine_similarity(&[1.0, 0.0], &[0.0, 1.0]);
        assert!(sim.abs() < 1e-10);
    }

    /// Three sequences yield three scores that sum to 1; the two identical
    /// sequences end up with roughly similar scores.
    #[test]
    fn test_rank_sequences_basic() {
        let sequences: [&[u8]; 3] = [
            b"ATCGATCGATCGATCG",
            b"ATCGATCGATCGATCG", // identical to the first
            b"GCTAGCTAGCTAGCTA", // different
        ];

        let ranks = small_ranker().rank_sequences(&sequences, 0.15, 1e-4, 0.01);
        assert_eq!(ranks.len(), 3);

        let total = ranks.iter().map(|r| r.score).sum::<f64>();
        assert!((total - 1.0).abs() < 1e-5);

        let score_of = |idx: usize| ranks.iter().find(|r| r.index == idx).unwrap().score;
        assert!((score_of(0) - score_of(1)).abs() < 0.3); // roughly similar
    }

    /// No sequences in, no ranks out.
    #[test]
    fn test_rank_empty() {
        let sequences: [&[u8]; 0] = [];
        let ranks = small_ranker().rank_sequences(&sequences, 0.15, 1e-4, 0.1);
        assert!(ranks.is_empty());
    }

    /// A lone sequence receives the entire score mass.
    #[test]
    fn test_rank_single() {
        let sequences: [&[u8]; 1] = [b"ATCGATCG"];
        let ranks = small_ranker().rank_sequences(&sequences, 0.15, 1e-4, 0.1);
        assert_eq!(ranks.len(), 1);
        assert!((ranks[0].score - 1.0).abs() < 1e-10);
    }

    /// An identical sequence scores at least as high as a very different one.
    #[test]
    fn test_pairwise_similarity() {
        let ranker = small_ranker();
        let sequences: [&[u8]; 3] = [
            b"ATCGATCGATCGATCG",
            b"ATCGATCGATCGATCG",
            b"NNNNNNNNNNNNNNNN", // very different
        ];

        let to_twin = ranker.pairwise_similarity(&sequences, 0, 1, 0.15, 1e-4, 0.01);
        let to_other = ranker.pairwise_similarity(&sequences, 0, 2, 0.15, 1e-4, 0.01);
        assert!(to_twin >= to_other);
    }
}