Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'
This commit is contained in:
365
vendor/ruvector/examples/dna/src/kmer_pagerank.rs
vendored
Normal file
365
vendor/ruvector/examples/dna/src/kmer_pagerank.rs
vendored
Normal file
@@ -0,0 +1,365 @@
|
||||
//! K-mer Graph PageRank for DNA Sequence Ranking
|
||||
//!
|
||||
//! Builds a k-mer co-occurrence graph from DNA sequences and uses
|
||||
//! ruvector-solver's Forward Push Personalized PageRank (PPR) to rank
|
||||
//! sequences by structural centrality in the k-mer overlap network.
|
||||
//!
|
||||
//! This enables identifying the most "representative" sequences in a
|
||||
//! collection — those whose k-mer profiles are most connected to others.
|
||||
|
||||
use ruvector_solver::forward_push::ForwardPushSolver;
|
||||
use ruvector_solver::types::CsrMatrix;
|
||||
|
||||
/// Result of PageRank-based sequence ranking
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct SequenceRank {
|
||||
/// Index of the sequence in the input collection
|
||||
pub index: usize,
|
||||
/// PageRank score (higher = more central)
|
||||
pub score: f64,
|
||||
}
|
||||
|
||||
/// K-mer graph builder and PageRank ranker.
|
||||
///
|
||||
/// Constructs a weighted graph where:
|
||||
/// - Nodes are sequences
|
||||
/// - Edge weight(i, j) = number of shared k-mers between sequences i and j
|
||||
///
|
||||
/// Then uses Forward Push PPR to compute centrality scores.
|
||||
pub struct KmerGraphRanker {
|
||||
k: usize,
|
||||
hash_dimensions: usize,
|
||||
}
|
||||
|
||||
impl KmerGraphRanker {
|
||||
/// Create a new ranker with the given k-mer length.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `k` - K-mer length (typical: 11-31)
|
||||
/// * `hash_dimensions` - Number of hash buckets for k-mer fingerprints (default: 256)
|
||||
pub fn new(k: usize, hash_dimensions: usize) -> Self {
|
||||
Self { k, hash_dimensions }
|
||||
}
|
||||
|
||||
/// Build a k-mer fingerprint vector for a DNA sequence.
|
||||
///
|
||||
/// Uses FNV-1a hashing with canonical k-mers (min of forward/reverse-complement)
|
||||
/// to produce a fixed-size frequency vector.
|
||||
fn fingerprint(&self, seq: &[u8]) -> Vec<f64> {
|
||||
if seq.len() < self.k {
|
||||
return vec![0.0; self.hash_dimensions];
|
||||
}
|
||||
|
||||
let mut counts = vec![0u32; self.hash_dimensions];
|
||||
|
||||
for window in seq.windows(self.k) {
|
||||
let fwd = Self::fnv1a_hash(window);
|
||||
let rc = Self::fnv1a_hash_rc(window);
|
||||
let canonical = fwd.min(rc);
|
||||
counts[canonical % self.hash_dimensions] += 1;
|
||||
}
|
||||
|
||||
// Normalize to probability distribution
|
||||
let total: u32 = counts.iter().sum();
|
||||
if total == 0 {
|
||||
return vec![0.0; self.hash_dimensions];
|
||||
}
|
||||
let inv = 1.0 / total as f64;
|
||||
counts.iter().map(|&c| c as f64 * inv).collect()
|
||||
}
|
||||
|
||||
/// Compute cosine similarity between two fingerprint vectors.
|
||||
fn cosine_similarity(a: &[f64], b: &[f64]) -> f64 {
|
||||
let dot: f64 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
|
||||
let norm_a: f64 = a.iter().map(|x| x * x).sum::<f64>().sqrt();
|
||||
let norm_b: f64 = b.iter().map(|x| x * x).sum::<f64>().sqrt();
|
||||
|
||||
if norm_a < 1e-15 || norm_b < 1e-15 {
|
||||
return 0.0;
|
||||
}
|
||||
dot / (norm_a * norm_b)
|
||||
}
|
||||
|
||||
/// Build the k-mer overlap graph as a column-stochastic transition matrix.
|
||||
///
|
||||
/// Edge weights are cosine similarities between k-mer fingerprints,
|
||||
/// normalized to form a stochastic matrix (columns sum to 1).
|
||||
fn build_transition_matrix(&self, sequences: &[&[u8]], threshold: f64) -> CsrMatrix<f64> {
|
||||
let n = sequences.len();
|
||||
let fingerprints: Vec<Vec<f64>> =
|
||||
sequences.iter().map(|seq| self.fingerprint(seq)).collect();
|
||||
|
||||
// Build weighted adjacency with thresholding
|
||||
let mut col_sums = vec![0.0f64; n];
|
||||
let mut entries: Vec<(usize, usize, f64)> = Vec::new();
|
||||
|
||||
for i in 0..n {
|
||||
for j in 0..n {
|
||||
if i == j {
|
||||
continue;
|
||||
}
|
||||
let sim = Self::cosine_similarity(&fingerprints[i], &fingerprints[j]);
|
||||
if sim > threshold {
|
||||
entries.push((i, j, sim));
|
||||
col_sums[j] += sim;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Normalize columns to make stochastic
|
||||
// Also add self-loops for isolated nodes
|
||||
let mut normalized: Vec<(usize, usize, f64)> = entries
|
||||
.into_iter()
|
||||
.map(|(i, j, w)| {
|
||||
let norm = if col_sums[j] > 1e-15 {
|
||||
col_sums[j]
|
||||
} else {
|
||||
1.0
|
||||
};
|
||||
(i, j, w / norm)
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Add self-loops for isolated nodes (dangling node handling)
|
||||
for j in 0..n {
|
||||
if col_sums[j] < 1e-15 {
|
||||
normalized.push((j, j, 1.0));
|
||||
}
|
||||
}
|
||||
|
||||
CsrMatrix::<f64>::from_coo(n, n, normalized)
|
||||
}
|
||||
|
||||
/// Rank sequences by PageRank centrality in the k-mer overlap graph.
|
||||
///
|
||||
/// Uses ruvector-solver's Forward Push algorithm for sublinear-time
|
||||
/// Personalized PageRank computation.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `sequences` - Collection of DNA sequences (as byte slices)
|
||||
/// * `alpha` - Teleportation probability (default: 0.15)
|
||||
/// * `epsilon` - PPR approximation tolerance (default: 1e-6)
|
||||
/// * `similarity_threshold` - Minimum cosine similarity to create an edge (default: 0.1)
|
||||
///
|
||||
/// # Returns
|
||||
/// Sequences ranked by descending PageRank score
|
||||
pub fn rank_sequences(
|
||||
&self,
|
||||
sequences: &[&[u8]],
|
||||
alpha: f64,
|
||||
epsilon: f64,
|
||||
similarity_threshold: f64,
|
||||
) -> Vec<SequenceRank> {
|
||||
let n = sequences.len();
|
||||
if n == 0 {
|
||||
return vec![];
|
||||
}
|
||||
if n == 1 {
|
||||
return vec![SequenceRank {
|
||||
index: 0,
|
||||
score: 1.0,
|
||||
}];
|
||||
}
|
||||
|
||||
let matrix = self.build_transition_matrix(sequences, similarity_threshold);
|
||||
|
||||
// Use Forward Push PPR from each node, accumulate global PageRank
|
||||
let solver = ForwardPushSolver::new(alpha, epsilon);
|
||||
let mut global_rank = vec![0.0f64; n];
|
||||
|
||||
// Compute PPR from each node (or a representative subset for large graphs)
|
||||
let num_seeds = n.min(50); // Limit seeds for large collections
|
||||
let step = if n > num_seeds { n / num_seeds } else { 1 };
|
||||
|
||||
for seed_idx in (0..n).step_by(step) {
|
||||
match solver.ppr_from_source(&matrix, seed_idx) {
|
||||
Ok(ppr_result) => {
|
||||
for (node, score) in ppr_result {
|
||||
if node < n {
|
||||
global_rank[node] += score;
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(_) => {
|
||||
// If PPR fails for this seed, skip it
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Normalize
|
||||
let total: f64 = global_rank.iter().sum();
|
||||
if total > 1e-15 {
|
||||
let inv = 1.0 / total;
|
||||
for score in &mut global_rank {
|
||||
*score *= inv;
|
||||
}
|
||||
}
|
||||
|
||||
// Build ranked results
|
||||
let mut results: Vec<SequenceRank> = global_rank
|
||||
.into_iter()
|
||||
.enumerate()
|
||||
.map(|(index, score)| SequenceRank { index, score })
|
||||
.collect();
|
||||
|
||||
// Sort by score descending
|
||||
results.sort_by(|a, b| {
|
||||
b.score
|
||||
.partial_cmp(&a.score)
|
||||
.unwrap_or(std::cmp::Ordering::Equal)
|
||||
});
|
||||
|
||||
results
|
||||
}
|
||||
|
||||
/// Compute pairwise PageRank similarity between two specific sequences
|
||||
/// within the context of a collection.
|
||||
///
|
||||
/// Uses Forward Push PPR from the source sequence and returns the
|
||||
/// PPR score at the target sequence.
|
||||
pub fn pairwise_similarity(
|
||||
&self,
|
||||
sequences: &[&[u8]],
|
||||
source: usize,
|
||||
target: usize,
|
||||
alpha: f64,
|
||||
epsilon: f64,
|
||||
similarity_threshold: f64,
|
||||
) -> f64 {
|
||||
if source >= sequences.len() || target >= sequences.len() {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
let matrix = self.build_transition_matrix(sequences, similarity_threshold);
|
||||
let solver = ForwardPushSolver::new(alpha, epsilon);
|
||||
|
||||
match solver.ppr_from_source(&matrix, source) {
|
||||
Ok(ppr_result) => ppr_result
|
||||
.into_iter()
|
||||
.find(|(node, _)| *node == target)
|
||||
.map(|(_, score)| score)
|
||||
.unwrap_or(0.0),
|
||||
Err(_) => 0.0,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn fnv1a_hash(data: &[u8]) -> usize {
|
||||
const FNV_OFFSET: u64 = 14695981039346656037;
|
||||
const FNV_PRIME: u64 = 1099511628211;
|
||||
let mut hash = FNV_OFFSET;
|
||||
for &byte in data {
|
||||
hash ^= byte as u64;
|
||||
hash = hash.wrapping_mul(FNV_PRIME);
|
||||
}
|
||||
hash as usize
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn fnv1a_hash_rc(data: &[u8]) -> usize {
|
||||
const FNV_OFFSET: u64 = 14695981039346656037;
|
||||
const FNV_PRIME: u64 = 1099511628211;
|
||||
let mut hash = FNV_OFFSET;
|
||||
for &byte in data.iter().rev() {
|
||||
let comp = match byte.to_ascii_uppercase() {
|
||||
b'A' => b'T',
|
||||
b'T' | b'U' => b'A',
|
||||
b'C' => b'G',
|
||||
b'G' => b'C',
|
||||
n => n,
|
||||
};
|
||||
hash ^= comp as u64;
|
||||
hash = hash.wrapping_mul(FNV_PRIME);
|
||||
}
|
||||
hash as usize
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_fingerprint() {
|
||||
let ranker = KmerGraphRanker::new(3, 64);
|
||||
let seq = b"ATCGATCGATCG";
|
||||
let fp = ranker.fingerprint(seq);
|
||||
assert_eq!(fp.len(), 64);
|
||||
|
||||
// Should be a probability distribution (sums to ~1)
|
||||
let sum: f64 = fp.iter().sum();
|
||||
assert!((sum - 1.0).abs() < 1e-10);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_cosine_similarity_identical() {
|
||||
let a = vec![1.0, 2.0, 3.0];
|
||||
let b = vec![1.0, 2.0, 3.0];
|
||||
let sim = KmerGraphRanker::cosine_similarity(&a, &b);
|
||||
assert!((sim - 1.0).abs() < 1e-10);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_cosine_similarity_orthogonal() {
|
||||
let a = vec![1.0, 0.0];
|
||||
let b = vec![0.0, 1.0];
|
||||
let sim = KmerGraphRanker::cosine_similarity(&a, &b);
|
||||
assert!(sim.abs() < 1e-10);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_rank_sequences_basic() {
|
||||
let ranker = KmerGraphRanker::new(3, 64);
|
||||
let seq1 = b"ATCGATCGATCGATCG";
|
||||
let seq2 = b"ATCGATCGATCGATCG"; // identical to seq1
|
||||
let seq3 = b"GCTAGCTAGCTAGCTA"; // different
|
||||
|
||||
let sequences: Vec<&[u8]> = vec![seq1, seq2, seq3];
|
||||
let ranks = ranker.rank_sequences(&sequences, 0.15, 1e-4, 0.01);
|
||||
|
||||
assert_eq!(ranks.len(), 3);
|
||||
|
||||
// All ranks should sum to 1
|
||||
let total: f64 = ranks.iter().map(|r| r.score).sum();
|
||||
assert!((total - 1.0).abs() < 1e-5);
|
||||
|
||||
// Identical sequences should have similar ranks
|
||||
let rank_0 = ranks.iter().find(|r| r.index == 0).unwrap().score;
|
||||
let rank_1 = ranks.iter().find(|r| r.index == 1).unwrap().score;
|
||||
assert!((rank_0 - rank_1).abs() < 0.3); // roughly similar
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_rank_empty() {
|
||||
let ranker = KmerGraphRanker::new(3, 64);
|
||||
let sequences: Vec<&[u8]> = vec![];
|
||||
let ranks = ranker.rank_sequences(&sequences, 0.15, 1e-4, 0.1);
|
||||
assert!(ranks.is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_rank_single() {
|
||||
let ranker = KmerGraphRanker::new(3, 64);
|
||||
let sequences: Vec<&[u8]> = vec![b"ATCGATCG"];
|
||||
let ranks = ranker.rank_sequences(&sequences, 0.15, 1e-4, 0.1);
|
||||
assert_eq!(ranks.len(), 1);
|
||||
assert!((ranks[0].score - 1.0).abs() < 1e-10);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_pairwise_similarity() {
|
||||
let ranker = KmerGraphRanker::new(3, 64);
|
||||
let seq1 = b"ATCGATCGATCGATCG";
|
||||
let seq2 = b"ATCGATCGATCGATCG";
|
||||
let seq3 = b"NNNNNNNNNNNNNNNN"; // very different
|
||||
|
||||
let sequences: Vec<&[u8]> = vec![seq1, seq2, seq3];
|
||||
|
||||
let sim_01 = ranker.pairwise_similarity(&sequences, 0, 1, 0.15, 1e-4, 0.01);
|
||||
let sim_02 = ranker.pairwise_similarity(&sequences, 0, 2, 0.15, 1e-4, 0.01);
|
||||
|
||||
// Identical sequences should have higher similarity
|
||||
assert!(sim_01 >= sim_02);
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user