Files
wifi-densepose/vendor/ruvector/examples/dna/tests/kmer_tests.rs

404 lines
12 KiB
Rust

//! Integration tests for k-mer indexing module
//!
//! These tests use real VectorDB instances to validate k-mer encoding,
//! indexing, and similarity search functionality.
use ::rvdna::kmer::{canonical_kmer, KmerEncoder, KmerIndex, MinHashSketch};
use tempfile::TempDir;
/// Helper to create a test directory that will be automatically cleaned up
fn create_test_db() -> TempDir {
TempDir::new().expect("Failed to create temp directory")
}
#[test]
fn test_kmer_encoding_basic() {
let encoder = KmerEncoder::new(4).expect("Failed to create encoder");
let sequence = b"ACGTACGT";
let vector = encoder
.encode_sequence(sequence)
.expect("Failed to encode sequence");
// Verify vector has correct dimensions
assert_eq!(
vector.len(),
encoder.dimensions(),
"Vector dimensions should match encoder dimensions"
);
// Verify L2 normalization
let magnitude: f32 = vector.iter().map(|x| x * x).sum::<f32>().sqrt();
assert!(
(magnitude - 1.0).abs() < 1e-5,
"Vector should be L2 normalized, got magnitude: {}",
magnitude
);
// Verify non-zero elements exist (sequence has k-mers)
let non_zero_count = vector.iter().filter(|&&x| x != 0.0).count();
assert!(non_zero_count > 0, "Vector should have non-zero elements");
}
#[test]
fn test_kmer_encoding_deterministic() {
let encoder = KmerEncoder::new(11).expect("Failed to create encoder");
let sequence = b"ACGTACGTACGTACGTACGT";
let vector1 = encoder
.encode_sequence(sequence)
.expect("Failed to encode sequence first time");
let vector2 = encoder
.encode_sequence(sequence)
.expect("Failed to encode sequence second time");
// Verify same sequence produces identical vectors
assert_eq!(
vector1.len(),
vector2.len(),
"Vectors should have same length"
);
for (i, (&v1, &v2)) in vector1.iter().zip(vector2.iter()).enumerate() {
assert!(
(v1 - v2).abs() < 1e-6,
"Vector element {} should be identical: {} vs {}",
i,
v1,
v2
);
}
}
#[test]
fn test_kmer_complement_symmetry() {
let kmer1 = b"ACGT";
let kmer2 = b"ACGT"; // reverse complement is ACGT (palindrome)
let canon1 = canonical_kmer(kmer1);
let canon2 = canonical_kmer(kmer2);
assert_eq!(canon1, canon2, "Canonical k-mers should be equal");
// Test with non-palindrome
let kmer3 = b"AAAA";
let kmer4 = b"TTTT"; // reverse complement of AAAA
let canon3 = canonical_kmer(kmer3);
let canon4 = canonical_kmer(kmer4);
assert_eq!(
canon3, canon4,
"Canonical k-mer should be same for sequence and revcomp"
);
}
#[test]
fn test_kmer_index_insert_and_search() {
let _temp_dir = create_test_db();
// Create index with k=11
let encoder = KmerEncoder::new(11).expect("Failed to create encoder");
let index = KmerIndex::new(11, encoder.dimensions()).expect("Failed to create index");
// Insert 3 sequences
let seq1 = b"ACGTACGTACGTACGTACGT";
let seq2 = b"ACGTACGTACGTACGTACGG"; // Similar to seq1
let seq3 = b"TTTTTTTTTTTTTTTTTTTT"; // Very different
index
.index_sequence("seq1", seq1)
.expect("Failed to index seq1");
index
.index_sequence("seq2", seq2)
.expect("Failed to index seq2");
index
.index_sequence("seq3", seq3)
.expect("Failed to index seq3");
// Search for similar sequences to seq1
let results = index.search_similar(seq1, 3).expect("Failed to search");
assert!(results.len() > 0, "Should find at least one result");
// First result should be seq1 itself (exact match)
assert_eq!(results[0].id, "seq1", "First result should be exact match");
assert!(
results[0].distance < 0.01,
"Exact match should have very low distance: {}",
results[0].distance
);
// seq2 should be closer than seq3
let seq2_idx = results.iter().position(|r| r.id == "seq2");
let seq3_idx = results.iter().position(|r| r.id == "seq3");
if let (Some(idx2), Some(idx3)) = (seq2_idx, seq3_idx) {
assert!(
idx2 < idx3,
"Similar sequence should rank higher than different sequence"
);
}
}
#[test]
fn test_kmer_index_batch_insert() {
let _temp_dir = create_test_db();
let encoder = KmerEncoder::new(11).expect("Failed to create encoder");
let index = KmerIndex::new(11, encoder.dimensions()).expect("Failed to create index");
// Generate 100 random sequences
let mut sequences = Vec::new();
for i in 0..100 {
let seq = generate_random_sequence(50, i as u64);
sequences.push((format!("seq_{}", i), seq));
}
// Convert to reference slices for batch insert
let batch: Vec<(&str, &[u8])> = sequences
.iter()
.map(|(id, seq)| (id.as_str(), seq.as_slice()))
.collect();
// Batch insert
index
.index_batch(batch)
.expect("Failed to batch insert sequences");
// Verify we can search and get results
let query = b"ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT";
let results = index.search_similar(query, 10).expect("Failed to search");
assert!(results.len() > 0, "Should find results after batch insert");
}
#[test]
fn test_kmer_similar_sequences_score_higher() {
let _temp_dir = create_test_db();
let encoder = KmerEncoder::new(11).expect("Failed to create encoder");
let index = KmerIndex::new(11, encoder.dimensions()).expect("Failed to create index");
// Create two similar sequences (90% identical)
let base_seq = b"ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT"; // 40 bases
let similar_seq = b"ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGG"; // 1 base different
let random_seq = generate_random_sequence(40, 12345);
index
.index_sequence("base", base_seq)
.expect("Failed to index base");
index
.index_sequence("similar", similar_seq)
.expect("Failed to index similar");
index
.index_sequence("random", &random_seq)
.expect("Failed to index random");
// Search with base sequence
let results = index
.search_similar(base_seq, 10)
.expect("Failed to search");
assert!(results.len() > 0, "Should find at least one result");
// Find positions in results
let base_pos = results.iter().position(|r| r.id == "base");
let similar_pos = results.iter().position(|r| r.id == "similar");
// Base and similar should definitely be in top results
assert!(
base_pos.is_some(),
"Base sequence (exact match) should be found in results"
);
assert!(
similar_pos.is_some(),
"Similar sequence should be found in results"
);
// Base should be first (exact match has distance 0)
assert_eq!(
base_pos.unwrap(),
0,
"Base sequence should be the top result (exact match)"
);
// Similar sequence should be in top 3
assert!(
similar_pos.unwrap() < 3,
"Similar sequence should rank in top 3, was at position {}",
similar_pos.unwrap()
);
}
#[test]
fn test_kmer_different_k_values() {
// Test k=11
let encoder11 = KmerEncoder::new(11).expect("Failed to create k=11 encoder");
let seq = b"ACGTACGTACGTACGTACGTACGTACGT";
let vec11 = encoder11
.encode_sequence(seq)
.expect("Failed to encode with k=11");
assert_eq!(vec11.len(), encoder11.dimensions());
// Test k=21
let encoder21 = KmerEncoder::new(21).expect("Failed to create k=21 encoder");
let seq_long = b"ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT";
let vec21 = encoder21
.encode_sequence(seq_long)
.expect("Failed to encode with k=21");
assert_eq!(vec21.len(), encoder21.dimensions());
// Test k=31
let encoder31 = KmerEncoder::new(31).expect("Failed to create k=31 encoder");
let seq_longer = b"ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT";
let vec31 = encoder31
.encode_sequence(seq_longer)
.expect("Failed to encode with k=31");
assert_eq!(vec31.len(), encoder31.dimensions());
// All should be normalized
for (vec, k) in &[(vec11, 11), (vec21, 21), (vec31, 31)] {
let magnitude: f32 = vec.iter().map(|x| x * x).sum::<f32>().sqrt();
assert!(
(magnitude - 1.0).abs() < 1e-5,
"k={} vector should be normalized",
k
);
}
}
#[test]
fn test_minhash_sketch_basic() {
let num_hashes = 100;
let mut sketch = MinHashSketch::new(num_hashes);
let sequence = b"ACGTACGTACGTACGTACGTACGTACGTACGT";
let hashes = sketch
.sketch(sequence, 11)
.expect("Failed to sketch sequence");
assert!(
hashes.len() <= num_hashes,
"Sketch should have at most {} hashes, got {}",
num_hashes,
hashes.len()
);
assert!(hashes.len() > 0, "Sketch should have at least one hash");
// Verify hashes are sorted (implementation detail)
for i in 1..hashes.len() {
assert!(hashes[i] >= hashes[i - 1], "Hashes should be sorted");
}
}
#[test]
fn test_minhash_jaccard_identical() {
let mut sketch1 = MinHashSketch::new(100);
let mut sketch2 = MinHashSketch::new(100);
let sequence = b"ACGTACGTACGTACGTACGTACGTACGTACGT";
sketch1
.sketch(sequence, 11)
.expect("Failed to sketch sequence 1");
sketch2
.sketch(sequence, 11)
.expect("Failed to sketch sequence 2");
let distance = sketch1.jaccard_distance(&sketch2);
assert!(
distance < 0.01,
"Identical sequences should have distance close to 0, got {}",
distance
);
}
#[test]
fn test_minhash_jaccard_different() {
let mut sketch1 = MinHashSketch::new(100);
let mut sketch2 = MinHashSketch::new(100);
let seq1 = b"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA";
let seq2 = b"CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC";
sketch1
.sketch(seq1, 11)
.expect("Failed to sketch sequence 1");
sketch2
.sketch(seq2, 11)
.expect("Failed to sketch sequence 2");
let distance = sketch1.jaccard_distance(&sketch2);
assert!(
distance > 0.9,
"Very different sequences should have distance close to 1, got {}",
distance
);
}
#[test]
fn test_kmer_index_empty_sequence() {
let encoder = KmerEncoder::new(11).expect("Failed to create encoder");
// Test empty sequence
let empty_seq = b"";
let result = encoder.encode_sequence(empty_seq);
assert!(result.is_err(), "Empty sequence should return error");
// Test sequence shorter than k
let short_seq = b"ACGT"; // k=11 but only 4 bases
let result = encoder.encode_sequence(short_seq);
assert!(
result.is_err(),
"Sequence shorter than k should return error"
);
}
#[test]
fn test_kmer_index_with_n_bases() {
let encoder = KmerEncoder::new(11).expect("Failed to create encoder");
// Sequence with N (unknown) bases
let seq_with_n = b"ACGTACGTNNNACGTACGT";
// Should still encode (N bases are handled in canonical_kmer)
let result = encoder.encode_sequence(seq_with_n);
assert!(
result.is_ok(),
"Sequence with N bases should encode successfully"
);
let vector = result.unwrap();
assert_eq!(
vector.len(),
encoder.dimensions(),
"Vector should have correct dimensions"
);
}
// Helper function to generate random DNA sequences
fn generate_random_sequence(length: usize, seed: u64) -> Vec<u8> {
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
let bases = [b'A', b'C', b'G', b'T'];
let mut sequence = Vec::with_capacity(length);
for i in 0..length {
let mut hasher = DefaultHasher::new();
seed.hash(&mut hasher);
i.hash(&mut hasher);
let hash = hasher.finish();
let base_idx = (hash % 4) as usize;
sequence.push(bases[base_idx]);
}
sequence
}