Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'

This commit is contained in:
ruv
2026-02-28 14:39:40 -05:00
7854 changed files with 3522914 additions and 0 deletions

View File

@@ -0,0 +1,409 @@
//! Integration tests for the biomarker analysis engine.
//!
//! Tests composite risk scoring, profile vector encoding, clinical biomarker
//! references, synthetic population generation, and streaming biomarker
//! processing with anomaly and trend detection.
use rvdna::biomarker::*;
use rvdna::biomarker_stream::*;
use std::collections::HashMap;
// ============================================================================
// COMPOSITE RISK SCORING TESTS
// ============================================================================
/// A carrier of the reference (low-risk) genotype at every listed locus must
/// get a global risk score below 0.3 and at least one category score.
#[test]
fn test_compute_risk_scores_baseline() {
    // (rsID, genotype) pairs; every call is the homozygous reference genotype.
    let reference_calls = [
        ("rs429358", "TT"),   // APOE ref
        ("rs7412", "CC"),     // APOE ref
        ("rs4680", "GG"),     // COMT ref
        ("rs1799971", "AA"),  // OPRM1 ref
        ("rs762551", "AA"),   // CYP1A2 fast
        ("rs1801133", "GG"),  // MTHFR ref
        ("rs1801131", "TT"),  // MTHFR ref
        ("rs1042522", "CC"),  // TP53 ref
        ("rs80357906", "DD"), // BRCA1 ref
        ("rs4363657", "TT"),  // SLCO1B1 ref
    ];
    let gts: HashMap<String, String> = reference_calls
        .iter()
        .map(|&(rsid, genotype)| (rsid.to_string(), genotype.to_string()))
        .collect();

    let profile = compute_risk_scores(&gts);
    assert!(
        profile.global_risk_score < 0.3,
        "Baseline should be low risk, got {}",
        profile.global_risk_score
    );
    assert!(!profile.category_scores.is_empty());
}
/// A combination of high-risk genotypes must push the global risk score
/// above 0.4.
#[test]
fn test_compute_risk_scores_high_risk() {
    // (rsID, genotype) pairs describing the high-risk carrier.
    let risk_calls = [
        ("rs429358", "CC"),  // APOE e4/e4
        ("rs7412", "CC"),
        ("rs4680", "AA"),    // COMT Met/Met
        ("rs1799971", "GG"), // OPRM1 Asp/Asp
        ("rs1801133", "AA"), // MTHFR 677TT
        ("rs1801131", "GG"), // MTHFR 1298CC
        ("rs4363657", "CC"), // SLCO1B1 hom variant
    ];
    let gts: HashMap<String, String> = risk_calls
        .iter()
        .map(|&(rsid, genotype)| (rsid.to_string(), genotype.to_string()))
        .collect();

    let profile = compute_risk_scores(&gts);
    assert!(
        profile.global_risk_score > 0.4,
        "High-risk should score >0.4, got {}",
        profile.global_risk_score
    );
}
// ============================================================================
// PROFILE VECTOR TESTS
// ============================================================================
/// Even with no genotypes supplied, the profile vector must have 64 entries.
#[test]
fn test_profile_vector_dimension() {
    let empty_genotypes = HashMap::new();
    let dimensions = compute_risk_scores(&empty_genotypes).profile_vector.len();
    assert_eq!(
        dimensions, 64,
        "Profile vector must be exactly 64 dimensions"
    );
}
/// The profile vector must either have unit L2 norm (within 0.01) or be the
/// all-zero vector.
#[test]
fn test_profile_vector_normalized() {
    let gts: HashMap<String, String> = [("rs429358", "CT"), ("rs4680", "AG")]
        .iter()
        .map(|&(rsid, genotype)| (rsid.to_string(), genotype.to_string()))
        .collect();
    let profile = compute_risk_scores(&gts);

    let sum_of_squares: f32 = profile.profile_vector.iter().map(|x| x * x).sum();
    let mag = sum_of_squares.sqrt();

    let is_unit_length = (mag - 1.0).abs() < 0.01;
    let is_zero_vector = mag == 0.0;
    assert!(
        is_unit_length || is_zero_vector,
        "Vector should be L2-normalized, got magnitude {}",
        mag
    );
}
// ============================================================================
// BIOMARKER REFERENCE TESTS
// ============================================================================
/// The built-in reference table must list at least 13 biomarkers.
#[test]
fn test_biomarker_references_exist() {
    let reference_count = biomarker_references().len();
    assert!(
        reference_count >= 13,
        "Should have at least 13 biomarker references, got {}",
        reference_count
    );
}
/// Checks `z_score` against the "Total Cholesterol" reference: 180 must give
/// |z| < 2 and 300 must give a positive z.
#[test]
fn test_z_score_computation() {
    let refs = biomarker_references();
    let cholesterol_ref = refs
        .iter()
        .find(|entry| entry.name == "Total Cholesterol")
        .unwrap();

    // A typical value stays within two units of the reference centre.
    let z_normal = z_score(180.0, cholesterol_ref);
    let is_small = z_normal.abs() < 2.0;
    assert!(
        is_small,
        "Normal cholesterol z-score should be small: {}",
        z_normal
    );

    // A clearly elevated value lands on the positive side.
    let z_high = z_score(300.0, cholesterol_ref);
    let is_positive = z_high > 0.0;
    assert!(
        is_positive,
        "High cholesterol should have positive z-score: {}",
        z_high
    );
}
/// Fasting glucose values of 85 and 200 must fall into different classes
/// (compared through their `Debug` rendering).
#[test]
fn test_biomarker_classification() {
    let refs = biomarker_references();
    let glucose_ref = refs
        .iter()
        .find(|entry| entry.name == "Fasting Glucose")
        .unwrap();

    // 85 is expected in the normal range, 200 in a high/critical one.
    let normal_label = format!("{:?}", classify_biomarker(85.0, glucose_ref));
    let high_label = format!("{:?}", classify_biomarker(200.0, glucose_ref));
    assert_ne!(normal_label, high_label);
}
// ============================================================================
// SYNTHETIC POPULATION TESTS
// ============================================================================
/// A synthetic population of 100 (seed 42) must contain 100 profiles with
/// 64-dimensional vectors and risk scores spread over more than 0.1.
#[test]
fn test_synthetic_population() {
    let pop = generate_synthetic_population(100, 42);
    assert_eq!(pop.len(), 100);

    // Every member carries a 64-dimensional profile vector.
    pop.iter()
        .for_each(|profile| assert_eq!(profile.profile_vector.len(), 64));

    // Track the lowest and highest global risk score in one pass.
    let (min, max) = pop.iter().map(|p| p.global_risk_score).fold(
        (f64::INFINITY, f64::NEG_INFINITY),
        |(lowest, highest), score| (lowest.min(score), highest.max(score)),
    );
    assert!(
        max - min > 0.1,
        "Population should have risk score variance, range: {:.3}..{:.3}",
        min,
        max
    );
}
/// Two populations generated with the same size and seed must have matching
/// global risk scores, member by member.
#[test]
fn test_synthetic_population_deterministic() {
    let first_run = generate_synthetic_population(50, 42);
    let second_run = generate_synthetic_population(50, 42);
    assert_eq!(first_run.len(), second_run.len());

    first_run.iter().zip(second_run.iter()).for_each(|(a, b)| {
        assert!((a.global_risk_score - b.global_risk_score).abs() < 1e-10);
    });
}
// ============================================================================
// STREAMING TESTS
// ============================================================================
/// Pushing fewer items than the capacity keeps all of them in insertion order.
#[test]
fn test_ring_buffer_basic() {
    let mut rb: RingBuffer<f64> = RingBuffer::new(5);
    for value in [0.0, 1.0, 2.0] {
        rb.push(value);
    }
    assert_eq!(rb.len(), 3);

    let items: Vec<f64> = rb.iter().copied().collect();
    assert_eq!(items, [0.0, 1.0, 2.0]);
}
/// Pushing five items into a capacity-3 buffer keeps only the last three.
#[test]
fn test_ring_buffer_overflow() {
    let mut rb: RingBuffer<f64> = RingBuffer::new(3);
    for value in [0.0, 1.0, 2.0, 3.0, 4.0] {
        rb.push(value);
    }
    assert_eq!(rb.len(), 3);

    let items: Vec<f64> = rb.iter().copied().collect();
    assert_eq!(items, [2.0, 3.0, 4.0]);
}
/// `generate_readings` must emit `count * num_biomarkers` readings, all with
/// strictly positive values.
#[test]
fn test_stream_generation() {
    let config = StreamConfig::default();
    let readings = generate_readings(&config, 1000, 42);

    // One reading per biomarker for each of the 1000 requested steps.
    let expected_len = 1000 * config.num_biomarkers;
    assert_eq!(readings.len(), expected_len);

    for r in &readings {
        let is_positive = r.value > 0.0;
        assert!(
            is_positive,
            "Biomarker values should be positive: {} = {}",
            r.biomarker_id,
            r.value
        );
    }
}
/// Feeding 500 generated steps through the processor must account for every
/// reading and report an anomaly rate below 0.2.
#[test]
fn test_stream_processor() {
    let config = StreamConfig::default();
    // Read the count now: `config` is moved into the processor below.
    let expected_total = 500 * config.num_biomarkers as u64;
    let readings = generate_readings(&config, 500, 42);

    let mut processor = StreamProcessor::new(config);
    for reading in &readings {
        processor.process_reading(reading);
    }

    let summary = processor.summary();
    assert_eq!(summary.total_readings, expected_total);
    assert!(
        summary.anomaly_rate < 0.2,
        "Anomaly rate should be reasonable: {}",
        summary.anomaly_rate
    );
}
/// With anomaly injection disabled, a single-biomarker stream must report an
/// anomaly rate below 0.1.
#[test]
fn test_anomaly_detection() {
    let quiet_config = StreamConfig {
        num_biomarkers: 1,
        anomaly_probability: 0.0, // No random anomalies
        ..StreamConfig::default()
    };
    let readings = generate_readings(&quiet_config, 200, 42);

    let mut processor = StreamProcessor::new(quiet_config);
    for reading in &readings {
        processor.process_reading(reading);
    }

    let summary = processor.summary();
    let rate_is_low = summary.anomaly_rate < 0.1;
    assert!(
        rate_is_low,
        "Without injection, anomaly rate should be low: {}",
        summary.anomaly_rate
    );
}
// ============================================================================
// GENE-GENE INTERACTION TESTS
// ============================================================================
/// Adding MTHFR A1298C (hom alt) to COMT Met/Met must raise the
/// "Neurological" category score above COMT Met/Met alone.
#[test]
fn test_mthfr_comt_interaction() {
    let to_genotypes = |calls: &[(&str, &str)]| -> HashMap<String, String> {
        calls
            .iter()
            .map(|&(rsid, genotype)| (rsid.to_string(), genotype.to_string()))
            .collect()
    };

    let gts_both = to_genotypes(&[
        ("rs1801131", "GG"), // A1298C hom_alt
        ("rs4680", "AA"),    // COMT Met/Met
    ]);
    let both = compute_risk_scores(&gts_both);

    let gts_one = to_genotypes(&[("rs4680", "AA")]); // COMT Met/Met only
    let one = compute_risk_scores(&gts_one);

    let n_both = both.category_scores.get("Neurological").unwrap().score;
    let n_one = one.category_scores.get("Neurological").unwrap().score;
    assert!(
        n_both > n_one,
        "MTHFR×COMT interaction should amplify: {n_both} > {n_one}"
    );
}
/// Adding COMT Met/Met to DRD2 Taq1A (hom alt) must raise the "Neurological"
/// category score above DRD2 alone.
#[test]
fn test_drd2_comt_interaction() {
    let to_genotypes = |calls: &[(&str, &str)]| -> HashMap<String, String> {
        calls
            .iter()
            .map(|&(rsid, genotype)| (rsid.to_string(), genotype.to_string()))
            .collect()
    };

    let gts = to_genotypes(&[
        ("rs1800497", "AA"), // DRD2 hom_alt
        ("rs4680", "AA"),    // COMT Met/Met
    ]);
    let with = compute_risk_scores(&gts);

    let gts2 = to_genotypes(&[("rs1800497", "AA")]); // DRD2 only
    let without = compute_risk_scores(&gts2);

    let n_with = with.category_scores.get("Neurological").unwrap().score;
    let n_without = without.category_scores.get("Neurological").unwrap().score;
    assert!(
        n_with > n_without,
        "DRD2×COMT interaction should amplify: {n_with} > {n_without}"
    );
}
// ============================================================================
// GENE-BIOMARKER CORRELATION TESTS
// ============================================================================
/// In a synthetic population, members with a "Neurological" score above 0.3
/// (used here as a proxy for APOE carriers) must have a lower mean HDL than
/// everyone else.
///
/// The comparison is skipped when either group is empty, so the test passes
/// vacuously in that case.
#[test]
fn test_apoe_lowers_hdl_in_population() {
    let pop = generate_synthetic_population(300, 88);

    let mut apoe_hdl: Vec<f64> = Vec::new();
    let mut ref_hdl: Vec<f64> = Vec::new();
    for p in &pop {
        // A missing HDL value is counted as 0.0.
        let hdl = p.biomarker_values.get("HDL").copied().unwrap_or(0.0);
        // A missing category counts as not elevated.
        let elevated_neuro = p
            .category_scores
            .get("Neurological")
            .map_or(false, |c| c.score > 0.3);
        let bucket = if elevated_neuro {
            &mut apoe_hdl
        } else {
            &mut ref_hdl
        };
        bucket.push(hdl);
    }

    if apoe_hdl.is_empty() || ref_hdl.is_empty() {
        return;
    }

    let mean = |values: &[f64]| values.iter().sum::<f64>() / values.len() as f64;
    let avg_apoe = mean(&apoe_hdl);
    let avg_ref = mean(&ref_hdl);
    assert!(
        avg_apoe < avg_ref,
        "APOE e4 should lower HDL: {avg_apoe} < {avg_ref}"
    );
}
/// Feeds 30 baseline glucose readings (85.0) followed by 20 shifted readings
/// (120.0) and checks that the tracked mean ends above 90.
///
/// Only the mean is asserted: the changepoint flag resets after it triggers,
/// so its final state under a sustained shift may or may not be true.
#[test]
fn test_cusum_changepoint_detection() {
    let mut p = StreamProcessor::new(StreamConfig {
        window_size: 20,
        ..Default::default()
    });

    for i in 0..50 {
        // Ticks 0..30 establish the baseline; ticks 30..50 are the sustained shift.
        let value = if i < 30 { 85.0 } else { 120.0 };
        p.process_reading(&BiomarkerReading {
            timestamp_ms: i * 1000,
            biomarker_id: "glucose".into(),
            value,
            reference_low: 70.0,
            reference_high: 100.0,
            is_anomaly: false,
            z_score: 0.0,
        });
    }

    let stats = p.get_stats("glucose").unwrap();
    assert!(
        stats.mean > 90.0,
        "Mean should shift upward after changepoint: {}",
        stats.mean
    );
}
/// With a drift rate of 0.5 and no anomalies, every biomarker in the summary
/// must report a positive trend slope.
#[test]
fn test_trend_detection() {
    let drifting = StreamConfig {
        window_size: 50,
        num_biomarkers: 1,
        anomaly_probability: 0.0,
        drift_rate: 0.5, // Strong upward drift
        ..StreamConfig::default()
    };
    let readings = generate_readings(&drifting, 200, 42);

    let mut processor = StreamProcessor::new(drifting);
    for reading in &readings {
        processor.process_reading(reading);
    }

    let summary = processor.summary();
    for (_, stats) in &summary.biomarker_stats {
        let slope_is_positive = stats.trend_slope > 0.0;
        assert!(
            slope_is_positive,
            "Should detect upward trend, got slope: {}",
            stats.trend_slope
        );
    }
}

View File

@@ -0,0 +1,403 @@
//! Integration tests for k-mer indexing module
//!
//! These tests use real VectorDB instances to validate k-mer encoding,
//! indexing, and similarity search functionality.
use ::rvdna::kmer::{canonical_kmer, KmerEncoder, KmerIndex, MinHashSketch};
use tempfile::TempDir;
/// Creates a scratch directory for a test.
///
/// The directory is meant to be cleaned up automatically when the returned
/// handle goes out of scope. Panics if it cannot be created.
fn create_test_db() -> TempDir {
    let created = TempDir::new();
    created.expect("Failed to create temp directory")
}
/// Encoding a short sequence with k=4 must give a vector of the encoder's
/// dimensionality that is L2-normalized and not all zeros.
#[test]
fn test_kmer_encoding_basic() {
    let encoder = KmerEncoder::new(4).expect("Failed to create encoder");
    let vector = encoder
        .encode_sequence(b"ACGTACGT")
        .expect("Failed to encode sequence");

    // Dimensionality matches what the encoder advertises.
    assert_eq!(
        vector.len(),
        encoder.dimensions(),
        "Vector dimensions should match encoder dimensions"
    );

    // Unit L2 norm.
    let squared_norm = vector.iter().map(|x| x * x).sum::<f32>();
    let magnitude = squared_norm.sqrt();
    assert!(
        (magnitude - 1.0).abs() < 1e-5,
        "Vector should be L2 normalized, got magnitude: {}",
        magnitude
    );

    // At least one component is non-zero.
    let has_signal = vector.iter().any(|&x| x != 0.0);
    assert!(has_signal, "Vector should have non-zero elements");
}
/// Encoding the same sequence twice must give element-wise matching vectors.
#[test]
fn test_kmer_encoding_deterministic() {
    let encoder = KmerEncoder::new(11).expect("Failed to create encoder");
    let sequence = b"ACGTACGTACGTACGTACGT";

    let first = encoder
        .encode_sequence(sequence)
        .expect("Failed to encode sequence first time");
    let second = encoder
        .encode_sequence(sequence)
        .expect("Failed to encode sequence second time");

    assert_eq!(first.len(), second.len(), "Vectors should have same length");

    let paired = first.iter().zip(second.iter());
    for (i, (&v1, &v2)) in paired.enumerate() {
        let delta = (v1 - v2).abs();
        assert!(
            delta < 1e-6,
            "Vector element {} should be identical: {} vs {}",
            i,
            v1,
            v2
        );
    }
}
/// `canonical_kmer` must give the same result for a k-mer and its reverse
/// complement.
#[test]
fn test_kmer_complement_symmetry() {
    // ACGT is its own reverse complement, so both calls see the same input.
    let palindrome = b"ACGT";
    assert_eq!(
        canonical_kmer(palindrome),
        canonical_kmer(palindrome),
        "Canonical k-mers should be equal"
    );

    // Non-palindromic pair: TTTT is the reverse complement of AAAA.
    let forward = canonical_kmer(b"AAAA");
    let reverse = canonical_kmer(b"TTTT");
    assert_eq!(
        forward, reverse,
        "Canonical k-mer should be same for sequence and revcomp"
    );
}
/// Indexes three sequences and checks that searching with the first returns
/// it as the top hit and ranks the near-duplicate ahead of the unrelated one.
#[test]
fn test_kmer_index_insert_and_search() {
    let _temp_dir = create_test_db();

    // k = 11 for both the encoder and the index.
    let encoder = KmerEncoder::new(11).expect("Failed to create encoder");
    let index = KmerIndex::new(11, encoder.dimensions()).expect("Failed to create index");

    let seq1 = b"ACGTACGTACGTACGTACGT";
    let seq2 = b"ACGTACGTACGTACGTACGG"; // Similar to seq1
    let seq3 = b"TTTTTTTTTTTTTTTTTTTT"; // Very different
    index
        .index_sequence("seq1", seq1)
        .expect("Failed to index seq1");
    index
        .index_sequence("seq2", seq2)
        .expect("Failed to index seq2");
    index
        .index_sequence("seq3", seq3)
        .expect("Failed to index seq3");

    let results = index.search_similar(seq1, 3).expect("Failed to search");
    assert!(results.len() > 0, "Should find at least one result");

    // The query itself must come back first with near-zero distance.
    let best = &results[0];
    assert_eq!(best.id, "seq1", "First result should be exact match");
    assert!(
        best.distance < 0.01,
        "Exact match should have very low distance: {}",
        best.distance
    );

    // Ranking is only compared when both other sequences were returned.
    let rank_of = |id: &str| results.iter().position(|r| r.id == id);
    if let Some((idx2, idx3)) = rank_of("seq2").zip(rank_of("seq3")) {
        assert!(
            idx2 < idx3,
            "Similar sequence should rank higher than different sequence"
        );
    }
}
/// Batch-inserts 100 generated sequences and checks that a search afterwards
/// returns at least one result.
#[test]
fn test_kmer_index_batch_insert() {
    let _temp_dir = create_test_db();
    let encoder = KmerEncoder::new(11).expect("Failed to create encoder");
    let index = KmerIndex::new(11, encoder.dimensions()).expect("Failed to create index");

    // 100 sequences of 50 bases each, seeded by their position.
    let sequences: Vec<(String, Vec<u8>)> = (0..100)
        .map(|i| (format!("seq_{}", i), generate_random_sequence(50, i as u64)))
        .collect();

    // Borrowed (id, bases) view handed to the batch API.
    let batch: Vec<(&str, &[u8])> = sequences
        .iter()
        .map(|(id, seq)| (id.as_str(), seq.as_slice()))
        .collect();
    index
        .index_batch(batch)
        .expect("Failed to batch insert sequences");

    let query = b"ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT";
    let results = index.search_similar(query, 10).expect("Failed to search");
    assert!(results.len() > 0, "Should find results after batch insert");
}
/// Searching with an indexed sequence must return it first and place a
/// sequence differing only in its final base within the top three.
#[test]
fn test_kmer_similar_sequences_score_higher() {
    let _temp_dir = create_test_db();
    let encoder = KmerEncoder::new(11).expect("Failed to create encoder");
    let index = KmerIndex::new(11, encoder.dimensions()).expect("Failed to create index");

    // `similar_seq` differs from `base_seq` in the last base only.
    let base_seq = b"ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT";
    let similar_seq = b"ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGG";
    let random_seq = generate_random_sequence(40, 12345);

    index
        .index_sequence("base", base_seq)
        .expect("Failed to index base");
    index
        .index_sequence("similar", similar_seq)
        .expect("Failed to index similar");
    index
        .index_sequence("random", &random_seq)
        .expect("Failed to index random");

    let results = index
        .search_similar(base_seq, 10)
        .expect("Failed to search");
    assert!(results.len() > 0, "Should find at least one result");

    // Where did each sequence of interest land in the ranking?
    let rank_of = |id: &str| results.iter().position(|r| r.id == id);
    let base_pos = rank_of("base");
    let similar_pos = rank_of("similar");

    assert!(
        base_pos.is_some(),
        "Base sequence (exact match) should be found in results"
    );
    assert!(
        similar_pos.is_some(),
        "Similar sequence should be found in results"
    );

    let base_rank = base_pos.unwrap();
    let similar_rank = similar_pos.unwrap();
    assert_eq!(
        base_rank, 0,
        "Base sequence should be the top result (exact match)"
    );
    assert!(
        similar_rank < 3,
        "Similar sequence should rank in top 3, was at position {}",
        similar_rank
    );
}
/// Encoders for k = 11, 21 and 31 must each produce a vector of their own
/// dimensionality, and all three vectors must be L2-normalized.
#[test]
fn test_kmer_different_k_values() {
    // k = 11
    let encoder11 = KmerEncoder::new(11).expect("Failed to create k=11 encoder");
    let vec11 = encoder11
        .encode_sequence(b"ACGTACGTACGTACGTACGTACGTACGT")
        .expect("Failed to encode with k=11");
    assert_eq!(vec11.len(), encoder11.dimensions());

    // k = 21
    let encoder21 = KmerEncoder::new(21).expect("Failed to create k=21 encoder");
    let vec21 = encoder21
        .encode_sequence(b"ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT")
        .expect("Failed to encode with k=21");
    assert_eq!(vec21.len(), encoder21.dimensions());

    // k = 31
    let encoder31 = KmerEncoder::new(31).expect("Failed to create k=31 encoder");
    let vec31 = encoder31
        .encode_sequence(b"ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT")
        .expect("Failed to encode with k=31");
    assert_eq!(vec31.len(), encoder31.dimensions());

    // Every encoding has unit L2 norm.
    let encoded = [(vec11, 11), (vec21, 21), (vec31, 31)];
    for (vec, k) in encoded.iter() {
        let norm: f32 = vec.iter().map(|x| x * x).sum::<f32>().sqrt();
        assert!(
            (norm - 1.0).abs() < 1e-5,
            "k={} vector should be normalized",
            k
        );
    }
}
/// A MinHash sketch must hold between 1 and `num_hashes` values, in
/// non-decreasing order.
#[test]
fn test_minhash_sketch_basic() {
    let num_hashes = 100;
    let mut sketch = MinHashSketch::new(num_hashes);
    let hashes = sketch
        .sketch(b"ACGTACGTACGTACGTACGTACGTACGTACGT", 11)
        .expect("Failed to sketch sequence");

    let count = hashes.len();
    assert!(
        count <= num_hashes,
        "Sketch should have at most {} hashes, got {}",
        num_hashes,
        count
    );
    assert!(count > 0, "Sketch should have at least one hash");

    // Sortedness is an implementation detail this test pins down.
    for i in 1..count {
        assert!(hashes[i - 1] <= hashes[i], "Hashes should be sorted");
    }
}
/// Two sketches of the same sequence must have a Jaccard distance below 0.01.
#[test]
fn test_minhash_jaccard_identical() {
    let sequence = b"ACGTACGTACGTACGTACGTACGTACGTACGT";

    let mut sketch1 = MinHashSketch::new(100);
    sketch1
        .sketch(sequence, 11)
        .expect("Failed to sketch sequence 1");

    let mut sketch2 = MinHashSketch::new(100);
    sketch2
        .sketch(sequence, 11)
        .expect("Failed to sketch sequence 2");

    let distance = sketch1.jaccard_distance(&sketch2);
    let nearly_zero = distance < 0.01;
    assert!(
        nearly_zero,
        "Identical sequences should have distance close to 0, got {}",
        distance
    );
}
/// Sketches of an all-A and an all-C sequence must have a Jaccard distance
/// above 0.9.
#[test]
fn test_minhash_jaccard_different() {
    let mut sketch1 = MinHashSketch::new(100);
    sketch1
        .sketch(b"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", 11)
        .expect("Failed to sketch sequence 1");

    let mut sketch2 = MinHashSketch::new(100);
    sketch2
        .sketch(b"CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC", 11)
        .expect("Failed to sketch sequence 2");

    let distance = sketch1.jaccard_distance(&sketch2);
    let nearly_one = distance > 0.9;
    assert!(
        nearly_one,
        "Very different sequences should have distance close to 1, got {}",
        distance
    );
}
/// With k=11, encoding must fail for an empty input and for an input shorter
/// than k.
#[test]
fn test_kmer_index_empty_sequence() {
    let encoder = KmerEncoder::new(11).expect("Failed to create encoder");

    // No bases at all.
    assert!(
        encoder.encode_sequence(b"").is_err(),
        "Empty sequence should return error"
    );

    // Four bases cannot hold a single 11-mer.
    assert!(
        encoder.encode_sequence(b"ACGT").is_err(),
        "Sequence shorter than k should return error"
    );
}
/// A sequence containing `N` (unknown) bases must still encode into a vector
/// of the encoder's dimensionality.
#[test]
fn test_kmer_index_with_n_bases() {
    let encoder = KmerEncoder::new(11).expect("Failed to create encoder");
    let seq_with_n = b"ACGTACGTNNNACGTACGT";

    let vector = match encoder.encode_sequence(seq_with_n) {
        Ok(encoded) => encoded,
        Err(_) => panic!("Sequence with N bases should encode successfully"),
    };
    assert_eq!(
        vector.len(),
        encoder.dimensions(),
        "Vector should have correct dimensions"
    );
}
/// Builds a pseudo-random DNA sequence of `length` bases from `seed`.
///
/// The base at position `i` is chosen by hashing `seed` and then `i` with a
/// fresh `DefaultHasher` and taking the hash modulo 4 as an index into
/// `A`, `C`, `G`, `T`. No external randomness is involved, so repeated calls
/// with the same arguments give the same bytes for a given toolchain.
fn generate_random_sequence(length: usize, seed: u64) -> Vec<u8> {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    const BASES: &[u8; 4] = b"ACGT";

    (0..length)
        .map(|position| {
            let mut hasher = DefaultHasher::new();
            seed.hash(&mut hasher);
            position.hash(&mut hasher);
            BASES[(hasher.finish() % 4) as usize]
        })
        .collect()
}

View File

@@ -0,0 +1,353 @@
//! End-to-End Integration Tests for DNA Analysis Pipeline
//!
//! Real data, real computation, real assertions. No mocks, no stubs.
//! Tests the complete DNA analysis workflow from nucleotide encoding
//! through variant calling, protein translation, epigenetics, and pharmacogenomics.
use ::rvdna::*;
// ============================================================================
// NUCLEOTIDE & SEQUENCE TESTS
// ============================================================================
/// `Nucleotide::to_u8` and `Nucleotide::from_u8` must map A, C, G, T, N to
/// 0, 1, 2, 3, 4 and back.
#[test]
fn test_nucleotide_encoding() {
    // Fresh (variant, code) table per pass, so each loop can consume it.
    let pairs = || {
        [
            (Nucleotide::A, 0),
            (Nucleotide::C, 1),
            (Nucleotide::G, 2),
            (Nucleotide::T, 3),
            (Nucleotide::N, 4),
        ]
    };

    // Variant -> code.
    for (base, code) in pairs() {
        assert_eq!(base.to_u8(), code);
    }
    // Code -> variant.
    for (base, code) in pairs() {
        assert_eq!(Nucleotide::from_u8(code).unwrap(), base);
    }
}
/// Checks `reverse_complement` against three known (input, expected) pairs,
/// including the palindrome ACGT.
#[test]
fn test_dna_sequence_reverse_complement() {
    let cases = [
        ("ACGT", "ACGT"),
        ("AACG", "CGTT"),
        ("ATGCATGC", "GCATGCAT"),
    ];
    for &(input, expected) in cases.iter() {
        let seq = DnaSequence::from_str(input).unwrap();
        let rc = seq.reverse_complement();
        assert_eq!(rc.to_string(), expected);
    }
}
// ============================================================================
// VARIANT CALLING TESTS
// ============================================================================
/// Fifteen quality-40 `G` reads against reference `A` must be called as a
/// homozygous-alt A>G variant with quality above 20.
#[test]
fn test_variant_calling_homozygous_snp() {
    let read_count = 15;
    let pileup = PileupColumn {
        bases: vec![b'G'; read_count],
        qualities: vec![40; read_count],
        position: 1000,
        chromosome: 1,
    };

    let caller = VariantCaller::new(VariantCallerConfig::default());
    let call = caller.call_snp(&pileup, b'A').expect("Should call variant");

    assert_eq!(call.genotype, Genotype::HomAlt);
    assert_eq!(call.alt_allele, b'G');
    assert_eq!(call.ref_allele, b'A');
    assert!(call.quality > 20.0);
}
/// Ten `A` reads followed by ten `G` reads (quality 40) against reference `A`
/// must be called heterozygous with alt allele `G` and quality above 20.
#[test]
fn test_variant_calling_heterozygous_snp() {
    // Reference reads first, then alternate reads.
    let bases = [vec![b'A'; 10], vec![b'G'; 10]].concat();
    let pileup = PileupColumn {
        bases,
        qualities: vec![40; 20],
        position: 2000,
        chromosome: 1,
    };

    let caller = VariantCaller::new(VariantCallerConfig::default());
    let call = caller.call_snp(&pileup, b'A').expect("Should call variant");

    assert_eq!(call.genotype, Genotype::Het);
    assert_eq!(call.alt_allele, b'G');
    assert!(call.quality > 20.0);
}
/// A pileup of reference-only reads may yield no call; if a call is returned,
/// it must keep the reference allele and an alt fraction below 0.2.
#[test]
fn test_variant_calling_no_variant() {
    let pileup = PileupColumn {
        bases: vec![b'A'; 20],
        qualities: vec![40; 20],
        position: 3000,
        chromosome: 1,
    };
    let caller = VariantCaller::new(VariantCallerConfig::default());

    // `None` is acceptable here; only a returned call is inspected.
    if let Some(c) = caller.call_snp(&pileup, b'A') {
        assert_eq!(c.ref_allele, b'A');
        let alt_fraction = c.allele_depth as f32 / c.depth as f32;
        assert!(alt_fraction < 0.2);
    }
}
/// With `min_quality = 30` and `min_depth = 10`, `filter_variants` must keep
/// the first call as `Pass`, mark the second `LowQuality` (quality 25) and
/// the third `LowDepth` (depth 5).
#[test]
fn test_variant_quality_filtering() {
    let mut config = VariantCallerConfig::default();
    config.min_quality = 30;
    config.min_depth = 10;
    let caller = VariantCaller::new(config);

    // All calls are heterozygous on chromosome 1 and start out as `Pass`.
    let het_call = |position, ref_allele, alt_allele, quality, depth, allele_depth| VariantCall {
        chromosome: 1,
        position,
        ref_allele,
        alt_allele,
        quality,
        genotype: Genotype::Het,
        depth,
        allele_depth,
        filter_status: FilterStatus::Pass,
    };
    let mut calls = vec![
        het_call(1000, b'A', b'G', 35.0, 20, 10),
        het_call(2000, b'C', b'T', 25.0, 20, 10),
        het_call(3000, b'G', b'A', 40.0, 5, 2),
    ];

    caller.filter_variants(&mut calls);

    assert_eq!(calls[0].filter_status, FilterStatus::Pass);
    assert_eq!(calls[1].filter_status, FilterStatus::LowQuality);
    assert_eq!(calls[2].filter_status, FilterStatus::LowDepth);
}
// ============================================================================
// PROTEIN TRANSLATION TESTS
// ============================================================================
/// `ATG GCA GGT` must translate to Met, Ala, Gly.
#[test]
fn test_protein_translation() {
    use ::rvdna::protein::{translate_dna, AminoAcid};

    let expected = [AminoAcid::Met, AminoAcid::Ala, AminoAcid::Gly];
    let proteins = translate_dna(b"ATGGCAGGT");

    assert_eq!(proteins.len(), 3);
    for (i, residue) in expected.iter().enumerate() {
        assert_eq!(proteins[i], *residue);
    }
}
/// Sequences ending in TAA, TAG or TGA must each translate to two residues,
/// i.e. the final codon is not emitted.
#[test]
fn test_protein_translation_stop_codon() {
    use ::rvdna::protein::{translate_dna, AminoAcid};

    // TAA: also check the leading residue.
    let with_taa = translate_dna(b"ATGGCATAA");
    assert_eq!(with_taa.len(), 2);
    assert_eq!(with_taa[0], AminoAcid::Met);

    // TAG and TGA: length only.
    for dna in [b"ATGGCATAG", b"ATGGCATGA"] {
        let translated = translate_dna(dna);
        assert_eq!(translated.len(), 2);
    }
}
/// Spot-checks `AminoAcid::hydrophobicity` for five residues.
#[test]
fn test_amino_acid_hydrophobicity() {
    use ::rvdna::protein::AminoAcid;

    let expectations = [
        (AminoAcid::Ile, 4.5),
        (AminoAcid::Arg, -4.5),
        (AminoAcid::Val, 4.2),
        (AminoAcid::Lys, -3.9),
        (AminoAcid::Gly, -0.4),
    ];
    for (residue, expected) in expectations {
        assert_eq!(residue.hydrophobicity(), expected);
    }
}
// ============================================================================
// EPIGENETICS TESTS
// ============================================================================
/// A profile built from four beta values must expose four sites and a mean
/// methylation of 0.425 (within 0.001).
#[test]
fn test_methylation_profile_creation() {
    // ((chromosome, position), beta) per site.
    let sites = vec![
        ((1, 1000), 0.1),
        ((1, 2000), 0.5),
        ((2, 3000), 0.8),
        ((2, 4000), 0.3),
    ];
    let (positions, betas): (Vec<_>, Vec<_>) = sites.into_iter().unzip();

    let profile = MethylationProfile::from_beta_values(positions, betas);
    assert_eq!(profile.sites.len(), 4);

    let mean = profile.mean_methylation();
    assert!((mean - 0.425).abs() < 0.001);
}
/// The default Horvath clock must predict an age strictly between 0 and 150
/// for a 700-site profile with banded beta values.
#[test]
fn test_horvath_clock_prediction() {
    let clock = HorvathClock::default_clock();

    // 700 sites on chromosome 1, spaced 1000 apart.
    let positions: Vec<(u8, u64)> = (0..700).map(|i| (1, i * 1000)).collect();
    // First 100 sites at 0.3, next 100 at 0.7, the rest at 0.5.
    let betas: Vec<f32> = (0..700)
        .map(|i| match i {
            0..=99 => 0.3,
            100..=199 => 0.7,
            _ => 0.5,
        })
        .collect();

    let profile = MethylationProfile::from_beta_values(positions, betas);
    let predicted_age = clock.predict_age(&profile);
    assert!(predicted_age > 0.0);
    assert!(predicted_age < 150.0);
}
// ============================================================================
// PHARMACOGENOMICS TESTS
// ============================================================================
/// `call_star_allele` must return `Star1` for no variants, `Star4` for the
/// G>A variant at 42130692 and `Star5` for the deletion at 42126611.
#[test]
fn test_pharma_star_allele_calling() {
    // No variants.
    assert_eq!(call_star_allele(&[]), StarAllele::Star1);

    // Single G>A substitution.
    let star4_variant = [(42130692, b'G', b'A')];
    assert_eq!(call_star_allele(&star4_variant), StarAllele::Star4);

    // Deletion, written with `-` as the alt allele.
    let star5_variant = [(42126611, b'T', b'-')];
    assert_eq!(call_star_allele(&star5_variant), StarAllele::Star5);
}
/// `predict_phenotype` must return Normal for *1/*1 and *1/*4, and Poor for
/// *4/*4.
#[test]
fn test_pharma_metabolizer_phenotype() {
    // (first allele, second allele, expected phenotype)
    let cases = [
        (
            StarAllele::Star1,
            StarAllele::Star1,
            MetabolizerPhenotype::Normal,
        ),
        (
            StarAllele::Star1,
            StarAllele::Star4,
            MetabolizerPhenotype::Normal,
        ),
        (
            StarAllele::Star4,
            StarAllele::Star4,
            MetabolizerPhenotype::Poor,
        ),
    ];
    for (first, second, expected) in cases {
        assert_eq!(predict_phenotype(&first, &second), expected);
    }
}
// ============================================================================
// ALIGNMENT TESTS
// ============================================================================
/// Aligning ACGT against itself with the default configuration must score 8.
#[test]
fn test_smith_waterman_alignment() {
    let query = DnaSequence::from_str("ACGT").unwrap();
    let reference = DnaSequence::from_str("ACGT").unwrap();

    let aligner = SmithWaterman::new(AlignmentConfig::default());
    let score = aligner.align(&query, &reference).unwrap().score;

    // 4 matches * 2 points each
    assert_eq!(score, 8);
}
/// Attention-based alignment of a query embedded in a longer reference must
/// give a positive score.
#[test]
fn test_attention_alignment() {
    // The reference contains the query flanked by runs of T.
    let reference = DnaSequence::from_str("TTTTATCGATCGTTTT").unwrap();
    let query = DnaSequence::from_str("ATCGATCG").unwrap();

    let alignment = query.align_with_attention(&reference).unwrap();
    assert!(alignment.score > 0);
}
// ============================================================================
// FULL PIPELINE INTEGRATION
// ============================================================================
/// Pins the default `AnalysisConfig`: k-mer size 11, 512 vector dimensions,
/// minimum quality 20 and no extra parameters.
#[test]
fn test_pipeline_config_defaults() {
    let config = AnalysisConfig::default();

    let kmer_size = config.kmer_size;
    assert_eq!(kmer_size, 11);

    let vector_dims = config.vector_dims;
    assert_eq!(vector_dims, 512);

    let min_quality = config.min_quality;
    assert_eq!(min_quality, 20);

    assert!(config.parameters.is_empty());
}
/// Smoke test that runs each pipeline stage once, in order, on small inputs:
/// sequence handling, k-mer vectorization, variant calling, translation,
/// methylation age prediction, pharmacogenomics, alignment and protein
/// contact prediction. Each stage gets a minimal sanity assertion.
#[test]
fn test_full_pipeline_runs() {
// 1. Create and manipulate DNA: the reverse complement keeps the length.
let dna_seq = DnaSequence::from_str("ATGCGATCGATCGATCGATCGTAGCTAGCTAGC").unwrap();
let rev_comp = dna_seq.reverse_complement();
assert_eq!(rev_comp.len(), dna_seq.len());
// 2. K-mer vector: k=11 with 512 requested dimensions yields 512 entries.
let kmer_vec = dna_seq.to_kmer_vector(11, 512).unwrap();
assert_eq!(kmer_vec.len(), 512);
// 3. Variant calling: 2 reference (A) and 8 alternate (G) reads must
// produce some call.
let caller = VariantCaller::new(VariantCallerConfig::default());
let pileup = PileupColumn {
bases: vec![b'A', b'A', b'G', b'G', b'G', b'G', b'G', b'G', b'G', b'G'],
qualities: vec![40; 10],
position: 1000,
chromosome: 1,
};
assert!(caller.call_snp(&pileup, b'A').is_some());
// 4. Protein translation
// NOTE(review): `translate_dna` is used without a local `use`, unlike the
// protein translation tests earlier in this file; presumably it resolves
// through the `::rvdna::*` glob import. Confirm.
let proteins = translate_dna(b"ATGGCAGGTAAACCC");
assert!(!proteins.is_empty());
// 5. Methylation + Horvath: a three-site profile must give a positive age.
let profile = MethylationProfile::from_beta_values(
vec![(1, 1000), (1, 2000), (1, 3000)],
vec![0.3, 0.5, 0.7],
);
let age = HorvathClock::default_clock().predict_age(&profile);
assert!(age > 0.0);
// 6. Pharmacogenomics: the called allele paired with *1 is expected to be
// a Normal metabolizer.
let allele = call_star_allele(&[(42130692, b'G', b'A')]);
assert_eq!(allele, StarAllele::Star4);
let phenotype = predict_phenotype(&allele, &StarAllele::Star1);
assert_eq!(phenotype, MetabolizerPhenotype::Normal);
// 7. Alignment: the sequence against its own reverse complement must
// score above zero.
let alignment = dna_seq.align_with_attention(&rev_comp).unwrap();
assert!(alignment.score > 0);
// 8. Protein contact graph: build the graph with the 8.0 argument, then
// require at least one predicted contact.
let protein = ProteinSequence::new(vec![
ProteinResidue::A,
ProteinResidue::V,
ProteinResidue::L,
ProteinResidue::I,
ProteinResidue::F,
ProteinResidue::G,
ProteinResidue::K,
ProteinResidue::D,
ProteinResidue::E,
ProteinResidue::R,
ProteinResidue::M,
ProteinResidue::N,
]);
let graph = protein.build_contact_graph(8.0).unwrap();
let contacts = protein.predict_contacts(&graph).unwrap();
assert!(!contacts.is_empty());
}

View File

@@ -0,0 +1,191 @@
//! Security validation tests for DNA analyzer - NO MOCKS, real computation only
use ::rvdna::error::DnaError;
use ::rvdna::types::*;
use ::rvdna::VectorEntry;
use std::sync::{Arc, Mutex};
use std::thread;
#[test]
fn test_buffer_overflow_protection() {
    // Build a 10-million-base sequence by repeating A, C, G, T and confirm the
    // basic operations still complete on an input of that size.
    let base_count = 10_000_000;
    let repeating_acgt = (0..base_count).map(|idx| match idx % 4 {
        0 => Nucleotide::A,
        1 => Nucleotide::C,
        2 => Nucleotide::G,
        _ => Nucleotide::T,
    });
    let huge = DnaSequence::new(repeating_acgt.collect::<Vec<Nucleotide>>());
    assert_eq!(huge.len(), base_count);

    // The reverse complement keeps the length.
    let flipped = huge.reverse_complement();
    assert_eq!(flipped.len(), base_count);

    // K-mer vectorisation of the full sequence must succeed.
    let embedding = huge.to_kmer_vector(11, 512);
    assert!(embedding.is_ok());
}
#[test]
fn test_invalid_base_handling() {
    // Strings containing a letter, digits or punctuation outside A/C/G/T/N must
    // be reported as `DnaError::InvalidSequence`.
    let rejected = ["ACGTX", "ACGT123", "ACGT!@#"];
    for text in rejected {
        let parsed = DnaSequence::from_str(text);
        assert!(parsed.is_err());
        let failure = parsed.unwrap_err();
        assert!(matches!(failure, DnaError::InvalidSequence(_)));
    }

    // `N` is accepted, and so is lowercase input.
    let upper_with_n = DnaSequence::from_str("ACGTN");
    assert!(upper_with_n.is_ok());
    let lower_with_n = DnaSequence::from_str("acgtn");
    assert!(lower_with_n.is_ok());
}
#[test]
fn test_unicode_injection() {
    // Entry IDs are caller-supplied strings. Insert the same vector under plain
    // ASCII IDs and under IDs that really contain non-ASCII text, and require
    // every insert to succeed.
    //
    // The previous ID list was ASCII-only, so the "unicode" case named by this
    // test was never exercised. The non-ASCII IDs are written as `\u{..}`
    // escapes so that they survive tools that mangle non-ASCII source bytes.
    let seq = DnaSequence::from_str("ACGTACGT").unwrap();
    let vector = seq.to_kmer_vector(3, 128).unwrap();
    let temp_dir = std::env::temp_dir().join(format!("dna_test_{}", std::process::id()));
    // Best effort: if the directory cannot be created, `KmerIndex::new` below
    // is the call that surfaces the failure.
    let _ = std::fs::create_dir_all(&temp_dir);
    let index = KmerIndex::new(3, 128, temp_dir.join("unicode").to_str().unwrap()).unwrap();
    let ids = [
        // Original ASCII identifiers.
        "seq_cafe_dna",
        "patient123",
        "seq_hidden",
        // Accented Latin letter (U+00E9).
        "seq_caf\u{e9}_dna",
        // Two CJK characters followed by digits.
        "\u{60a3}\u{8005}123",
        // Invisible zero-width space (U+200B) embedded in the identifier.
        "seq_\u{200b}hidden",
    ];
    for id in ids {
        let entry = VectorEntry {
            id: Some(id.to_string()),
            vector: vector.clone(),
            metadata: None,
        };
        assert!(
            index.db().insert(entry).is_ok(),
            "insert failed for id {:?}",
            id
        );
    }
    // Best-effort cleanup of the per-process scratch directory.
    let _ = std::fs::remove_dir_all(&temp_dir);
}
#[test]
fn test_path_traversal_prevention() {
    // Hand `KmerIndex::new` paths that contain `..` components. The only
    // property checked here is that construction does not panic; whether it
    // returns Ok or Err is deliberately ignored.
    let sandbox = std::env::temp_dir().join(format!("dna_path_{}", std::process::id()));
    let _ = std::fs::create_dir_all(&sandbox);

    let traversal_inputs = ["../../../tmp/evil", "../../etc/passwd"];
    for relative in traversal_inputs {
        let candidate = sandbox.join(relative);
        // catch_unwind turns a panic inside the constructor into an Err here.
        let outcome = std::panic::catch_unwind(|| {
            KmerIndex::new(3, 128, candidate.to_str().unwrap())
        });
        assert!(outcome.is_ok(), "Path traversal should not cause panic");
    }

    // Best-effort cleanup: the sandbox itself, plus an `evil` directory under
    // the system temp dir in case one was created.
    let _ = std::fs::remove_dir_all(&sandbox);
    let stray = std::env::temp_dir().join("evil");
    let _ = std::fs::remove_dir_all(stray);
}
#[test]
fn test_integer_overflow_kmer() {
    // On a 16-base sequence: k = 64 must be reported as `InvalidKmerSize(64)`,
    // k = 0 must be an error, and k = 11 and k = 15 must both succeed.
    let sixteen_bases = DnaSequence::from_str("ACGTACGTACGTACGT").unwrap();

    let oversized = sixteen_bases.to_kmer_vector(64, 512).unwrap_err();
    assert!(matches!(oversized, DnaError::InvalidKmerSize(64)));

    let zero_k = sixteen_bases.to_kmer_vector(0, 512);
    assert!(zero_k.is_err());

    for k in [11, 15] {
        assert!(sixteen_bases.to_kmer_vector(k, 512).is_ok());
    }
}
#[test]
fn test_empty_input_safety() {
    // Parsing the empty string fails with `DnaError::EmptySequence`.
    let parse_failure = DnaSequence::from_str("").unwrap_err();
    assert!(matches!(parse_failure, DnaError::EmptySequence));

    // A sequence built directly from an empty vector is empty, and so are the
    // sequences derived from it.
    let blank = DnaSequence::new(vec![]);
    assert!(blank.is_empty());
    assert!(blank.len() == 0);

    let complemented = blank.complement();
    assert!(complemented.is_empty());
    let reversed = blank.reverse_complement();
    assert!(reversed.is_empty());

    // It also renders as the empty string.
    assert_eq!(blank.to_string(), "");
}
#[test]
fn test_null_byte_handling() {
    // A NUL character appended to otherwise valid bases must make parsing fail.
    let parsed = DnaSequence::from_str("ACGT\0");
    assert!(parsed.is_err());
}
#[test]
fn test_concurrent_access_safety() {
    // Ten threads each insert one entry into a single index that is shared
    // behind `Arc<Mutex<_>>`; every thread must finish without panicking.
    let workdir = std::env::temp_dir().join(format!("dna_conc_{}", std::process::id()));
    let _ = std::fs::create_dir_all(&workdir);

    let index_path = workdir.join("idx");
    let shared_index = Arc::new(Mutex::new(
        KmerIndex::new(3, 128, index_path.to_str().unwrap()).unwrap(),
    ));

    // Start every worker before joining any of them.
    let mut workers = Vec::new();
    for worker_id in 0..10 {
        let shared_index = Arc::clone(&shared_index);
        let worker = thread::spawn(move || {
            // Build the entry first; the lock is only taken for the insert.
            let seq = DnaSequence::from_str("ACGTACGTACGT").unwrap();
            let entry = VectorEntry {
                id: Some(format!("seq_{}", worker_id)),
                vector: seq.to_kmer_vector(3, 128).unwrap(),
                metadata: None,
            };
            shared_index.lock().unwrap().db().insert(entry).unwrap();
        });
        workers.push(worker);
    }

    // `join` returns Err if the thread panicked (for example on a failed insert).
    for worker in workers {
        assert!(worker.join().is_ok());
    }

    let _ = std::fs::remove_dir_all(&workdir);
}
#[test]
fn test_quality_score_bounds() {
    // A score of 100 is rejected with `InvalidQuality(100)`.
    let too_high = QualityScore::new(100).unwrap_err();
    assert!(matches!(too_high, DnaError::InvalidQuality(100)));

    // Both ends of the accepted range checked here, 0 and 93, are valid.
    for phred in [0, 93] {
        assert!(QualityScore::new(phred).is_ok());
    }

    // Score 30 maps to an error probability of 0.001 (within 1e-6).
    let p_at_30 = QualityScore::new(30).unwrap().to_error_probability();
    assert!((p_at_30 - 0.001).abs() < 1e-6);

    // Score 0 maps to an error probability of 1.0 (within 0.01).
    let p_at_0 = QualityScore::new(0).unwrap().to_error_probability();
    assert!((p_at_0 - 1.0).abs() < 0.01);
}
#[test]
fn test_variant_position_overflow() {
    // A `GenomicPosition` can be built with the largest `u64` coordinate and
    // reads the same value back from its `position` field.
    let largest = u64::MAX;
    let variant = GenomicPosition {
        chromosome: 25,
        position: largest,
        reference_allele: Nucleotide::A,
        alternate_allele: Some(Nucleotide::G),
    };
    assert_eq!(variant.position, largest);
}
#[test]
fn test_methylation_bounds() {
    // Clamping any candidate beta value, including ones below 0 or above 1,
    // yields a number inside [0, 1].
    // NOTE(review): this exercises only `f32::clamp` from the standard library;
    // no methylation type from the crate is involved.
    let samples = [-0.5f32, 0.0, 0.5, 1.0, 1.5];
    for beta in samples.iter().copied().map(|raw| raw.clamp(0.0, 1.0)) {
        assert!((0.0..=1.0).contains(&beta));
    }
}
#[test]
fn test_deterministic_output() {
    // Each operation is run twice on the same sequence and the two results
    // must compare equal.
    let seq = DnaSequence::from_str("ACGTACGTACGTACGT").unwrap();

    let first_vector = seq.to_kmer_vector(11, 512).unwrap();
    let second_vector = seq.to_kmer_vector(11, 512).unwrap();
    assert_eq!(first_vector, second_vector);

    let first_revcomp = seq.reverse_complement().to_string();
    let second_revcomp = seq.reverse_complement().to_string();
    assert_eq!(first_revcomp, second_revcomp);

    let first_complement = seq.complement().to_string();
    let second_complement = seq.complement().to_string();
    assert_eq!(first_complement, second_complement);

    let first_text = seq.to_string();
    let second_text = seq.to_string();
    assert_eq!(first_text, second_text);
}