Squashed 'vendor/ruvector/' content from commit b64c2172
git-subtree-dir: vendor/ruvector git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
This commit is contained in:
409
examples/dna/tests/biomarker_tests.rs
Normal file
409
examples/dna/tests/biomarker_tests.rs
Normal file
@@ -0,0 +1,409 @@
|
||||
//! Integration tests for the biomarker analysis engine.
|
||||
//!
|
||||
//! Tests composite risk scoring, profile vector encoding, clinical biomarker
|
||||
//! references, synthetic population generation, and streaming biomarker
|
||||
//! processing with anomaly and trend detection.
|
||||
|
||||
use rvdna::biomarker::*;
|
||||
use rvdna::biomarker_stream::*;
|
||||
use std::collections::HashMap;
|
||||
|
||||
// ============================================================================
|
||||
// COMPOSITE RISK SCORING TESTS
|
||||
// ============================================================================
|
||||
|
||||
/// An all-reference genotype panel must yield a global risk score below 0.3
/// and at least one populated category score.
#[test]
fn test_compute_risk_scores_baseline() {
    // All homozygous reference (low risk) genotypes
    let mut gts = HashMap::new();
    for (rsid, genotype) in [
        ("rs429358", "TT"),   // APOE ref
        ("rs7412", "CC"),     // APOE ref
        ("rs4680", "GG"),     // COMT ref
        ("rs1799971", "AA"),  // OPRM1 ref
        ("rs762551", "AA"),   // CYP1A2 fast
        ("rs1801133", "GG"),  // MTHFR ref
        ("rs1801131", "TT"),  // MTHFR ref
        ("rs1042522", "CC"),  // TP53 ref
        ("rs80357906", "DD"), // BRCA1 ref
        ("rs4363657", "TT"),  // SLCO1B1 ref
    ] {
        gts.insert(rsid.to_string(), genotype.to_string());
    }

    // The genotype map is passed by reference (`&gts`).
    let profile = compute_risk_scores(&gts);
    assert!(
        profile.global_risk_score < 0.3,
        "Baseline should be low risk, got {}",
        profile.global_risk_score
    );
    assert!(!profile.category_scores.is_empty());
}
|
||||
|
||||
/// A panel of high-risk genotypes must push the global risk score above 0.4.
#[test]
fn test_compute_risk_scores_high_risk() {
    // High-risk genotype combinations
    let mut gts = HashMap::new();
    for (rsid, genotype) in [
        ("rs429358", "CC"),  // APOE e4/e4
        ("rs7412", "CC"),
        ("rs4680", "AA"),    // COMT Met/Met
        ("rs1799971", "GG"), // OPRM1 Asp/Asp
        ("rs1801133", "AA"), // MTHFR 677TT
        ("rs1801131", "GG"), // MTHFR 1298CC
        ("rs4363657", "CC"), // SLCO1B1 hom variant
    ] {
        gts.insert(rsid.to_string(), genotype.to_string());
    }

    // The genotype map is passed by reference (`&gts`).
    let profile = compute_risk_scores(&gts);
    assert!(
        profile.global_risk_score > 0.4,
        "High-risk should score >0.4, got {}",
        profile.global_risk_score
    );
}
|
||||
|
||||
// ============================================================================
|
||||
// PROFILE VECTOR TESTS
|
||||
// ============================================================================
|
||||
|
||||
/// Even with no genotypes supplied, the profile vector must have 64 entries.
#[test]
fn test_profile_vector_dimension() {
    let gts = HashMap::new(); // empty genotypes
    // The genotype map is passed by reference (`&gts`).
    let profile = compute_risk_scores(&gts);
    assert_eq!(
        profile.profile_vector.len(),
        64,
        "Profile vector must be exactly 64 dimensions"
    );
}
|
||||
|
||||
/// The profile vector must be L2-normalized (magnitude within 0.01 of 1.0),
/// with an all-zero vector accepted as the only exception.
#[test]
fn test_profile_vector_normalized() {
    let mut gts = HashMap::new();
    gts.insert("rs429358".to_string(), "CT".to_string());
    gts.insert("rs4680".to_string(), "AG".to_string());
    // The genotype map is passed by reference (`&gts`).
    let profile = compute_risk_scores(&gts);

    // Euclidean norm: square root of the sum of squared components.
    let mag: f32 = profile
        .profile_vector
        .iter()
        .map(|x| x * x)
        .sum::<f32>()
        .sqrt();
    assert!(
        (mag - 1.0).abs() < 0.01 || mag == 0.0,
        "Vector should be L2-normalized, got magnitude {}",
        mag
    );
}
|
||||
|
||||
// ============================================================================
|
||||
// BIOMARKER REFERENCE TESTS
|
||||
// ============================================================================
|
||||
|
||||
/// The built-in reference table must list at least 13 biomarkers.
#[test]
fn test_biomarker_references_exist() {
    let references = biomarker_references();
    let reference_count = references.len();
    assert!(
        reference_count >= 13,
        "Should have at least 13 biomarker references, got {}",
        reference_count
    );
}
|
||||
|
||||
/// Checks `z_score` against the "Total Cholesterol" reference: a value of
/// 180 must give |z| < 2, and a value of 300 must give a positive z.
#[test]
fn test_z_score_computation() {
    let references = biomarker_references();
    let total_cholesterol = references
        .iter()
        .find(|entry| entry.name == "Total Cholesterol")
        .unwrap();

    // Normal value should have |z| < 2
    let typical_z = z_score(180.0, total_cholesterol);
    assert!(
        typical_z.abs() < 2.0,
        "Normal cholesterol z-score should be small: {}",
        typical_z
    );

    // High value should have z > 0
    let elevated_z = z_score(300.0, total_cholesterol);
    assert!(
        elevated_z > 0.0,
        "High cholesterol should have positive z-score: {}",
        elevated_z
    );
}
|
||||
|
||||
/// Fasting glucose values of 85 and 200 must fall into different classes,
/// compared through their `Debug` renderings.
#[test]
fn test_biomarker_classification() {
    let references = biomarker_references();
    let fasting_glucose = references
        .iter()
        .find(|entry| entry.name == "Fasting Glucose")
        .unwrap();

    // 85 is expected to be in the normal range, 200 in a high/critical one.
    let in_range = classify_biomarker(85.0, fasting_glucose);
    let elevated = classify_biomarker(200.0, fasting_glucose);

    let in_range_label = format!("{:?}", in_range);
    let elevated_label = format!("{:?}", elevated);
    assert_ne!(in_range_label, elevated_label);
}
|
||||
|
||||
// ============================================================================
|
||||
// SYNTHETIC POPULATION TESTS
|
||||
// ============================================================================
|
||||
|
||||
/// A 100-member synthetic population (seed 42) must have 64-dimensional
/// profile vectors and global risk scores spread over more than 0.1.
#[test]
fn test_synthetic_population() {
    let population = generate_synthetic_population(100, 42);
    assert_eq!(population.len(), 100);

    // All vectors should be 64-dim
    population
        .iter()
        .for_each(|member| assert_eq!(member.profile_vector.len(), 64));

    // Risk scores should span a range: track the extremes in a single pass.
    let (min, max) = population
        .iter()
        .map(|member| member.global_risk_score)
        .fold(
            (f64::INFINITY, f64::NEG_INFINITY),
            |(lowest, highest), score| (f64::min(lowest, score), f64::max(highest, score)),
        );
    assert!(
        max - min > 0.1,
        "Population should have risk score variance, range: {:.3}..{:.3}",
        min,
        max
    );
}
|
||||
|
||||
/// Two populations generated with the same size and seed must have the same
/// length and pairwise global risk scores that agree to within 1e-10.
#[test]
fn test_synthetic_population_deterministic() {
    let first_run = generate_synthetic_population(50, 42);
    let second_run = generate_synthetic_population(50, 42);
    assert_eq!(first_run.len(), second_run.len());

    first_run
        .iter()
        .zip(second_run.iter())
        .for_each(|(a, b)| {
            assert!((a.global_risk_score - b.global_risk_score).abs() < 1e-10);
        });
}
|
||||
|
||||
// ============================================================================
|
||||
// STREAMING TESTS
|
||||
// ============================================================================
|
||||
|
||||
/// Pushing fewer items than the capacity keeps all of them in insertion order.
#[test]
fn test_ring_buffer_basic() {
    let mut ring: RingBuffer<f64> = RingBuffer::new(5);
    for value in [0.0, 1.0, 2.0] {
        ring.push(value);
    }

    assert_eq!(ring.len(), 3);
    let contents: Vec<f64> = ring.iter().cloned().collect();
    assert_eq!(contents, vec![0.0, 1.0, 2.0]);
}
|
||||
|
||||
/// Pushing five items into a capacity-3 buffer keeps only the last three.
#[test]
fn test_ring_buffer_overflow() {
    let mut ring: RingBuffer<f64> = RingBuffer::new(3);
    for value in [0.0, 1.0, 2.0, 3.0, 4.0] {
        ring.push(value);
    }

    assert_eq!(ring.len(), 3);
    let contents: Vec<f64> = ring.iter().cloned().collect();
    assert_eq!(contents, vec![2.0, 3.0, 4.0]);
}
|
||||
|
||||
/// `generate_readings` with a count of 1000 must return
/// `1000 * num_biomarkers` readings, each with a strictly positive value.
#[test]
fn test_stream_generation() {
    let config = StreamConfig::default();
    // generate_readings produces count * num_biomarkers total readings
    let expected_total = 1000 * config.num_biomarkers;
    let readings = generate_readings(&config, 1000, 42);
    assert_eq!(readings.len(), expected_total);

    // All values should be positive
    for reading in &readings {
        assert!(
            reading.value > 0.0,
            "Biomarker values should be positive: {} = {}",
            reading.biomarker_id,
            reading.value
        );
    }
}
|
||||
|
||||
/// Feeds 500 generated readings per biomarker through a `StreamProcessor` and
/// checks the summary's reading count and that the anomaly rate is below 0.2.
#[test]
fn test_stream_processor() {
    let config = StreamConfig::default();
    // Capture the expected count before `config` is moved into the processor.
    let expected_total = 500 * config.num_biomarkers as u64;
    let readings = generate_readings(&config, 500, 42);

    let mut processor = StreamProcessor::new(config);
    for sample in &readings {
        processor.process_reading(sample);
    }

    let report = processor.summary();
    assert_eq!(report.total_readings, expected_total);
    assert!(
        report.anomaly_rate < 0.2,
        "Anomaly rate should be reasonable: {}",
        report.anomaly_rate
    );
}
|
||||
|
||||
/// With `anomaly_probability` set to 0.0 and a single biomarker, the
/// processor's reported anomaly rate over 200 readings must be below 0.1.
#[test]
fn test_anomaly_detection() {
    let quiet_config = StreamConfig {
        num_biomarkers: 1,
        anomaly_probability: 0.0, // No random anomalies
        ..StreamConfig::default()
    };

    let readings = generate_readings(&quiet_config, 200, 42);
    let mut processor = StreamProcessor::new(quiet_config);
    for sample in &readings {
        processor.process_reading(sample);
    }

    // With no anomaly injection, anomaly rate should be very low
    let report = processor.summary();
    assert!(
        report.anomaly_rate < 0.1,
        "Without injection, anomaly rate should be low: {}",
        report.anomaly_rate
    );
}
|
||||
|
||||
// ============================================================================
|
||||
// GENE-GENE INTERACTION TESTS
|
||||
// ============================================================================
|
||||
|
||||
/// The "Neurological" category score for MTHFR A1298C hom-alt plus COMT
/// Met/Met must exceed the score for COMT Met/Met alone.
#[test]
fn test_mthfr_comt_interaction() {
    // MTHFR A1298C hom + COMT Met/Met should amplify neurological score
    let mut gts_both = HashMap::new();
    gts_both.insert("rs1801131".to_string(), "GG".to_string()); // A1298C hom_alt
    gts_both.insert("rs4680".to_string(), "AA".to_string()); // COMT Met/Met
    // The genotype map is passed by reference (`&gts_both`).
    let both = compute_risk_scores(&gts_both);

    let mut gts_one = HashMap::new();
    gts_one.insert("rs4680".to_string(), "AA".to_string()); // COMT Met/Met only
    let one = compute_risk_scores(&gts_one);

    let n_both = both.category_scores.get("Neurological").unwrap().score;
    let n_one = one.category_scores.get("Neurological").unwrap().score;
    assert!(
        n_both > n_one,
        "MTHFR×COMT interaction should amplify: {n_both} > {n_one}"
    );
}
|
||||
|
||||
/// The "Neurological" category score for DRD2 hom-alt plus COMT Met/Met must
/// exceed the score for DRD2 hom-alt alone.
#[test]
fn test_drd2_comt_interaction() {
    // DRD2 Taq1A + COMT variant should amplify neurological score
    let mut gts = HashMap::new();
    gts.insert("rs1800497".to_string(), "AA".to_string()); // DRD2 hom_alt
    gts.insert("rs4680".to_string(), "AA".to_string()); // COMT Met/Met
    // The genotype map is passed by reference (`&gts`).
    let with = compute_risk_scores(&gts);

    let mut gts2 = HashMap::new();
    gts2.insert("rs1800497".to_string(), "AA".to_string()); // DRD2 only
    let without = compute_risk_scores(&gts2);

    let n_with = with.category_scores.get("Neurological").unwrap().score;
    let n_without = without.category_scores.get("Neurological").unwrap().score;
    assert!(
        n_with > n_without,
        "DRD2×COMT interaction should amplify: {n_with} > {n_without}"
    );
}
|
||||
|
||||
// ============================================================================
|
||||
// GENE-BIOMARKER CORRELATION TESTS
|
||||
// ============================================================================
|
||||
|
||||
/// Splits a 300-member synthetic population (seed 88) by "Neurological" score
/// (> 0.3 versus the rest) and checks that mean HDL is lower in the
/// high-score group.
///
/// If either group is empty, no assertion is made and the test passes.
#[test]
fn test_apoe_lowers_hdl_in_population() {
    let population = generate_synthetic_population(300, 88);
    let mut apoe_hdl: Vec<f64> = Vec::new();
    let mut ref_hdl: Vec<f64> = Vec::new();

    for member in &population {
        // A missing HDL value counts as 0.0.
        let hdl = member.biomarker_values.get("HDL").copied().unwrap_or(0.0);
        // APOE carriers have elevated neurological scores from rs429358
        let neuro = member
            .category_scores
            .get("Neurological")
            .map(|category| category.score)
            .unwrap_or(0.0);

        let bucket = if neuro > 0.3 {
            &mut apoe_hdl
        } else {
            &mut ref_hdl
        };
        bucket.push(hdl);
    }

    // A comparison needs at least one member in each group.
    if apoe_hdl.is_empty() || ref_hdl.is_empty() {
        return;
    }

    let avg_apoe = apoe_hdl.iter().sum::<f64>() / apoe_hdl.len() as f64;
    let avg_ref = ref_hdl.iter().sum::<f64>() / ref_hdl.len() as f64;
    assert!(
        avg_apoe < avg_ref,
        "APOE e4 should lower HDL: {avg_apoe} < {avg_ref}"
    );
}
|
||||
|
||||
/// Feeds 30 "glucose" readings at 85.0 followed by 20 at 120.0 and checks that
/// the tracked mean ends above 90.0.
///
/// Only the mean is asserted. Per the original author's note, the CUSUM
/// changepoint flag resets after it triggers, so its final state is not
/// checked here.
#[test]
fn test_cusum_changepoint_detection() {
    let mut processor = StreamProcessor::new(StreamConfig {
        window_size: 20,
        ..Default::default()
    });

    // Phase 1 establishes the baseline; phase 2 injects a sustained shift
    // (changepoint). Readings are one second apart.
    for (ticks, level) in [(0..30, 85.0), (30..50, 120.0)] {
        for tick in ticks {
            processor.process_reading(&BiomarkerReading {
                timestamp_ms: tick * 1000,
                biomarker_id: "glucose".into(),
                value: level,
                reference_low: 70.0,
                reference_high: 100.0,
                is_anomaly: false,
                z_score: 0.0,
            });
        }
    }

    let glucose_stats = processor.get_stats("glucose").unwrap();
    assert!(
        glucose_stats.mean > 90.0,
        "Mean should shift upward after changepoint: {}",
        glucose_stats.mean
    );
}
|
||||
|
||||
/// With `drift_rate` 0.5 and no injected anomalies, every biomarker in the
/// summary must report a positive `trend_slope`.
#[test]
fn test_trend_detection() {
    let drifting_config = StreamConfig {
        window_size: 50,
        num_biomarkers: 1,
        anomaly_probability: 0.0,
        drift_rate: 0.5, // Strong upward drift
        ..StreamConfig::default()
    };

    let readings = generate_readings(&drifting_config, 200, 42);
    let mut processor = StreamProcessor::new(drifting_config);
    for sample in &readings {
        processor.process_reading(sample);
    }

    // Should detect positive trend
    let report = processor.summary();
    for (_id, marker_stats) in &report.biomarker_stats {
        assert!(
            marker_stats.trend_slope > 0.0,
            "Should detect upward trend, got slope: {}",
            marker_stats.trend_slope
        );
    }
}
|
||||
403
examples/dna/tests/kmer_tests.rs
Normal file
403
examples/dna/tests/kmer_tests.rs
Normal file
@@ -0,0 +1,403 @@
|
||||
//! Integration tests for k-mer indexing module
|
||||
//!
|
||||
//! These tests use real VectorDB instances to validate k-mer encoding,
|
||||
//! indexing, and similarity search functionality.
|
||||
|
||||
use ::rvdna::kmer::{canonical_kmer, KmerEncoder, KmerIndex, MinHashSketch};
|
||||
use tempfile::TempDir;
|
||||
|
||||
/// Helper to create a test directory that will be automatically cleaned up
|
||||
fn create_test_db() -> TempDir {
|
||||
TempDir::new().expect("Failed to create temp directory")
|
||||
}
|
||||
|
||||
/// Encoding "ACGTACGT" with k=4 must give a vector of the encoder's
/// dimensionality, with unit L2 norm and at least one non-zero component.
#[test]
fn test_kmer_encoding_basic() {
    let encoder = KmerEncoder::new(4).expect("Failed to create encoder");
    let encoded = encoder
        .encode_sequence(b"ACGTACGT")
        .expect("Failed to encode sequence");

    // Verify vector has correct dimensions
    assert_eq!(
        encoded.len(),
        encoder.dimensions(),
        "Vector dimensions should match encoder dimensions"
    );

    // Verify L2 normalization
    let sum_of_squares = encoded.iter().map(|x| x * x).sum::<f32>();
    let magnitude: f32 = sum_of_squares.sqrt();
    assert!(
        (magnitude - 1.0).abs() < 1e-5,
        "Vector should be L2 normalized, got magnitude: {}",
        magnitude
    );

    // Verify non-zero elements exist (sequence has k-mers)
    let populated = encoded.iter().filter(|component| **component != 0.0).count();
    assert!(populated > 0, "Vector should have non-zero elements");
}
|
||||
|
||||
/// Encoding the same sequence twice with one encoder must give vectors of
/// equal length whose components agree to within 1e-6.
#[test]
fn test_kmer_encoding_deterministic() {
    let encoder = KmerEncoder::new(11).expect("Failed to create encoder");
    let sequence = b"ACGTACGTACGTACGTACGT";

    let first_pass = encoder
        .encode_sequence(sequence)
        .expect("Failed to encode sequence first time");
    let second_pass = encoder
        .encode_sequence(sequence)
        .expect("Failed to encode sequence second time");

    // Verify same sequence produces identical vectors
    assert_eq!(
        first_pass.len(),
        second_pass.len(),
        "Vectors should have same length"
    );

    let paired = first_pass.iter().zip(second_pass.iter());
    for (position, (lhs, rhs)) in paired.enumerate() {
        assert!(
            (lhs - rhs).abs() < 1e-6,
            "Vector element {} should be identical: {} vs {}",
            position,
            lhs,
            rhs
        );
    }
}
|
||||
|
||||
/// `canonical_kmer` must give the same result for a k-mer and its reverse
/// complement, checked on the palindrome ACGT and on the AAAA/TTTT pair.
#[test]
fn test_kmer_complement_symmetry() {
    // ACGT is its own reverse complement (palindrome).
    let palindrome_a = canonical_kmer(b"ACGT");
    let palindrome_b = canonical_kmer(b"ACGT");
    assert_eq!(palindrome_a, palindrome_b, "Canonical k-mers should be equal");

    // Test with non-palindrome: TTTT is the reverse complement of AAAA.
    let forward = canonical_kmer(b"AAAA");
    let reverse = canonical_kmer(b"TTTT");
    assert_eq!(
        forward, reverse,
        "Canonical k-mer should be same for sequence and revcomp"
    );
}
|
||||
|
||||
/// Indexes three sequences and searches with the first. The exact match must
/// rank first with distance < 0.01. If both the near-duplicate and the
/// dissimilar sequence are returned, the near-duplicate must rank higher.
#[test]
fn test_kmer_index_insert_and_search() {
    let _temp_dir = create_test_db();

    // Create index with k=11
    let encoder = KmerEncoder::new(11).expect("Failed to create encoder");
    let index = KmerIndex::new(11, encoder.dimensions()).expect("Failed to create index");

    // Insert 3 sequences
    let query = b"ACGTACGTACGTACGTACGT";
    let near_duplicate = b"ACGTACGTACGTACGTACGG"; // Similar to seq1
    let unrelated = b"TTTTTTTTTTTTTTTTTTTT"; // Very different

    index
        .index_sequence("seq1", query)
        .expect("Failed to index seq1");
    index
        .index_sequence("seq2", near_duplicate)
        .expect("Failed to index seq2");
    index
        .index_sequence("seq3", unrelated)
        .expect("Failed to index seq3");

    // Search for similar sequences to seq1
    let results = index.search_similar(query, 3).expect("Failed to search");
    let hit_count = results.len();
    assert!(hit_count > 0, "Should find at least one result");

    // First result should be seq1 itself (exact match)
    let top_hit = &results[0];
    assert_eq!(top_hit.id, "seq1", "First result should be exact match");
    assert!(
        top_hit.distance < 0.01,
        "Exact match should have very low distance: {}",
        top_hit.distance
    );

    // seq2 should be closer than seq3 (only checked when both were returned)
    let near_rank = results.iter().position(|hit| hit.id == "seq2");
    let far_rank = results.iter().position(|hit| hit.id == "seq3");
    if let Some((near, far)) = near_rank.zip(far_rank) {
        assert!(
            near < far,
            "Similar sequence should rank higher than different sequence"
        );
    }
}
|
||||
|
||||
/// Batch-indexes 100 pseudo-random 50-base sequences and checks that a
/// subsequent similarity search returns at least one result.
#[test]
fn test_kmer_index_batch_insert() {
    let _temp_dir = create_test_db();

    let encoder = KmerEncoder::new(11).expect("Failed to create encoder");
    let index = KmerIndex::new(11, encoder.dimensions()).expect("Failed to create index");

    // Generate 100 random sequences, each seeded with its own ordinal.
    let sequences: Vec<(String, Vec<u8>)> = (0..100)
        .map(|ordinal| {
            let bases = generate_random_sequence(50, ordinal as u64);
            (format!("seq_{}", ordinal), bases)
        })
        .collect();

    // Convert to reference slices for batch insert
    let batch: Vec<(&str, &[u8])> = sequences
        .iter()
        .map(|(label, bases)| (label.as_str(), bases.as_slice()))
        .collect();

    // Batch insert
    index
        .index_batch(batch)
        .expect("Failed to batch insert sequences");

    // Verify we can search and get results
    let query = b"ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT";
    let results = index.search_similar(query, 10).expect("Failed to search");
    let hit_count = results.len();
    assert!(hit_count > 0, "Should find results after batch insert");
}
|
||||
|
||||
/// Indexes a base sequence, a one-base variant and a pseudo-random sequence,
/// then searches with the base sequence. The base must rank first and the
/// variant within the top three.
#[test]
fn test_kmer_similar_sequences_score_higher() {
    let _temp_dir = create_test_db();

    let encoder = KmerEncoder::new(11).expect("Failed to create encoder");
    let index = KmerIndex::new(11, encoder.dimensions()).expect("Failed to create index");

    // Two near-identical 40-base sequences plus an unrelated one.
    let base_seq = b"ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT"; // 40 bases
    let similar_seq = b"ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGG"; // 1 base different
    let random_seq = generate_random_sequence(40, 12345);

    index
        .index_sequence("base", base_seq)
        .expect("Failed to index base");
    index
        .index_sequence("similar", similar_seq)
        .expect("Failed to index similar");
    index
        .index_sequence("random", &random_seq)
        .expect("Failed to index random");

    // Search with base sequence
    let results = index
        .search_similar(base_seq, 10)
        .expect("Failed to search");
    let hit_count = results.len();
    assert!(hit_count > 0, "Should find at least one result");

    // Base and similar should definitely be in top results
    let base_rank = results
        .iter()
        .position(|hit| hit.id == "base")
        .expect("Base sequence (exact match) should be found in results");
    let similar_rank = results
        .iter()
        .position(|hit| hit.id == "similar")
        .expect("Similar sequence should be found in results");

    // Base should be first (exact match has distance 0)
    assert_eq!(
        base_rank,
        0,
        "Base sequence should be the top result (exact match)"
    );

    // Similar sequence should be in top 3
    assert!(
        similar_rank < 3,
        "Similar sequence should rank in top 3, was at position {}",
        similar_rank
    );
}
|
||||
|
||||
/// For k = 11, 21 and 31 the encoded vector must match the encoder's
/// dimensionality and have unit L2 norm.
#[test]
fn test_kmer_different_k_values() {
    // Test k=11
    let encoder11 = KmerEncoder::new(11).expect("Failed to create k=11 encoder");
    let vec11 = encoder11
        .encode_sequence(b"ACGTACGTACGTACGTACGTACGTACGT")
        .expect("Failed to encode with k=11");
    assert_eq!(vec11.len(), encoder11.dimensions());

    // Test k=21
    let encoder21 = KmerEncoder::new(21).expect("Failed to create k=21 encoder");
    let vec21 = encoder21
        .encode_sequence(b"ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT")
        .expect("Failed to encode with k=21");
    assert_eq!(vec21.len(), encoder21.dimensions());

    // Test k=31
    let encoder31 = KmerEncoder::new(31).expect("Failed to create k=31 encoder");
    let vec31 = encoder31
        .encode_sequence(b"ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT")
        .expect("Failed to encode with k=31");
    assert_eq!(vec31.len(), encoder31.dimensions());

    // All should be normalized
    for (k, encoded) in [(11, &vec11), (21, &vec21), (31, &vec31)] {
        let sum_of_squares = encoded.iter().map(|x| x * x).sum::<f32>();
        let magnitude: f32 = sum_of_squares.sqrt();
        assert!(
            (magnitude - 1.0).abs() < 1e-5,
            "k={} vector should be normalized",
            k
        );
    }
}
|
||||
|
||||
/// A MinHash sketch built with `num_hashes = 100` must hold between 1 and 100
/// hashes, in non-decreasing order.
#[test]
fn test_minhash_sketch_basic() {
    let num_hashes = 100;
    let mut sketcher = MinHashSketch::new(num_hashes);

    let hashes = sketcher
        .sketch(b"ACGTACGTACGTACGTACGTACGTACGTACGT", 11)
        .expect("Failed to sketch sequence");

    let hash_count = hashes.len();
    assert!(
        hash_count <= num_hashes,
        "Sketch should have at most {} hashes, got {}",
        num_hashes,
        hash_count
    );
    assert!(hash_count > 0, "Sketch should have at least one hash");

    // Verify hashes are sorted (implementation detail)
    (1..hash_count).for_each(|next| {
        assert!(hashes[next] >= hashes[next - 1], "Hashes should be sorted");
    });
}
|
||||
|
||||
/// Two sketches of the same sequence must have a Jaccard distance below 0.01.
#[test]
fn test_minhash_jaccard_identical() {
    let sequence = b"ACGTACGTACGTACGTACGTACGTACGTACGT";

    let mut left = MinHashSketch::new(100);
    let mut right = MinHashSketch::new(100);
    left.sketch(sequence, 11)
        .expect("Failed to sketch sequence 1");
    right
        .sketch(sequence, 11)
        .expect("Failed to sketch sequence 2");

    let distance = left.jaccard_distance(&right);
    assert!(
        distance < 0.01,
        "Identical sequences should have distance close to 0, got {}",
        distance
    );
}
|
||||
|
||||
/// Sketches of an all-A and an all-C sequence must have a Jaccard distance
/// above 0.9.
#[test]
fn test_minhash_jaccard_different() {
    let all_a = b"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA";
    let all_c = b"CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC";

    let mut left = MinHashSketch::new(100);
    let mut right = MinHashSketch::new(100);
    left.sketch(all_a, 11)
        .expect("Failed to sketch sequence 1");
    right
        .sketch(all_c, 11)
        .expect("Failed to sketch sequence 2");

    let distance = left.jaccard_distance(&right);
    assert!(
        distance > 0.9,
        "Very different sequences should have distance close to 1, got {}",
        distance
    );
}
|
||||
|
||||
/// With k=11, encoding must return an error both for an empty sequence and
/// for a sequence shorter than k.
#[test]
fn test_kmer_index_empty_sequence() {
    let encoder = KmerEncoder::new(11).expect("Failed to create encoder");

    // Test empty sequence
    let empty_outcome = encoder.encode_sequence(b"");
    assert!(empty_outcome.is_err(), "Empty sequence should return error");

    // Test sequence shorter than k (k=11 but only 4 bases)
    let short_outcome = encoder.encode_sequence(b"ACGT");
    assert!(
        short_outcome.is_err(),
        "Sequence shorter than k should return error"
    );
}
|
||||
|
||||
/// A sequence containing N (unknown) bases must still encode, producing a
/// vector of the encoder's dimensionality.
#[test]
fn test_kmer_index_with_n_bases() {
    let encoder = KmerEncoder::new(11).expect("Failed to create encoder");

    // Sequence with N (unknown) bases
    let seq_with_n = b"ACGTACGTNNNACGTACGT";

    // Should still encode (the original author notes N bases are handled in
    // canonical_kmer)
    let outcome = encoder.encode_sequence(seq_with_n);
    assert!(
        outcome.is_ok(),
        "Sequence with N bases should encode successfully"
    );

    let encoded = outcome.unwrap();
    assert_eq!(
        encoded.len(),
        encoder.dimensions(),
        "Vector should have correct dimensions"
    );
}
|
||||
|
||||
// Helper function to generate pseudo-random DNA sequences.
//
// The base at position `i` is picked by hashing `(seed, i)` with a freshly
// constructed `DefaultHasher` and reducing the hash modulo 4 into
// `[A, C, G, T]`, so the same `(length, seed)` always gives the same bytes.
fn generate_random_sequence(length: usize, seed: u64) -> Vec<u8> {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    const ALPHABET: [u8; 4] = [b'A', b'C', b'G', b'T'];

    (0..length)
        .map(|position| {
            let mut state = DefaultHasher::new();
            seed.hash(&mut state);
            position.hash(&mut state);
            ALPHABET[(state.finish() % 4) as usize]
        })
        .collect()
}
|
||||
353
examples/dna/tests/pipeline_tests.rs
Normal file
353
examples/dna/tests/pipeline_tests.rs
Normal file
@@ -0,0 +1,353 @@
|
||||
//! End-to-End Integration Tests for DNA Analysis Pipeline
|
||||
//!
|
||||
//! Real data, real computation, real assertions. No mocks, no stubs.
|
||||
//! Tests the complete DNA analysis workflow from nucleotide encoding
|
||||
//! through variant calling, protein translation, epigenetics, and pharmacogenomics.
|
||||
|
||||
use ::rvdna::*;
|
||||
|
||||
// ============================================================================
|
||||
// NUCLEOTIDE & SEQUENCE TESTS
|
||||
// ============================================================================
|
||||
|
||||
/// `Nucleotide::to_u8` and `Nucleotide::from_u8` must map A, C, G, T, N to and
/// from the codes 0 through 4.
#[test]
fn test_nucleotide_encoding() {
    // Nucleotide -> code
    for (nucleotide, code) in [
        (Nucleotide::A, 0),
        (Nucleotide::C, 1),
        (Nucleotide::G, 2),
        (Nucleotide::T, 3),
        (Nucleotide::N, 4),
    ] {
        assert_eq!(nucleotide.to_u8(), code);
    }

    // Code -> nucleotide
    for (code, nucleotide) in [
        (0, Nucleotide::A),
        (1, Nucleotide::C),
        (2, Nucleotide::G),
        (3, Nucleotide::T),
        (4, Nucleotide::N),
    ] {
        assert_eq!(Nucleotide::from_u8(code).unwrap(), nucleotide);
    }
}
|
||||
|
||||
/// Checks `reverse_complement` on a palindrome (ACGT) and on two
/// non-palindromic inputs.
#[test]
fn test_dna_sequence_reverse_complement() {
    // (input, expected reverse complement)
    let cases = [
        ("ACGT", "ACGT"),
        ("AACG", "CGTT"),
        ("ATGCATGC", "GCATGCAT"),
    ];

    for (input, expected) in cases {
        let sequence = DnaSequence::from_str(input).unwrap();
        let reversed = sequence.reverse_complement();
        assert_eq!(reversed.to_string(), expected);
    }
}
|
||||
|
||||
// ============================================================================
|
||||
// VARIANT CALLING TESTS
|
||||
// ============================================================================
|
||||
|
||||
/// A pileup of 15 high-quality `G` reads against reference `A` must be called
/// as a homozygous-alternate A>G variant with quality above 20.
#[test]
fn test_variant_calling_homozygous_snp() {
    let caller = VariantCaller::new(VariantCallerConfig::default());

    let read_depth = 15;
    let pileup = PileupColumn {
        bases: vec![b'G'; read_depth],
        qualities: vec![40; read_depth],
        position: 1000,
        chromosome: 1,
    };

    let variant = caller.call_snp(&pileup, b'A').expect("Should call variant");
    assert_eq!(variant.genotype, Genotype::HomAlt);
    assert_eq!(variant.alt_allele, b'G');
    assert_eq!(variant.ref_allele, b'A');
    assert!(variant.quality > 20.0);
}
|
||||
|
||||
/// A pileup of 10 `A` reads followed by 10 `G` reads against reference `A`
/// must be called heterozygous with alt allele `G` and quality above 20.
#[test]
fn test_variant_calling_heterozygous_snp() {
    let caller = VariantCaller::new(VariantCallerConfig::default());

    // Ten reference reads followed by ten alternate reads.
    let bases = [vec![b'A'; 10], vec![b'G'; 10]].concat();
    let pileup = PileupColumn {
        bases,
        qualities: vec![40; 20],
        position: 2000,
        chromosome: 1,
    };

    let variant = caller.call_snp(&pileup, b'A').expect("Should call variant");
    assert_eq!(variant.genotype, Genotype::Het);
    assert_eq!(variant.alt_allele, b'G');
    assert!(variant.quality > 20.0);
}
|
||||
|
||||
/// For a pileup that matches the reference at every read, the caller may
/// return `None`. If it does return a call, the reference allele must be `A`
/// and the alt-allele fraction must be below 0.2.
#[test]
fn test_variant_calling_no_variant() {
    let caller = VariantCaller::new(VariantCallerConfig::default());

    let read_depth = 20;
    let pileup = PileupColumn {
        bases: vec![b'A'; read_depth],
        qualities: vec![40; read_depth],
        position: 3000,
        chromosome: 1,
    };

    // `None` is an acceptable outcome; only a returned call is inspected.
    if let Some(reported) = caller.call_snp(&pileup, b'A') {
        assert_eq!(reported.ref_allele, b'A');
        assert!((reported.allele_depth as f32 / reported.depth as f32) < 0.2);
    }
}
|
||||
|
||||
/// `filter_variants` with `min_quality = 30` and `min_depth = 10` must keep a
/// good call as `Pass`, flag a quality-25 call as `LowQuality`, and flag a
/// depth-5 call as `LowDepth`.
#[test]
fn test_variant_quality_filtering() {
    let mut thresholds = VariantCallerConfig::default();
    thresholds.min_quality = 30;
    thresholds.min_depth = 10;
    let snp_caller = VariantCaller::new(thresholds);

    // Every fixture is a heterozygous call on chromosome 1 that starts out as
    // `Pass`; only the fields passed in differ.
    let het_call = |position, ref_allele, alt_allele, quality, depth, allele_depth| VariantCall {
        chromosome: 1,
        position,
        ref_allele,
        alt_allele,
        quality,
        genotype: Genotype::Het,
        depth,
        allele_depth,
        filter_status: FilterStatus::Pass,
    };

    let mut calls = vec![
        het_call(1000, b'A', b'G', 35.0, 20, 10), // above both thresholds
        het_call(2000, b'C', b'T', 25.0, 20, 10), // quality below 30
        het_call(3000, b'G', b'A', 40.0, 5, 2),   // depth below 10
    ];

    snp_caller.filter_variants(&mut calls);

    assert_eq!(calls[0].filter_status, FilterStatus::Pass);
    assert_eq!(calls[1].filter_status, FilterStatus::LowQuality);
    assert_eq!(calls[2].filter_status, FilterStatus::LowDepth);
}
|
||||
|
||||
// ============================================================================
|
||||
// PROTEIN TRANSLATION TESTS
|
||||
// ============================================================================
|
||||
|
||||
/// Translating `ATG GCA GGT` must give exactly Met, Ala, Gly.
#[test]
fn test_protein_translation() {
    use ::rvdna::protein::{translate_dna, AminoAcid};

    let expected = [AminoAcid::Met, AminoAcid::Ala, AminoAcid::Gly];
    let translated = translate_dna(b"ATGGCAGGT");

    assert_eq!(translated.len(), 3);
    for (codon_idx, residue) in expected.iter().enumerate() {
        assert_eq!(translated[codon_idx], *residue);
    }
}
|
||||
|
||||
/// Each input is `ATG GCA` followed by one of `TAA`, `TAG`, `TGA`; the
/// translated output must have length 2 in every case.
#[test]
fn test_protein_translation_stop_codon() {
    use ::rvdna::protein::{translate_dna, AminoAcid};

    // TAA: also check the leading residue.
    let with_taa = translate_dna(b"ATGGCATAA");
    assert_eq!(with_taa.len(), 2);
    assert_eq!(with_taa[0], AminoAcid::Met);

    // TAG
    let with_tag = translate_dna(b"ATGGCATAG");
    assert_eq!(with_tag.len(), 2);

    // TGA
    let with_tga = translate_dna(b"ATGGCATGA");
    assert_eq!(with_tga.len(), 2);
}
|
||||
|
||||
/// Spot-checks `AminoAcid::hydrophobicity` against fixed expected values for
/// five residues.
#[test]
fn test_amino_acid_hydrophobicity() {
    use ::rvdna::protein::AminoAcid;

    let ile = AminoAcid::Ile.hydrophobicity();
    let arg = AminoAcid::Arg.hydrophobicity();
    let val = AminoAcid::Val.hydrophobicity();
    let lys = AminoAcid::Lys.hydrophobicity();
    let gly = AminoAcid::Gly.hydrophobicity();

    assert_eq!(ile, 4.5);
    assert_eq!(arg, -4.5);
    assert_eq!(val, 4.2);
    assert_eq!(lys, -3.9);
    assert_eq!(gly, -0.4);
}
|
||||
|
||||
// ============================================================================
|
||||
// EPIGENETICS TESTS
|
||||
// ============================================================================
|
||||
|
||||
/// A profile built from four (chromosome, position) sites must hold four
/// sites and report a mean methylation of 0.425 (within 0.001).
#[test]
fn test_methylation_profile_creation() {
    let profile = MethylationProfile::from_beta_values(
        vec![(1, 1000), (1, 2000), (2, 3000), (2, 4000)],
        vec![0.1, 0.5, 0.8, 0.3],
    );

    assert_eq!(profile.sites.len(), 4);

    // (0.1 + 0.5 + 0.8 + 0.3) / 4 = 0.425
    let deviation = (profile.mean_methylation() - 0.425).abs();
    assert!(deviation < 0.001);
}
|
||||
|
||||
/// The default Horvath clock must predict an age strictly between 0 and 150
/// for a synthetic 700-site methylation profile.
#[test]
fn test_horvath_clock_prediction() {
    // 700 sites on chromosome 1, spaced 1000 positions apart.
    let positions: Vec<(u8, u64)> = (0..700).map(|site| (1, site * 1000)).collect();

    // Beta values: first 100 sites at 0.3, next 100 at 0.7, the rest at 0.5.
    let betas: Vec<f32> = (0..700)
        .map(|site| match site {
            0..=99 => 0.3,
            100..=199 => 0.7,
            _ => 0.5,
        })
        .collect();

    let profile = MethylationProfile::from_beta_values(positions, betas);
    let clock = HorvathClock::default_clock();
    let predicted_age = clock.predict_age(&profile);

    assert!(predicted_age > 0.0);
    assert!(predicted_age < 150.0);
}
|
||||
|
||||
// ============================================================================
|
||||
// PHARMACOGENOMICS TESTS
|
||||
// ============================================================================
|
||||
|
||||
/// `call_star_allele` must return `Star1` for no variants, `Star4` for the
/// G>A variant at 42130692, and `Star5` for the `T` to `-` variant at 42126611.
#[test]
fn test_pharma_star_allele_calling() {
    let no_variants = call_star_allele(&[]);
    assert_eq!(no_variants, StarAllele::Star1);

    let g_to_a = call_star_allele(&[(42130692, b'G', b'A')]);
    assert_eq!(g_to_a, StarAllele::Star4);

    let t_removed = call_star_allele(&[(42126611, b'T', b'-')]);
    assert_eq!(t_removed, StarAllele::Star5);
}
|
||||
|
||||
/// `predict_phenotype` for three diplotypes: *1/*1 and *1/*4 are expected to
/// be `Normal`, *4/*4 is expected to be `Poor`.
#[test]
fn test_pharma_metabolizer_phenotype() {
    // (first allele, second allele, expected phenotype)
    let diplotypes = [
        (StarAllele::Star1, StarAllele::Star1, MetabolizerPhenotype::Normal),
        (StarAllele::Star1, StarAllele::Star4, MetabolizerPhenotype::Normal),
        (StarAllele::Star4, StarAllele::Star4, MetabolizerPhenotype::Poor),
    ];

    for (first, second, expected) in diplotypes.iter() {
        assert_eq!(predict_phenotype(first, second), *expected);
    }
}
|
||||
|
||||
// ============================================================================
|
||||
// ALIGNMENT TESTS
|
||||
// ============================================================================
|
||||
|
||||
/// Aligning `ACGT` against itself with the default configuration must score 8.
#[test]
fn test_smith_waterman_alignment() {
    let reference = DnaSequence::from_str("ACGT").unwrap();
    let query = DnaSequence::from_str("ACGT").unwrap();

    let outcome = SmithWaterman::new(AlignmentConfig::default())
        .align(&query, &reference)
        .unwrap();

    // 4 matches * 2 points each
    assert_eq!(outcome.score, 8);
}
|
||||
|
||||
/// Attention-based alignment of an 8-base query against a reference that
/// contains it, flanked by `T` runs, must produce a positive score.
#[test]
fn test_attention_alignment() {
    let reference = DnaSequence::from_str("TTTTATCGATCGTTTT").unwrap();
    let query = DnaSequence::from_str("ATCGATCG").unwrap();

    let score = query.align_with_attention(&reference).unwrap().score;
    assert!(score > 0);
}
|
||||
|
||||
// ============================================================================
|
||||
// FULL PIPELINE INTEGRATION
|
||||
// ============================================================================
|
||||
|
||||
/// Pins the values of `AnalysisConfig::default()`: k-mer size 11, 512 vector
/// dimensions, minimum quality 20, and no extra parameters.
#[test]
fn test_pipeline_config_defaults() {
    let defaults = AnalysisConfig::default();

    assert_eq!(defaults.kmer_size, 11);
    assert_eq!(defaults.vector_dims, 512);
    assert_eq!(defaults.min_quality, 20);
    assert!(defaults.parameters.is_empty());
}
|
||||
|
||||
/// End-to-end smoke test that runs every pipeline stage once: sequence
/// handling, k-mer vectorisation, variant calling, translation, methylation
/// age prediction, pharmacogenomics, alignment and protein contact prediction.
#[test]
fn test_full_pipeline_runs() {
    // Stage 1: build a sequence and take its reverse complement.
    let genome = DnaSequence::from_str("ATGCGATCGATCGATCGATCGTAGCTAGCTAGC").unwrap();
    let mirrored = genome.reverse_complement();
    assert_eq!(mirrored.len(), genome.len());

    // Stage 2: k-mer embedding has the requested dimensionality.
    let embedding = genome.to_kmer_vector(11, 512).unwrap();
    assert_eq!(embedding.len(), 512);

    // Stage 3: a pileup of 2 reference and 8 alternate reads yields a call.
    let snp_caller = VariantCaller::new(VariantCallerConfig::default());
    let column = PileupColumn {
        bases: [&[b'A'; 2][..], &[b'G'; 8][..]].concat(),
        qualities: vec![40; 10],
        position: 1000,
        chromosome: 1,
    };
    assert!(snp_caller.call_snp(&column, b'A').is_some());

    // Stage 4: translation produces at least one residue.
    let residues = translate_dna(b"ATGGCAGGTAAACCC");
    assert!(!residues.is_empty());

    // Stage 5: methylation profile fed into the default Horvath clock.
    let methylation = MethylationProfile::from_beta_values(
        vec![(1, 1000), (1, 2000), (1, 3000)],
        vec![0.3, 0.5, 0.7],
    );
    let predicted_age = HorvathClock::default_clock().predict_age(&methylation);
    assert!(predicted_age > 0.0);

    // Stage 6: star-allele call and phenotype prediction.
    let called_allele = call_star_allele(&[(42130692, b'G', b'A')]);
    assert_eq!(called_allele, StarAllele::Star4);
    let phenotype = predict_phenotype(&called_allele, &StarAllele::Star1);
    assert_eq!(phenotype, MetabolizerPhenotype::Normal);

    // Stage 7: attention alignment of the sequence against its reverse complement.
    let aligned = genome.align_with_attention(&mirrored).unwrap();
    assert!(aligned.score > 0);

    // Stage 8: contact graph (threshold 8.0) and contact prediction for a
    // 12-residue protein.
    let peptide = ProteinSequence::new(vec![
        ProteinResidue::A,
        ProteinResidue::V,
        ProteinResidue::L,
        ProteinResidue::I,
        ProteinResidue::F,
        ProteinResidue::G,
        ProteinResidue::K,
        ProteinResidue::D,
        ProteinResidue::E,
        ProteinResidue::R,
        ProteinResidue::M,
        ProteinResidue::N,
    ]);
    let contact_graph = peptide.build_contact_graph(8.0).unwrap();
    let predicted = peptide.predict_contacts(&contact_graph).unwrap();
    assert!(!predicted.is_empty());
}
|
||||
191
examples/dna/tests/security_tests.rs
Normal file
191
examples/dna/tests/security_tests.rs
Normal file
@@ -0,0 +1,191 @@
|
||||
//! Security validation tests for DNA analyzer - NO MOCKS, real computation only
|
||||
use ::rvdna::error::DnaError;
|
||||
use ::rvdna::types::*;
|
||||
use ::rvdna::VectorEntry;
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::thread;
|
||||
|
||||
/// Builds a 10-million-base sequence and checks that construction, reverse
/// complement and k-mer vectorisation all complete with the expected lengths.
#[test]
fn test_buffer_overflow_protection() {
    let base_count = 10_000_000;

    // Repeating A, C, G, T pattern.
    let pattern: Vec<Nucleotide> = (0..base_count)
        .map(|idx| {
            let phase = idx % 4;
            if phase == 0 {
                Nucleotide::A
            } else if phase == 1 {
                Nucleotide::C
            } else if phase == 2 {
                Nucleotide::G
            } else {
                Nucleotide::T
            }
        })
        .collect();

    let huge = DnaSequence::new(pattern);
    assert_eq!(huge.len(), base_count);

    let mirrored = huge.reverse_complement();
    assert_eq!(mirrored.len(), base_count);

    let embedding = huge.to_kmer_vector(11, 512);
    assert!(embedding.is_ok());
}
|
||||
|
||||
/// Inputs containing characters outside ACGTN must fail with
/// `DnaError::InvalidSequence`; `ACGTN` in either case must parse.
#[test]
fn test_invalid_base_handling() {
    for rejected in ["ACGTX", "ACGT123", "ACGT!@#"] {
        let parsed = DnaSequence::from_str(rejected);
        assert!(parsed.is_err());
        assert!(matches!(parsed, Err(DnaError::InvalidSequence(_))));
    }

    let upper = DnaSequence::from_str("ACGTN");
    assert!(upper.is_ok());
    let lower = DnaSequence::from_str("acgtn");
    assert!(lower.is_ok());
}
|
||||
|
||||
/// Inserts the same k-mer vector under several string IDs and checks that
/// every insert succeeds.
///
/// NOTE(review): the test name refers to Unicode, but the IDs listed here are
/// plain ASCII — confirm whether non-ASCII IDs were intended.
#[test]
fn test_unicode_injection() {
    let embedding = DnaSequence::from_str("ACGTACGT")
        .unwrap()
        .to_kmer_vector(3, 128)
        .unwrap();

    // Per-process scratch directory; creation and removal are best-effort.
    let scratch = std::env::temp_dir().join(format!("dna_test_{}", std::process::id()));
    let _ = std::fs::create_dir_all(&scratch);
    let index = KmerIndex::new(3, 128, scratch.join("unicode").to_str().unwrap()).unwrap();

    for label in ["seq_cafe_dna", "patient123", "seq_hidden"] {
        let inserted = index.db().insert(VectorEntry {
            id: Some(label.to_string()),
            vector: embedding.clone(),
            metadata: None,
        });
        assert!(inserted.is_ok());
    }

    let _ = std::fs::remove_dir_all(&scratch);
}
|
||||
|
||||
/// `KmerIndex::new` must not panic when given paths containing `..`
/// components; returning either `Ok` or `Err` is acceptable.
///
/// NOTE(review): the `..` components are joined without normalisation, so the
/// path handed to `KmerIndex::new` may resolve outside the scratch directory
/// (the second entry appears to target `/etc/passwd` on Unix) — confirm this
/// is safe to run with elevated privileges.
#[test]
fn test_path_traversal_prevention() {
    let scratch = std::env::temp_dir().join(format!("dna_path_{}", std::process::id()));
    let _ = std::fs::create_dir_all(&scratch);

    for hostile in ["../../../tmp/evil", "../../etc/passwd"] {
        let candidate = scratch.join(hostile);
        // Only a panic counts as failure; the inner Result is ignored.
        let outcome = std::panic::catch_unwind(|| {
            KmerIndex::new(3, 128, candidate.to_str().unwrap())
        });
        assert!(outcome.is_ok(), "Path traversal should not cause panic");
    }

    // Best-effort cleanup of the scratch directory and of an `evil` directory
    // directly under the system temp dir.
    // NOTE(review): presumably this matches where the first path resolves
    // when the temp dir is `/tmp` — verify on other platforms.
    let _ = std::fs::remove_dir_all(&scratch);
    let stray = std::env::temp_dir().join("evil");
    let _ = std::fs::remove_dir_all(stray);
}
|
||||
|
||||
/// `to_kmer_vector` must reject k = 64 with `DnaError::InvalidKmerSize(64)`
/// and reject k = 0, while accepting k = 11 and k = 15.
#[test]
fn test_integer_overflow_kmer() {
    let sample = DnaSequence::from_str("ACGTACGTACGTACGT").unwrap();

    let oversized = sample.to_kmer_vector(64, 512).unwrap_err();
    assert!(matches!(oversized, DnaError::InvalidKmerSize(64)));

    let zero_k = sample.to_kmer_vector(0, 512);
    assert!(zero_k.is_err());

    for accepted_k in [11, 15] {
        assert!(sample.to_kmer_vector(accepted_k, 512).is_ok());
    }
}
|
||||
|
||||
/// Parsing `""` must fail with `DnaError::EmptySequence`, and a sequence
/// built from an empty vector must behave as empty under every transformation.
#[test]
fn test_empty_input_safety() {
    let parse_failure = DnaSequence::from_str("").unwrap_err();
    assert!(matches!(parse_failure, DnaError::EmptySequence));

    let blank = DnaSequence::new(vec![]);
    assert!(blank.is_empty());
    assert_eq!(blank.len(), 0);

    let complemented = blank.complement();
    assert!(complemented.is_empty());

    let mirrored = blank.reverse_complement();
    assert!(mirrored.is_empty());

    assert_eq!(blank.to_string(), "");
}
|
||||
|
||||
/// A sequence string containing an embedded NUL byte must fail to parse.
#[test]
fn test_null_byte_handling() {
    let parsed = DnaSequence::from_str("ACGT\0");
    assert!(parsed.is_err());
}
|
||||
|
||||
/// Ten threads each insert one entry through a mutex-guarded `KmerIndex`;
/// every thread must finish without panicking.
#[test]
fn test_concurrent_access_safety() {
    // Per-process scratch directory; creation and removal are best-effort.
    let scratch = std::env::temp_dir().join(format!("dna_conc_{}", std::process::id()));
    let _ = std::fs::create_dir_all(&scratch);

    let kmer_index = KmerIndex::new(3, 128, scratch.join("idx").to_str().unwrap()).unwrap();
    let shared_index = Arc::new(Mutex::new(kmer_index));

    // Spawn every worker before joining any of them.
    let mut workers = Vec::with_capacity(10);
    for worker_id in 0..10 {
        let local_index = Arc::clone(&shared_index);
        workers.push(thread::spawn(move || {
            // Build the entry before taking the lock.
            let sample = DnaSequence::from_str("ACGTACGTACGT").unwrap();
            let entry = VectorEntry {
                id: Some(format!("seq_{}", worker_id)),
                vector: sample.to_kmer_vector(3, 128).unwrap(),
                metadata: None,
            };
            local_index.lock().unwrap().db().insert(entry).unwrap();
        }));
    }

    for worker in workers {
        // A panic inside the worker surfaces here as `Err`.
        assert!(worker.join().is_ok());
    }

    let _ = std::fs::remove_dir_all(&scratch);
}
|
||||
|
||||
/// Phred quality scores above 93 must be rejected and 0..=93 accepted.
///
/// Checks both sides of the boundary (93 accepted, 94 rejected), a clearly
/// out-of-range value (100, which must report `DnaError::InvalidQuality(100)`),
/// and the conversion to error probability for Q30 and Q0.
#[test]
fn test_quality_score_bounds() {
    // Clearly out of range: the error must carry the offending value.
    assert!(matches!(
        QualityScore::new(100).unwrap_err(),
        DnaError::InvalidQuality(100)
    ));

    // Boundary: 93 is the largest accepted score, so 94 is the smallest
    // rejected one.
    assert!(QualityScore::new(0).is_ok());
    assert!(QualityScore::new(93).is_ok());
    assert!(QualityScore::new(94).is_err());

    // Q30 corresponds to an error probability of 10^-3, Q0 to roughly 1.
    assert!((QualityScore::new(30).unwrap().to_error_probability() - 0.001).abs() < 1e-6);
    assert!((QualityScore::new(0).unwrap().to_error_probability() - 1.0).abs() < 0.01);
}
|
||||
|
||||
/// A `GenomicPosition` can be built with `u64::MAX` as its position and
/// reads that value back unchanged.
#[test]
fn test_variant_position_overflow() {
    let largest = u64::MAX;

    let locus = GenomicPosition {
        chromosome: 25,
        position: largest,
        reference_allele: Nucleotide::A,
        alternate_allele: Some(Nucleotide::G),
    };

    assert_eq!(locus.position, u64::MAX);
}
|
||||
|
||||
/// Clamping candidate beta values to `[0, 1]` always lands inside that range.
///
/// This only exercises `f32::clamp`; it does not call into the methylation
/// code.
#[test]
fn test_methylation_bounds() {
    let candidates = [-0.5f32, 0.0, 0.5, 1.0, 1.5];

    for raw in candidates.iter() {
        let bounded = raw.clamp(0.0, 1.0);
        assert!((0.0..=1.0).contains(&bounded));
    }
}
|
||||
|
||||
/// Running the same operation twice on the same sequence must give equal
/// results for k-mer vectors, reverse complement, complement and display.
#[test]
fn test_deterministic_output() {
    let sample = DnaSequence::from_str("ACGTACGTACGTACGT").unwrap();

    let first_vector = sample.to_kmer_vector(11, 512).unwrap();
    let second_vector = sample.to_kmer_vector(11, 512).unwrap();
    assert_eq!(first_vector, second_vector);

    let first_revcomp = sample.reverse_complement().to_string();
    let second_revcomp = sample.reverse_complement().to_string();
    assert_eq!(first_revcomp, second_revcomp);

    let first_complement = sample.complement().to_string();
    let second_complement = sample.complement().to_string();
    assert_eq!(first_complement, second_complement);

    let first_text = sample.to_string();
    let second_text = sample.to_string();
    assert_eq!(first_text, second_text);
}
|
||||
Reference in New Issue
Block a user