Squashed 'vendor/ruvector/' content from commit b64c2172

git-subtree-dir: vendor/ruvector git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
2026-02-28 14:39:40 -05:00
commit d803bfe2b1
7854 changed files with 3522914 additions and 0 deletions
--- a/examples/dna/tests/biomarker_tests.rs
+++ b/examples/dna/tests/biomarker_tests.rs
@@ -0,0 +1,409 @@
+//! Integration tests for the biomarker analysis engine.
+//!
+//! Tests composite risk scoring, profile vector encoding, clinical biomarker
+//! references, synthetic population generation, and streaming biomarker
+//! processing with anomaly and trend detection.
+
+use rvdna::biomarker::*;
+use rvdna::biomarker_stream::*;
+use std::collections::HashMap;
+
+// ============================================================================
+// COMPOSITE RISK SCORING TESTS
+// ============================================================================
+
+/// An all-reference (low risk) genotype panel must yield a global score below 0.3.
+#[test]
+fn test_compute_risk_scores_baseline() {
+    let reference_panel = [
+        ("rs429358", "TT"), // APOE ref
+        ("rs7412", "CC"), // APOE ref
+        ("rs4680", "GG"), // COMT ref
+        ("rs1799971", "AA"), // OPRM1 ref
+        ("rs762551", "AA"), // CYP1A2 fast
+        ("rs1801133", "GG"), // MTHFR ref
+        ("rs1801131", "TT"), // MTHFR ref
+        ("rs1042522", "CC"), // TP53 ref
+        ("rs80357906", "DD"), // BRCA1 ref
+        ("rs4363657", "TT"), // SLCO1B1 ref
+    ];
+    let gts: HashMap<String, String> =
+        reference_panel.iter().map(|&(rsid, gt)| (rsid.to_string(), gt.to_string())).collect();
+
+    let profile = compute_risk_scores(&gts);
+    let global = profile.global_risk_score;
+    assert!(global < 0.3, "Baseline should be low risk, got {global}");
+    assert!(!profile.category_scores.is_empty());
+}
+
+/// A panel of high-risk genotype combinations must push the global score above 0.4.
+#[test]
+fn test_compute_risk_scores_high_risk() {
+    let risk_panel = [
+        ("rs429358", "CC"), // APOE e4/e4
+        ("rs7412", "CC"),
+        ("rs4680", "AA"), // COMT Met/Met
+        ("rs1799971", "GG"), // OPRM1 Asp/Asp
+        ("rs1801133", "AA"), // MTHFR 677TT
+        ("rs1801131", "GG"), // MTHFR 1298CC
+        ("rs4363657", "CC"), // SLCO1B1 hom variant
+    ];
+    let gts: HashMap<String, String> =
+        risk_panel.iter().map(|&(rsid, gt)| (rsid.to_string(), gt.to_string())).collect();
+
+    let profile = compute_risk_scores(&gts);
+    let global = profile.global_risk_score;
+    assert!(global > 0.4, "High-risk should score >0.4, got {global}");
+}
+
+// ============================================================================
+// PROFILE VECTOR TESTS
+// ============================================================================
+
+/// The profile vector must be exactly 64 entries wide, even when the genotype map
+/// passed to `compute_risk_scores` is empty.
+#[test]
+fn test_profile_vector_dimension() {
+    // No genotypes at all: the vector width must not depend on the input.
+    let no_genotypes = HashMap::new();
+    let dims = compute_risk_scores(&no_genotypes).profile_vector.len();
+
+    assert_eq!(dims, 64, "Profile vector must be exactly 64 dimensions");
+}
+
+/// The profile vector for a two-SNP heterozygous panel must be L2-normalized.
+///
+/// A magnitude of exactly zero is also accepted.
+#[test]
+fn test_profile_vector_normalized() {
+    let gts: HashMap<String, String> = HashMap::from([
+        ("rs429358".to_string(), "CT".to_string()),
+        ("rs4680".to_string(), "AG".to_string()),
+    ]);
+    let profile = compute_risk_scores(&gts);
+
+    // Euclidean norm: sum the squared components, then take the root.
+    let squares = profile.profile_vector.iter().map(|component| component * component);
+    let mag = squares.sum::<f32>().sqrt();
+
+    let acceptable = mag == 0.0 || (mag - 1.0).abs() < 0.01;
+    assert!(acceptable, "Vector should be L2-normalized, got magnitude {mag}");
+}
+
+// ============================================================================
+// BIOMARKER REFERENCE TESTS
+// ============================================================================
+
+/// The biomarker reference table must contain at least 13 entries.
+#[test]
+fn test_biomarker_references_exist() {
+    let available = biomarker_references().len();
+    assert!(
+        available >= 13,
+        "Should have at least 13 biomarker references, got {available}"
+    );
+}
+
+/// Z-scores are computed against the "Total Cholesterol" reference entry:
+/// a value of 180.0 must stay within two units of zero, and a value of 300.0
+/// must come out positive.
+#[test]
+fn test_z_score_computation() {
+    let refs = biomarker_references();
+    let cholesterol_ref = refs
+        .iter()
+        .find(|candidate| candidate.name == "Total Cholesterol")
+        .unwrap();
+
+    // Normal value should have |z| < 2
+    let z_normal = z_score(180.0, cholesterol_ref);
+    let within_two = z_normal.abs() < 2.0;
+    assert!(within_two, "Normal cholesterol z-score should be small: {z_normal}");
+
+    // High value should have z > 0
+    let z_high = z_score(300.0, cholesterol_ref);
+    let is_positive = z_high > 0.0;
+    assert!(is_positive, "High cholesterol should have positive z-score: {z_high}");
+}
+
+/// "Fasting Glucose" readings of 85.0 and 200.0 must be classified differently.
+#[test]
+fn test_biomarker_classification() {
+    let refs = biomarker_references();
+    let glucose_ref = refs.iter().find(|entry| entry.name == "Fasting Glucose").unwrap();
+    // 85.0 is expected in the normal range and 200.0 in a high/critical one; the
+    // two classifications are compared through their `Debug` rendering.
+    let normal_repr = format!("{:?}", classify_biomarker(85.0, glucose_ref));
+    let high_repr = format!("{:?}", classify_biomarker(200.0, glucose_ref));
+    assert_ne!(normal_repr, high_repr);
+}
+
+// ============================================================================
+// SYNTHETIC POPULATION TESTS
+// ============================================================================
+
+/// A 100-member synthetic population (seed 42) must have 64-entry profile vectors
+/// and global risk scores that span a range wider than 0.1.
+#[test]
+fn test_synthetic_population() {
+    let pop = generate_synthetic_population(100, 42);
+    assert_eq!(pop.len(), 100);
+
+    // Single pass: check each vector width while tracking the score extremes.
+    let (mut min, mut max) = (f64::INFINITY, f64::NEG_INFINITY);
+    for profile in &pop {
+        assert_eq!(profile.profile_vector.len(), 64);
+        min = min.min(profile.global_risk_score);
+        max = max.max(profile.global_risk_score);
+    }
+
+    let spread = max - min;
+    assert!(
+        spread > 0.1,
+        "Population should have risk score variance, range: {min:.3}..{max:.3}"
+    );
+}
+
+#[test]
+fn test_synthetic_population_deterministic() {
+    let generate = || generate_synthetic_population(50, 42); // same size and seed each call
+    let (pop1, pop2) = (generate(), generate());
+    assert_eq!(pop1.len(), pop2.len());
+    for (a, b) in pop1.iter().zip(pop2.iter()) {
+        assert!((a.global_risk_score - b.global_risk_score).abs() < 1e-10);
+    }
+}
+
+// ============================================================================
+// STREAMING TESTS
+// ============================================================================
+
+/// A buffer created with `new(5)` and given three values keeps all of them in order.
+#[test]
+fn test_ring_buffer_basic() {
+    let mut ring: RingBuffer<f64> = RingBuffer::new(5);
+    for sample in [0.0, 1.0, 2.0] {
+        ring.push(sample);
+    }
+    assert_eq!(ring.len(), 3);
+    assert_eq!(ring.iter().copied().collect::<Vec<f64>>(), vec![0.0, 1.0, 2.0]);
+}
+
+/// A buffer created with `new(3)` and given five values keeps only the last three.
+#[test]
+fn test_ring_buffer_overflow() {
+    let mut ring: RingBuffer<f64> = RingBuffer::new(3);
+    for sample in [0.0, 1.0, 2.0, 3.0, 4.0] {
+        ring.push(sample);
+    }
+    assert_eq!(ring.len(), 3);
+    assert_eq!(ring.iter().copied().collect::<Vec<f64>>(), vec![2.0, 3.0, 4.0]);
+}
+
+/// Generating 1000 steps with the default stream config must produce
+/// `1000 * num_biomarkers` readings, and every reading value must be strictly
+/// positive.
+#[test]
+fn test_stream_generation() {
+    let config = StreamConfig::default();
+    let readings = generate_readings(&config, 1000, 42);
+
+    // generate_readings produces count * num_biomarkers total readings
+    let expected_total = 1000 * config.num_biomarkers;
+    assert_eq!(readings.len(), expected_total);
+
+    // All values should be positive
+    for reading in &readings {
+        let (id, value) = (&reading.biomarker_id, reading.value);
+        assert!(value > 0.0, "Biomarker values should be positive: {id} = {value}");
+    }
+}
+
+/// Feeding 500 generated steps through a processor must count every reading and
+/// keep the reported anomaly rate below 0.2.
+#[test]
+fn test_stream_processor() {
+    let config = StreamConfig::default();
+    // Read the biomarker count first: `config` is passed by value to the processor.
+    let expected_total = 500 * config.num_biomarkers as u64;
+    let readings = generate_readings(&config, 500, 42);
+
+    let mut processor = StreamProcessor::new(config);
+    for reading in &readings {
+        processor.process_reading(reading);
+    }
+
+    let summary = processor.summary();
+    assert_eq!(summary.total_readings, expected_total);
+    let rate = summary.anomaly_rate;
+    assert!(rate < 0.2, "Anomaly rate should be reasonable: {rate}");
+}
+
+/// With `anomaly_probability` set to zero, a single-biomarker stream of 200 steps
+/// must report an anomaly rate below 0.1.
+#[test]
+fn test_anomaly_detection() {
+    let config = StreamConfig {
+        num_biomarkers: 1,
+        anomaly_probability: 0.0, // No random anomalies
+        ..StreamConfig::default()
+    };
+
+    let readings = generate_readings(&config, 200, 42);
+
+    // Replay the generated stream through a fresh processor.
+    let mut processor = StreamProcessor::new(config);
+    for reading in &readings {
+        processor.process_reading(reading);
+    }
+
+    // With no anomaly injection, anomaly rate should be very low
+    let summary = processor.summary();
+    let rate = summary.anomaly_rate;
+    assert!(rate < 0.1, "Without injection, anomaly rate should be low: {rate}");
+}
+
+// ============================================================================
+// GENE-GENE INTERACTION TESTS
+// ============================================================================
+
+/// Adding homozygous MTHFR A1298C on top of COMT Met/Met must raise the
+/// "Neurological" category score compared with COMT Met/Met alone.
+#[test]
+fn test_mthfr_comt_interaction() {
+    let neuro_score = |panel: &[(&str, &str)]| {
+        let gts: HashMap<String, String> =
+            panel.iter().map(|&(rsid, gt)| (rsid.to_string(), gt.to_string())).collect();
+        let profile = compute_risk_scores(&gts);
+        profile.category_scores.get("Neurological").unwrap().score
+    };
+
+    // A1298C hom_alt + COMT Met/Met, versus COMT Met/Met only
+    let n_both = neuro_score(&[("rs1801131", "GG"), ("rs4680", "AA")]);
+    let n_one = neuro_score(&[("rs4680", "AA")]);
+    assert!(
+        n_both > n_one,
+        "MTHFR×COMT interaction should amplify: {n_both} > {n_one}"
+    );
+}
+
+/// Adding COMT Met/Met on top of homozygous DRD2 Taq1A must raise the
+/// "Neurological" category score compared with the DRD2 genotype alone.
+#[test]
+fn test_drd2_comt_interaction() {
+    let neuro_score = |panel: &[(&str, &str)]| {
+        let gts: HashMap<String, String> =
+            panel.iter().map(|&(rsid, gt)| (rsid.to_string(), gt.to_string())).collect();
+        let profile = compute_risk_scores(&gts);
+        profile.category_scores.get("Neurological").unwrap().score
+    };
+
+    // DRD2 hom_alt + COMT Met/Met, versus DRD2 only
+    let n_with = neuro_score(&[("rs1800497", "AA"), ("rs4680", "AA")]);
+    let n_without = neuro_score(&[("rs1800497", "AA")]);
+    assert!(
+        n_with > n_without,
+        "DRD2×COMT interaction should amplify: {n_with} > {n_without}"
+    );
+}
+
+// ============================================================================
+// GENE-BIOMARKER CORRELATION TESTS
+// ============================================================================
+
+/// Compares mean HDL between profiles whose "Neurological" score exceeds 0.3 and
+/// the remaining profiles of a 300-member synthetic population (seed 88).
+///
+/// NOTE(review): nothing is asserted when either group is empty — confirm intended.
+#[test]
+fn test_apoe_lowers_hdl_in_population() {
+    let mean = |values: &[f64]| values.iter().sum::<f64>() / values.len() as f64;
+
+    let pop = generate_synthetic_population(300, 88);
+    let (mut apoe_hdl, mut ref_hdl): (Vec<f64>, Vec<f64>) = (Vec::new(), Vec::new());
+    for p in &pop {
+        // APOE carriers have elevated neurological scores from rs429358
+        let neuro = p.category_scores.get("Neurological").map_or(0.0, |c| c.score);
+        let hdl = p.biomarker_values.get("HDL").copied().unwrap_or(0.0);
+        let group = if neuro > 0.3 { &mut apoe_hdl } else { &mut ref_hdl };
+        group.push(hdl);
+    }
+
+    if apoe_hdl.is_empty() || ref_hdl.is_empty() {
+        return;
+    }
+    let (avg_apoe, avg_ref) = (mean(&apoe_hdl), mean(&ref_hdl));
+    assert!(
+        avg_apoe < avg_ref,
+        "APOE e4 should lower HDL: {avg_apoe} < {avg_ref}"
+    );
+}
+
+/// Streams 30 baseline glucose readings (85.0) followed by 20 shifted readings
+/// (120.0) through a processor with `window_size: 20`, then checks that the
+/// reported mean for "glucose" ends above 90.0.
+///
+/// NOTE(review): only the mean is asserted; the CUSUM changepoint flag itself is
+/// never inspected by this test.
+#[test]
+fn test_cusum_changepoint_detection() {
+    let mut p = StreamProcessor::new(StreamConfig {
+        window_size: 20,
+        ..Default::default()
+    });
+
+    // Phase table: (first step, one-past-last step, reading value).
+    // Steps 0..30 establish the baseline; 30..50 inject a sustained shift.
+    let phases = [(0, 30, 85.0), (30, 50, 120.0)];
+    for (start, end, value) in phases {
+        for i in start..end {
+            p.process_reading(&BiomarkerReading {
+                timestamp_ms: i * 1000,
+                biomarker_id: "glucose".into(),
+                value,
+                reference_low: 70.0,
+                reference_high: 100.0,
+                is_anomaly: false,
+                z_score: 0.0,
+            });
+        }
+    }
+
+    let stats = p.get_stats("glucose").unwrap();
+    // After sustained shift, CUSUM should have triggered at least once
+    // (changepoint_detected resets after trigger, but the sustained shift
+    // will keep re-triggering, so the final state may or may not be true)
+    let mean = stats.mean;
+    assert!(
+        mean > 90.0,
+        "Mean should shift upward after changepoint: {mean}"
+    );
+}
+
+/// A single-biomarker stream generated with `drift_rate: 0.5` must yield a
+/// positive trend slope for every entry in the summary's biomarker stats.
+///
+/// NOTE(review): the loop asserts nothing if `biomarker_stats` comes back empty.
+#[test]
+fn test_trend_detection() {
+    let config = StreamConfig {
+        window_size: 50,
+        num_biomarkers: 1,
+        anomaly_probability: 0.0,
+        drift_rate: 0.5, // Strong upward drift
+        ..StreamConfig::default()
+    };
+    let readings = generate_readings(&config, 200, 42);
+
+    let mut processor = StreamProcessor::new(config);
+    for reading in &readings {
+        processor.process_reading(reading);
+    }
+
+    // Should detect positive trend
+    let summary = processor.summary();
+    for (_, stats) in &summary.biomarker_stats {
+        let slope = stats.trend_slope;
+        assert!(slope > 0.0, "Should detect upward trend, got slope: {slope}");
+    }
+}
--- a/examples/dna/tests/kmer_tests.rs
+++ b/examples/dna/tests/kmer_tests.rs
@@ -0,0 +1,403 @@
+//! Integration tests for k-mer indexing module
+//!
+//! These tests use real VectorDB instances to validate k-mer encoding,
+//! indexing, and similarity search functionality.
+
+use ::rvdna::kmer::{canonical_kmer, KmerEncoder, KmerIndex, MinHashSketch};
+use tempfile::TempDir;
+
+/// Creates a scratch `TempDir` for a test (auto-removed on drop); panics if creation fails.
+fn create_test_db() -> TempDir {
+    TempDir::new().expect("Failed to create temp directory")
+}
+
+/// Encoding an 8-base sequence with k=4 must give a vector whose length equals
+/// the encoder's dimension count, whose L2 norm is 1 (within 1e-5), and which
+/// has at least one non-zero element.
+#[test]
+fn test_kmer_encoding_basic() {
+    let encoder = KmerEncoder::new(4).expect("Failed to create encoder");
+
+    let vector = encoder
+        .encode_sequence(b"ACGTACGT")
+        .expect("Failed to encode sequence");
+
+    // Verify vector has correct dimensions
+    let (actual_dims, expected_dims) = (vector.len(), encoder.dimensions());
+    assert_eq!(
+        actual_dims, expected_dims,
+        "Vector dimensions should match encoder dimensions"
+    );
+
+    // Verify L2 normalization
+    let squared_norm: f32 = vector.iter().map(|x| x * x).sum();
+    let magnitude = squared_norm.sqrt();
+    let is_unit_length = (magnitude - 1.0).abs() < 1e-5;
+    assert!(is_unit_length, "Vector should be L2 normalized, got magnitude: {magnitude}");
+
+    // Verify non-zero elements exist (sequence has k-mers)
+    let has_signal = vector.iter().any(|&x| x != 0.0);
+    assert!(has_signal, "Vector should have non-zero elements");
+}
+
+/// Encoding the same sequence twice with one encoder must give vectors of equal
+/// length whose elements match to within 1e-6.
+#[test]
+fn test_kmer_encoding_deterministic() {
+    let encoder = KmerEncoder::new(11).expect("Failed to create encoder");
+    let sequence = b"ACGTACGTACGTACGTACGT";
+
+    let vector1 = encoder
+        .encode_sequence(sequence)
+        .expect("Failed to encode sequence first time");
+    let vector2 = encoder
+        .encode_sequence(sequence)
+        .expect("Failed to encode sequence second time");
+
+    // Verify same sequence produces identical vectors
+    assert_eq!(
+        vector1.len(),
+        vector2.len(),
+        "Vectors should have same length"
+    );
+
+    let pairs = vector1.iter().zip(vector2.iter());
+    for (i, (v1, v2)) in pairs.enumerate() {
+        assert!(
+            (v1 - v2).abs() < 1e-6,
+            "Vector element {i} should be identical: {v1} vs {v2}"
+        );
+    }
+}
+
+/// A k-mer and its reverse complement must map to the same canonical form.
+///
+/// The palindrome check feeds identical inputs and AAAA/TTTT is unchanged by
+/// reversal, so the mixed-base pair AACG/CGTT is what exercises the reversal step.
+#[test]
+fn test_kmer_complement_symmetry() {
+    // ACGT is its own reverse complement (palindrome)
+    assert_eq!(
+        canonical_kmer(b"ACGT"),
+        canonical_kmer(b"ACGT"),
+        "Canonical k-mers should be equal"
+    );
+
+    // Non-palindromes: (k-mer, reverse complement)
+    for (kmer, revcomp) in [(b"AAAA", b"TTTT"), (b"AACG", b"CGTT")] {
+        assert_eq!(
+            canonical_kmer(kmer),
+            canonical_kmer(revcomp),
+            "Canonical k-mer should be same for sequence and revcomp"
+        );
+    }
+}
+
+/// Indexes three sequences and checks that querying with the first one returns it
+/// as the top hit with a distance below 0.01.
+///
+/// NOTE(review): the relative-ranking check at the end is skipped when "seq2" or
+/// "seq3" is missing from the results — confirm that is intended.
+#[test]
+fn test_kmer_index_insert_and_search() {
+    let _temp_dir = create_test_db();
+
+    // Create index with k=11
+    let encoder = KmerEncoder::new(11).expect("Failed to create encoder");
+    let index = KmerIndex::new(11, encoder.dimensions()).expect("Failed to create index");
+
+    // Insert 3 sequences
+    let seq1 = b"ACGTACGTACGTACGTACGT";
+    let seq2 = b"ACGTACGTACGTACGTACGG"; // Similar to seq1
+    let seq3 = b"TTTTTTTTTTTTTTTTTTTT"; // Very different
+
+    index
+        .index_sequence("seq1", seq1)
+        .expect("Failed to index seq1");
+    index
+        .index_sequence("seq2", seq2)
+        .expect("Failed to index seq2");
+    index
+        .index_sequence("seq3", seq3)
+        .expect("Failed to index seq3");
+
+    // Search for similar sequences to seq1
+    let results = index.search_similar(seq1, 3).expect("Failed to search");
+    assert!(!results.is_empty(), "Should find at least one result");
+
+    // First result should be seq1 itself (exact match)
+    let top = &results[0];
+    assert_eq!(top.id, "seq1", "First result should be exact match");
+    assert!(
+        top.distance < 0.01,
+        "Exact match should have very low distance: {}",
+        top.distance
+    );
+
+    // seq2 should be closer than seq3
+    let rank_of = |id: &str| results.iter().position(|r| r.id == id);
+    if let (Some(idx2), Some(idx3)) = (rank_of("seq2"), rank_of("seq3")) {
+        assert!(idx2 < idx3, "Similar sequence should rank higher than different sequence");
+    }
+}
+
+/// Batch-indexes 100 generated 50-base sequences and checks that a follow-up
+/// similarity search returns at least one result.
+#[test]
+fn test_kmer_index_batch_insert() {
+    let _temp_dir = create_test_db();
+
+    let encoder = KmerEncoder::new(11).expect("Failed to create encoder");
+    let index = KmerIndex::new(11, encoder.dimensions()).expect("Failed to create index");
+
+    // Generate 100 pseudo-random sequences, one per seed 0..100
+    let sequences: Vec<(String, Vec<u8>)> = (0..100)
+        .map(|i| (format!("seq_{}", i), generate_random_sequence(50, i as u64)))
+        .collect();
+
+    // Convert to reference slices for batch insert
+    let batch: Vec<(&str, &[u8])> = sequences
+        .iter()
+        .map(|(id, seq)| (id.as_str(), seq.as_slice()))
+        .collect();
+
+    // Batch insert
+    index
+        .index_batch(batch)
+        .expect("Failed to batch insert sequences");
+
+    // Verify we can search and get results
+    let query = b"ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT";
+    let results = index.search_similar(query, 10).expect("Failed to search");
+
+    assert!(!results.is_empty(), "Should find results after batch insert");
+}
+
+/// Indexes a base sequence, a one-base variant and a generated sequence, then
+/// checks that a query with the base sequence ranks itself first and the variant
+/// within the top three hits.
+#[test]
+fn test_kmer_similar_sequences_score_higher() {
+    let _temp_dir = create_test_db();
+
+    let encoder = KmerEncoder::new(11).expect("Failed to create encoder");
+    let index = KmerIndex::new(11, encoder.dimensions()).expect("Failed to create index");
+
+    // Two similar sequences: they differ in 1 of 40 bases (97.5% identical)
+    let base_seq = b"ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT"; // 40 bases
+    let similar_seq = b"ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGG"; // 1 base different
+    let random_seq = generate_random_sequence(40, 12345);
+
+    index
+        .index_sequence("base", base_seq)
+        .expect("Failed to index base");
+    index
+        .index_sequence("similar", similar_seq)
+        .expect("Failed to index similar");
+    index
+        .index_sequence("random", &random_seq)
+        .expect("Failed to index random");
+
+    // Search with base sequence
+    let results = index
+        .search_similar(base_seq, 10)
+        .expect("Failed to search");
+
+    assert!(!results.is_empty(), "Should find at least one result");
+
+    // Base and similar should definitely be in top results; `expect` fails the
+    // test with the matching message when either one is absent.
+    let base_pos = results
+        .iter()
+        .position(|r| r.id == "base")
+        .expect("Base sequence (exact match) should be found in results");
+    let similar_pos = results
+        .iter()
+        .position(|r| r.id == "similar")
+        .expect("Similar sequence should be found in results");
+
+    // Base should be first (exact match has distance 0)
+    assert_eq!(
+        base_pos,
+        0,
+        "Base sequence should be the top result (exact match)"
+    );
+
+    // Similar sequence should be in top 3
+    assert!(
+        similar_pos < 3,
+        "Similar sequence should rank in top 3, was at position {}",
+        similar_pos
+    );
+}
+
+/// Encoders built with k=11, k=21 and k=31 must each emit a full-width unit vector.
+#[test]
+fn test_kmer_different_k_values() {
+    let l2_norm = |vector: &[f32]| vector.iter().map(|x| x * x).sum::<f32>().sqrt();
+
+    // Test k=11
+    let encoder11 = KmerEncoder::new(11).expect("Failed to create k=11 encoder");
+    let vec11 = encoder11
+        .encode_sequence(b"ACGTACGTACGTACGTACGTACGTACGT")
+        .expect("Failed to encode with k=11");
+    assert_eq!(vec11.len(), encoder11.dimensions());
+
+    // Test k=21
+    let encoder21 = KmerEncoder::new(21).expect("Failed to create k=21 encoder");
+    let vec21 = encoder21
+        .encode_sequence(b"ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT")
+        .expect("Failed to encode with k=21");
+    assert_eq!(vec21.len(), encoder21.dimensions());
+
+    // Test k=31
+    let encoder31 = KmerEncoder::new(31).expect("Failed to create k=31 encoder");
+    let vec31 = encoder31
+        .encode_sequence(b"ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT")
+        .expect("Failed to encode with k=31");
+    assert_eq!(vec31.len(), encoder31.dimensions());
+
+    // All should be normalized
+    let norms = [
+        (11, l2_norm(&vec11)),
+        (21, l2_norm(&vec21)),
+        (31, l2_norm(&vec31)),
+    ];
+    for (k, magnitude) in norms {
+        assert!((magnitude - 1.0).abs() < 1e-5, "k={k} vector should be normalized");
+    }
+}
+
+/// A 100-hash sketch of a 32-base sequence must be non-empty, bounded and sorted.
+#[test]
+fn test_minhash_sketch_basic() {
+    let num_hashes = 100;
+    let mut sketch = MinHashSketch::new(num_hashes);
+    let sequence = b"ACGTACGTACGTACGTACGTACGTACGTACGT";
+
+    let hashes = sketch
+        .sketch(sequence, 11)
+        .expect("Failed to sketch sequence");
+
+    assert!(
+        hashes.len() <= num_hashes,
+        "Sketch should have at most {} hashes, got {}",
+        num_hashes,
+        hashes.len()
+    );
+    assert!(!hashes.is_empty(), "Sketch should have at least one hash");
+
+    // Verify hashes are sorted (implementation detail): compare each adjacent pair
+    let mut adjacent = hashes.iter().zip(hashes.iter().skip(1));
+    assert!(adjacent.all(|(prev, next)| prev <= next), "Hashes should be sorted");
+}
+
+/// Two sketches built from the same sequence with the same parameters must
+/// report a Jaccard distance below 0.01.
+///
+#[test]
+fn test_minhash_jaccard_identical() {
+    let sequence = b"ACGTACGTACGTACGTACGTACGTACGTACGT";
+
+    let (mut sketch1, mut sketch2) = (MinHashSketch::new(100), MinHashSketch::new(100));
+
+    sketch1
+        .sketch(sequence, 11)
+        .expect("Failed to sketch sequence 1");
+    sketch2
+        .sketch(sequence, 11)
+        .expect("Failed to sketch sequence 2");
+
+    let distance = sketch1.jaccard_distance(&sketch2);
+    assert!(
+        distance < 0.01,
+        "Identical sequences should have distance close to 0, got {distance}"
+    );
+}
+
+/// Sketches of an all-A and an all-C sequence must report a Jaccard distance
+/// above 0.9.
+///
+#[test]
+fn test_minhash_jaccard_different() {
+    let seq1 = b"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA";
+    let seq2 = b"CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC";
+
+    let (mut sketch1, mut sketch2) = (MinHashSketch::new(100), MinHashSketch::new(100));
+
+    sketch1
+        .sketch(seq1, 11)
+        .expect("Failed to sketch sequence 1");
+    sketch2
+        .sketch(seq2, 11)
+        .expect("Failed to sketch sequence 2");
+
+    let distance = sketch1.jaccard_distance(&sketch2);
+    assert!(
+        distance > 0.9,
+        "Very different sequences should have distance close to 1, got {distance}"
+    );
+}
+
+/// Encoding must fail for inputs that cannot contain a single k-mer.
+///
+/// With k=11 this covers both the empty sequence and a 4-base sequence.
+#[test]
+fn test_kmer_index_empty_sequence() {
+    let encoder = KmerEncoder::new(11).expect("Failed to create encoder");
+
+    // Test empty sequence
+    let empty_seq = b"";
+    let empty_is_rejected = encoder.encode_sequence(empty_seq).is_err();
+
+    assert!(empty_is_rejected, "Empty sequence should return error");
+
+    // Test sequence shorter than k
+    let short_seq = b"ACGT"; // k=11 but only 4 bases
+    let short_is_rejected = encoder.encode_sequence(short_seq).is_err();
+
+    assert!(short_is_rejected, "Sequence shorter than k should return error");
+}
+
+/// A sequence containing N (unknown) bases must still encode to a vector of the
+/// encoder's width. On failure, `expect` reports the encoder's error alongside
+/// the message.
+#[test]
+fn test_kmer_index_with_n_bases() {
+    let encoder = KmerEncoder::new(11).expect("Failed to create encoder");
+
+    // Sequence with N (unknown) bases
+    let seq_with_n = b"ACGTACGTNNNACGTACGT";
+
+    // Should still encode (the original note says N bases are handled in
+    // canonical_kmer; that handling is not visible from this test)
+    let vector = encoder
+        .encode_sequence(seq_with_n)
+        .expect("Sequence with N bases should encode successfully");
+
+    assert_eq!(
+        vector.len(),
+        encoder.dimensions(),
+        "Vector should have correct dimensions"
+    );
+}
+
+/// Builds a pseudo-random DNA sequence of `length` bases drawn from A/C/G/T.
+///
+/// Each base is picked by hashing `(seed, position)` with `DefaultHasher`, so the
+/// same arguments always give the same sequence within one build.
+fn generate_random_sequence(length: usize, seed: u64) -> Vec<u8> {
+    use std::collections::hash_map::DefaultHasher;
+    use std::hash::{Hash, Hasher};
+
+    const BASES: [u8; 4] = *b"ACGT";
+
+    (0..length)
+        .map(|position| {
+            let mut state = DefaultHasher::new();
+            seed.hash(&mut state);
+            position.hash(&mut state);
+            BASES[(state.finish() % 4) as usize]
+        })
+        .collect()
+}
--- a/examples/dna/tests/pipeline_tests.rs
+++ b/examples/dna/tests/pipeline_tests.rs
@@ -0,0 +1,353 @@
+//! End-to-End Integration Tests for DNA Analysis Pipeline
+//!
+//! Real data, real computation, real assertions. No mocks, no stubs.
+//! Tests the complete DNA analysis workflow from nucleotide encoding
+//! through variant calling, protein translation, epigenetics, and pharmacogenomics.
+
+use ::rvdna::*;
+
+// ============================================================================
+// NUCLEOTIDE & SEQUENCE TESTS
+// ============================================================================
+
+/// Pins the numeric encoding of `Nucleotide` in both directions.
+///
+/// `to_u8` is expected to yield 0, 1, 2, 3, 4 for A, C, G, T, N, and `from_u8`
+/// is expected to map each of those codes back to the same variant.
+#[test]
+fn test_nucleotide_encoding() {
+    // Variant -> code.
+    assert_eq!(Nucleotide::A.to_u8(), 0);
+    assert_eq!(Nucleotide::C.to_u8(), 1);
+    assert_eq!(Nucleotide::G.to_u8(), 2);
+    assert_eq!(Nucleotide::T.to_u8(), 3);
+    assert_eq!(Nucleotide::N.to_u8(), 4);
+
+    // Code -> variant; `unwrap` turns a rejected code into a test failure.
+    assert_eq!(Nucleotide::from_u8(0).unwrap(), Nucleotide::A);
+    assert_eq!(Nucleotide::from_u8(1).unwrap(), Nucleotide::C);
+    assert_eq!(Nucleotide::from_u8(2).unwrap(), Nucleotide::G);
+    assert_eq!(Nucleotide::from_u8(3).unwrap(), Nucleotide::T);
+    assert_eq!(Nucleotide::from_u8(4).unwrap(), Nucleotide::N);
+}
+
+/// Checks `reverse_complement` against hand-computed results, including the
+/// palindromic case `ACGT`, whose reverse complement is itself.
+#[test]
+fn test_dna_sequence_reverse_complement() {
+    // (input, expected reverse complement)
+    let cases = [
+        ("ACGT", "ACGT"),
+        ("AACG", "CGTT"),
+        ("ATGCATGC", "GCATGCAT"),
+    ];
+
+    for (input, expected) in cases {
+        let forward = DnaSequence::from_str(input).unwrap();
+        let reversed = forward.reverse_complement();
+        assert_eq!(reversed.to_string(), expected);
+    }
+}
+
+// ============================================================================
+// VARIANT CALLING TESTS
+// ============================================================================
+
+/// Fifteen quality-40 `G` reads against reference base `A` are expected to be
+/// called as a homozygous-alternate A>G SNP with quality above 20.
+#[test]
+fn test_variant_calling_homozygous_snp() {
+    let column = PileupColumn {
+        bases: vec![b'G'; 15],
+        qualities: vec![40; 15],
+        position: 1000,
+        chromosome: 1,
+    };
+    let snp_caller = VariantCaller::new(VariantCallerConfig::default());
+
+    let snp = snp_caller
+        .call_snp(&column, b'A')
+        .expect("Should call variant");
+
+    assert_eq!(snp.genotype, Genotype::HomAlt);
+    assert_eq!(snp.alt_allele, b'G');
+    assert_eq!(snp.ref_allele, b'A');
+    assert!(snp.quality > 20.0);
+}
+
+/// A 50/50 pileup (ten `A` reads followed by ten `G` reads, all quality 40)
+/// against reference `A` is expected to be called heterozygous with alt `G`.
+#[test]
+fn test_variant_calling_heterozygous_snp() {
+    // Ten reference reads first, then ten alternate reads.
+    let mixed_reads = [vec![b'A'; 10], vec![b'G'; 10]].concat();
+
+    let column = PileupColumn {
+        bases: mixed_reads,
+        qualities: vec![40; 20],
+        position: 2000,
+        chromosome: 1,
+    };
+    let snp_caller = VariantCaller::new(VariantCallerConfig::default());
+
+    let snp = snp_caller
+        .call_snp(&column, b'A')
+        .expect("Should call variant");
+
+    assert_eq!(snp.genotype, Genotype::Het);
+    assert_eq!(snp.alt_allele, b'G');
+    assert!(snp.quality > 20.0);
+}
+
+/// A pileup made only of reference reads may or may not produce a call; when
+/// it does, the call must keep reference `A` and report an
+/// `allele_depth / depth` ratio below 0.2.
+#[test]
+fn test_variant_calling_no_variant() {
+    let column = PileupColumn {
+        bases: vec![b'A'; 20],
+        qualities: vec![40; 20],
+        position: 3000,
+        chromosome: 1,
+    };
+    let snp_caller = VariantCaller::new(VariantCallerConfig::default());
+
+    // `None` is accepted silently: the test only constrains a call if one is made.
+    if let Some(found) = snp_caller.call_snp(&column, b'A') {
+        assert_eq!(found.ref_allele, b'A');
+        let alt_fraction = found.allele_depth as f32 / found.depth as f32;
+        assert!(alt_fraction < 0.2);
+    }
+}
+
+/// Exercises `VariantCaller::filter_variants` with `min_quality = 30` and
+/// `min_depth = 10` on three hand-built heterozygous calls.
+///
+/// Expected outcome, as asserted at the end: the first call keeps `Pass`, the
+/// second (quality 25.0) becomes `LowQuality`, and the third (depth 5) becomes
+/// `LowDepth`.
+#[test]
+fn test_variant_quality_filtering() {
+    let mut config = VariantCallerConfig::default();
+    config.min_quality = 30;
+    config.min_depth = 10;
+    let caller = VariantCaller::new(config);
+
+    // Every call starts out as `Pass`; the filter is expected to rewrite the
+    // status in place.
+    let mut calls = vec![
+        // Quality 35.0 and depth 20: above both configured minimums.
+        VariantCall {
+            chromosome: 1,
+            position: 1000,
+            ref_allele: b'A',
+            alt_allele: b'G',
+            quality: 35.0,
+            genotype: Genotype::Het,
+            depth: 20,
+            allele_depth: 10,
+            filter_status: FilterStatus::Pass,
+        },
+        // Quality 25.0 is below the configured minimum of 30.
+        VariantCall {
+            chromosome: 1,
+            position: 2000,
+            ref_allele: b'C',
+            alt_allele: b'T',
+            quality: 25.0,
+            genotype: Genotype::Het,
+            depth: 20,
+            allele_depth: 10,
+            filter_status: FilterStatus::Pass,
+        },
+        // Depth 5 is below the configured minimum of 10.
+        VariantCall {
+            chromosome: 1,
+            position: 3000,
+            ref_allele: b'G',
+            alt_allele: b'A',
+            quality: 40.0,
+            genotype: Genotype::Het,
+            depth: 5,
+            allele_depth: 2,
+            filter_status: FilterStatus::Pass,
+        },
+    ];
+
+    caller.filter_variants(&mut calls);
+    assert_eq!(calls[0].filter_status, FilterStatus::Pass);
+    assert_eq!(calls[1].filter_status, FilterStatus::LowQuality);
+    assert_eq!(calls[2].filter_status, FilterStatus::LowDepth);
+}
+
+// ============================================================================
+// PROTEIN TRANSLATION TESTS
+// ============================================================================
+
+/// Translating the three codons `ATG GCA GGT` is expected to give
+/// Met, Ala, Gly in that order.
+#[test]
+fn test_protein_translation() {
+    use ::rvdna::protein::{translate_dna, AminoAcid};
+
+    let expected = [AminoAcid::Met, AminoAcid::Ala, AminoAcid::Gly];
+    let translated = translate_dna(b"ATGGCAGGT");
+
+    assert_eq!(translated.len(), expected.len());
+    for (index, residue) in expected.iter().enumerate() {
+        assert_eq!(&translated[index], residue);
+    }
+}
+
+/// Each nine-base input ends in one of the codons `TAA`, `TAG`, `TGA`; the
+/// translation is expected to contain only the two residues before it.
+#[test]
+fn test_protein_translation_stop_codon() {
+    use ::rvdna::protein::{translate_dna, AminoAcid};
+
+    // `TAA` case: also check that the first residue is Met.
+    let taa_terminated = translate_dna(b"ATGGCATAA");
+    assert_eq!(taa_terminated.len(), 2);
+    assert_eq!(taa_terminated[0], AminoAcid::Met);
+
+    // `TAG` and `TGA` cases: only the length is checked.
+    for coding in [b"ATGGCATAG", b"ATGGCATGA"] {
+        assert_eq!(translate_dna(coding).len(), 2);
+    }
+}
+
+/// Spot-checks `AminoAcid::hydrophobicity` for five residues.
+///
+/// The comparison is exact (`assert_eq!` on floats), so the method has to
+/// return precisely these literals.
+///
+/// NOTE(review): the expected values look like the Kyte-Doolittle hydropathy
+/// scale -- confirm against the implementation.
+#[test]
+fn test_amino_acid_hydrophobicity() {
+    use ::rvdna::protein::AminoAcid;
+    assert_eq!(AminoAcid::Ile.hydrophobicity(), 4.5);
+    assert_eq!(AminoAcid::Arg.hydrophobicity(), -4.5);
+    assert_eq!(AminoAcid::Val.hydrophobicity(), 4.2);
+    assert_eq!(AminoAcid::Lys.hydrophobicity(), -3.9);
+    assert_eq!(AminoAcid::Gly.hydrophobicity(), -0.4);
+}
+
+// ============================================================================
+// EPIGENETICS TESTS
+// ============================================================================
+
+/// Builds a four-site profile from parallel position and beta-value lists and
+/// checks the site count and the mean beta value.
+#[test]
+fn test_methylation_profile_creation() {
+    let profile = MethylationProfile::from_beta_values(
+        vec![(1, 1000), (1, 2000), (2, 3000), (2, 4000)],
+        vec![0.1, 0.5, 0.8, 0.3],
+    );
+
+    assert_eq!(profile.sites.len(), 4);
+
+    // (0.1 + 0.5 + 0.8 + 0.3) / 4 = 0.425
+    let deviation = (profile.mean_methylation() - 0.425).abs();
+    assert!(deviation < 0.001);
+}
+
+/// Feeds a synthetic 700-site profile to the default Horvath clock and checks
+/// that the predicted age lies strictly between 0 and 150.
+#[test]
+fn test_horvath_clock_prediction() {
+    // 700 sites, one every 1000 positions, all with chromosome value 1.
+    let positions: Vec<(u8, u64)> = (0..700).map(|site| (1, site * 1000)).collect();
+
+    // First 100 sites at 0.3, next 100 at 0.7, remaining 500 at 0.5.
+    let betas: Vec<f32> = (0..700)
+        .map(|site| match site {
+            0..=99 => 0.3,
+            100..=199 => 0.7,
+            _ => 0.5,
+        })
+        .collect();
+
+    let profile = MethylationProfile::from_beta_values(positions, betas);
+    let horvath = HorvathClock::default_clock();
+    let predicted_age = horvath.predict_age(&profile);
+
+    assert!(predicted_age > 0.0);
+    assert!(predicted_age < 150.0);
+}
+
+// ============================================================================
+// PHARMACOGENOMICS TESTS
+// ============================================================================
+
+/// Checks `call_star_allele` on three inputs: no variants, and two
+/// single-variant lists.
+///
+/// Each list element is a `(number, byte, byte)` tuple; judging by the values
+/// these are presumably (position, reference base, alternate base) -- confirm
+/// against the function's signature.
+#[test]
+fn test_pharma_star_allele_calling() {
+    // An empty variant list is expected to yield `Star1`.
+    assert_eq!(call_star_allele(&[]), StarAllele::Star1);
+    // A G -> A entry at 42130692 is expected to yield `Star4`.
+    assert_eq!(
+        call_star_allele(&[(42130692, b'G', b'A')]),
+        StarAllele::Star4
+    );
+    // An entry at 42126611 whose last byte is `-` is expected to yield `Star5`.
+    assert_eq!(
+        call_star_allele(&[(42126611, b'T', b'-')]),
+        StarAllele::Star5
+    );
+}
+
+/// Checks `predict_phenotype` for three star-allele pairs.
+///
+/// Expected, as asserted below: `Star1`/`Star1` and `Star1`/`Star4` are both
+/// `Normal`, while `Star4`/`Star4` is `Poor`.
+#[test]
+fn test_pharma_metabolizer_phenotype() {
+    assert_eq!(
+        predict_phenotype(&StarAllele::Star1, &StarAllele::Star1),
+        MetabolizerPhenotype::Normal
+    );
+    // NOTE(review): a single `Star4` copy is expected to still be `Normal`
+    // here -- confirm this matches the intended phenotype table.
+    assert_eq!(
+        predict_phenotype(&StarAllele::Star1, &StarAllele::Star4),
+        MetabolizerPhenotype::Normal
+    );
+    assert_eq!(
+        predict_phenotype(&StarAllele::Star4, &StarAllele::Star4),
+        MetabolizerPhenotype::Poor
+    );
+}
+
+// ============================================================================
+// ALIGNMENT TESTS
+// ============================================================================
+
+/// Aligning `ACGT` against an identical reference with the default
+/// configuration is expected to score exactly 8.
+#[test]
+fn test_smith_waterman_alignment() {
+    let query = DnaSequence::from_str("ACGT").unwrap();
+    let reference = DnaSequence::from_str("ACGT").unwrap();
+    let aligner = SmithWaterman::new(AlignmentConfig::default());
+
+    let outcome = aligner.align(&query, &reference).unwrap();
+
+    // Four matching bases; 8 presumably reflects 2 points per match under
+    // the default scoring.
+    assert_eq!(outcome.score, 8);
+}
+
+/// The query `ATCGATCG` occurs verbatim inside the reference (padded with
+/// `TTTT` on both sides); `align_with_attention` must report a positive score.
+#[test]
+fn test_attention_alignment() {
+    let reference = DnaSequence::from_str("TTTTATCGATCGTTTT").unwrap();
+    let query = DnaSequence::from_str("ATCGATCG").unwrap();
+
+    let outcome = query.align_with_attention(&reference).unwrap();
+    assert!(outcome.score > 0);
+}
+
+// ============================================================================
+// FULL PIPELINE INTEGRATION
+// ============================================================================
+
+/// Pins the values produced by `AnalysisConfig::default()`: k-mer size 11,
+/// 512 vector dimensions, minimum quality 20 and no extra parameters.
+#[test]
+fn test_pipeline_config_defaults() {
+    let defaults = AnalysisConfig::default();
+
+    assert_eq!(defaults.kmer_size, 11);
+    assert_eq!(defaults.vector_dims, 512);
+    assert_eq!(defaults.min_quality, 20);
+    assert!(defaults.parameters.is_empty());
+}
+
+/// Smoke test that walks through every stage of the analysis pipeline once,
+/// in order, with small hand-written inputs.
+///
+/// Most stages are only checked loosely (non-empty / positive / `is_some`);
+/// the pharmacogenomics stage is the only one compared against exact values.
+///
+/// NOTE(review): `translate_dna`, `ProteinSequence` and `ProteinResidue` are
+/// used without an explicit import here, so they presumably come in through
+/// the file-level `use ::rvdna::*` -- confirm they are re-exported at the
+/// crate root.
+#[test]
+fn test_full_pipeline_runs() {
+    // 1. Create and manipulate DNA
+    let dna_seq = DnaSequence::from_str("ATGCGATCGATCGATCGATCGTAGCTAGCTAGC").unwrap();
+    let rev_comp = dna_seq.reverse_complement();
+    assert_eq!(rev_comp.len(), dna_seq.len());
+
+    // 2. K-mer vector
+    let kmer_vec = dna_seq.to_kmer_vector(11, 512).unwrap();
+    assert_eq!(kmer_vec.len(), 512);
+
+    // 3. Variant calling
+    let caller = VariantCaller::new(VariantCallerConfig::default());
+    // Two reference (`A`) reads and eight alternate (`G`) reads.
+    let pileup = PileupColumn {
+        bases: vec![b'A', b'A', b'G', b'G', b'G', b'G', b'G', b'G', b'G', b'G'],
+        qualities: vec![40; 10],
+        position: 1000,
+        chromosome: 1,
+    };
+    assert!(caller.call_snp(&pileup, b'A').is_some());
+
+    // 4. Protein translation
+    let proteins = translate_dna(b"ATGGCAGGTAAACCC");
+    assert!(!proteins.is_empty());
+
+    // 5. Methylation + Horvath
+    let profile = MethylationProfile::from_beta_values(
+        vec![(1, 1000), (1, 2000), (1, 3000)],
+        vec![0.3, 0.5, 0.7],
+    );
+    let age = HorvathClock::default_clock().predict_age(&profile);
+    assert!(age > 0.0);
+
+    // 6. Pharmacogenomics
+    let allele = call_star_allele(&[(42130692, b'G', b'A')]);
+    assert_eq!(allele, StarAllele::Star4);
+    let phenotype = predict_phenotype(&allele, &StarAllele::Star1);
+    assert_eq!(phenotype, MetabolizerPhenotype::Normal);
+
+    // 7. Alignment
+    // The sequence is aligned against its own reverse complement.
+    let alignment = dna_seq.align_with_attention(&rev_comp).unwrap();
+    assert!(alignment.score > 0);
+
+    // 8. Protein contact graph
+    let protein = ProteinSequence::new(vec![
+        ProteinResidue::A,
+        ProteinResidue::V,
+        ProteinResidue::L,
+        ProteinResidue::I,
+        ProteinResidue::F,
+        ProteinResidue::G,
+        ProteinResidue::K,
+        ProteinResidue::D,
+        ProteinResidue::E,
+        ProteinResidue::R,
+        ProteinResidue::M,
+        ProteinResidue::N,
+    ]);
+    // 8.0 is passed as the only argument; presumably a distance cutoff -- confirm.
+    let graph = protein.build_contact_graph(8.0).unwrap();
+    let contacts = protein.predict_contacts(&graph).unwrap();
+    assert!(!contacts.is_empty());
+}
--- a/examples/dna/tests/security_tests.rs
+++ b/examples/dna/tests/security_tests.rs
@@ -0,0 +1,191 @@
+//! Security validation tests for DNA analyzer - NO MOCKS, real computation only
+use ::rvdna::error::DnaError;
+use ::rvdna::types::*;
+use ::rvdna::VectorEntry;
+use std::sync::{Arc, Mutex};
+use std::thread;
+
+/// Builds a ten-million-base sequence and checks that length queries,
+/// `reverse_complement` and k-mer vectorisation all complete on it.
+#[test]
+fn test_buffer_overflow_protection() {
+    let base_count = 10_000_000;
+
+    // Repeating A, C, G, T pattern.
+    let repeating: Vec<Nucleotide> = (0..base_count)
+        .map(|offset| match offset % 4 {
+            0 => Nucleotide::A,
+            1 => Nucleotide::C,
+            2 => Nucleotide::G,
+            _ => Nucleotide::T,
+        })
+        .collect();
+
+    let huge = DnaSequence::new(repeating);
+    assert_eq!(huge.len(), base_count);
+
+    let huge_rc = huge.reverse_complement();
+    assert_eq!(huge_rc.len(), base_count);
+
+    let vectorised = huge.to_kmer_vector(11, 512);
+    assert!(vectorised.is_ok());
+}
+
+/// Strings containing characters other than A/C/G/T/N must fail to parse with
+/// `DnaError::InvalidSequence`; `N` and lower-case input must be accepted.
+#[test]
+fn test_invalid_base_handling() {
+    for rejected in ["ACGTX", "ACGT123", "ACGT!@#"] {
+        let failure = DnaSequence::from_str(rejected).err();
+        assert!(failure.is_some());
+        assert!(matches!(failure, Some(DnaError::InvalidSequence(_))));
+    }
+
+    for accepted in ["ACGTN", "acgtn"] {
+        assert!(DnaSequence::from_str(accepted).is_ok());
+    }
+}
+
+/// Inserts the same k-mer vector under several string IDs and checks that
+/// every insert succeeds.
+///
+/// NOTE(review): despite the test name, all three IDs below are plain ASCII,
+/// so no non-ASCII input is actually exercised -- confirm whether Unicode IDs
+/// were intended.
+#[test]
+fn test_unicode_injection() {
+    let sequence = DnaSequence::from_str("ACGTACGT").unwrap();
+    let kmer_vector = sequence.to_kmer_vector(3, 128).unwrap();
+
+    // Per-process scratch directory; creation and removal are best-effort.
+    let scratch_dir = std::env::temp_dir().join(format!("dna_test_{}", std::process::id()));
+    let _ = std::fs::create_dir_all(&scratch_dir);
+    let index_path = scratch_dir.join("unicode");
+    let index = KmerIndex::new(3, 128, index_path.to_str().unwrap()).unwrap();
+
+    let ids = ["seq_cafe_dna", "patient123", "seq_hidden"];
+    for id in ids {
+        let inserted = index.db().insert(VectorEntry {
+            id: Some(id.to_string()),
+            vector: kmer_vector.clone(),
+            metadata: None,
+        });
+        assert!(inserted.is_ok());
+    }
+
+    let _ = std::fs::remove_dir_all(&scratch_dir);
+}
+
+/// `KmerIndex::new` must not panic when the storage path contains `..`
+/// components; returning either `Ok` or `Err` is acceptable.
+///
+/// Every probed path is built beneath a directory nested three levels inside
+/// this test's private temp directory. The `..` hops therefore still resolve
+/// to a location inside that directory, so the test never touches files
+/// outside its own sandbox (such as `/etc/passwd`) and a single
+/// `remove_dir_all` cleans up everything it may have created.
+#[test]
+fn test_path_traversal_prevention() {
+    let temp_dir = std::env::temp_dir().join(format!("dna_path_{}", std::process::id()));
+
+    // Three levels deep, so that up to three `..` hops stay inside `temp_dir`.
+    let nested = temp_dir.join("a").join("b").join("c");
+    let _ = std::fs::create_dir_all(&nested);
+
+    let mut panicked = false;
+    for path in ["../../../evil", "../../etc/passwd"] {
+        let full_path = nested.join(path);
+        // Only a panic counts as failure; an `Err` from `KmerIndex::new` is a
+        // graceful outcome.
+        let result =
+            std::panic::catch_unwind(|| KmerIndex::new(3, 128, full_path.to_str().unwrap()));
+        panicked |= result.is_err();
+    }
+
+    // Clean up before asserting so a failure does not leave files behind.
+    let _ = std::fs::remove_dir_all(&temp_dir);
+
+    assert!(!panicked, "Path traversal should not cause panic");
+}
+
+/// Out-of-range k-mer sizes must be rejected: `k = 64` with
+/// `DnaError::InvalidKmerSize(64)` and `k = 0` with some error, while
+/// `k = 11` and `k = 15` must be accepted.
+#[test]
+fn test_integer_overflow_kmer() {
+    let probe = DnaSequence::from_str("ACGTACGTACGTACGT").unwrap();
+
+    let oversized = probe.to_kmer_vector(64, 512).unwrap_err();
+    assert!(matches!(oversized, DnaError::InvalidKmerSize(64)));
+
+    let zero_sized = probe.to_kmer_vector(0, 512);
+    assert!(zero_sized.is_err());
+
+    for valid_k in [11, 15] {
+        assert!(probe.to_kmer_vector(valid_k, 512).is_ok());
+    }
+}
+
+/// Parsing `""` must fail with `DnaError::EmptySequence`, while a sequence
+/// built directly from an empty `Vec` must behave as an empty value under
+/// `len`, `complement`, `reverse_complement` and `to_string`.
+#[test]
+fn test_empty_input_safety() {
+    let parse_failure = DnaSequence::from_str("").unwrap_err();
+    assert!(matches!(parse_failure, DnaError::EmptySequence));
+
+    let blank = DnaSequence::new(vec![]);
+    assert!(blank.is_empty());
+    assert!(blank.len() == 0);
+
+    let complemented = blank.complement();
+    assert!(complemented.is_empty());
+
+    let reversed = blank.reverse_complement();
+    assert!(reversed.is_empty());
+
+    assert_eq!(blank.to_string(), "");
+}
+
+/// A trailing NUL character after valid bases must make parsing fail.
+#[test]
+fn test_null_byte_handling() {
+    let parsed = DnaSequence::from_str("ACGT\0");
+    assert!(parsed.is_err());
+}
+
+/// Ten threads each insert one entry into a single `KmerIndex` shared behind
+/// `Arc<Mutex<_>>`; every thread must finish without panicking.
+#[test]
+fn test_concurrent_access_safety() {
+    // Per-process scratch directory; creation and removal are best-effort.
+    let scratch_dir = std::env::temp_dir().join(format!("dna_conc_{}", std::process::id()));
+    let _ = std::fs::create_dir_all(&scratch_dir);
+
+    let shared_index = Arc::new(Mutex::new(
+        KmerIndex::new(3, 128, scratch_dir.join("idx").to_str().unwrap()).unwrap(),
+    ));
+
+    let mut workers = Vec::with_capacity(10);
+    for worker_id in 0..10 {
+        let index_handle = Arc::clone(&shared_index);
+        workers.push(thread::spawn(move || {
+            let sequence = DnaSequence::from_str("ACGTACGTACGT").unwrap();
+            let entry = VectorEntry {
+                id: Some(format!("seq_{}", worker_id)),
+                vector: sequence.to_kmer_vector(3, 128).unwrap(),
+                metadata: None,
+            };
+            // The lock is held only for the duration of this insert.
+            index_handle.lock().unwrap().db().insert(entry).unwrap();
+        }));
+    }
+
+    // `join` returns `Err` if the worker panicked.
+    for worker in workers {
+        assert!(worker.join().is_ok());
+    }
+
+    let _ = std::fs::remove_dir_all(&scratch_dir);
+}
+
+/// `QualityScore::new` must reject 100 with `DnaError::InvalidQuality(100)`
+/// and accept 0 and 93; Q30 and Q0 must convert to error probabilities of
+/// about 0.001 and about 1.0.
+///
+/// Only 100 is probed on the rejecting side, so the exact cutoff between 93
+/// and 100 is not pinned by this test.
+#[test]
+fn test_quality_score_bounds() {
+    let rejected = QualityScore::new(100).unwrap_err();
+    assert!(matches!(rejected, DnaError::InvalidQuality(100)));
+
+    for accepted in [0, 93] {
+        assert!(QualityScore::new(accepted).is_ok());
+    }
+
+    let q30_error = QualityScore::new(30).unwrap().to_error_probability();
+    assert!((q30_error - 0.001).abs() < 1e-6);
+
+    let q0_error = QualityScore::new(0).unwrap().to_error_probability();
+    assert!((q0_error - 1.0).abs() < 0.01);
+}
+
+/// A `GenomicPosition` can be constructed with `position` set to `u64::MAX`
+/// and reads the same value back; no arithmetic on the position is performed.
+#[test]
+fn test_variant_position_overflow() {
+    let extreme = GenomicPosition {
+        chromosome: 25,
+        position: u64::MAX,
+        reference_allele: Nucleotide::A,
+        alternate_allele: Some(Nucleotide::G),
+    };
+
+    assert_eq!(extreme.position, u64::MAX);
+}
+
+/// Clamping to `[0, 1]` keeps every sample, including the out-of-range ones,
+/// inside the interval.
+///
+/// NOTE(review): only `f32::clamp` from the standard library is exercised
+/// here; no crate API is called.
+#[test]
+fn test_methylation_bounds() {
+    let samples = [-0.5f32, 0.0, 0.5, 1.0, 1.5];
+    for sample in samples {
+        let bounded = sample.clamp(0.0, 1.0);
+        assert!((0.0..=1.0).contains(&bounded));
+    }
+}
+
+/// Calling the same operation twice on the same sequence must give equal
+/// results for k-mer vectors, reverse complement, complement and `to_string`.
+#[test]
+fn test_deterministic_output() {
+    let sequence = DnaSequence::from_str("ACGTACGTACGTACGT").unwrap();
+
+    let first_vector = sequence.to_kmer_vector(11, 512).unwrap();
+    let second_vector = sequence.to_kmer_vector(11, 512).unwrap();
+    assert_eq!(first_vector, second_vector);
+
+    let first_rc = sequence.reverse_complement().to_string();
+    let second_rc = sequence.reverse_complement().to_string();
+    assert_eq!(first_rc, second_rc);
+
+    let first_complement = sequence.complement().to_string();
+    let second_complement = sequence.complement().to_string();
+    assert_eq!(first_complement, second_complement);
+
+    let first_text = sequence.to_string();
+    let second_text = sequence.to_string();
+    assert_eq!(first_text, second_text);
+}