wifi-densepose/vendor/ruvector/examples/dna/benches/dna_bench.rs

//! Criterion benchmarks for DNA Analyzer
//!
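//! Comprehensive performance benchmarks covering:
//! - K-mer encoding and HNSW indexing
//! - Sequence alignment
//! - Variant calling
//! - Protein translation and contact graphs
//! - RVDNA 2-bit encoding and FASTA conversion
//! - Epigenomics (cancer signal detection, Horvath clock)
//! - Protein property analysis (molecular weight, isoelectric point)
//! - Full pipeline integration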

use ::rvdna::prelude::*;
use ::rvdna::types::KmerIndex as TypesKmerIndex;
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};

/// Build a pseudo-random DNA sequence of `len` bases.
///
/// Bases are drawn from `A`, `C`, `G` and `T` using a generator seeded with
/// `seed`, so equal arguments reproduce the same benchmark input.
fn random_dna(len: usize, seed: u64) -> DnaSequence {
    let alphabet = [Nucleotide::A, Nucleotide::C, Nucleotide::G, Nucleotide::T];
    let mut rng = StdRng::seed_from_u64(seed);

    let mut picked: Vec<Nucleotide> = Vec::with_capacity(len);
    for _ in 0..len {
        picked.push(alphabet[rng.gen_range(0..alphabet.len())]);
    }

    DnaSequence::new(picked)
}

/// Generate `count` random sequences of `len` bases each.
///
/// Sequence `i` is produced by [`random_dna`] with seed `seed + i`, so the
/// sequences differ from each other while the whole set stays reproducible.
/// The addition wraps on overflow, so a seed close to `u64::MAX` cannot panic
/// in builds that have overflow checks enabled.
fn random_sequences(count: usize, len: usize, seed: u64) -> Vec<DnaSequence> {
    (0..count)
        .map(|i| random_dna(len, seed.wrapping_add(i as u64)))
        .collect()
}

// ============================================================================
// K-mer Benchmarks
// ============================================================================

fn kmer_benchmarks(c: &mut Criterion) {
    let mut group = c.benchmark_group("kmer");

    group.bench_function("encode_1kb", |b| {
        let seq = random_dna(1_000, 42);
        b.iter(|| black_box(seq.to_kmer_vector(11, 512).unwrap()));
    });

    group.bench_function("encode_10kb", |b| {
        let seq = random_dna(10_000, 42);
        b.iter(|| black_box(seq.to_kmer_vector(11, 512).unwrap()));
    });

    group.bench_function("encode_100kb", |b| {
        let seq = random_dna(100_000, 42);
        b.iter(|| black_box(seq.to_kmer_vector(11, 512).unwrap()));
    });

    // HNSW index insertion
    group.bench_function("index_insert_100", |b| {
        let sequences = random_sequences(100, 100, 42);
        b.iter(|| {
            let temp = tempfile::TempDir::new().unwrap();
            let index =
                TypesKmerIndex::new(11, 512, temp.path().join("idx").to_str().unwrap()).unwrap();
            for (i, seq) in sequences.iter().enumerate() {
                let vec = seq.to_kmer_vector(11, 512).unwrap();
                index
                    .db()
                    .insert(ruvector_core::VectorEntry {
                        id: Some(format!("seq{}", i)),
                        vector: vec,
                        metadata: None,
                    })
                    .unwrap();
            }
            black_box(index)
        });
    });

    // HNSW search
    group.bench_function("search_top10", |b| {
        let sequences = random_sequences(100, 100, 42);
        let temp = tempfile::TempDir::new().unwrap();
        let index =
            TypesKmerIndex::new(11, 512, temp.path().join("idx").to_str().unwrap()).unwrap();

        for (i, seq) in sequences.iter().enumerate() {
            let vec = seq.to_kmer_vector(11, 512).unwrap();
            index
                .db()
                .insert(ruvector_core::VectorEntry {
                    id: Some(format!("seq{}", i)),
                    vector: vec,
                    metadata: None,
                })
                .unwrap();
        }

        let query = random_dna(100, 999);
        let query_vec = query.to_kmer_vector(11, 512).unwrap();

        b.iter(|| {
            black_box(
                index
                    .db()
                    .search(ruvector_core::SearchQuery {
                        vector: query_vec.clone(),
                        k: 10,
                        filter: None,
                        ef_search: None,
                    })
                    .unwrap(),
            )
        });
    });

    group.finish();
}

// ============================================================================
// Alignment Benchmarks
// ============================================================================

/// Benchmarks for sequence encoding and alignment.
///
/// Registers the `alignment` group:
/// - `one_hot_encoding_1kb`: `encode_one_hot` on a 1 000-base sequence.
/// - `attention_align_100bp`: `align_with_attention` of a 100-base query
///   against a 1 000-base reference.
/// - `smith_waterman_100bp`: `SmithWaterman::align` of a 100-base query
///   against a 500-base reference, using the default `AlignmentConfig`.
fn alignment_benchmarks(c: &mut Criterion) {
    let mut alignment = c.benchmark_group("alignment");

    alignment.bench_function("one_hot_encoding_1kb", |bencher| {
        let input = random_dna(1_000, 42);
        bencher.iter(|| {
            let encoded = input.encode_one_hot();
            black_box(encoded)
        });
    });

    alignment.bench_function("attention_align_100bp", |bencher| {
        let read = random_dna(100, 42);
        let target = random_dna(1_000, 43);
        bencher.iter(|| {
            let aligned = read.align_with_attention(&target).unwrap();
            black_box(aligned)
        });
    });

    alignment.bench_function("smith_waterman_100bp", |bencher| {
        let read = random_dna(100, 42);
        let target = random_dna(500, 43);
        let config = AlignmentConfig::default();
        let sw = SmithWaterman::new(config);
        bencher.iter(|| {
            let aligned = sw.align(&read, &target).unwrap();
            black_box(aligned)
        });
    });

    alignment.finish();
}

// ============================================================================
// Variant Calling Benchmarks
// ============================================================================

/// Benchmarks for SNP calling on pileup columns.
///
/// Registers the `variant` group:
/// - `snp_calling_single`: one `call_snp` on a fixed 10-read pileup
///   (two `A`, eight `G`) against reference base `A`.
/// - `snp_calling_1000_positions`: `call_snp` over 1 000 seeded random
///   pileups of depth 20, counting how many calls return `Some`.
fn variant_benchmarks(c: &mut Criterion) {
    const ACGT: [u8; 4] = *b"ACGT";

    let mut variant = c.benchmark_group("variant");

    variant.bench_function("snp_calling_single", |bencher| {
        let config = VariantCallerConfig::default();
        let caller = VariantCaller::new(config);
        let column = PileupColumn {
            bases: b"AAGGGGGGGG".to_vec(),
            qualities: vec![35; 10],
            position: 12345,
            chromosome: 1,
        };

        bencher.iter(|| black_box(caller.call_snp(&column, b'A')));
    });

    variant.bench_function("snp_calling_1000_positions", |bencher| {
        let caller = VariantCaller::new(VariantCallerConfig::default());
        let mut rng = StdRng::seed_from_u64(42);

        let mut pileups: Vec<(PileupColumn, u8)> = Vec::with_capacity(1000);
        for i in 0..1000usize {
            // Draw all 20 bases first, then all 20 qualities, so the seeded
            // generator is consumed in a fixed order.
            let mut bases: Vec<u8> = Vec::with_capacity(20);
            for _ in 0..20 {
                bases.push(ACGT[rng.gen_range(0..4)]);
            }
            let mut qualities: Vec<u8> = Vec::with_capacity(20);
            for _ in 0..20 {
                qualities.push(rng.gen_range(20..41));
            }

            let column = PileupColumn {
                bases,
                qualities,
                position: i as u64,
                chromosome: 1,
            };
            // The reference base cycles through A, C, G, T by position.
            pileups.push((column, ACGT[i % 4]));
        }

        bencher.iter(|| {
            let called = pileups
                .iter()
                .filter(|(column, ref_base)| caller.call_snp(column, *ref_base).is_some())
                .count();
            black_box(called)
        });
    });

    variant.finish();
}

// ============================================================================
// Protein Analysis Benchmarks
// ============================================================================

/// Benchmarks for translation and protein contact analysis.
///
/// Registers the `protein` group:
/// - `translate_1kb`: `translate` on a 1 002-base sequence (a multiple of
///   three bases).
/// - `contact_graph_100residues`: `build_contact_graph(8.0)` on a random
///   100-residue protein.
/// - `contact_prediction_100residues`: `predict_contacts` on that protein
///   with a contact graph built once outside the timed loop.
fn protein_benchmarks(c: &mut Criterion) {
    let mut protein_group = c.benchmark_group("protein");

    protein_group.bench_function("translate_1kb", |bencher| {
        let coding = random_dna(1_002, 42);
        bencher.iter(|| {
            let translated = coding.translate().unwrap();
            black_box(translated)
        });
    });

    protein_group.bench_function("contact_graph_100residues", |bencher| {
        let chain = create_random_protein(100, 42);
        bencher.iter(|| {
            let contacts = chain.build_contact_graph(8.0).unwrap();
            black_box(contacts)
        });
    });

    protein_group.bench_function("contact_prediction_100residues", |bencher| {
        let chain = create_random_protein(100, 42);
        let contacts = chain.build_contact_graph(8.0).unwrap();
        bencher.iter(|| {
            let predicted = chain.predict_contacts(&contacts).unwrap();
            black_box(predicted)
        });
    });

    protein_group.finish();
}

// ============================================================================
// RVDNA Format Benchmarks
// ============================================================================

/// Benchmarks for the RVDNA encoding helpers.
///
/// Registers the `rvdna` group:
/// - `encode_2bit_1kb` / `encode_2bit_100kb`: `rvdna::encode_2bit` on random
///   sequences of 1 000 and 100 000 bases.
/// - `fasta_to_rvdna_1kb`: `rvdna::fasta_to_rvdna(&s, 11, 256, 1000)` on the
///   text form of a random 1 000-base sequence.
fn rvdna_benchmarks(c: &mut Criterion) {
    let mut rvdna_group = c.benchmark_group("rvdna");

    // Same 2-bit encoding call at two input sizes.
    for (name, len) in [("encode_2bit_1kb", 1_000), ("encode_2bit_100kb", 100_000)] {
        rvdna_group.bench_function(name, |bencher| {
            let dna = random_dna(len, 42);
            bencher.iter(|| black_box(rvdna::encode_2bit(dna.bases())));
        });
    }

    rvdna_group.bench_function("fasta_to_rvdna_1kb", |bencher| {
        // Render the sequence as text; anything other than A/C/G/T becomes 'N'.
        let dna = random_dna(1_000, 42);
        let mut fasta = String::new();
        for base in dna.bases().iter() {
            let letter = match base {
                Nucleotide::A => 'A',
                Nucleotide::C => 'C',
                Nucleotide::G => 'G',
                Nucleotide::T => 'T',
                _ => 'N',
            };
            fasta.push(letter);
        }

        bencher.iter(|| {
            let converted = rvdna::fasta_to_rvdna(&fasta, 11, 256, 1000).unwrap();
            black_box(converted)
        });
    });

    rvdna_group.finish();
}

// ============================================================================
// Epigenomics Benchmarks
// ============================================================================

/// Benchmarks for methylation-based analyses.
///
/// Registers the `epigenomics` group:
/// - `cancer_signal_1000_sites`: `CancerSignalDetector::detect` on a profile
///   of 1 000 sites whose beta values ramp from 0.0 up to 0.999.
/// - `horvath_clock_1000_sites`: `HorvathClock::predict_age` on a profile of
///   1 000 sites whose beta values ramp from 0.25 up to just under 0.75.
fn epigenomics_benchmarks(c: &mut Criterion) {
    /// The 1 000 `(1, i)` site keys shared by both profiles, for `i` in 0..1000.
    fn site_keys() -> Vec<(u8, u64)> {
        (0..1000u64).map(|site| (1u8, site)).collect()
    }

    let mut epigenomics = c.benchmark_group("epigenomics");

    epigenomics.bench_function("cancer_signal_1000_sites", |bencher| {
        let betas: Vec<f32> = (0..1000).map(|site| site as f32 / 1000.0).collect();
        let profile = rvdna::MethylationProfile::from_beta_values(site_keys(), betas);
        let detector = rvdna::CancerSignalDetector::new();
        bencher.iter(|| {
            let signal = detector.detect(&profile);
            black_box(signal)
        });
    });

    epigenomics.bench_function("horvath_clock_1000_sites", |bencher| {
        let betas: Vec<f32> = (0..1000).map(|site| site as f32 / 2000.0 + 0.25).collect();
        let profile = rvdna::MethylationProfile::from_beta_values(site_keys(), betas);
        let clock = rvdna::HorvathClock::default_clock();
        bencher.iter(|| {
            let age = clock.predict_age(&profile);
            black_box(age)
        });
    });

    epigenomics.finish();
}

// ============================================================================
// Protein Analysis Benchmarks (extended)
// ============================================================================

/// Benchmarks for scalar protein property calculations.
///
/// Registers the `protein_analysis` group:
/// - `molecular_weight_300aa`: `rvdna::molecular_weight`.
/// - `isoelectric_point_300aa`: `rvdna::isoelectric_point`.
///
/// Both run on the result of `rvdna::translate_dna` applied to the same
/// seeded 900-base random sequence.
fn protein_extended_benchmarks(c: &mut Criterion) {
    // Shared input builder: render the random sequence as ASCII bases
    // (anything other than A/C/G/T becomes `N`) and translate it.
    let translated_input = || {
        let ascii: Vec<u8> = random_dna(900, 42)
            .bases()
            .iter()
            .map(|base| match base {
                Nucleotide::A => b'A',
                Nucleotide::C => b'C',
                Nucleotide::G => b'G',
                Nucleotide::T => b'T',
                _ => b'N',
            })
            .collect();
        rvdna::translate_dna(&ascii)
    };

    let mut analysis = c.benchmark_group("protein_analysis");

    analysis.bench_function("molecular_weight_300aa", |bencher| {
        let protein = translated_input();
        bencher.iter(|| black_box(rvdna::molecular_weight(&protein)));
    });

    analysis.bench_function("isoelectric_point_300aa", |bencher| {
        let protein = translated_input();
        bencher.iter(|| black_box(rvdna::isoelectric_point(&protein)));
    });

    analysis.finish();
}

// ============================================================================
// Full Pipeline Benchmarks
// ============================================================================

/// End-to-end benchmark chaining the individual analysis stages.
///
/// Registers `pipeline/full_pipeline_1kb`. Each iteration, on a 1 000-base
/// reference and twenty 150-base reads:
/// 1. encodes the reference with `to_kmer_vector(11, 512)`,
/// 2. aligns every read with `align_with_attention`, keeping only the
///    successful alignments,
/// 3. calls a SNP on one fixed 10-read pileup, and
/// 4. translates the reference.
fn pipeline_benchmarks(c: &mut Criterion) {
    let mut pipeline = c.benchmark_group("pipeline");

    pipeline.bench_function("full_pipeline_1kb", |bencher| {
        let reference = random_dna(1_000, 42);
        let reads = random_sequences(20, 150, 43);
        let caller = VariantCaller::new(VariantCallerConfig::default());

        bencher.iter(|| {
            // Stage 1: k-mer encoding of the reference.
            let ref_vec = reference.to_kmer_vector(11, 512).unwrap();

            // Stage 2: align the reads; failed alignments are skipped.
            let alignments: Vec<_> = reads
                .iter()
                .filter_map(|read| read.align_with_attention(&reference).ok())
                .collect();

            // Stage 3: call a variant on one synthetic pileup column.
            let column = PileupColumn {
                bases: b"AGGGAGGAGG".to_vec(),
                qualities: vec![35; 10],
                position: 0,
                chromosome: 1,
            };
            let variants: Vec<_> = caller.call_snp(&column, b'A').into_iter().collect();

            // Stage 4: translate the reference to protein.
            let protein = reference.translate().unwrap();

            black_box((ref_vec, alignments, variants, protein))
        });
    });

    pipeline.finish();
}

// ============================================================================
// Helpers
// ============================================================================

/// Build a pseudo-random protein of `len` residues.
///
/// Residues are drawn from the twelve kinds listed below (`A` through `N`)
/// using a generator seeded with `seed`, so equal arguments reproduce the
/// same protein.
fn create_random_protein(len: usize, seed: u64) -> ProteinSequence {
    let alphabet = [
        ProteinResidue::A,
        ProteinResidue::C,
        ProteinResidue::D,
        ProteinResidue::E,
        ProteinResidue::F,
        ProteinResidue::G,
        ProteinResidue::H,
        ProteinResidue::I,
        ProteinResidue::K,
        ProteinResidue::L,
        ProteinResidue::M,
        ProteinResidue::N,
    ];
    let mut rng = StdRng::seed_from_u64(seed);

    let mut chain: Vec<ProteinResidue> = Vec::with_capacity(len);
    for _ in 0..len {
        chain.push(alphabet[rng.gen_range(0..alphabet.len())]);
    }

    ProteinSequence::new(chain)
}

// ============================================================================
// Criterion Configuration
// ============================================================================

// Collect every benchmark function defined in this file into the `benches`
// group; the functions are listed in the order they appear above.
criterion_group!(
    benches,
    kmer_benchmarks,
    alignment_benchmarks,
    variant_benchmarks,
    protein_benchmarks,
    rvdna_benchmarks,
    epigenomics_benchmarks,
    protein_extended_benchmarks,
    pipeline_benchmarks
);

// Generate the benchmark binary's `main`, which runs the `benches` group.
criterion_main!(benches);