//! DAG-based genomic analysis pipeline orchestrator use crate::error::Result; use crate::types::{DnaSequence, KmerIndex, Nucleotide, ProteinResidue, ProteinSequence}; use ruvector_core::types::{SearchQuery, VectorEntry}; use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::time::Instant; /// Pipeline configuration #[derive(Debug, Clone, Serialize, Deserialize)] pub struct PipelineConfig { /// K-mer size (default: 21) pub k: usize, /// Attention window size (default: 512) pub window_size: usize, /// Variant calling min depth (default: 10) pub min_depth: usize, /// Min variant quality (default: 20) pub min_quality: u8, } impl Default for PipelineConfig { fn default() -> Self { Self { k: 21, window_size: 512, min_depth: 10, min_quality: 20, } } } /// K-mer analysis results #[derive(Debug, Clone, Serialize, Deserialize)] pub struct KmerAnalysisResult { /// Total k-mers extracted pub total_kmers: usize, /// Unique k-mers found pub unique_kmers: usize, /// GC content ratio pub gc_content: f64, /// Top similar sequences pub top_similar_sequences: Vec, } /// Similar sequence match #[derive(Debug, Clone, Serialize, Deserialize)] pub struct SimilarSequence { /// Sequence identifier pub id: String, /// Similarity score pub similarity: f32, /// Position in the index pub position: usize, } /// Variant call result #[derive(Debug, Clone, Serialize, Deserialize)] pub struct VariantCall { /// Genomic position pub position: u64, /// Reference base pub reference: Nucleotide, /// Alternate base pub alternate: Nucleotide, /// Variant quality pub quality: u8, /// Read depth pub depth: usize, /// Allele frequency pub allele_frequency: f64, } /// Pileup column for variant calling #[derive(Debug, Clone)] pub struct PileupColumn { /// Genomic position pub position: u64, /// Reference base pub reference: Nucleotide, /// Observed bases pub bases: Vec, /// Quality scores pub qualities: Vec, } /// Protein analysis results #[derive(Debug, Clone, Serialize, Deserialize)] pub struct ProteinAnalysisResult { /// Amino acid sequence (single letter codes) pub sequence: String, /// Protein length pub length: usize, /// Predicted contacts as (i, j, score) pub predicted_contacts: Vec<(usize, usize, f32)>, /// Secondary structure prediction (H/E/C) pub secondary_structure: Vec, } /// Full pipeline analysis results #[derive(Debug, Clone, Serialize, Deserialize)] pub struct FullAnalysisResult { /// K-mer statistics pub kmer_stats: KmerAnalysisResult, /// Called variants pub variants: Vec, /// Protein analysis results pub proteins: Vec, /// Execution time in milliseconds pub execution_time_ms: u128, } /// Genomic analysis pipeline orchestrator pub struct GenomicPipeline { config: PipelineConfig, } impl GenomicPipeline { /// Create new pipeline with configuration pub fn new(config: PipelineConfig) -> Self { Self { config } } /// Run k-mer analysis on sequences pub fn run_kmer_analysis(&self, sequences: &[(&str, &[u8])]) -> Result { let mut total_kmers = 0; let mut kmer_set = std::collections::HashSet::new(); let mut gc_count = 0; let mut total_bases = 0; // Create temporary k-mer index let index = KmerIndex::new(self.config.k, 384, ":memory:")?; for (id, seq) in sequences { // Extract k-mers if seq.len() < self.config.k { continue; } total_bases += seq.len(); for window in seq.windows(self.config.k) { total_kmers += 1; kmer_set.insert(window.to_vec()); // Count GC content for &base in window { if base == b'G' || base == b'C' { gc_count += 1; } } } // Convert sequence to vector and index let dna_seq = DnaSequence::from_str(&String::from_utf8_lossy(seq))?; if let Ok(vector) = dna_seq.to_kmer_vector(self.config.k, 384) { let entry = VectorEntry { id: Some(id.to_string()), vector, metadata: None, }; let _ = index.db().insert(entry); } } let gc_content = if total_bases > 0 { (gc_count as f64) / (total_bases as f64) } else { 0.0 }; // Find similar sequences using HNSW search let mut top_similar = Vec::new(); if !sequences.is_empty() { if let Some((query_id, query_seq)) = sequences.first() { let dna_seq = DnaSequence::from_str(&String::from_utf8_lossy(query_seq))?; if let Ok(query_vector) = dna_seq.to_kmer_vector(self.config.k, 384) { let search_query = SearchQuery { vector: query_vector, k: 5, filter: None, ef_search: None, }; if let Ok(results) = index.db().search(search_query) { for result in results { if result.id != *query_id { top_similar.push(SimilarSequence { id: result.id.clone(), similarity: result.score, position: 0, }); } } } } } } Ok(KmerAnalysisResult { total_kmers, unique_kmers: kmer_set.len(), gc_content, top_similar_sequences: top_similar, }) } /// Run variant calling against reference pub fn run_variant_calling( &self, pileups: &[PileupColumn], _reference: &[u8], ) -> Result> { let mut variants = Vec::new(); for pileup in pileups { if pileup.bases.len() < self.config.min_depth { continue; } // Count allele frequencies let mut allele_counts: HashMap = HashMap::new(); for &base in &pileup.bases { *allele_counts.entry(base).or_insert(0) += 1; } // Find most common alternate allele let _ref_count = allele_counts.get(&pileup.reference).copied().unwrap_or(0); for (&allele, &count) in &allele_counts { if allele == pileup.reference || allele == Nucleotide::N { continue; } let allele_freq = count as f64 / pileup.bases.len() as f64; // Call variant if alternate allele frequency is significant if allele_freq > 0.2 && count >= 3 { // Calculate quality score from supporting reads let quality = pileup .qualities .iter() .take(count) .map(|&q| q as u16) .sum::() .min(255) as u8; if quality >= self.config.min_quality { variants.push(VariantCall { position: pileup.position, reference: pileup.reference, alternate: allele, quality, depth: pileup.bases.len(), allele_frequency: allele_freq, }); } } } } Ok(variants) } /// Translate DNA to protein and analyze structure pub fn run_protein_analysis(&self, dna: &[u8]) -> Result { // Translate DNA to protein using standard genetic code let protein = self.translate_dna(dna)?; // Predict contacts using heuristic scoring let contacts = self.predict_protein_contacts(&protein)?; // Simple secondary structure prediction let secondary_structure = self.predict_secondary_structure(&protein); Ok(ProteinAnalysisResult { sequence: protein.residues().iter().map(|r| r.to_char()).collect(), length: protein.len(), predicted_contacts: contacts, secondary_structure, }) } /// Run full analysis pipeline pub fn run_full_pipeline( &self, sequence: &[u8], reference: &[u8], ) -> Result { let start = Instant::now(); // Stage 1: K-mer analysis let kmer_stats = self.run_kmer_analysis(&[("query", sequence), ("reference", reference)])?; // Stage 2: Variant calling - generate pileups from sequence let pileups = self.generate_pileups(sequence, reference)?; let variants = self.run_variant_calling(&pileups, reference)?; // Stage 3: Protein analysis - find ORFs and translate let proteins = self.find_orfs_and_translate(sequence)?; let execution_time_ms = start.elapsed().as_millis(); Ok(FullAnalysisResult { kmer_stats, variants, proteins, execution_time_ms, }) } // Helper methods /// Translate DNA to protein fn translate_dna(&self, dna: &[u8]) -> Result { let mut residues = Vec::new(); for codon in dna.chunks(3) { if codon.len() < 3 { break; } let aa = self.codon_to_amino_acid(codon); if aa == ProteinResidue::X { break; // Stop codon } residues.push(aa); } Ok(ProteinSequence::new(residues)) } /// Map codon to amino acid (simplified genetic code) fn codon_to_amino_acid(&self, codon: &[u8]) -> ProteinResidue { match codon { b"ATG" => ProteinResidue::M, b"TGG" => ProteinResidue::W, b"TTT" | b"TTC" => ProteinResidue::F, b"TTA" | b"TTG" | b"CTT" | b"CTC" | b"CTA" | b"CTG" => ProteinResidue::L, b"ATT" | b"ATC" | b"ATA" => ProteinResidue::I, b"GTT" | b"GTC" | b"GTA" | b"GTG" => ProteinResidue::V, b"TCT" | b"TCC" | b"TCA" | b"TCG" | b"AGT" | b"AGC" => ProteinResidue::S, b"CCT" | b"CCC" | b"CCA" | b"CCG" => ProteinResidue::P, b"ACT" | b"ACC" | b"ACA" | b"ACG" => ProteinResidue::T, b"GCT" | b"GCC" | b"GCA" | b"GCG" => ProteinResidue::A, b"TAT" | b"TAC" => ProteinResidue::Y, b"CAT" | b"CAC" => ProteinResidue::H, b"CAA" | b"CAG" => ProteinResidue::Q, b"AAT" | b"AAC" => ProteinResidue::N, b"AAA" | b"AAG" => ProteinResidue::K, b"GAT" | b"GAC" => ProteinResidue::D, b"GAA" | b"GAG" => ProteinResidue::E, b"TGT" | b"TGC" => ProteinResidue::C, b"CGT" | b"CGC" | b"CGA" | b"CGG" | b"AGA" | b"AGG" => ProteinResidue::R, b"GGT" | b"GGC" | b"GGA" | b"GGG" => ProteinResidue::G, _ => ProteinResidue::X, // Stop or unknown } } /// Predict protein contacts using residue property heuristics fn predict_protein_contacts( &self, protein: &ProteinSequence, ) -> Result> { let residues = protein.residues(); let n = residues.len(); if n < 5 { return Ok(Vec::new()); } // Compute residue feature scores let features: Vec = residues .iter() .map(|r| r.to_char() as u8 as f32 / 255.0) .collect(); // Predict contacts: pairs of residues >4 apart with similar features let mut contacts = Vec::new(); for i in 0..n { for j in (i + 5)..n { let score = (features[i] + features[j]) / 2.0; if score > 0.5 { contacts.push((i, j, score)); } } } contacts.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap()); contacts.truncate(10); Ok(contacts) } /// Simple secondary structure prediction fn predict_secondary_structure(&self, protein: &ProteinSequence) -> Vec { protein .residues() .iter() .map(|r| match r { ProteinResidue::A | ProteinResidue::E | ProteinResidue::L | ProteinResidue::M => { 'H' } ProteinResidue::V | ProteinResidue::I | ProteinResidue::Y | ProteinResidue::F => { 'E' } _ => 'C', }) .collect() } /// Generate pileups from sequence alignment fn generate_pileups(&self, sequence: &[u8], reference: &[u8]) -> Result> { let mut pileups = Vec::new(); let min_len = sequence.len().min(reference.len()); for i in 0..min_len { let ref_base = match reference[i] { b'A' => Nucleotide::A, b'C' => Nucleotide::C, b'G' => Nucleotide::G, b'T' => Nucleotide::T, _ => Nucleotide::N, }; let seq_base = match sequence[i] { b'A' => Nucleotide::A, b'C' => Nucleotide::C, b'G' => Nucleotide::G, b'T' => Nucleotide::T, _ => Nucleotide::N, }; // Simulate coverage depth let depth = 15 + (i % 10); let bases = vec![seq_base; depth]; let qualities = vec![30; depth]; pileups.push(PileupColumn { position: i as u64, reference: ref_base, bases, qualities, }); } Ok(pileups) } /// Find ORFs and translate to proteins fn find_orfs_and_translate(&self, sequence: &[u8]) -> Result> { let mut proteins = Vec::new(); // Look for ATG start codons for i in 0..sequence.len().saturating_sub(30) { if sequence[i..].starts_with(b"ATG") { let orf = &sequence[i..]; if let Ok(protein_result) = self.run_protein_analysis(orf) { if protein_result.length >= 10 { proteins.push(protein_result); if proteins.len() >= 3 { break; } } } } } Ok(proteins) } } #[cfg(test)] mod tests { use super::*; #[test] fn test_pipeline_creation() { let config = PipelineConfig::default(); let pipeline = GenomicPipeline::new(config); assert_eq!(pipeline.config.k, 21); } #[test] fn test_kmer_analysis() { let config = PipelineConfig::default(); let pipeline = GenomicPipeline::new(config); let sequences = vec![("seq1", b"ACGTACGTACGTACGTACGTACGT".as_ref())]; let result = pipeline.run_kmer_analysis(&sequences); assert!(result.is_ok()); } }