Files
wifi-densepose/vendor/ruvector/examples/dna/src/pipeline.rs

497 lines
16 KiB
Rust

//! DAG-based genomic analysis pipeline orchestrator
use crate::error::Result;
use crate::types::{DnaSequence, KmerIndex, Nucleotide, ProteinResidue, ProteinSequence};
use ruvector_core::types::{SearchQuery, VectorEntry};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::time::Instant;
/// Pipeline configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PipelineConfig {
/// K-mer size (default: 21)
pub k: usize,
/// Attention window size (default: 512)
pub window_size: usize,
/// Variant calling min depth (default: 10)
pub min_depth: usize,
/// Min variant quality (default: 20)
pub min_quality: u8,
}
impl Default for PipelineConfig {
fn default() -> Self {
Self {
k: 21,
window_size: 512,
min_depth: 10,
min_quality: 20,
}
}
}
/// K-mer analysis results
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KmerAnalysisResult {
/// Total k-mers extracted
pub total_kmers: usize,
/// Unique k-mers found
pub unique_kmers: usize,
/// GC content ratio
pub gc_content: f64,
/// Top similar sequences
pub top_similar_sequences: Vec<SimilarSequence>,
}
/// Similar sequence match
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SimilarSequence {
/// Sequence identifier
pub id: String,
/// Similarity score
pub similarity: f32,
/// Position in the index
pub position: usize,
}
/// Variant call result
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VariantCall {
/// Genomic position
pub position: u64,
/// Reference base
pub reference: Nucleotide,
/// Alternate base
pub alternate: Nucleotide,
/// Variant quality
pub quality: u8,
/// Read depth
pub depth: usize,
/// Allele frequency
pub allele_frequency: f64,
}
/// Pileup column for variant calling
#[derive(Debug, Clone)]
pub struct PileupColumn {
/// Genomic position
pub position: u64,
/// Reference base
pub reference: Nucleotide,
/// Observed bases
pub bases: Vec<Nucleotide>,
/// Quality scores
pub qualities: Vec<u8>,
}
/// Protein analysis results
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProteinAnalysisResult {
/// Amino acid sequence (single letter codes)
pub sequence: String,
/// Protein length
pub length: usize,
/// Predicted contacts as (i, j, score)
pub predicted_contacts: Vec<(usize, usize, f32)>,
/// Secondary structure prediction (H/E/C)
pub secondary_structure: Vec<char>,
}
/// Full pipeline analysis results
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FullAnalysisResult {
/// K-mer statistics
pub kmer_stats: KmerAnalysisResult,
/// Called variants
pub variants: Vec<VariantCall>,
/// Protein analysis results
pub proteins: Vec<ProteinAnalysisResult>,
/// Execution time in milliseconds
pub execution_time_ms: u128,
}
/// Genomic analysis pipeline orchestrator
pub struct GenomicPipeline {
config: PipelineConfig,
}
impl GenomicPipeline {
/// Create new pipeline with configuration
pub fn new(config: PipelineConfig) -> Self {
Self { config }
}
/// Run k-mer analysis on sequences
pub fn run_kmer_analysis(&self, sequences: &[(&str, &[u8])]) -> Result<KmerAnalysisResult> {
let mut total_kmers = 0;
let mut kmer_set = std::collections::HashSet::new();
let mut gc_count = 0;
let mut total_bases = 0;
// Create temporary k-mer index
let index = KmerIndex::new(self.config.k, 384, ":memory:")?;
for (id, seq) in sequences {
// Extract k-mers
if seq.len() < self.config.k {
continue;
}
total_bases += seq.len();
for window in seq.windows(self.config.k) {
total_kmers += 1;
kmer_set.insert(window.to_vec());
// Count GC content
for &base in window {
if base == b'G' || base == b'C' {
gc_count += 1;
}
}
}
// Convert sequence to vector and index
let dna_seq = DnaSequence::from_str(&String::from_utf8_lossy(seq))?;
if let Ok(vector) = dna_seq.to_kmer_vector(self.config.k, 384) {
let entry = VectorEntry {
id: Some(id.to_string()),
vector,
metadata: None,
};
let _ = index.db().insert(entry);
}
}
let gc_content = if total_bases > 0 {
(gc_count as f64) / (total_bases as f64)
} else {
0.0
};
// Find similar sequences using HNSW search
let mut top_similar = Vec::new();
if !sequences.is_empty() {
if let Some((query_id, query_seq)) = sequences.first() {
let dna_seq = DnaSequence::from_str(&String::from_utf8_lossy(query_seq))?;
if let Ok(query_vector) = dna_seq.to_kmer_vector(self.config.k, 384) {
let search_query = SearchQuery {
vector: query_vector,
k: 5,
filter: None,
ef_search: None,
};
if let Ok(results) = index.db().search(search_query) {
for result in results {
if result.id != *query_id {
top_similar.push(SimilarSequence {
id: result.id.clone(),
similarity: result.score,
position: 0,
});
}
}
}
}
}
}
Ok(KmerAnalysisResult {
total_kmers,
unique_kmers: kmer_set.len(),
gc_content,
top_similar_sequences: top_similar,
})
}
/// Run variant calling against reference
pub fn run_variant_calling(
&self,
pileups: &[PileupColumn],
_reference: &[u8],
) -> Result<Vec<VariantCall>> {
let mut variants = Vec::new();
for pileup in pileups {
if pileup.bases.len() < self.config.min_depth {
continue;
}
// Count allele frequencies
let mut allele_counts: HashMap<Nucleotide, usize> = HashMap::new();
for &base in &pileup.bases {
*allele_counts.entry(base).or_insert(0) += 1;
}
// Find most common alternate allele
let _ref_count = allele_counts.get(&pileup.reference).copied().unwrap_or(0);
for (&allele, &count) in &allele_counts {
if allele == pileup.reference || allele == Nucleotide::N {
continue;
}
let allele_freq = count as f64 / pileup.bases.len() as f64;
// Call variant if alternate allele frequency is significant
if allele_freq > 0.2 && count >= 3 {
// Calculate quality score from supporting reads
let quality = pileup
.qualities
.iter()
.take(count)
.map(|&q| q as u16)
.sum::<u16>()
.min(255) as u8;
if quality >= self.config.min_quality {
variants.push(VariantCall {
position: pileup.position,
reference: pileup.reference,
alternate: allele,
quality,
depth: pileup.bases.len(),
allele_frequency: allele_freq,
});
}
}
}
}
Ok(variants)
}
/// Translate DNA to protein and analyze structure
pub fn run_protein_analysis(&self, dna: &[u8]) -> Result<ProteinAnalysisResult> {
// Translate DNA to protein using standard genetic code
let protein = self.translate_dna(dna)?;
// Predict contacts using heuristic scoring
let contacts = self.predict_protein_contacts(&protein)?;
// Simple secondary structure prediction
let secondary_structure = self.predict_secondary_structure(&protein);
Ok(ProteinAnalysisResult {
sequence: protein.residues().iter().map(|r| r.to_char()).collect(),
length: protein.len(),
predicted_contacts: contacts,
secondary_structure,
})
}
/// Run full analysis pipeline
pub fn run_full_pipeline(
&self,
sequence: &[u8],
reference: &[u8],
) -> Result<FullAnalysisResult> {
let start = Instant::now();
// Stage 1: K-mer analysis
let kmer_stats =
self.run_kmer_analysis(&[("query", sequence), ("reference", reference)])?;
// Stage 2: Variant calling - generate pileups from sequence
let pileups = self.generate_pileups(sequence, reference)?;
let variants = self.run_variant_calling(&pileups, reference)?;
// Stage 3: Protein analysis - find ORFs and translate
let proteins = self.find_orfs_and_translate(sequence)?;
let execution_time_ms = start.elapsed().as_millis();
Ok(FullAnalysisResult {
kmer_stats,
variants,
proteins,
execution_time_ms,
})
}
// Helper methods
/// Translate DNA to protein
fn translate_dna(&self, dna: &[u8]) -> Result<ProteinSequence> {
let mut residues = Vec::new();
for codon in dna.chunks(3) {
if codon.len() < 3 {
break;
}
let aa = self.codon_to_amino_acid(codon);
if aa == ProteinResidue::X {
break; // Stop codon
}
residues.push(aa);
}
Ok(ProteinSequence::new(residues))
}
/// Map codon to amino acid (simplified genetic code)
fn codon_to_amino_acid(&self, codon: &[u8]) -> ProteinResidue {
match codon {
b"ATG" => ProteinResidue::M,
b"TGG" => ProteinResidue::W,
b"TTT" | b"TTC" => ProteinResidue::F,
b"TTA" | b"TTG" | b"CTT" | b"CTC" | b"CTA" | b"CTG" => ProteinResidue::L,
b"ATT" | b"ATC" | b"ATA" => ProteinResidue::I,
b"GTT" | b"GTC" | b"GTA" | b"GTG" => ProteinResidue::V,
b"TCT" | b"TCC" | b"TCA" | b"TCG" | b"AGT" | b"AGC" => ProteinResidue::S,
b"CCT" | b"CCC" | b"CCA" | b"CCG" => ProteinResidue::P,
b"ACT" | b"ACC" | b"ACA" | b"ACG" => ProteinResidue::T,
b"GCT" | b"GCC" | b"GCA" | b"GCG" => ProteinResidue::A,
b"TAT" | b"TAC" => ProteinResidue::Y,
b"CAT" | b"CAC" => ProteinResidue::H,
b"CAA" | b"CAG" => ProteinResidue::Q,
b"AAT" | b"AAC" => ProteinResidue::N,
b"AAA" | b"AAG" => ProteinResidue::K,
b"GAT" | b"GAC" => ProteinResidue::D,
b"GAA" | b"GAG" => ProteinResidue::E,
b"TGT" | b"TGC" => ProteinResidue::C,
b"CGT" | b"CGC" | b"CGA" | b"CGG" | b"AGA" | b"AGG" => ProteinResidue::R,
b"GGT" | b"GGC" | b"GGA" | b"GGG" => ProteinResidue::G,
_ => ProteinResidue::X, // Stop or unknown
}
}
/// Predict protein contacts using residue property heuristics
fn predict_protein_contacts(
&self,
protein: &ProteinSequence,
) -> Result<Vec<(usize, usize, f32)>> {
let residues = protein.residues();
let n = residues.len();
if n < 5 {
return Ok(Vec::new());
}
// Compute residue feature scores
let features: Vec<f32> = residues
.iter()
.map(|r| r.to_char() as u8 as f32 / 255.0)
.collect();
// Predict contacts: pairs of residues >4 apart with similar features
let mut contacts = Vec::new();
for i in 0..n {
for j in (i + 5)..n {
let score = (features[i] + features[j]) / 2.0;
if score > 0.5 {
contacts.push((i, j, score));
}
}
}
contacts.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap());
contacts.truncate(10);
Ok(contacts)
}
/// Simple secondary structure prediction
fn predict_secondary_structure(&self, protein: &ProteinSequence) -> Vec<char> {
protein
.residues()
.iter()
.map(|r| match r {
ProteinResidue::A | ProteinResidue::E | ProteinResidue::L | ProteinResidue::M => {
'H'
}
ProteinResidue::V | ProteinResidue::I | ProteinResidue::Y | ProteinResidue::F => {
'E'
}
_ => 'C',
})
.collect()
}
/// Generate pileups from sequence alignment
fn generate_pileups(&self, sequence: &[u8], reference: &[u8]) -> Result<Vec<PileupColumn>> {
let mut pileups = Vec::new();
let min_len = sequence.len().min(reference.len());
for i in 0..min_len {
let ref_base = match reference[i] {
b'A' => Nucleotide::A,
b'C' => Nucleotide::C,
b'G' => Nucleotide::G,
b'T' => Nucleotide::T,
_ => Nucleotide::N,
};
let seq_base = match sequence[i] {
b'A' => Nucleotide::A,
b'C' => Nucleotide::C,
b'G' => Nucleotide::G,
b'T' => Nucleotide::T,
_ => Nucleotide::N,
};
// Simulate coverage depth
let depth = 15 + (i % 10);
let bases = vec![seq_base; depth];
let qualities = vec![30; depth];
pileups.push(PileupColumn {
position: i as u64,
reference: ref_base,
bases,
qualities,
});
}
Ok(pileups)
}
/// Find ORFs and translate to proteins
fn find_orfs_and_translate(&self, sequence: &[u8]) -> Result<Vec<ProteinAnalysisResult>> {
let mut proteins = Vec::new();
// Look for ATG start codons
for i in 0..sequence.len().saturating_sub(30) {
if sequence[i..].starts_with(b"ATG") {
let orf = &sequence[i..];
if let Ok(protein_result) = self.run_protein_analysis(orf) {
if protein_result.length >= 10 {
proteins.push(protein_result);
if proteins.len() >= 3 {
break;
}
}
}
}
}
Ok(proteins)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_pipeline_creation() {
let config = PipelineConfig::default();
let pipeline = GenomicPipeline::new(config);
assert_eq!(pipeline.config.k, 21);
}
#[test]
fn test_kmer_analysis() {
let config = PipelineConfig::default();
let pipeline = GenomicPipeline::new(config);
let sequences = vec![("seq1", b"ACGTACGTACGTACGTACGTACGT".as_ref())];
let result = pipeline.run_kmer_analysis(&sequences);
assert!(result.is_ok());
}
}