Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'
This commit is contained in:
496
vendor/ruvector/examples/dna/src/pipeline.rs
vendored
Normal file
496
vendor/ruvector/examples/dna/src/pipeline.rs
vendored
Normal file
@@ -0,0 +1,496 @@
|
||||
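//! Genomic analysis pipeline orchestrator: DAG-style stages (k-mer analysis, variant
//! calling, protein analysis) that this module currently executes one after another.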
|
||||
|
||||
use crate::error::Result;
|
||||
use crate::types::{DnaSequence, KmerIndex, Nucleotide, ProteinResidue, ProteinSequence};
|
||||
use ruvector_core::types::{SearchQuery, VectorEntry};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
use std::time::Instant;
|
||||
|
||||
/// Pipeline configuration
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct PipelineConfig {
|
||||
/// K-mer size (default: 21)
|
||||
pub k: usize,
|
||||
/// Attention window size (default: 512)
|
||||
pub window_size: usize,
|
||||
/// Variant calling min depth (default: 10)
|
||||
pub min_depth: usize,
|
||||
/// Min variant quality (default: 20)
|
||||
pub min_quality: u8,
|
||||
}
|
||||
|
||||
impl Default for PipelineConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
k: 21,
|
||||
window_size: 512,
|
||||
min_depth: 10,
|
||||
min_quality: 20,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// K-mer analysis results
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct KmerAnalysisResult {
|
||||
/// Total k-mers extracted
|
||||
pub total_kmers: usize,
|
||||
/// Unique k-mers found
|
||||
pub unique_kmers: usize,
|
||||
/// GC content ratio
|
||||
pub gc_content: f64,
|
||||
/// Top similar sequences
|
||||
pub top_similar_sequences: Vec<SimilarSequence>,
|
||||
}
|
||||
|
||||
/// Similar sequence match
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct SimilarSequence {
|
||||
/// Sequence identifier
|
||||
pub id: String,
|
||||
/// Similarity score
|
||||
pub similarity: f32,
|
||||
/// Position in the index
|
||||
pub position: usize,
|
||||
}
|
||||
|
||||
/// Variant call result
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct VariantCall {
|
||||
/// Genomic position
|
||||
pub position: u64,
|
||||
/// Reference base
|
||||
pub reference: Nucleotide,
|
||||
/// Alternate base
|
||||
pub alternate: Nucleotide,
|
||||
/// Variant quality
|
||||
pub quality: u8,
|
||||
/// Read depth
|
||||
pub depth: usize,
|
||||
/// Allele frequency
|
||||
pub allele_frequency: f64,
|
||||
}
|
||||
|
||||
/// Pileup column for variant calling
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct PileupColumn {
|
||||
/// Genomic position
|
||||
pub position: u64,
|
||||
/// Reference base
|
||||
pub reference: Nucleotide,
|
||||
/// Observed bases
|
||||
pub bases: Vec<Nucleotide>,
|
||||
/// Quality scores
|
||||
pub qualities: Vec<u8>,
|
||||
}
|
||||
|
||||
/// Protein analysis results
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ProteinAnalysisResult {
|
||||
/// Amino acid sequence (single letter codes)
|
||||
pub sequence: String,
|
||||
/// Protein length
|
||||
pub length: usize,
|
||||
/// Predicted contacts as (i, j, score)
|
||||
pub predicted_contacts: Vec<(usize, usize, f32)>,
|
||||
/// Secondary structure prediction (H/E/C)
|
||||
pub secondary_structure: Vec<char>,
|
||||
}
|
||||
|
||||
/// Full pipeline analysis results
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct FullAnalysisResult {
|
||||
/// K-mer statistics
|
||||
pub kmer_stats: KmerAnalysisResult,
|
||||
/// Called variants
|
||||
pub variants: Vec<VariantCall>,
|
||||
/// Protein analysis results
|
||||
pub proteins: Vec<ProteinAnalysisResult>,
|
||||
/// Execution time in milliseconds
|
||||
pub execution_time_ms: u128,
|
||||
}
|
||||
|
||||
/// Genomic analysis pipeline orchestrator
|
||||
pub struct GenomicPipeline {
|
||||
config: PipelineConfig,
|
||||
}
|
||||
|
||||
impl GenomicPipeline {
|
||||
/// Create new pipeline with configuration
|
||||
pub fn new(config: PipelineConfig) -> Self {
|
||||
Self { config }
|
||||
}
|
||||
|
||||
/// Run k-mer analysis on sequences
|
||||
pub fn run_kmer_analysis(&self, sequences: &[(&str, &[u8])]) -> Result<KmerAnalysisResult> {
|
||||
let mut total_kmers = 0;
|
||||
let mut kmer_set = std::collections::HashSet::new();
|
||||
let mut gc_count = 0;
|
||||
let mut total_bases = 0;
|
||||
|
||||
// Create temporary k-mer index
|
||||
let index = KmerIndex::new(self.config.k, 384, ":memory:")?;
|
||||
|
||||
for (id, seq) in sequences {
|
||||
// Extract k-mers
|
||||
if seq.len() < self.config.k {
|
||||
continue;
|
||||
}
|
||||
|
||||
total_bases += seq.len();
|
||||
|
||||
for window in seq.windows(self.config.k) {
|
||||
total_kmers += 1;
|
||||
kmer_set.insert(window.to_vec());
|
||||
|
||||
// Count GC content
|
||||
for &base in window {
|
||||
if base == b'G' || base == b'C' {
|
||||
gc_count += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Convert sequence to vector and index
|
||||
let dna_seq = DnaSequence::from_str(&String::from_utf8_lossy(seq))?;
|
||||
|
||||
if let Ok(vector) = dna_seq.to_kmer_vector(self.config.k, 384) {
|
||||
let entry = VectorEntry {
|
||||
id: Some(id.to_string()),
|
||||
vector,
|
||||
metadata: None,
|
||||
};
|
||||
let _ = index.db().insert(entry);
|
||||
}
|
||||
}
|
||||
|
||||
let gc_content = if total_bases > 0 {
|
||||
(gc_count as f64) / (total_bases as f64)
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
// Find similar sequences using HNSW search
|
||||
let mut top_similar = Vec::new();
|
||||
if !sequences.is_empty() {
|
||||
if let Some((query_id, query_seq)) = sequences.first() {
|
||||
let dna_seq = DnaSequence::from_str(&String::from_utf8_lossy(query_seq))?;
|
||||
|
||||
if let Ok(query_vector) = dna_seq.to_kmer_vector(self.config.k, 384) {
|
||||
let search_query = SearchQuery {
|
||||
vector: query_vector,
|
||||
k: 5,
|
||||
filter: None,
|
||||
ef_search: None,
|
||||
};
|
||||
if let Ok(results) = index.db().search(search_query) {
|
||||
for result in results {
|
||||
if result.id != *query_id {
|
||||
top_similar.push(SimilarSequence {
|
||||
id: result.id.clone(),
|
||||
similarity: result.score,
|
||||
position: 0,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(KmerAnalysisResult {
|
||||
total_kmers,
|
||||
unique_kmers: kmer_set.len(),
|
||||
gc_content,
|
||||
top_similar_sequences: top_similar,
|
||||
})
|
||||
}
|
||||
|
||||
/// Run variant calling against reference
|
||||
pub fn run_variant_calling(
|
||||
&self,
|
||||
pileups: &[PileupColumn],
|
||||
_reference: &[u8],
|
||||
) -> Result<Vec<VariantCall>> {
|
||||
let mut variants = Vec::new();
|
||||
|
||||
for pileup in pileups {
|
||||
if pileup.bases.len() < self.config.min_depth {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Count allele frequencies
|
||||
let mut allele_counts: HashMap<Nucleotide, usize> = HashMap::new();
|
||||
for &base in &pileup.bases {
|
||||
*allele_counts.entry(base).or_insert(0) += 1;
|
||||
}
|
||||
|
||||
// Find most common alternate allele
|
||||
let _ref_count = allele_counts.get(&pileup.reference).copied().unwrap_or(0);
|
||||
|
||||
for (&allele, &count) in &allele_counts {
|
||||
if allele == pileup.reference || allele == Nucleotide::N {
|
||||
continue;
|
||||
}
|
||||
|
||||
let allele_freq = count as f64 / pileup.bases.len() as f64;
|
||||
|
||||
// Call variant if alternate allele frequency is significant
|
||||
if allele_freq > 0.2 && count >= 3 {
|
||||
// Calculate quality score from supporting reads
|
||||
let quality = pileup
|
||||
.qualities
|
||||
.iter()
|
||||
.take(count)
|
||||
.map(|&q| q as u16)
|
||||
.sum::<u16>()
|
||||
.min(255) as u8;
|
||||
|
||||
if quality >= self.config.min_quality {
|
||||
variants.push(VariantCall {
|
||||
position: pileup.position,
|
||||
reference: pileup.reference,
|
||||
alternate: allele,
|
||||
quality,
|
||||
depth: pileup.bases.len(),
|
||||
allele_frequency: allele_freq,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(variants)
|
||||
}
|
||||
|
||||
/// Translate DNA to protein and analyze structure
|
||||
pub fn run_protein_analysis(&self, dna: &[u8]) -> Result<ProteinAnalysisResult> {
|
||||
// Translate DNA to protein using standard genetic code
|
||||
let protein = self.translate_dna(dna)?;
|
||||
|
||||
// Predict contacts using heuristic scoring
|
||||
let contacts = self.predict_protein_contacts(&protein)?;
|
||||
|
||||
// Simple secondary structure prediction
|
||||
let secondary_structure = self.predict_secondary_structure(&protein);
|
||||
|
||||
Ok(ProteinAnalysisResult {
|
||||
sequence: protein.residues().iter().map(|r| r.to_char()).collect(),
|
||||
length: protein.len(),
|
||||
predicted_contacts: contacts,
|
||||
secondary_structure,
|
||||
})
|
||||
}
|
||||
|
||||
/// Run full analysis pipeline
|
||||
pub fn run_full_pipeline(
|
||||
&self,
|
||||
sequence: &[u8],
|
||||
reference: &[u8],
|
||||
) -> Result<FullAnalysisResult> {
|
||||
let start = Instant::now();
|
||||
|
||||
// Stage 1: K-mer analysis
|
||||
let kmer_stats =
|
||||
self.run_kmer_analysis(&[("query", sequence), ("reference", reference)])?;
|
||||
|
||||
// Stage 2: Variant calling - generate pileups from sequence
|
||||
let pileups = self.generate_pileups(sequence, reference)?;
|
||||
let variants = self.run_variant_calling(&pileups, reference)?;
|
||||
|
||||
// Stage 3: Protein analysis - find ORFs and translate
|
||||
let proteins = self.find_orfs_and_translate(sequence)?;
|
||||
|
||||
let execution_time_ms = start.elapsed().as_millis();
|
||||
|
||||
Ok(FullAnalysisResult {
|
||||
kmer_stats,
|
||||
variants,
|
||||
proteins,
|
||||
execution_time_ms,
|
||||
})
|
||||
}
|
||||
|
||||
// Helper methods
|
||||
|
||||
/// Translate DNA to protein
|
||||
fn translate_dna(&self, dna: &[u8]) -> Result<ProteinSequence> {
|
||||
let mut residues = Vec::new();
|
||||
|
||||
for codon in dna.chunks(3) {
|
||||
if codon.len() < 3 {
|
||||
break;
|
||||
}
|
||||
|
||||
let aa = self.codon_to_amino_acid(codon);
|
||||
if aa == ProteinResidue::X {
|
||||
break; // Stop codon
|
||||
}
|
||||
residues.push(aa);
|
||||
}
|
||||
|
||||
Ok(ProteinSequence::new(residues))
|
||||
}
|
||||
|
||||
/// Map codon to amino acid (simplified genetic code)
|
||||
fn codon_to_amino_acid(&self, codon: &[u8]) -> ProteinResidue {
|
||||
match codon {
|
||||
b"ATG" => ProteinResidue::M,
|
||||
b"TGG" => ProteinResidue::W,
|
||||
b"TTT" | b"TTC" => ProteinResidue::F,
|
||||
b"TTA" | b"TTG" | b"CTT" | b"CTC" | b"CTA" | b"CTG" => ProteinResidue::L,
|
||||
b"ATT" | b"ATC" | b"ATA" => ProteinResidue::I,
|
||||
b"GTT" | b"GTC" | b"GTA" | b"GTG" => ProteinResidue::V,
|
||||
b"TCT" | b"TCC" | b"TCA" | b"TCG" | b"AGT" | b"AGC" => ProteinResidue::S,
|
||||
b"CCT" | b"CCC" | b"CCA" | b"CCG" => ProteinResidue::P,
|
||||
b"ACT" | b"ACC" | b"ACA" | b"ACG" => ProteinResidue::T,
|
||||
b"GCT" | b"GCC" | b"GCA" | b"GCG" => ProteinResidue::A,
|
||||
b"TAT" | b"TAC" => ProteinResidue::Y,
|
||||
b"CAT" | b"CAC" => ProteinResidue::H,
|
||||
b"CAA" | b"CAG" => ProteinResidue::Q,
|
||||
b"AAT" | b"AAC" => ProteinResidue::N,
|
||||
b"AAA" | b"AAG" => ProteinResidue::K,
|
||||
b"GAT" | b"GAC" => ProteinResidue::D,
|
||||
b"GAA" | b"GAG" => ProteinResidue::E,
|
||||
b"TGT" | b"TGC" => ProteinResidue::C,
|
||||
b"CGT" | b"CGC" | b"CGA" | b"CGG" | b"AGA" | b"AGG" => ProteinResidue::R,
|
||||
b"GGT" | b"GGC" | b"GGA" | b"GGG" => ProteinResidue::G,
|
||||
_ => ProteinResidue::X, // Stop or unknown
|
||||
}
|
||||
}
|
||||
|
||||
/// Predict protein contacts using residue property heuristics
|
||||
fn predict_protein_contacts(
|
||||
&self,
|
||||
protein: &ProteinSequence,
|
||||
) -> Result<Vec<(usize, usize, f32)>> {
|
||||
let residues = protein.residues();
|
||||
let n = residues.len();
|
||||
|
||||
if n < 5 {
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
|
||||
// Compute residue feature scores
|
||||
let features: Vec<f32> = residues
|
||||
.iter()
|
||||
.map(|r| r.to_char() as u8 as f32 / 255.0)
|
||||
.collect();
|
||||
|
||||
// Predict contacts: pairs of residues >4 apart with similar features
|
||||
let mut contacts = Vec::new();
|
||||
for i in 0..n {
|
||||
for j in (i + 5)..n {
|
||||
let score = (features[i] + features[j]) / 2.0;
|
||||
if score > 0.5 {
|
||||
contacts.push((i, j, score));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
contacts.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap());
|
||||
contacts.truncate(10);
|
||||
Ok(contacts)
|
||||
}
|
||||
|
||||
/// Simple secondary structure prediction
|
||||
fn predict_secondary_structure(&self, protein: &ProteinSequence) -> Vec<char> {
|
||||
protein
|
||||
.residues()
|
||||
.iter()
|
||||
.map(|r| match r {
|
||||
ProteinResidue::A | ProteinResidue::E | ProteinResidue::L | ProteinResidue::M => {
|
||||
'H'
|
||||
}
|
||||
ProteinResidue::V | ProteinResidue::I | ProteinResidue::Y | ProteinResidue::F => {
|
||||
'E'
|
||||
}
|
||||
_ => 'C',
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Generate pileups from sequence alignment
|
||||
fn generate_pileups(&self, sequence: &[u8], reference: &[u8]) -> Result<Vec<PileupColumn>> {
|
||||
let mut pileups = Vec::new();
|
||||
let min_len = sequence.len().min(reference.len());
|
||||
|
||||
for i in 0..min_len {
|
||||
let ref_base = match reference[i] {
|
||||
b'A' => Nucleotide::A,
|
||||
b'C' => Nucleotide::C,
|
||||
b'G' => Nucleotide::G,
|
||||
b'T' => Nucleotide::T,
|
||||
_ => Nucleotide::N,
|
||||
};
|
||||
|
||||
let seq_base = match sequence[i] {
|
||||
b'A' => Nucleotide::A,
|
||||
b'C' => Nucleotide::C,
|
||||
b'G' => Nucleotide::G,
|
||||
b'T' => Nucleotide::T,
|
||||
_ => Nucleotide::N,
|
||||
};
|
||||
|
||||
// Simulate coverage depth
|
||||
let depth = 15 + (i % 10);
|
||||
let bases = vec![seq_base; depth];
|
||||
let qualities = vec![30; depth];
|
||||
|
||||
pileups.push(PileupColumn {
|
||||
position: i as u64,
|
||||
reference: ref_base,
|
||||
bases,
|
||||
qualities,
|
||||
});
|
||||
}
|
||||
|
||||
Ok(pileups)
|
||||
}
|
||||
|
||||
/// Find ORFs and translate to proteins
|
||||
fn find_orfs_and_translate(&self, sequence: &[u8]) -> Result<Vec<ProteinAnalysisResult>> {
|
||||
let mut proteins = Vec::new();
|
||||
|
||||
// Look for ATG start codons
|
||||
for i in 0..sequence.len().saturating_sub(30) {
|
||||
if sequence[i..].starts_with(b"ATG") {
|
||||
let orf = &sequence[i..];
|
||||
if let Ok(protein_result) = self.run_protein_analysis(orf) {
|
||||
if protein_result.length >= 10 {
|
||||
proteins.push(protein_result);
|
||||
if proteins.len() >= 3 {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(proteins)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// A pipeline keeps the configuration it was constructed with.
    #[test]
    fn test_pipeline_creation() {
        let pipeline = GenomicPipeline::new(PipelineConfig::default());
        assert_eq!(pipeline.config.k, 21);
    }

    /// K-mer analysis succeeds and reports the expected window counts.
    #[test]
    fn test_kmer_analysis() {
        let pipeline = GenomicPipeline::new(PipelineConfig::default());

        // 24 bases with k = 21 give 24 - 21 + 1 = 4 windows; the 4-periodic
        // sequence makes each of those windows start on a different base, so all
        // four are distinct.
        let bases: &[u8] = b"ACGTACGTACGTACGTACGTACGT";
        let sequences = [("seq1", bases)];

        let stats = match pipeline.run_kmer_analysis(&sequences) {
            Ok(stats) => stats,
            Err(_) => panic!("k-mer analysis should succeed on a valid sequence"),
        };

        assert_eq!(stats.total_kmers, 4);
        assert_eq!(stats.unique_kmers, 4);
    }
}
|
||||
Reference in New Issue
Block a user