Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'

This commit is contained in:
ruv
2026-02-28 14:39:40 -05:00
7854 changed files with 3522914 additions and 0 deletions

View File

@@ -0,0 +1,319 @@
//! Variant calling module for DNA analysis
//!
//! Provides SNP and indel calling from pileup data.
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
/// Pileup column representing reads aligned at a single position
#[derive(Debug, Clone)]
pub struct PileupColumn {
/// Observed bases from aligned reads
pub bases: Vec<u8>,
/// Quality scores for each base
pub qualities: Vec<u8>,
/// Genomic position
pub position: u64,
/// Chromosome number
pub chromosome: u8,
}
/// Genotype classification
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum Genotype {
/// Homozygous reference (0/0)
HomRef,
/// Heterozygous (0/1)
Het,
/// Homozygous alternate (1/1)
HomAlt,
}
/// Variant filter status
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum FilterStatus {
/// Passed all filters
Pass,
/// Failed quality filter
LowQuality,
/// Failed depth filter
LowDepth,
}
/// Called variant
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VariantCall {
/// Chromosome number
pub chromosome: u8,
/// Genomic position
pub position: u64,
/// Reference allele
pub ref_allele: u8,
/// Alternate allele
pub alt_allele: u8,
/// Variant quality (Phred-scaled)
pub quality: f64,
/// Genotype call
pub genotype: Genotype,
/// Total read depth
pub depth: usize,
/// Alternate allele depth
pub allele_depth: usize,
/// Filter status
pub filter_status: FilterStatus,
}
/// Variant caller configuration
#[derive(Debug, Clone)]
pub struct VariantCallerConfig {
/// Minimum base quality to consider
pub min_quality: u8,
/// Minimum read depth
pub min_depth: usize,
/// Minimum alternate allele frequency for heterozygous call
pub het_threshold: f64,
/// Minimum alternate allele frequency for homozygous alt call
pub hom_alt_threshold: f64,
}
impl Default for VariantCallerConfig {
fn default() -> Self {
Self {
min_quality: 20,
min_depth: 5,
het_threshold: 0.2,
hom_alt_threshold: 0.8,
}
}
}
/// Variant caller that processes pileup data to call SNPs
pub struct VariantCaller {
config: VariantCallerConfig,
}
impl VariantCaller {
/// Create a new variant caller with the given configuration
pub fn new(config: VariantCallerConfig) -> Self {
Self { config }
}
/// Call a SNP at a single pileup position
///
/// Returns `Some(VariantCall)` if a variant is detected, `None` if all reads
/// match the reference or depth is insufficient.
pub fn call_snp(&self, pileup: &PileupColumn, reference_base: u8) -> Option<VariantCall> {
let ref_base = reference_base.to_ascii_uppercase();
// Count alleles (only high-quality bases)
let mut allele_counts: HashMap<u8, usize> = HashMap::new();
for (i, &base) in pileup.bases.iter().enumerate() {
let qual = pileup.qualities.get(i).copied().unwrap_or(0);
if qual >= self.config.min_quality {
*allele_counts.entry(base.to_ascii_uppercase()).or_insert(0) += 1;
}
}
let total_depth: usize = allele_counts.values().sum();
if total_depth < self.config.min_depth {
return None;
}
// Find the most common non-reference allele
let mut best_alt: Option<(u8, usize)> = None;
for (&allele, &count) in &allele_counts {
if allele != ref_base {
if best_alt.map_or(true, |(_, best_count)| count > best_count) {
best_alt = Some((allele, count));
}
}
}
let (alt_allele, alt_count) = best_alt?;
let alt_freq = alt_count as f64 / total_depth as f64;
if alt_freq < self.config.het_threshold {
return None;
}
let genotype = if alt_freq >= self.config.hom_alt_threshold {
Genotype::HomAlt
} else {
Genotype::Het
};
// Phred-scaled quality estimate
let quality = -10.0 * (1.0 - alt_freq).max(1e-10).log10() * (alt_count as f64);
Some(VariantCall {
chromosome: pileup.chromosome,
position: pileup.position,
ref_allele: ref_base,
alt_allele,
quality,
genotype,
depth: total_depth,
allele_depth: alt_count,
filter_status: FilterStatus::Pass,
})
}
/// Detect insertions/deletions from pileup data
///
/// Looks for gaps (represented as b'-') in the pileup bases that indicate
/// indels relative to the reference.
pub fn call_indel(
&self,
pileup: &PileupColumn,
reference_base: u8,
next_ref_bases: &[u8],
) -> Option<VariantCall> {
let ref_base = reference_base.to_ascii_uppercase();
let mut del_count = 0usize;
let mut ins_count = 0usize;
for (i, &base) in pileup.bases.iter().enumerate() {
let qual = pileup.qualities.get(i).copied().unwrap_or(0);
if qual < self.config.min_quality {
continue;
}
if base == b'-' || base == b'*' {
del_count += 1;
} else if base == b'+' {
ins_count += 1;
}
}
let total = pileup.bases.len();
if total < self.config.min_depth {
return None;
}
// Check for deletion
if del_count > 0 {
let del_freq = del_count as f64 / total as f64;
if del_freq >= self.config.het_threshold {
let genotype = if del_freq >= self.config.hom_alt_threshold {
Genotype::HomAlt
} else {
Genotype::Het
};
let quality = -10.0 * (1.0 - del_freq).max(1e-10).log10() * (del_count as f64);
return Some(VariantCall {
chromosome: pileup.chromosome,
position: pileup.position,
ref_allele: ref_base,
alt_allele: b'-',
quality,
genotype,
depth: total,
allele_depth: del_count,
filter_status: FilterStatus::Pass,
});
}
}
// Check for insertion
if ins_count > 0 {
let ins_freq = ins_count as f64 / total as f64;
if ins_freq >= self.config.het_threshold {
let genotype = if ins_freq >= self.config.hom_alt_threshold {
Genotype::HomAlt
} else {
Genotype::Het
};
let quality = -10.0 * (1.0 - ins_freq).max(1e-10).log10() * (ins_count as f64);
return Some(VariantCall {
chromosome: pileup.chromosome,
position: pileup.position,
ref_allele: ref_base,
alt_allele: b'+',
quality,
genotype,
depth: total,
allele_depth: ins_count,
filter_status: FilterStatus::Pass,
});
}
}
None
}
/// Apply quality and depth filters to a list of variant calls
pub fn filter_variants(&self, calls: &mut [VariantCall]) {
for call in calls.iter_mut() {
if call.quality < self.config.min_quality as f64 {
call.filter_status = FilterStatus::LowQuality;
} else if call.depth < self.config.min_depth {
call.filter_status = FilterStatus::LowDepth;
}
}
}
/// Generate VCF-formatted output for variant calls
pub fn to_vcf(&self, calls: &[VariantCall], sample_name: &str) -> String {
let mut vcf = String::new();
vcf.push_str("##fileformat=VCFv4.3\n");
vcf.push_str(&format!("##source=RuVectorDNA\n"));
vcf.push_str("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t");
vcf.push_str(sample_name);
vcf.push('\n');
for call in calls {
let filter = match call.filter_status {
FilterStatus::Pass => "PASS",
FilterStatus::LowQuality => "LowQual",
FilterStatus::LowDepth => "LowDepth",
};
let gt = match call.genotype {
Genotype::HomRef => "0/0",
Genotype::Het => "0/1",
Genotype::HomAlt => "1/1",
};
vcf.push_str(&format!(
"chr{}\t{}\t.\t{}\t{}\t{:.1}\t{}\tDP={};AF={:.3}\tGT:DP:AD\t{}:{}:{}\n",
call.chromosome,
call.position,
call.ref_allele as char,
call.alt_allele as char,
call.quality,
filter,
call.depth,
call.allele_depth as f64 / call.depth as f64,
gt,
call.depth,
call.allele_depth,
));
}
vcf
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_variant_caller_creation() {
let config = VariantCallerConfig::default();
let _caller = VariantCaller::new(config);
}
#[test]
fn test_snp_calling() {
let caller = VariantCaller::new(VariantCallerConfig::default());
let pileup = PileupColumn {
bases: vec![b'G'; 15],
qualities: vec![40; 15],
position: 1000,
chromosome: 1,
};
let call = caller.call_snp(&pileup, b'A');
assert!(call.is_some());
let call = call.unwrap();
assert_eq!(call.genotype, Genotype::HomAlt);
}
}