Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'
This commit is contained in:
319
vendor/ruvector/examples/dna/src/variant.rs
vendored
Normal file
319
vendor/ruvector/examples/dna/src/variant.rs
vendored
Normal file
@@ -0,0 +1,319 @@
|
||||
//! Variant calling module for DNA analysis
|
||||
//!
|
||||
//! Provides SNP and indel calling from pileup data.
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
|
||||
/// Pileup column representing reads aligned at a single position
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct PileupColumn {
|
||||
/// Observed bases from aligned reads
|
||||
pub bases: Vec<u8>,
|
||||
/// Quality scores for each base
|
||||
pub qualities: Vec<u8>,
|
||||
/// Genomic position
|
||||
pub position: u64,
|
||||
/// Chromosome number
|
||||
pub chromosome: u8,
|
||||
}
|
||||
|
||||
/// Genotype classification
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub enum Genotype {
|
||||
/// Homozygous reference (0/0)
|
||||
HomRef,
|
||||
/// Heterozygous (0/1)
|
||||
Het,
|
||||
/// Homozygous alternate (1/1)
|
||||
HomAlt,
|
||||
}
|
||||
|
||||
/// Variant filter status
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub enum FilterStatus {
|
||||
/// Passed all filters
|
||||
Pass,
|
||||
/// Failed quality filter
|
||||
LowQuality,
|
||||
/// Failed depth filter
|
||||
LowDepth,
|
||||
}
|
||||
|
||||
/// Called variant
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct VariantCall {
|
||||
/// Chromosome number
|
||||
pub chromosome: u8,
|
||||
/// Genomic position
|
||||
pub position: u64,
|
||||
/// Reference allele
|
||||
pub ref_allele: u8,
|
||||
/// Alternate allele
|
||||
pub alt_allele: u8,
|
||||
/// Variant quality (Phred-scaled)
|
||||
pub quality: f64,
|
||||
/// Genotype call
|
||||
pub genotype: Genotype,
|
||||
/// Total read depth
|
||||
pub depth: usize,
|
||||
/// Alternate allele depth
|
||||
pub allele_depth: usize,
|
||||
/// Filter status
|
||||
pub filter_status: FilterStatus,
|
||||
}
|
||||
|
||||
/// Variant caller configuration
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct VariantCallerConfig {
|
||||
/// Minimum base quality to consider
|
||||
pub min_quality: u8,
|
||||
/// Minimum read depth
|
||||
pub min_depth: usize,
|
||||
/// Minimum alternate allele frequency for heterozygous call
|
||||
pub het_threshold: f64,
|
||||
/// Minimum alternate allele frequency for homozygous alt call
|
||||
pub hom_alt_threshold: f64,
|
||||
}
|
||||
|
||||
impl Default for VariantCallerConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
min_quality: 20,
|
||||
min_depth: 5,
|
||||
het_threshold: 0.2,
|
||||
hom_alt_threshold: 0.8,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Variant caller that processes pileup data to call SNPs
|
||||
pub struct VariantCaller {
|
||||
config: VariantCallerConfig,
|
||||
}
|
||||
|
||||
impl VariantCaller {
|
||||
/// Create a new variant caller with the given configuration
|
||||
pub fn new(config: VariantCallerConfig) -> Self {
|
||||
Self { config }
|
||||
}
|
||||
|
||||
/// Call a SNP at a single pileup position
|
||||
///
|
||||
/// Returns `Some(VariantCall)` if a variant is detected, `None` if all reads
|
||||
/// match the reference or depth is insufficient.
|
||||
pub fn call_snp(&self, pileup: &PileupColumn, reference_base: u8) -> Option<VariantCall> {
|
||||
let ref_base = reference_base.to_ascii_uppercase();
|
||||
|
||||
// Count alleles (only high-quality bases)
|
||||
let mut allele_counts: HashMap<u8, usize> = HashMap::new();
|
||||
for (i, &base) in pileup.bases.iter().enumerate() {
|
||||
let qual = pileup.qualities.get(i).copied().unwrap_or(0);
|
||||
if qual >= self.config.min_quality {
|
||||
*allele_counts.entry(base.to_ascii_uppercase()).or_insert(0) += 1;
|
||||
}
|
||||
}
|
||||
|
||||
let total_depth: usize = allele_counts.values().sum();
|
||||
if total_depth < self.config.min_depth {
|
||||
return None;
|
||||
}
|
||||
|
||||
// Find the most common non-reference allele
|
||||
let mut best_alt: Option<(u8, usize)> = None;
|
||||
for (&allele, &count) in &allele_counts {
|
||||
if allele != ref_base {
|
||||
if best_alt.map_or(true, |(_, best_count)| count > best_count) {
|
||||
best_alt = Some((allele, count));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let (alt_allele, alt_count) = best_alt?;
|
||||
let alt_freq = alt_count as f64 / total_depth as f64;
|
||||
|
||||
if alt_freq < self.config.het_threshold {
|
||||
return None;
|
||||
}
|
||||
|
||||
let genotype = if alt_freq >= self.config.hom_alt_threshold {
|
||||
Genotype::HomAlt
|
||||
} else {
|
||||
Genotype::Het
|
||||
};
|
||||
|
||||
// Phred-scaled quality estimate
|
||||
let quality = -10.0 * (1.0 - alt_freq).max(1e-10).log10() * (alt_count as f64);
|
||||
|
||||
Some(VariantCall {
|
||||
chromosome: pileup.chromosome,
|
||||
position: pileup.position,
|
||||
ref_allele: ref_base,
|
||||
alt_allele,
|
||||
quality,
|
||||
genotype,
|
||||
depth: total_depth,
|
||||
allele_depth: alt_count,
|
||||
filter_status: FilterStatus::Pass,
|
||||
})
|
||||
}
|
||||
|
||||
/// Detect insertions/deletions from pileup data
|
||||
///
|
||||
/// Looks for gaps (represented as b'-') in the pileup bases that indicate
|
||||
/// indels relative to the reference.
|
||||
pub fn call_indel(
|
||||
&self,
|
||||
pileup: &PileupColumn,
|
||||
reference_base: u8,
|
||||
next_ref_bases: &[u8],
|
||||
) -> Option<VariantCall> {
|
||||
let ref_base = reference_base.to_ascii_uppercase();
|
||||
let mut del_count = 0usize;
|
||||
let mut ins_count = 0usize;
|
||||
|
||||
for (i, &base) in pileup.bases.iter().enumerate() {
|
||||
let qual = pileup.qualities.get(i).copied().unwrap_or(0);
|
||||
if qual < self.config.min_quality {
|
||||
continue;
|
||||
}
|
||||
if base == b'-' || base == b'*' {
|
||||
del_count += 1;
|
||||
} else if base == b'+' {
|
||||
ins_count += 1;
|
||||
}
|
||||
}
|
||||
|
||||
let total = pileup.bases.len();
|
||||
if total < self.config.min_depth {
|
||||
return None;
|
||||
}
|
||||
|
||||
// Check for deletion
|
||||
if del_count > 0 {
|
||||
let del_freq = del_count as f64 / total as f64;
|
||||
if del_freq >= self.config.het_threshold {
|
||||
let genotype = if del_freq >= self.config.hom_alt_threshold {
|
||||
Genotype::HomAlt
|
||||
} else {
|
||||
Genotype::Het
|
||||
};
|
||||
let quality = -10.0 * (1.0 - del_freq).max(1e-10).log10() * (del_count as f64);
|
||||
return Some(VariantCall {
|
||||
chromosome: pileup.chromosome,
|
||||
position: pileup.position,
|
||||
ref_allele: ref_base,
|
||||
alt_allele: b'-',
|
||||
quality,
|
||||
genotype,
|
||||
depth: total,
|
||||
allele_depth: del_count,
|
||||
filter_status: FilterStatus::Pass,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Check for insertion
|
||||
if ins_count > 0 {
|
||||
let ins_freq = ins_count as f64 / total as f64;
|
||||
if ins_freq >= self.config.het_threshold {
|
||||
let genotype = if ins_freq >= self.config.hom_alt_threshold {
|
||||
Genotype::HomAlt
|
||||
} else {
|
||||
Genotype::Het
|
||||
};
|
||||
let quality = -10.0 * (1.0 - ins_freq).max(1e-10).log10() * (ins_count as f64);
|
||||
return Some(VariantCall {
|
||||
chromosome: pileup.chromosome,
|
||||
position: pileup.position,
|
||||
ref_allele: ref_base,
|
||||
alt_allele: b'+',
|
||||
quality,
|
||||
genotype,
|
||||
depth: total,
|
||||
allele_depth: ins_count,
|
||||
filter_status: FilterStatus::Pass,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
/// Apply quality and depth filters to a list of variant calls
|
||||
pub fn filter_variants(&self, calls: &mut [VariantCall]) {
|
||||
for call in calls.iter_mut() {
|
||||
if call.quality < self.config.min_quality as f64 {
|
||||
call.filter_status = FilterStatus::LowQuality;
|
||||
} else if call.depth < self.config.min_depth {
|
||||
call.filter_status = FilterStatus::LowDepth;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Generate VCF-formatted output for variant calls
|
||||
pub fn to_vcf(&self, calls: &[VariantCall], sample_name: &str) -> String {
|
||||
let mut vcf = String::new();
|
||||
vcf.push_str("##fileformat=VCFv4.3\n");
|
||||
vcf.push_str(&format!("##source=RuVectorDNA\n"));
|
||||
vcf.push_str("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t");
|
||||
vcf.push_str(sample_name);
|
||||
vcf.push('\n');
|
||||
|
||||
for call in calls {
|
||||
let filter = match call.filter_status {
|
||||
FilterStatus::Pass => "PASS",
|
||||
FilterStatus::LowQuality => "LowQual",
|
||||
FilterStatus::LowDepth => "LowDepth",
|
||||
};
|
||||
let gt = match call.genotype {
|
||||
Genotype::HomRef => "0/0",
|
||||
Genotype::Het => "0/1",
|
||||
Genotype::HomAlt => "1/1",
|
||||
};
|
||||
vcf.push_str(&format!(
|
||||
"chr{}\t{}\t.\t{}\t{}\t{:.1}\t{}\tDP={};AF={:.3}\tGT:DP:AD\t{}:{}:{}\n",
|
||||
call.chromosome,
|
||||
call.position,
|
||||
call.ref_allele as char,
|
||||
call.alt_allele as char,
|
||||
call.quality,
|
||||
filter,
|
||||
call.depth,
|
||||
call.allele_depth as f64 / call.depth as f64,
|
||||
gt,
|
||||
call.depth,
|
||||
call.allele_depth,
|
||||
));
|
||||
}
|
||||
|
||||
vcf
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_variant_caller_creation() {
|
||||
let config = VariantCallerConfig::default();
|
||||
let _caller = VariantCaller::new(config);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_snp_calling() {
|
||||
let caller = VariantCaller::new(VariantCallerConfig::default());
|
||||
let pileup = PileupColumn {
|
||||
bases: vec![b'G'; 15],
|
||||
qualities: vec![40; 15],
|
||||
position: 1000,
|
||||
chromosome: 1,
|
||||
};
|
||||
|
||||
let call = caller.call_snp(&pileup, b'A');
|
||||
assert!(call.is_some());
|
||||
let call = call.unwrap();
|
||||
assert_eq!(call.genotype, Genotype::HomAlt);
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user