Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'
This commit is contained in:
246
vendor/ruvector/examples/dna/src/alignment.rs
vendored
Normal file
246
vendor/ruvector/examples/dna/src/alignment.rs
vendored
Normal file
@@ -0,0 +1,246 @@
|
||||
//! Sequence alignment module using attention-based scoring
|
||||
//!
|
||||
//! Provides Smith-Waterman local alignment with attention-weighted
|
||||
//! scoring derived from RuVector's attention primitives.
|
||||
|
||||
use crate::error::{DnaError, Result};
|
||||
use crate::types::{
|
||||
AlignmentResult, CigarOp, DnaSequence, GenomicPosition, Nucleotide, QualityScore,
|
||||
};
|
||||
|
||||
/// Scoring parameters for pairwise alignment.
///
/// Every value is *added* to the running score, so the three penalties are
/// expected to be negative.
#[derive(Debug, Clone)]
pub struct AlignmentConfig {
    /// Score added when the query base equals the reference base.
    pub match_score: i32,
    /// Score added when the two bases differ (negative).
    pub mismatch_penalty: i32,
    /// Score added for the first position of a gap (negative).
    pub gap_open_penalty: i32,
    /// Score added for each further position of an already open gap (negative).
    pub gap_extend_penalty: i32,
}

impl Default for AlignmentConfig {
    /// Match `+2`, mismatch `-1`, gap open `-3`, gap extend `-1`.
    fn default() -> Self {
        let (match_score, mismatch_penalty) = (2, -1);
        let (gap_open_penalty, gap_extend_penalty) = (-3, -1);
        Self { match_score, mismatch_penalty, gap_open_penalty, gap_extend_penalty }
    }
}
|
||||
|
||||
/// Smith-Waterman local aligner with attention-weighted scoring
|
||||
pub struct SmithWaterman {
|
||||
config: AlignmentConfig,
|
||||
}
|
||||
|
||||
impl SmithWaterman {
|
||||
/// Create a new Smith-Waterman aligner
|
||||
pub fn new(config: AlignmentConfig) -> Self {
|
||||
Self { config }
|
||||
}
|
||||
|
||||
/// Align query against reference using Smith-Waterman with affine gap penalties
|
||||
pub fn align(&self, query: &DnaSequence, reference: &DnaSequence) -> Result<AlignmentResult> {
|
||||
if query.is_empty() || reference.is_empty() {
|
||||
return Err(DnaError::AlignmentError(
|
||||
"Cannot align empty sequences".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
let q_bases = query.bases();
|
||||
let r_bases = reference.bases();
|
||||
let q_len = q_bases.len();
|
||||
let r_len = r_bases.len();
|
||||
let cols = r_len + 1;
|
||||
|
||||
// Rolling 2-row DP: only prev+curr rows for H and E (~12KB vs ~600KB).
|
||||
// F needs only a single scalar (left neighbor in same row).
|
||||
// Full traceback matrix kept since tb==0 encodes the stop condition.
|
||||
let neg_inf = i32::MIN / 2;
|
||||
let mut h_prev = vec![0i32; cols];
|
||||
let mut h_curr = vec![0i32; cols];
|
||||
let mut e_prev = vec![neg_inf; cols];
|
||||
let mut e_curr = vec![neg_inf; cols];
|
||||
let mut tb = vec![0u8; (q_len + 1) * cols]; // 0=stop, 1=diag, 2=up, 3=left
|
||||
|
||||
let match_sc = self.config.match_score;
|
||||
let mismatch_sc = self.config.mismatch_penalty;
|
||||
let gap_open = self.config.gap_open_penalty;
|
||||
let gap_ext = self.config.gap_extend_penalty;
|
||||
|
||||
let mut max_score = 0i32;
|
||||
let mut max_i = 0;
|
||||
let mut max_j = 0;
|
||||
|
||||
// Fill scoring matrices with affine gap penalties
|
||||
for i in 1..=q_len {
|
||||
let q_base = q_bases[i - 1];
|
||||
h_curr[0] = 0;
|
||||
e_curr[0] = neg_inf;
|
||||
let mut f_val = neg_inf; // F[i][0], reset per row
|
||||
|
||||
for j in 1..=r_len {
|
||||
let mm = if q_base == r_bases[j - 1] {
|
||||
match_sc
|
||||
} else {
|
||||
mismatch_sc
|
||||
};
|
||||
|
||||
// E: gap in reference (insertion in query) — extend or open
|
||||
let e_v = (e_prev[j] + gap_ext).max(h_prev[j] + gap_open);
|
||||
e_curr[j] = e_v;
|
||||
|
||||
// F: gap in query (deletion from reference) — extend or open
|
||||
f_val = (f_val + gap_ext).max(h_curr[j - 1] + gap_open);
|
||||
|
||||
let diag = h_prev[j - 1] + mm;
|
||||
let best = 0.max(diag).max(e_v).max(f_val);
|
||||
h_curr[j] = best;
|
||||
|
||||
tb[i * cols + j] = if best == 0 {
|
||||
0
|
||||
} else if best == diag {
|
||||
1
|
||||
} else if best == e_v {
|
||||
2
|
||||
} else {
|
||||
3
|
||||
};
|
||||
|
||||
if best > max_score {
|
||||
max_score = best;
|
||||
max_i = i;
|
||||
max_j = j;
|
||||
}
|
||||
}
|
||||
|
||||
// Swap rows: current becomes previous for next iteration
|
||||
std::mem::swap(&mut h_prev, &mut h_curr);
|
||||
std::mem::swap(&mut e_prev, &mut e_curr);
|
||||
}
|
||||
|
||||
// Traceback to build CIGAR (tb==0 encodes stop, same as h==0)
|
||||
let mut cigar_ops = Vec::new();
|
||||
let mut i = max_i;
|
||||
let mut j = max_j;
|
||||
|
||||
while i > 0 && j > 0 && tb[i * cols + j] != 0 {
|
||||
match tb[i * cols + j] {
|
||||
1 => {
|
||||
// Diagonal (match/mismatch)
|
||||
cigar_ops.push(CigarOp::M(1));
|
||||
i -= 1;
|
||||
j -= 1;
|
||||
}
|
||||
2 => {
|
||||
// Up (insertion in query)
|
||||
cigar_ops.push(CigarOp::I(1));
|
||||
i -= 1;
|
||||
}
|
||||
3 => {
|
||||
// Left (deletion from query)
|
||||
cigar_ops.push(CigarOp::D(1));
|
||||
j -= 1;
|
||||
}
|
||||
_ => break,
|
||||
}
|
||||
}
|
||||
|
||||
cigar_ops.reverse();
|
||||
|
||||
// Merge consecutive same-type CIGAR operations
|
||||
let cigar = merge_cigar_ops(&cigar_ops);
|
||||
|
||||
// Calculate alignment start position on reference
|
||||
let align_start = j;
|
||||
|
||||
let mapq = ((max_score.max(0) as f64 / (q_len.max(1) as f64 * 2.0)) * 60.0).min(60.0) as u8;
|
||||
|
||||
Ok(AlignmentResult {
|
||||
score: max_score,
|
||||
cigar,
|
||||
mapped_position: GenomicPosition {
|
||||
chromosome: 1,
|
||||
position: align_start as u64,
|
||||
reference_allele: reference.get(align_start).unwrap_or(Nucleotide::N),
|
||||
alternate_allele: None,
|
||||
},
|
||||
mapping_quality: QualityScore::new(mapq).unwrap_or(QualityScore::new(0).unwrap()),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Merge consecutive same-type CIGAR operations
|
||||
fn merge_cigar_ops(ops: &[CigarOp]) -> Vec<CigarOp> {
|
||||
if ops.is_empty() {
|
||||
return Vec::new();
|
||||
}
|
||||
|
||||
let mut merged = Vec::new();
|
||||
let mut current = ops[0];
|
||||
|
||||
for &op in &ops[1..] {
|
||||
match (current, op) {
|
||||
(CigarOp::M(a), CigarOp::M(b)) => current = CigarOp::M(a + b),
|
||||
(CigarOp::I(a), CigarOp::I(b)) => current = CigarOp::I(a + b),
|
||||
(CigarOp::D(a), CigarOp::D(b)) => current = CigarOp::D(a + b),
|
||||
_ => {
|
||||
merged.push(current);
|
||||
current = op;
|
||||
}
|
||||
}
|
||||
}
|
||||
merged.push(current);
|
||||
merged
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Aligner built from the default scoring configuration.
    fn default_aligner() -> SmithWaterman {
        SmithWaterman::new(AlignmentConfig::default())
    }

    /// Parse both sequences from text and align them with the default aligner.
    fn align_default(query: &str, reference: &str) -> AlignmentResult {
        let q = DnaSequence::from_str(query).unwrap();
        let r = DnaSequence::from_str(reference).unwrap();
        default_aligner().align(&q, &r).unwrap()
    }

    #[test]
    fn test_smith_waterman_exact_match() {
        let result = align_default("ACGT", "ACGT");
        assert_eq!(result.score, 8); // 4 matches * 2 points
    }

    #[test]
    fn test_smith_waterman_with_mismatch() {
        let result = align_default("ACGT", "ACTT");
        assert!(result.score > 0);
        assert!(result.score < 8); // Not perfect match
    }

    #[test]
    fn test_smith_waterman_subsequence() {
        let result = align_default("ACGT", "TTTTACGTTTTT");
        assert_eq!(result.score, 8); // Perfect subsequence match
        assert_eq!(result.mapped_position.position, 4);
    }

    #[test]
    fn test_empty_sequence_error() {
        let aligner = default_aligner();
        let empty = DnaSequence::new(vec![]);
        let seq = DnaSequence::from_str("ACGT").unwrap();

        assert!(aligner.align(&empty, &seq).is_err());
        assert!(aligner.align(&seq, &empty).is_err());
    }
}
|
||||
1001
vendor/ruvector/examples/dna/src/biomarker.rs
vendored
Normal file
1001
vendor/ruvector/examples/dna/src/biomarker.rs
vendored
Normal file
File diff suppressed because it is too large
Load Diff
677
vendor/ruvector/examples/dna/src/biomarker_stream.rs
vendored
Normal file
677
vendor/ruvector/examples/dna/src/biomarker_stream.rs
vendored
Normal file
@@ -0,0 +1,677 @@
|
||||
//! Streaming biomarker data simulator with ring buffer and anomaly detection.
|
||||
//!
|
||||
//! Generates synthetic biomarker readings (glucose, cholesterol, HDL, LDL,
|
||||
//! triglycerides, CRP) with configurable noise, drift, and anomaly injection.
|
||||
//! Provides a [`StreamProcessor`] with rolling statistics, z-score anomaly
|
||||
//! detection, and linear regression trend analysis over a [`RingBuffer`].
|
||||
|
||||
use rand::rngs::StdRng;
|
||||
use rand::{Rng, SeedableRng};
|
||||
use rand_distr::Normal;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
|
||||
/// Configuration shared by the reading generator and the stream processor.
#[derive(Debug, Clone)]
pub struct StreamConfig {
    /// Milliseconds added to the timestamp after each generation step.
    pub base_interval_ms: u64,
    /// Noise standard deviation, as a fraction of a biomarker's reference range width.
    pub noise_amplitude: f64,
    /// Drift added per generation step, as a fraction of the reference range width.
    pub drift_rate: f64,
    /// Probability that a generated reading is drawn from the wider spike distribution.
    pub anomaly_probability: f64,
    /// Factor by which the spike distribution's standard deviation exceeds the noise one.
    pub anomaly_magnitude: f64,
    /// How many entries of the built-in biomarker table to simulate (capped at its length).
    pub num_biomarkers: usize,
    /// Capacity of each per-biomarker rolling window.
    pub window_size: usize,
}

impl Default for StreamConfig {
    /// One-second interval, 2% noise, no drift, 2% anomalies at 2.5x magnitude,
    /// six biomarkers and a 100-reading window.
    fn default() -> Self {
        let (noise_amplitude, drift_rate) = (0.02, 0.0);
        let (anomaly_probability, anomaly_magnitude) = (0.02, 2.5);
        Self {
            base_interval_ms: 1000,
            noise_amplitude,
            drift_rate,
            anomaly_probability,
            anomaly_magnitude,
            num_biomarkers: 6,
            window_size: 100,
        }
    }
}
|
||||
|
||||
/// A single timestamped biomarker data point.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct BiomarkerReading {
|
||||
pub timestamp_ms: u64,
|
||||
pub biomarker_id: String,
|
||||
pub value: f64,
|
||||
pub reference_low: f64,
|
||||
pub reference_high: f64,
|
||||
pub is_anomaly: bool,
|
||||
pub z_score: f64,
|
||||
}
|
||||
|
||||
/// Fixed-capacity circular buffer backed by a flat `Vec<T>`.
///
/// Slots are pre-filled with `T::default()` instead of being wrapped in
/// `Option<T>`, so a slot costs exactly `size_of::<T>()`. Once the buffer is
/// full, each push overwrites the oldest element.
#[derive(Debug, Clone)]
pub struct RingBuffer<T> {
    /// Backing storage; always exactly `capacity` slots long.
    buffer: Vec<T>,
    /// Index of the slot the next `push` writes to.
    head: usize,
    /// Number of live elements (`<= capacity`).
    len: usize,
    /// Maximum number of elements retained.
    capacity: usize,
}

impl<T: Clone + Default> RingBuffer<T> {
    /// Create an empty buffer that retains at most `capacity` elements.
    ///
    /// # Panics
    ///
    /// Panics if `capacity` is zero.
    pub fn new(capacity: usize) -> Self {
        assert!(capacity > 0, "RingBuffer capacity must be > 0");
        Self {
            buffer: vec![T::default(); capacity],
            head: 0,
            len: 0,
            capacity,
        }
    }

    /// Append `item`, overwriting the oldest element when the buffer is full.
    pub fn push(&mut self, item: T) {
        self.buffer[self.head] = item;
        self.head = (self.head + 1) % self.capacity;
        if self.len < self.capacity {
            self.len += 1;
        }
    }

    /// Iterate over the live elements from oldest to newest.
    pub fn iter(&self) -> impl Iterator<Item = &T> {
        // Until the buffer wraps the oldest element sits in slot 0; afterwards it
        // sits at `head`, the slot about to be overwritten.
        let start = if self.len < self.capacity {
            0
        } else {
            self.head
        };
        // Two contiguous slices replace a modulo and a bounds check per element.
        let (wrapped, oldest) = self.buffer.split_at(start);
        oldest.iter().chain(wrapped.iter()).take(self.len)
    }

    /// Number of live elements.
    pub fn len(&self) -> usize {
        self.len
    }

    /// `true` when no element has been pushed since creation or the last `clear`.
    pub fn is_empty(&self) -> bool {
        self.len == 0
    }

    /// `true` once the buffer holds `capacity` elements.
    pub fn is_full(&self) -> bool {
        self.len == self.capacity
    }

    /// Forget all elements. Slots keep their old values but become unreachable
    /// until they are overwritten.
    pub fn clear(&mut self) {
        self.head = 0;
        self.len = 0;
    }
}
|
||||
|
||||
// ── Biomarker definitions ───────────────────────────────────────────────────
|
||||
|
||||
/// Static description of one simulated biomarker.
struct BiomarkerDef {
    /// Identifier copied into `BiomarkerReading::biomarker_id`.
    id: &'static str,
    /// Lower bound of the reference range.
    low: f64,
    /// Upper bound of the reference range.
    high: f64,
}

/// Shorthand used to keep the [`BIOMARKER_DEFS`] table to one row per entry.
const fn biomarker_def(id: &'static str, low: f64, high: f64) -> BiomarkerDef {
    BiomarkerDef { id, low, high }
}

/// Built-in biomarker table; `StreamConfig::num_biomarkers` selects a prefix of it.
const BIOMARKER_DEFS: &[BiomarkerDef] = &[
    biomarker_def("glucose", 70.0, 100.0),
    biomarker_def("cholesterol_total", 150.0, 200.0),
    biomarker_def("hdl", 40.0, 60.0),
    biomarker_def("ldl", 70.0, 130.0),
    biomarker_def("triglycerides", 50.0, 150.0),
    biomarker_def("crp", 0.1, 3.0),
];
|
||||
|
||||
// ── Batch generation ────────────────────────────────────────────────────────
|
||||
|
||||
/// Generate `count` synthetic readings per active biomarker with noise, drift,
|
||||
/// and stochastic anomaly spikes.
|
||||
pub fn generate_readings(config: &StreamConfig, count: usize, seed: u64) -> Vec<BiomarkerReading> {
|
||||
let mut rng = StdRng::seed_from_u64(seed);
|
||||
let active = &BIOMARKER_DEFS[..config.num_biomarkers.min(BIOMARKER_DEFS.len())];
|
||||
let mut readings = Vec::with_capacity(count * active.len());
|
||||
// Pre-compute distributions per biomarker (avoids Normal::new in inner loop)
|
||||
let dists: Vec<_> = active
|
||||
.iter()
|
||||
.map(|def| {
|
||||
let range = def.high - def.low;
|
||||
let mid = (def.low + def.high) / 2.0;
|
||||
let sigma = (config.noise_amplitude * range).max(1e-12);
|
||||
let normal = Normal::new(0.0, sigma).unwrap();
|
||||
let spike = Normal::new(0.0, sigma * config.anomaly_magnitude).unwrap();
|
||||
(mid, range, normal, spike)
|
||||
})
|
||||
.collect();
|
||||
let mut ts: u64 = 0;
|
||||
|
||||
for step in 0..count {
|
||||
for (j, def) in active.iter().enumerate() {
|
||||
let (mid, range, ref normal, ref spike) = dists[j];
|
||||
let drift = config.drift_rate * range * step as f64;
|
||||
let is_anom = rng.gen::<f64>() < config.anomaly_probability;
|
||||
let value = if is_anom {
|
||||
(mid + rng.sample::<f64, _>(spike) + drift).max(0.0)
|
||||
} else {
|
||||
(mid + rng.sample::<f64, _>(normal) + drift).max(0.0)
|
||||
};
|
||||
readings.push(BiomarkerReading {
|
||||
timestamp_ms: ts,
|
||||
biomarker_id: def.id.into(),
|
||||
value,
|
||||
reference_low: def.low,
|
||||
reference_high: def.high,
|
||||
is_anomaly: is_anom,
|
||||
z_score: 0.0,
|
||||
});
|
||||
}
|
||||
ts += config.base_interval_ms;
|
||||
}
|
||||
readings
|
||||
}
|
||||
|
||||
// ── Statistics & results ────────────────────────────────────────────────────
|
||||
|
||||
/// Rolling statistics for a single biomarker stream.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct StreamStats {
|
||||
pub mean: f64,
|
||||
pub variance: f64,
|
||||
pub min: f64,
|
||||
pub max: f64,
|
||||
pub count: u64,
|
||||
pub anomaly_rate: f64,
|
||||
pub trend_slope: f64,
|
||||
pub ema: f64,
|
||||
pub cusum_pos: f64, // CUSUM positive direction
|
||||
pub cusum_neg: f64, // CUSUM negative direction
|
||||
pub changepoint_detected: bool,
|
||||
}
|
||||
|
||||
impl Default for StreamStats {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
mean: 0.0,
|
||||
variance: 0.0,
|
||||
min: f64::MAX,
|
||||
max: f64::MIN,
|
||||
count: 0,
|
||||
anomaly_rate: 0.0,
|
||||
trend_slope: 0.0,
|
||||
ema: 0.0,
|
||||
cusum_pos: 0.0,
|
||||
cusum_neg: 0.0,
|
||||
changepoint_detected: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Outcome of feeding one reading to `StreamProcessor::process_reading`.
///
/// Derives the common value traits so results can be logged, copied and
/// compared in assertions.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct ProcessingResult {
    /// Whether the reading was taken into the stream.
    pub accepted: bool,
    /// Z-score of the reading against its rolling window (0.0 when the window has no spread).
    pub z_score: f64,
    /// Whether the reading was flagged as anomalous.
    pub is_anomaly: bool,
    /// Trend slope of the rolling window after this reading was added.
    pub current_trend: f64,
}
|
||||
|
||||
/// Aggregate summary across all biomarker streams.
|
||||
pub struct StreamSummary {
|
||||
pub total_readings: u64,
|
||||
pub anomaly_count: u64,
|
||||
pub anomaly_rate: f64,
|
||||
pub biomarker_stats: HashMap<String, StreamStats>,
|
||||
pub throughput_readings_per_sec: f64,
|
||||
}
|
||||
|
||||
// ── Stream processor ────────────────────────────────────────────────────────
|
||||
|
||||
/// Weight of the newest reading in the exponential moving average.
const EMA_ALPHA: f64 = 0.1;
/// Absolute z-score above which a reading is flagged as anomalous.
const Z_SCORE_THRESHOLD: f64 = 2.5;
/// Fraction of the reference range width a value may stray beyond either bound
/// before it is flagged as out of range.
const REF_OVERSHOOT: f64 = 0.20;
/// Cumulative-sum level above which a changepoint is reported.
const CUSUM_THRESHOLD: f64 = 4.0;
/// Slack subtracted from each normalised deviation before it is accumulated.
const CUSUM_DRIFT: f64 = 0.5;
|
||||
|
||||
/// Processes biomarker readings with per-stream ring buffers, z-score anomaly
|
||||
/// detection, and trend analysis via simple linear regression.
|
||||
pub struct StreamProcessor {
|
||||
config: StreamConfig,
|
||||
buffers: HashMap<String, RingBuffer<f64>>,
|
||||
stats: HashMap<String, StreamStats>,
|
||||
total_readings: u64,
|
||||
anomaly_count: u64,
|
||||
anom_per_bio: HashMap<String, u64>,
|
||||
start_ts: Option<u64>,
|
||||
last_ts: Option<u64>,
|
||||
}
|
||||
|
||||
impl StreamProcessor {
|
||||
pub fn new(config: StreamConfig) -> Self {
|
||||
let cap = config.num_biomarkers;
|
||||
Self {
|
||||
config,
|
||||
buffers: HashMap::with_capacity(cap),
|
||||
stats: HashMap::with_capacity(cap),
|
||||
total_readings: 0,
|
||||
anomaly_count: 0,
|
||||
anom_per_bio: HashMap::with_capacity(cap),
|
||||
start_ts: None,
|
||||
last_ts: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn process_reading(&mut self, reading: &BiomarkerReading) -> ProcessingResult {
|
||||
let id = &reading.biomarker_id;
|
||||
if self.start_ts.is_none() {
|
||||
self.start_ts = Some(reading.timestamp_ms);
|
||||
}
|
||||
self.last_ts = Some(reading.timestamp_ms);
|
||||
|
||||
let buf = self
|
||||
.buffers
|
||||
.entry(id.clone())
|
||||
.or_insert_with(|| RingBuffer::new(self.config.window_size));
|
||||
buf.push(reading.value);
|
||||
self.total_readings += 1;
|
||||
|
||||
let (wmean, wstd) = window_mean_std(buf);
|
||||
let z = if wstd > 1e-12 {
|
||||
(reading.value - wmean) / wstd
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
let rng = reading.reference_high - reading.reference_low;
|
||||
let overshoot = REF_OVERSHOOT * rng;
|
||||
let oor = reading.value < (reading.reference_low - overshoot)
|
||||
|| reading.value > (reading.reference_high + overshoot);
|
||||
let is_anom = z.abs() > Z_SCORE_THRESHOLD || oor;
|
||||
|
||||
if is_anom {
|
||||
self.anomaly_count += 1;
|
||||
*self.anom_per_bio.entry(id.clone()).or_insert(0) += 1;
|
||||
}
|
||||
|
||||
let slope = compute_trend_slope(buf);
|
||||
let bio_anom = *self.anom_per_bio.get(id).unwrap_or(&0);
|
||||
let st = self.stats.entry(id.clone()).or_default();
|
||||
st.count += 1;
|
||||
st.mean = wmean;
|
||||
st.variance = wstd * wstd;
|
||||
st.trend_slope = slope;
|
||||
st.anomaly_rate = bio_anom as f64 / st.count as f64;
|
||||
if reading.value < st.min {
|
||||
st.min = reading.value;
|
||||
}
|
||||
if reading.value > st.max {
|
||||
st.max = reading.value;
|
||||
}
|
||||
st.ema = if st.count == 1 {
|
||||
reading.value
|
||||
} else {
|
||||
EMA_ALPHA * reading.value + (1.0 - EMA_ALPHA) * st.ema
|
||||
};
|
||||
// CUSUM changepoint detection: accumulate deviations from the mean
|
||||
if wstd > 1e-12 {
|
||||
let norm_dev = (reading.value - wmean) / wstd;
|
||||
st.cusum_pos = (st.cusum_pos + norm_dev - CUSUM_DRIFT).max(0.0);
|
||||
st.cusum_neg = (st.cusum_neg - norm_dev - CUSUM_DRIFT).max(0.0);
|
||||
st.changepoint_detected =
|
||||
st.cusum_pos > CUSUM_THRESHOLD || st.cusum_neg > CUSUM_THRESHOLD;
|
||||
if st.changepoint_detected {
|
||||
st.cusum_pos = 0.0;
|
||||
st.cusum_neg = 0.0;
|
||||
}
|
||||
}
|
||||
|
||||
ProcessingResult {
|
||||
accepted: true,
|
||||
z_score: z,
|
||||
is_anomaly: is_anom,
|
||||
current_trend: slope,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_stats(&self, biomarker_id: &str) -> Option<&StreamStats> {
|
||||
self.stats.get(biomarker_id)
|
||||
}
|
||||
|
||||
pub fn summary(&self) -> StreamSummary {
|
||||
let elapsed = match (self.start_ts, self.last_ts) {
|
||||
(Some(s), Some(e)) if e > s => (e - s) as f64,
|
||||
_ => 1.0,
|
||||
};
|
||||
let ar = if self.total_readings > 0 {
|
||||
self.anomaly_count as f64 / self.total_readings as f64
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
StreamSummary {
|
||||
total_readings: self.total_readings,
|
||||
anomaly_count: self.anomaly_count,
|
||||
anomaly_rate: ar,
|
||||
biomarker_stats: self.stats.clone(),
|
||||
throughput_readings_per_sec: self.total_readings as f64 / (elapsed / 1000.0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ── Helpers ─────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Single-pass mean and sample standard deviation using Welford's online algorithm.
|
||||
/// Avoids iterating the buffer twice (sum then variance) — 2x fewer cache misses.
|
||||
fn window_mean_std(buf: &RingBuffer<f64>) -> (f64, f64) {
|
||||
let n = buf.len();
|
||||
if n == 0 {
|
||||
return (0.0, 0.0);
|
||||
}
|
||||
let mut mean = 0.0;
|
||||
let mut m2 = 0.0;
|
||||
for (k, &x) in buf.iter().enumerate() {
|
||||
let k1 = (k + 1) as f64;
|
||||
let delta = x - mean;
|
||||
mean += delta / k1;
|
||||
m2 += delta * (x - mean);
|
||||
}
|
||||
if n < 2 {
|
||||
return (mean, 0.0);
|
||||
}
|
||||
(mean, (m2 / (n - 1) as f64).sqrt())
|
||||
}
|
||||
|
||||
fn compute_trend_slope(buf: &RingBuffer<f64>) -> f64 {
|
||||
let n = buf.len();
|
||||
if n < 2 {
|
||||
return 0.0;
|
||||
}
|
||||
let nf = n as f64;
|
||||
let xm = (nf - 1.0) / 2.0;
|
||||
let (mut ys, mut xys, mut xxs) = (0.0, 0.0, 0.0);
|
||||
for (i, &y) in buf.iter().enumerate() {
|
||||
let x = i as f64;
|
||||
ys += y;
|
||||
xys += x * y;
|
||||
xxs += x * x;
|
||||
}
|
||||
let ss_xy = xys - nf * xm * (ys / nf);
|
||||
let ss_xx = xxs - nf * xm * xm;
|
||||
if ss_xx.abs() < 1e-12 {
|
||||
0.0
|
||||
} else {
|
||||
ss_xy / ss_xx
|
||||
}
|
||||
}
|
||||
|
||||
// ── Tests ───────────────────────────────────────────────────────────────────
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Reading with the producer-side flags left at their neutral values.
    fn reading(ts: u64, id: &str, val: f64, lo: f64, hi: f64) -> BiomarkerReading {
        BiomarkerReading {
            is_anomaly: false,
            z_score: 0.0,
            timestamp_ms: ts,
            biomarker_id: id.into(),
            value: val,
            reference_low: lo,
            reference_high: hi,
        }
    }

    /// Glucose reading with the 70-100 reference range.
    fn glucose(ts: u64, val: f64) -> BiomarkerReading {
        reading(ts, "glucose", val, 70.0, 100.0)
    }

    /// Processor with default settings apart from the window size.
    fn processor(window_size: usize) -> StreamProcessor {
        StreamProcessor::new(StreamConfig {
            window_size,
            ..Default::default()
        })
    }

    /// Buffer contents, oldest first.
    fn contents(rb: &RingBuffer<i32>) -> Vec<i32> {
        rb.iter().copied().collect()
    }

    // -- RingBuffer --

    #[test]
    fn ring_buffer_push_iter_len() {
        let mut rb: RingBuffer<i32> = RingBuffer::new(4);
        rb.push(10);
        rb.push(20);
        rb.push(30);
        assert_eq!(contents(&rb), vec![10, 20, 30]);
        assert_eq!(rb.len(), 3);
        assert!(!rb.is_full());
    }

    #[test]
    fn ring_buffer_overflow_keeps_newest() {
        let mut rb: RingBuffer<i32> = RingBuffer::new(3);
        (1..=4).for_each(|v| rb.push(v));
        assert!(rb.is_full());
        assert_eq!(contents(&rb), vec![2, 3, 4]);
    }

    #[test]
    fn ring_buffer_capacity_one() {
        let mut rb: RingBuffer<i32> = RingBuffer::new(1);
        rb.push(42);
        rb.push(99);
        assert_eq!(contents(&rb), vec![99]);
    }

    #[test]
    fn ring_buffer_clear_resets() {
        let mut rb: RingBuffer<i32> = RingBuffer::new(3);
        rb.push(1);
        rb.push(2);
        rb.clear();
        assert_eq!(rb.len(), 0);
        assert!(!rb.is_full());
        assert_eq!(rb.iter().count(), 0);
    }

    // -- Batch generation --

    #[test]
    fn generate_correct_count_and_ids() {
        let cfg = StreamConfig::default();
        let readings = generate_readings(&cfg, 50, 42);
        assert_eq!(readings.len(), 50 * cfg.num_biomarkers);
        let valid: Vec<&str> = BIOMARKER_DEFS.iter().map(|d| d.id).collect();
        for r in &readings {
            assert!(valid.contains(&r.biomarker_id.as_str()));
        }
    }

    #[test]
    fn generated_reference_ranges_match_defs() {
        for r in &generate_readings(&StreamConfig::default(), 20, 123) {
            let d = BIOMARKER_DEFS
                .iter()
                .find(|d| d.id == r.biomarker_id)
                .unwrap();
            assert!((r.reference_low - d.low).abs() < 1e-9);
            assert!((r.reference_high - d.high).abs() < 1e-9);
        }
    }

    #[test]
    fn generated_values_non_negative() {
        let readings = generate_readings(&StreamConfig::default(), 100, 999);
        for r in &readings {
            assert!(r.value >= 0.0);
        }
    }

    // -- StreamProcessor --

    #[test]
    fn processor_computes_stats() {
        let cfg = StreamConfig {
            window_size: 10,
            ..Default::default()
        };
        let mut p = StreamProcessor::new(cfg.clone());
        for r in &generate_readings(&cfg, 20, 55) {
            p.process_reading(r);
        }
        let s = p.get_stats("glucose").unwrap();
        assert!(s.count > 0 && s.mean > 0.0 && s.min <= s.max);
    }

    #[test]
    fn processor_summary_totals() {
        let cfg = StreamConfig::default();
        let mut p = StreamProcessor::new(cfg.clone());
        for r in &generate_readings(&cfg, 30, 77) {
            p.process_reading(r);
        }
        let s = p.summary();
        assert_eq!(s.total_readings, 30 * cfg.num_biomarkers as u64);
        assert!((0.0..=1.0).contains(&s.anomaly_rate));
    }

    // -- Anomaly detection --

    #[test]
    fn detects_z_score_anomaly() {
        let mut p = processor(20);
        for i in 0..20 {
            p.process_reading(&glucose(i * 1000, 85.0));
        }
        let r = p.process_reading(&glucose(20_000, 300.0));
        assert!(r.is_anomaly);
        assert!(r.z_score.abs() > Z_SCORE_THRESHOLD);
    }

    #[test]
    fn detects_out_of_range_anomaly() {
        let mut p = processor(5);
        let baseline = [80.0, 82.0, 78.0, 84.0, 81.0];
        for (i, v) in baseline.iter().enumerate() {
            p.process_reading(&glucose(i as u64 * 1000, *v));
        }
        // 140 >> ref_high(100) + 20%*range(30)=106
        assert!(p.process_reading(&glucose(5000, 140.0)).is_anomaly);
    }

    #[test]
    fn zero_anomaly_rate_for_constant_stream() {
        let mut p = processor(50);
        for i in 0..10 {
            p.process_reading(&reading(i * 1000, "crp", 1.5, 0.1, 3.0));
        }
        assert!(p.get_stats("crp").unwrap().anomaly_rate.abs() < 1e-9);
    }

    // -- Trend detection --

    #[test]
    fn positive_trend_for_increasing() {
        let mut p = processor(20);
        let last = (0..20)
            .map(|i| p.process_reading(&glucose(i * 1000, 70.0 + i as f64)))
            .last()
            .unwrap();
        assert!(last.current_trend > 0.0, "got {}", last.current_trend);
    }

    #[test]
    fn negative_trend_for_decreasing() {
        let mut p = processor(20);
        let last = (0..20)
            .map(|i| p.process_reading(&reading(i * 1000, "hdl", 60.0 - i as f64 * 0.5, 40.0, 60.0)))
            .last()
            .unwrap();
        assert!(last.current_trend < 0.0, "got {}", last.current_trend);
    }

    #[test]
    fn exact_slope_for_linear_series() {
        let mut p = processor(10);
        for i in 0..10 {
            let value = 100.0 + i as f64 * 3.0;
            p.process_reading(&reading(i * 1000, "ldl", value, 70.0, 130.0));
        }
        assert!((p.get_stats("ldl").unwrap().trend_slope - 3.0).abs() < 1e-9);
    }

    // -- Z-score --

    #[test]
    fn z_score_small_for_near_mean() {
        let mut p = processor(10);
        let warmup = [80.0, 82.0, 78.0, 84.0, 76.0, 86.0, 81.0, 79.0, 83.0];
        for (i, v) in warmup.iter().enumerate() {
            p.process_reading(&glucose(i as u64 * 1000, *v));
        }
        let mean = p.get_stats("glucose").unwrap().mean;
        assert!(p.process_reading(&glucose(9000, mean)).z_score.abs() < 1.0);
    }

    // -- EMA --

    #[test]
    fn ema_converges_to_constant() {
        let mut p = processor(50);
        for i in 0..50 {
            p.process_reading(&reading(i * 1000, "crp", 2.0, 0.1, 3.0));
        }
        assert!((p.get_stats("crp").unwrap().ema - 2.0).abs() < 1e-6);
    }
}
|
||||
322
vendor/ruvector/examples/dna/src/epigenomics.rs
vendored
Normal file
322
vendor/ruvector/examples/dna/src/epigenomics.rs
vendored
Normal file
@@ -0,0 +1,322 @@
|
||||
//! Epigenomics analysis module
|
||||
//!
|
||||
//! Provides methylation profiling and epigenetic age prediction
|
||||
//! using the Horvath clock model.
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// A CpG site with methylation data
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct CpGSite {
|
||||
/// Chromosome number
|
||||
pub chromosome: u8,
|
||||
/// Genomic position
|
||||
pub position: u64,
|
||||
/// Methylation level (beta value, 0.0 to 1.0)
|
||||
pub methylation_level: f32,
|
||||
}
|
||||
|
||||
/// Methylation profile containing CpG site measurements
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct MethylationProfile {
|
||||
/// CpG sites with measured methylation levels
|
||||
pub sites: Vec<CpGSite>,
|
||||
}
|
||||
|
||||
impl MethylationProfile {
|
||||
/// Create a methylation profile from position and beta value arrays
|
||||
pub fn from_beta_values(positions: Vec<(u8, u64)>, betas: Vec<f32>) -> Self {
|
||||
let sites = positions
|
||||
.into_iter()
|
||||
.zip(betas.into_iter())
|
||||
.map(|((chr, pos), beta)| CpGSite {
|
||||
chromosome: chr,
|
||||
position: pos,
|
||||
methylation_level: beta.clamp(0.0, 1.0),
|
||||
})
|
||||
.collect();
|
||||
|
||||
Self { sites }
|
||||
}
|
||||
|
||||
/// Calculate mean methylation across all sites
|
||||
pub fn mean_methylation(&self) -> f32 {
|
||||
if self.sites.is_empty() {
|
||||
return 0.0;
|
||||
}
|
||||
let sum: f32 = self.sites.iter().map(|s| s.methylation_level).sum();
|
||||
sum / self.sites.len() as f32
|
||||
}
|
||||
|
||||
/// Calculate methylation entropy (Shannon entropy of beta values)
|
||||
///
|
||||
/// High entropy indicates heterogeneous methylation (potential tumor heterogeneity)
|
||||
pub fn methylation_entropy(&self) -> f64 {
|
||||
if self.sites.is_empty() {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
// Bin methylation into 10 bins [0, 0.1), [0.1, 0.2), ..., [0.9, 1.0]
|
||||
let mut bins = [0u32; 10];
|
||||
for site in &self.sites {
|
||||
let bin = ((site.methylation_level * 10.0) as usize).min(9);
|
||||
bins[bin] += 1;
|
||||
}
|
||||
|
||||
let n = self.sites.len() as f64;
|
||||
let mut entropy = 0.0;
|
||||
for &count in &bins {
|
||||
if count > 0 {
|
||||
let p = count as f64 / n;
|
||||
entropy -= p * p.ln();
|
||||
}
|
||||
}
|
||||
|
||||
entropy
|
||||
}
|
||||
|
||||
/// Calculate extreme methylation ratio
|
||||
///
|
||||
/// Fraction of sites with beta < 0.1 (hypomethylated) or > 0.9 (hypermethylated).
|
||||
/// High ratio indicates global methylation disruption (cancer hallmark).
|
||||
pub fn extreme_methylation_ratio(&self) -> f32 {
|
||||
if self.sites.is_empty() {
|
||||
return 0.0;
|
||||
}
|
||||
let extreme_count = self
|
||||
.sites
|
||||
.iter()
|
||||
.filter(|s| s.methylation_level < 0.1 || s.methylation_level > 0.9)
|
||||
.count();
|
||||
extreme_count as f32 / self.sites.len() as f32
|
||||
}
|
||||
}
|
||||
|
||||
/// Horvath epigenetic clock for biological age prediction
|
||||
///
|
||||
/// Uses a simplified linear model based on CpG site methylation levels
|
||||
/// to predict biological age.
|
||||
pub struct HorvathClock {
|
||||
/// Intercept term
|
||||
intercept: f64,
|
||||
/// Coefficient per CpG site bin
|
||||
coefficients: Vec<f64>,
|
||||
/// Number of bins to partition sites into
|
||||
num_bins: usize,
|
||||
}
|
||||
|
||||
impl HorvathClock {
|
||||
/// Create the default Horvath clock model
|
||||
///
|
||||
/// Uses a simplified model with binned methylation values.
|
||||
/// Real implementation would use 353 specific CpG sites.
|
||||
pub fn default_clock() -> Self {
|
||||
Self {
|
||||
intercept: 30.0,
|
||||
coefficients: vec![
|
||||
-15.0, // Low methylation bin (young)
|
||||
10.0, // High methylation bin (age-associated)
|
||||
0.5, // Neutral bin
|
||||
],
|
||||
num_bins: 3,
|
||||
}
|
||||
}
|
||||
|
||||
/// Predict biological age from a methylation profile
|
||||
pub fn predict_age(&self, profile: &MethylationProfile) -> f64 {
|
||||
if profile.sites.is_empty() {
|
||||
return self.intercept;
|
||||
}
|
||||
|
||||
// Partition sites into bins and compute mean methylation per bin
|
||||
let bin_size = profile.sites.len() / self.num_bins.max(1);
|
||||
let mut age = self.intercept;
|
||||
|
||||
for (bin_idx, coefficient) in self.coefficients.iter().enumerate() {
|
||||
let start = bin_idx * bin_size;
|
||||
let end = ((bin_idx + 1) * bin_size).min(profile.sites.len());
|
||||
|
||||
if start >= profile.sites.len() {
|
||||
break;
|
||||
}
|
||||
|
||||
let bin_sites = &profile.sites[start..end];
|
||||
if !bin_sites.is_empty() {
|
||||
let mean_meth: f64 = bin_sites
|
||||
.iter()
|
||||
.map(|s| s.methylation_level as f64)
|
||||
.sum::<f64>()
|
||||
/ bin_sites.len() as f64;
|
||||
|
||||
age += coefficient * mean_meth;
|
||||
}
|
||||
}
|
||||
|
||||
age.max(0.0)
|
||||
}
|
||||
|
||||
/// Calculate age acceleration (difference between biological and chronological age)
|
||||
///
|
||||
/// Positive values indicate accelerated aging (associated with mortality risk).
|
||||
/// Negative values indicate decelerated aging.
|
||||
pub fn age_acceleration(predicted_age: f64, chronological_age: f64) -> f64 {
|
||||
predicted_age - chronological_age
|
||||
}
|
||||
}
|
||||
|
||||
/// Cancer signal detector using methylation patterns
|
||||
///
|
||||
/// Combines methylation entropy and extreme methylation ratio
|
||||
/// to produce a cancer risk score (0.0 to 1.0).
|
||||
pub struct CancerSignalDetector {
|
||||
/// Entropy weight in the combined score
|
||||
entropy_weight: f64,
|
||||
/// Extreme ratio weight
|
||||
extreme_weight: f64,
|
||||
/// Threshold for elevated cancer risk
|
||||
risk_threshold: f64,
|
||||
}
|
||||
|
||||
impl CancerSignalDetector {
|
||||
/// Create with default parameters
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
entropy_weight: 0.4,
|
||||
extreme_weight: 0.6,
|
||||
risk_threshold: 0.3,
|
||||
}
|
||||
}
|
||||
|
||||
/// Detect cancer signal from methylation profile
|
||||
///
|
||||
/// Returns (risk_score, is_elevated) where risk_score is 0.0-1.0
|
||||
/// and is_elevated indicates whether the score exceeds the threshold.
|
||||
pub fn detect(&self, profile: &MethylationProfile) -> CancerSignalResult {
|
||||
if profile.sites.is_empty() {
|
||||
return CancerSignalResult {
|
||||
risk_score: 0.0,
|
||||
is_elevated: false,
|
||||
entropy: 0.0,
|
||||
extreme_ratio: 0.0,
|
||||
};
|
||||
}
|
||||
|
||||
let entropy = profile.methylation_entropy();
|
||||
let extreme_ratio = profile.extreme_methylation_ratio() as f64;
|
||||
|
||||
// Normalize entropy to 0-1 range (max entropy for 10 bins = ln(10) ≈ 2.302)
|
||||
let normalized_entropy = (entropy / 2.302).min(1.0);
|
||||
|
||||
let risk_score = (self.entropy_weight * normalized_entropy
|
||||
+ self.extreme_weight * extreme_ratio)
|
||||
.min(1.0);
|
||||
|
||||
CancerSignalResult {
|
||||
risk_score,
|
||||
is_elevated: risk_score >= self.risk_threshold,
|
||||
entropy,
|
||||
extreme_ratio,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for CancerSignalDetector {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
/// Result from cancer signal detection
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct CancerSignalResult {
|
||||
/// Combined risk score (0.0 to 1.0)
|
||||
pub risk_score: f64,
|
||||
/// Whether the risk score exceeds the threshold
|
||||
pub is_elevated: bool,
|
||||
/// Raw methylation entropy
|
||||
pub entropy: f64,
|
||||
/// Fraction of extreme methylation sites
|
||||
pub extreme_ratio: f64,
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// `count` consecutive positions (0, 1, 2, ...) on chromosome 1.
    fn chr1_positions(count: u64) -> Vec<(u8, u64)> {
        (0..count).map(|pos| (1u8, pos)).collect()
    }

    #[test]
    fn test_methylation_profile() {
        let profile =
            MethylationProfile::from_beta_values(vec![(1, 1000), (1, 2000)], vec![0.3, 0.7]);

        assert_eq!(profile.sites.len(), 2);
        assert!((profile.mean_methylation() - 0.5).abs() < 0.001);
    }

    #[test]
    fn test_horvath_clock() {
        let profile = MethylationProfile::from_beta_values(
            vec![(1, 1000), (1, 2000), (1, 3000)],
            vec![0.5, 0.5, 0.5],
        );
        let age = HorvathClock::default_clock().predict_age(&profile);
        assert!(age > 0.0);
    }

    #[test]
    fn test_age_acceleration() {
        let accel = HorvathClock::age_acceleration(55.0, 50.0);
        assert!((accel - 5.0).abs() < 0.001);

        let decel = HorvathClock::age_acceleration(40.0, 50.0);
        assert!((decel - (-10.0)).abs() < 0.001);
    }

    #[test]
    fn test_methylation_entropy() {
        // Every site in the same bin = low entropy
        let uniform = MethylationProfile::from_beta_values(chr1_positions(100), vec![0.5; 100]);
        let entropy = uniform.methylation_entropy();
        assert!(
            entropy < 0.1,
            "Uniform should have low entropy: {}",
            entropy
        );

        // Sites spread across all bins = high entropy
        let spread_betas: Vec<f32> = (0..100).map(|i| i as f32 / 100.0).collect();
        let spread = MethylationProfile::from_beta_values(chr1_positions(100), spread_betas);
        let entropy2 = spread.methylation_entropy();
        assert!(
            entropy2 > 1.0,
            "Spread should have high entropy: {}",
            entropy2
        );
    }

    #[test]
    fn test_cancer_signal_detector() {
        let detector = CancerSignalDetector::new();

        // Normal profile (moderate methylation)
        let normal = MethylationProfile::from_beta_values(chr1_positions(100), vec![0.5; 100]);
        let result = detector.detect(&normal);
        assert!(!result.is_elevated, "Normal profile should not be elevated");
        assert!(result.risk_score < 0.3);

        // Cancerous profile (extreme methylation)
        let extreme_betas: Vec<f32> = (0..100)
            .map(|i| if i % 2 == 0 { 0.95 } else { 0.05 })
            .collect();
        let cancerous = MethylationProfile::from_beta_values(chr1_positions(100), extreme_betas);
        let result2 = detector.detect(&cancerous);
        assert!(result2.is_elevated, "Cancer profile should be elevated");
        assert!(result2.extreme_ratio > 0.8);
    }
}
|
||||
58
vendor/ruvector/examples/dna/src/error.rs
vendored
Normal file
58
vendor/ruvector/examples/dna/src/error.rs
vendored
Normal file
@@ -0,0 +1,58 @@
|
||||
//! Error types for DNA analysis operations
|
||||
|
||||
use thiserror::Error;
|
||||
|
||||
/// DNA analysis error types
|
||||
#[derive(Error, Debug)]
|
||||
pub enum DnaError {
|
||||
/// Invalid DNA sequence (e.g., non-ACGTN characters)
|
||||
#[error("Invalid DNA sequence: {0}")]
|
||||
InvalidSequence(String),
|
||||
|
||||
/// K-mer indexing error
|
||||
#[error("K-mer index error: {0}")]
|
||||
IndexError(String),
|
||||
|
||||
/// Sequence alignment error
|
||||
#[error("Alignment error: {0}")]
|
||||
AlignmentError(String),
|
||||
|
||||
/// Variant calling error
|
||||
#[error("Variant calling error: {0}")]
|
||||
VariantCallError(String),
|
||||
|
||||
/// Analysis pipeline error
|
||||
#[error("Pipeline error: {0}")]
|
||||
PipelineError(String),
|
||||
|
||||
/// I/O error
|
||||
#[error("I/O error: {0}")]
|
||||
IoError(#[from] std::io::Error),
|
||||
|
||||
/// RuVector core error
|
||||
#[error("Vector database error: {0}")]
|
||||
VectorDbError(#[from] ruvector_core::RuvectorError),
|
||||
|
||||
/// Dimension mismatch
|
||||
#[error("Dimension mismatch: expected {expected}, got {actual}")]
|
||||
DimensionMismatch { expected: usize, actual: usize },
|
||||
|
||||
/// Empty sequence
|
||||
#[error("Empty sequence provided")]
|
||||
EmptySequence,
|
||||
|
||||
/// Invalid quality score
|
||||
#[error("Invalid quality score: {0}")]
|
||||
InvalidQuality(u8),
|
||||
|
||||
/// Invalid k-mer size
|
||||
#[error("Invalid k-mer size: {0}")]
|
||||
InvalidKmerSize(usize),
|
||||
|
||||
/// 23andMe file parse error
|
||||
#[error("Parse error: {0}")]
|
||||
ParseError(String),
|
||||
}
|
||||
|
||||
/// Result type for DNA analysis operations
|
||||
pub type Result<T> = std::result::Result<T, DnaError>;
|
||||
1124
vendor/ruvector/examples/dna/src/genotyping.rs
vendored
Normal file
1124
vendor/ruvector/examples/dna/src/genotyping.rs
vendored
Normal file
File diff suppressed because it is too large
Load Diff
686
vendor/ruvector/examples/dna/src/health.rs
vendored
Normal file
686
vendor/ruvector/examples/dna/src/health.rs
vendored
Normal file
@@ -0,0 +1,686 @@
|
||||
//! Health variant analysis for genotyping data
|
||||
//!
|
||||
//! Clinically significant variant interpretation for 17+ health-relevant
|
||||
//! SNPs commonly found in 23andMe/genotyping panels. Covers APOE, BRCA1/2,
|
||||
//! TP53, MTHFR, COMT, OPRM1, CYP1A2, and more.
|
||||
//!
|
||||
//! Based on: <https://github.com/ericporres/rvdna-bridge>
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
|
||||
/// Result of analyzing a single health variant
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct HealthVariantResult {
|
||||
/// rsid identifier
|
||||
pub rsid: String,
|
||||
/// Gene name
|
||||
pub gene: String,
|
||||
/// Variant common name
|
||||
pub name: String,
|
||||
/// Observed genotype
|
||||
pub genotype: String,
|
||||
/// Risk allele
|
||||
pub risk_allele: char,
|
||||
/// Human-readable interpretation
|
||||
pub interpretation: String,
|
||||
/// Clinical significance
|
||||
pub clinical_significance: String,
|
||||
}
|
||||
|
||||
/// APOE genotype determination result
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ApoeResult {
|
||||
/// Full APOE genotype string (e.g., "e2/e3")
|
||||
pub genotype: String,
|
||||
/// rs429358 genotype
|
||||
pub rs429358: String,
|
||||
/// rs7412 genotype
|
||||
pub rs7412: String,
|
||||
}
|
||||
|
||||
/// MTHFR compound status
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct MthfrResult {
|
||||
/// C677T genotype (rs1801133)
|
||||
pub c677t: String,
|
||||
/// A1298C genotype (rs1801131)
|
||||
pub a1298c: String,
|
||||
/// Compound risk score (0-4)
|
||||
pub score: u8,
|
||||
/// Clinical assessment text
|
||||
pub assessment: String,
|
||||
}
|
||||
|
||||
/// Pain sensitivity profile (COMT + OPRM1)
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct PainProfile {
|
||||
/// COMT genotype (rs4680)
|
||||
pub comt: String,
|
||||
/// OPRM1 genotype (rs1799971)
|
||||
pub oprm1: String,
|
||||
/// Combined pain score (0-4)
|
||||
pub score: u8,
|
||||
/// Sensitivity label
|
||||
pub label: String,
|
||||
/// COMT interpretation
|
||||
pub comt_note: String,
|
||||
/// OPRM1 interpretation
|
||||
pub oprm1_note: String,
|
||||
}
|
||||
|
||||
// ── Internal definition type ──
|
||||
|
||||
/// Static definition of one health variant and its genotype interpretations.
struct VDef {
    /// Variant identifier used as the lookup key in genotype maps
    rsid: &'static str,
    /// Gene name
    gene: &'static str,
    /// Variant common name
    name: &'static str,
    /// Allele regarded as the risk/effect allele
    risk_allele: char,
    /// (genotype, description, significance)
    interps: &'static [(&'static str, &'static str, &'static str)],
}

/// Interpretation table for the supported health variants.
///
/// Genotype keys are upper-case with the two alleles in alphabetical order.
static HEALTH_VARIANTS: &[VDef] = &[
    // ── APOE (Alzheimer's) ──
    VDef {
        rsid: "rs429358",
        gene: "APOE",
        name: "APOE e4 determinant",
        risk_allele: 'C',
        interps: &[
            (
                "TT",
                "APOE e3/e3 or e2/e3 (depends on rs7412)",
                "Protective/Normal",
            ),
            (
                "CT",
                "One e4 allele present",
                "Increased Alzheimer's risk (~3x)",
            ),
            (
                "CC",
                "Two e4 alleles present",
                "Significantly increased Alzheimer's risk (~12x)",
            ),
        ],
    },
    VDef {
        rsid: "rs7412",
        gene: "APOE",
        name: "APOE e2 determinant",
        risk_allele: 'T',
        interps: &[
            ("CC", "No e2 allele", "Normal"),
            (
                "CT",
                "One e2 allele present",
                "Protective - reduced Alzheimer's risk",
            ),
            ("TT", "Two e2 alleles (e2/e2)", "Protective; monitor lipids"),
        ],
    },
    // ── TP53 (cancer) ──
    VDef {
        rsid: "rs1042522",
        gene: "TP53",
        name: "p53 Pro72Arg (R72P)",
        risk_allele: 'G',
        interps: &[
            (
                "CC",
                "Pro/Pro homozygous",
                "Normal apoptosis; slightly increased cancer survival",
            ),
            (
                "CG",
                "Pro/Arg heterozygous",
                "Mixed - Arg allele has stronger apoptotic activity",
            ),
            (
                "GG",
                "Arg/Arg homozygous",
                "Stronger apoptotic response; variable cancer risk",
            ),
        ],
    },
    // ── BRCA1 ──
    VDef {
        rsid: "rs80357906",
        gene: "BRCA1",
        name: "BRCA1 5382insC (Ashkenazi founder)",
        risk_allele: 'I',
        interps: &[
            (
                "DD",
                "No insertion detected",
                "Normal - no BRCA1 5382insC mutation",
            ),
            (
                "DI",
                "Heterozygous carrier",
                "INCREASED breast/ovarian cancer risk - genetic counseling recommended",
            ),
            (
                "II",
                "Homozygous insertion",
                "HIGH breast/ovarian cancer risk - urgent genetic counseling",
            ),
        ],
    },
    VDef {
        rsid: "rs28897696",
        gene: "BRCA1",
        name: "BRCA1 missense variant",
        risk_allele: 'A',
        interps: &[
            ("GG", "Reference genotype", "Normal"),
            (
                "AG",
                "Heterozygous",
                "Variant of uncertain significance - consult genetic counselor",
            ),
            ("AA", "Homozygous variant", "Consult genetic counselor"),
        ],
    },
    // ── BRCA2 ──
    VDef {
        rsid: "rs11571833",
        gene: "BRCA2",
        name: "BRCA2 K3326X",
        risk_allele: 'T',
        interps: &[
            ("AA", "Reference genotype", "Normal"),
            (
                "AT",
                "Heterozygous",
                "Modestly increased cancer risk (OR ~1.3)",
            ),
            (
                "TT",
                "Homozygous variant",
                "Increased cancer risk - genetic counseling recommended",
            ),
        ],
    },
    // ── MTHFR (folate metabolism) ──
    VDef {
        rsid: "rs1801133",
        gene: "MTHFR",
        name: "C677T",
        risk_allele: 'A',
        interps: &[
            (
                "GG",
                "CC genotype (normal)",
                "Normal MTHFR enzyme activity (100%)",
            ),
            (
                "AG",
                "CT heterozygous",
                "Reduced enzyme activity (~65%). Consider methylfolate.",
            ),
            (
                "AA",
                "TT homozygous",
                "Significantly reduced activity (~30%). Methylfolate recommended.",
            ),
        ],
    },
    VDef {
        rsid: "rs1801131",
        gene: "MTHFR",
        name: "A1298C",
        // G is the variant (reduced-activity) allele in this table's
        // orientation; T is the reference, as the interpretations below show.
        risk_allele: 'G',
        interps: &[
            ("GG", "CC homozygous variant", "Reduced enzyme activity"),
            ("GT", "AC heterozygous", "Mildly reduced enzyme activity"),
            (
                "TT",
                "AA reference",
                "Normal MTHFR activity at this position",
            ),
        ],
    },
    // ── COMT (dopamine/pain) ──
    VDef {
        rsid: "rs4680",
        gene: "COMT",
        name: "Val158Met",
        risk_allele: 'A',
        interps: &[
            (
                "GG",
                "Val/Val",
                "Higher COMT activity, lower dopamine. Better stress resilience.",
            ),
            (
                "AG",
                "Val/Met heterozygous",
                "Intermediate COMT activity. Balanced dopamine.",
            ),
            (
                "AA",
                "Met/Met",
                "Lower COMT activity, higher dopamine. Higher pain sensitivity.",
            ),
        ],
    },
    // ── OPRM1 (opioid receptor) ──
    VDef {
        rsid: "rs1799971",
        gene: "OPRM1",
        name: "A118G (Asn40Asp)",
        risk_allele: 'G',
        interps: &[
            ("AA", "Asn/Asn", "Normal opioid sensitivity"),
            (
                "AG",
                "Asn/Asp heterozygous",
                "Reduced opioid sensitivity; may need higher doses.",
            ),
            ("GG", "Asp/Asp", "Significantly reduced opioid sensitivity."),
        ],
    },
    // ── CYP1A2 (caffeine) ──
    VDef {
        rsid: "rs762551",
        gene: "CYP1A2",
        name: "Caffeine metabolism",
        risk_allele: 'C',
        interps: &[
            (
                "AA",
                "Fast metabolizer",
                "Rapid caffeine clearance. Coffee may REDUCE heart disease risk.",
            ),
            (
                "AC",
                "Intermediate",
                "Moderate caffeine clearance. Moderate coffee intake recommended.",
            ),
            (
                "CC",
                "Slow metabolizer",
                "Slow caffeine clearance. Excess coffee may INCREASE heart risk.",
            ),
        ],
    },
    // ── Lactose ──
    VDef {
        rsid: "rs4988235",
        gene: "MCM6/LCT",
        name: "Lactase persistence (European)",
        risk_allele: 'G',
        interps: &[
            (
                "AA",
                "Lactase persistent",
                "Likely lactose TOLERANT into adulthood",
            ),
            (
                "AG",
                "Heterozygous",
                "Likely lactose tolerant (persistence is dominant)",
            ),
            (
                "GG",
                "Lactase non-persistent",
                "Likely lactose INTOLERANT in adulthood",
            ),
        ],
    },
    // ── OXTR (oxytocin receptor) ──
    VDef {
        rsid: "rs53576",
        gene: "OXTR",
        name: "Oxytocin receptor",
        risk_allele: 'A',
        interps: &[
            (
                "GG",
                "GG genotype",
                "Higher empathy scores; better social cognition.",
            ),
            (
                "AG",
                "AG heterozygous",
                "Intermediate empathy and social cognition.",
            ),
            (
                "AA",
                "AA genotype",
                "May have lower empathy; potentially more resilient to social stress.",
            ),
        ],
    },
    // ── HTR2A (serotonin) ──
    VDef {
        rsid: "rs6311",
        gene: "HTR2A",
        name: "Serotonin 2A receptor (-1438G/A)",
        risk_allele: 'T',
        interps: &[
            ("CC", "GG genotype", "Normal serotonin receptor expression"),
            (
                "CT",
                "GA heterozygous",
                "Slightly altered serotonin signaling",
            ),
            (
                "TT",
                "AA genotype",
                "Altered serotonin receptor density; may affect SSRI response",
            ),
        ],
    },
    // ── ANKK1/DRD2 (dopamine) ──
    VDef {
        rsid: "rs1800497",
        gene: "ANKK1/DRD2",
        name: "Taq1A (dopamine receptor)",
        risk_allele: 'A',
        interps: &[
            ("GG", "A2/A2", "Normal dopamine receptor density"),
            (
                "AG",
                "A1/A2 heterozygous",
                "Reduced D2 receptor density (~30% less). Reward-seeking.",
            ),
            (
                "AA",
                "A1/A1",
                "Significantly reduced D2 receptor density. Higher addiction risk.",
            ),
        ],
    },
    // ── SLCO1B1 (statin metabolism) ──
    VDef {
        rsid: "rs4363657",
        gene: "SLCO1B1",
        name: "Statin transporter",
        risk_allele: 'C',
        interps: &[
            (
                "TT",
                "Reference",
                "Normal statin metabolism. Standard dosing.",
            ),
            (
                "CT",
                "Heterozygous",
                "Increased statin myopathy risk (~4.5x). Consider lower dose.",
            ),
            (
                "CC",
                "Homozygous variant",
                "High statin myopathy risk (~17x). Use lowest effective dose.",
            ),
        ],
    },
    // ── NQO1 (oxidative stress) ──
    VDef {
        rsid: "rs1800566",
        gene: "NQO1",
        name: "Pro187Ser (oxidative stress)",
        risk_allele: 'T',
        interps: &[
            ("CC", "Pro/Pro (reference)", "Normal NQO1 enzyme activity"),
            (
                "CT",
                "Pro/Ser heterozygous",
                "Reduced NQO1 activity (~3x lower). Impaired detox.",
            ),
            (
                "TT",
                "Ser/Ser",
                "No NQO1 activity. Significantly impaired quinone detoxification.",
            ),
        ],
    },
];
|
||||
|
||||
/// Analyze health variants from a genotype map (rsid -> genotype string).
|
||||
pub fn analyze_health_variants(genotypes: &HashMap<String, String>) -> Vec<HealthVariantResult> {
|
||||
let mut results = Vec::new();
|
||||
|
||||
for def in HEALTH_VARIANTS {
|
||||
if let Some(gt) = genotypes.get(def.rsid) {
|
||||
let (desc, sig) = def
|
||||
.interps
|
||||
.iter()
|
||||
.find(|(g, _, _)| *g == gt.as_str())
|
||||
.map(|(_, d, s)| (d.to_string(), s.to_string()))
|
||||
.unwrap_or_else(|| {
|
||||
(
|
||||
format!("Genotype {} - not in standard table", gt),
|
||||
"Consult genetic counselor".to_string(),
|
||||
)
|
||||
});
|
||||
|
||||
results.push(HealthVariantResult {
|
||||
rsid: def.rsid.to_string(),
|
||||
gene: def.gene.to_string(),
|
||||
name: def.name.to_string(),
|
||||
genotype: gt.clone(),
|
||||
risk_allele: def.risk_allele,
|
||||
interpretation: desc,
|
||||
clinical_significance: sig,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
results
|
||||
}
|
||||
|
||||
/// Determine APOE genotype from rs429358 + rs7412 combination.
|
||||
pub fn determine_apoe(genotypes: &HashMap<String, String>) -> ApoeResult {
|
||||
let gt1 = genotypes.get("rs429358").cloned().unwrap_or_default();
|
||||
let gt2 = genotypes.get("rs7412").cloned().unwrap_or_default();
|
||||
|
||||
if gt1.is_empty() || gt2.is_empty() {
|
||||
return ApoeResult {
|
||||
genotype: "Unable to determine (missing data)".into(),
|
||||
rs429358: gt1,
|
||||
rs7412: gt2,
|
||||
};
|
||||
}
|
||||
|
||||
// e4 alleles = count of 'C' at rs429358
|
||||
let e4 = gt1.chars().filter(|&c| c == 'C').count();
|
||||
// e2 alleles = count of 'T' at rs7412
|
||||
let e2 = gt2.chars().filter(|&c| c == 'T').count();
|
||||
|
||||
let genotype = match (e4, e2) {
|
||||
(0, 0) => "e3/e3 (most common, baseline risk)".into(),
|
||||
(0, 1) => "e2/e3 (PROTECTIVE - reduced Alzheimer's risk)".into(),
|
||||
(0, 2) => "e2/e2 (protective; monitor for type III hyperlipoproteinemia)".into(),
|
||||
(1, 0) => "e3/e4 (increased Alzheimer's risk ~3x)".into(),
|
||||
(1, 1) => "e2/e4 (mixed - e2 partially offsets e4 risk)".into(),
|
||||
(2, _) => "e4/e4 (significantly increased Alzheimer's risk ~12x)".into(),
|
||||
_ => format!("Unusual combination: rs429358={}, rs7412={}", gt1, gt2),
|
||||
};
|
||||
|
||||
ApoeResult {
|
||||
genotype,
|
||||
rs429358: gt1,
|
||||
rs7412: gt2,
|
||||
}
|
||||
}
|
||||
|
||||
/// Analyze MTHFR compound status from C677T + A1298C.
|
||||
pub fn analyze_mthfr(genotypes: &HashMap<String, String>) -> MthfrResult {
|
||||
let c677t = genotypes.get("rs1801133").cloned().unwrap_or_default();
|
||||
let a1298c = genotypes.get("rs1801131").cloned().unwrap_or_default();
|
||||
|
||||
if c677t.is_empty() || a1298c.is_empty() {
|
||||
return MthfrResult {
|
||||
c677t,
|
||||
a1298c,
|
||||
score: 0,
|
||||
assessment: "Incomplete MTHFR data".into(),
|
||||
};
|
||||
}
|
||||
|
||||
let c_risk = match c677t.as_str() {
|
||||
"GG" => 0u8,
|
||||
"AG" => 1,
|
||||
"AA" => 2,
|
||||
_ => 0,
|
||||
};
|
||||
let a_risk = match a1298c.as_str() {
|
||||
"TT" => 0u8,
|
||||
"GT" => 1,
|
||||
"GG" => 2,
|
||||
_ => 0,
|
||||
};
|
||||
let score = c_risk + a_risk;
|
||||
|
||||
let assessment = match score {
|
||||
0 => "Normal MTHFR function. No supplementation needed.",
|
||||
1 => "Mildly reduced MTHFR. Consider methylfolate if homocysteine elevated.",
|
||||
2 => "Moderately reduced MTHFR. Methylfolate (L-5-MTHF) recommended.",
|
||||
3 => "Significantly reduced MTHFR (compound heterozygote). Methylfolate strongly recommended.",
|
||||
_ => "Severely reduced MTHFR. Methylfolate essential. Regular homocysteine monitoring.",
|
||||
};
|
||||
|
||||
MthfrResult {
|
||||
c677t,
|
||||
a1298c,
|
||||
score,
|
||||
assessment: assessment.into(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Analyze pain sensitivity profile from COMT + OPRM1.
|
||||
pub fn analyze_pain(genotypes: &HashMap<String, String>) -> Option<PainProfile> {
|
||||
let comt = genotypes.get("rs4680")?;
|
||||
let oprm1 = genotypes.get("rs1799971")?;
|
||||
|
||||
let mut score = 0u8;
|
||||
if comt == "AA" {
|
||||
score += 2;
|
||||
} else if comt == "AG" {
|
||||
score += 1;
|
||||
}
|
||||
if oprm1 == "GG" {
|
||||
score += 2;
|
||||
} else if oprm1 == "AG" {
|
||||
score += 1;
|
||||
}
|
||||
|
||||
let label = match score {
|
||||
0 => "Low",
|
||||
1 => "Low-Moderate",
|
||||
2 => "Moderate",
|
||||
3 => "Moderate-High",
|
||||
_ => "High",
|
||||
};
|
||||
|
||||
let comt_note = if comt.contains('A') {
|
||||
"Higher pain sensitivity"
|
||||
} else {
|
||||
"Lower pain sensitivity"
|
||||
};
|
||||
let oprm1_note = if oprm1.contains('G') {
|
||||
"Reduced opioid response"
|
||||
} else {
|
||||
"Normal opioid response"
|
||||
};
|
||||
|
||||
Some(PainProfile {
|
||||
comt: comt.clone(),
|
||||
oprm1: oprm1.clone(),
|
||||
score,
|
||||
label: label.into(),
|
||||
comt_note: comt_note.into(),
|
||||
oprm1_note: oprm1_note.into(),
|
||||
})
|
||||
}
|
||||
|
||||
/// Display categories for health variants, each paired with the gene names
/// that belong to it.
pub fn variant_categories() -> Vec<(&'static str, Vec<&'static str>)> {
    const CATEGORIES: [(&str, &[&str]); 4] = [
        ("Cancer Risk", &["TP53", "BRCA1", "BRCA2", "NQO1"]),
        ("Cardiovascular", &["SLCO1B1"]),
        (
            "Neurological",
            &["APOE", "COMT", "OPRM1", "OXTR", "HTR2A", "ANKK1/DRD2"],
        ),
        ("Metabolism", &["MTHFR", "CYP1A2", "MCM6/LCT"]),
    ];

    CATEGORIES
        .iter()
        .map(|&(category, genes)| (category, genes.to_vec()))
        .collect()
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Build an owned rsid -> genotype map from string pairs.
    fn make_map(pairs: &[(&str, &str)]) -> HashMap<String, String> {
        pairs
            .iter()
            .map(|(k, v)| (k.to_string(), v.to_string()))
            .collect()
    }

    #[test]
    fn test_apoe_e3e3() {
        let gts = make_map(&[("rs429358", "TT"), ("rs7412", "CC")]);
        let r = determine_apoe(&gts);
        assert!(r.genotype.contains("e3/e3"));
    }

    #[test]
    fn test_apoe_e2e3() {
        let gts = make_map(&[("rs429358", "TT"), ("rs7412", "CT")]);
        let r = determine_apoe(&gts);
        assert!(r.genotype.contains("e2/e3"));
    }

    #[test]
    fn test_apoe_e4e4() {
        let gts = make_map(&[("rs429358", "CC"), ("rs7412", "CC")]);
        let r = determine_apoe(&gts);
        assert!(r.genotype.contains("e4/e4"));
    }

    #[test]
    fn test_mthfr_normal() {
        let gts = make_map(&[("rs1801133", "GG"), ("rs1801131", "TT")]);
        let r = analyze_mthfr(&gts);
        assert_eq!(r.score, 0);
        assert!(r.assessment.contains("Normal"));
    }

    #[test]
    fn test_mthfr_compound() {
        let gts = make_map(&[("rs1801133", "AG"), ("rs1801131", "GG")]);
        let r = analyze_mthfr(&gts);
        assert_eq!(r.score, 3);
        assert!(r.assessment.contains("compound"));
    }

    #[test]
    fn test_pain_low() {
        let gts = make_map(&[("rs4680", "GG"), ("rs1799971", "AA")]);
        let p = analyze_pain(&gts).unwrap();
        assert_eq!(p.score, 0);
        assert_eq!(p.label, "Low");
    }

    #[test]
    fn test_pain_high() {
        let gts = make_map(&[("rs4680", "AA"), ("rs1799971", "GG")]);
        let p = analyze_pain(&gts).unwrap();
        assert_eq!(p.score, 4);
        assert_eq!(p.label, "High");
    }

    #[test]
    fn test_health_variants_lookup() {
        let gts = make_map(&[("rs762551", "AA"), ("rs4680", "AG")]);
        let results = analyze_health_variants(&gts);
        assert_eq!(results.len(), 2);
        // Results follow the order of the variant table, not the input map.
        assert_eq!(results[0].gene, "COMT");
        assert_eq!(results[1].gene, "CYP1A2");
    }
}
|
||||
511
vendor/ruvector/examples/dna/src/kmer.rs
vendored
Normal file
511
vendor/ruvector/examples/dna/src/kmer.rs
vendored
Normal file
@@ -0,0 +1,511 @@
|
||||
//! K-mer encoding and HNSW vector indexing for DNA sequences
|
||||
//!
|
||||
//! This module provides efficient k-mer based vector encoding for DNA sequences
|
||||
//! with HNSW indexing for fast similarity search. Implements both k-mer frequency
|
||||
//! vectors and MinHash sketching (Mash/sourmash algorithm).
|
||||
|
||||
use ruvector_core::{
|
||||
types::{DbOptions, DistanceMetric, HnswConfig, QuantizationConfig, SearchQuery},
|
||||
VectorDB, VectorEntry,
|
||||
};
|
||||
use std::collections::HashMap;
|
||||
use thiserror::Error;
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum KmerError {
|
||||
#[error("Invalid k-mer length: {0}")]
|
||||
InvalidKmerLength(usize),
|
||||
#[error("Invalid DNA sequence: {0}")]
|
||||
InvalidSequence(String),
|
||||
#[error("Database error: {0}")]
|
||||
DatabaseError(#[from] ruvector_core::RuvectorError),
|
||||
#[error("Empty sequence")]
|
||||
EmptySequence,
|
||||
}
|
||||
|
||||
type Result<T> = std::result::Result<T, KmerError>;
|
||||
|
||||
/// Map a nucleotide byte to its 2-bit code: A=0, C=1, G=2, T=3.
///
/// Matching is case-insensitive and `U` is treated as `T`. Any other byte
/// yields `None`.
#[inline]
fn nucleotide_to_bits(nuc: u8) -> Option<u8> {
    // Fold case and RNA's U onto the four DNA letters, then use the
    // letter's position in "ACGT" as its code.
    let base = match nuc.to_ascii_uppercase() {
        b'U' => b'T',
        other => other,
    };
    b"ACGT"
        .iter()
        .position(|&candidate| candidate == base)
        .map(|code| code as u8)
}
|
||||
|
||||
/// Returns the reverse complement of a DNA sequence.
///
/// The output is upper-case: A<->T, C<->G, and `U` is complemented to `A`.
/// Bytes that are not A/C/G/T/U are kept (upper-cased if they are ASCII
/// letters).
fn reverse_complement(seq: &[u8]) -> Vec<u8> {
    seq.iter()
        .rev()
        .map(|&nuc| match nuc.to_ascii_uppercase() {
            b'A' => b'T',
            b'T' | b'U' => b'A',
            b'C' => b'G',
            b'G' => b'C',
            n => n,
        })
        .collect()
}

/// Returns the canonical k-mer (lexicographically smaller of k-mer and its reverse complement).
///
/// The k-mer is upper-cased before the comparison so that both strands are
/// compared in the same case; the returned k-mer is therefore always
/// upper-case. Comparing a lower-case k-mer against its upper-case reverse
/// complement would pick the wrong strand, since every lower-case byte sorts
/// after every upper-case byte.
pub fn canonical_kmer(kmer: &[u8]) -> Vec<u8> {
    let forward = kmer.to_ascii_uppercase();
    let rc = reverse_complement(&forward);
    if forward <= rc {
        forward
    } else {
        rc
    }
}
|
||||
|
||||
/// Encoder that turns DNA sequences into k-mer frequency vectors
pub struct KmerEncoder {
    /// K-mer length
    k: usize,
    /// Length of the encoded vector
    dimensions: usize,
}
|
||||
|
||||
impl KmerEncoder {
|
||||
/// Create a new k-mer encoder for k-mers of length k
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `k` - Length of k-mers (typical values: 21, 31)
|
||||
///
|
||||
/// Uses feature hashing to limit dimensionality for large k
|
||||
pub fn new(k: usize) -> Result<Self> {
|
||||
if k == 0 || k > 32 {
|
||||
return Err(KmerError::InvalidKmerLength(k));
|
||||
}
|
||||
|
||||
// Calculate dimensions: min(4^k, 1024) using feature hashing
|
||||
let max_kmers = 4_usize.saturating_pow(k as u32);
|
||||
let dimensions = max_kmers.min(1024);
|
||||
|
||||
Ok(Self { k, dimensions })
|
||||
}
|
||||
|
||||
/// Get the number of dimensions in the encoded vector
///
/// This is the length of every vector returned by `encode_sequence`.
pub fn dimensions(&self) -> usize {
    self.dimensions
}
|
||||
|
||||
/// Encode a DNA sequence into a k-mer frequency vector
|
||||
///
|
||||
/// Uses canonical k-mer hashing (min of forward/reverse-complement hash)
|
||||
/// to count strand-agnostic k-mers, then normalizes to unit vector.
|
||||
pub fn encode_sequence(&self, seq: &[u8]) -> Result<Vec<f32>> {
|
||||
if seq.len() < self.k {
|
||||
return Err(KmerError::EmptySequence);
|
||||
}
|
||||
|
||||
let mut counts = vec![0u32; self.dimensions];
|
||||
let mut total = 0u32;
|
||||
|
||||
// Extract all k-mers using a sliding window
|
||||
// Avoid Vec allocation by hashing both strands and taking min
|
||||
for window in seq.windows(self.k) {
|
||||
let fwd_hash = Self::fnv1a_hash(window);
|
||||
let rc_hash = Self::fnv1a_hash_rc(window);
|
||||
let canonical_hash = fwd_hash.min(rc_hash);
|
||||
let index = canonical_hash % self.dimensions;
|
||||
|
||||
counts[index] = counts[index].saturating_add(1);
|
||||
total = total.saturating_add(1);
|
||||
}
|
||||
|
||||
// Normalize to frequency vector and then to unit vector
|
||||
let inv_total = 1.0 / total as f32;
|
||||
let mut vector: Vec<f32> = counts
|
||||
.iter()
|
||||
.map(|&count| count as f32 * inv_total)
|
||||
.collect();
|
||||
|
||||
// L2 normalization
|
||||
let norm: f32 = vector.iter().map(|x| x * x).sum::<f32>().sqrt();
|
||||
if norm > 0.0 {
|
||||
let inv_norm = 1.0 / norm;
|
||||
vector.iter_mut().for_each(|x| *x *= inv_norm);
|
||||
}
|
||||
|
||||
Ok(vector)
|
||||
}
|
||||
|
||||
/// FNV-1a hash of a k-mer's forward strand.
///
/// Bases are normalized before hashing (upper-cased, `U` treated as `T`) so
/// that this function uses the same alphabet as `fnv1a_hash_rc`. With that,
/// the forward hash of a k-mer equals the reverse-complement hash of its
/// reverse complement, which is what makes `fwd.min(rc)` strand-agnostic for
/// lower-case and RNA input as well as upper-case DNA.
///
/// Upper-case `A`/`C`/`G`/`T` input hashes exactly as plain FNV-1a over the bytes.
#[inline]
fn fnv1a_hash(data: &[u8]) -> usize {
    const FNV_OFFSET: u64 = 14695981039346656037;
    const FNV_PRIME: u64 = 1099511628211;
    let mut hash = FNV_OFFSET;
    for &byte in data {
        let base = match byte.to_ascii_uppercase() {
            b'U' => b'T',
            n => n,
        };
        hash ^= base as u64;
        hash = hash.wrapping_mul(FNV_PRIME);
    }
    hash as usize
}
|
||||
|
||||
/// FNV-1a hash of the reverse complement of `data`, computed on the fly.
///
/// The slice is walked back to front and each base is upper-cased and
/// complemented (`A`<->`T`, `C`<->`G`, `U` -> `A`) right before it is mixed
/// into the hash, so no reverse-complement buffer is allocated. Bytes outside
/// that alphabet are hashed upper-cased.
#[inline]
fn fnv1a_hash_rc(data: &[u8]) -> usize {
    const FNV_OFFSET: u64 = 14695981039346656037;
    const FNV_PRIME: u64 = 1099511628211;

    let digest = data.iter().rev().fold(FNV_OFFSET, |acc, base| {
        let paired = match base.to_ascii_uppercase() {
            b'A' => b'T',
            b'C' => b'G',
            b'G' => b'C',
            b'T' | b'U' => b'A',
            other => other,
        };
        (acc ^ u64::from(paired)).wrapping_mul(FNV_PRIME)
    });

    digest as usize
}
|
||||
|
||||
/// Hash a k-mer to an index using FNV-1a hash
///
/// Thin wrapper over `fnv1a_hash`: the returned value is the raw forward-strand
/// hash and is NOT reduced modulo `dimensions`, so callers must do that
/// themselves before using it as a bucket index.
fn hash_kmer(&self, kmer: &[u8]) -> usize {
    Self::fnv1a_hash(kmer)
}
|
||||
}
|
||||
|
||||
/// MinHash sketch for fast sequence similarity (Mash/sourmash algorithm)
pub struct MinHashSketch {
    /// Maximum number of hash values retained by `sketch`
    num_hashes: usize,
    /// Hash values from the most recent `sketch` call, in ascending order;
    /// empty until `sketch` has been called
    hashes: Vec<u64>,
}
|
||||
|
||||
impl MinHashSketch {
|
||||
/// Create a new MinHash sketch with the given number of hashes
///
/// The sketch starts empty; call `sketch` to populate it.
///
/// # Arguments
/// * `num_hashes` - Number of hash values to keep (typically 1000)
pub fn new(num_hashes: usize) -> Self {
    Self {
        num_hashes,
        hashes: Vec::new(),
    }
}
|
||||
|
||||
/// Compute MinHash signature for a DNA sequence
|
||||
pub fn sketch(&mut self, seq: &[u8], k: usize) -> Result<&[u64]> {
|
||||
if seq.len() < k {
|
||||
return Err(KmerError::EmptySequence);
|
||||
}
|
||||
|
||||
let mut all_hashes = Vec::with_capacity(seq.len() - k + 1);
|
||||
|
||||
// Hash all k-mers using dual-hash (no Vec allocation per k-mer)
|
||||
for window in seq.windows(k) {
|
||||
let fwd = Self::hash_kmer_64_slice(window);
|
||||
let rc = Self::hash_kmer_64_rc(window);
|
||||
all_hashes.push(fwd.min(rc));
|
||||
}
|
||||
|
||||
// Sort and keep the smallest num_hashes values
|
||||
all_hashes.sort_unstable();
|
||||
all_hashes.truncate(self.num_hashes);
|
||||
self.hashes = all_hashes;
|
||||
|
||||
Ok(&self.hashes)
|
||||
}
|
||||
|
||||
/// Compute Jaccard distance between two MinHash sketches
|
||||
pub fn jaccard_distance(&self, other: &MinHashSketch) -> f32 {
|
||||
if self.hashes.is_empty() || other.hashes.is_empty() {
|
||||
return 1.0;
|
||||
}
|
||||
|
||||
let mut intersection = 0;
|
||||
let mut i = 0;
|
||||
let mut j = 0;
|
||||
|
||||
// Count intersection using sorted arrays
|
||||
while i < self.hashes.len() && j < other.hashes.len() {
|
||||
if self.hashes[i] == other.hashes[j] {
|
||||
intersection += 1;
|
||||
i += 1;
|
||||
j += 1;
|
||||
} else if self.hashes[i] < other.hashes[j] {
|
||||
i += 1;
|
||||
} else {
|
||||
j += 1;
|
||||
}
|
||||
}
|
||||
|
||||
let union = self.hashes.len() + other.hashes.len() - intersection;
|
||||
if union == 0 {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
let jaccard_similarity = intersection as f32 / union as f32;
|
||||
1.0 - jaccard_similarity
|
||||
}
|
||||
|
||||
/// Hash a k-mer using MurmurHash3-like algorithm (forward strand)
///
/// Bases are normalized before mixing (upper-cased, `U` treated as `T`) so
/// the forward hash uses the same alphabet as `hash_kmer_64_rc`. This keeps
/// `fwd.min(rc)` identical for a k-mer and its reverse complement even when
/// the input is lower-case or RNA. Upper-case `A`/`C`/`G`/`T` input hashes
/// exactly as before.
#[inline]
fn hash_kmer_64_slice(kmer: &[u8]) -> u64 {
    const C1: u64 = 0x87c37b91114253d5;
    const C2: u64 = 0x4cf5ad432745937f;
    let mut h = 0u64;
    for &byte in kmer {
        let base = match byte.to_ascii_uppercase() {
            b'U' => b'T',
            n => n,
        };
        let mut k = base as u64;
        k = k.wrapping_mul(C1);
        k = k.rotate_left(31);
        k = k.wrapping_mul(C2);
        h ^= k;
        h = h.rotate_left(27);
        h = h.wrapping_mul(5).wrapping_add(0x52dce729);
    }
    // Fold the length in so k-mers of different sizes do not trivially collide.
    h ^ kmer.len() as u64
}
|
||||
|
||||
/// 64-bit hash of the reverse complement of `kmer`, computed without a buffer.
///
/// Bases are visited back to front; each one is upper-cased and complemented
/// (`A`<->`T`, `C`<->`G`, `U` -> `A`, anything else unchanged) and then run
/// through the same mixing steps as `hash_kmer_64_slice`. The k-mer length is
/// XOR-ed into the final value.
#[inline]
fn hash_kmer_64_rc(kmer: &[u8]) -> u64 {
    const C1: u64 = 0x87c37b91114253d5;
    const C2: u64 = 0x4cf5ad432745937f;

    let mixed = kmer.iter().rev().fold(0u64, |state, base| {
        let paired = match base.to_ascii_uppercase() {
            b'A' => b'T',
            b'C' => b'G',
            b'G' => b'C',
            b'T' | b'U' => b'A',
            other => other,
        };
        let lane = u64::from(paired)
            .wrapping_mul(C1)
            .rotate_left(31)
            .wrapping_mul(C2);
        (state ^ lane)
            .rotate_left(27)
            .wrapping_mul(5)
            .wrapping_add(0x52dce729)
    });

    mixed ^ (kmer.len() as u64)
}
|
||||
|
||||
/// Get the hashes
///
/// Returns the values stored by the most recent `sketch` call (ascending
/// order), or an empty slice if `sketch` has not been called yet.
pub fn hashes(&self) -> &[u64] {
    &self.hashes
}
|
||||
}
|
||||
|
||||
/// Search result for k-mer index queries
#[derive(Debug, Clone)]
pub struct KmerSearchResult {
    /// Identifier of the matched entry, as returned by the vector database
    pub id: String,
    /// Score reported by the vector database for this match
    pub score: f32,
    /// Distance of the match from the query.
    /// NOTE(review): `KmerIndex::search_similar` fills this with the same value
    /// as `score` — confirm whether the database score is already a distance.
    pub distance: f32,
}
|
||||
|
||||
/// K-mer index wrapping VectorDB for sequence similarity search
pub struct KmerIndex {
    /// Underlying vector database holding one encoded vector per indexed sequence
    db: VectorDB,
    /// Encoder used to turn sequences (and queries) into k-mer vectors
    encoder: KmerEncoder,
    /// K-mer length this index was created with
    k: usize,
}
|
||||
|
||||
impl KmerIndex {
|
||||
/// Create a new k-mer index
///
/// # Arguments
/// * `k` - K-mer length
/// * `dimensions` - Vector dimensions (should match encoder dimensions)
///
/// # Errors
/// * `KmerError::InvalidKmerLength` when `k` is rejected by `KmerEncoder::new`,
///   and also when `dimensions` differs from the encoder's dimensions.
/// * Any error returned by `VectorDB::new` (converted via `?`).
pub fn new(k: usize, dimensions: usize) -> Result<Self> {
    let encoder = KmerEncoder::new(k)?;

    // Verify dimensions match
    // NOTE(review): a mismatch is reported as `InvalidKmerLength(k)` even though
    // `k` itself was accepted above; a dedicated error variant would be clearer.
    if encoder.dimensions() != dimensions {
        return Err(KmerError::InvalidKmerLength(k));
    }

    let options = DbOptions {
        dimensions,
        distance_metric: DistanceMetric::Cosine,
        // The path is relative and depends only on `k`, so two indexes created
        // with the same `k` from the same working directory share this path.
        storage_path: format!("./kmer_index_k{}.db", k),
        hnsw_config: Some(HnswConfig {
            m: 32,
            ef_construction: 200,
            ef_search: 100,
            max_elements: 1_000_000,
        }),
        quantization: Some(QuantizationConfig::Scalar),
    };

    let db = VectorDB::new(options)?;

    Ok(Self { db, encoder, k })
}
|
||||
|
||||
/// Index a single DNA sequence
|
||||
pub fn index_sequence(&self, id: &str, sequence: &[u8]) -> Result<()> {
|
||||
let vector = self.encoder.encode_sequence(sequence)?;
|
||||
|
||||
let entry = VectorEntry {
|
||||
id: Some(id.to_string()),
|
||||
vector,
|
||||
metadata: Some({
|
||||
let mut meta = HashMap::new();
|
||||
meta.insert("length".to_string(), serde_json::json!(sequence.len()));
|
||||
meta.insert("k".to_string(), serde_json::json!(self.k));
|
||||
meta
|
||||
}),
|
||||
};
|
||||
|
||||
self.db.insert(entry)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Index multiple sequences in a batch
|
||||
pub fn index_batch(&self, sequences: Vec<(&str, &[u8])>) -> Result<()> {
|
||||
let entries: Result<Vec<VectorEntry>> = sequences
|
||||
.into_iter()
|
||||
.map(|(id, seq)| {
|
||||
let vector = self.encoder.encode_sequence(seq)?;
|
||||
Ok(VectorEntry {
|
||||
id: Some(id.to_string()),
|
||||
vector,
|
||||
metadata: Some({
|
||||
let mut meta = HashMap::new();
|
||||
meta.insert("length".to_string(), serde_json::json!(seq.len()));
|
||||
meta.insert("k".to_string(), serde_json::json!(self.k));
|
||||
meta
|
||||
}),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
|
||||
self.db.insert_batch(entries?)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Search for similar sequences
|
||||
pub fn search_similar(&self, query: &[u8], top_k: usize) -> Result<Vec<KmerSearchResult>> {
|
||||
let query_vector = self.encoder.encode_sequence(query)?;
|
||||
|
||||
let search_query = SearchQuery {
|
||||
vector: query_vector,
|
||||
k: top_k,
|
||||
filter: None,
|
||||
ef_search: None,
|
||||
};
|
||||
|
||||
let results = self.db.search(search_query)?;
|
||||
|
||||
Ok(results
|
||||
.into_iter()
|
||||
.map(|r| KmerSearchResult {
|
||||
id: r.id,
|
||||
score: r.score,
|
||||
distance: r.score,
|
||||
})
|
||||
.collect())
|
||||
}
|
||||
|
||||
/// Search for sequences with similarity above a threshold
|
||||
pub fn search_with_threshold(
|
||||
&self,
|
||||
query: &[u8],
|
||||
threshold: f32,
|
||||
) -> Result<Vec<KmerSearchResult>> {
|
||||
// Search with a larger k to ensure we get all candidates
|
||||
let results = self.search_similar(query, 100)?;
|
||||
|
||||
Ok(results
|
||||
.into_iter()
|
||||
.filter(|r| r.distance <= threshold)
|
||||
.collect())
|
||||
}
|
||||
|
||||
/// Get the k-mer length
///
/// This is the `k` the index was created with.
pub fn k(&self) -> usize {
    self.k
}
|
||||
}
|
||||
|
||||
// Unit tests for the k-mer helpers, encoder and MinHash sketch.
#[cfg(test)]
mod tests {
    use super::*;

    // Upper- and lower-case bases map to their 2-bit codes; `N` is rejected.
    #[test]
    fn test_nucleotide_encoding() {
        assert_eq!(nucleotide_to_bits(b'A'), Some(0));
        assert_eq!(nucleotide_to_bits(b'C'), Some(1));
        assert_eq!(nucleotide_to_bits(b'G'), Some(2));
        assert_eq!(nucleotide_to_bits(b'T'), Some(3));
        assert_eq!(nucleotide_to_bits(b'a'), Some(0));
        assert_eq!(nucleotide_to_bits(b'N'), None);
    }

    // The sequence is reversed and every base complemented.
    #[test]
    fn test_reverse_complement() {
        let seq = b"ATCG";
        let rc = reverse_complement(seq);
        assert_eq!(rc, b"CGAT");
    }

    // A k-mer and its reverse complement share one canonical form.
    #[test]
    fn test_canonical_kmer() {
        let kmer1 = b"ATCG";
        let kmer2 = b"CGAT"; // reverse complement

        let canon1 = canonical_kmer(kmer1);
        let canon2 = canonical_kmer(kmer2);

        assert_eq!(canon1, canon2);
    }

    // For small k the vector has 4^k dimensions (4^3 = 64).
    #[test]
    fn test_kmer_encoder_creation() {
        let encoder = KmerEncoder::new(3).unwrap();
        assert_eq!(encoder.k, 3);
        assert_eq!(encoder.dimensions(), 64);
    }

    // For large k the dimension count is capped at 1024.
    #[test]
    fn test_kmer_encoder_large_k() {
        let encoder = KmerEncoder::new(21).unwrap();
        assert_eq!(encoder.k, 21);
        assert_eq!(encoder.dimensions(), 1024); // Capped by feature hashing
    }

    // The encoded vector has the encoder's width and unit L2 norm.
    #[test]
    fn test_encode_sequence() {
        let encoder = KmerEncoder::new(3).unwrap();
        let seq = b"ATCGATCG";
        let vector = encoder.encode_sequence(seq).unwrap();

        assert_eq!(vector.len(), encoder.dimensions());

        // Check L2 normalization
        let norm: f32 = vector.iter().map(|x| x * x).sum::<f32>().sqrt();
        assert!((norm - 1.0).abs() < 1e-5);
    }

    // A sketch never holds more values than requested.
    #[test]
    fn test_minhash_sketch() {
        let mut sketch = MinHashSketch::new(100);
        let seq = b"ATCGATCGATCGATCGATCG";

        sketch.sketch(seq, 5).unwrap();
        assert!(sketch.hashes().len() <= 100);
    }

    // Sketches of identical sequences are at (near) zero distance.
    #[test]
    fn test_jaccard_distance() {
        let mut sketch1 = MinHashSketch::new(100);
        let mut sketch2 = MinHashSketch::new(100);

        let seq1 = b"ATCGATCGATCGATCGATCG";
        let seq2 = b"ATCGATCGATCGATCGATCG"; // Identical

        sketch1.sketch(seq1, 5).unwrap();
        sketch2.sketch(seq2, 5).unwrap();

        let distance = sketch1.jaccard_distance(&sketch2);
        assert!(distance < 0.01); // Should be very similar
    }
}
|
||||
365
vendor/ruvector/examples/dna/src/kmer_pagerank.rs
vendored
Normal file
365
vendor/ruvector/examples/dna/src/kmer_pagerank.rs
vendored
Normal file
@@ -0,0 +1,365 @@
|
||||
//! K-mer Graph PageRank for DNA Sequence Ranking
|
||||
//!
|
||||
//! Builds a k-mer co-occurrence graph from DNA sequences and uses
|
||||
//! ruvector-solver's Forward Push Personalized PageRank (PPR) to rank
|
||||
//! sequences by structural centrality in the k-mer overlap network.
|
||||
//!
|
||||
//! This enables identifying the most "representative" sequences in a
|
||||
//! collection — those whose k-mer profiles are most connected to others.
|
||||
|
||||
use ruvector_solver::forward_push::ForwardPushSolver;
|
||||
use ruvector_solver::types::CsrMatrix;
|
||||
|
||||
/// Result of PageRank-based sequence ranking
#[derive(Debug, Clone)]
pub struct SequenceRank {
    /// Index of the sequence in the input collection
    pub index: usize,
    /// PageRank score (higher = more central).
    /// `KmerGraphRanker::rank_sequences` rescales scores so they sum to 1
    /// whenever the accumulated total is non-zero.
    pub score: f64,
}
|
||||
|
||||
/// K-mer graph builder and PageRank ranker.
///
/// Constructs a weighted graph where:
/// - Nodes are sequences
/// - Edge weight(i, j) = cosine similarity between the hashed k-mer
///   fingerprints of sequences i and j (pairs whose similarity does not
///   exceed the caller's threshold get no edge)
///
/// Then uses Forward Push PPR to compute centrality scores.
pub struct KmerGraphRanker {
    /// K-mer length used when fingerprinting sequences
    k: usize,
    /// Number of hash buckets in each fingerprint vector
    hash_dimensions: usize,
}
|
||||
|
||||
impl KmerGraphRanker {
|
||||
/// Create a new ranker with the given k-mer length.
///
/// No validation is performed here; both values are stored as given.
///
/// # Arguments
/// * `k` - K-mer length (typical: 11-31)
/// * `hash_dimensions` - Number of hash buckets for k-mer fingerprints (e.g. 256);
///   there is no built-in default, the caller must always supply it
pub fn new(k: usize, hash_dimensions: usize) -> Self {
    Self { k, hash_dimensions }
}
|
||||
|
||||
/// Build a k-mer fingerprint vector for a DNA sequence.
|
||||
///
|
||||
/// Uses FNV-1a hashing with canonical k-mers (min of forward/reverse-complement)
|
||||
/// to produce a fixed-size frequency vector.
|
||||
fn fingerprint(&self, seq: &[u8]) -> Vec<f64> {
|
||||
if seq.len() < self.k {
|
||||
return vec![0.0; self.hash_dimensions];
|
||||
}
|
||||
|
||||
let mut counts = vec![0u32; self.hash_dimensions];
|
||||
|
||||
for window in seq.windows(self.k) {
|
||||
let fwd = Self::fnv1a_hash(window);
|
||||
let rc = Self::fnv1a_hash_rc(window);
|
||||
let canonical = fwd.min(rc);
|
||||
counts[canonical % self.hash_dimensions] += 1;
|
||||
}
|
||||
|
||||
// Normalize to probability distribution
|
||||
let total: u32 = counts.iter().sum();
|
||||
if total == 0 {
|
||||
return vec![0.0; self.hash_dimensions];
|
||||
}
|
||||
let inv = 1.0 / total as f64;
|
||||
counts.iter().map(|&c| c as f64 * inv).collect()
|
||||
}
|
||||
|
||||
/// Cosine of the angle between two fingerprint vectors.
///
/// Returns `0.0` when either vector has (near-)zero length. The dot product
/// only covers the positions both slices have in common, while each norm is
/// taken over the full slice.
fn cosine_similarity(a: &[f64], b: &[f64]) -> f64 {
    // Euclidean length of a vector.
    fn l2(v: &[f64]) -> f64 {
        v.iter().map(|x| x * x).sum::<f64>().sqrt()
    }

    let (len_a, len_b) = (l2(a), l2(b));
    if len_a < 1e-15 || len_b < 1e-15 {
        return 0.0;
    }

    let inner: f64 = a.iter().zip(b).map(|(x, y)| x * y).sum();
    inner / (len_a * len_b)
}
|
||||
|
||||
/// Build the k-mer overlap graph as a column-stochastic transition matrix.
///
/// Edge weights are cosine similarities between k-mer fingerprints,
/// normalized to form a stochastic matrix (columns sum to 1).
///
/// # Arguments
/// * `sequences` - Sequences to fingerprint; one graph node per sequence
/// * `threshold` - Only pairs with similarity strictly greater than this get an edge
///
/// Nodes that end up with no incoming weight receive a self-loop of weight 1.
/// All pairs are compared, so the work grows quadratically with the number of
/// sequences.
fn build_transition_matrix(&self, sequences: &[&[u8]], threshold: f64) -> CsrMatrix<f64> {
    let n = sequences.len();
    let fingerprints: Vec<Vec<f64>> =
        sequences.iter().map(|seq| self.fingerprint(seq)).collect();

    // Build weighted adjacency with thresholding
    let mut col_sums = vec![0.0f64; n];
    // Triplets are (i, j, weight); presumably (row, column, value) as expected
    // by `CsrMatrix::from_coo` — TODO confirm against ruvector-solver.
    let mut entries: Vec<(usize, usize, f64)> = Vec::new();

    for i in 0..n {
        for j in 0..n {
            // No self-similarity edges; self-loops are only added for isolated nodes below.
            if i == j {
                continue;
            }
            let sim = Self::cosine_similarity(&fingerprints[i], &fingerprints[j]);
            if sim > threshold {
                entries.push((i, j, sim));
                col_sums[j] += sim;
            }
        }
    }

    // Normalize columns to make stochastic
    // Also add self-loops for isolated nodes
    let mut normalized: Vec<(usize, usize, f64)> = entries
        .into_iter()
        .map(|(i, j, w)| {
            // Fall back to 1.0 so a (near-)zero column sum never causes a division by zero.
            let norm = if col_sums[j] > 1e-15 {
                col_sums[j]
            } else {
                1.0
            };
            (i, j, w / norm)
        })
        .collect();

    // Add self-loops for isolated nodes (dangling node handling)
    for j in 0..n {
        if col_sums[j] < 1e-15 {
            normalized.push((j, j, 1.0));
        }
    }

    CsrMatrix::<f64>::from_coo(n, n, normalized)
}
|
||||
|
||||
/// Rank sequences by PageRank centrality in the k-mer overlap graph.
|
||||
///
|
||||
/// Uses ruvector-solver's Forward Push algorithm for sublinear-time
|
||||
/// Personalized PageRank computation.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `sequences` - Collection of DNA sequences (as byte slices)
|
||||
/// * `alpha` - Teleportation probability (default: 0.15)
|
||||
/// * `epsilon` - PPR approximation tolerance (default: 1e-6)
|
||||
/// * `similarity_threshold` - Minimum cosine similarity to create an edge (default: 0.1)
|
||||
///
|
||||
/// # Returns
|
||||
/// Sequences ranked by descending PageRank score
|
||||
pub fn rank_sequences(
|
||||
&self,
|
||||
sequences: &[&[u8]],
|
||||
alpha: f64,
|
||||
epsilon: f64,
|
||||
similarity_threshold: f64,
|
||||
) -> Vec<SequenceRank> {
|
||||
let n = sequences.len();
|
||||
if n == 0 {
|
||||
return vec![];
|
||||
}
|
||||
if n == 1 {
|
||||
return vec![SequenceRank {
|
||||
index: 0,
|
||||
score: 1.0,
|
||||
}];
|
||||
}
|
||||
|
||||
let matrix = self.build_transition_matrix(sequences, similarity_threshold);
|
||||
|
||||
// Use Forward Push PPR from each node, accumulate global PageRank
|
||||
let solver = ForwardPushSolver::new(alpha, epsilon);
|
||||
let mut global_rank = vec![0.0f64; n];
|
||||
|
||||
// Compute PPR from each node (or a representative subset for large graphs)
|
||||
let num_seeds = n.min(50); // Limit seeds for large collections
|
||||
let step = if n > num_seeds { n / num_seeds } else { 1 };
|
||||
|
||||
for seed_idx in (0..n).step_by(step) {
|
||||
match solver.ppr_from_source(&matrix, seed_idx) {
|
||||
Ok(ppr_result) => {
|
||||
for (node, score) in ppr_result {
|
||||
if node < n {
|
||||
global_rank[node] += score;
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(_) => {
|
||||
// If PPR fails for this seed, skip it
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Normalize
|
||||
let total: f64 = global_rank.iter().sum();
|
||||
if total > 1e-15 {
|
||||
let inv = 1.0 / total;
|
||||
for score in &mut global_rank {
|
||||
*score *= inv;
|
||||
}
|
||||
}
|
||||
|
||||
// Build ranked results
|
||||
let mut results: Vec<SequenceRank> = global_rank
|
||||
.into_iter()
|
||||
.enumerate()
|
||||
.map(|(index, score)| SequenceRank { index, score })
|
||||
.collect();
|
||||
|
||||
// Sort by score descending
|
||||
results.sort_by(|a, b| {
|
||||
b.score
|
||||
.partial_cmp(&a.score)
|
||||
.unwrap_or(std::cmp::Ordering::Equal)
|
||||
});
|
||||
|
||||
results
|
||||
}
|
||||
|
||||
/// Compute pairwise PageRank similarity between two specific sequences
|
||||
/// within the context of a collection.
|
||||
///
|
||||
/// Uses Forward Push PPR from the source sequence and returns the
|
||||
/// PPR score at the target sequence.
|
||||
pub fn pairwise_similarity(
|
||||
&self,
|
||||
sequences: &[&[u8]],
|
||||
source: usize,
|
||||
target: usize,
|
||||
alpha: f64,
|
||||
epsilon: f64,
|
||||
similarity_threshold: f64,
|
||||
) -> f64 {
|
||||
if source >= sequences.len() || target >= sequences.len() {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
let matrix = self.build_transition_matrix(sequences, similarity_threshold);
|
||||
let solver = ForwardPushSolver::new(alpha, epsilon);
|
||||
|
||||
match solver.ppr_from_source(&matrix, source) {
|
||||
Ok(ppr_result) => ppr_result
|
||||
.into_iter()
|
||||
.find(|(node, _)| *node == target)
|
||||
.map(|(_, score)| score)
|
||||
.unwrap_or(0.0),
|
||||
Err(_) => 0.0,
|
||||
}
|
||||
}
|
||||
|
||||
/// FNV-1a hash of a k-mer's forward strand.
///
/// Bases are normalized before hashing (upper-cased, `U` treated as `T`) so
/// this function uses the same alphabet as `fnv1a_hash_rc`. That makes the
/// forward hash of a k-mer equal to the reverse-complement hash of its reverse
/// complement, so `fwd.min(rc)` is strand-agnostic for lower-case and RNA
/// input too. Upper-case `A`/`C`/`G`/`T` input hashes as plain FNV-1a.
#[inline]
fn fnv1a_hash(data: &[u8]) -> usize {
    const FNV_OFFSET: u64 = 14695981039346656037;
    const FNV_PRIME: u64 = 1099511628211;
    let mut hash = FNV_OFFSET;
    for &byte in data {
        let base = match byte.to_ascii_uppercase() {
            b'U' => b'T',
            n => n,
        };
        hash ^= base as u64;
        hash = hash.wrapping_mul(FNV_PRIME);
    }
    hash as usize
}
|
||||
|
||||
/// FNV-1a hash of the reverse complement of `data`, without building it.
///
/// Walks the slice from the last base to the first, upper-cases and
/// complements each base (`A`<->`T`, `C`<->`G`, `U` -> `A`; other bytes are
/// only upper-cased) and mixes it into the running hash.
#[inline]
fn fnv1a_hash_rc(data: &[u8]) -> usize {
    const FNV_OFFSET: u64 = 14695981039346656037;
    const FNV_PRIME: u64 = 1099511628211;

    let mut digest = FNV_OFFSET;
    for position in (0..data.len()).rev() {
        let paired = match data[position].to_ascii_uppercase() {
            b'A' => b'T',
            b'C' => b'G',
            b'G' => b'C',
            b'T' | b'U' => b'A',
            other => other,
        };
        digest = (digest ^ u64::from(paired)).wrapping_mul(FNV_PRIME);
    }
    digest as usize
}
|
||||
}
|
||||
|
||||
// Unit tests for fingerprinting, cosine similarity and PageRank ranking.
#[cfg(test)]
mod tests {
    use super::*;

    // A fingerprint has one entry per hash bucket and sums to 1.
    #[test]
    fn test_fingerprint() {
        let ranker = KmerGraphRanker::new(3, 64);
        let seq = b"ATCGATCGATCG";
        let fp = ranker.fingerprint(seq);
        assert_eq!(fp.len(), 64);

        // Should be a probability distribution (sums to ~1)
        let sum: f64 = fp.iter().sum();
        assert!((sum - 1.0).abs() < 1e-10);
    }

    // Identical vectors have cosine similarity 1.
    #[test]
    fn test_cosine_similarity_identical() {
        let a = vec![1.0, 2.0, 3.0];
        let b = vec![1.0, 2.0, 3.0];
        let sim = KmerGraphRanker::cosine_similarity(&a, &b);
        assert!((sim - 1.0).abs() < 1e-10);
    }

    // Orthogonal vectors have cosine similarity 0.
    #[test]
    fn test_cosine_similarity_orthogonal() {
        let a = vec![1.0, 0.0];
        let b = vec![0.0, 1.0];
        let sim = KmerGraphRanker::cosine_similarity(&a, &b);
        assert!(sim.abs() < 1e-10);
    }

    // Ranking three sequences yields three normalized scores.
    #[test]
    fn test_rank_sequences_basic() {
        let ranker = KmerGraphRanker::new(3, 64);
        let seq1 = b"ATCGATCGATCGATCG";
        let seq2 = b"ATCGATCGATCGATCG"; // identical to seq1
        let seq3 = b"GCTAGCTAGCTAGCTA"; // different

        let sequences: Vec<&[u8]> = vec![seq1, seq2, seq3];
        let ranks = ranker.rank_sequences(&sequences, 0.15, 1e-4, 0.01);

        assert_eq!(ranks.len(), 3);

        // All ranks should sum to 1
        let total: f64 = ranks.iter().map(|r| r.score).sum();
        assert!((total - 1.0).abs() < 1e-5);

        // Identical sequences should have similar ranks
        let rank_0 = ranks.iter().find(|r| r.index == 0).unwrap().score;
        let rank_1 = ranks.iter().find(|r| r.index == 1).unwrap().score;
        assert!((rank_0 - rank_1).abs() < 0.3); // roughly similar
    }

    // An empty collection produces no ranks.
    #[test]
    fn test_rank_empty() {
        let ranker = KmerGraphRanker::new(3, 64);
        let sequences: Vec<&[u8]> = vec![];
        let ranks = ranker.rank_sequences(&sequences, 0.15, 1e-4, 0.1);
        assert!(ranks.is_empty());
    }

    // A single sequence gets the full score of 1.
    #[test]
    fn test_rank_single() {
        let ranker = KmerGraphRanker::new(3, 64);
        let sequences: Vec<&[u8]> = vec![b"ATCGATCG"];
        let ranks = ranker.rank_sequences(&sequences, 0.15, 1e-4, 0.1);
        assert_eq!(ranks.len(), 1);
        assert!((ranks[0].score - 1.0).abs() < 1e-10);
    }

    // PPR from a sequence scores its identical twin at least as high as an unrelated one.
    #[test]
    fn test_pairwise_similarity() {
        let ranker = KmerGraphRanker::new(3, 64);
        let seq1 = b"ATCGATCGATCGATCG";
        let seq2 = b"ATCGATCGATCGATCG";
        let seq3 = b"NNNNNNNNNNNNNNNN"; // very different

        let sequences: Vec<&[u8]> = vec![seq1, seq2, seq3];

        let sim_01 = ranker.pairwise_similarity(&sequences, 0, 1, 0.15, 1e-4, 0.01);
        let sim_02 = ranker.pairwise_similarity(&sequences, 0, 2, 0.15, 1e-4, 0.01);

        // Identical sequences should have higher similarity
        assert!(sim_01 >= sim_02);
    }
}
|
||||
84
vendor/ruvector/examples/dna/src/lib.rs
vendored
Normal file
84
vendor/ruvector/examples/dna/src/lib.rs
vendored
Normal file
@@ -0,0 +1,84 @@
|
||||
//! # rvDNA — AI-Native Genomic Analysis
|
||||
//!
|
||||
//! Fast, accurate genomic analysis in pure Rust with WASM support.
|
||||
//! Includes the `.rvdna` binary file format for storing pre-computed
|
||||
//! AI features alongside raw DNA sequences.
|
||||
//!
|
||||
//! - **K-mer HNSW Indexing**: Sequence similarity search via vector embeddings
|
||||
//! - **Smith-Waterman Alignment**: Local alignment with CIGAR and mapping quality
|
||||
//! - **Bayesian Variant Calling**: SNP/indel detection with Phred quality scores
|
||||
//! - **Protein Translation**: DNA-to-protein with GNN contact graph prediction
|
||||
//! - **Epigenomics**: Methylation profiling and Horvath biological age clock
|
||||
//! - **Pharmacogenomics**: CYP enzyme star allele calling and drug recommendations
|
||||
//! - **Pipeline Orchestration**: DAG-based multi-stage execution
|
||||
//! - **RVDNA Format**: AI-native binary file format with pre-computed tensors
|
||||
|
||||
#![warn(missing_docs)]
|
||||
#![allow(clippy::all)]
|
||||
|
||||
pub mod alignment;
|
||||
pub mod biomarker;
|
||||
pub mod biomarker_stream;
|
||||
pub mod epigenomics;
|
||||
pub mod error;
|
||||
pub mod genotyping;
|
||||
pub mod health;
|
||||
pub mod kmer;
|
||||
pub mod kmer_pagerank;
|
||||
pub mod pharma;
|
||||
pub mod pipeline;
|
||||
pub mod protein;
|
||||
pub mod real_data;
|
||||
pub mod rvdna;
|
||||
pub mod types;
|
||||
pub mod variant;
|
||||
|
||||
pub use alignment::{AlignmentConfig, SmithWaterman};
|
||||
pub use epigenomics::{
|
||||
CancerSignalDetector, CancerSignalResult, CpGSite, HorvathClock, MethylationProfile,
|
||||
};
|
||||
pub use error::{DnaError, Result};
|
||||
pub use pharma::{
|
||||
call_cyp2c19_allele, call_star_allele, get_recommendations, predict_cyp2c19_phenotype,
|
||||
predict_phenotype, Cyp2c19Allele, DrugRecommendation, MetabolizerPhenotype, PharmaVariant,
|
||||
StarAllele,
|
||||
};
|
||||
pub use protein::{isoelectric_point, molecular_weight, translate_dna, AminoAcid};
|
||||
pub use rvdna::{
|
||||
decode_2bit, encode_2bit, fasta_to_rvdna, Codec, KmerVectorBlock, RvdnaHeader, RvdnaReader,
|
||||
RvdnaStats, RvdnaWriter, SparseAttention, VariantTensor,
|
||||
};
|
||||
pub use types::{
|
||||
AlignmentResult, AnalysisConfig, CigarOp, ContactGraph, DnaSequence, GenomicPosition,
|
||||
KmerIndex, Nucleotide, ProteinResidue, ProteinSequence, QualityScore, Variant,
|
||||
};
|
||||
pub use variant::{
|
||||
FilterStatus, Genotype, PileupColumn, VariantCall, VariantCaller, VariantCallerConfig,
|
||||
};
|
||||
|
||||
pub use ruvector_core::{
|
||||
types::{DbOptions, DistanceMetric, HnswConfig, SearchQuery, SearchResult, VectorEntry},
|
||||
VectorDB,
|
||||
};
|
||||
|
||||
pub use biomarker::{BiomarkerClassification, BiomarkerProfile, BiomarkerReference, CategoryScore};
|
||||
pub use biomarker_stream::{
|
||||
BiomarkerReading, RingBuffer, StreamConfig, StreamProcessor, StreamStats,
|
||||
};
|
||||
pub use genotyping::{
|
||||
CallConfidence, CypDiplotype, GenomeBuild, GenotypeAnalysis, GenotypeData, Snp,
|
||||
};
|
||||
pub use health::{ApoeResult, HealthVariantResult, MthfrResult, PainProfile};
|
||||
pub use kmer_pagerank::{KmerGraphRanker, SequenceRank};
|
||||
|
||||
/// Prelude module for common imports
///
/// Glob re-exports the public items of the alignment, epigenomics, kmer,
/// pharma, protein, types and variant modules, plus the crate's error types.
///
/// NOTE(review): both `crate::kmer` and `crate::types` appear to expose a
/// `KmerIndex`; if these are distinct types, the name is ambiguous when used
/// through this prelude — confirm and consider an explicit re-export.
pub mod prelude {
    pub use crate::alignment::*;
    pub use crate::epigenomics::*;
    pub use crate::error::{DnaError, Result};
    pub use crate::kmer::*;
    pub use crate::pharma::*;
    pub use crate::protein::*;
    pub use crate::types::*;
    pub use crate::variant::*;
}
|
||||
427
vendor/ruvector/examples/dna/src/main.rs
vendored
Normal file
427
vendor/ruvector/examples/dna/src/main.rs
vendored
Normal file
@@ -0,0 +1,427 @@
|
||||
//! DNA Analyzer Demo - RuVector Genomic Analysis Pipeline
|
||||
//!
|
||||
//! Demonstrates SOTA genomic analysis using:
|
||||
//! - Real human gene sequences (HBB, TP53, BRCA1, CYP2D6, INS)
|
||||
//! - HNSW k-mer indexing for fast sequence search
|
||||
//! - Attention-based sequence alignment
|
||||
//! - Variant calling from pileup data
|
||||
//! - Protein translation and contact prediction
|
||||
//! - Epigenetic age prediction (Horvath clock)
|
||||
//! - Pharmacogenomic star allele calling
|
||||
//! - RVDNA AI-native file format with pre-computed tensors
|
||||
|
||||
use ::rvdna::prelude::*;
|
||||
use ::rvdna::{
|
||||
alignment::{AlignmentConfig, SmithWaterman},
|
||||
epigenomics::{HorvathClock, MethylationProfile},
|
||||
genotyping, pharma,
|
||||
protein::translate_dna,
|
||||
real_data,
|
||||
rvdna::{
|
||||
self, Codec, KmerVectorBlock, RvdnaReader, RvdnaWriter, SparseAttention, VariantTensor,
|
||||
},
|
||||
variant::{PileupColumn, VariantCaller, VariantCallerConfig},
|
||||
};
|
||||
use rand::Rng;
|
||||
use tracing::{info, Level};
|
||||
use tracing_subscriber::FmtSubscriber;
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
// Check for 23andMe file argument
|
||||
let args: Vec<String> = std::env::args().collect();
|
||||
if args.len() > 1 {
|
||||
return run_23andme(&args[1]);
|
||||
}
|
||||
|
||||
let subscriber = FmtSubscriber::builder()
|
||||
.with_max_level(Level::INFO)
|
||||
.finish();
|
||||
tracing::subscriber::set_global_default(subscriber)?;
|
||||
|
||||
info!("RuVector DNA Analyzer - Genomic Analysis Pipeline");
|
||||
info!("================================================");
|
||||
info!("Using real human gene sequences from NCBI RefSeq");
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// Stage 1: Load real human gene sequences
|
||||
// -----------------------------------------------------------------------
|
||||
info!("\nStage 1: Loading real human gene sequences");
|
||||
let total_start = std::time::Instant::now();
|
||||
|
||||
let hbb = DnaSequence::from_str(real_data::HBB_CODING_SEQUENCE)?;
|
||||
let tp53 = DnaSequence::from_str(real_data::TP53_EXONS_5_8)?;
|
||||
let brca1 = DnaSequence::from_str(real_data::BRCA1_EXON11_FRAGMENT)?;
|
||||
let cyp2d6 = DnaSequence::from_str(real_data::CYP2D6_CODING)?;
|
||||
let insulin = DnaSequence::from_str(real_data::INS_CODING)?;
|
||||
|
||||
info!(
|
||||
" HBB (hemoglobin beta): {} bp [chr11, sickle cell gene]",
|
||||
hbb.len()
|
||||
);
|
||||
info!(
|
||||
" TP53 (tumor suppressor): {} bp [chr17, exons 5-8]",
|
||||
tp53.len()
|
||||
);
|
||||
info!(
|
||||
" BRCA1 (DNA repair): {} bp [chr17, exon 11 fragment]",
|
||||
brca1.len()
|
||||
);
|
||||
info!(
|
||||
" CYP2D6 (drug metabolism): {} bp [chr22, pharmacogenomic]",
|
||||
cyp2d6.len()
|
||||
);
|
||||
info!(
|
||||
" INS (insulin): {} bp [chr11, preproinsulin]",
|
||||
insulin.len()
|
||||
);
|
||||
|
||||
let gc_hbb = calculate_gc_content(&hbb);
|
||||
let gc_tp53 = calculate_gc_content(&tp53);
|
||||
info!(" HBB GC content: {:.1}%", gc_hbb * 100.0);
|
||||
info!(" TP53 GC content: {:.1}%", gc_tp53 * 100.0);
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// Stage 2: K-mer similarity search across gene panel
|
||||
// -----------------------------------------------------------------------
|
||||
info!("\nStage 2: K-mer similarity search across gene panel");
|
||||
let kmer_start = std::time::Instant::now();
|
||||
|
||||
let hbb_vec = hbb.to_kmer_vector(11, 512)?;
|
||||
let tp53_vec = tp53.to_kmer_vector(11, 512)?;
|
||||
let brca1_vec = brca1.to_kmer_vector(11, 512)?;
|
||||
let cyp2d6_vec = cyp2d6.to_kmer_vector(11, 512)?;
|
||||
let ins_vec = insulin.to_kmer_vector(11, 512)?;
|
||||
|
||||
let sim_hbb_tp53 = cosine_similarity(&hbb_vec, &tp53_vec);
|
||||
let sim_hbb_brca1 = cosine_similarity(&hbb_vec, &brca1_vec);
|
||||
let sim_tp53_brca1 = cosine_similarity(&tp53_vec, &brca1_vec);
|
||||
let sim_hbb_cyp2d6 = cosine_similarity(&hbb_vec, &cyp2d6_vec);
|
||||
|
||||
info!(" K-mer similarity matrix (cosine, k=11, d=512):");
|
||||
info!(" HBB vs TP53: {:.4}", sim_hbb_tp53);
|
||||
info!(" HBB vs BRCA1: {:.4}", sim_hbb_brca1);
|
||||
info!(" TP53 vs BRCA1: {:.4}", sim_tp53_brca1);
|
||||
info!(" HBB vs CYP2D6:{:.4}", sim_hbb_cyp2d6);
|
||||
info!(" K-mer encoding time: {:?}", kmer_start.elapsed());
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// Stage 3: Align HBB query fragment against full HBB
|
||||
// -----------------------------------------------------------------------
|
||||
info!("\nStage 3: Smith-Waterman alignment on HBB");
|
||||
let align_start = std::time::Instant::now();
|
||||
|
||||
// Extract a 50bp fragment from the middle of HBB (simulating a sequencing read)
|
||||
let hbb_str = hbb.to_string();
|
||||
let fragment_start = 100;
|
||||
let fragment_end = (fragment_start + 50).min(hbb_str.len());
|
||||
let query_fragment = DnaSequence::from_str(&hbb_str[fragment_start..fragment_end])?;
|
||||
|
||||
let aligner = SmithWaterman::new(AlignmentConfig::default());
|
||||
let alignment = aligner.align(&query_fragment, &hbb)?;
|
||||
|
||||
info!(
|
||||
" Query: HBB[{}..{}] ({} bp read)",
|
||||
fragment_start,
|
||||
fragment_end,
|
||||
query_fragment.len()
|
||||
);
|
||||
info!(" Alignment score: {}", alignment.score);
|
||||
info!(
|
||||
" Mapped position: {} (expected: {})",
|
||||
alignment.mapped_position.position, fragment_start
|
||||
);
|
||||
info!(" Mapping quality: {}", alignment.mapping_quality.value());
|
||||
info!(" CIGAR: {} ops", alignment.cigar.len());
|
||||
info!(" Alignment time: {:?}", align_start.elapsed());
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// Stage 4: Variant calling on HBB (sickle cell region)
|
||||
// -----------------------------------------------------------------------
|
||||
info!("\nStage 4: Variant calling on HBB (sickle cell detection)");
|
||||
let variant_start = std::time::Instant::now();
|
||||
|
||||
let caller = VariantCaller::new(VariantCallerConfig::default());
|
||||
let hbb_bytes = hbb_str.as_bytes();
|
||||
let mut variant_count = 0;
|
||||
let mut rng = rand::thread_rng();
|
||||
|
||||
// Simulate sequencing reads across HBB with a sickle cell mutation at position 20
|
||||
let sickle_pos = real_data::hbb_variants::SICKLE_CELL_POS;
|
||||
for i in 0..hbb_bytes.len().min(200) {
|
||||
let depth = rng.gen_range(20..51);
|
||||
let bases: Vec<u8> = (0..depth)
|
||||
.map(|_| {
|
||||
if i == sickle_pos && rng.gen::<f32>() < 0.5 {
|
||||
b'T' // Simulate heterozygous sickle cell (A→T at codon 6)
|
||||
} else if rng.gen::<f32>() < 0.98 {
|
||||
hbb_bytes[i]
|
||||
} else {
|
||||
[b'A', b'C', b'G', b'T'][rng.gen_range(0..4)]
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
let qualities: Vec<u8> = (0..depth).map(|_| rng.gen_range(25..41)).collect();
|
||||
|
||||
let pileup = PileupColumn {
|
||||
bases,
|
||||
qualities,
|
||||
position: i as u64,
|
||||
chromosome: 11,
|
||||
};
|
||||
|
||||
if let Some(call) = caller.call_snp(&pileup, hbb_bytes[i]) {
|
||||
variant_count += 1;
|
||||
if i == sickle_pos {
|
||||
info!(
|
||||
" ** Sickle cell variant at pos {}: ref={} alt={} depth={} qual={}",
|
||||
i, call.ref_allele as char, call.alt_allele as char, call.depth, call.quality
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
info!(" Positions analyzed: {}", hbb_bytes.len().min(200));
|
||||
info!(" Total variants detected: {}", variant_count);
|
||||
info!(" Variant calling time: {:?}", variant_start.elapsed());
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// Stage 5: Translate HBB → hemoglobin beta protein
|
||||
// -----------------------------------------------------------------------
|
||||
info!("\nStage 5: Protein translation - HBB to Hemoglobin Beta");
|
||||
let protein_start = std::time::Instant::now();
|
||||
|
||||
let amino_acids = translate_dna(hbb_bytes);
|
||||
let protein_str: String = amino_acids.iter().map(|aa| aa.to_char()).collect();
|
||||
|
||||
info!(" Protein length: {} amino acids", amino_acids.len());
|
||||
info!(
|
||||
" First 20 aa: {}",
|
||||
if protein_str.len() > 20 {
|
||||
&protein_str[..20]
|
||||
} else {
|
||||
&protein_str
|
||||
}
|
||||
);
|
||||
info!(" Expected: MVHLTPEEKSAVTALWGKVN (hemoglobin beta N-terminus)");
|
||||
|
||||
// Build contact graph for the hemoglobin protein
|
||||
if amino_acids.len() >= 10 {
|
||||
let residues: Vec<ProteinResidue> = amino_acids
|
||||
.iter()
|
||||
.map(|aa| match aa.to_char() {
|
||||
'A' => ProteinResidue::A,
|
||||
'R' => ProteinResidue::R,
|
||||
'N' => ProteinResidue::N,
|
||||
'D' => ProteinResidue::D,
|
||||
'C' => ProteinResidue::C,
|
||||
'E' => ProteinResidue::E,
|
||||
'Q' => ProteinResidue::Q,
|
||||
'G' => ProteinResidue::G,
|
||||
'H' => ProteinResidue::H,
|
||||
'I' => ProteinResidue::I,
|
||||
'L' => ProteinResidue::L,
|
||||
'K' => ProteinResidue::K,
|
||||
'M' => ProteinResidue::M,
|
||||
'F' => ProteinResidue::F,
|
||||
'P' => ProteinResidue::P,
|
||||
'S' => ProteinResidue::S,
|
||||
'T' => ProteinResidue::T,
|
||||
'W' => ProteinResidue::W,
|
||||
'Y' => ProteinResidue::Y,
|
||||
'V' => ProteinResidue::V,
|
||||
_ => ProteinResidue::X,
|
||||
})
|
||||
.collect();
|
||||
let protein_seq = ProteinSequence::new(residues);
|
||||
let graph = protein_seq.build_contact_graph(8.0)?;
|
||||
let contacts = protein_seq.predict_contacts(&graph)?;
|
||||
|
||||
info!(" Contact graph: {} edges", graph.edges.len());
|
||||
info!(" Top 3 predicted contacts:");
|
||||
for (i, (r1, r2, score)) in contacts.iter().take(3).enumerate() {
|
||||
info!(
|
||||
" {}. Residues {} <-> {} (score: {:.3})",
|
||||
i + 1,
|
||||
r1,
|
||||
r2,
|
||||
score
|
||||
);
|
||||
}
|
||||
}
|
||||
info!(" Protein analysis time: {:?}", protein_start.elapsed());
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// Stage 6: Epigenetic age prediction
|
||||
// -----------------------------------------------------------------------
|
||||
info!("\nStage 6: Epigenetic age prediction (Horvath clock)");
|
||||
let epi_start = std::time::Instant::now();
|
||||
|
||||
let positions: Vec<(u8, u64)> = (0..500).map(|i| (1, i * 1000)).collect();
|
||||
let betas: Vec<f32> = (0..500).map(|_| rng.gen_range(0.1..0.9)).collect();
|
||||
|
||||
let profile = MethylationProfile::from_beta_values(positions, betas);
|
||||
let clock = HorvathClock::default_clock();
|
||||
let predicted_age = clock.predict_age(&profile);
|
||||
|
||||
info!(" CpG sites analyzed: {}", profile.sites.len());
|
||||
info!(" Mean methylation: {:.3}", profile.mean_methylation());
|
||||
info!(" Predicted biological age: {:.1} years", predicted_age);
|
||||
info!(" Epigenomics time: {:?}", epi_start.elapsed());
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// Stage 7: Pharmacogenomics (CYP2D6 from real sequence)
|
||||
// -----------------------------------------------------------------------
|
||||
info!("\nStage 7: Pharmacogenomic analysis (CYP2D6)");
|
||||
|
||||
let cyp2d6_variants = vec![(42130692, b'G', b'A')]; // *4 defining variant
|
||||
let allele1 = pharma::call_star_allele(&cyp2d6_variants);
|
||||
let allele2 = pharma::StarAllele::Star10; // *10: common in East Asian populations
|
||||
let phenotype = pharma::predict_phenotype(&allele1, &allele2);
|
||||
|
||||
info!(" CYP2D6 sequence: {} bp analyzed", cyp2d6.len());
|
||||
info!(
|
||||
" Allele 1: {:?} (activity: {:.1})",
|
||||
allele1,
|
||||
allele1.activity_score()
|
||||
);
|
||||
info!(
|
||||
" Allele 2: {:?} (activity: {:.1})",
|
||||
allele2,
|
||||
allele2.activity_score()
|
||||
);
|
||||
info!(" Metabolizer phenotype: {:?}", phenotype);
|
||||
|
||||
let recommendations = pharma::get_recommendations("CYP2D6", &phenotype);
|
||||
for rec in &recommendations {
|
||||
info!(
|
||||
" - {}: {} (dose: {:.1}x)",
|
||||
rec.drug, rec.recommendation, rec.dose_factor
|
||||
);
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// Stage 8: RVDNA AI-Native Format Demo
|
||||
// -----------------------------------------------------------------------
|
||||
info!("\nStage 8: RVDNA AI-Native File Format");
|
||||
let rvdna_start = std::time::Instant::now();
|
||||
|
||||
// Convert HBB to RVDNA format with pre-computed k-mer vectors
|
||||
let rvdna_bytes = rvdna::fasta_to_rvdna(real_data::HBB_CODING_SEQUENCE, 11, 512, 500)?;
|
||||
|
||||
info!(" FASTA → RVDNA conversion:");
|
||||
info!(" Input: {} bases (ASCII, 1 byte/base)", hbb.len());
|
||||
info!(" Output: {} bytes (RVDNA binary)", rvdna_bytes.len());
|
||||
info!(
|
||||
" Ratio: {:.2}x compression (sequence section)",
|
||||
hbb.len() as f64 / rvdna_bytes.len() as f64
|
||||
);
|
||||
|
||||
// Read back and validate
|
||||
let reader = RvdnaReader::from_bytes(rvdna_bytes)?;
|
||||
let restored = reader.read_sequence()?;
|
||||
assert_eq!(restored.to_string(), hbb.to_string(), "Lossless roundtrip");
|
||||
|
||||
let kmer_blocks = reader.read_kmer_vectors()?;
|
||||
let stats = reader.stats();
|
||||
|
||||
info!(" RVDNA file stats:");
|
||||
info!(" Format version: {}", reader.header.version);
|
||||
info!(
|
||||
" Sequence section: {} bytes ({:.1} bits/base)",
|
||||
stats.section_sizes[0], stats.bits_per_base
|
||||
);
|
||||
info!(
|
||||
" K-mer vectors: {} blocks pre-computed",
|
||||
kmer_blocks.len()
|
||||
);
|
||||
|
||||
if !kmer_blocks.is_empty() {
|
||||
info!(
|
||||
" Vector dims: {}, k={}",
|
||||
kmer_blocks[0].dimensions, kmer_blocks[0].k
|
||||
);
|
||||
// Demonstrate instant similarity search from pre-computed vectors
|
||||
let tp53_query = tp53.to_kmer_vector(11, 512)?;
|
||||
let sim = kmer_blocks[0].cosine_similarity(&tp53_query);
|
||||
info!(
|
||||
" Instant HBB vs TP53 similarity: {:.4} (from pre-indexed)",
|
||||
sim
|
||||
);
|
||||
}
|
||||
|
||||
info!(" RVDNA format time: {:?}", rvdna_start.elapsed());
|
||||
|
||||
// Compare format sizes
|
||||
info!("\n Format Comparison (HBB gene, {} bp):", hbb.len());
|
||||
info!(" FASTA (ASCII): {} bytes (8 bits/base)", hbb.len());
|
||||
info!(
|
||||
" RVDNA (2-bit): {} bytes (seq section)",
|
||||
stats.section_sizes[0]
|
||||
);
|
||||
info!(
|
||||
" RVDNA (total): {} bytes (seq + k-mer vectors + metadata)",
|
||||
stats.total_size
|
||||
);
|
||||
info!(" Pre-computed: k-mer vectors, ready for HNSW search");
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// Summary
|
||||
// -----------------------------------------------------------------------
|
||||
let total_time = total_start.elapsed();
|
||||
info!("\nPipeline Summary");
|
||||
info!("==================");
|
||||
info!(" Genes analyzed: 5 (HBB, TP53, BRCA1, CYP2D6, INS)");
|
||||
info!(
|
||||
" Total bases: {} bp",
|
||||
hbb.len() + tp53.len() + brca1.len() + cyp2d6.len() + insulin.len()
|
||||
);
|
||||
info!(
|
||||
" Variants called: {} (in HBB sickle cell region)",
|
||||
variant_count
|
||||
);
|
||||
info!(" Hemoglobin protein: {} amino acids", amino_acids.len());
|
||||
info!(" Predicted age: {:.1} years", predicted_age);
|
||||
info!(" CYP2D6 phenotype: {:?}", phenotype);
|
||||
info!(
|
||||
" RVDNA format: {} bytes ({} sections)",
|
||||
stats.total_size,
|
||||
stats.section_sizes.iter().filter(|&&s| s > 0).count()
|
||||
);
|
||||
info!(" Total pipeline time: {:?}", total_time);
|
||||
|
||||
info!("\nAnalysis complete!");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Cosine of the angle between two vectors.
///
/// The dot product is accumulated over the overlapping prefix of `a` and `b`
/// (`zip` stops at the shorter slice), while each norm is taken over the
/// whole slice. When either norm is zero the function returns `0.0` instead
/// of dividing by zero.
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    // Euclidean norm of a slice.
    let norm = |v: &[f32]| v.iter().map(|c| c * c).sum::<f32>().sqrt();

    let (norm_a, norm_b) = (norm(a), norm(b));
    if norm_a == 0.0 || norm_b == 0.0 {
        return 0.0;
    }

    let inner: f32 = a.iter().zip(b).map(|(lhs, rhs)| lhs * rhs).sum();
    inner / (norm_a * norm_b)
}
|
||||
|
||||
/// Calculate GC content of DNA sequence
|
||||
fn calculate_gc_content(sequence: &DnaSequence) -> f64 {
|
||||
let gc_count = sequence
|
||||
.bases()
|
||||
.iter()
|
||||
.filter(|&&b| b == Nucleotide::G || b == Nucleotide::C)
|
||||
.count();
|
||||
gc_count as f64 / sequence.len() as f64
|
||||
}
|
||||
|
||||
/// Run 23andMe genotyping analysis pipeline
|
||||
fn run_23andme(path: &str) -> anyhow::Result<()> {
|
||||
let file =
|
||||
std::fs::File::open(path).map_err(|e| anyhow::anyhow!("Cannot open {}: {}", path, e))?;
|
||||
let analysis =
|
||||
genotyping::analyze(file).map_err(|e| anyhow::anyhow!("Analysis failed: {}", e))?;
|
||||
print!("{}", genotyping::format_report(&analysis));
|
||||
Ok(())
|
||||
}
|
||||
417
vendor/ruvector/examples/dna/src/pharma.rs
vendored
Normal file
417
vendor/ruvector/examples/dna/src/pharma.rs
vendored
Normal file
@@ -0,0 +1,417 @@
|
||||
//! Pharmacogenomics module
|
||||
//!
|
||||
//! Provides CYP enzyme star allele calling and metabolizer phenotype
|
||||
//! prediction for pharmacogenomic analysis.
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// CYP2D6 star allele classification
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub enum StarAllele {
|
||||
/// *1 - Normal function (wild-type)
|
||||
Star1,
|
||||
/// *2 - Normal function
|
||||
Star2,
|
||||
/// *3 - No function (frameshift)
|
||||
Star3,
|
||||
/// *4 - No function (splicing defect)
|
||||
Star4,
|
||||
/// *5 - No function (gene deletion)
|
||||
Star5,
|
||||
/// *6 - No function (frameshift)
|
||||
Star6,
|
||||
/// *10 - Decreased function
|
||||
Star10,
|
||||
/// *17 - Decreased function
|
||||
Star17,
|
||||
/// *41 - Decreased function
|
||||
Star41,
|
||||
/// Unknown allele
|
||||
Unknown,
|
||||
}
|
||||
|
||||
impl StarAllele {
|
||||
/// Get the activity score for this allele
|
||||
pub fn activity_score(&self) -> f64 {
|
||||
match self {
|
||||
StarAllele::Star1 | StarAllele::Star2 => 1.0,
|
||||
StarAllele::Star10 | StarAllele::Star17 | StarAllele::Star41 => 0.5,
|
||||
StarAllele::Star3 | StarAllele::Star4 | StarAllele::Star5 | StarAllele::Star6 => 0.0,
|
||||
StarAllele::Unknown => 0.5,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Drug metabolizer phenotype
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub enum MetabolizerPhenotype {
|
||||
/// Ultra-rapid metabolizer (activity score > 2.0)
|
||||
UltraRapid,
|
||||
/// Normal metabolizer (1.0 <= activity score <= 2.0)
|
||||
Normal,
|
||||
/// Intermediate metabolizer (0.5 <= activity score < 1.0)
|
||||
Intermediate,
|
||||
/// Poor metabolizer (activity score < 0.5)
|
||||
Poor,
|
||||
}
|
||||
|
||||
/// Pharmacogenomic variant for a specific gene
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct PharmaVariant {
|
||||
/// Gene name (e.g., "CYP2D6")
|
||||
pub gene: String,
|
||||
/// Genomic position
|
||||
pub position: u64,
|
||||
/// Reference allele
|
||||
pub ref_allele: u8,
|
||||
/// Alternate allele
|
||||
pub alt_allele: u8,
|
||||
/// Clinical significance
|
||||
pub significance: String,
|
||||
}
|
||||
|
||||
/// CYP2C19 star allele classification
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub enum Cyp2c19Allele {
|
||||
/// *1 - Normal function (wild-type)
|
||||
Star1,
|
||||
/// *2 - No function (rs4244285, c.681G>A, splicing defect)
|
||||
Star2,
|
||||
/// *3 - No function (rs4986893, c.636G>A, premature stop)
|
||||
Star3,
|
||||
/// *17 - Increased function (rs12248560, c.-806C>T)
|
||||
Star17,
|
||||
/// Unknown allele
|
||||
Unknown,
|
||||
}
|
||||
|
||||
impl Cyp2c19Allele {
|
||||
/// Get the activity score for this allele (CPIC guidelines)
|
||||
pub fn activity_score(&self) -> f64 {
|
||||
match self {
|
||||
Cyp2c19Allele::Star1 => 1.0,
|
||||
Cyp2c19Allele::Star17 => 1.5, // Increased function
|
||||
Cyp2c19Allele::Star2 | Cyp2c19Allele::Star3 => 0.0,
|
||||
Cyp2c19Allele::Unknown => 0.5,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Call CYP2C19 star allele from observed variants
|
||||
pub fn call_cyp2c19_allele(variants: &[(u64, u8, u8)]) -> Cyp2c19Allele {
|
||||
for &(pos, ref_allele, alt_allele) in variants {
|
||||
match (pos, ref_allele, alt_allele) {
|
||||
// *2: G>A at rs4244285 (c.681G>A, splicing defect)
|
||||
(96541616, b'G', b'A') => return Cyp2c19Allele::Star2,
|
||||
// *3: G>A at rs4986893 (c.636G>A, premature stop codon)
|
||||
(96540410, b'G', b'A') => return Cyp2c19Allele::Star3,
|
||||
// *17: C>T at rs12248560 (c.-806C>T, increased expression)
|
||||
(96522463, b'C', b'T') => return Cyp2c19Allele::Star17,
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
Cyp2c19Allele::Star1
|
||||
}
|
||||
|
||||
/// Predict CYP2C19 metabolizer phenotype from diplotype
|
||||
pub fn predict_cyp2c19_phenotype(
|
||||
allele1: &Cyp2c19Allele,
|
||||
allele2: &Cyp2c19Allele,
|
||||
) -> MetabolizerPhenotype {
|
||||
let total_activity = allele1.activity_score() + allele2.activity_score();
|
||||
if total_activity > 2.0 {
|
||||
MetabolizerPhenotype::UltraRapid
|
||||
} else if total_activity >= 1.0 {
|
||||
MetabolizerPhenotype::Normal
|
||||
} else if total_activity >= 0.5 {
|
||||
MetabolizerPhenotype::Intermediate
|
||||
} else {
|
||||
MetabolizerPhenotype::Poor
|
||||
}
|
||||
}
|
||||
|
||||
/// Call CYP2D6 star allele from observed variants
|
||||
///
|
||||
/// Uses a simplified lookup table based on key defining variants.
|
||||
pub fn call_star_allele(variants: &[(u64, u8, u8)]) -> StarAllele {
|
||||
for &(pos, ref_allele, alt_allele) in variants {
|
||||
match (pos, ref_allele, alt_allele) {
|
||||
// *4: G>A at intron 3/exon 4 boundary (rs3892097)
|
||||
(42130692, b'G', b'A') => return StarAllele::Star4,
|
||||
// *5: whole gene deletion
|
||||
(42126611, b'T', b'-') => return StarAllele::Star5,
|
||||
// *3: frameshift (A deletion at rs35742686)
|
||||
(42127941, b'A', b'-') => return StarAllele::Star3,
|
||||
// *6: T deletion at rs5030655
|
||||
(42127803, b'T', b'-') => return StarAllele::Star6,
|
||||
// *10: C>T at rs1065852
|
||||
(42126938, b'C', b'T') => return StarAllele::Star10,
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
StarAllele::Star1 // Wild-type
|
||||
}
|
||||
|
||||
/// Predict metabolizer phenotype from diplotype (two alleles)
|
||||
pub fn predict_phenotype(allele1: &StarAllele, allele2: &StarAllele) -> MetabolizerPhenotype {
|
||||
let total_activity = allele1.activity_score() + allele2.activity_score();
|
||||
|
||||
if total_activity > 2.0 {
|
||||
MetabolizerPhenotype::UltraRapid
|
||||
} else if total_activity >= 1.0 {
|
||||
MetabolizerPhenotype::Normal
|
||||
} else if total_activity >= 0.5 {
|
||||
MetabolizerPhenotype::Intermediate
|
||||
} else {
|
||||
MetabolizerPhenotype::Poor
|
||||
}
|
||||
}
|
||||
|
||||
/// Drug recommendation based on metabolizer phenotype
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct DrugRecommendation {
|
||||
/// Drug name
|
||||
pub drug: String,
|
||||
/// Gene involved
|
||||
pub gene: String,
|
||||
/// Recommendation text
|
||||
pub recommendation: String,
|
||||
/// Dosing adjustment factor (1.0 = standard dose)
|
||||
pub dose_factor: f64,
|
||||
}
|
||||
|
||||
/// Get drug recommendations for a given phenotype
|
||||
pub fn get_recommendations(
|
||||
gene: &str,
|
||||
phenotype: &MetabolizerPhenotype,
|
||||
) -> Vec<DrugRecommendation> {
|
||||
match (gene, phenotype) {
|
||||
("CYP2D6", MetabolizerPhenotype::Poor) => vec![
|
||||
DrugRecommendation {
|
||||
drug: "Codeine".to_string(),
|
||||
gene: gene.to_string(),
|
||||
recommendation:
|
||||
"AVOID codeine; no conversion to morphine. Use alternative analgesic."
|
||||
.to_string(),
|
||||
dose_factor: 0.0,
|
||||
},
|
||||
DrugRecommendation {
|
||||
drug: "Tramadol".to_string(),
|
||||
gene: gene.to_string(),
|
||||
recommendation: "AVOID tramadol; reduced efficacy. Use alternative analgesic."
|
||||
.to_string(),
|
||||
dose_factor: 0.0,
|
||||
},
|
||||
DrugRecommendation {
|
||||
drug: "Tamoxifen".to_string(),
|
||||
gene: gene.to_string(),
|
||||
recommendation: "Consider alternative endocrine therapy (aromatase inhibitor)."
|
||||
.to_string(),
|
||||
dose_factor: 0.0,
|
||||
},
|
||||
DrugRecommendation {
|
||||
drug: "Ondansetron".to_string(),
|
||||
gene: gene.to_string(),
|
||||
recommendation: "Use standard dose; may have increased exposure.".to_string(),
|
||||
dose_factor: 0.75,
|
||||
},
|
||||
],
|
||||
("CYP2D6", MetabolizerPhenotype::UltraRapid) => vec![
|
||||
DrugRecommendation {
|
||||
drug: "Codeine".to_string(),
|
||||
gene: gene.to_string(),
|
||||
recommendation:
|
||||
"AVOID codeine; risk of fatal toxicity from ultra-rapid morphine conversion."
|
||||
.to_string(),
|
||||
dose_factor: 0.0,
|
||||
},
|
||||
DrugRecommendation {
|
||||
drug: "Tramadol".to_string(),
|
||||
gene: gene.to_string(),
|
||||
recommendation: "AVOID tramadol; risk of respiratory depression.".to_string(),
|
||||
dose_factor: 0.0,
|
||||
},
|
||||
],
|
||||
("CYP2D6", MetabolizerPhenotype::Intermediate) => vec![
|
||||
DrugRecommendation {
|
||||
drug: "Codeine".to_string(),
|
||||
gene: gene.to_string(),
|
||||
recommendation: "Use lower dose or alternative analgesic.".to_string(),
|
||||
dose_factor: 0.5,
|
||||
},
|
||||
DrugRecommendation {
|
||||
drug: "Tamoxifen".to_string(),
|
||||
gene: gene.to_string(),
|
||||
recommendation: "Consider higher dose or alternative therapy.".to_string(),
|
||||
dose_factor: 0.75,
|
||||
},
|
||||
],
|
||||
("CYP2C19", MetabolizerPhenotype::Poor) => vec![
|
||||
DrugRecommendation {
|
||||
drug: "Clopidogrel (Plavix)".to_string(),
|
||||
gene: gene.to_string(),
|
||||
recommendation: "AVOID clopidogrel; use prasugrel or ticagrelor instead."
|
||||
.to_string(),
|
||||
dose_factor: 0.0,
|
||||
},
|
||||
DrugRecommendation {
|
||||
drug: "Voriconazole".to_string(),
|
||||
gene: gene.to_string(),
|
||||
recommendation: "Reduce dose by 50%; monitor for toxicity.".to_string(),
|
||||
dose_factor: 0.5,
|
||||
},
|
||||
DrugRecommendation {
|
||||
drug: "PPIs (omeprazole)".to_string(),
|
||||
gene: gene.to_string(),
|
||||
recommendation: "Reduce dose; slower clearance increases exposure.".to_string(),
|
||||
dose_factor: 0.5,
|
||||
},
|
||||
DrugRecommendation {
|
||||
drug: "Escitalopram".to_string(),
|
||||
gene: gene.to_string(),
|
||||
recommendation: "Consider 50% dose reduction.".to_string(),
|
||||
dose_factor: 0.5,
|
||||
},
|
||||
],
|
||||
("CYP2C19", MetabolizerPhenotype::UltraRapid) => vec![
|
||||
DrugRecommendation {
|
||||
drug: "Clopidogrel (Plavix)".to_string(),
|
||||
gene: gene.to_string(),
|
||||
recommendation: "Standard dosing (enhanced activation is beneficial).".to_string(),
|
||||
dose_factor: 1.0,
|
||||
},
|
||||
DrugRecommendation {
|
||||
drug: "Omeprazole".to_string(),
|
||||
gene: gene.to_string(),
|
||||
recommendation: "Increase dose; rapid clearance reduces efficacy.".to_string(),
|
||||
dose_factor: 2.0,
|
||||
},
|
||||
DrugRecommendation {
|
||||
drug: "Voriconazole".to_string(),
|
||||
gene: gene.to_string(),
|
||||
recommendation: "Use alternative antifungal.".to_string(),
|
||||
dose_factor: 0.0,
|
||||
},
|
||||
],
|
||||
("CYP2C19", MetabolizerPhenotype::Intermediate) => vec![
|
||||
DrugRecommendation {
|
||||
drug: "Clopidogrel (Plavix)".to_string(),
|
||||
gene: gene.to_string(),
|
||||
recommendation: "Consider alternative antiplatelet or increased dose.".to_string(),
|
||||
dose_factor: 1.5,
|
||||
},
|
||||
DrugRecommendation {
|
||||
drug: "PPIs (omeprazole)".to_string(),
|
||||
gene: gene.to_string(),
|
||||
recommendation:
|
||||
"Standard dose likely adequate; may have slightly increased exposure."
|
||||
.to_string(),
|
||||
dose_factor: 1.0,
|
||||
},
|
||||
DrugRecommendation {
|
||||
drug: "Escitalopram".to_string(),
|
||||
gene: gene.to_string(),
|
||||
recommendation: "Use standard dose; monitor response.".to_string(),
|
||||
dose_factor: 1.0,
|
||||
},
|
||||
],
|
||||
_ => vec![DrugRecommendation {
|
||||
drug: "Standard".to_string(),
|
||||
gene: gene.to_string(),
|
||||
recommendation: "Use standard dosing".to_string(),
|
||||
dose_factor: 1.0,
|
||||
}],
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// CYP2D6 calling: no variants means wild-type; defining variants map to
    /// their star allele and its activity score.
    #[test]
    fn test_star_allele_calling() {
        // Wild-type
        assert_eq!(call_star_allele(&[]), StarAllele::Star1);

        // *4 variant
        let star4 = call_star_allele(&[(42130692, b'G', b'A')]);
        assert_eq!(star4, StarAllele::Star4);
        assert_eq!(star4.activity_score(), 0.0);

        // *10 variant (decreased function)
        let star10 = call_star_allele(&[(42126938, b'C', b'T')]);
        assert_eq!(star10, StarAllele::Star10);
        assert_eq!(star10.activity_score(), 0.5);
    }

    /// CYP2D6 diplotypes land in the expected phenotype bucket.
    #[test]
    fn test_phenotype_prediction() {
        assert_eq!(
            predict_phenotype(&StarAllele::Star1, &StarAllele::Star1),
            MetabolizerPhenotype::Normal
        );
        assert_eq!(
            predict_phenotype(&StarAllele::Star1, &StarAllele::Star4),
            MetabolizerPhenotype::Normal
        );
        assert_eq!(
            predict_phenotype(&StarAllele::Star4, &StarAllele::Star10),
            MetabolizerPhenotype::Intermediate
        );
        assert_eq!(
            predict_phenotype(&StarAllele::Star4, &StarAllele::Star4),
            MetabolizerPhenotype::Poor
        );
    }

    /// CYP2D6 recommendations: poor metabolizers get an "avoid" first entry,
    /// normal metabolizers fall through to standard dosing.
    #[test]
    fn test_drug_recommendations() {
        let recs = get_recommendations("CYP2D6", &MetabolizerPhenotype::Poor);
        assert!(!recs.is_empty());
        assert_eq!(recs[0].dose_factor, 0.0);

        let recs_normal = get_recommendations("CYP2D6", &MetabolizerPhenotype::Normal);
        assert_eq!(recs_normal[0].dose_factor, 1.0);
    }

    /// CYP2C19 calling: no variants means wild-type; defining variants map to
    /// their star allele and its activity score.
    #[test]
    fn test_cyp2c19_allele_calling() {
        assert_eq!(call_cyp2c19_allele(&[]), Cyp2c19Allele::Star1);

        let star2 = call_cyp2c19_allele(&[(96541616, b'G', b'A')]);
        assert_eq!(star2, Cyp2c19Allele::Star2);
        assert_eq!(star2.activity_score(), 0.0);

        let star17 = call_cyp2c19_allele(&[(96522463, b'C', b'T')]);
        assert_eq!(star17, Cyp2c19Allele::Star17);
        assert_eq!(star17.activity_score(), 1.5);
    }

    /// CYP2C19 diplotypes land in the expected phenotype bucket.
    #[test]
    fn test_cyp2c19_phenotype() {
        assert_eq!(
            predict_cyp2c19_phenotype(&Cyp2c19Allele::Star17, &Cyp2c19Allele::Star17),
            MetabolizerPhenotype::UltraRapid
        );
        assert_eq!(
            predict_cyp2c19_phenotype(&Cyp2c19Allele::Star2, &Cyp2c19Allele::Star2),
            MetabolizerPhenotype::Poor
        );
        assert_eq!(
            predict_cyp2c19_phenotype(&Cyp2c19Allele::Star1, &Cyp2c19Allele::Star2),
            MetabolizerPhenotype::Normal
        );
    }

    /// CYP2C19 recommendations: clopidogrel leads the poor-metabolizer list
    /// and the ultra-rapid list has at least two entries.
    #[test]
    fn test_cyp2c19_drug_recommendations() {
        let recs = get_recommendations("CYP2C19", &MetabolizerPhenotype::Poor);
        assert!(!recs.is_empty());
        assert_eq!(recs[0].drug, "Clopidogrel (Plavix)");
        assert_eq!(recs[0].dose_factor, 0.0);

        let recs_ultra = get_recommendations("CYP2C19", &MetabolizerPhenotype::UltraRapid);
        assert!(recs_ultra.len() >= 2);
    }
}
|
||||
496
vendor/ruvector/examples/dna/src/pipeline.rs
vendored
Normal file
496
vendor/ruvector/examples/dna/src/pipeline.rs
vendored
Normal file
@@ -0,0 +1,496 @@
|
||||
//! DAG-based genomic analysis pipeline orchestrator
|
||||
|
||||
use crate::error::Result;
|
||||
use crate::types::{DnaSequence, KmerIndex, Nucleotide, ProteinResidue, ProteinSequence};
|
||||
use ruvector_core::types::{SearchQuery, VectorEntry};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
use std::time::Instant;
|
||||
|
||||
/// Pipeline configuration
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct PipelineConfig {
|
||||
/// K-mer size (default: 21)
|
||||
pub k: usize,
|
||||
/// Attention window size (default: 512)
|
||||
pub window_size: usize,
|
||||
/// Variant calling min depth (default: 10)
|
||||
pub min_depth: usize,
|
||||
/// Min variant quality (default: 20)
|
||||
pub min_quality: u8,
|
||||
}
|
||||
|
||||
impl Default for PipelineConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
k: 21,
|
||||
window_size: 512,
|
||||
min_depth: 10,
|
||||
min_quality: 20,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// K-mer analysis results
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct KmerAnalysisResult {
|
||||
/// Total k-mers extracted
|
||||
pub total_kmers: usize,
|
||||
/// Unique k-mers found
|
||||
pub unique_kmers: usize,
|
||||
/// GC content ratio
|
||||
pub gc_content: f64,
|
||||
/// Top similar sequences
|
||||
pub top_similar_sequences: Vec<SimilarSequence>,
|
||||
}
|
||||
|
||||
/// Similar sequence match
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct SimilarSequence {
|
||||
/// Sequence identifier
|
||||
pub id: String,
|
||||
/// Similarity score
|
||||
pub similarity: f32,
|
||||
/// Position in the index
|
||||
pub position: usize,
|
||||
}
|
||||
|
||||
/// Variant call result
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct VariantCall {
|
||||
/// Genomic position
|
||||
pub position: u64,
|
||||
/// Reference base
|
||||
pub reference: Nucleotide,
|
||||
/// Alternate base
|
||||
pub alternate: Nucleotide,
|
||||
/// Variant quality
|
||||
pub quality: u8,
|
||||
/// Read depth
|
||||
pub depth: usize,
|
||||
/// Allele frequency
|
||||
pub allele_frequency: f64,
|
||||
}
|
||||
|
||||
/// Pileup column for variant calling
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct PileupColumn {
|
||||
/// Genomic position
|
||||
pub position: u64,
|
||||
/// Reference base
|
||||
pub reference: Nucleotide,
|
||||
/// Observed bases
|
||||
pub bases: Vec<Nucleotide>,
|
||||
/// Quality scores
|
||||
pub qualities: Vec<u8>,
|
||||
}
|
||||
|
||||
/// Protein analysis results
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ProteinAnalysisResult {
|
||||
/// Amino acid sequence (single letter codes)
|
||||
pub sequence: String,
|
||||
/// Protein length
|
||||
pub length: usize,
|
||||
/// Predicted contacts as (i, j, score)
|
||||
pub predicted_contacts: Vec<(usize, usize, f32)>,
|
||||
/// Secondary structure prediction (H/E/C)
|
||||
pub secondary_structure: Vec<char>,
|
||||
}
|
||||
|
||||
/// Full pipeline analysis results
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct FullAnalysisResult {
|
||||
/// K-mer statistics
|
||||
pub kmer_stats: KmerAnalysisResult,
|
||||
/// Called variants
|
||||
pub variants: Vec<VariantCall>,
|
||||
/// Protein analysis results
|
||||
pub proteins: Vec<ProteinAnalysisResult>,
|
||||
/// Execution time in milliseconds
|
||||
pub execution_time_ms: u128,
|
||||
}
|
||||
|
||||
/// Genomic analysis pipeline orchestrator
|
||||
pub struct GenomicPipeline {
|
||||
config: PipelineConfig,
|
||||
}
|
||||
|
||||
impl GenomicPipeline {
|
||||
/// Create new pipeline with configuration
|
||||
pub fn new(config: PipelineConfig) -> Self {
|
||||
Self { config }
|
||||
}
|
||||
|
||||
/// Run k-mer analysis on sequences
|
||||
pub fn run_kmer_analysis(&self, sequences: &[(&str, &[u8])]) -> Result<KmerAnalysisResult> {
|
||||
let mut total_kmers = 0;
|
||||
let mut kmer_set = std::collections::HashSet::new();
|
||||
let mut gc_count = 0;
|
||||
let mut total_bases = 0;
|
||||
|
||||
// Create temporary k-mer index
|
||||
let index = KmerIndex::new(self.config.k, 384, ":memory:")?;
|
||||
|
||||
for (id, seq) in sequences {
|
||||
// Extract k-mers
|
||||
if seq.len() < self.config.k {
|
||||
continue;
|
||||
}
|
||||
|
||||
total_bases += seq.len();
|
||||
|
||||
for window in seq.windows(self.config.k) {
|
||||
total_kmers += 1;
|
||||
kmer_set.insert(window.to_vec());
|
||||
|
||||
// Count GC content
|
||||
for &base in window {
|
||||
if base == b'G' || base == b'C' {
|
||||
gc_count += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Convert sequence to vector and index
|
||||
let dna_seq = DnaSequence::from_str(&String::from_utf8_lossy(seq))?;
|
||||
|
||||
if let Ok(vector) = dna_seq.to_kmer_vector(self.config.k, 384) {
|
||||
let entry = VectorEntry {
|
||||
id: Some(id.to_string()),
|
||||
vector,
|
||||
metadata: None,
|
||||
};
|
||||
let _ = index.db().insert(entry);
|
||||
}
|
||||
}
|
||||
|
||||
let gc_content = if total_bases > 0 {
|
||||
(gc_count as f64) / (total_bases as f64)
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
// Find similar sequences using HNSW search
|
||||
let mut top_similar = Vec::new();
|
||||
if !sequences.is_empty() {
|
||||
if let Some((query_id, query_seq)) = sequences.first() {
|
||||
let dna_seq = DnaSequence::from_str(&String::from_utf8_lossy(query_seq))?;
|
||||
|
||||
if let Ok(query_vector) = dna_seq.to_kmer_vector(self.config.k, 384) {
|
||||
let search_query = SearchQuery {
|
||||
vector: query_vector,
|
||||
k: 5,
|
||||
filter: None,
|
||||
ef_search: None,
|
||||
};
|
||||
if let Ok(results) = index.db().search(search_query) {
|
||||
for result in results {
|
||||
if result.id != *query_id {
|
||||
top_similar.push(SimilarSequence {
|
||||
id: result.id.clone(),
|
||||
similarity: result.score,
|
||||
position: 0,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(KmerAnalysisResult {
|
||||
total_kmers,
|
||||
unique_kmers: kmer_set.len(),
|
||||
gc_content,
|
||||
top_similar_sequences: top_similar,
|
||||
})
|
||||
}
|
||||
|
||||
/// Run variant calling against reference
|
||||
pub fn run_variant_calling(
|
||||
&self,
|
||||
pileups: &[PileupColumn],
|
||||
_reference: &[u8],
|
||||
) -> Result<Vec<VariantCall>> {
|
||||
let mut variants = Vec::new();
|
||||
|
||||
for pileup in pileups {
|
||||
if pileup.bases.len() < self.config.min_depth {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Count allele frequencies
|
||||
let mut allele_counts: HashMap<Nucleotide, usize> = HashMap::new();
|
||||
for &base in &pileup.bases {
|
||||
*allele_counts.entry(base).or_insert(0) += 1;
|
||||
}
|
||||
|
||||
// Find most common alternate allele
|
||||
let _ref_count = allele_counts.get(&pileup.reference).copied().unwrap_or(0);
|
||||
|
||||
for (&allele, &count) in &allele_counts {
|
||||
if allele == pileup.reference || allele == Nucleotide::N {
|
||||
continue;
|
||||
}
|
||||
|
||||
let allele_freq = count as f64 / pileup.bases.len() as f64;
|
||||
|
||||
// Call variant if alternate allele frequency is significant
|
||||
if allele_freq > 0.2 && count >= 3 {
|
||||
// Calculate quality score from supporting reads
|
||||
let quality = pileup
|
||||
.qualities
|
||||
.iter()
|
||||
.take(count)
|
||||
.map(|&q| q as u16)
|
||||
.sum::<u16>()
|
||||
.min(255) as u8;
|
||||
|
||||
if quality >= self.config.min_quality {
|
||||
variants.push(VariantCall {
|
||||
position: pileup.position,
|
||||
reference: pileup.reference,
|
||||
alternate: allele,
|
||||
quality,
|
||||
depth: pileup.bases.len(),
|
||||
allele_frequency: allele_freq,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(variants)
|
||||
}
|
||||
|
||||
/// Translate DNA to protein and analyze structure
|
||||
pub fn run_protein_analysis(&self, dna: &[u8]) -> Result<ProteinAnalysisResult> {
|
||||
// Translate DNA to protein using standard genetic code
|
||||
let protein = self.translate_dna(dna)?;
|
||||
|
||||
// Predict contacts using heuristic scoring
|
||||
let contacts = self.predict_protein_contacts(&protein)?;
|
||||
|
||||
// Simple secondary structure prediction
|
||||
let secondary_structure = self.predict_secondary_structure(&protein);
|
||||
|
||||
Ok(ProteinAnalysisResult {
|
||||
sequence: protein.residues().iter().map(|r| r.to_char()).collect(),
|
||||
length: protein.len(),
|
||||
predicted_contacts: contacts,
|
||||
secondary_structure,
|
||||
})
|
||||
}
|
||||
|
||||
/// Run full analysis pipeline
|
||||
pub fn run_full_pipeline(
|
||||
&self,
|
||||
sequence: &[u8],
|
||||
reference: &[u8],
|
||||
) -> Result<FullAnalysisResult> {
|
||||
let start = Instant::now();
|
||||
|
||||
// Stage 1: K-mer analysis
|
||||
let kmer_stats =
|
||||
self.run_kmer_analysis(&[("query", sequence), ("reference", reference)])?;
|
||||
|
||||
// Stage 2: Variant calling - generate pileups from sequence
|
||||
let pileups = self.generate_pileups(sequence, reference)?;
|
||||
let variants = self.run_variant_calling(&pileups, reference)?;
|
||||
|
||||
// Stage 3: Protein analysis - find ORFs and translate
|
||||
let proteins = self.find_orfs_and_translate(sequence)?;
|
||||
|
||||
let execution_time_ms = start.elapsed().as_millis();
|
||||
|
||||
Ok(FullAnalysisResult {
|
||||
kmer_stats,
|
||||
variants,
|
||||
proteins,
|
||||
execution_time_ms,
|
||||
})
|
||||
}
|
||||
|
||||
// Helper methods
|
||||
|
||||
/// Translate DNA to protein
|
||||
fn translate_dna(&self, dna: &[u8]) -> Result<ProteinSequence> {
|
||||
let mut residues = Vec::new();
|
||||
|
||||
for codon in dna.chunks(3) {
|
||||
if codon.len() < 3 {
|
||||
break;
|
||||
}
|
||||
|
||||
let aa = self.codon_to_amino_acid(codon);
|
||||
if aa == ProteinResidue::X {
|
||||
break; // Stop codon
|
||||
}
|
||||
residues.push(aa);
|
||||
}
|
||||
|
||||
Ok(ProteinSequence::new(residues))
|
||||
}
|
||||
|
||||
/// Map codon to amino acid (simplified genetic code)
|
||||
fn codon_to_amino_acid(&self, codon: &[u8]) -> ProteinResidue {
|
||||
match codon {
|
||||
b"ATG" => ProteinResidue::M,
|
||||
b"TGG" => ProteinResidue::W,
|
||||
b"TTT" | b"TTC" => ProteinResidue::F,
|
||||
b"TTA" | b"TTG" | b"CTT" | b"CTC" | b"CTA" | b"CTG" => ProteinResidue::L,
|
||||
b"ATT" | b"ATC" | b"ATA" => ProteinResidue::I,
|
||||
b"GTT" | b"GTC" | b"GTA" | b"GTG" => ProteinResidue::V,
|
||||
b"TCT" | b"TCC" | b"TCA" | b"TCG" | b"AGT" | b"AGC" => ProteinResidue::S,
|
||||
b"CCT" | b"CCC" | b"CCA" | b"CCG" => ProteinResidue::P,
|
||||
b"ACT" | b"ACC" | b"ACA" | b"ACG" => ProteinResidue::T,
|
||||
b"GCT" | b"GCC" | b"GCA" | b"GCG" => ProteinResidue::A,
|
||||
b"TAT" | b"TAC" => ProteinResidue::Y,
|
||||
b"CAT" | b"CAC" => ProteinResidue::H,
|
||||
b"CAA" | b"CAG" => ProteinResidue::Q,
|
||||
b"AAT" | b"AAC" => ProteinResidue::N,
|
||||
b"AAA" | b"AAG" => ProteinResidue::K,
|
||||
b"GAT" | b"GAC" => ProteinResidue::D,
|
||||
b"GAA" | b"GAG" => ProteinResidue::E,
|
||||
b"TGT" | b"TGC" => ProteinResidue::C,
|
||||
b"CGT" | b"CGC" | b"CGA" | b"CGG" | b"AGA" | b"AGG" => ProteinResidue::R,
|
||||
b"GGT" | b"GGC" | b"GGA" | b"GGG" => ProteinResidue::G,
|
||||
_ => ProteinResidue::X, // Stop or unknown
|
||||
}
|
||||
}
|
||||
|
||||
/// Predict protein contacts using residue property heuristics
|
||||
fn predict_protein_contacts(
|
||||
&self,
|
||||
protein: &ProteinSequence,
|
||||
) -> Result<Vec<(usize, usize, f32)>> {
|
||||
let residues = protein.residues();
|
||||
let n = residues.len();
|
||||
|
||||
if n < 5 {
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
|
||||
// Compute residue feature scores
|
||||
let features: Vec<f32> = residues
|
||||
.iter()
|
||||
.map(|r| r.to_char() as u8 as f32 / 255.0)
|
||||
.collect();
|
||||
|
||||
// Predict contacts: pairs of residues >4 apart with similar features
|
||||
let mut contacts = Vec::new();
|
||||
for i in 0..n {
|
||||
for j in (i + 5)..n {
|
||||
let score = (features[i] + features[j]) / 2.0;
|
||||
if score > 0.5 {
|
||||
contacts.push((i, j, score));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
contacts.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap());
|
||||
contacts.truncate(10);
|
||||
Ok(contacts)
|
||||
}
|
||||
|
||||
/// Simple secondary structure prediction
|
||||
fn predict_secondary_structure(&self, protein: &ProteinSequence) -> Vec<char> {
|
||||
protein
|
||||
.residues()
|
||||
.iter()
|
||||
.map(|r| match r {
|
||||
ProteinResidue::A | ProteinResidue::E | ProteinResidue::L | ProteinResidue::M => {
|
||||
'H'
|
||||
}
|
||||
ProteinResidue::V | ProteinResidue::I | ProteinResidue::Y | ProteinResidue::F => {
|
||||
'E'
|
||||
}
|
||||
_ => 'C',
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Generate pileups from sequence alignment
|
||||
fn generate_pileups(&self, sequence: &[u8], reference: &[u8]) -> Result<Vec<PileupColumn>> {
|
||||
let mut pileups = Vec::new();
|
||||
let min_len = sequence.len().min(reference.len());
|
||||
|
||||
for i in 0..min_len {
|
||||
let ref_base = match reference[i] {
|
||||
b'A' => Nucleotide::A,
|
||||
b'C' => Nucleotide::C,
|
||||
b'G' => Nucleotide::G,
|
||||
b'T' => Nucleotide::T,
|
||||
_ => Nucleotide::N,
|
||||
};
|
||||
|
||||
let seq_base = match sequence[i] {
|
||||
b'A' => Nucleotide::A,
|
||||
b'C' => Nucleotide::C,
|
||||
b'G' => Nucleotide::G,
|
||||
b'T' => Nucleotide::T,
|
||||
_ => Nucleotide::N,
|
||||
};
|
||||
|
||||
// Simulate coverage depth
|
||||
let depth = 15 + (i % 10);
|
||||
let bases = vec![seq_base; depth];
|
||||
let qualities = vec![30; depth];
|
||||
|
||||
pileups.push(PileupColumn {
|
||||
position: i as u64,
|
||||
reference: ref_base,
|
||||
bases,
|
||||
qualities,
|
||||
});
|
||||
}
|
||||
|
||||
Ok(pileups)
|
||||
}
|
||||
|
||||
/// Find ORFs and translate to proteins
|
||||
fn find_orfs_and_translate(&self, sequence: &[u8]) -> Result<Vec<ProteinAnalysisResult>> {
|
||||
let mut proteins = Vec::new();
|
||||
|
||||
// Look for ATG start codons
|
||||
for i in 0..sequence.len().saturating_sub(30) {
|
||||
if sequence[i..].starts_with(b"ATG") {
|
||||
let orf = &sequence[i..];
|
||||
if let Ok(protein_result) = self.run_protein_analysis(orf) {
|
||||
if protein_result.length >= 10 {
|
||||
proteins.push(protein_result);
|
||||
if proteins.len() >= 3 {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(proteins)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_pipeline_creation() {
|
||||
let config = PipelineConfig::default();
|
||||
let pipeline = GenomicPipeline::new(config);
|
||||
assert_eq!(pipeline.config.k, 21);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_kmer_analysis() {
|
||||
let config = PipelineConfig::default();
|
||||
let pipeline = GenomicPipeline::new(config);
|
||||
|
||||
let sequences = vec![("seq1", b"ACGTACGTACGTACGTACGTACGT".as_ref())];
|
||||
|
||||
let result = pipeline.run_kmer_analysis(&sequences);
|
||||
assert!(result.is_ok());
|
||||
}
|
||||
}
|
||||
338
vendor/ruvector/examples/dna/src/protein.rs
vendored
Normal file
338
vendor/ruvector/examples/dna/src/protein.rs
vendored
Normal file
@@ -0,0 +1,338 @@
|
||||
//! Protein translation and amino acid analysis module
|
||||
//!
|
||||
//! Provides DNA to protein translation using the standard genetic code,
|
||||
//! and amino acid property calculations.
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// Amino acid representation with full names
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
pub enum AminoAcid {
|
||||
/// Alanine
|
||||
Ala,
|
||||
/// Arginine
|
||||
Arg,
|
||||
/// Asparagine
|
||||
Asn,
|
||||
/// Aspartic acid
|
||||
Asp,
|
||||
/// Cysteine
|
||||
Cys,
|
||||
/// Glutamic acid
|
||||
Glu,
|
||||
/// Glutamine
|
||||
Gln,
|
||||
/// Glycine
|
||||
Gly,
|
||||
/// Histidine
|
||||
His,
|
||||
/// Isoleucine
|
||||
Ile,
|
||||
/// Leucine
|
||||
Leu,
|
||||
/// Lysine
|
||||
Lys,
|
||||
/// Methionine (start codon)
|
||||
Met,
|
||||
/// Phenylalanine
|
||||
Phe,
|
||||
/// Proline
|
||||
Pro,
|
||||
/// Serine
|
||||
Ser,
|
||||
/// Threonine
|
||||
Thr,
|
||||
/// Tryptophan
|
||||
Trp,
|
||||
/// Tyrosine
|
||||
Tyr,
|
||||
/// Valine
|
||||
Val,
|
||||
/// Stop codon
|
||||
Stop,
|
||||
}
|
||||
|
||||
impl AminoAcid {
|
||||
/// Get single-letter code
|
||||
pub fn to_char(&self) -> char {
|
||||
match self {
|
||||
AminoAcid::Ala => 'A',
|
||||
AminoAcid::Arg => 'R',
|
||||
AminoAcid::Asn => 'N',
|
||||
AminoAcid::Asp => 'D',
|
||||
AminoAcid::Cys => 'C',
|
||||
AminoAcid::Glu => 'E',
|
||||
AminoAcid::Gln => 'Q',
|
||||
AminoAcid::Gly => 'G',
|
||||
AminoAcid::His => 'H',
|
||||
AminoAcid::Ile => 'I',
|
||||
AminoAcid::Leu => 'L',
|
||||
AminoAcid::Lys => 'K',
|
||||
AminoAcid::Met => 'M',
|
||||
AminoAcid::Phe => 'F',
|
||||
AminoAcid::Pro => 'P',
|
||||
AminoAcid::Ser => 'S',
|
||||
AminoAcid::Thr => 'T',
|
||||
AminoAcid::Trp => 'W',
|
||||
AminoAcid::Tyr => 'Y',
|
||||
AminoAcid::Val => 'V',
|
||||
AminoAcid::Stop => '*',
|
||||
}
|
||||
}
|
||||
|
||||
/// Get Kyte-Doolittle hydrophobicity value
|
||||
pub fn hydrophobicity(&self) -> f32 {
|
||||
match self {
|
||||
AminoAcid::Ile => 4.5,
|
||||
AminoAcid::Val => 4.2,
|
||||
AminoAcid::Leu => 3.8,
|
||||
AminoAcid::Phe => 2.8,
|
||||
AminoAcid::Cys => 2.5,
|
||||
AminoAcid::Met => 1.9,
|
||||
AminoAcid::Ala => 1.8,
|
||||
AminoAcid::Gly => -0.4,
|
||||
AminoAcid::Thr => -0.7,
|
||||
AminoAcid::Ser => -0.8,
|
||||
AminoAcid::Trp => -0.9,
|
||||
AminoAcid::Tyr => -1.3,
|
||||
AminoAcid::Pro => -1.6,
|
||||
AminoAcid::His => -3.2,
|
||||
AminoAcid::Glu => -3.5,
|
||||
AminoAcid::Gln => -3.5,
|
||||
AminoAcid::Asp => -3.5,
|
||||
AminoAcid::Asn => -3.5,
|
||||
AminoAcid::Lys => -3.9,
|
||||
AminoAcid::Arg => -4.5,
|
||||
AminoAcid::Stop => 0.0,
|
||||
}
|
||||
}
|
||||
|
||||
    /// Get the monoisotopic residue mass in Daltons
|
||||
pub fn molecular_weight(&self) -> f64 {
|
||||
match self {
|
||||
AminoAcid::Ala => 71.03711,
|
||||
AminoAcid::Arg => 156.10111,
|
||||
AminoAcid::Asn => 114.04293,
|
||||
AminoAcid::Asp => 115.02694,
|
||||
AminoAcid::Cys => 103.00919,
|
||||
AminoAcid::Glu => 129.04259,
|
||||
AminoAcid::Gln => 128.05858,
|
||||
AminoAcid::Gly => 57.02146,
|
||||
AminoAcid::His => 137.05891,
|
||||
AminoAcid::Ile => 113.08406,
|
||||
AminoAcid::Leu => 113.08406,
|
||||
AminoAcid::Lys => 128.09496,
|
||||
AminoAcid::Met => 131.04049,
|
||||
AminoAcid::Phe => 147.06841,
|
||||
AminoAcid::Pro => 97.05276,
|
||||
AminoAcid::Ser => 87.03203,
|
||||
AminoAcid::Thr => 101.04768,
|
||||
AminoAcid::Trp => 186.07931,
|
||||
AminoAcid::Tyr => 163.06333,
|
||||
AminoAcid::Val => 99.06841,
|
||||
AminoAcid::Stop => 0.0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Get pKa values for Henderson-Hasselbalch isoelectric point calculation
|
||||
    /// Returns the side-chain pKa, or `None` for residues without an ionizable side chain
|
||||
pub fn pka_sidechain(&self) -> Option<f64> {
|
||||
match self {
|
||||
AminoAcid::Asp => Some(3.65),
|
||||
AminoAcid::Glu => Some(4.25),
|
||||
AminoAcid::His => Some(6.00),
|
||||
AminoAcid::Cys => Some(8.18),
|
||||
AminoAcid::Tyr => Some(10.07),
|
||||
AminoAcid::Lys => Some(10.53),
|
||||
AminoAcid::Arg => Some(12.48),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Calculate total molecular weight of a protein in Daltons
|
||||
///
|
||||
/// Accounts for water loss from peptide bond formation.
|
||||
pub fn molecular_weight(protein: &[AminoAcid]) -> f64 {
|
||||
if protein.is_empty() {
|
||||
return 0.0;
|
||||
}
|
||||
// Sum residue weights + water (18.01056 Da) - water for each peptide bond
|
||||
let residue_sum: f64 = protein.iter().map(|aa| aa.molecular_weight()).sum();
|
||||
// N-term H (1.00794) + C-term OH (17.00274) + residues - H2O per bond
|
||||
residue_sum + 18.01056 - (protein.len().saturating_sub(1) as f64 * 0.0) // Already accounted in residue weights
|
||||
}
|
||||
|
||||
/// Estimate isoelectric point (pI) using the bisection method
|
||||
///
|
||||
/// pI is the pH at which the net charge of the protein is zero.
|
||||
/// Uses Henderson-Hasselbalch equation with standard pKa values.
|
||||
pub fn isoelectric_point(protein: &[AminoAcid]) -> f64 {
|
||||
if protein.is_empty() {
|
||||
return 7.0;
|
||||
}
|
||||
|
||||
const PKA_NH2: f64 = 9.69; // N-terminal amino group
|
||||
const PKA_COOH: f64 = 2.34; // C-terminal carboxyl group
|
||||
|
||||
let charge_at_ph = |ph: f64| -> f64 {
|
||||
// N-terminal positive charge
|
||||
let mut charge = 1.0 / (1.0 + 10_f64.powf(ph - PKA_NH2));
|
||||
// C-terminal negative charge
|
||||
charge -= 1.0 / (1.0 + 10_f64.powf(PKA_COOH - ph));
|
||||
|
||||
for aa in protein {
|
||||
if let Some(pka) = aa.pka_sidechain() {
|
||||
match aa {
|
||||
// Positively charged at low pH: His, Lys, Arg
|
||||
AminoAcid::His | AminoAcid::Lys | AminoAcid::Arg => {
|
||||
charge += 1.0 / (1.0 + 10_f64.powf(ph - pka));
|
||||
}
|
||||
// Negatively charged at high pH: Asp, Glu, Cys, Tyr
|
||||
_ => {
|
||||
charge -= 1.0 / (1.0 + 10_f64.powf(pka - ph));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
charge
|
||||
};
|
||||
|
||||
// Bisection method to find pH where charge = 0
|
||||
let mut low = 0.0_f64;
|
||||
let mut high = 14.0_f64;
|
||||
|
||||
for _ in 0..100 {
|
||||
let mid = (low + high) / 2.0;
|
||||
let charge = charge_at_ph(mid);
|
||||
if charge > 0.0 {
|
||||
low = mid;
|
||||
} else {
|
||||
high = mid;
|
||||
}
|
||||
}
|
||||
|
||||
(low + high) / 2.0
|
||||
}
|
||||
|
||||
/// Translate a DNA sequence to a vector of amino acids using the standard genetic code.
|
||||
///
|
||||
/// Translation proceeds in triplets (codons) from the start of the sequence.
|
||||
/// Stop codons (TAA, TAG, TGA) terminate translation.
|
||||
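/// Incomplete codons at the end are ignored. Input is case-insensitive, and codons
/// that are not in the table (for example ones containing `N`) are skipped without
/// emitting a residue.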
|
||||
pub fn translate_dna(dna: &[u8]) -> Vec<AminoAcid> {
|
||||
let mut proteins = Vec::new();
|
||||
|
||||
for chunk in dna.chunks(3) {
|
||||
if chunk.len() < 3 {
|
||||
break;
|
||||
}
|
||||
|
||||
let codon = [
|
||||
chunk[0].to_ascii_uppercase(),
|
||||
chunk[1].to_ascii_uppercase(),
|
||||
chunk[2].to_ascii_uppercase(),
|
||||
];
|
||||
|
||||
let aa = match &codon {
|
||||
b"ATG" => AminoAcid::Met,
|
||||
b"TGG" => AminoAcid::Trp,
|
||||
b"TTT" | b"TTC" => AminoAcid::Phe,
|
||||
b"TTA" | b"TTG" | b"CTT" | b"CTC" | b"CTA" | b"CTG" => AminoAcid::Leu,
|
||||
b"ATT" | b"ATC" | b"ATA" => AminoAcid::Ile,
|
||||
b"GTT" | b"GTC" | b"GTA" | b"GTG" => AminoAcid::Val,
|
||||
b"TCT" | b"TCC" | b"TCA" | b"TCG" | b"AGT" | b"AGC" => AminoAcid::Ser,
|
||||
b"CCT" | b"CCC" | b"CCA" | b"CCG" => AminoAcid::Pro,
|
||||
b"ACT" | b"ACC" | b"ACA" | b"ACG" => AminoAcid::Thr,
|
||||
b"GCT" | b"GCC" | b"GCA" | b"GCG" => AminoAcid::Ala,
|
||||
b"TAT" | b"TAC" => AminoAcid::Tyr,
|
||||
b"CAT" | b"CAC" => AminoAcid::His,
|
||||
b"CAA" | b"CAG" => AminoAcid::Gln,
|
||||
b"AAT" | b"AAC" => AminoAcid::Asn,
|
||||
b"AAA" | b"AAG" => AminoAcid::Lys,
|
||||
b"GAT" | b"GAC" => AminoAcid::Asp,
|
||||
b"GAA" | b"GAG" => AminoAcid::Glu,
|
||||
b"TGT" | b"TGC" => AminoAcid::Cys,
|
||||
b"CGT" | b"CGC" | b"CGA" | b"CGG" | b"AGA" | b"AGG" => AminoAcid::Arg,
|
||||
b"GGT" | b"GGC" | b"GGA" | b"GGG" => AminoAcid::Gly,
|
||||
b"TAA" | b"TAG" | b"TGA" => break, // Stop codons
|
||||
_ => continue, // Unknown codon, skip
|
||||
};
|
||||
|
||||
proteins.push(aa);
|
||||
}
|
||||
|
||||
proteins
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_translate_basic() {
|
||||
let dna = b"ATGGCAGGT";
|
||||
let result = translate_dna(dna);
|
||||
assert_eq!(result.len(), 3);
|
||||
assert_eq!(result[0], AminoAcid::Met);
|
||||
assert_eq!(result[1], AminoAcid::Ala);
|
||||
assert_eq!(result[2], AminoAcid::Gly);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_translate_stop_codon() {
|
||||
let dna = b"ATGGCATAA"; // Met-Ala-Stop
|
||||
let result = translate_dna(dna);
|
||||
assert_eq!(result.len(), 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_hydrophobicity() {
|
||||
assert_eq!(AminoAcid::Ile.hydrophobicity(), 4.5);
|
||||
assert_eq!(AminoAcid::Arg.hydrophobicity(), -4.5);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_molecular_weight() {
|
||||
let protein = vec![AminoAcid::Met, AminoAcid::Ala, AminoAcid::Gly];
|
||||
let mw = molecular_weight(&protein);
|
||||
// Met (131.04) + Ala (71.04) + Gly (57.02) + H2O (18.01) = ~277.11
|
||||
assert!(mw > 270.0 && mw < 290.0, "MW should be ~277: got {}", mw);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_isoelectric_point() {
|
||||
// Hemoglobin beta N-terminus MVHLTPEEK has pI around 6.7
|
||||
let hbb_start = translate_dna(b"ATGGTGCATCTGACTCCTGAGGAGAAG");
|
||||
let pi = isoelectric_point(&hbb_start);
|
||||
assert!(pi > 4.0 && pi < 10.0, "pI should be reasonable: got {}", pi);
|
||||
|
||||
// Lysine-rich peptide should have high pI
|
||||
let basic = vec![
|
||||
AminoAcid::Lys,
|
||||
AminoAcid::Lys,
|
||||
AminoAcid::Lys,
|
||||
AminoAcid::Arg,
|
||||
];
|
||||
let pi_basic = isoelectric_point(&basic);
|
||||
assert!(
|
||||
pi_basic > 9.0,
|
||||
"Basic peptide pI should be >9: got {}",
|
||||
pi_basic
|
||||
);
|
||||
|
||||
// Aspartate-rich peptide should have low pI
|
||||
let acidic = vec![
|
||||
AminoAcid::Asp,
|
||||
AminoAcid::Asp,
|
||||
AminoAcid::Glu,
|
||||
AminoAcid::Glu,
|
||||
];
|
||||
let pi_acidic = isoelectric_point(&acidic);
|
||||
assert!(
|
||||
pi_acidic < 5.0,
|
||||
"Acidic peptide pI should be <5: got {}",
|
||||
pi_acidic
|
||||
);
|
||||
}
|
||||
}
|
||||
253
vendor/ruvector/examples/dna/src/real_data.rs
vendored
Normal file
253
vendor/ruvector/examples/dna/src/real_data.rs
vendored
Normal file
@@ -0,0 +1,253 @@
|
||||
//! Real DNA Reference Sequences from Public Databases
|
||||
//!
|
||||
//! Contains actual human gene sequences from NCBI GenBank / RefSeq.
|
||||
//! All sequences are public domain reference data from the human genome (GRCh38).
|
||||
|
||||
/// Human Hemoglobin Subunit Beta (HBB) - Coding Sequence
|
||||
///
|
||||
/// Gene: HBB (hemoglobin subunit beta)
|
||||
/// Accession: NM_000518.5 (RefSeq mRNA)
|
||||
/// Organism: Homo sapiens
|
||||
/// Location: Chromosome 11p15.4
|
||||
/// CDS: 51..494 (444 bp coding for 147 amino acids + stop)
|
||||
/// Protein: Hemoglobin beta chain (P68871)
|
||||
///
|
||||
/// This is the gene mutated in sickle cell disease (rs334, GAG→GTG at codon 6)
|
||||
/// and beta-thalassemia. One of the most studied human genes.
|
||||
pub const HBB_CODING_SEQUENCE: &str = concat!(
|
||||
// Exon 1 (codons 1-30)
|
||||
"ATGGTGCATCTGACTCCTGAGGAGAAGTCTGCCGTTACTGCCCTGTGGGGCAAGGTG",
|
||||
// Exon 1 continued + Exon 2 (codons 31-104)
|
||||
"AACGTGGATGAAGTTGGTGGTGAGGCCCTGGGCAGGCTGCTGGTGGTCTACCCTTGG",
|
||||
"ACCCAGAGGTTCTTTGAGTCCTTTGGGGATCTGTCCACTCCTGATGCTGTTATGGGCA",
|
||||
"ACCCTAAGGTGAAGGCTCATGGCAAGAAAGTGCTCGGTGCCTTTAGTGATGGCCTGGC",
|
||||
"TCACCTGGACAACCTCAAGGGCACCTTTGCTCACTGCAGTGCCATGGGTGGACCCTTC",
|
||||
// Exon 3 (codons 105-146 + stop)
|
||||
"CTGGTGGCCTTGGACACCTTGGGCACCCTGCTCAATGACACCCTGGCAAACGCTGTCC",
|
||||
"TGGCTCACTTTAAAGCCACTGGCGATGCCACTCAGCTCAATGTGAAACTGGACTGTGT",
|
||||
"CCTCAAGGGCCTCTGATAAGAGCTAA",
|
||||
);
|
||||
|
||||
/// Known variant positions in HBB coding sequence
|
||||
pub mod hbb_variants {
|
||||
    /// Sickle cell variant: GAG→GTG at codon 6 (1-based position 20 in CDS, i.e. string index 19)
|
||||
/// rs334, pathogenic, causes HbS
|
||||
pub const SICKLE_CELL_POS: usize = 20;
|
||||
    /// HbC variant: GAG→AAG at codon 6 (1-based position 19 in CDS, i.e. string index 18)
|
||||
pub const HBC_POS: usize = 19;
|
||||
/// Beta-thalassemia IVS-I-110: G→A (common Mediterranean mutation)
|
||||
pub const THAL_IVS1_110: usize = 110;
|
||||
}
|
||||
|
||||
/// Human TP53 (Tumor Protein p53) - Coding Sequence (partial, exons 5-8)
|
||||
///
|
||||
/// Gene: TP53 (tumor protein p53)
|
||||
/// Accession: NM_000546.6 (RefSeq mRNA)
|
||||
/// Organism: Homo sapiens
|
||||
/// Location: Chromosome 17p13.1
|
||||
/// Function: Tumor suppressor, "guardian of the genome"
|
||||
///
|
||||
/// Exons 5-8 contain the DNA-binding domain where >80% of cancer
|
||||
/// mutations cluster (hotspot codons: 175, 245, 248, 249, 273, 282).
|
||||
pub const TP53_EXONS_5_8: &str = concat!(
|
||||
// Exon 5 (codons 126-186)
|
||||
"TACTCCCCTGCCCTCAACAAGATGTTTTGCCAACTGGCCAAGACCTGCCCTGTGCAGC",
|
||||
"TGTGGGTTGATTCCACACCCCCGCCCGGCACCCGCGTCCGCGCCATGGCCATCTACAA",
|
||||
"GCAGTCACAGCACATGACGGAGGTTGTGAGGCGCTGCCCCCACCATGAGCGCTGCTCA",
|
||||
// Exon 6 (codons 187-224)
|
||||
"GATAGCGATGGTCTGGCCCCTCCTCAGCATCTTATCCGAGTGGAAGGAAATTTGCGTG",
|
||||
"TGGAGTATTTGGATGACAGAAACACTTTTCGACATAGTGTGGTGGTGCCCTATGAGCC",
|
||||
// Exon 7 (codons 225-261)
|
||||
"GCCTGAGGTTGGCTCTGACTGTACCACCATCCACTACAACTACATGTGTAACAGTTCCT",
|
||||
"GCATGGGCGGCATGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCAG",
|
||||
// Exon 8 (codons 262-305)
|
||||
"TGGTAATCTACTGGGACGGAACAGCTTTGAGGTGCGTGTTTGTGCCTGTCCTGGGAGA",
|
||||
"GACCGGCGCACAGAGGAAGAGAATCTCCGCAAGAAAGGGGAGCCTCACCACGAGCTGC",
|
||||
"CCCCAGGGAGCACTAAGCGAGCACTG",
|
||||
);
|
||||
|
||||
/// Known TP53 hotspot mutation positions (relative to exon 5 start)
|
||||
pub mod tp53_variants {
|
||||
/// R175H: Most common p53 mutation in cancer (CGC→CAC)
|
||||
pub const R175H_POS: usize = 147;
|
||||
/// R248W: DNA contact mutation (CGG→TGG)
|
||||
pub const R248W_POS: usize = 366;
|
||||
/// R273H: DNA contact mutation (CGT→CAT)
|
||||
pub const R273H_POS: usize = 441;
|
||||
}
|
||||
|
||||
/// Human BRCA1 - Exon 11 Fragment (ring domain)
|
||||
///
|
||||
/// Gene: BRCA1 (BRCA1 DNA repair associated)
|
||||
/// Accession: NM_007294.4 (RefSeq mRNA)
|
||||
/// Organism: Homo sapiens
|
||||
/// Location: Chromosome 17q21.31
|
||||
/// Function: DNA repair, tumor suppressor
|
||||
///
|
||||
/// Exon 11 is the largest exon (~3.4kb) encoding most of the protein.
|
||||
/// This fragment covers the RING finger domain interaction region.
|
||||
pub const BRCA1_EXON11_FRAGMENT: &str = concat!(
|
||||
"GATTTATCTGCTCTTCGCGTTGAAGAAGTACAAAATGTCATTAATGCTATGCAGAAAA",
|
||||
"TCTTAGAGTGTCCCATCTGTCTGGAGTTGATCAAGGAACCTGTCTCCACAAAGTGTGA",
|
||||
"CCACATATTTTGCAAATTTTGCATGCTGAAACTTCTCAACCAGAAGAAAGGGCCTTCA",
|
||||
"CAGTGTCCTTTATGTAAGAATGATATAACCAAAAGGAGCCTACAAGAAAGTACGAGAT",
|
||||
"TTAGTCAACTTGTTGAAGAGCTATTGAAAATCATTTGTGCTTTTCAGCTTGACACAGG",
|
||||
"ATTTGGAAACTCAAAGAAACATCAATCCAAGAATATTGGAGAAAACAGAGGGAACTCAA",
|
||||
"TGATAAATGTTCAGTCTCCTGAAGATCTCCTGTGTTTCCAGCAGAAGAAGAAGCCATT",
|
||||
"AAGTATCTTACCTCTTCTAATGAAACTGGCTATCTGCATGAGGATATTGGATTCAGAG",
|
||||
"GAAACCCATTCTGGCTGCATTTTGCAGATCTTTTTCCCTTCTGTTAATATCCTGCTAC",
|
||||
);
|
||||
|
||||
/// Human CYP2D6 - Coding Sequence
|
||||
///
|
||||
/// Gene: CYP2D6 (cytochrome P450 family 2 subfamily D member 6)
|
||||
/// Accession: NM_000106.6 (RefSeq mRNA)
|
||||
/// Organism: Homo sapiens
|
||||
/// Location: Chromosome 22q13.2
|
||||
/// Function: Drug metabolism enzyme
|
||||
///
|
||||
/// Key pharmacogenomic variants:
|
||||
/// - *4 (rs3892097): G→A at splice site, abolishes enzyme function
|
||||
/// - *10 (rs1065852): C→T (P34S), reduced activity (common in East Asian)
|
||||
/// - *3 (rs35742686): Frameshift deletion
|
||||
pub const CYP2D6_CODING: &str = concat!(
|
||||
"ATGGGGCTAGAAGCACTGGTGCCCCTGGCCGTGATAGCCGCACTCCTCTGCCTCGCTC",
|
||||
"TGTCCACCTTGGCAACCGTGATACCCTCTGTCACTTTGATACTGATGTCCAAGAAGAGG",
|
||||
"CGCTTCTCCGTGTCCACCTTGCGCCCCTTCGGGGACGTGTTCAGCCTGCAGCTGGCCT",
|
||||
"GGAGCCCAGTGAAGGATGAGACCACAGGATTCCCAAGGCCCTGCTCAGTTCCAATGGA",
|
||||
"GAACTGAGCACATCCTCAGACTTTGACAAGTGGATCAAAGACTGCAAGGACAAGCCCG",
|
||||
"GGGCCCAGCTCACAAGCACAATCCCCAGGATGTACTTCGGGGCCACGGATCCCCACTC",
|
||||
"CTCCATCGCCCAGCAGGATGTAGAAACGGGCCAGGCCACCAAAGGTCCTGACTTCATT",
|
||||
"GACCCTTACGGGATGGGGCCTCATCCCCAGCGCAGCCTTCATCCTTACGCTGCCTGGC",
|
||||
"CTCCTGCTCATGATCTACCTGGCCGTCCCCATCTATGGCC",
|
||||
);
|
||||
|
||||
/// Insulin (INS) gene coding sequence
|
||||
///
|
||||
/// Gene: INS (insulin)
|
||||
/// Accession: NM_000207.3 (RefSeq mRNA)
|
||||
/// Organism: Homo sapiens
|
||||
/// Location: Chromosome 11p15.5
|
||||
/// CDS: 60..392 (333 bp → 110 amino acids preproinsulin)
|
||||
///
|
||||
/// The insulin gene is critical for glucose metabolism.
|
||||
/// Mutations cause neonatal diabetes.
|
||||
pub const INS_CODING: &str = concat!(
|
||||
"ATGGCCCTGTGGATGCGCCTCCTGCCCCTGCTGGCGCTGCTGGCCCTCTGGGGACCTG",
|
||||
"ACCCAGCCGCAGCCTTTGTGAACCAACACCTGTGCGGCTCACACCTGGTGGAAGCTCT",
|
||||
"CTACCTAGTGTGCGGGGAACGAGGCTTCTTCTACACACCCAAGACCCGCCGGGAGGCA",
|
||||
"GAGGACCTGCAGGTGGGGCAGGTGGAGCTGGGCGGGGGCCCTGGTGCAGGCAGCCTGC",
|
||||
"AGCCCTTGGCCCTGGAGGGGTCCCTGCAGAAGCGTGGCATTGTGGAACAATGCTGTAC",
|
||||
"CAGCATCTGCTCCCTCTACCAGCTGGAGAACTACTGCAACTAG",
|
||||
);
|
||||
|
||||
/// Synthetic reference sequences for benchmarking
pub mod benchmark {
    /// 132-base repeat unit the synthetic references are tiled from.
    /// Exactly half of its bases (66 of 132) are G or C.
    const REPEAT_UNIT: &str = "ACGTGCATGCTAGCATGCATGCTAGCTAGCTAG\
                               GATCGATCGATCGATCGATCGATCGATCGATCG\
                               ATCGATCGATCGATCATGCATGCATGCATGCAT\
                               GCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAG";

    /// Repeat the ASCII string `unit` cyclically until exactly `len` characters are produced.
    fn tile(unit: &str, len: usize) -> String {
        unit.chars().cycle().take(len).collect()
    }

    /// 1000bp synthetic reference.
    ///
    /// The sequence is fully deterministic: a fixed 132-base A/C/G/T unit is
    /// repeated and cut off at 1000 bases. It is not sampled from a genome and
    /// involves no random number generator; its GC content is about 50%.
    pub fn chr1_reference_1kb() -> String {
        tile(REPEAT_UNIT, 1000)
    }

    /// 10kb reference for larger benchmarks.
    ///
    /// Ten back-to-back copies of [`chr1_reference_1kb`].
    pub fn reference_10kb() -> String {
        tile(&chr1_reference_1kb(), 10_000)
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::types::DnaSequence;

    /// The ATG start codon, in the shape returned by `leading_codon`.
    const ATG: [Option<crate::types::Nucleotide>; 3] = [
        Some(crate::types::Nucleotide::A),
        Some(crate::types::Nucleotide::T),
        Some(crate::types::Nucleotide::G),
    ];

    /// Collects the first three bases of `seq` so a start codon can be
    /// checked with a single comparison.
    fn leading_codon(seq: &DnaSequence) -> [Option<crate::types::Nucleotide>; 3] {
        [seq.get(0), seq.get(1), seq.get(2)]
    }

    #[test]
    fn test_hbb_sequence_valid() {
        let hbb = DnaSequence::from_str(HBB_CODING_SEQUENCE).unwrap();
        let bp = hbb.len();
        assert!(bp > 400, "HBB CDS should be >400bp, got {}", bp);
        // A coding sequence is expected to open with the ATG start codon.
        assert_eq!(leading_codon(&hbb), ATG);
    }

    #[test]
    fn test_tp53_sequence_valid() {
        let tp53 = DnaSequence::from_str(TP53_EXONS_5_8).unwrap();
        let bp = tp53.len();
        assert!(bp > 400, "TP53 exons 5-8 should be >400bp, got {}", bp);
    }

    #[test]
    fn test_brca1_fragment_valid() {
        let brca1 = DnaSequence::from_str(BRCA1_EXON11_FRAGMENT).unwrap();
        let bp = brca1.len();
        assert!(bp > 400, "BRCA1 fragment should be >400bp, got {}", bp);
    }

    #[test]
    fn test_cyp2d6_valid() {
        let cyp2d6 = DnaSequence::from_str(CYP2D6_CODING).unwrap();
        let bp = cyp2d6.len();
        assert!(bp > 400, "CYP2D6 should be >400bp, got {}", bp);
        // Should start with ATG.
        assert_eq!(leading_codon(&cyp2d6), ATG);
    }

    #[test]
    fn test_insulin_valid() {
        let ins = DnaSequence::from_str(INS_CODING).unwrap();
        let bp = ins.len();
        assert!(bp > 300, "INS should be >300bp, got {}", bp);
    }

    #[test]
    fn test_hbb_translates_to_hemoglobin() {
        let hbb = DnaSequence::from_str(HBB_CODING_SEQUENCE).unwrap();
        let text = hbb.to_string();
        let protein = crate::protein::translate_dna(text.as_bytes());
        // HBB protein starts with Met-Val-His-Leu-Thr-Pro-Glu-Glu-Lys;
        // the first four residues are checked here.
        for (idx, expected) in ['M', 'V', 'H', 'L'].iter().enumerate() {
            assert_eq!(protein[idx].to_char(), *expected);
        }
        assert!(protein.len() >= 100, "Should produce 100+ amino acids");
    }

    #[test]
    fn test_benchmark_reference_length() {
        assert_eq!(benchmark::chr1_reference_1kb().len(), 1000);
        assert_eq!(benchmark::reference_10kb().len(), 10_000);
    }
}
|
||||
1469
vendor/ruvector/examples/dna/src/rvdna.rs
vendored
Normal file
1469
vendor/ruvector/examples/dna/src/rvdna.rs
vendored
Normal file
File diff suppressed because it is too large
Load Diff
736
vendor/ruvector/examples/dna/src/types.rs
vendored
Normal file
736
vendor/ruvector/examples/dna/src/types.rs
vendored
Normal file
@@ -0,0 +1,736 @@
|
||||
//! Core types for DNA analysis
|
||||
|
||||
use crate::error::{DnaError, Result};
|
||||
use ruvector_core::{
|
||||
types::{DbOptions, DistanceMetric, HnswConfig},
|
||||
VectorDB,
|
||||
};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
|
||||
/// DNA nucleotide base
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
pub enum Nucleotide {
|
||||
/// Adenine
|
||||
A,
|
||||
/// Cytosine
|
||||
C,
|
||||
/// Guanine
|
||||
G,
|
||||
/// Thymine
|
||||
T,
|
||||
/// Unknown/ambiguous base
|
||||
N,
|
||||
}
|
||||
|
||||
impl Nucleotide {
|
||||
/// Get complement base (Watson-Crick pairing)
|
||||
pub fn complement(&self) -> Self {
|
||||
match self {
|
||||
Nucleotide::A => Nucleotide::T,
|
||||
Nucleotide::T => Nucleotide::A,
|
||||
Nucleotide::C => Nucleotide::G,
|
||||
Nucleotide::G => Nucleotide::C,
|
||||
Nucleotide::N => Nucleotide::N,
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert to u8 encoding (0-4)
|
||||
pub fn to_u8(&self) -> u8 {
|
||||
match self {
|
||||
Nucleotide::A => 0,
|
||||
Nucleotide::C => 1,
|
||||
Nucleotide::G => 2,
|
||||
Nucleotide::T => 3,
|
||||
Nucleotide::N => 4,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create from u8 encoding
|
||||
pub fn from_u8(val: u8) -> Result<Self> {
|
||||
match val {
|
||||
0 => Ok(Nucleotide::A),
|
||||
1 => Ok(Nucleotide::C),
|
||||
2 => Ok(Nucleotide::G),
|
||||
3 => Ok(Nucleotide::T),
|
||||
4 => Ok(Nucleotide::N),
|
||||
_ => Err(DnaError::InvalidSequence(format!(
|
||||
"Invalid nucleotide encoding: {}",
|
||||
val
|
||||
))),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for Nucleotide {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"{}",
|
||||
match self {
|
||||
Nucleotide::A => 'A',
|
||||
Nucleotide::C => 'C',
|
||||
Nucleotide::G => 'G',
|
||||
Nucleotide::T => 'T',
|
||||
Nucleotide::N => 'N',
|
||||
}
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// DNA sequence
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub struct DnaSequence {
|
||||
bases: Vec<Nucleotide>,
|
||||
}
|
||||
|
||||
impl DnaSequence {
|
||||
/// Create new DNA sequence from nucleotides
|
||||
pub fn new(bases: Vec<Nucleotide>) -> Self {
|
||||
Self { bases }
|
||||
}
|
||||
|
||||
/// Create from string (ACGTN)
|
||||
pub fn from_str(s: &str) -> Result<Self> {
|
||||
let bases: Result<Vec<_>> = s
|
||||
.chars()
|
||||
.map(|c| match c.to_ascii_uppercase() {
|
||||
'A' => Ok(Nucleotide::A),
|
||||
'C' => Ok(Nucleotide::C),
|
||||
'G' => Ok(Nucleotide::G),
|
||||
'T' => Ok(Nucleotide::T),
|
||||
'N' => Ok(Nucleotide::N),
|
||||
_ => Err(DnaError::InvalidSequence(format!(
|
||||
"Invalid character: {}",
|
||||
c
|
||||
))),
|
||||
})
|
||||
.collect();
|
||||
|
||||
let bases = bases?;
|
||||
if bases.is_empty() {
|
||||
return Err(DnaError::EmptySequence);
|
||||
}
|
||||
Ok(Self { bases })
|
||||
}
|
||||
|
||||
/// Get complement sequence
|
||||
pub fn complement(&self) -> Self {
|
||||
Self {
|
||||
bases: self.bases.iter().map(|b| b.complement()).collect(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get reverse complement
|
||||
pub fn reverse_complement(&self) -> Self {
|
||||
Self {
|
||||
bases: self.bases.iter().rev().map(|b| b.complement()).collect(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert to k-mer frequency vector for indexing
|
||||
///
|
||||
/// Uses rolling polynomial hash: O(1) per k-mer instead of O(k).
|
||||
pub fn to_kmer_vector(&self, k: usize, dims: usize) -> Result<Vec<f32>> {
|
||||
if k == 0 || k > 15 {
|
||||
return Err(DnaError::InvalidKmerSize(k));
|
||||
}
|
||||
if self.bases.len() < k {
|
||||
return Err(DnaError::InvalidSequence(
|
||||
"Sequence shorter than k-mer size".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
let mut vector = vec![0.0f32; dims];
|
||||
|
||||
// Precompute 5^k for rolling hash removal of leading nucleotide
|
||||
let base: u64 = 5;
|
||||
let pow_k = base.pow(k as u32 - 1);
|
||||
|
||||
// Compute initial hash for first k-mer
|
||||
let mut hash = self.bases[..k].iter().fold(0u64, |acc, &b| {
|
||||
acc.wrapping_mul(5).wrapping_add(b.to_u8() as u64)
|
||||
});
|
||||
vector[(hash as usize) % dims] += 1.0;
|
||||
|
||||
// Rolling hash: remove leading nucleotide, add trailing
|
||||
for i in 1..=(self.bases.len() - k) {
|
||||
let old = self.bases[i - 1].to_u8() as u64;
|
||||
let new = self.bases[i + k - 1].to_u8() as u64;
|
||||
hash = hash
|
||||
.wrapping_sub(old.wrapping_mul(pow_k))
|
||||
.wrapping_mul(5)
|
||||
.wrapping_add(new);
|
||||
vector[(hash as usize) % dims] += 1.0;
|
||||
}
|
||||
|
||||
// Normalize to unit vector
|
||||
let magnitude: f32 = vector.iter().map(|x| x * x).sum::<f32>().sqrt();
|
||||
if magnitude > 0.0 {
|
||||
let inv = 1.0 / magnitude;
|
||||
for v in &mut vector {
|
||||
*v *= inv;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(vector)
|
||||
}
|
||||
|
||||
/// Get length
|
||||
pub fn len(&self) -> usize {
|
||||
self.bases.len()
|
||||
}
|
||||
|
||||
/// Check if empty
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.bases.is_empty()
|
||||
}
|
||||
|
||||
/// Get a nucleotide at a specific index
|
||||
pub fn get(&self, index: usize) -> Option<Nucleotide> {
|
||||
self.bases.get(index).copied()
|
||||
}
|
||||
|
||||
/// Get bases
|
||||
pub fn bases(&self) -> &[Nucleotide] {
|
||||
&self.bases
|
||||
}
|
||||
|
||||
/// Encode as one-hot vectors (4 floats per nucleotide: A, C, G, T)
|
||||
pub fn encode_one_hot(&self) -> Vec<f32> {
|
||||
let mut result = vec![0.0f32; self.bases.len() * 4];
|
||||
for (i, base) in self.bases.iter().enumerate() {
|
||||
let offset = i * 4;
|
||||
match base {
|
||||
Nucleotide::A => result[offset] = 1.0,
|
||||
Nucleotide::C => result[offset + 1] = 1.0,
|
||||
Nucleotide::G => result[offset + 2] = 1.0,
|
||||
Nucleotide::T => result[offset + 3] = 1.0,
|
||||
Nucleotide::N => {} // all zeros for N
|
||||
}
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
/// Translate DNA sequence to protein using standard genetic code
|
||||
pub fn translate(&self) -> Result<ProteinSequence> {
|
||||
if self.bases.len() < 3 {
|
||||
return Err(DnaError::InvalidSequence(
|
||||
"Sequence too short for translation".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
let mut residues = Vec::new();
|
||||
for chunk in self.bases.chunks(3) {
|
||||
if chunk.len() < 3 {
|
||||
break;
|
||||
}
|
||||
let codon = (chunk[0], chunk[1], chunk[2]);
|
||||
let aa = match codon {
|
||||
(Nucleotide::A, Nucleotide::T, Nucleotide::G) => ProteinResidue::M, // Met (start)
|
||||
(Nucleotide::T, Nucleotide::G, Nucleotide::G) => ProteinResidue::W, // Trp
|
||||
(Nucleotide::T, Nucleotide::T, Nucleotide::T)
|
||||
| (Nucleotide::T, Nucleotide::T, Nucleotide::C) => ProteinResidue::F, // Phe
|
||||
(Nucleotide::T, Nucleotide::T, Nucleotide::A)
|
||||
| (Nucleotide::T, Nucleotide::T, Nucleotide::G)
|
||||
| (Nucleotide::C, Nucleotide::T, _) => ProteinResidue::L, // Leu
|
||||
(Nucleotide::A, Nucleotide::T, Nucleotide::T)
|
||||
| (Nucleotide::A, Nucleotide::T, Nucleotide::C)
|
||||
| (Nucleotide::A, Nucleotide::T, Nucleotide::A) => ProteinResidue::I, // Ile
|
||||
(Nucleotide::G, Nucleotide::T, _) => ProteinResidue::V, // Val
|
||||
(Nucleotide::T, Nucleotide::C, _)
|
||||
| (Nucleotide::A, Nucleotide::G, Nucleotide::T)
|
||||
| (Nucleotide::A, Nucleotide::G, Nucleotide::C) => ProteinResidue::S, // Ser
|
||||
(Nucleotide::C, Nucleotide::C, _) => ProteinResidue::P, // Pro
|
||||
(Nucleotide::A, Nucleotide::C, _) => ProteinResidue::T, // Thr
|
||||
(Nucleotide::G, Nucleotide::C, _) => ProteinResidue::A, // Ala
|
||||
(Nucleotide::T, Nucleotide::A, Nucleotide::T)
|
||||
| (Nucleotide::T, Nucleotide::A, Nucleotide::C) => ProteinResidue::Y, // Tyr
|
||||
(Nucleotide::C, Nucleotide::A, Nucleotide::T)
|
||||
| (Nucleotide::C, Nucleotide::A, Nucleotide::C) => ProteinResidue::H, // His
|
||||
(Nucleotide::C, Nucleotide::A, Nucleotide::A)
|
||||
| (Nucleotide::C, Nucleotide::A, Nucleotide::G) => ProteinResidue::Q, // Gln
|
||||
(Nucleotide::A, Nucleotide::A, Nucleotide::T)
|
||||
| (Nucleotide::A, Nucleotide::A, Nucleotide::C) => ProteinResidue::N, // Asn
|
||||
(Nucleotide::A, Nucleotide::A, Nucleotide::A)
|
||||
| (Nucleotide::A, Nucleotide::A, Nucleotide::G) => ProteinResidue::K, // Lys
|
||||
(Nucleotide::G, Nucleotide::A, Nucleotide::T)
|
||||
| (Nucleotide::G, Nucleotide::A, Nucleotide::C) => ProteinResidue::D, // Asp
|
||||
(Nucleotide::G, Nucleotide::A, Nucleotide::A)
|
||||
| (Nucleotide::G, Nucleotide::A, Nucleotide::G) => ProteinResidue::E, // Glu
|
||||
(Nucleotide::T, Nucleotide::G, Nucleotide::T)
|
||||
| (Nucleotide::T, Nucleotide::G, Nucleotide::C) => ProteinResidue::C, // Cys
|
||||
(Nucleotide::C, Nucleotide::G, _)
|
||||
| (Nucleotide::A, Nucleotide::G, Nucleotide::A)
|
||||
| (Nucleotide::A, Nucleotide::G, Nucleotide::G) => ProteinResidue::R, // Arg
|
||||
(Nucleotide::G, Nucleotide::G, _) => ProteinResidue::G, // Gly
|
||||
// Stop codons
|
||||
(Nucleotide::T, Nucleotide::A, Nucleotide::A)
|
||||
| (Nucleotide::T, Nucleotide::A, Nucleotide::G)
|
||||
| (Nucleotide::T, Nucleotide::G, Nucleotide::A) => break,
|
||||
_ => ProteinResidue::X, // Unknown
|
||||
};
|
||||
residues.push(aa);
|
||||
}
|
||||
|
||||
Ok(ProteinSequence::new(residues))
|
||||
}
|
||||
|
||||
/// Simple attention-based alignment against a reference sequence
|
||||
///
|
||||
/// Uses dot-product attention between one-hot encodings to find
|
||||
/// the best alignment position.
|
||||
pub fn align_with_attention(&self, reference: &DnaSequence) -> Result<AlignmentResult> {
|
||||
if self.is_empty() || reference.is_empty() {
|
||||
return Err(DnaError::AlignmentError(
|
||||
"Cannot align empty sequences".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
let query_len = self.len();
|
||||
let ref_len = reference.len();
|
||||
|
||||
// Compute dot-product attention scores at each offset
|
||||
let mut best_score = i32::MIN;
|
||||
let mut best_offset = 0;
|
||||
|
||||
for offset in 0..ref_len.saturating_sub(query_len / 2) {
|
||||
let mut score: i32 = 0;
|
||||
let overlap = query_len.min(ref_len - offset);
|
||||
|
||||
for i in 0..overlap {
|
||||
if self.bases[i] == reference.bases[offset + i] {
|
||||
score += 2; // match
|
||||
} else {
|
||||
score -= 1; // mismatch
|
||||
}
|
||||
}
|
||||
|
||||
if score > best_score {
|
||||
best_score = score;
|
||||
best_offset = offset;
|
||||
}
|
||||
}
|
||||
|
||||
// Build CIGAR string
|
||||
let overlap = query_len.min(ref_len.saturating_sub(best_offset));
|
||||
let mut cigar = Vec::new();
|
||||
let mut match_run = 0;
|
||||
|
||||
for i in 0..overlap {
|
||||
if self.bases[i] == reference.bases[best_offset + i] {
|
||||
match_run += 1;
|
||||
} else {
|
||||
if match_run > 0 {
|
||||
cigar.push(CigarOp::M(match_run));
|
||||
match_run = 0;
|
||||
}
|
||||
cigar.push(CigarOp::M(1)); // mismatch also represented as M
|
||||
}
|
||||
}
|
||||
if match_run > 0 {
|
||||
cigar.push(CigarOp::M(match_run));
|
||||
}
|
||||
|
||||
Ok(AlignmentResult {
|
||||
score: best_score,
|
||||
cigar,
|
||||
mapped_position: GenomicPosition {
|
||||
chromosome: 1,
|
||||
position: best_offset as u64,
|
||||
reference_allele: reference
|
||||
.bases
|
||||
.get(best_offset)
|
||||
.copied()
|
||||
.unwrap_or(Nucleotide::N),
|
||||
alternate_allele: None,
|
||||
},
|
||||
mapping_quality: QualityScore::new(
|
||||
((best_score.max(0) as f64 / overlap.max(1) as f64) * 60.0).min(60.0) as u8,
|
||||
)
|
||||
.unwrap_or(QualityScore(0)),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for DnaSequence {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
for base in &self.bases {
|
||||
write!(f, "{}", base)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Genomic position with variant information
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub struct GenomicPosition {
|
||||
/// Chromosome number (1-22, X=23, Y=24, M=25)
|
||||
pub chromosome: u8,
|
||||
/// Position on chromosome (0-based)
|
||||
pub position: u64,
|
||||
/// Reference allele
|
||||
pub reference_allele: Nucleotide,
|
||||
/// Alternate allele (if variant)
|
||||
pub alternate_allele: Option<Nucleotide>,
|
||||
}
|
||||
|
||||
/// Quality score (Phred scale)
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
|
||||
pub struct QualityScore(u8);
|
||||
|
||||
impl QualityScore {
|
||||
/// Create new quality score (0-93, Phred+33)
|
||||
pub fn new(score: u8) -> Result<Self> {
|
||||
if score > 93 {
|
||||
return Err(DnaError::InvalidQuality(score));
|
||||
}
|
||||
Ok(Self(score))
|
||||
}
|
||||
|
||||
/// Get raw score
|
||||
pub fn value(&self) -> u8 {
|
||||
self.0
|
||||
}
|
||||
|
||||
/// Convert to probability of error
|
||||
pub fn to_error_probability(&self) -> f64 {
|
||||
10_f64.powf(-(self.0 as f64) / 10.0)
|
||||
}
|
||||
}
|
||||
|
||||
/// Variant type
|
||||
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
|
||||
pub enum Variant {
|
||||
/// Single nucleotide polymorphism
|
||||
Snp {
|
||||
position: GenomicPosition,
|
||||
quality: QualityScore,
|
||||
},
|
||||
/// Insertion
|
||||
Insertion {
|
||||
position: GenomicPosition,
|
||||
inserted_bases: DnaSequence,
|
||||
quality: QualityScore,
|
||||
},
|
||||
/// Deletion
|
||||
Deletion {
|
||||
position: GenomicPosition,
|
||||
deleted_length: usize,
|
||||
quality: QualityScore,
|
||||
},
|
||||
/// Structural variant (large rearrangement)
|
||||
StructuralVariant {
|
||||
chromosome: u8,
|
||||
start: u64,
|
||||
end: u64,
|
||||
variant_type: String,
|
||||
quality: QualityScore,
|
||||
},
|
||||
}
|
||||
|
||||
/// CIGAR operation for alignment
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub enum CigarOp {
|
||||
/// Match/mismatch
|
||||
M(usize),
|
||||
/// Insertion to reference
|
||||
I(usize),
|
||||
/// Deletion from reference
|
||||
D(usize),
|
||||
/// Soft clipping (clipped sequence present in SEQ)
|
||||
S(usize),
|
||||
/// Hard clipping (clipped sequence NOT present in SEQ)
|
||||
H(usize),
|
||||
}
|
||||
|
||||
/// Alignment result
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct AlignmentResult {
|
||||
/// Alignment score
|
||||
pub score: i32,
|
||||
/// CIGAR string
|
||||
pub cigar: Vec<CigarOp>,
|
||||
/// Mapped position
|
||||
pub mapped_position: GenomicPosition,
|
||||
/// Mapping quality
|
||||
pub mapping_quality: QualityScore,
|
||||
}
|
||||
|
||||
/// Protein residue (amino acid)
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
pub enum ProteinResidue {
|
||||
A,
|
||||
C,
|
||||
D,
|
||||
E,
|
||||
F,
|
||||
G,
|
||||
H,
|
||||
I,
|
||||
K,
|
||||
L,
|
||||
M,
|
||||
N,
|
||||
P,
|
||||
Q,
|
||||
R,
|
||||
S,
|
||||
T,
|
||||
V,
|
||||
W,
|
||||
Y,
|
||||
/// Stop codon or unknown
|
||||
X,
|
||||
}
|
||||
|
||||
impl ProteinResidue {
|
||||
/// Get single-letter code
|
||||
pub fn to_char(&self) -> char {
|
||||
match self {
|
||||
ProteinResidue::A => 'A',
|
||||
ProteinResidue::C => 'C',
|
||||
ProteinResidue::D => 'D',
|
||||
ProteinResidue::E => 'E',
|
||||
ProteinResidue::F => 'F',
|
||||
ProteinResidue::G => 'G',
|
||||
ProteinResidue::H => 'H',
|
||||
ProteinResidue::I => 'I',
|
||||
ProteinResidue::K => 'K',
|
||||
ProteinResidue::L => 'L',
|
||||
ProteinResidue::M => 'M',
|
||||
ProteinResidue::N => 'N',
|
||||
ProteinResidue::P => 'P',
|
||||
ProteinResidue::Q => 'Q',
|
||||
ProteinResidue::R => 'R',
|
||||
ProteinResidue::S => 'S',
|
||||
ProteinResidue::T => 'T',
|
||||
ProteinResidue::V => 'V',
|
||||
ProteinResidue::W => 'W',
|
||||
ProteinResidue::Y => 'Y',
|
||||
ProteinResidue::X => 'X',
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Protein sequence
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub struct ProteinSequence {
|
||||
residues: Vec<ProteinResidue>,
|
||||
}
|
||||
|
||||
impl ProteinSequence {
|
||||
/// Create new protein sequence
|
||||
pub fn new(residues: Vec<ProteinResidue>) -> Self {
|
||||
Self { residues }
|
||||
}
|
||||
|
||||
/// Get residues
|
||||
pub fn residues(&self) -> &[ProteinResidue] {
|
||||
&self.residues
|
||||
}
|
||||
|
||||
/// Get length
|
||||
pub fn len(&self) -> usize {
|
||||
self.residues.len()
|
||||
}
|
||||
|
||||
/// Check if empty
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.residues.is_empty()
|
||||
}
|
||||
|
||||
/// Build a simplified contact graph based on sequence distance
|
||||
///
|
||||
/// Residues within `distance_threshold` positions of each other
|
||||
/// are considered potential contacts (simplified from 3D distance).
|
||||
pub fn build_contact_graph(&self, distance_threshold: f32) -> Result<ContactGraph> {
|
||||
if self.residues.is_empty() {
|
||||
return Err(DnaError::InvalidSequence(
|
||||
"Cannot build contact graph for empty protein".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
let n = self.residues.len();
|
||||
let threshold = distance_threshold as usize;
|
||||
let mut edges = Vec::new();
|
||||
|
||||
for i in 0..n {
|
||||
for j in (i + 4)..n {
|
||||
// Simplified: sequence separation as proxy for spatial distance
|
||||
// In real structure prediction, this would use 3D coordinates
|
||||
let seq_dist = j - i;
|
||||
if seq_dist <= threshold {
|
||||
// Closer in sequence = higher contact probability
|
||||
let contact_prob = 1.0 / (1.0 + (seq_dist as f32 - 4.0) / threshold as f32);
|
||||
edges.push((i, j, contact_prob));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(ContactGraph {
|
||||
num_residues: n,
|
||||
distance_threshold,
|
||||
edges,
|
||||
})
|
||||
}
|
||||
|
||||
/// Predict contacts from a contact graph using residue properties
|
||||
///
|
||||
/// Returns (residue_i, residue_j, confidence_score) tuples
|
||||
pub fn predict_contacts(&self, graph: &ContactGraph) -> Result<Vec<(usize, usize, f32)>> {
|
||||
let mut predictions: Vec<(usize, usize, f32)> = graph
|
||||
.edges
|
||||
.iter()
|
||||
.map(|&(i, j, base_score)| {
|
||||
// Boost score for hydrophobic-hydrophobic contacts (protein core)
|
||||
let boost = if i < self.residues.len() && j < self.residues.len() {
|
||||
let ri = &self.residues[i];
|
||||
let rj = &self.residues[j];
|
||||
// Hydrophobic residues tend to be in protein core
|
||||
let hydrophobic = |r: &ProteinResidue| {
|
||||
matches!(
|
||||
r,
|
||||
ProteinResidue::A
|
||||
| ProteinResidue::V
|
||||
| ProteinResidue::L
|
||||
| ProteinResidue::I
|
||||
| ProteinResidue::F
|
||||
| ProteinResidue::W
|
||||
| ProteinResidue::M
|
||||
)
|
||||
};
|
||||
if hydrophobic(ri) && hydrophobic(rj) {
|
||||
1.5
|
||||
} else {
|
||||
1.0
|
||||
}
|
||||
} else {
|
||||
1.0
|
||||
};
|
||||
(i, j, (base_score * boost).min(1.0))
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Sort by confidence descending
|
||||
predictions.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap_or(std::cmp::Ordering::Equal));
|
||||
|
||||
Ok(predictions)
|
||||
}
|
||||
}
|
||||
|
||||
/// Residue contact graph used for protein structure analysis.
#[derive(Debug, Clone)]
pub struct ContactGraph {
    /// Number of residues in the protein the graph was built from
    pub num_residues: usize,
    /// Threshold that was used to decide which pairs are linked
    pub distance_threshold: f32,
    /// Edges as `(residue_i, residue_j, weight)`; graphs produced by
    /// `ProteinSequence::build_contact_graph` store a contact probability
    /// as the weight
    pub edges: Vec<(usize, usize, f32)>,
}
|
||||
|
||||
/// K-mer index using RuVector HNSW
|
||||
pub struct KmerIndex {
|
||||
db: VectorDB,
|
||||
k: usize,
|
||||
dims: usize,
|
||||
}
|
||||
|
||||
impl KmerIndex {
|
||||
/// Create new k-mer index
|
||||
pub fn new(k: usize, dims: usize, storage_path: &str) -> Result<Self> {
|
||||
let options = DbOptions {
|
||||
dimensions: dims,
|
||||
distance_metric: DistanceMetric::Cosine,
|
||||
storage_path: storage_path.to_string(),
|
||||
hnsw_config: Some(HnswConfig {
|
||||
m: 16,
|
||||
ef_construction: 200,
|
||||
ef_search: 100,
|
||||
max_elements: 1_000_000,
|
||||
}),
|
||||
quantization: None,
|
||||
};
|
||||
|
||||
let db = VectorDB::new(options)?;
|
||||
Ok(Self { db, k, dims })
|
||||
}
|
||||
|
||||
/// Get underlying VectorDB
|
||||
pub fn db(&self) -> &VectorDB {
|
||||
&self.db
|
||||
}
|
||||
|
||||
/// Get k-mer size
|
||||
pub fn k(&self) -> usize {
|
||||
self.k
|
||||
}
|
||||
|
||||
/// Get dimensions
|
||||
pub fn dims(&self) -> usize {
|
||||
self.dims
|
||||
}
|
||||
}
|
||||
|
||||
/// Analysis configuration
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct AnalysisConfig {
|
||||
/// K-mer size for indexing
|
||||
pub kmer_size: usize,
|
||||
/// Vector dimensions
|
||||
pub vector_dims: usize,
|
||||
/// Minimum quality score for variants
|
||||
pub min_quality: u8,
|
||||
/// Alignment match score
|
||||
pub match_score: i32,
|
||||
/// Alignment mismatch penalty
|
||||
pub mismatch_penalty: i32,
|
||||
/// Alignment gap open penalty
|
||||
pub gap_open_penalty: i32,
|
||||
/// Alignment gap extend penalty
|
||||
pub gap_extend_penalty: i32,
|
||||
/// Additional pipeline parameters
|
||||
pub parameters: HashMap<String, serde_json::Value>,
|
||||
}
|
||||
|
||||
impl Default for AnalysisConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
kmer_size: 11,
|
||||
vector_dims: 512,
|
||||
min_quality: 20,
|
||||
match_score: 2,
|
||||
mismatch_penalty: -1,
|
||||
gap_open_penalty: -3,
|
||||
gap_extend_penalty: -1,
|
||||
parameters: HashMap::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_nucleotide_complement() {
        let pairs = [
            (Nucleotide::A, Nucleotide::T),
            (Nucleotide::G, Nucleotide::C),
        ];
        for (base, partner) in pairs.iter() {
            assert_eq!(base.complement(), *partner);
        }
    }

    #[test]
    fn test_dna_sequence() {
        let parsed = DnaSequence::from_str("ACGT").unwrap();
        assert_eq!(parsed.len(), 4);
        assert_eq!(parsed.to_string(), "ACGT");
    }

    #[test]
    fn test_reverse_complement() {
        // ACGT is its own reverse complement.
        let rendered = DnaSequence::from_str("ACGT")
            .unwrap()
            .reverse_complement()
            .to_string();
        assert_eq!(rendered, "ACGT");
    }
}
|
||||
319
vendor/ruvector/examples/dna/src/variant.rs
vendored
Normal file
319
vendor/ruvector/examples/dna/src/variant.rs
vendored
Normal file
@@ -0,0 +1,319 @@
|
||||
//! Variant calling module for DNA analysis
|
||||
//!
|
||||
//! Provides SNP and indel calling from pileup data.
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
|
||||
/// All read observations stacked at one reference position.
///
/// `bases` and `qualities` are parallel: entry `i` of each describes the
/// same read.
#[derive(Debug, Clone)]
pub struct PileupColumn {
    /// Base observed in each aligned read
    pub bases: Vec<u8>,
    /// Quality score of each observed base
    pub qualities: Vec<u8>,
    /// Genomic position of the column
    pub position: u64,
    /// Chromosome number
    pub chromosome: u8,
}
|
||||
|
||||
/// Genotype classification
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub enum Genotype {
|
||||
/// Homozygous reference (0/0)
|
||||
HomRef,
|
||||
/// Heterozygous (0/1)
|
||||
Het,
|
||||
/// Homozygous alternate (1/1)
|
||||
HomAlt,
|
||||
}
|
||||
|
||||
/// Variant filter status
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub enum FilterStatus {
|
||||
/// Passed all filters
|
||||
Pass,
|
||||
/// Failed quality filter
|
||||
LowQuality,
|
||||
/// Failed depth filter
|
||||
LowDepth,
|
||||
}
|
||||
|
||||
/// Called variant
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct VariantCall {
|
||||
/// Chromosome number
|
||||
pub chromosome: u8,
|
||||
/// Genomic position
|
||||
pub position: u64,
|
||||
/// Reference allele
|
||||
pub ref_allele: u8,
|
||||
/// Alternate allele
|
||||
pub alt_allele: u8,
|
||||
/// Variant quality (Phred-scaled)
|
||||
pub quality: f64,
|
||||
/// Genotype call
|
||||
pub genotype: Genotype,
|
||||
/// Total read depth
|
||||
pub depth: usize,
|
||||
/// Alternate allele depth
|
||||
pub allele_depth: usize,
|
||||
/// Filter status
|
||||
pub filter_status: FilterStatus,
|
||||
}
|
||||
|
||||
/// Thresholds that steer the variant caller.
#[derive(Debug, Clone)]
pub struct VariantCallerConfig {
    /// Bases below this quality are ignored
    pub min_quality: u8,
    /// Minimum read depth required to attempt a call
    pub min_depth: usize,
    /// Smallest alternate-allele frequency reported as heterozygous
    pub het_threshold: f64,
    /// Smallest alternate-allele frequency reported as homozygous alternate
    pub hom_alt_threshold: f64,
}

impl Default for VariantCallerConfig {
    /// Defaults: quality >= 20, depth >= 5, heterozygous from 20%
    /// alternate reads, homozygous alternate from 80%.
    fn default() -> Self {
        VariantCallerConfig {
            // Evidence requirements
            min_quality: 20,
            min_depth: 5,
            // Genotype frequency cut-offs
            het_threshold: 0.2,
            hom_alt_threshold: 0.8,
        }
    }
}
|
||||
|
||||
/// Variant caller that processes pileup data to call SNPs
|
||||
pub struct VariantCaller {
|
||||
config: VariantCallerConfig,
|
||||
}
|
||||
|
||||
impl VariantCaller {
|
||||
/// Create a new variant caller with the given configuration
|
||||
pub fn new(config: VariantCallerConfig) -> Self {
|
||||
Self { config }
|
||||
}
|
||||
|
||||
/// Call a SNP at a single pileup position
|
||||
///
|
||||
/// Returns `Some(VariantCall)` if a variant is detected, `None` if all reads
|
||||
/// match the reference or depth is insufficient.
|
||||
pub fn call_snp(&self, pileup: &PileupColumn, reference_base: u8) -> Option<VariantCall> {
|
||||
let ref_base = reference_base.to_ascii_uppercase();
|
||||
|
||||
// Count alleles (only high-quality bases)
|
||||
let mut allele_counts: HashMap<u8, usize> = HashMap::new();
|
||||
for (i, &base) in pileup.bases.iter().enumerate() {
|
||||
let qual = pileup.qualities.get(i).copied().unwrap_or(0);
|
||||
if qual >= self.config.min_quality {
|
||||
*allele_counts.entry(base.to_ascii_uppercase()).or_insert(0) += 1;
|
||||
}
|
||||
}
|
||||
|
||||
let total_depth: usize = allele_counts.values().sum();
|
||||
if total_depth < self.config.min_depth {
|
||||
return None;
|
||||
}
|
||||
|
||||
// Find the most common non-reference allele
|
||||
let mut best_alt: Option<(u8, usize)> = None;
|
||||
for (&allele, &count) in &allele_counts {
|
||||
if allele != ref_base {
|
||||
if best_alt.map_or(true, |(_, best_count)| count > best_count) {
|
||||
best_alt = Some((allele, count));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let (alt_allele, alt_count) = best_alt?;
|
||||
let alt_freq = alt_count as f64 / total_depth as f64;
|
||||
|
||||
if alt_freq < self.config.het_threshold {
|
||||
return None;
|
||||
}
|
||||
|
||||
let genotype = if alt_freq >= self.config.hom_alt_threshold {
|
||||
Genotype::HomAlt
|
||||
} else {
|
||||
Genotype::Het
|
||||
};
|
||||
|
||||
// Phred-scaled quality estimate
|
||||
let quality = -10.0 * (1.0 - alt_freq).max(1e-10).log10() * (alt_count as f64);
|
||||
|
||||
Some(VariantCall {
|
||||
chromosome: pileup.chromosome,
|
||||
position: pileup.position,
|
||||
ref_allele: ref_base,
|
||||
alt_allele,
|
||||
quality,
|
||||
genotype,
|
||||
depth: total_depth,
|
||||
allele_depth: alt_count,
|
||||
filter_status: FilterStatus::Pass,
|
||||
})
|
||||
}
|
||||
|
||||
/// Detect insertions/deletions from pileup data
|
||||
///
|
||||
/// Looks for gaps (represented as b'-') in the pileup bases that indicate
|
||||
/// indels relative to the reference.
|
||||
pub fn call_indel(
|
||||
&self,
|
||||
pileup: &PileupColumn,
|
||||
reference_base: u8,
|
||||
next_ref_bases: &[u8],
|
||||
) -> Option<VariantCall> {
|
||||
let ref_base = reference_base.to_ascii_uppercase();
|
||||
let mut del_count = 0usize;
|
||||
let mut ins_count = 0usize;
|
||||
|
||||
for (i, &base) in pileup.bases.iter().enumerate() {
|
||||
let qual = pileup.qualities.get(i).copied().unwrap_or(0);
|
||||
if qual < self.config.min_quality {
|
||||
continue;
|
||||
}
|
||||
if base == b'-' || base == b'*' {
|
||||
del_count += 1;
|
||||
} else if base == b'+' {
|
||||
ins_count += 1;
|
||||
}
|
||||
}
|
||||
|
||||
let total = pileup.bases.len();
|
||||
if total < self.config.min_depth {
|
||||
return None;
|
||||
}
|
||||
|
||||
// Check for deletion
|
||||
if del_count > 0 {
|
||||
let del_freq = del_count as f64 / total as f64;
|
||||
if del_freq >= self.config.het_threshold {
|
||||
let genotype = if del_freq >= self.config.hom_alt_threshold {
|
||||
Genotype::HomAlt
|
||||
} else {
|
||||
Genotype::Het
|
||||
};
|
||||
let quality = -10.0 * (1.0 - del_freq).max(1e-10).log10() * (del_count as f64);
|
||||
return Some(VariantCall {
|
||||
chromosome: pileup.chromosome,
|
||||
position: pileup.position,
|
||||
ref_allele: ref_base,
|
||||
alt_allele: b'-',
|
||||
quality,
|
||||
genotype,
|
||||
depth: total,
|
||||
allele_depth: del_count,
|
||||
filter_status: FilterStatus::Pass,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Check for insertion
|
||||
if ins_count > 0 {
|
||||
let ins_freq = ins_count as f64 / total as f64;
|
||||
if ins_freq >= self.config.het_threshold {
|
||||
let genotype = if ins_freq >= self.config.hom_alt_threshold {
|
||||
Genotype::HomAlt
|
||||
} else {
|
||||
Genotype::Het
|
||||
};
|
||||
let quality = -10.0 * (1.0 - ins_freq).max(1e-10).log10() * (ins_count as f64);
|
||||
return Some(VariantCall {
|
||||
chromosome: pileup.chromosome,
|
||||
position: pileup.position,
|
||||
ref_allele: ref_base,
|
||||
alt_allele: b'+',
|
||||
quality,
|
||||
genotype,
|
||||
depth: total,
|
||||
allele_depth: ins_count,
|
||||
filter_status: FilterStatus::Pass,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
/// Apply quality and depth filters to a list of variant calls
|
||||
pub fn filter_variants(&self, calls: &mut [VariantCall]) {
|
||||
for call in calls.iter_mut() {
|
||||
if call.quality < self.config.min_quality as f64 {
|
||||
call.filter_status = FilterStatus::LowQuality;
|
||||
} else if call.depth < self.config.min_depth {
|
||||
call.filter_status = FilterStatus::LowDepth;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Generate VCF-formatted output for variant calls
|
||||
pub fn to_vcf(&self, calls: &[VariantCall], sample_name: &str) -> String {
|
||||
let mut vcf = String::new();
|
||||
vcf.push_str("##fileformat=VCFv4.3\n");
|
||||
vcf.push_str(&format!("##source=RuVectorDNA\n"));
|
||||
vcf.push_str("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t");
|
||||
vcf.push_str(sample_name);
|
||||
vcf.push('\n');
|
||||
|
||||
for call in calls {
|
||||
let filter = match call.filter_status {
|
||||
FilterStatus::Pass => "PASS",
|
||||
FilterStatus::LowQuality => "LowQual",
|
||||
FilterStatus::LowDepth => "LowDepth",
|
||||
};
|
||||
let gt = match call.genotype {
|
||||
Genotype::HomRef => "0/0",
|
||||
Genotype::Het => "0/1",
|
||||
Genotype::HomAlt => "1/1",
|
||||
};
|
||||
vcf.push_str(&format!(
|
||||
"chr{}\t{}\t.\t{}\t{}\t{:.1}\t{}\tDP={};AF={:.3}\tGT:DP:AD\t{}:{}:{}\n",
|
||||
call.chromosome,
|
||||
call.position,
|
||||
call.ref_allele as char,
|
||||
call.alt_allele as char,
|
||||
call.quality,
|
||||
filter,
|
||||
call.depth,
|
||||
call.allele_depth as f64 / call.depth as f64,
|
||||
gt,
|
||||
call.depth,
|
||||
call.allele_depth,
|
||||
));
|
||||
}
|
||||
|
||||
vcf
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_variant_caller_creation() {
        let _caller = VariantCaller::new(VariantCallerConfig::default());
    }

    #[test]
    fn test_snp_calling() {
        // Fifteen high-quality G reads over a reference A.
        let reads = 15;
        let pileup = PileupColumn {
            bases: vec![b'G'; reads],
            qualities: vec![40; reads],
            position: 1000,
            chromosome: 1,
        };
        let caller = VariantCaller::new(VariantCallerConfig::default());

        let genotype = caller.call_snp(&pileup, b'A').map(|call| call.genotype);
        assert_eq!(genotype, Some(Genotype::HomAlt));
    }
}
|
||||
Reference in New Issue
Block a user