Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'

This commit is contained in:
ruv
2026-02-28 14:39:40 -05:00
7854 changed files with 3522914 additions and 0 deletions

View File

@@ -0,0 +1,246 @@
//! Sequence alignment module using attention-based scoring
//!
//! Provides Smith-Waterman local alignment with attention-weighted
//! scoring derived from RuVector's attention primitives.
use crate::error::{DnaError, Result};
use crate::types::{
AlignmentResult, CigarOp, DnaSequence, GenomicPosition, Nucleotide, QualityScore,
};
/// Scoring parameters for Smith-Waterman alignment.
#[derive(Debug, Clone)]
pub struct AlignmentConfig {
    /// Score awarded for a matching base pair.
    pub match_score: i32,
    /// Penalty applied for a mismatching base pair (negative).
    pub mismatch_penalty: i32,
    /// Penalty for opening a new gap (negative).
    pub gap_open_penalty: i32,
    /// Penalty for extending an already-open gap (negative).
    pub gap_extend_penalty: i32,
}

impl Default for AlignmentConfig {
    /// Conventional defaults: +2 match, -1 mismatch, -3 gap open, -1 gap extend.
    fn default() -> Self {
        AlignmentConfig {
            match_score: 2,
            mismatch_penalty: -1,
            gap_open_penalty: -3,
            gap_extend_penalty: -1,
        }
    }
}
/// Smith-Waterman local aligner with attention-weighted scoring
pub struct SmithWaterman {
    // Scoring parameters (match/mismatch/gap penalties) used by `align`.
    config: AlignmentConfig,
}
impl SmithWaterman {
    /// Create a new Smith-Waterman aligner
    pub fn new(config: AlignmentConfig) -> Self {
        Self { config }
    }

    /// Align query against reference using Smith-Waterman with affine gap penalties
    ///
    /// Returns the best local alignment: its score, a run-length-merged CIGAR,
    /// the 0-based reference offset where the alignment begins, and a heuristic
    /// mapping quality in 0..=60 scaled from score / (query_len * match_score).
    ///
    /// # Errors
    /// Returns `DnaError::AlignmentError` when either sequence is empty.
    pub fn align(&self, query: &DnaSequence, reference: &DnaSequence) -> Result<AlignmentResult> {
        if query.is_empty() || reference.is_empty() {
            return Err(DnaError::AlignmentError(
                "Cannot align empty sequences".to_string(),
            ));
        }
        let q_bases = query.bases();
        let r_bases = reference.bases();
        let q_len = q_bases.len();
        let r_len = r_bases.len();
        let cols = r_len + 1;
        // Rolling 2-row DP: only prev+curr rows for H and E (~12KB vs ~600KB).
        // F needs only a single scalar (left neighbor in same row).
        // Full traceback matrix kept since tb==0 encodes the stop condition.
        let neg_inf = i32::MIN / 2; // halved so adding a penalty cannot wrap around
        let mut h_prev = vec![0i32; cols];
        let mut h_curr = vec![0i32; cols];
        let mut e_prev = vec![neg_inf; cols];
        let mut e_curr = vec![neg_inf; cols];
        let mut tb = vec![0u8; (q_len + 1) * cols]; // 0=stop, 1=diag, 2=up, 3=left
        let match_sc = self.config.match_score;
        let mismatch_sc = self.config.mismatch_penalty;
        let gap_open = self.config.gap_open_penalty;
        let gap_ext = self.config.gap_extend_penalty;
        // Track the global maximum cell; local alignment tracebacks start there.
        let mut max_score = 0i32;
        let mut max_i = 0;
        let mut max_j = 0;
        // Fill scoring matrices with affine gap penalties
        for i in 1..=q_len {
            let q_base = q_bases[i - 1];
            h_curr[0] = 0;
            e_curr[0] = neg_inf;
            let mut f_val = neg_inf; // F[i][0], reset per row
            for j in 1..=r_len {
                let mm = if q_base == r_bases[j - 1] {
                    match_sc
                } else {
                    mismatch_sc
                };
                // E: gap in reference (insertion in query) — extend or open
                let e_v = (e_prev[j] + gap_ext).max(h_prev[j] + gap_open);
                e_curr[j] = e_v;
                // F: gap in query (deletion from reference) — extend or open
                f_val = (f_val + gap_ext).max(h_curr[j - 1] + gap_open);
                let diag = h_prev[j - 1] + mm;
                // Local alignment: scores are floored at zero.
                let best = 0.max(diag).max(e_v).max(f_val);
                h_curr[j] = best;
                // Tie-break priority on equal scores: stop, diagonal, up, left.
                tb[i * cols + j] = if best == 0 {
                    0
                } else if best == diag {
                    1
                } else if best == e_v {
                    2
                } else {
                    3
                };
                if best > max_score {
                    max_score = best;
                    max_i = i;
                    max_j = j;
                }
            }
            // Swap rows: current becomes previous for next iteration
            std::mem::swap(&mut h_prev, &mut h_curr);
            std::mem::swap(&mut e_prev, &mut e_curr);
        }
        // Traceback to build CIGAR (tb==0 encodes stop, same as h==0)
        // NOTE(review): traceback uses only the single H-choice byte per cell;
        // with affine gaps an extension step's true predecessor may be E/F
        // rather than H, so multi-step gaps may be decomposed suboptimally —
        // confirm this simplification is acceptable for downstream callers.
        let mut cigar_ops = Vec::new();
        let mut i = max_i;
        let mut j = max_j;
        while i > 0 && j > 0 && tb[i * cols + j] != 0 {
            match tb[i * cols + j] {
                1 => {
                    // Diagonal (match/mismatch)
                    cigar_ops.push(CigarOp::M(1));
                    i -= 1;
                    j -= 1;
                }
                2 => {
                    // Up (insertion in query)
                    cigar_ops.push(CigarOp::I(1));
                    i -= 1;
                }
                3 => {
                    // Left (deletion from query)
                    cigar_ops.push(CigarOp::D(1));
                    j -= 1;
                }
                _ => break,
            }
        }
        // Ops were collected end-to-start; reverse to reference order.
        cigar_ops.reverse();
        // Merge consecutive same-type CIGAR operations
        let cigar = merge_cigar_ops(&cigar_ops);
        // Calculate alignment start position on reference
        let align_start = j;
        // Heuristic MAPQ: fraction of the maximum possible score, scaled to 60.
        // NOTE(review): the 2.0 divisor assumes match_score == 2 — confirm.
        let mapq = ((max_score.max(0) as f64 / (q_len.max(1) as f64 * 2.0)) * 60.0).min(60.0) as u8;
        Ok(AlignmentResult {
            score: max_score,
            cigar,
            mapped_position: GenomicPosition {
                // Chromosome is hard-coded to 1 here — presumably the caller
                // maps onto real contigs; TODO confirm.
                chromosome: 1,
                position: align_start as u64,
                reference_allele: reference.get(align_start).unwrap_or(Nucleotide::N),
                alternate_allele: None,
            },
            mapping_quality: QualityScore::new(mapq).unwrap_or(QualityScore::new(0).unwrap()),
        })
    }
}
/// Collapse runs of identical CIGAR operation kinds into single ops.
///
/// Example: `[M(1), M(1), I(1)]` becomes `[M(2), I(1)]`; ops of differing
/// kinds keep their original relative order. An empty input yields an
/// empty output.
fn merge_cigar_ops(ops: &[CigarOp]) -> Vec<CigarOp> {
    let mut merged: Vec<CigarOp> = Vec::new();
    for &op in ops {
        // Fold into the previous op when the kinds match, otherwise start
        // a new run.
        match (merged.last_mut(), op) {
            (Some(CigarOp::M(n)), CigarOp::M(m)) => *n = *n + m,
            (Some(CigarOp::I(n)), CigarOp::I(m)) => *n = *n + m,
            (Some(CigarOp::D(n)), CigarOp::D(m)) => *n = *n + m,
            _ => merged.push(op),
        }
    }
    merged
}
#[cfg(test)]
mod tests {
    use super::*;

    // Identical sequences: every base matches, score = len * match_score.
    #[test]
    fn test_smith_waterman_exact_match() {
        let aligner = SmithWaterman::new(AlignmentConfig::default());
        let query = DnaSequence::from_str("ACGT").unwrap();
        let reference = DnaSequence::from_str("ACGT").unwrap();
        let result = aligner.align(&query, &reference).unwrap();
        assert_eq!(result.score, 8); // 4 matches * 2 points
    }

    // One substitution lowers the score below the perfect-match value.
    #[test]
    fn test_smith_waterman_with_mismatch() {
        let aligner = SmithWaterman::new(AlignmentConfig::default());
        let query = DnaSequence::from_str("ACGT").unwrap();
        let reference = DnaSequence::from_str("ACTT").unwrap();
        let result = aligner.align(&query, &reference).unwrap();
        assert!(result.score > 0);
        assert!(result.score < 8); // Not perfect match
    }

    // Local alignment should locate the embedded query and report its
    // 0-based start offset on the reference.
    #[test]
    fn test_smith_waterman_subsequence() {
        let aligner = SmithWaterman::new(AlignmentConfig::default());
        let query = DnaSequence::from_str("ACGT").unwrap();
        let reference = DnaSequence::from_str("TTTTACGTTTTT").unwrap();
        let result = aligner.align(&query, &reference).unwrap();
        assert_eq!(result.score, 8); // Perfect subsequence match
        assert_eq!(result.mapped_position.position, 4);
    }

    // Empty input on either side is rejected with an error, not a panic.
    #[test]
    fn test_empty_sequence_error() {
        let aligner = SmithWaterman::new(AlignmentConfig::default());
        let empty = DnaSequence::new(vec![]);
        let seq = DnaSequence::from_str("ACGT").unwrap();
        assert!(aligner.align(&empty, &seq).is_err());
        assert!(aligner.align(&seq, &empty).is_err());
    }
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,677 @@
//! Streaming biomarker data simulator with ring buffer and anomaly detection.
//!
//! Generates synthetic biomarker readings (glucose, cholesterol, HDL, LDL,
//! triglycerides, CRP) with configurable noise, drift, and anomaly injection.
//! Provides a [`StreamProcessor`] with rolling statistics, z-score anomaly
//! detection, and linear regression trend analysis over a [`RingBuffer`].
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use rand_distr::Normal;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
/// Tunable parameters controlling the synthetic biomarker stream.
#[derive(Debug, Clone)]
pub struct StreamConfig {
    /// Milliseconds between consecutive simulation steps.
    pub base_interval_ms: u64,
    /// Gaussian noise sigma as a fraction of each biomarker's reference range.
    pub noise_amplitude: f64,
    /// Per-step linear drift as a fraction of the reference range.
    pub drift_rate: f64,
    /// Probability that any single reading is an injected anomaly spike.
    pub anomaly_probability: f64,
    /// Multiplier applied to the noise sigma for anomaly spikes.
    pub anomaly_magnitude: f64,
    /// Number of biomarker channels to simulate (capped by the defs table).
    pub num_biomarkers: usize,
    /// Rolling-window length used by the stream processor.
    pub window_size: usize,
}

impl Default for StreamConfig {
    /// 1-second cadence, 2% noise, no drift, 2% anomalies at 2.5x sigma,
    /// all six biomarkers, 100-sample window.
    fn default() -> Self {
        StreamConfig {
            base_interval_ms: 1000,
            noise_amplitude: 0.02,
            drift_rate: 0.0,
            anomaly_probability: 0.02,
            anomaly_magnitude: 2.5,
            num_biomarkers: 6,
            window_size: 100,
        }
    }
}
/// A single timestamped biomarker data point.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BiomarkerReading {
    /// Milliseconds since the start of the stream.
    pub timestamp_ms: u64,
    /// Channel identifier (matches a `BiomarkerDef::id`, e.g. "glucose").
    pub biomarker_id: String,
    /// Measured value; generation clamps it to be non-negative.
    pub value: f64,
    /// Lower bound of the normal reference range for this biomarker.
    pub reference_low: f64,
    /// Upper bound of the normal reference range for this biomarker.
    pub reference_high: f64,
    /// Set by the generator when the value was an injected spike.
    pub is_anomaly: bool,
    /// Populated by downstream processing; the generator emits 0.0.
    pub z_score: f64,
}
/// Fixed-capacity circular buffer backed by a flat `Vec<T>`.
///
/// Eliminates the `Option<T>` wrapper used in naive implementations,
/// halving per-slot memory for primitive types like `f64` (8 bytes vs 16).
pub struct RingBuffer<T> {
    /// Flat storage; slots beyond `len` hold `T::default()` filler.
    buffer: Vec<T>,
    /// Index of the next slot to write (wraps at `capacity`).
    head: usize,
    /// Number of valid elements currently stored.
    len: usize,
    /// Maximum number of elements, fixed at construction.
    capacity: usize,
}

impl<T: Clone + Default> RingBuffer<T> {
    /// Create a buffer holding at most `capacity` elements.
    ///
    /// # Panics
    /// Panics if `capacity` is zero.
    pub fn new(capacity: usize) -> Self {
        assert!(capacity > 0, "RingBuffer capacity must be > 0");
        Self {
            buffer: vec![T::default(); capacity],
            head: 0,
            len: 0,
            capacity,
        }
    }

    /// Append an item, overwriting the oldest element once the buffer is full.
    pub fn push(&mut self, item: T) {
        self.buffer[self.head] = item;
        self.head = (self.head + 1) % self.capacity;
        if self.len < self.capacity {
            self.len += 1;
        }
    }

    /// Iterate over the stored elements from oldest to newest.
    pub fn iter(&self) -> impl Iterator<Item = &T> {
        // Before wraparound the data starts at index 0; once full, the
        // oldest element sits at `head` (the slot about to be overwritten).
        let start = if self.len < self.capacity {
            0
        } else {
            self.head
        };
        let (cap, len) = (self.capacity, self.len);
        (0..len).map(move |i| &self.buffer[(start + i) % cap])
    }

    /// Number of elements currently stored.
    pub fn len(&self) -> usize {
        self.len
    }

    /// True when no elements are stored.
    ///
    /// Added to pair with `len()` (clippy: `len_without_is_empty`).
    pub fn is_empty(&self) -> bool {
        self.len == 0
    }

    /// True when the buffer has reached its fixed capacity.
    pub fn is_full(&self) -> bool {
        self.len == self.capacity
    }

    /// Logically remove all elements; allocated capacity is retained.
    pub fn clear(&mut self) {
        self.head = 0;
        self.len = 0;
    }
}
// ── Biomarker definitions ───────────────────────────────────────────────────
/// Static definition of one biomarker channel: identifier plus reference range.
struct BiomarkerDef {
    // Stable string id; becomes the reading's `biomarker_id`.
    id: &'static str,
    // Lower bound of the normal reference range.
    low: f64,
    // Upper bound of the normal reference range.
    high: f64,
}
/// Reference ranges for the six supported biomarker channels.
///
/// NOTE(review): units are not stated in SOURCE — presumably mg/dL for the
/// lipid/glucose panels and mg/L for CRP; confirm against the data source.
const BIOMARKER_DEFS: &[BiomarkerDef] = &[
    BiomarkerDef {
        id: "glucose",
        low: 70.0,
        high: 100.0,
    },
    BiomarkerDef {
        id: "cholesterol_total",
        low: 150.0,
        high: 200.0,
    },
    BiomarkerDef {
        id: "hdl",
        low: 40.0,
        high: 60.0,
    },
    BiomarkerDef {
        id: "ldl",
        low: 70.0,
        high: 130.0,
    },
    BiomarkerDef {
        id: "triglycerides",
        low: 50.0,
        high: 150.0,
    },
    BiomarkerDef {
        id: "crp",
        low: 0.1,
        high: 3.0,
    },
];
// ── Batch generation ────────────────────────────────────────────────────────
/// Generate `count` synthetic readings per active biomarker with noise, drift,
/// and stochastic anomaly spikes.
///
/// Output is deterministic for a given `seed`. Readings are emitted
/// step-by-step — one per active biomarker per step — with timestamps spaced
/// `base_interval_ms` apart; values are clamped to be non-negative.
pub fn generate_readings(config: &StreamConfig, count: usize, seed: u64) -> Vec<BiomarkerReading> {
    let mut rng = StdRng::seed_from_u64(seed);
    let n_active = config.num_biomarkers.min(BIOMARKER_DEFS.len());
    let active = &BIOMARKER_DEFS[..n_active];
    // Build (midpoint, range, noise dist, spike dist) once per channel so the
    // inner loop never reconstructs a Normal.
    let dists: Vec<_> = active
        .iter()
        .map(|def| {
            let range = def.high - def.low;
            let mid = (def.low + def.high) / 2.0;
            let sigma = (config.noise_amplitude * range).max(1e-12);
            let normal = Normal::new(0.0, sigma).unwrap();
            let spike = Normal::new(0.0, sigma * config.anomaly_magnitude).unwrap();
            (mid, range, normal, spike)
        })
        .collect();
    let mut readings = Vec::with_capacity(count * active.len());
    let mut ts: u64 = 0;
    for step in 0..count {
        for (def, dist) in active.iter().zip(dists.iter()) {
            let (mid, range, normal, spike) = (dist.0, dist.1, &dist.2, &dist.3);
            let drift = config.drift_rate * range * step as f64;
            // One uniform draw decides anomaly status; the matching
            // distribution then supplies the deviation from the midpoint.
            let is_anom = rng.gen::<f64>() < config.anomaly_probability;
            let deviation = if is_anom {
                rng.sample::<f64, _>(spike)
            } else {
                rng.sample::<f64, _>(normal)
            };
            readings.push(BiomarkerReading {
                timestamp_ms: ts,
                biomarker_id: def.id.into(),
                value: (mid + deviation + drift).max(0.0),
                reference_low: def.low,
                reference_high: def.high,
                is_anomaly: is_anom,
                z_score: 0.0,
            });
        }
        ts += config.base_interval_ms;
    }
    readings
}
// ── Statistics & results ────────────────────────────────────────────────────
/// Rolling statistics for a single biomarker stream.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StreamStats {
    /// Mean over the current rolling window.
    pub mean: f64,
    /// Sample variance over the current rolling window.
    pub variance: f64,
    /// All-time minimum observed value.
    pub min: f64,
    /// All-time maximum observed value.
    pub max: f64,
    /// Total readings processed for this biomarker.
    pub count: u64,
    /// Fraction of this biomarker's readings flagged anomalous.
    pub anomaly_rate: f64,
    /// Linear-regression slope of the current rolling window.
    pub trend_slope: f64,
    /// Exponential moving average of the value.
    pub ema: f64,
    pub cusum_pos: f64, // CUSUM positive direction
    pub cusum_neg: f64, // CUSUM negative direction
    /// True on the reading where either CUSUM side crossed the threshold.
    pub changepoint_detected: bool,
}
impl Default for StreamStats {
    fn default() -> Self {
        Self {
            mean: 0.0,
            variance: 0.0,
            // min/max start at opposite extremes so the first reading
            // always replaces both sentinels.
            min: f64::MAX,
            max: f64::MIN,
            count: 0,
            anomaly_rate: 0.0,
            trend_slope: 0.0,
            ema: 0.0,
            cusum_pos: 0.0,
            cusum_neg: 0.0,
            changepoint_detected: false,
        }
    }
}
/// Result of processing a single reading.
///
/// Derives added so callers can log, copy, and compare results — public
/// types should almost always be `Debug`.
#[derive(Debug, Clone, PartialEq)]
pub struct ProcessingResult {
    /// Whether the reading was ingested (`process_reading` currently always
    /// sets this to `true`).
    pub accepted: bool,
    /// Z-score of the reading against its rolling window.
    pub z_score: f64,
    /// True if the reading was flagged by the z-score or range checks.
    pub is_anomaly: bool,
    /// Linear-regression slope of the rolling window after this reading.
    pub current_trend: f64,
}
/// Aggregate summary across all biomarker streams.
///
/// Derives added so callers can log and copy summaries — public types
/// should almost always be `Debug`.
#[derive(Debug, Clone)]
pub struct StreamSummary {
    /// Total readings ingested across every biomarker.
    pub total_readings: u64,
    /// Count of readings flagged anomalous.
    pub anomaly_count: u64,
    /// `anomaly_count / total_readings` (0.0 when no readings).
    pub anomaly_rate: f64,
    /// Snapshot of per-biomarker rolling statistics.
    pub biomarker_stats: HashMap<String, StreamStats>,
    /// Readings per second derived from the first/last timestamps seen.
    pub throughput_readings_per_sec: f64,
}
// ── Stream processor ────────────────────────────────────────────────────────
/// Smoothing factor for the exponential moving average (higher = faster decay).
const EMA_ALPHA: f64 = 0.1;
/// |z| above this flags a reading as a statistical anomaly.
const Z_SCORE_THRESHOLD: f64 = 2.5;
/// Fractional overshoot beyond the reference range tolerated before flagging.
const REF_OVERSHOOT: f64 = 0.20;
const CUSUM_THRESHOLD: f64 = 4.0; // Cumulative sum threshold for changepoint detection
const CUSUM_DRIFT: f64 = 0.5; // Allowable drift before CUSUM accumulates
/// Processes biomarker readings with per-stream ring buffers, z-score anomaly
/// detection, and trend analysis via simple linear regression.
pub struct StreamProcessor {
    // Stream configuration; `window_size` sizes each per-biomarker buffer.
    config: StreamConfig,
    // Rolling value window per biomarker id.
    buffers: HashMap<String, RingBuffer<f64>>,
    // Latest computed statistics per biomarker id.
    stats: HashMap<String, StreamStats>,
    // Total readings across all biomarkers.
    total_readings: u64,
    // Total anomalies across all biomarkers.
    anomaly_count: u64,
    // Per-biomarker anomaly counts (feeds `StreamStats::anomaly_rate`).
    anom_per_bio: HashMap<String, u64>,
    // Timestamp of the first reading seen (ms), for throughput.
    start_ts: Option<u64>,
    // Timestamp of the most recent reading seen (ms).
    last_ts: Option<u64>,
}
impl StreamProcessor {
    /// Create an empty processor; per-biomarker state is allocated lazily
    /// on the first reading of each stream.
    pub fn new(config: StreamConfig) -> Self {
        let cap = config.num_biomarkers;
        Self {
            config,
            buffers: HashMap::with_capacity(cap),
            stats: HashMap::with_capacity(cap),
            total_readings: 0,
            anomaly_count: 0,
            anom_per_bio: HashMap::with_capacity(cap),
            start_ts: None,
            last_ts: None,
        }
    }

    /// Ingest one reading: update the rolling window, flag anomalies
    /// (z-score or out-of-range), refresh statistics, and run CUSUM
    /// changepoint detection.
    ///
    /// Note: the new value is pushed into the window *before* the window
    /// mean/std are computed, so the z-score includes the reading itself.
    pub fn process_reading(&mut self, reading: &BiomarkerReading) -> ProcessingResult {
        let id = &reading.biomarker_id;
        if self.start_ts.is_none() {
            self.start_ts = Some(reading.timestamp_ms);
        }
        self.last_ts = Some(reading.timestamp_ms);
        let buf = self
            .buffers
            .entry(id.clone())
            .or_insert_with(|| RingBuffer::new(self.config.window_size));
        buf.push(reading.value);
        self.total_readings += 1;
        // Z-score against the rolling window; guard a near-zero std.
        let (wmean, wstd) = window_mean_std(buf);
        let z = if wstd > 1e-12 {
            (reading.value - wmean) / wstd
        } else {
            0.0
        };
        // Out-of-range check with a tolerance band of REF_OVERSHOOT * range
        // beyond each reference bound.
        let rng = reading.reference_high - reading.reference_low;
        let overshoot = REF_OVERSHOOT * rng;
        let oor = reading.value < (reading.reference_low - overshoot)
            || reading.value > (reading.reference_high + overshoot);
        let is_anom = z.abs() > Z_SCORE_THRESHOLD || oor;
        if is_anom {
            self.anomaly_count += 1;
            *self.anom_per_bio.entry(id.clone()).or_insert(0) += 1;
        }
        let slope = compute_trend_slope(buf);
        let bio_anom = *self.anom_per_bio.get(id).unwrap_or(&0);
        let st = self.stats.entry(id.clone()).or_default();
        st.count += 1;
        st.mean = wmean;
        st.variance = wstd * wstd;
        st.trend_slope = slope;
        st.anomaly_rate = bio_anom as f64 / st.count as f64;
        // min/max are all-time, not windowed.
        if reading.value < st.min {
            st.min = reading.value;
        }
        if reading.value > st.max {
            st.max = reading.value;
        }
        // Seed the EMA with the first value to avoid a cold-start bias.
        st.ema = if st.count == 1 {
            reading.value
        } else {
            EMA_ALPHA * reading.value + (1.0 - EMA_ALPHA) * st.ema
        };
        // CUSUM changepoint detection: accumulate deviations from the mean
        if wstd > 1e-12 {
            let norm_dev = (reading.value - wmean) / wstd;
            st.cusum_pos = (st.cusum_pos + norm_dev - CUSUM_DRIFT).max(0.0);
            st.cusum_neg = (st.cusum_neg - norm_dev - CUSUM_DRIFT).max(0.0);
            st.changepoint_detected =
                st.cusum_pos > CUSUM_THRESHOLD || st.cusum_neg > CUSUM_THRESHOLD;
            // Reset after a detection so subsequent changepoints re-arm.
            if st.changepoint_detected {
                st.cusum_pos = 0.0;
                st.cusum_neg = 0.0;
            }
        }
        ProcessingResult {
            accepted: true,
            z_score: z,
            is_anomaly: is_anom,
            current_trend: slope,
        }
    }

    /// Latest statistics for one biomarker, if it has been seen.
    pub fn get_stats(&self, biomarker_id: &str) -> Option<&StreamStats> {
        self.stats.get(biomarker_id)
    }

    /// Aggregate totals across all streams.
    ///
    /// Throughput divides by the observed timestamp span in ms (minimum
    /// 1.0 ms, so single-timestamp streams don't divide by zero).
    pub fn summary(&self) -> StreamSummary {
        let elapsed = match (self.start_ts, self.last_ts) {
            (Some(s), Some(e)) if e > s => (e - s) as f64,
            _ => 1.0,
        };
        let ar = if self.total_readings > 0 {
            self.anomaly_count as f64 / self.total_readings as f64
        } else {
            0.0
        };
        StreamSummary {
            total_readings: self.total_readings,
            anomaly_count: self.anomaly_count,
            anomaly_rate: ar,
            biomarker_stats: self.stats.clone(),
            throughput_readings_per_sec: self.total_readings as f64 / (elapsed / 1000.0),
        }
    }
}
// ── Helpers ─────────────────────────────────────────────────────────────────
/// Single-pass mean and sample standard deviation via Welford's online
/// algorithm — one traversal instead of a sum pass plus a variance pass.
///
/// Returns `(0.0, 0.0)` for an empty buffer and `(mean, 0.0)` for a
/// single-element buffer (sample std is undefined for n < 2).
fn window_mean_std(buf: &RingBuffer<f64>) -> (f64, f64) {
    let count = buf.len();
    if count == 0 {
        return (0.0, 0.0);
    }
    let mut running_mean = 0.0;
    let mut sum_sq_dev = 0.0; // Welford's M2 accumulator
    let mut seen = 0.0_f64;
    for &x in buf.iter() {
        seen += 1.0;
        let delta = x - running_mean;
        running_mean += delta / seen;
        sum_sq_dev += delta * (x - running_mean);
    }
    if count < 2 {
        (running_mean, 0.0)
    } else {
        (running_mean, (sum_sq_dev / (count - 1) as f64).sqrt())
    }
}
/// Least-squares slope of the buffered values against their window index
/// (x = 0, 1, 2, …). Returns 0.0 for windows shorter than two samples or
/// when the centered x-variance is degenerate.
fn compute_trend_slope(buf: &RingBuffer<f64>) -> f64 {
    let count = buf.len();
    if count < 2 {
        return 0.0;
    }
    let nf = count as f64;
    let x_mean = (nf - 1.0) / 2.0; // mean of 0..count-1
    let mut sum_y = 0.0;
    let mut sum_xy = 0.0;
    let mut sum_xx = 0.0;
    for (idx, &y) in buf.iter().enumerate() {
        let x = idx as f64;
        sum_y += y;
        sum_xy += x * y;
        sum_xx += x * x;
    }
    // Centered cross- and self-sums for the regression slope.
    let ss_xy = sum_xy - nf * x_mean * (sum_y / nf);
    let ss_xx = sum_xx - nf * x_mean * x_mean;
    if ss_xx.abs() < 1e-12 {
        0.0
    } else {
        ss_xy / ss_xx
    }
}
// ── Tests ───────────────────────────────────────────────────────────────────
#[cfg(test)]
mod tests {
    use super::*;

    // Helper: build a reading with explicit reference bounds.
    fn reading(ts: u64, id: &str, val: f64, lo: f64, hi: f64) -> BiomarkerReading {
        BiomarkerReading {
            timestamp_ms: ts,
            biomarker_id: id.into(),
            value: val,
            reference_low: lo,
            reference_high: hi,
            is_anomaly: false,
            z_score: 0.0,
        }
    }

    // Helper: glucose reading with its standard 70-100 reference range.
    fn glucose(ts: u64, val: f64) -> BiomarkerReading {
        reading(ts, "glucose", val, 70.0, 100.0)
    }

    // -- RingBuffer --
    #[test]
    fn ring_buffer_push_iter_len() {
        let mut rb: RingBuffer<i32> = RingBuffer::new(4);
        for v in [10, 20, 30] {
            rb.push(v);
        }
        assert_eq!(rb.iter().copied().collect::<Vec<_>>(), vec![10, 20, 30]);
        assert_eq!(rb.len(), 3);
        assert!(!rb.is_full());
    }

    // Overflow evicts the oldest element, preserving insertion order.
    #[test]
    fn ring_buffer_overflow_keeps_newest() {
        let mut rb: RingBuffer<i32> = RingBuffer::new(3);
        for v in 1..=4 {
            rb.push(v);
        }
        assert!(rb.is_full());
        assert_eq!(rb.iter().copied().collect::<Vec<_>>(), vec![2, 3, 4]);
    }

    // Capacity-1 edge case: each push replaces the previous element.
    #[test]
    fn ring_buffer_capacity_one() {
        let mut rb: RingBuffer<i32> = RingBuffer::new(1);
        rb.push(42);
        rb.push(99);
        assert_eq!(rb.iter().copied().collect::<Vec<_>>(), vec![99]);
    }

    #[test]
    fn ring_buffer_clear_resets() {
        let mut rb: RingBuffer<i32> = RingBuffer::new(3);
        rb.push(1);
        rb.push(2);
        rb.clear();
        assert_eq!(rb.len(), 0);
        assert!(!rb.is_full());
        assert_eq!(rb.iter().count(), 0);
    }

    // -- Batch generation --
    #[test]
    fn generate_correct_count_and_ids() {
        let cfg = StreamConfig::default();
        let readings = generate_readings(&cfg, 50, 42);
        assert_eq!(readings.len(), 50 * cfg.num_biomarkers);
        let valid: Vec<&str> = BIOMARKER_DEFS.iter().map(|d| d.id).collect();
        for r in &readings {
            assert!(valid.contains(&r.biomarker_id.as_str()));
        }
    }

    // Every reading must carry the reference range from its definition.
    #[test]
    fn generated_reference_ranges_match_defs() {
        let readings = generate_readings(&StreamConfig::default(), 20, 123);
        for r in &readings {
            let d = BIOMARKER_DEFS
                .iter()
                .find(|d| d.id == r.biomarker_id)
                .unwrap();
            assert!((r.reference_low - d.low).abs() < 1e-9);
            assert!((r.reference_high - d.high).abs() < 1e-9);
        }
    }

    // The generator clamps values at zero even under large negative spikes.
    #[test]
    fn generated_values_non_negative() {
        for r in &generate_readings(&StreamConfig::default(), 100, 999) {
            assert!(r.value >= 0.0);
        }
    }

    // -- StreamProcessor --
    #[test]
    fn processor_computes_stats() {
        let cfg = StreamConfig {
            window_size: 10,
            ..Default::default()
        };
        let mut p = StreamProcessor::new(cfg.clone());
        for r in &generate_readings(&cfg, 20, 55) {
            p.process_reading(r);
        }
        let s = p.get_stats("glucose").unwrap();
        assert!(s.count > 0 && s.mean > 0.0 && s.min <= s.max);
    }

    #[test]
    fn processor_summary_totals() {
        let cfg = StreamConfig::default();
        let mut p = StreamProcessor::new(cfg.clone());
        for r in &generate_readings(&cfg, 30, 77) {
            p.process_reading(r);
        }
        let s = p.summary();
        assert_eq!(s.total_readings, 30 * cfg.num_biomarkers as u64);
        assert!((0.0..=1.0).contains(&s.anomaly_rate));
    }

    // -- Anomaly detection --
    // A flat stream followed by a huge jump must trip the z-score check.
    #[test]
    fn detects_z_score_anomaly() {
        let mut p = StreamProcessor::new(StreamConfig {
            window_size: 20,
            ..Default::default()
        });
        for i in 0..20 {
            p.process_reading(&glucose(i * 1000, 85.0));
        }
        let r = p.process_reading(&glucose(20_000, 300.0));
        assert!(r.is_anomaly);
        assert!(r.z_score.abs() > Z_SCORE_THRESHOLD);
    }

    #[test]
    fn detects_out_of_range_anomaly() {
        let mut p = StreamProcessor::new(StreamConfig {
            window_size: 5,
            ..Default::default()
        });
        for (i, v) in [80.0, 82.0, 78.0, 84.0, 81.0].iter().enumerate() {
            p.process_reading(&glucose(i as u64 * 1000, *v));
        }
        // 140 >> ref_high(100) + 20%*range(30)=106
        assert!(p.process_reading(&glucose(5000, 140.0)).is_anomaly);
    }

    // Constant in-range values should never be flagged.
    #[test]
    fn zero_anomaly_rate_for_constant_stream() {
        let mut p = StreamProcessor::new(StreamConfig {
            window_size: 50,
            ..Default::default()
        });
        for i in 0..10 {
            p.process_reading(&reading(i * 1000, "crp", 1.5, 0.1, 3.0));
        }
        assert!(p.get_stats("crp").unwrap().anomaly_rate.abs() < 1e-9);
    }

    // -- Trend detection --
    #[test]
    fn positive_trend_for_increasing() {
        let mut p = StreamProcessor::new(StreamConfig {
            window_size: 20,
            ..Default::default()
        });
        let mut r = ProcessingResult {
            accepted: true,
            z_score: 0.0,
            is_anomaly: false,
            current_trend: 0.0,
        };
        for i in 0..20 {
            r = p.process_reading(&glucose(i * 1000, 70.0 + i as f64));
        }
        assert!(r.current_trend > 0.0, "got {}", r.current_trend);
    }

    #[test]
    fn negative_trend_for_decreasing() {
        let mut p = StreamProcessor::new(StreamConfig {
            window_size: 20,
            ..Default::default()
        });
        let mut r = ProcessingResult {
            accepted: true,
            z_score: 0.0,
            is_anomaly: false,
            current_trend: 0.0,
        };
        for i in 0..20 {
            r = p.process_reading(&reading(i * 1000, "hdl", 60.0 - i as f64 * 0.5, 40.0, 60.0));
        }
        assert!(r.current_trend < 0.0, "got {}", r.current_trend);
    }

    // A perfectly linear series must recover its slope exactly.
    #[test]
    fn exact_slope_for_linear_series() {
        let mut p = StreamProcessor::new(StreamConfig {
            window_size: 10,
            ..Default::default()
        });
        for i in 0..10 {
            p.process_reading(&reading(
                i * 1000,
                "ldl",
                100.0 + i as f64 * 3.0,
                70.0,
                130.0,
            ));
        }
        assert!((p.get_stats("ldl").unwrap().trend_slope - 3.0).abs() < 1e-9);
    }

    // -- Z-score --
    // A reading equal to the window mean must score near zero.
    #[test]
    fn z_score_small_for_near_mean() {
        let mut p = StreamProcessor::new(StreamConfig {
            window_size: 10,
            ..Default::default()
        });
        for (i, v) in [80.0, 82.0, 78.0, 84.0, 76.0, 86.0, 81.0, 79.0, 83.0]
            .iter()
            .enumerate()
        {
            p.process_reading(&glucose(i as u64 * 1000, *v));
        }
        let mean = p.get_stats("glucose").unwrap().mean;
        assert!(p.process_reading(&glucose(9000, mean)).z_score.abs() < 1.0);
    }

    // -- EMA --
    // With a constant input the EMA must converge to that constant.
    #[test]
    fn ema_converges_to_constant() {
        let mut p = StreamProcessor::new(StreamConfig {
            window_size: 50,
            ..Default::default()
        });
        for i in 0..50 {
            p.process_reading(&reading(i * 1000, "crp", 2.0, 0.1, 3.0));
        }
        assert!((p.get_stats("crp").unwrap().ema - 2.0).abs() < 1e-6);
    }
}

View File

@@ -0,0 +1,322 @@
//! Epigenomics analysis module
//!
//! Provides methylation profiling and epigenetic age prediction
//! using the Horvath clock model.
use serde::{Deserialize, Serialize};
/// A CpG site with methylation data
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CpGSite {
    /// Chromosome number
    pub chromosome: u8,
    /// Genomic position
    pub position: u64,
    /// Methylation level (beta value, 0.0 to 1.0; clamped on construction
    /// via `MethylationProfile::from_beta_values`)
    pub methylation_level: f32,
}
/// Methylation profile containing CpG site measurements
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MethylationProfile {
    /// CpG sites with measured methylation levels
    pub sites: Vec<CpGSite>,
}
impl MethylationProfile {
    /// Create a methylation profile from position and beta value arrays.
    ///
    /// Positions are `(chromosome, position)` pairs; each beta value is
    /// clamped into `[0.0, 1.0]`. If the two vectors differ in length the
    /// extra entries of the longer one are silently dropped (zip semantics).
    pub fn from_beta_values(positions: Vec<(u8, u64)>, betas: Vec<f32>) -> Self {
        let sites = positions
            .into_iter()
            // `zip` takes any IntoIterator; the explicit `.into_iter()` on
            // `betas` was redundant (clippy: useless_conversion).
            .zip(betas)
            .map(|((chr, pos), beta)| CpGSite {
                chromosome: chr,
                position: pos,
                methylation_level: beta.clamp(0.0, 1.0),
            })
            .collect();
        Self { sites }
    }

    /// Calculate mean methylation across all sites (0.0 for an empty profile).
    pub fn mean_methylation(&self) -> f32 {
        if self.sites.is_empty() {
            return 0.0;
        }
        let sum: f32 = self.sites.iter().map(|s| s.methylation_level).sum();
        sum / self.sites.len() as f32
    }

    /// Calculate methylation entropy (Shannon entropy of beta values)
    ///
    /// Beta values are histogrammed into 10 equal-width bins and the
    /// natural-log Shannon entropy of the bin distribution is returned.
    /// High entropy indicates heterogeneous methylation (potential tumor
    /// heterogeneity).
    pub fn methylation_entropy(&self) -> f64 {
        if self.sites.is_empty() {
            return 0.0;
        }
        // Bin methylation into 10 bins [0, 0.1), [0.1, 0.2), ..., [0.9, 1.0];
        // `.min(9)` folds beta == 1.0 into the top bin.
        let mut bins = [0u32; 10];
        for site in &self.sites {
            let bin = ((site.methylation_level * 10.0) as usize).min(9);
            bins[bin] += 1;
        }
        let n = self.sites.len() as f64;
        // H = -sum(p * ln p) over occupied bins.
        bins.iter()
            .filter(|&&count| count > 0)
            .map(|&count| {
                let p = count as f64 / n;
                -p * p.ln()
            })
            .sum()
    }

    /// Calculate extreme methylation ratio
    ///
    /// Fraction of sites with beta < 0.1 (hypomethylated) or > 0.9
    /// (hypermethylated). High ratio indicates global methylation
    /// disruption (cancer hallmark). Returns 0.0 for an empty profile.
    pub fn extreme_methylation_ratio(&self) -> f32 {
        if self.sites.is_empty() {
            return 0.0;
        }
        let extreme_count = self
            .sites
            .iter()
            .filter(|s| s.methylation_level < 0.1 || s.methylation_level > 0.9)
            .count();
        extreme_count as f32 / self.sites.len() as f32
    }
}
/// Horvath epigenetic clock for biological age prediction
///
/// Uses a simplified linear model based on CpG site methylation levels
/// to predict biological age.
pub struct HorvathClock {
    /// Intercept term (base age when no bins contribute)
    intercept: f64,
    /// Coefficient per CpG site bin
    coefficients: Vec<f64>,
    /// Number of bins to partition sites into
    num_bins: usize,
}
impl HorvathClock {
    /// Create the default Horvath clock model
    ///
    /// Uses a simplified model with binned methylation values.
    /// Real implementation would use 353 specific CpG sites.
    pub fn default_clock() -> Self {
        Self {
            intercept: 30.0,
            coefficients: vec![
                -15.0, // Low methylation bin (young)
                10.0,  // High methylation bin (age-associated)
                0.5,   // Neutral bin
            ],
            num_bins: 3,
        }
    }

    /// Predict biological age from a methylation profile
    ///
    /// Sites are partitioned into contiguous bins; each bin's mean beta
    /// value is weighted by the matching coefficient and added to the
    /// intercept. The final bin absorbs any remainder so trailing sites
    /// are never dropped (previously the last `len % num_bins` sites —
    /// and, for profiles smaller than `num_bins`, all sites — were
    /// silently ignored). The result is clamped to be non-negative.
    pub fn predict_age(&self, profile: &MethylationProfile) -> f64 {
        let n_sites = profile.sites.len();
        if n_sites == 0 {
            return self.intercept;
        }
        let bin_size = n_sites / self.num_bins.max(1);
        let last_bin = self.coefficients.len().saturating_sub(1);
        let mut age = self.intercept;
        for (bin_idx, coefficient) in self.coefficients.iter().enumerate() {
            let start = (bin_idx * bin_size).min(n_sites);
            // The last coefficient's bin extends to the end of the profile
            // so no trailing sites are discarded.
            let end = if bin_idx == last_bin {
                n_sites
            } else {
                ((bin_idx + 1) * bin_size).min(n_sites)
            };
            let bin_sites = &profile.sites[start..end];
            if !bin_sites.is_empty() {
                let mean_meth: f64 = bin_sites
                    .iter()
                    .map(|s| s.methylation_level as f64)
                    .sum::<f64>()
                    / bin_sites.len() as f64;
                age += coefficient * mean_meth;
            }
        }
        age.max(0.0)
    }

    /// Calculate age acceleration (difference between biological and chronological age)
    ///
    /// Positive values indicate accelerated aging (associated with mortality risk).
    /// Negative values indicate decelerated aging.
    pub fn age_acceleration(predicted_age: f64, chronological_age: f64) -> f64 {
        predicted_age - chronological_age
    }
}
/// Cancer signal detector using methylation patterns
///
/// Combines methylation entropy and extreme methylation ratio
/// to produce a cancer risk score (0.0 to 1.0).
pub struct CancerSignalDetector {
    /// Entropy weight in the combined score
    entropy_weight: f64,
    /// Extreme ratio weight
    extreme_weight: f64,
    /// Threshold for elevated cancer risk (score >= threshold flags)
    risk_threshold: f64,
}
impl CancerSignalDetector {
    /// Create with default parameters (0.4 entropy / 0.6 extreme-ratio
    /// weighting, 0.3 elevation threshold).
    pub fn new() -> Self {
        Self {
            entropy_weight: 0.4,
            extreme_weight: 0.6,
            risk_threshold: 0.3,
        }
    }

    /// Detect cancer signal from methylation profile
    ///
    /// Returns a [`CancerSignalResult`] whose `risk_score` (0.0 to 1.0) is
    /// the weighted sum of normalized entropy and the extreme-methylation
    /// ratio; `is_elevated` is set when the score reaches `risk_threshold`.
    /// An empty profile yields an all-zero, non-elevated result.
    pub fn detect(&self, profile: &MethylationProfile) -> CancerSignalResult {
        if profile.sites.is_empty() {
            return CancerSignalResult {
                risk_score: 0.0,
                is_elevated: false,
                entropy: 0.0,
                extreme_ratio: 0.0,
            };
        }
        let entropy = profile.methylation_entropy();
        let extreme_ratio = profile.extreme_methylation_ratio() as f64;
        // Normalize entropy to 0-1: the maximum entropy of a 10-bin
        // histogram is ln(10). Use the exact library constant instead of
        // the previously hard-coded truncation 2.302.
        let normalized_entropy = (entropy / std::f64::consts::LN_10).min(1.0);
        let risk_score = (self.entropy_weight * normalized_entropy
            + self.extreme_weight * extreme_ratio)
            .min(1.0);
        CancerSignalResult {
            risk_score,
            is_elevated: risk_score >= self.risk_threshold,
            entropy,
            extreme_ratio,
        }
    }
}
impl Default for CancerSignalDetector {
fn default() -> Self {
Self::new()
}
}
/// Result from cancer signal detection
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CancerSignalResult {
    /// Combined risk score (0.0 to 1.0)
    pub risk_score: f64,
    /// Whether the risk score exceeds the threshold
    pub is_elevated: bool,
    /// Raw methylation entropy (natural-log, 10-bin histogram)
    pub entropy: f64,
    /// Fraction of extreme methylation sites (beta < 0.1 or > 0.9)
    pub extreme_ratio: f64,
}
#[cfg(test)]
mod tests {
    use super::*;

    // Profile construction and mean over two sites.
    #[test]
    fn test_methylation_profile() {
        let positions = vec![(1, 1000), (1, 2000)];
        let betas = vec![0.3, 0.7];
        let profile = MethylationProfile::from_beta_values(positions, betas);
        assert_eq!(profile.sites.len(), 2);
        assert!((profile.mean_methylation() - 0.5).abs() < 0.001);
    }

    // Clock should produce a positive age for a plain mid-methylation profile.
    #[test]
    fn test_horvath_clock() {
        let clock = HorvathClock::default_clock();
        let positions = vec![(1, 1000), (1, 2000), (1, 3000)];
        let betas = vec![0.5, 0.5, 0.5];
        let profile = MethylationProfile::from_beta_values(positions, betas);
        let age = clock.predict_age(&profile);
        assert!(age > 0.0);
    }

    // Acceleration is signed: predicted minus chronological.
    #[test]
    fn test_age_acceleration() {
        let accel = HorvathClock::age_acceleration(55.0, 50.0);
        assert!((accel - 5.0).abs() < 0.001);
        let decel = HorvathClock::age_acceleration(40.0, 50.0);
        assert!((decel - (-10.0)).abs() < 0.001);
    }

    #[test]
    fn test_methylation_entropy() {
        // Uniform methylation = low entropy
        let positions: Vec<(u8, u64)> = (0..100).map(|i| (1u8, i as u64)).collect();
        let betas = vec![0.5; 100];
        let profile = MethylationProfile::from_beta_values(positions, betas);
        let entropy = profile.methylation_entropy();
        assert!(
            entropy < 0.1,
            "Uniform should have low entropy: {}",
            entropy
        );
        // Spread methylation = high entropy
        let positions2: Vec<(u8, u64)> = (0..100).map(|i| (1u8, i as u64)).collect();
        let betas2: Vec<f32> = (0..100).map(|i| i as f32 / 100.0).collect();
        let profile2 = MethylationProfile::from_beta_values(positions2, betas2);
        let entropy2 = profile2.methylation_entropy();
        assert!(
            entropy2 > 1.0,
            "Spread should have high entropy: {}",
            entropy2
        );
    }

    #[test]
    fn test_cancer_signal_detector() {
        let detector = CancerSignalDetector::new();
        // Normal profile (moderate methylation)
        let positions: Vec<(u8, u64)> = (0..100).map(|i| (1u8, i as u64)).collect();
        let betas = vec![0.5; 100];
        let profile = MethylationProfile::from_beta_values(positions, betas);
        let result = detector.detect(&profile);
        assert!(!result.is_elevated, "Normal profile should not be elevated");
        assert!(result.risk_score < 0.3);
        // Cancerous profile (extreme methylation)
        let positions2: Vec<(u8, u64)> = (0..100).map(|i| (1u8, i as u64)).collect();
        let betas2: Vec<f32> = (0..100)
            .map(|i| if i % 2 == 0 { 0.95 } else { 0.05 })
            .collect();
        let profile2 = MethylationProfile::from_beta_values(positions2, betas2);
        let result2 = detector.detect(&profile2);
        assert!(result2.is_elevated, "Cancer profile should be elevated");
        assert!(result2.extreme_ratio > 0.8);
    }
}

View File

@@ -0,0 +1,58 @@
//! Error types for DNA analysis operations
use thiserror::Error;
/// DNA analysis error types
///
/// `Display` messages are generated by `thiserror`; the `#[from]` variants
/// allow `?` to convert `std::io::Error` and `RuvectorError` automatically.
#[derive(Error, Debug)]
pub enum DnaError {
    /// Invalid DNA sequence (e.g., non-ACGTN characters)
    #[error("Invalid DNA sequence: {0}")]
    InvalidSequence(String),
    /// K-mer indexing error
    #[error("K-mer index error: {0}")]
    IndexError(String),
    /// Sequence alignment error
    #[error("Alignment error: {0}")]
    AlignmentError(String),
    /// Variant calling error
    #[error("Variant calling error: {0}")]
    VariantCallError(String),
    /// Analysis pipeline error
    #[error("Pipeline error: {0}")]
    PipelineError(String),
    /// I/O error
    #[error("I/O error: {0}")]
    IoError(#[from] std::io::Error),
    /// RuVector core error
    #[error("Vector database error: {0}")]
    VectorDbError(#[from] ruvector_core::RuvectorError),
    /// Dimension mismatch
    #[error("Dimension mismatch: expected {expected}, got {actual}")]
    DimensionMismatch { expected: usize, actual: usize },
    /// Empty sequence
    #[error("Empty sequence provided")]
    EmptySequence,
    /// Invalid quality score
    #[error("Invalid quality score: {0}")]
    InvalidQuality(u8),
    /// Invalid k-mer size
    #[error("Invalid k-mer size: {0}")]
    InvalidKmerSize(usize),
    /// 23andMe file parse error
    #[error("Parse error: {0}")]
    ParseError(String),
}

/// Result type for DNA analysis operations
pub type Result<T> = std::result::Result<T, DnaError>;

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,686 @@
//! Health variant analysis for genotyping data
//!
//! Clinically significant variant interpretation for 17+ health-relevant
//! SNPs commonly found in 23andMe/genotyping panels. Covers APOE, BRCA1/2,
//! TP53, MTHFR, COMT, OPRM1, CYP1A2, and more.
//!
//! Based on: <https://github.com/ericporres/rvdna-bridge>
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
/// Result of analyzing a single health variant.
///
/// One record is produced per rsid found in both the user's genotype map
/// and the built-in `HEALTH_VARIANTS` interpretation table.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthVariantResult {
    /// rsid identifier (e.g. "rs429358")
    pub rsid: String,
    /// Gene name (e.g. "APOE")
    pub gene: String,
    /// Variant common name
    pub name: String,
    /// Observed genotype (e.g. "CT")
    pub genotype: String,
    /// Risk allele for this variant
    pub risk_allele: char,
    /// Human-readable interpretation of the observed genotype
    pub interpretation: String,
    /// Clinical significance text from the interpretation table
    pub clinical_significance: String,
}
/// APOE genotype determination result.
///
/// The APOE e2/e3/e4 diplotype is derived from the two SNPs rs429358
/// and rs7412 by `determine_apoe`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ApoeResult {
    /// Full APOE genotype string (e.g., "e2/e3"), plus a risk summary
    pub genotype: String,
    /// rs429358 genotype as observed (empty if missing)
    pub rs429358: String,
    /// rs7412 genotype as observed (empty if missing)
    pub rs7412: String,
}
/// MTHFR compound status across the two common functional SNPs.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MthfrResult {
    /// C677T genotype (rs1801133), as observed
    pub c677t: String,
    /// A1298C genotype (rs1801131), as observed
    pub a1298c: String,
    /// Compound risk score (0-4): sum of 0-2 points per SNP
    pub score: u8,
    /// Clinical assessment text matched to the score
    pub assessment: String,
}
/// Pain sensitivity profile combining COMT and OPRM1 genotypes.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PainProfile {
    /// COMT genotype (rs4680)
    pub comt: String,
    /// OPRM1 genotype (rs1799971)
    pub oprm1: String,
    /// Combined pain score (0-4): 0-2 points from each SNP
    pub score: u8,
    /// Sensitivity label ("Low" through "High")
    pub label: String,
    /// COMT interpretation text
    pub comt_note: String,
    /// OPRM1 interpretation text
    pub oprm1_note: String,
}
// ── Internal definition type ──

/// One entry in the static clinical-variant interpretation table.
struct VDef {
    /// rsid this definition applies to
    rsid: &'static str,
    /// Gene name
    gene: &'static str,
    /// Variant common name
    name: &'static str,
    /// Risk allele for this variant
    risk_allele: char,
    /// Interpretation rows, matched verbatim against the observed genotype:
    /// (genotype, description, significance)
    interps: &'static [(&'static str, &'static str, &'static str)],
}
// Static clinical interpretation table, keyed by rsid and matched verbatim
// against observed genotype strings.
//
// NOTE(review): genotype keys appear to be in the array-reporting
// (23andMe-style) orientation for each rsid — e.g. MTHFR C677T is keyed by
// G/A alleles rather than C/T — confirm strand orientation against the
// source panel before extending this table.
static HEALTH_VARIANTS: &[VDef] = &[
    // ── APOE (Alzheimer's) ──
    VDef {
        rsid: "rs429358",
        gene: "APOE",
        name: "APOE e4 determinant",
        risk_allele: 'C',
        interps: &[
            (
                "TT",
                "APOE e3/e3 or e2/e3 (depends on rs7412)",
                "Protective/Normal",
            ),
            (
                "CT",
                "One e4 allele present",
                "Increased Alzheimer's risk (~3x)",
            ),
            (
                "CC",
                "Two e4 alleles present",
                "Significantly increased Alzheimer's risk (~12x)",
            ),
        ],
    },
    VDef {
        rsid: "rs7412",
        gene: "APOE",
        name: "APOE e2 determinant",
        risk_allele: 'T',
        interps: &[
            ("CC", "No e2 allele", "Normal"),
            (
                "CT",
                "One e2 allele present",
                "Protective - reduced Alzheimer's risk",
            ),
            ("TT", "Two e2 alleles (e2/e2)", "Protective; monitor lipids"),
        ],
    },
    // ── TP53 (cancer) ──
    VDef {
        rsid: "rs1042522",
        gene: "TP53",
        name: "p53 Pro72Arg (R72P)",
        risk_allele: 'G',
        interps: &[
            (
                "CC",
                "Pro/Pro homozygous",
                "Normal apoptosis; slightly increased cancer survival",
            ),
            (
                "CG",
                "Pro/Arg heterozygous",
                "Mixed - Arg allele has stronger apoptotic activity",
            ),
            (
                "GG",
                "Arg/Arg homozygous",
                "Stronger apoptotic response; variable cancer risk",
            ),
        ],
    },
    // ── BRCA1 ──
    VDef {
        rsid: "rs80357906",
        gene: "BRCA1",
        name: "BRCA1 5382insC (Ashkenazi founder)",
        // Insertion/deletion variant: genotypes use I/D rather than bases.
        risk_allele: 'I',
        interps: &[
            (
                "DD",
                "No insertion detected",
                "Normal - no BRCA1 5382insC mutation",
            ),
            (
                "DI",
                "Heterozygous carrier",
                "INCREASED breast/ovarian cancer risk - genetic counseling recommended",
            ),
            (
                "II",
                "Homozygous insertion",
                "HIGH breast/ovarian cancer risk - urgent genetic counseling",
            ),
        ],
    },
    VDef {
        rsid: "rs28897696",
        gene: "BRCA1",
        name: "BRCA1 missense variant",
        risk_allele: 'A',
        interps: &[
            ("GG", "Reference genotype", "Normal"),
            (
                "AG",
                "Heterozygous",
                "Variant of uncertain significance - consult genetic counselor",
            ),
            ("AA", "Homozygous variant", "Consult genetic counselor"),
        ],
    },
    // ── BRCA2 ──
    VDef {
        rsid: "rs11571833",
        gene: "BRCA2",
        name: "BRCA2 K3326X",
        risk_allele: 'T',
        interps: &[
            ("AA", "Reference genotype", "Normal"),
            (
                "AT",
                "Heterozygous",
                "Modestly increased cancer risk (OR ~1.3)",
            ),
            (
                "TT",
                "Homozygous variant",
                "Increased cancer risk - genetic counseling recommended",
            ),
        ],
    },
    // ── MTHFR (folate metabolism) ──
    VDef {
        rsid: "rs1801133",
        gene: "MTHFR",
        name: "C677T",
        risk_allele: 'A',
        interps: &[
            (
                "GG",
                "CC genotype (normal)",
                "Normal MTHFR enzyme activity (100%)",
            ),
            (
                "AG",
                "CT heterozygous",
                "Reduced enzyme activity (~65%). Consider methylfolate.",
            ),
            (
                "AA",
                "TT homozygous",
                "Significantly reduced activity (~30%). Methylfolate recommended.",
            ),
        ],
    },
    VDef {
        rsid: "rs1801131",
        gene: "MTHFR",
        name: "A1298C",
        risk_allele: 'T',
        interps: &[
            ("GG", "CC homozygous variant", "Reduced enzyme activity"),
            ("GT", "AC heterozygous", "Mildly reduced enzyme activity"),
            (
                "TT",
                "AA reference",
                "Normal MTHFR activity at this position",
            ),
        ],
    },
    // ── COMT (dopamine/pain) ──
    VDef {
        rsid: "rs4680",
        gene: "COMT",
        name: "Val158Met",
        risk_allele: 'A',
        interps: &[
            (
                "GG",
                "Val/Val",
                "Higher COMT activity, lower dopamine. Better stress resilience.",
            ),
            (
                "AG",
                "Val/Met heterozygous",
                "Intermediate COMT activity. Balanced dopamine.",
            ),
            (
                "AA",
                "Met/Met",
                "Lower COMT activity, higher dopamine. Higher pain sensitivity.",
            ),
        ],
    },
    // ── OPRM1 (opioid receptor) ──
    VDef {
        rsid: "rs1799971",
        gene: "OPRM1",
        name: "A118G (Asn40Asp)",
        risk_allele: 'G',
        interps: &[
            ("AA", "Asn/Asn", "Normal opioid sensitivity"),
            (
                "AG",
                "Asn/Asp heterozygous",
                "Reduced opioid sensitivity; may need higher doses.",
            ),
            ("GG", "Asp/Asp", "Significantly reduced opioid sensitivity."),
        ],
    },
    // ── CYP1A2 (caffeine) ──
    VDef {
        rsid: "rs762551",
        gene: "CYP1A2",
        name: "Caffeine metabolism",
        risk_allele: 'C',
        interps: &[
            (
                "AA",
                "Fast metabolizer",
                "Rapid caffeine clearance. Coffee may REDUCE heart disease risk.",
            ),
            (
                "AC",
                "Intermediate",
                "Moderate caffeine clearance. Moderate coffee intake recommended.",
            ),
            (
                "CC",
                "Slow metabolizer",
                "Slow caffeine clearance. Excess coffee may INCREASE heart risk.",
            ),
        ],
    },
    // ── Lactose ──
    VDef {
        rsid: "rs4988235",
        gene: "MCM6/LCT",
        name: "Lactase persistence (European)",
        risk_allele: 'G',
        interps: &[
            (
                "AA",
                "Lactase persistent",
                "Likely lactose TOLERANT into adulthood",
            ),
            (
                "AG",
                "Heterozygous",
                "Likely lactose tolerant (persistence is dominant)",
            ),
            (
                "GG",
                "Lactase non-persistent",
                "Likely lactose INTOLERANT in adulthood",
            ),
        ],
    },
    // ── OXTR (oxytocin receptor) ──
    VDef {
        rsid: "rs53576",
        gene: "OXTR",
        name: "Oxytocin receptor",
        risk_allele: 'A',
        interps: &[
            (
                "GG",
                "GG genotype",
                "Higher empathy scores; better social cognition.",
            ),
            (
                "AG",
                "AG heterozygous",
                "Intermediate empathy and social cognition.",
            ),
            (
                "AA",
                "AA genotype",
                "May have lower empathy; potentially more resilient to social stress.",
            ),
        ],
    },
    // ── HTR2A (serotonin) ──
    VDef {
        rsid: "rs6311",
        gene: "HTR2A",
        name: "Serotonin 2A receptor (-1438G/A)",
        risk_allele: 'T',
        interps: &[
            ("CC", "GG genotype", "Normal serotonin receptor expression"),
            (
                "CT",
                "GA heterozygous",
                "Slightly altered serotonin signaling",
            ),
            (
                "TT",
                "AA genotype",
                "Altered serotonin receptor density; may affect SSRI response",
            ),
        ],
    },
    // ── ANKK1/DRD2 (dopamine) ──
    VDef {
        rsid: "rs1800497",
        gene: "ANKK1/DRD2",
        name: "Taq1A (dopamine receptor)",
        risk_allele: 'A',
        interps: &[
            ("GG", "A2/A2", "Normal dopamine receptor density"),
            (
                "AG",
                "A1/A2 heterozygous",
                "Reduced D2 receptor density (~30% less). Reward-seeking.",
            ),
            (
                "AA",
                "A1/A1",
                "Significantly reduced D2 receptor density. Higher addiction risk.",
            ),
        ],
    },
    // ── SLCO1B1 (statin metabolism) ──
    VDef {
        rsid: "rs4363657",
        gene: "SLCO1B1",
        name: "Statin transporter",
        risk_allele: 'C',
        interps: &[
            (
                "TT",
                "Reference",
                "Normal statin metabolism. Standard dosing.",
            ),
            (
                "CT",
                "Heterozygous",
                "Increased statin myopathy risk (~4.5x). Consider lower dose.",
            ),
            (
                "CC",
                "Homozygous variant",
                "High statin myopathy risk (~17x). Use lowest effective dose.",
            ),
        ],
    },
    // ── NQO1 (oxidative stress) ──
    VDef {
        rsid: "rs1800566",
        gene: "NQO1",
        name: "Pro187Ser (oxidative stress)",
        risk_allele: 'T',
        interps: &[
            ("CC", "Pro/Pro (reference)", "Normal NQO1 enzyme activity"),
            (
                "CT",
                "Pro/Ser heterozygous",
                "Reduced NQO1 activity (~3x lower). Impaired detox.",
            ),
            (
                "TT",
                "Ser/Ser",
                "No NQO1 activity. Significantly impaired quinone detoxification.",
            ),
        ],
    },
];
/// Analyze health variants from a genotype map (rsid -> genotype string).
///
/// Walks the built-in `HEALTH_VARIANTS` table in declaration order and
/// emits one result per rsid present in `genotypes`. A genotype that does
/// not match any row in the interpretation table is not dropped; it falls
/// back to a generic "consult a counselor" message instead.
pub fn analyze_health_variants(genotypes: &HashMap<String, String>) -> Vec<HealthVariantResult> {
    HEALTH_VARIANTS
        .iter()
        .filter_map(|def| {
            // Skip table entries the user's panel did not cover.
            let observed = genotypes.get(def.rsid)?;
            let matched = def
                .interps
                .iter()
                .find(|(g, _, _)| *g == observed.as_str());
            let (interpretation, significance) = match matched {
                Some((_, d, s)) => (d.to_string(), s.to_string()),
                None => (
                    format!("Genotype {} - not in standard table", observed),
                    "Consult genetic counselor".to_string(),
                ),
            };
            Some(HealthVariantResult {
                rsid: def.rsid.to_string(),
                gene: def.gene.to_string(),
                name: def.name.to_string(),
                genotype: observed.clone(),
                risk_allele: def.risk_allele,
                interpretation,
                clinical_significance: significance,
            })
        })
        .collect()
}
/// Determine APOE genotype from rs429358 + rs7412 combination.
///
/// The e2/e3/e4 haplotype is encoded by two SNPs: 'C' alleles at rs429358
/// mark e4 and 'T' alleles at rs7412 mark e2. Counting risk alleles at
/// each site is order-independent ("CT" and "TC" score identically), so
/// the common diplotypes resolve without phased data.
pub fn determine_apoe(genotypes: &HashMap<String, String>) -> ApoeResult {
    let rs429358 = genotypes.get("rs429358").cloned().unwrap_or_default();
    let rs7412 = genotypes.get("rs7412").cloned().unwrap_or_default();

    // Both SNPs are required; report a sentinel genotype otherwise.
    if rs429358.is_empty() || rs7412.is_empty() {
        return ApoeResult {
            genotype: "Unable to determine (missing data)".into(),
            rs429358,
            rs7412,
        };
    }

    let count_allele = |gt: &str, allele: char| gt.chars().filter(|&c| c == allele).count();
    let e4 = count_allele(&rs429358, 'C');
    let e2 = count_allele(&rs7412, 'T');

    let genotype = match (e4, e2) {
        (0, 0) => "e3/e3 (most common, baseline risk)".into(),
        (0, 1) => "e2/e3 (PROTECTIVE - reduced Alzheimer's risk)".into(),
        (0, 2) => "e2/e2 (protective; monitor for type III hyperlipoproteinemia)".into(),
        (1, 0) => "e3/e4 (increased Alzheimer's risk ~3x)".into(),
        (1, 1) => "e2/e4 (mixed - e2 partially offsets e4 risk)".into(),
        (2, _) => "e4/e4 (significantly increased Alzheimer's risk ~12x)".into(),
        _ => format!("Unusual combination: rs429358={}, rs7412={}", rs429358, rs7412),
    };

    ApoeResult {
        genotype,
        rs429358,
        rs7412,
    }
}
/// Analyze MTHFR compound status from C677T + A1298C.
///
/// Scores each SNP 0-2 (reference / heterozygous / homozygous risk) and
/// sums them into a 0-4 compound score with a matching supplementation
/// recommendation.
pub fn analyze_mthfr(genotypes: &HashMap<String, String>) -> MthfrResult {
    let c677t = genotypes.get("rs1801133").cloned().unwrap_or_default();
    let a1298c = genotypes.get("rs1801131").cloned().unwrap_or_default();

    // Compound scoring needs both SNPs.
    if c677t.is_empty() || a1298c.is_empty() {
        return MthfrResult {
            c677t,
            a1298c,
            score: 0,
            assessment: "Incomplete MTHFR data".into(),
        };
    }

    // rs1801133: G is reference, A is the risk allele.
    let risk_677: u8 = match c677t.as_str() {
        "AA" => 2,
        "AG" => 1,
        _ => 0,
    };
    // rs1801131: T is reference, G is the risk allele.
    let risk_1298: u8 = match a1298c.as_str() {
        "GG" => 2,
        "GT" => 1,
        _ => 0,
    };
    let score = risk_677 + risk_1298;

    let assessment = match score {
        0 => "Normal MTHFR function. No supplementation needed.",
        1 => "Mildly reduced MTHFR. Consider methylfolate if homocysteine elevated.",
        2 => "Moderately reduced MTHFR. Methylfolate (L-5-MTHF) recommended.",
        3 => "Significantly reduced MTHFR (compound heterozygote). Methylfolate strongly recommended.",
        _ => "Severely reduced MTHFR. Methylfolate essential. Regular homocysteine monitoring.",
    };

    MthfrResult {
        c677t,
        a1298c,
        score,
        assessment: assessment.into(),
    }
}
/// Analyze pain sensitivity profile from COMT + OPRM1.
///
/// Returns `None` when either rs4680 (COMT) or rs1799971 (OPRM1) is
/// missing. Each SNP contributes 0-2 points (homozygous risk = 2,
/// heterozygous = 1) to a combined 0-4 score.
pub fn analyze_pain(genotypes: &HashMap<String, String>) -> Option<PainProfile> {
    let comt = genotypes.get("rs4680")?;
    let oprm1 = genotypes.get("rs1799971")?;

    // Per-SNP contribution to the combined score.
    let points = |gt: &str, homozygous: &str, heterozygous: &str| -> u8 {
        if gt == homozygous {
            2
        } else if gt == heterozygous {
            1
        } else {
            0
        }
    };
    let score = points(comt, "AA", "AG") + points(oprm1, "GG", "AG");

    let label = match score {
        0 => "Low",
        1 => "Low-Moderate",
        2 => "Moderate",
        3 => "Moderate-High",
        _ => "High",
    };

    // Any Met (A) allele at COMT is read as higher pain sensitivity; any
    // Asp (G) allele at OPRM1 as a blunted opioid response.
    let comt_note = if comt.contains('A') {
        "Higher pain sensitivity"
    } else {
        "Lower pain sensitivity"
    };
    let oprm1_note = if oprm1.contains('G') {
        "Reduced opioid response"
    } else {
        "Normal opioid response"
    };

    Some(PainProfile {
        comt: comt.clone(),
        oprm1: oprm1.clone(),
        score,
        label: label.into(),
        comt_note: comt_note.into(),
        oprm1_note: oprm1_note.into(),
    })
}
/// Category groupings for health variant display.
///
/// Returns `(category label, gene names)` pairs covering every gene in
/// the `HEALTH_VARIANTS` table, in display order.
pub fn variant_categories() -> Vec<(&'static str, Vec<&'static str>)> {
    let mut categories = Vec::with_capacity(4);
    categories.push(("Cancer Risk", vec!["TP53", "BRCA1", "BRCA2", "NQO1"]));
    categories.push(("Cardiovascular", vec!["SLCO1B1"]));
    categories.push((
        "Neurological",
        vec!["APOE", "COMT", "OPRM1", "OXTR", "HTR2A", "ANKK1/DRD2"],
    ));
    categories.push(("Metabolism", vec!["MTHFR", "CYP1A2", "MCM6/LCT"]));
    categories
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a genotype map from (rsid, genotype) string pairs.
    fn make_map(pairs: &[(&str, &str)]) -> HashMap<String, String> {
        pairs
            .iter()
            .map(|(k, v)| (k.to_string(), v.to_string()))
            .collect()
    }

    #[test]
    fn test_apoe_e3e3() {
        // No e4 (TT at rs429358) and no e2 (CC at rs7412) -> baseline e3/e3.
        let gts = make_map(&[("rs429358", "TT"), ("rs7412", "CC")]);
        let r = determine_apoe(&gts);
        assert!(r.genotype.contains("e3/e3"));
    }

    #[test]
    fn test_apoe_e2e3() {
        // One e2 allele (T at rs7412), no e4 -> protective e2/e3.
        let gts = make_map(&[("rs429358", "TT"), ("rs7412", "CT")]);
        let r = determine_apoe(&gts);
        assert!(r.genotype.contains("e2/e3"));
    }

    #[test]
    fn test_apoe_e4e4() {
        // Two e4 alleles (CC at rs429358) -> highest-risk e4/e4.
        let gts = make_map(&[("rs429358", "CC"), ("rs7412", "CC")]);
        let r = determine_apoe(&gts);
        assert!(r.genotype.contains("e4/e4"));
    }

    #[test]
    fn test_mthfr_normal() {
        // Both SNPs at reference (GG for C677T, TT for A1298C) -> score 0.
        let gts = make_map(&[("rs1801133", "GG"), ("rs1801131", "TT")]);
        let r = analyze_mthfr(&gts);
        assert_eq!(r.score, 0);
        assert!(r.assessment.contains("Normal"));
    }

    #[test]
    fn test_mthfr_compound() {
        // Heterozygous C677T (1) + homozygous A1298C (2) -> compound score 3.
        let gts = make_map(&[("rs1801133", "AG"), ("rs1801131", "GG")]);
        let r = analyze_mthfr(&gts);
        assert_eq!(r.score, 3);
        assert!(r.assessment.contains("compound"));
    }

    #[test]
    fn test_pain_low() {
        // Reference at both COMT and OPRM1 -> minimum pain score.
        let gts = make_map(&[("rs4680", "GG"), ("rs1799971", "AA")]);
        let p = analyze_pain(&gts).unwrap();
        assert_eq!(p.score, 0);
        assert_eq!(p.label, "Low");
    }

    #[test]
    fn test_pain_high() {
        // Homozygous risk at both SNPs -> maximum pain score of 4.
        let gts = make_map(&[("rs4680", "AA"), ("rs1799971", "GG")]);
        let p = analyze_pain(&gts).unwrap();
        assert_eq!(p.score, 4);
        assert_eq!(p.label, "High");
    }

    #[test]
    fn test_health_variants_lookup() {
        // Results follow HEALTH_VARIANTS declaration order: COMT (rs4680)
        // precedes CYP1A2 (rs762551) in the table.
        let gts = make_map(&[("rs762551", "AA"), ("rs4680", "AG")]);
        let results = analyze_health_variants(&gts);
        assert_eq!(results.len(), 2);
        assert_eq!(results[0].gene, "COMT");
        assert_eq!(results[1].gene, "CYP1A2");
    }
}

511
vendor/ruvector/examples/dna/src/kmer.rs vendored Normal file
View File

@@ -0,0 +1,511 @@
//! K-mer encoding and HNSW vector indexing for DNA sequences
//!
//! This module provides efficient k-mer based vector encoding for DNA sequences
//! with HNSW indexing for fast similarity search. Implements both k-mer frequency
//! vectors and MinHash sketching (Mash/sourmash algorithm).
use ruvector_core::{
types::{DbOptions, DistanceMetric, HnswConfig, QuantizationConfig, SearchQuery},
VectorDB, VectorEntry,
};
use std::collections::HashMap;
use thiserror::Error;
/// Errors raised by k-mer encoding, sketching, and indexing.
#[derive(Error, Debug)]
pub enum KmerError {
    /// k was 0 or exceeded 32 (the 2-bit-encoding limit)
    #[error("Invalid k-mer length: {0}")]
    InvalidKmerLength(usize),
    /// Sequence contained content that could not be processed
    #[error("Invalid DNA sequence: {0}")]
    InvalidSequence(String),
    /// Error propagated from the underlying vector database (via `#[from]`)
    #[error("Database error: {0}")]
    DatabaseError(#[from] ruvector_core::RuvectorError),
    /// Sequence shorter than k, so no complete k-mer window exists
    #[error("Empty sequence")]
    EmptySequence,
}

/// Module-local result alias for k-mer operations.
type Result<T> = std::result::Result<T, KmerError>;
/// Nucleotide to 2-bit encoding: A=0, C=1, G=2, T=3
///
/// Case-insensitive; RNA `U` maps like `T`. Any other byte (e.g. the
/// ambiguity code `N`) yields `None`.
#[inline]
fn nucleotide_to_bits(nuc: u8) -> Option<u8> {
    match nuc {
        b'A' | b'a' => Some(0),
        b'C' | b'c' => Some(1),
        b'G' | b'g' => Some(2),
        b'T' | b't' | b'U' | b'u' => Some(3),
        _ => None,
    }
}
/// Returns the reverse complement of a DNA sequence
///
/// Complements each base (A<->T, C<->G; RNA `U` is treated as `T`) and
/// reverses the order. Unrecognized bytes pass through unchanged.
fn reverse_complement(seq: &[u8]) -> Vec<u8> {
    let mut out = Vec::with_capacity(seq.len());
    for &base in seq.iter().rev() {
        out.push(match base.to_ascii_uppercase() {
            b'A' => b'T',
            b'T' | b'U' => b'A',
            b'C' => b'G',
            b'G' => b'C',
            other => other,
        });
    }
    out
}

/// Returns the canonical k-mer (lexicographically smaller of k-mer and its reverse complement)
pub fn canonical_kmer(kmer: &[u8]) -> Vec<u8> {
    let rc = reverse_complement(kmer);
    // Ties (palindromic k-mers) keep the forward orientation.
    match kmer.cmp(&rc[..]) {
        std::cmp::Ordering::Greater => rc,
        _ => kmer.to_vec(),
    }
}
/// K-mer encoder that converts DNA sequences into frequency vectors
///
/// Produces fixed-dimension, L2-normalized vectors via feature hashing of
/// canonical (strand-agnostic) k-mers.
pub struct KmerEncoder {
    /// K-mer length (valid range 1..=32)
    k: usize,
    /// Output vector length: min(4^k, 1024)
    dimensions: usize,
}
impl KmerEncoder {
    /// Create a new k-mer encoder for k-mers of length k
    ///
    /// # Arguments
    /// * `k` - Length of k-mers (typical values: 21, 31)
    ///
    /// Uses feature hashing to limit dimensionality for large k.
    ///
    /// # Errors
    /// Returns `KmerError::InvalidKmerLength` when `k` is 0 or above 32.
    pub fn new(k: usize) -> Result<Self> {
        if k == 0 || k > 32 {
            return Err(KmerError::InvalidKmerLength(k));
        }
        // Calculate dimensions: min(4^k, 1024) using feature hashing
        let max_kmers = 4_usize.saturating_pow(k as u32);
        let dimensions = max_kmers.min(1024);
        Ok(Self { k, dimensions })
    }

    /// Get the number of dimensions in the encoded vector
    pub fn dimensions(&self) -> usize {
        self.dimensions
    }

    /// Encode a DNA sequence into a k-mer frequency vector
    ///
    /// Uses canonical k-mer hashing (min of forward/reverse-complement hash)
    /// to count strand-agnostic k-mers, then normalizes to unit vector.
    ///
    /// # Errors
    /// Returns `KmerError::EmptySequence` when `seq` is shorter than `k`
    /// (no complete k-mer window exists).
    pub fn encode_sequence(&self, seq: &[u8]) -> Result<Vec<f32>> {
        if seq.len() < self.k {
            return Err(KmerError::EmptySequence);
        }
        let mut counts = vec![0u32; self.dimensions];
        let mut total = 0u32;
        // Extract all k-mers using a sliding window.
        // Avoid Vec allocation by hashing both strands and taking min.
        for window in seq.windows(self.k) {
            let fwd_hash = Self::fnv1a_hash(window);
            let rc_hash = Self::fnv1a_hash_rc(window);
            let canonical_hash = fwd_hash.min(rc_hash);
            let index = canonical_hash % self.dimensions;
            counts[index] = counts[index].saturating_add(1);
            total = total.saturating_add(1);
        }
        // Normalize to frequency vector and then to unit vector.
        // `total` >= 1 here: seq.len() >= k guarantees at least one window.
        let inv_total = 1.0 / total as f32;
        let mut vector: Vec<f32> = counts
            .iter()
            .map(|&count| count as f32 * inv_total)
            .collect();
        // L2 normalization (guarded against an all-zero vector)
        let norm: f32 = vector.iter().map(|x| x * x).sum::<f32>().sqrt();
        if norm > 0.0 {
            let inv_norm = 1.0 / norm;
            vector.iter_mut().for_each(|x| *x *= inv_norm);
        }
        Ok(vector)
    }

    /// FNV-1a hash of a byte slice
    #[inline]
    fn fnv1a_hash(data: &[u8]) -> usize {
        const FNV_OFFSET: u64 = 14695981039346656037;
        const FNV_PRIME: u64 = 1099511628211;
        let mut hash = FNV_OFFSET;
        for &byte in data {
            hash ^= byte as u64;
            hash = hash.wrapping_mul(FNV_PRIME);
        }
        hash as usize
    }

    /// FNV-1a hash of reverse complement (avoids Vec allocation)
    ///
    /// Iterates the window back-to-front, complementing each base on the
    /// fly, so the hash equals `fnv1a_hash(reverse_complement(data))`.
    #[inline]
    fn fnv1a_hash_rc(data: &[u8]) -> usize {
        const FNV_OFFSET: u64 = 14695981039346656037;
        const FNV_PRIME: u64 = 1099511628211;
        let mut hash = FNV_OFFSET;
        for &byte in data.iter().rev() {
            let comp = match byte.to_ascii_uppercase() {
                b'A' => b'T',
                b'T' | b'U' => b'A',
                b'C' => b'G',
                b'G' => b'C',
                n => n,
            };
            hash ^= comp as u64;
            hash = hash.wrapping_mul(FNV_PRIME);
        }
        hash as usize
    }

    // Removed the dead private `hash_kmer` method: it was unused, merely
    // forwarded to `fnv1a_hash`, and its name implied index bucketing it
    // never performed (it ignored `self.dimensions`).
}
/// MinHash sketch for fast sequence similarity (Mash/sourmash algorithm)
///
/// Stores the smallest canonical k-mer hashes of a sequence; Jaccard
/// similarity between sequences is then estimated by comparing sketches
/// instead of full k-mer sets.
pub struct MinHashSketch {
    /// Maximum number of hash values retained in the sketch
    num_hashes: usize,
    /// Ascending list of retained hashes (populated by `sketch`)
    hashes: Vec<u64>,
}
impl MinHashSketch {
    /// Create a new MinHash sketch with the given number of hashes
    ///
    /// # Arguments
    /// * `num_hashes` - Number of hash values to keep (typically 1000)
    pub fn new(num_hashes: usize) -> Self {
        Self {
            num_hashes,
            hashes: Vec::new(),
        }
    }

    /// Compute MinHash signature for a DNA sequence
    ///
    /// Implements a bottom-k sketch over canonical k-mer hashes: each
    /// window is hashed on both strands, the smaller hash kept, and the
    /// `num_hashes` smallest *distinct* values retained. Deduplicating is
    /// the Mash/sourmash convention — repeated k-mers must count once, or
    /// duplicates consume sketch slots and bias the Jaccard estimate.
    ///
    /// # Errors
    /// Returns `KmerError::EmptySequence` when `seq` is shorter than `k`.
    pub fn sketch(&mut self, seq: &[u8], k: usize) -> Result<&[u64]> {
        if seq.len() < k {
            return Err(KmerError::EmptySequence);
        }
        let mut all_hashes = Vec::with_capacity(seq.len() - k + 1);
        // Hash all k-mers using dual-hash (no Vec allocation per k-mer)
        for window in seq.windows(k) {
            let fwd = Self::hash_kmer_64_slice(window);
            let rc = Self::hash_kmer_64_rc(window);
            all_hashes.push(fwd.min(rc));
        }
        // Bottom-k over the DISTINCT hash set: sort, drop duplicates,
        // keep the smallest num_hashes values.
        all_hashes.sort_unstable();
        all_hashes.dedup();
        all_hashes.truncate(self.num_hashes);
        self.hashes = all_hashes;
        Ok(&self.hashes)
    }

    /// Compute Jaccard distance between two MinHash sketches
    ///
    /// Returns 1.0 when either sketch is empty; otherwise
    /// `1 - |A ∩ B| / |A ∪ B|` estimated from the two sorted sketches
    /// via a linear merge.
    pub fn jaccard_distance(&self, other: &MinHashSketch) -> f32 {
        if self.hashes.is_empty() || other.hashes.is_empty() {
            return 1.0;
        }
        let mut intersection = 0;
        let mut i = 0;
        let mut j = 0;
        // Count intersection using sorted arrays
        while i < self.hashes.len() && j < other.hashes.len() {
            if self.hashes[i] == other.hashes[j] {
                intersection += 1;
                i += 1;
                j += 1;
            } else if self.hashes[i] < other.hashes[j] {
                i += 1;
            } else {
                j += 1;
            }
        }
        let union = self.hashes.len() + other.hashes.len() - intersection;
        if union == 0 {
            return 0.0;
        }
        let jaccard_similarity = intersection as f32 / union as f32;
        1.0 - jaccard_similarity
    }

    /// Hash a k-mer using MurmurHash3-like algorithm (forward strand)
    #[inline]
    fn hash_kmer_64_slice(kmer: &[u8]) -> u64 {
        const C1: u64 = 0x87c37b91114253d5;
        const C2: u64 = 0x4cf5ad432745937f;
        let mut h = 0u64;
        for &byte in kmer {
            let mut k = byte as u64;
            k = k.wrapping_mul(C1);
            k = k.rotate_left(31);
            k = k.wrapping_mul(C2);
            h ^= k;
            h = h.rotate_left(27);
            h = h.wrapping_mul(5).wrapping_add(0x52dce729);
        }
        h ^ kmer.len() as u64
    }

    /// Hash reverse complement of a k-mer (no Vec allocation)
    ///
    /// Complements each base back-to-front on the fly, so the result
    /// equals `hash_kmer_64_slice` of the reverse-complemented k-mer.
    #[inline]
    fn hash_kmer_64_rc(kmer: &[u8]) -> u64 {
        const C1: u64 = 0x87c37b91114253d5;
        const C2: u64 = 0x4cf5ad432745937f;
        let mut h = 0u64;
        for &byte in kmer.iter().rev() {
            let comp = match byte.to_ascii_uppercase() {
                b'A' => b'T',
                b'T' | b'U' => b'A',
                b'C' => b'G',
                b'G' => b'C',
                n => n,
            };
            let mut k = comp as u64;
            k = k.wrapping_mul(C1);
            k = k.rotate_left(31);
            k = k.wrapping_mul(C2);
            h ^= k;
            h = h.rotate_left(27);
            h = h.wrapping_mul(5).wrapping_add(0x52dce729);
        }
        h ^ kmer.len() as u64
    }

    /// Get the hashes
    pub fn hashes(&self) -> &[u64] {
        &self.hashes
    }
}
/// Search result for k-mer index queries
#[derive(Debug, Clone)]
pub struct KmerSearchResult {
    /// Identifier of the matched sequence
    pub id: String,
    /// Raw score returned by the vector database search
    pub score: f32,
    /// Mirrors `score` unchanged.
    /// NOTE(review): presumably a cosine distance given the index's
    /// `DistanceMetric::Cosine` — confirm against ruvector-core's contract.
    pub distance: f32,
}
/// K-mer index wrapping VectorDB for sequence similarity search
pub struct KmerIndex {
    /// Backing HNSW-indexed vector database
    db: VectorDB,
    /// Encoder turning sequences into fixed-dimension frequency vectors
    encoder: KmerEncoder,
    /// K-mer length used by the encoder
    k: usize,
}
impl KmerIndex {
/// Create a new k-mer index
///
/// # Arguments
/// * `k` - K-mer length
/// * `dimensions` - Vector dimensions (should match encoder dimensions)
pub fn new(k: usize, dimensions: usize) -> Result<Self> {
let encoder = KmerEncoder::new(k)?;
// Verify dimensions match
if encoder.dimensions() != dimensions {
return Err(KmerError::InvalidKmerLength(k));
}
let options = DbOptions {
dimensions,
distance_metric: DistanceMetric::Cosine,
storage_path: format!("./kmer_index_k{}.db", k),
hnsw_config: Some(HnswConfig {
m: 32,
ef_construction: 200,
ef_search: 100,
max_elements: 1_000_000,
}),
quantization: Some(QuantizationConfig::Scalar),
};
let db = VectorDB::new(options)?;
Ok(Self { db, encoder, k })
}
/// Index a single DNA sequence
pub fn index_sequence(&self, id: &str, sequence: &[u8]) -> Result<()> {
let vector = self.encoder.encode_sequence(sequence)?;
let entry = VectorEntry {
id: Some(id.to_string()),
vector,
metadata: Some({
let mut meta = HashMap::new();
meta.insert("length".to_string(), serde_json::json!(sequence.len()));
meta.insert("k".to_string(), serde_json::json!(self.k));
meta
}),
};
self.db.insert(entry)?;
Ok(())
}
/// Index multiple sequences in a batch
pub fn index_batch(&self, sequences: Vec<(&str, &[u8])>) -> Result<()> {
let entries: Result<Vec<VectorEntry>> = sequences
.into_iter()
.map(|(id, seq)| {
let vector = self.encoder.encode_sequence(seq)?;
Ok(VectorEntry {
id: Some(id.to_string()),
vector,
metadata: Some({
let mut meta = HashMap::new();
meta.insert("length".to_string(), serde_json::json!(seq.len()));
meta.insert("k".to_string(), serde_json::json!(self.k));
meta
}),
})
})
.collect();
self.db.insert_batch(entries?)?;
Ok(())
}
/// Search for similar sequences
pub fn search_similar(&self, query: &[u8], top_k: usize) -> Result<Vec<KmerSearchResult>> {
let query_vector = self.encoder.encode_sequence(query)?;
let search_query = SearchQuery {
vector: query_vector,
k: top_k,
filter: None,
ef_search: None,
};
let results = self.db.search(search_query)?;
Ok(results
.into_iter()
.map(|r| KmerSearchResult {
id: r.id,
score: r.score,
distance: r.score,
})
.collect())
}
/// Search for sequences with similarity above a threshold
pub fn search_with_threshold(
&self,
query: &[u8],
threshold: f32,
) -> Result<Vec<KmerSearchResult>> {
// Search with a larger k to ensure we get all candidates
let results = self.search_similar(query, 100)?;
Ok(results
.into_iter()
.filter(|r| r.distance <= threshold)
.collect())
}
/// Get the k-mer length
pub fn k(&self) -> usize {
self.k
}
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_nucleotide_encoding() {
        // 2-bit mapping is case-insensitive; ambiguity codes yield None.
        assert_eq!(nucleotide_to_bits(b'A'), Some(0));
        assert_eq!(nucleotide_to_bits(b'C'), Some(1));
        assert_eq!(nucleotide_to_bits(b'G'), Some(2));
        assert_eq!(nucleotide_to_bits(b'T'), Some(3));
        assert_eq!(nucleotide_to_bits(b'a'), Some(0));
        assert_eq!(nucleotide_to_bits(b'N'), None);
    }

    #[test]
    fn test_reverse_complement() {
        let seq = b"ATCG";
        let rc = reverse_complement(seq);
        assert_eq!(rc, b"CGAT");
    }

    #[test]
    fn test_canonical_kmer() {
        // A k-mer and its reverse complement must canonicalize identically.
        let kmer1 = b"ATCG";
        let kmer2 = b"CGAT"; // reverse complement
        let canon1 = canonical_kmer(kmer1);
        let canon2 = canonical_kmer(kmer2);
        assert_eq!(canon1, canon2);
    }

    #[test]
    fn test_kmer_encoder_creation() {
        // k=3 -> 4^3 = 64 dimensions, below the 1024 cap.
        let encoder = KmerEncoder::new(3).unwrap();
        assert_eq!(encoder.k, 3);
        assert_eq!(encoder.dimensions(), 64);
    }

    #[test]
    fn test_kmer_encoder_large_k() {
        let encoder = KmerEncoder::new(21).unwrap();
        assert_eq!(encoder.k, 21);
        assert_eq!(encoder.dimensions(), 1024); // Capped by feature hashing
    }

    #[test]
    fn test_encode_sequence() {
        let encoder = KmerEncoder::new(3).unwrap();
        let seq = b"ATCGATCG";
        let vector = encoder.encode_sequence(seq).unwrap();
        assert_eq!(vector.len(), encoder.dimensions());
        // Check L2 normalization
        let norm: f32 = vector.iter().map(|x| x * x).sum::<f32>().sqrt();
        assert!((norm - 1.0).abs() < 1e-5);
    }

    #[test]
    fn test_minhash_sketch() {
        let mut sketch = MinHashSketch::new(100);
        let seq = b"ATCGATCGATCGATCGATCG";
        sketch.sketch(seq, 5).unwrap();
        // Sketch may hold fewer values than num_hashes for short sequences.
        assert!(sketch.hashes().len() <= 100);
    }

    #[test]
    fn test_jaccard_distance() {
        let mut sketch1 = MinHashSketch::new(100);
        let mut sketch2 = MinHashSketch::new(100);
        let seq1 = b"ATCGATCGATCGATCGATCG";
        let seq2 = b"ATCGATCGATCGATCGATCG"; // Identical
        sketch1.sketch(seq1, 5).unwrap();
        sketch2.sketch(seq2, 5).unwrap();
        let distance = sketch1.jaccard_distance(&sketch2);
        assert!(distance < 0.01); // Should be very similar
    }
}

View File

@@ -0,0 +1,365 @@
//! K-mer Graph PageRank for DNA Sequence Ranking
//!
//! Builds a k-mer co-occurrence graph from DNA sequences and uses
//! ruvector-solver's Forward Push Personalized PageRank (PPR) to rank
//! sequences by structural centrality in the k-mer overlap network.
//!
//! This enables identifying the most "representative" sequences in a
//! collection — those whose k-mer profiles are most connected to others.
use ruvector_solver::forward_push::ForwardPushSolver;
use ruvector_solver::types::CsrMatrix;
/// Result of PageRank-based sequence ranking
#[derive(Debug, Clone)]
pub struct SequenceRank {
    /// Index of the sequence in the input collection
    pub index: usize,
    /// PageRank score, normalized over the collection (higher = more central)
    pub score: f64,
}
/// K-mer graph builder and PageRank ranker.
///
/// Constructs a weighted graph where:
/// - Nodes are sequences
/// - Edge weight(i, j) = number of shared k-mers between sequences i and j
///
/// Then uses Forward Push PPR to compute centrality scores.
pub struct KmerGraphRanker {
    /// K-mer length used for fingerprinting (typical: 11-31)
    k: usize,
    /// Number of hash buckets per fingerprint vector (must be non-zero)
    hash_dimensions: usize,
}
impl KmerGraphRanker {
/// Create a new ranker with the given k-mer length.
///
/// # Arguments
/// * `k` - K-mer length (typical: 11-31)
/// * `hash_dimensions` - Number of hash buckets for k-mer fingerprints (default: 256)
pub fn new(k: usize, hash_dimensions: usize) -> Self {
Self { k, hash_dimensions }
}
/// Build a k-mer fingerprint vector for a DNA sequence.
///
/// Uses FNV-1a hashing with canonical k-mers (min of forward/reverse-complement)
/// to produce a fixed-size frequency vector.
fn fingerprint(&self, seq: &[u8]) -> Vec<f64> {
if seq.len() < self.k {
return vec![0.0; self.hash_dimensions];
}
let mut counts = vec![0u32; self.hash_dimensions];
for window in seq.windows(self.k) {
let fwd = Self::fnv1a_hash(window);
let rc = Self::fnv1a_hash_rc(window);
let canonical = fwd.min(rc);
counts[canonical % self.hash_dimensions] += 1;
}
// Normalize to probability distribution
let total: u32 = counts.iter().sum();
if total == 0 {
return vec![0.0; self.hash_dimensions];
}
let inv = 1.0 / total as f64;
counts.iter().map(|&c| c as f64 * inv).collect()
}
/// Compute cosine similarity between two fingerprint vectors.
fn cosine_similarity(a: &[f64], b: &[f64]) -> f64 {
let dot: f64 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
let norm_a: f64 = a.iter().map(|x| x * x).sum::<f64>().sqrt();
let norm_b: f64 = b.iter().map(|x| x * x).sum::<f64>().sqrt();
if norm_a < 1e-15 || norm_b < 1e-15 {
return 0.0;
}
dot / (norm_a * norm_b)
}
/// Build the k-mer overlap graph as a column-stochastic transition matrix.
///
/// Edge weights are cosine similarities between k-mer fingerprints,
/// normalized to form a stochastic matrix (columns sum to 1).
fn build_transition_matrix(&self, sequences: &[&[u8]], threshold: f64) -> CsrMatrix<f64> {
let n = sequences.len();
let fingerprints: Vec<Vec<f64>> =
sequences.iter().map(|seq| self.fingerprint(seq)).collect();
// Build weighted adjacency with thresholding
let mut col_sums = vec![0.0f64; n];
let mut entries: Vec<(usize, usize, f64)> = Vec::new();
for i in 0..n {
for j in 0..n {
if i == j {
continue;
}
let sim = Self::cosine_similarity(&fingerprints[i], &fingerprints[j]);
if sim > threshold {
entries.push((i, j, sim));
col_sums[j] += sim;
}
}
}
// Normalize columns to make stochastic
// Also add self-loops for isolated nodes
let mut normalized: Vec<(usize, usize, f64)> = entries
.into_iter()
.map(|(i, j, w)| {
let norm = if col_sums[j] > 1e-15 {
col_sums[j]
} else {
1.0
};
(i, j, w / norm)
})
.collect();
// Add self-loops for isolated nodes (dangling node handling)
for j in 0..n {
if col_sums[j] < 1e-15 {
normalized.push((j, j, 1.0));
}
}
CsrMatrix::<f64>::from_coo(n, n, normalized)
}
/// Rank sequences by PageRank centrality in the k-mer overlap graph.
///
/// Uses ruvector-solver's Forward Push algorithm for sublinear-time
/// Personalized PageRank computation.
///
/// # Arguments
/// * `sequences` - Collection of DNA sequences (as byte slices)
/// * `alpha` - Teleportation probability (default: 0.15)
/// * `epsilon` - PPR approximation tolerance (default: 1e-6)
/// * `similarity_threshold` - Minimum cosine similarity to create an edge (default: 0.1)
///
/// # Returns
/// Sequences ranked by descending PageRank score
pub fn rank_sequences(
&self,
sequences: &[&[u8]],
alpha: f64,
epsilon: f64,
similarity_threshold: f64,
) -> Vec<SequenceRank> {
let n = sequences.len();
if n == 0 {
return vec![];
}
if n == 1 {
return vec![SequenceRank {
index: 0,
score: 1.0,
}];
}
let matrix = self.build_transition_matrix(sequences, similarity_threshold);
// Use Forward Push PPR from each node, accumulate global PageRank
let solver = ForwardPushSolver::new(alpha, epsilon);
let mut global_rank = vec![0.0f64; n];
// Compute PPR from each node (or a representative subset for large graphs)
let num_seeds = n.min(50); // Limit seeds for large collections
let step = if n > num_seeds { n / num_seeds } else { 1 };
for seed_idx in (0..n).step_by(step) {
match solver.ppr_from_source(&matrix, seed_idx) {
Ok(ppr_result) => {
for (node, score) in ppr_result {
if node < n {
global_rank[node] += score;
}
}
}
Err(_) => {
// If PPR fails for this seed, skip it
continue;
}
}
}
// Normalize
let total: f64 = global_rank.iter().sum();
if total > 1e-15 {
let inv = 1.0 / total;
for score in &mut global_rank {
*score *= inv;
}
}
// Build ranked results
let mut results: Vec<SequenceRank> = global_rank
.into_iter()
.enumerate()
.map(|(index, score)| SequenceRank { index, score })
.collect();
// Sort by score descending
results.sort_by(|a, b| {
b.score
.partial_cmp(&a.score)
.unwrap_or(std::cmp::Ordering::Equal)
});
results
}
/// Compute pairwise PageRank similarity between two specific sequences
/// within the context of a collection.
///
/// Runs Forward Push PPR seeded at `source` and reports the resulting
/// probability mass at `target`; out-of-range indices or a failed solve
/// yield `0.0`.
pub fn pairwise_similarity(
    &self,
    sequences: &[&[u8]],
    source: usize,
    target: usize,
    alpha: f64,
    epsilon: f64,
    similarity_threshold: f64,
) -> f64 {
    let count = sequences.len();
    if source >= count || target >= count {
        return 0.0;
    }
    let matrix = self.build_transition_matrix(sequences, similarity_threshold);
    let solver = ForwardPushSolver::new(alpha, epsilon);
    solver
        .ppr_from_source(&matrix, source)
        .ok()
        .and_then(|ppr| {
            ppr.into_iter().find_map(|(node, score)| {
                if node == target {
                    Some(score)
                } else {
                    None
                }
            })
        })
        .unwrap_or(0.0)
}
/// FNV-1a hash of a byte slice (64-bit constants, truncated to `usize`).
#[inline]
fn fnv1a_hash(data: &[u8]) -> usize {
    const FNV_OFFSET: u64 = 14695981039346656037;
    const FNV_PRIME: u64 = 1099511628211;
    // Classic FNV-1a: xor the byte in, then multiply by the prime.
    data.iter()
        .fold(FNV_OFFSET, |acc, &byte| {
            (acc ^ byte as u64).wrapping_mul(FNV_PRIME)
        }) as usize
}
/// FNV-1a hash of the reverse complement of `data`, computed without
/// materializing the complemented sequence. Case-insensitive; `U` is
/// treated like `T`, and non-nucleotide bytes pass through unchanged.
#[inline]
fn fnv1a_hash_rc(data: &[u8]) -> usize {
    const FNV_OFFSET: u64 = 14695981039346656037;
    const FNV_PRIME: u64 = 1099511628211;
    data.iter()
        .rev()
        .map(|&byte| match byte.to_ascii_uppercase() {
            b'A' => b'T',
            b'T' | b'U' => b'A',
            b'C' => b'G',
            b'G' => b'C',
            other => other,
        })
        .fold(FNV_OFFSET, |acc, comp| {
            (acc ^ comp as u64).wrapping_mul(FNV_PRIME)
        }) as usize
}
}
#[cfg(test)]
mod tests {
    use super::*;

    // A fingerprint is a fixed-size bucket histogram over hashed k-mers,
    // normalized to a probability distribution.
    #[test]
    fn test_fingerprint() {
        let ranker = KmerGraphRanker::new(3, 64);
        let seq = b"ATCGATCGATCG";
        let fp = ranker.fingerprint(seq);
        assert_eq!(fp.len(), 64);
        // Should be a probability distribution (sums to ~1)
        let sum: f64 = fp.iter().sum();
        assert!((sum - 1.0).abs() < 1e-10);
    }

    // Identical vectors have cosine similarity 1.
    #[test]
    fn test_cosine_similarity_identical() {
        let a = vec![1.0, 2.0, 3.0];
        let b = vec![1.0, 2.0, 3.0];
        let sim = KmerGraphRanker::cosine_similarity(&a, &b);
        assert!((sim - 1.0).abs() < 1e-10);
    }

    // Orthogonal vectors have cosine similarity 0.
    #[test]
    fn test_cosine_similarity_orthogonal() {
        let a = vec![1.0, 0.0];
        let b = vec![0.0, 1.0];
        let sim = KmerGraphRanker::cosine_similarity(&a, &b);
        assert!(sim.abs() < 1e-10);
    }

    // Three-sequence PageRank: two identical sequences, one distinct.
    // Scores should form a distribution and the identical pair should
    // receive comparable rank mass.
    #[test]
    fn test_rank_sequences_basic() {
        let ranker = KmerGraphRanker::new(3, 64);
        let seq1 = b"ATCGATCGATCGATCG";
        let seq2 = b"ATCGATCGATCGATCG"; // identical to seq1
        let seq3 = b"GCTAGCTAGCTAGCTA"; // different
        let sequences: Vec<&[u8]> = vec![seq1, seq2, seq3];
        let ranks = ranker.rank_sequences(&sequences, 0.15, 1e-4, 0.01);
        assert_eq!(ranks.len(), 3);
        // All ranks should sum to 1
        let total: f64 = ranks.iter().map(|r| r.score).sum();
        assert!((total - 1.0).abs() < 1e-5);
        // Identical sequences should have similar ranks
        let rank_0 = ranks.iter().find(|r| r.index == 0).unwrap().score;
        let rank_1 = ranks.iter().find(|r| r.index == 1).unwrap().score;
        assert!((rank_0 - rank_1).abs() < 0.3); // roughly similar
    }

    // Empty input yields no ranks.
    #[test]
    fn test_rank_empty() {
        let ranker = KmerGraphRanker::new(3, 64);
        let sequences: Vec<&[u8]> = vec![];
        let ranks = ranker.rank_sequences(&sequences, 0.15, 1e-4, 0.1);
        assert!(ranks.is_empty());
    }

    // A single sequence gets the entire rank mass (score 1.0).
    #[test]
    fn test_rank_single() {
        let ranker = KmerGraphRanker::new(3, 64);
        let sequences: Vec<&[u8]> = vec![b"ATCGATCG"];
        let ranks = ranker.rank_sequences(&sequences, 0.15, 1e-4, 0.1);
        assert_eq!(ranks.len(), 1);
        assert!((ranks[0].score - 1.0).abs() < 1e-10);
    }

    // PPR mass flowing to an identical sequence should be at least as
    // large as the mass flowing to a dissimilar one.
    #[test]
    fn test_pairwise_similarity() {
        let ranker = KmerGraphRanker::new(3, 64);
        let seq1 = b"ATCGATCGATCGATCG";
        let seq2 = b"ATCGATCGATCGATCG";
        let seq3 = b"NNNNNNNNNNNNNNNN"; // very different
        let sequences: Vec<&[u8]> = vec![seq1, seq2, seq3];
        let sim_01 = ranker.pairwise_similarity(&sequences, 0, 1, 0.15, 1e-4, 0.01);
        let sim_02 = ranker.pairwise_similarity(&sequences, 0, 2, 0.15, 1e-4, 0.01);
        // Identical sequences should have higher similarity
        assert!(sim_01 >= sim_02);
    }
}

84
vendor/ruvector/examples/dna/src/lib.rs vendored Normal file
View File

@@ -0,0 +1,84 @@
//! # rvDNA — AI-Native Genomic Analysis
//!
//! Fast, accurate genomic analysis in pure Rust with WASM support.
//! Includes the `.rvdna` binary file format for storing pre-computed
//! AI features alongside raw DNA sequences.
//!
//! - **K-mer HNSW Indexing**: Sequence similarity search via vector embeddings
//! - **Smith-Waterman Alignment**: Local alignment with CIGAR and mapping quality
//! - **Bayesian Variant Calling**: SNP/indel detection with Phred quality scores
//! - **Protein Translation**: DNA-to-protein with GNN contact graph prediction
//! - **Epigenomics**: Methylation profiling and Horvath biological age clock
//! - **Pharmacogenomics**: CYP enzyme star allele calling and drug recommendations
//! - **Pipeline Orchestration**: DAG-based multi-stage execution
//! - **RVDNA Format**: AI-native binary file format with pre-computed tensors
#![warn(missing_docs)]
#![allow(clippy::all)]
pub mod alignment;
pub mod biomarker;
pub mod biomarker_stream;
pub mod epigenomics;
pub mod error;
pub mod genotyping;
pub mod health;
pub mod kmer;
pub mod kmer_pagerank;
pub mod pharma;
pub mod pipeline;
pub mod protein;
pub mod real_data;
pub mod rvdna;
pub mod types;
pub mod variant;
pub use alignment::{AlignmentConfig, SmithWaterman};
pub use epigenomics::{
CancerSignalDetector, CancerSignalResult, CpGSite, HorvathClock, MethylationProfile,
};
pub use error::{DnaError, Result};
pub use pharma::{
call_cyp2c19_allele, call_star_allele, get_recommendations, predict_cyp2c19_phenotype,
predict_phenotype, Cyp2c19Allele, DrugRecommendation, MetabolizerPhenotype, PharmaVariant,
StarAllele,
};
pub use protein::{isoelectric_point, molecular_weight, translate_dna, AminoAcid};
pub use rvdna::{
decode_2bit, encode_2bit, fasta_to_rvdna, Codec, KmerVectorBlock, RvdnaHeader, RvdnaReader,
RvdnaStats, RvdnaWriter, SparseAttention, VariantTensor,
};
pub use types::{
AlignmentResult, AnalysisConfig, CigarOp, ContactGraph, DnaSequence, GenomicPosition,
KmerIndex, Nucleotide, ProteinResidue, ProteinSequence, QualityScore, Variant,
};
pub use variant::{
FilterStatus, Genotype, PileupColumn, VariantCall, VariantCaller, VariantCallerConfig,
};
pub use ruvector_core::{
types::{DbOptions, DistanceMetric, HnswConfig, SearchQuery, SearchResult, VectorEntry},
VectorDB,
};
pub use biomarker::{BiomarkerClassification, BiomarkerProfile, BiomarkerReference, CategoryScore};
pub use biomarker_stream::{
BiomarkerReading, RingBuffer, StreamConfig, StreamProcessor, StreamStats,
};
pub use genotyping::{
CallConfidence, CypDiplotype, GenomeBuild, GenotypeAnalysis, GenotypeData, Snp,
};
pub use health::{ApoeResult, HealthVariantResult, MthfrResult, PainProfile};
pub use kmer_pagerank::{KmerGraphRanker, SequenceRank};
/// Prelude module for common imports
///
/// `use rvdna::prelude::*;` brings in the alignment, epigenomics, k-mer,
/// pharmacogenomics, protein, and variant APIs, plus the crate's shared
/// [`DnaError`]/[`Result`] types and the core sequence types.
pub mod prelude {
    pub use crate::alignment::*;
    pub use crate::epigenomics::*;
    pub use crate::error::{DnaError, Result};
    pub use crate::kmer::*;
    pub use crate::pharma::*;
    pub use crate::protein::*;
    pub use crate::types::*;
    pub use crate::variant::*;
}

427
vendor/ruvector/examples/dna/src/main.rs vendored Normal file
View File

@@ -0,0 +1,427 @@
//! DNA Analyzer Demo - RuVector Genomic Analysis Pipeline
//!
//! Demonstrates SOTA genomic analysis using:
//! - Real human gene sequences (HBB, TP53, BRCA1, CYP2D6, INS)
//! - HNSW k-mer indexing for fast sequence search
//! - Attention-based sequence alignment
//! - Variant calling from pileup data
//! - Protein translation and contact prediction
//! - Epigenetic age prediction (Horvath clock)
//! - Pharmacogenomic star allele calling
//! - RVDNA AI-native file format with pre-computed tensors
use ::rvdna::prelude::*;
use ::rvdna::{
alignment::{AlignmentConfig, SmithWaterman},
epigenomics::{HorvathClock, MethylationProfile},
genotyping, pharma,
protein::translate_dna,
real_data,
rvdna::{
self, Codec, KmerVectorBlock, RvdnaReader, RvdnaWriter, SparseAttention, VariantTensor,
},
variant::{PileupColumn, VariantCaller, VariantCallerConfig},
};
use rand::Rng;
use tracing::{info, Level};
use tracing_subscriber::FmtSubscriber;
/// Demo entry point: runs the full genomic pipeline (load sequences,
/// k-mer similarity, alignment, variant calling, translation,
/// epigenomics, pharmacogenomics, RVDNA round-trip) on bundled gene
/// sequences — or, when a file path is passed as the first CLI argument,
/// runs the 23andMe genotyping report instead.
fn main() -> anyhow::Result<()> {
    // Check for 23andMe file argument
    let args: Vec<String> = std::env::args().collect();
    if args.len() > 1 {
        return run_23andme(&args[1]);
    }
    let subscriber = FmtSubscriber::builder()
        .with_max_level(Level::INFO)
        .finish();
    tracing::subscriber::set_global_default(subscriber)?;
    info!("RuVector DNA Analyzer - Genomic Analysis Pipeline");
    info!("================================================");
    info!("Using real human gene sequences from NCBI RefSeq");
    // -----------------------------------------------------------------------
    // Stage 1: Load real human gene sequences
    // -----------------------------------------------------------------------
    info!("\nStage 1: Loading real human gene sequences");
    let total_start = std::time::Instant::now();
    let hbb = DnaSequence::from_str(real_data::HBB_CODING_SEQUENCE)?;
    let tp53 = DnaSequence::from_str(real_data::TP53_EXONS_5_8)?;
    let brca1 = DnaSequence::from_str(real_data::BRCA1_EXON11_FRAGMENT)?;
    let cyp2d6 = DnaSequence::from_str(real_data::CYP2D6_CODING)?;
    let insulin = DnaSequence::from_str(real_data::INS_CODING)?;
    info!(
        " HBB (hemoglobin beta): {} bp [chr11, sickle cell gene]",
        hbb.len()
    );
    info!(
        " TP53 (tumor suppressor): {} bp [chr17, exons 5-8]",
        tp53.len()
    );
    info!(
        " BRCA1 (DNA repair): {} bp [chr17, exon 11 fragment]",
        brca1.len()
    );
    info!(
        " CYP2D6 (drug metabolism): {} bp [chr22, pharmacogenomic]",
        cyp2d6.len()
    );
    info!(
        " INS (insulin): {} bp [chr11, preproinsulin]",
        insulin.len()
    );
    let gc_hbb = calculate_gc_content(&hbb);
    let gc_tp53 = calculate_gc_content(&tp53);
    info!(" HBB GC content: {:.1}%", gc_hbb * 100.0);
    info!(" TP53 GC content: {:.1}%", gc_tp53 * 100.0);
    // -----------------------------------------------------------------------
    // Stage 2: K-mer similarity search across gene panel
    // -----------------------------------------------------------------------
    info!("\nStage 2: K-mer similarity search across gene panel");
    let kmer_start = std::time::Instant::now();
    let hbb_vec = hbb.to_kmer_vector(11, 512)?;
    let tp53_vec = tp53.to_kmer_vector(11, 512)?;
    let brca1_vec = brca1.to_kmer_vector(11, 512)?;
    let cyp2d6_vec = cyp2d6.to_kmer_vector(11, 512)?;
    // NOTE(review): ins_vec is computed but never compared below —
    // dead work unless kept deliberately for timing symmetry.
    let ins_vec = insulin.to_kmer_vector(11, 512)?;
    let sim_hbb_tp53 = cosine_similarity(&hbb_vec, &tp53_vec);
    let sim_hbb_brca1 = cosine_similarity(&hbb_vec, &brca1_vec);
    let sim_tp53_brca1 = cosine_similarity(&tp53_vec, &brca1_vec);
    let sim_hbb_cyp2d6 = cosine_similarity(&hbb_vec, &cyp2d6_vec);
    info!(" K-mer similarity matrix (cosine, k=11, d=512):");
    info!(" HBB vs TP53: {:.4}", sim_hbb_tp53);
    info!(" HBB vs BRCA1: {:.4}", sim_hbb_brca1);
    info!(" TP53 vs BRCA1: {:.4}", sim_tp53_brca1);
    info!(" HBB vs CYP2D6:{:.4}", sim_hbb_cyp2d6);
    info!(" K-mer encoding time: {:?}", kmer_start.elapsed());
    // -----------------------------------------------------------------------
    // Stage 3: Align HBB query fragment against full HBB
    // -----------------------------------------------------------------------
    info!("\nStage 3: Smith-Waterman alignment on HBB");
    let align_start = std::time::Instant::now();
    // Extract a 50bp fragment from the middle of HBB (simulating a sequencing read)
    let hbb_str = hbb.to_string();
    let fragment_start = 100;
    let fragment_end = (fragment_start + 50).min(hbb_str.len());
    let query_fragment = DnaSequence::from_str(&hbb_str[fragment_start..fragment_end])?;
    let aligner = SmithWaterman::new(AlignmentConfig::default());
    let alignment = aligner.align(&query_fragment, &hbb)?;
    info!(
        " Query: HBB[{}..{}] ({} bp read)",
        fragment_start,
        fragment_end,
        query_fragment.len()
    );
    info!(" Alignment score: {}", alignment.score);
    info!(
        " Mapped position: {} (expected: {})",
        alignment.mapped_position.position, fragment_start
    );
    info!(" Mapping quality: {}", alignment.mapping_quality.value());
    info!(" CIGAR: {} ops", alignment.cigar.len());
    info!(" Alignment time: {:?}", align_start.elapsed());
    // -----------------------------------------------------------------------
    // Stage 4: Variant calling on HBB (sickle cell region)
    // -----------------------------------------------------------------------
    info!("\nStage 4: Variant calling on HBB (sickle cell detection)");
    let variant_start = std::time::Instant::now();
    let caller = VariantCaller::new(VariantCallerConfig::default());
    let hbb_bytes = hbb_str.as_bytes();
    let mut variant_count = 0;
    let mut rng = rand::thread_rng();
    // Simulate sequencing reads across HBB with a sickle cell mutation at position 20
    let sickle_pos = real_data::hbb_variants::SICKLE_CELL_POS;
    for i in 0..hbb_bytes.len().min(200) {
        // 20-50x simulated coverage per position; ~2% uniform error rate.
        let depth = rng.gen_range(20..51);
        let bases: Vec<u8> = (0..depth)
            .map(|_| {
                if i == sickle_pos && rng.gen::<f32>() < 0.5 {
                    b'T' // Simulate heterozygous sickle cell (A→T at codon 6)
                } else if rng.gen::<f32>() < 0.98 {
                    hbb_bytes[i]
                } else {
                    [b'A', b'C', b'G', b'T'][rng.gen_range(0..4)]
                }
            })
            .collect();
        let qualities: Vec<u8> = (0..depth).map(|_| rng.gen_range(25..41)).collect();
        let pileup = PileupColumn {
            bases,
            qualities,
            position: i as u64,
            chromosome: 11,
        };
        if let Some(call) = caller.call_snp(&pileup, hbb_bytes[i]) {
            variant_count += 1;
            if i == sickle_pos {
                info!(
                    " ** Sickle cell variant at pos {}: ref={} alt={} depth={} qual={}",
                    i, call.ref_allele as char, call.alt_allele as char, call.depth, call.quality
                );
            }
        }
    }
    info!(" Positions analyzed: {}", hbb_bytes.len().min(200));
    info!(" Total variants detected: {}", variant_count);
    info!(" Variant calling time: {:?}", variant_start.elapsed());
    // -----------------------------------------------------------------------
    // Stage 5: Translate HBB → hemoglobin beta protein
    // -----------------------------------------------------------------------
    info!("\nStage 5: Protein translation - HBB to Hemoglobin Beta");
    let protein_start = std::time::Instant::now();
    let amino_acids = translate_dna(hbb_bytes);
    let protein_str: String = amino_acids.iter().map(|aa| aa.to_char()).collect();
    info!(" Protein length: {} amino acids", amino_acids.len());
    info!(
        " First 20 aa: {}",
        if protein_str.len() > 20 {
            &protein_str[..20]
        } else {
            &protein_str
        }
    );
    info!(" Expected: MVHLTPEEKSAVTALWGKVN (hemoglobin beta N-terminus)");
    // Build contact graph for the hemoglobin protein
    if amino_acids.len() >= 10 {
        // Map one-letter amino-acid codes to ProteinResidue variants;
        // anything unrecognized becomes the unknown residue X.
        let residues: Vec<ProteinResidue> = amino_acids
            .iter()
            .map(|aa| match aa.to_char() {
                'A' => ProteinResidue::A,
                'R' => ProteinResidue::R,
                'N' => ProteinResidue::N,
                'D' => ProteinResidue::D,
                'C' => ProteinResidue::C,
                'E' => ProteinResidue::E,
                'Q' => ProteinResidue::Q,
                'G' => ProteinResidue::G,
                'H' => ProteinResidue::H,
                'I' => ProteinResidue::I,
                'L' => ProteinResidue::L,
                'K' => ProteinResidue::K,
                'M' => ProteinResidue::M,
                'F' => ProteinResidue::F,
                'P' => ProteinResidue::P,
                'S' => ProteinResidue::S,
                'T' => ProteinResidue::T,
                'W' => ProteinResidue::W,
                'Y' => ProteinResidue::Y,
                'V' => ProteinResidue::V,
                _ => ProteinResidue::X,
            })
            .collect();
        let protein_seq = ProteinSequence::new(residues);
        let graph = protein_seq.build_contact_graph(8.0)?;
        let contacts = protein_seq.predict_contacts(&graph)?;
        info!(" Contact graph: {} edges", graph.edges.len());
        info!(" Top 3 predicted contacts:");
        for (i, (r1, r2, score)) in contacts.iter().take(3).enumerate() {
            info!(
                " {}. Residues {} <-> {} (score: {:.3})",
                i + 1,
                r1,
                r2,
                score
            );
        }
    }
    info!(" Protein analysis time: {:?}", protein_start.elapsed());
    // -----------------------------------------------------------------------
    // Stage 6: Epigenetic age prediction
    // -----------------------------------------------------------------------
    info!("\nStage 6: Epigenetic age prediction (Horvath clock)");
    let epi_start = std::time::Instant::now();
    // Synthetic methylation profile: 500 CpG sites with random beta values.
    let positions: Vec<(u8, u64)> = (0..500).map(|i| (1, i * 1000)).collect();
    let betas: Vec<f32> = (0..500).map(|_| rng.gen_range(0.1..0.9)).collect();
    let profile = MethylationProfile::from_beta_values(positions, betas);
    let clock = HorvathClock::default_clock();
    let predicted_age = clock.predict_age(&profile);
    info!(" CpG sites analyzed: {}", profile.sites.len());
    info!(" Mean methylation: {:.3}", profile.mean_methylation());
    info!(" Predicted biological age: {:.1} years", predicted_age);
    info!(" Epigenomics time: {:?}", epi_start.elapsed());
    // -----------------------------------------------------------------------
    // Stage 7: Pharmacogenomics (CYP2D6 from real sequence)
    // -----------------------------------------------------------------------
    info!("\nStage 7: Pharmacogenomic analysis (CYP2D6)");
    let cyp2d6_variants = vec![(42130692, b'G', b'A')]; // *4 defining variant
    let allele1 = pharma::call_star_allele(&cyp2d6_variants);
    let allele2 = pharma::StarAllele::Star10; // *10: common in East Asian populations
    let phenotype = pharma::predict_phenotype(&allele1, &allele2);
    info!(" CYP2D6 sequence: {} bp analyzed", cyp2d6.len());
    info!(
        " Allele 1: {:?} (activity: {:.1})",
        allele1,
        allele1.activity_score()
    );
    info!(
        " Allele 2: {:?} (activity: {:.1})",
        allele2,
        allele2.activity_score()
    );
    info!(" Metabolizer phenotype: {:?}", phenotype);
    let recommendations = pharma::get_recommendations("CYP2D6", &phenotype);
    for rec in &recommendations {
        info!(
            " - {}: {} (dose: {:.1}x)",
            rec.drug, rec.recommendation, rec.dose_factor
        );
    }
    // -----------------------------------------------------------------------
    // Stage 8: RVDNA AI-Native Format Demo
    // -----------------------------------------------------------------------
    info!("\nStage 8: RVDNA AI-Native File Format");
    let rvdna_start = std::time::Instant::now();
    // Convert HBB to RVDNA format with pre-computed k-mer vectors
    let rvdna_bytes = rvdna::fasta_to_rvdna(real_data::HBB_CODING_SEQUENCE, 11, 512, 500)?;
    info!(" FASTA → RVDNA conversion:");
    info!(" Input: {} bases (ASCII, 1 byte/base)", hbb.len());
    info!(" Output: {} bytes (RVDNA binary)", rvdna_bytes.len());
    // NOTE(review): this divides raw base count by the FULL RVDNA file
    // (sequence + vectors + metadata), not just the sequence section the
    // label claims — the ratio can drop below 1.0 for short sequences.
    info!(
        " Ratio: {:.2}x compression (sequence section)",
        hbb.len() as f64 / rvdna_bytes.len() as f64
    );
    // Read back and validate
    let reader = RvdnaReader::from_bytes(rvdna_bytes)?;
    let restored = reader.read_sequence()?;
    assert_eq!(restored.to_string(), hbb.to_string(), "Lossless roundtrip");
    let kmer_blocks = reader.read_kmer_vectors()?;
    let stats = reader.stats();
    info!(" RVDNA file stats:");
    info!(" Format version: {}", reader.header.version);
    info!(
        " Sequence section: {} bytes ({:.1} bits/base)",
        stats.section_sizes[0], stats.bits_per_base
    );
    info!(
        " K-mer vectors: {} blocks pre-computed",
        kmer_blocks.len()
    );
    if !kmer_blocks.is_empty() {
        info!(
            " Vector dims: {}, k={}",
            kmer_blocks[0].dimensions, kmer_blocks[0].k
        );
        // Demonstrate instant similarity search from pre-computed vectors
        let tp53_query = tp53.to_kmer_vector(11, 512)?;
        let sim = kmer_blocks[0].cosine_similarity(&tp53_query);
        info!(
            " Instant HBB vs TP53 similarity: {:.4} (from pre-indexed)",
            sim
        );
    }
    info!(" RVDNA format time: {:?}", rvdna_start.elapsed());
    // Compare format sizes
    info!("\n Format Comparison (HBB gene, {} bp):", hbb.len());
    info!(" FASTA (ASCII): {} bytes (8 bits/base)", hbb.len());
    info!(
        " RVDNA (2-bit): {} bytes (seq section)",
        stats.section_sizes[0]
    );
    info!(
        " RVDNA (total): {} bytes (seq + k-mer vectors + metadata)",
        stats.total_size
    );
    info!(" Pre-computed: k-mer vectors, ready for HNSW search");
    // -----------------------------------------------------------------------
    // Summary
    // -----------------------------------------------------------------------
    let total_time = total_start.elapsed();
    info!("\nPipeline Summary");
    info!("==================");
    info!(" Genes analyzed: 5 (HBB, TP53, BRCA1, CYP2D6, INS)");
    info!(
        " Total bases: {} bp",
        hbb.len() + tp53.len() + brca1.len() + cyp2d6.len() + insulin.len()
    );
    info!(
        " Variants called: {} (in HBB sickle cell region)",
        variant_count
    );
    info!(" Hemoglobin protein: {} amino acids", amino_acids.len());
    info!(" Predicted age: {:.1} years", predicted_age);
    info!(" CYP2D6 phenotype: {:?}", phenotype);
    info!(
        " RVDNA format: {} bytes ({} sections)",
        stats.total_size,
        stats.section_sizes.iter().filter(|&&s| s > 0).count()
    );
    info!(" Total pipeline time: {:?}", total_time);
    info!("\nAnalysis complete!");
    Ok(())
}
/// Cosine similarity between two vectors.
///
/// The dot product is taken over the zipped prefix while each magnitude
/// covers its full slice; returns `0.0` when either vector has zero
/// magnitude.
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    let dot = a.iter().zip(b.iter()).fold(0.0f32, |acc, (x, y)| acc + x * y);
    let magnitude = |v: &[f32]| v.iter().fold(0.0f32, |acc, x| acc + x * x).sqrt();
    let (mag_a, mag_b) = (magnitude(a), magnitude(b));
    if mag_a == 0.0 || mag_b == 0.0 {
        0.0
    } else {
        dot / (mag_a * mag_b)
    }
}
/// Calculate GC content of DNA sequence.
///
/// Returns the fraction of bases that are `G` or `C` in `[0.0, 1.0]`.
/// An empty sequence yields `0.0` instead of the NaN the unguarded
/// division would produce.
fn calculate_gc_content(sequence: &DnaSequence) -> f64 {
    // Guard: 0 / 0 would be NaN and poison downstream formatting.
    if sequence.is_empty() {
        return 0.0;
    }
    let gc_count = sequence
        .bases()
        .iter()
        .filter(|&&b| b == Nucleotide::G || b == Nucleotide::C)
        .count();
    gc_count as f64 / sequence.len() as f64
}
/// Run 23andMe genotyping analysis pipeline.
///
/// Opens the raw-data file at `path`, runs the genotyping analysis, and
/// prints the formatted report to stdout.
fn run_23andme(path: &str) -> anyhow::Result<()> {
    let file = std::fs::File::open(path)
        .map_err(|e| anyhow::anyhow!("Cannot open {}: {}", path, e))?;
    let report = genotyping::analyze(file)
        .map_err(|e| anyhow::anyhow!("Analysis failed: {}", e))?;
    print!("{}", genotyping::format_report(&report));
    Ok(())
}

View File

@@ -0,0 +1,417 @@
//! Pharmacogenomics module
//!
//! Provides CYP enzyme star allele calling and metabolizer phenotype
//! prediction for pharmacogenomic analysis.
use serde::{Deserialize, Serialize};
/// CYP2D6 star allele classification
///
/// Alleles are identified from their defining variants by
/// [`call_star_allele`]; each carries an activity score used for
/// phenotype prediction (see [`StarAllele::activity_score`]).
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum StarAllele {
    /// *1 - Normal function (wild-type)
    Star1,
    /// *2 - Normal function
    Star2,
    /// *3 - No function (frameshift)
    Star3,
    /// *4 - No function (splicing defect)
    Star4,
    /// *5 - No function (gene deletion)
    Star5,
    /// *6 - No function (frameshift)
    Star6,
    /// *10 - Decreased function
    Star10,
    /// *17 - Decreased function
    Star17,
    /// *41 - Decreased function
    Star41,
    /// Unknown allele
    Unknown,
}
impl StarAllele {
/// Get the activity score for this allele
pub fn activity_score(&self) -> f64 {
match self {
StarAllele::Star1 | StarAllele::Star2 => 1.0,
StarAllele::Star10 | StarAllele::Star17 | StarAllele::Star41 => 0.5,
StarAllele::Star3 | StarAllele::Star4 | StarAllele::Star5 | StarAllele::Star6 => 0.0,
StarAllele::Unknown => 0.5,
}
}
}
/// Drug metabolizer phenotype
///
/// Binned from the summed activity score of a two-allele diplotype; see
/// [`predict_phenotype`] (CYP2D6) and [`predict_cyp2c19_phenotype`].
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum MetabolizerPhenotype {
    /// Ultra-rapid metabolizer (activity score > 2.0)
    UltraRapid,
    /// Normal metabolizer (1.0 <= activity score <= 2.0)
    Normal,
    /// Intermediate metabolizer (0.5 <= activity score < 1.0)
    Intermediate,
    /// Poor metabolizer (activity score < 0.5)
    Poor,
}
/// Pharmacogenomic variant for a specific gene
///
/// NOTE(review): the allele-calling functions in this module operate on
/// raw `(position, ref, alt)` tuples rather than this type; it appears to
/// serve as a serializable record for external consumers — confirm usage.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PharmaVariant {
    /// Gene name (e.g., "CYP2D6")
    pub gene: String,
    /// Genomic position
    pub position: u64,
    /// Reference allele
    pub ref_allele: u8,
    /// Alternate allele
    pub alt_allele: u8,
    /// Clinical significance
    pub significance: String,
}
/// CYP2C19 star allele classification
///
/// Alleles are identified from their defining variants by
/// [`call_cyp2c19_allele`]; activity scores follow
/// [`Cyp2c19Allele::activity_score`].
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum Cyp2c19Allele {
    /// *1 - Normal function (wild-type)
    Star1,
    /// *2 - No function (rs4244285, c.681G>A, splicing defect)
    Star2,
    /// *3 - No function (rs4986893, c.636G>A, premature stop)
    Star3,
    /// *17 - Increased function (rs12248560, c.-806C>T)
    Star17,
    /// Unknown allele
    Unknown,
}
impl Cyp2c19Allele {
/// Get the activity score for this allele (CPIC guidelines)
pub fn activity_score(&self) -> f64 {
match self {
Cyp2c19Allele::Star1 => 1.0,
Cyp2c19Allele::Star17 => 1.5, // Increased function
Cyp2c19Allele::Star2 | Cyp2c19Allele::Star3 => 0.0,
Cyp2c19Allele::Unknown => 0.5,
}
}
}
/// Call CYP2C19 star allele from observed variants.
///
/// Scans `variants` in order and returns the allele of the first
/// recognized defining variant; falls back to `*1` (wild-type) when
/// nothing matches.
pub fn call_cyp2c19_allele(variants: &[(u64, u8, u8)]) -> Cyp2c19Allele {
    variants
        .iter()
        .find_map(|&(pos, ref_allele, alt_allele)| {
            match (pos, ref_allele, alt_allele) {
                // *2: G>A at rs4244285 (c.681G>A, splicing defect)
                (96541616, b'G', b'A') => Some(Cyp2c19Allele::Star2),
                // *3: G>A at rs4986893 (c.636G>A, premature stop codon)
                (96540410, b'G', b'A') => Some(Cyp2c19Allele::Star3),
                // *17: C>T at rs12248560 (c.-806C>T, increased expression)
                (96522463, b'C', b'T') => Some(Cyp2c19Allele::Star17),
                _ => None,
            }
        })
        .unwrap_or(Cyp2c19Allele::Star1)
}
/// Predict CYP2C19 metabolizer phenotype from diplotype.
///
/// Sums the two alleles' activity scores and bins the result:
/// > 2.0 ultra-rapid, >= 1.0 normal, >= 0.5 intermediate, else poor.
pub fn predict_cyp2c19_phenotype(
    allele1: &Cyp2c19Allele,
    allele2: &Cyp2c19Allele,
) -> MetabolizerPhenotype {
    let combined = allele1.activity_score() + allele2.activity_score();
    match combined {
        s if s > 2.0 => MetabolizerPhenotype::UltraRapid,
        s if s >= 1.0 => MetabolizerPhenotype::Normal,
        s if s >= 0.5 => MetabolizerPhenotype::Intermediate,
        _ => MetabolizerPhenotype::Poor,
    }
}
/// Call CYP2D6 star allele from observed variants.
///
/// Uses a simplified lookup table based on key defining variants: the
/// first recognized variant in `variants` decides the allele, and `*1`
/// (wild-type) is returned when none match.
pub fn call_star_allele(variants: &[(u64, u8, u8)]) -> StarAllele {
    variants
        .iter()
        .find_map(|&(pos, ref_allele, alt_allele)| {
            match (pos, ref_allele, alt_allele) {
                // *4: G>A at intron 3/exon 4 boundary (rs3892097)
                (42130692, b'G', b'A') => Some(StarAllele::Star4),
                // *5: whole gene deletion
                (42126611, b'T', b'-') => Some(StarAllele::Star5),
                // *3: frameshift (A deletion at rs35742686)
                (42127941, b'A', b'-') => Some(StarAllele::Star3),
                // *6: T deletion at rs5030655
                (42127803, b'T', b'-') => Some(StarAllele::Star6),
                // *10: C>T at rs1065852
                (42126938, b'C', b'T') => Some(StarAllele::Star10),
                _ => None,
            }
        })
        .unwrap_or(StarAllele::Star1) // Wild-type
}
/// Predict metabolizer phenotype from diplotype (two alleles).
///
/// Sums the two alleles' activity scores and bins the result:
/// > 2.0 ultra-rapid, >= 1.0 normal, >= 0.5 intermediate, else poor.
pub fn predict_phenotype(allele1: &StarAllele, allele2: &StarAllele) -> MetabolizerPhenotype {
    let combined = allele1.activity_score() + allele2.activity_score();
    match combined {
        s if s > 2.0 => MetabolizerPhenotype::UltraRapid,
        s if s >= 1.0 => MetabolizerPhenotype::Normal,
        s if s >= 0.5 => MetabolizerPhenotype::Intermediate,
        _ => MetabolizerPhenotype::Poor,
    }
}
/// Drug recommendation based on metabolizer phenotype
///
/// Produced by [`get_recommendations`]; entries with `dose_factor == 0.0`
/// are paired with "AVOID"/alternative-therapy recommendation text.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DrugRecommendation {
    /// Drug name
    pub drug: String,
    /// Gene involved
    pub gene: String,
    /// Recommendation text
    pub recommendation: String,
    /// Dosing adjustment factor (1.0 = standard dose)
    pub dose_factor: f64,
}
/// Get drug recommendations for a given phenotype.
///
/// Looks up the `(gene, phenotype)` pair in a static table; unrecognized
/// combinations fall back to a single standard-dosing entry.
pub fn get_recommendations(
    gene: &str,
    phenotype: &MetabolizerPhenotype,
) -> Vec<DrugRecommendation> {
    // Small constructor closure keeps the recommendation tables readable;
    // every entry shares the same `gene`.
    let rec = |drug: &str, recommendation: &str, dose_factor: f64| DrugRecommendation {
        drug: drug.to_string(),
        gene: gene.to_string(),
        recommendation: recommendation.to_string(),
        dose_factor,
    };
    match (gene, phenotype) {
        ("CYP2D6", MetabolizerPhenotype::Poor) => vec![
            rec(
                "Codeine",
                "AVOID codeine; no conversion to morphine. Use alternative analgesic.",
                0.0,
            ),
            rec(
                "Tramadol",
                "AVOID tramadol; reduced efficacy. Use alternative analgesic.",
                0.0,
            ),
            rec(
                "Tamoxifen",
                "Consider alternative endocrine therapy (aromatase inhibitor).",
                0.0,
            ),
            rec(
                "Ondansetron",
                "Use standard dose; may have increased exposure.",
                0.75,
            ),
        ],
        ("CYP2D6", MetabolizerPhenotype::UltraRapid) => vec![
            rec(
                "Codeine",
                "AVOID codeine; risk of fatal toxicity from ultra-rapid morphine conversion.",
                0.0,
            ),
            rec(
                "Tramadol",
                "AVOID tramadol; risk of respiratory depression.",
                0.0,
            ),
        ],
        ("CYP2D6", MetabolizerPhenotype::Intermediate) => vec![
            rec("Codeine", "Use lower dose or alternative analgesic.", 0.5),
            rec(
                "Tamoxifen",
                "Consider higher dose or alternative therapy.",
                0.75,
            ),
        ],
        ("CYP2C19", MetabolizerPhenotype::Poor) => vec![
            rec(
                "Clopidogrel (Plavix)",
                "AVOID clopidogrel; use prasugrel or ticagrelor instead.",
                0.0,
            ),
            rec(
                "Voriconazole",
                "Reduce dose by 50%; monitor for toxicity.",
                0.5,
            ),
            rec(
                "PPIs (omeprazole)",
                "Reduce dose; slower clearance increases exposure.",
                0.5,
            ),
            rec("Escitalopram", "Consider 50% dose reduction.", 0.5),
        ],
        ("CYP2C19", MetabolizerPhenotype::UltraRapid) => vec![
            rec(
                "Clopidogrel (Plavix)",
                "Standard dosing (enhanced activation is beneficial).",
                1.0,
            ),
            rec(
                "Omeprazole",
                "Increase dose; rapid clearance reduces efficacy.",
                2.0,
            ),
            rec("Voriconazole", "Use alternative antifungal.", 0.0),
        ],
        ("CYP2C19", MetabolizerPhenotype::Intermediate) => vec![
            rec(
                "Clopidogrel (Plavix)",
                "Consider alternative antiplatelet or increased dose.",
                1.5,
            ),
            rec(
                "PPIs (omeprazole)",
                "Standard dose likely adequate; may have slightly increased exposure.",
                1.0,
            ),
            rec(
                "Escitalopram",
                "Use standard dose; monitor response.",
                1.0,
            ),
        ],
        _ => vec![rec("Standard", "Use standard dosing", 1.0)],
    }
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_star_allele_calling() {
// Wild-type
assert_eq!(call_star_allele(&[]), StarAllele::Star1);
// *4 variant
let star4 = call_star_allele(&[(42130692, b'G', b'A')]);
assert_eq!(star4, StarAllele::Star4);
assert_eq!(star4.activity_score(), 0.0);
// *10 variant (decreased function)
let star10 = call_star_allele(&[(42126938, b'C', b'T')]);
assert_eq!(star10, StarAllele::Star10);
assert_eq!(star10.activity_score(), 0.5);
}
#[test]
fn test_phenotype_prediction() {
assert_eq!(
predict_phenotype(&StarAllele::Star1, &StarAllele::Star1),
MetabolizerPhenotype::Normal
);
assert_eq!(
predict_phenotype(&StarAllele::Star1, &StarAllele::Star4),
MetabolizerPhenotype::Normal
);
assert_eq!(
predict_phenotype(&StarAllele::Star4, &StarAllele::Star10),
MetabolizerPhenotype::Intermediate
);
assert_eq!(
predict_phenotype(&StarAllele::Star4, &StarAllele::Star4),
MetabolizerPhenotype::Poor
);
}
#[test]
fn test_drug_recommendations() {
let recs = get_recommendations("CYP2D6", &MetabolizerPhenotype::Poor);
assert!(recs.len() >= 1);
assert_eq!(recs[0].dose_factor, 0.0);
let recs_normal = get_recommendations("CYP2D6", &MetabolizerPhenotype::Normal);
assert_eq!(recs_normal[0].dose_factor, 1.0);
}
#[test]
fn test_cyp2c19_allele_calling() {
assert_eq!(call_cyp2c19_allele(&[]), Cyp2c19Allele::Star1);
let star2 = call_cyp2c19_allele(&[(96541616, b'G', b'A')]);
assert_eq!(star2, Cyp2c19Allele::Star2);
assert_eq!(star2.activity_score(), 0.0);
let star17 = call_cyp2c19_allele(&[(96522463, b'C', b'T')]);
assert_eq!(star17, Cyp2c19Allele::Star17);
assert_eq!(star17.activity_score(), 1.5);
}
#[test]
fn test_cyp2c19_phenotype() {
assert_eq!(
predict_cyp2c19_phenotype(&Cyp2c19Allele::Star17, &Cyp2c19Allele::Star17),
MetabolizerPhenotype::UltraRapid
);
assert_eq!(
predict_cyp2c19_phenotype(&Cyp2c19Allele::Star2, &Cyp2c19Allele::Star2),
MetabolizerPhenotype::Poor
);
assert_eq!(
predict_cyp2c19_phenotype(&Cyp2c19Allele::Star1, &Cyp2c19Allele::Star2),
MetabolizerPhenotype::Normal
);
}
#[test]
fn test_cyp2c19_drug_recommendations() {
let recs = get_recommendations("CYP2C19", &MetabolizerPhenotype::Poor);
assert!(recs.len() >= 1);
assert_eq!(recs[0].drug, "Clopidogrel (Plavix)");
assert_eq!(recs[0].dose_factor, 0.0);
let recs_ultra = get_recommendations("CYP2C19", &MetabolizerPhenotype::UltraRapid);
assert!(recs_ultra.len() >= 2);
}
}

View File

@@ -0,0 +1,496 @@
//! DAG-based genomic analysis pipeline orchestrator
use crate::error::Result;
use crate::types::{DnaSequence, KmerIndex, Nucleotide, ProteinResidue, ProteinSequence};
use ruvector_core::types::{SearchQuery, VectorEntry};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::time::Instant;
/// Pipeline configuration
///
/// Shared tuning knobs for all pipeline stages; standard values come
/// from the `Default` impl.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PipelineConfig {
    /// K-mer size (default: 21)
    pub k: usize,
    /// Attention window size (default: 512)
    pub window_size: usize,
    /// Variant calling min depth (default: 10) — pileup columns with fewer
    /// observed bases are skipped entirely
    pub min_depth: usize,
    /// Min variant quality (default: 20) — calls below this are discarded
    pub min_quality: u8,
}
impl Default for PipelineConfig {
fn default() -> Self {
Self {
k: 21,
window_size: 512,
min_depth: 10,
min_quality: 20,
}
}
}
/// K-mer analysis results
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KmerAnalysisResult {
    /// Total k-mers extracted (with multiplicity, across all inputs)
    pub total_kmers: usize,
    /// Unique k-mers found
    pub unique_kmers: usize,
    /// GC content ratio in [0, 1] over all analyzed bases
    pub gc_content: f64,
    /// Top similar sequences
    pub top_similar_sequences: Vec<SimilarSequence>,
}
/// Similar sequence match
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SimilarSequence {
    /// Sequence identifier
    pub id: String,
    /// Similarity score as reported by the vector search backend
    pub similarity: f32,
    /// Position in the index
    // NOTE(review): always populated with 0 by run_kmer_analysis — confirm intent.
    pub position: usize,
}
/// Variant call result
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VariantCall {
    /// Genomic position
    pub position: u64,
    /// Reference base
    pub reference: Nucleotide,
    /// Alternate base
    pub alternate: Nucleotide,
    /// Variant quality (summed base qualities, capped at 255 by the caller)
    pub quality: u8,
    /// Read depth (number of observed bases at the column)
    pub depth: usize,
    /// Allele frequency of the alternate allele, in (0, 1]
    pub allele_frequency: f64,
}
/// Pileup column for variant calling
///
/// `bases` and `qualities` are parallel vectors: one entry per read
/// covering this position.
#[derive(Debug, Clone)]
pub struct PileupColumn {
    /// Genomic position
    pub position: u64,
    /// Reference base
    pub reference: Nucleotide,
    /// Observed bases
    pub bases: Vec<Nucleotide>,
    /// Quality scores
    pub qualities: Vec<u8>,
}
/// Protein analysis results
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProteinAnalysisResult {
    /// Amino acid sequence (single letter codes)
    pub sequence: String,
    /// Protein length
    pub length: usize,
    /// Predicted contacts as (i, j, score), residue indices at least 5 apart
    pub predicted_contacts: Vec<(usize, usize, f32)>,
    /// Secondary structure prediction (H/E/C), one char per residue
    pub secondary_structure: Vec<char>,
}
/// Full pipeline analysis results
///
/// Aggregates the three pipeline stages (k-mer, variant, protein) plus
/// wall-clock timing for the whole run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FullAnalysisResult {
    /// K-mer statistics
    pub kmer_stats: KmerAnalysisResult,
    /// Called variants
    pub variants: Vec<VariantCall>,
    /// Protein analysis results
    pub proteins: Vec<ProteinAnalysisResult>,
    /// Execution time in milliseconds
    pub execution_time_ms: u128,
}
/// Genomic analysis pipeline orchestrator
///
/// Stateless apart from its configuration; each `run_*` method executes
/// one stage, and `run_full_pipeline` chains them.
pub struct GenomicPipeline {
    // Tuning knobs shared by all stages.
    config: PipelineConfig,
}
impl GenomicPipeline {
    /// Create new pipeline with configuration
    pub fn new(config: PipelineConfig) -> Self {
        Self { config }
    }
    /// Run k-mer analysis on sequences
    ///
    /// For each `(id, bytes)` pair: counts k-mers and GC content, indexes a
    /// k-mer frequency vector in an in-memory DB, then searches for the 5
    /// nearest neighbors of the FIRST sequence to fill
    /// `top_similar_sequences`. Sequences shorter than `k` are skipped.
    ///
    /// Errors if the index cannot be created or a sequence fails to parse;
    /// indexing/search failures are silently ignored (best-effort).
    pub fn run_kmer_analysis(&self, sequences: &[(&str, &[u8])]) -> Result<KmerAnalysisResult> {
        let mut total_kmers = 0;
        let mut kmer_set = std::collections::HashSet::new();
        let mut gc_count = 0;
        let mut total_bases = 0;
        // Create temporary k-mer index (384-dim vectors, in-memory backing)
        let index = KmerIndex::new(self.config.k, 384, ":memory:")?;
        for (id, seq) in sequences {
            // Extract k-mers; sequences shorter than k contribute nothing
            if seq.len() < self.config.k {
                continue;
            }
            total_bases += seq.len();
            for window in seq.windows(self.config.k) {
                total_kmers += 1;
                kmer_set.insert(window.to_vec());
                // Count GC content (raw bytes; lowercase g/c are NOT counted)
                for &base in window {
                    if base == b'G' || base == b'C' {
                        gc_count += 1;
                    }
                }
            }
            // Convert sequence to vector and index (insert errors ignored)
            let dna_seq = DnaSequence::from_str(&String::from_utf8_lossy(seq))?;
            if let Ok(vector) = dna_seq.to_kmer_vector(self.config.k, 384) {
                let entry = VectorEntry {
                    id: Some(id.to_string()),
                    vector,
                    metadata: None,
                };
                let _ = index.db().insert(entry);
            }
        }
        let gc_content = if total_bases > 0 {
            (gc_count as f64) / (total_bases as f64)
        } else {
            0.0
        };
        // Find similar sequences using HNSW search, seeded by the first input
        let mut top_similar = Vec::new();
        if !sequences.is_empty() {
            if let Some((query_id, query_seq)) = sequences.first() {
                let dna_seq = DnaSequence::from_str(&String::from_utf8_lossy(query_seq))?;
                if let Ok(query_vector) = dna_seq.to_kmer_vector(self.config.k, 384) {
                    let search_query = SearchQuery {
                        vector: query_vector,
                        k: 5,
                        filter: None,
                        ef_search: None,
                    };
                    if let Ok(results) = index.db().search(search_query) {
                        for result in results {
                            // Exclude the query sequence itself from its own results
                            if result.id != *query_id {
                                top_similar.push(SimilarSequence {
                                    id: result.id.clone(),
                                    similarity: result.score,
                                    position: 0,
                                });
                            }
                        }
                    }
                }
            }
        }
        Ok(KmerAnalysisResult {
            total_kmers,
            unique_kmers: kmer_set.len(),
            gc_content,
            top_similar_sequences: top_similar,
        })
    }
    /// Run variant calling against reference
    ///
    /// Emits a `VariantCall` for every non-reference, non-N allele that has
    /// frequency > 0.2 AND at least 3 supporting reads AND passes the
    /// configured depth/quality thresholds. `_reference` is currently unused
    /// (the per-column reference base lives in each `PileupColumn`).
    pub fn run_variant_calling(
        &self,
        pileups: &[PileupColumn],
        _reference: &[u8],
    ) -> Result<Vec<VariantCall>> {
        let mut variants = Vec::new();
        for pileup in pileups {
            // Hard depth gate: shallow columns are not callable
            if pileup.bases.len() < self.config.min_depth {
                continue;
            }
            // Count allele frequencies
            let mut allele_counts: HashMap<Nucleotide, usize> = HashMap::new();
            for &base in &pileup.bases {
                *allele_counts.entry(base).or_insert(0) += 1;
            }
            // Find most common alternate allele
            let _ref_count = allele_counts.get(&pileup.reference).copied().unwrap_or(0);
            for (&allele, &count) in &allele_counts {
                if allele == pileup.reference || allele == Nucleotide::N {
                    continue;
                }
                let allele_freq = count as f64 / pileup.bases.len() as f64;
                // Call variant if alternate allele frequency is significant
                if allele_freq > 0.2 && count >= 3 {
                    // Calculate quality score from supporting reads
                    // NOTE(review): takes the FIRST `count` quality scores, not
                    // specifically those of reads carrying this allele — and a
                    // u16 sum can overflow for very deep pileups. Confirm intent.
                    let quality = pileup
                        .qualities
                        .iter()
                        .take(count)
                        .map(|&q| q as u16)
                        .sum::<u16>()
                        .min(255) as u8;
                    if quality >= self.config.min_quality {
                        variants.push(VariantCall {
                            position: pileup.position,
                            reference: pileup.reference,
                            alternate: allele,
                            quality,
                            depth: pileup.bases.len(),
                            allele_frequency: allele_freq,
                        });
                    }
                }
            }
        }
        Ok(variants)
    }
    /// Translate DNA to protein and analyze structure
    ///
    /// Translation starts at byte 0 of `dna` (no ORF scanning here) and stops
    /// at the first stop/unknown codon; see `translate_dna`.
    pub fn run_protein_analysis(&self, dna: &[u8]) -> Result<ProteinAnalysisResult> {
        // Translate DNA to protein using standard genetic code
        let protein = self.translate_dna(dna)?;
        // Predict contacts using heuristic scoring
        let contacts = self.predict_protein_contacts(&protein)?;
        // Simple secondary structure prediction
        let secondary_structure = self.predict_secondary_structure(&protein);
        Ok(ProteinAnalysisResult {
            sequence: protein.residues().iter().map(|r| r.to_char()).collect(),
            length: protein.len(),
            predicted_contacts: contacts,
            secondary_structure,
        })
    }
    /// Run full analysis pipeline
    ///
    /// Stages: (1) k-mer analysis of query + reference, (2) pileup generation
    /// and variant calling, (3) ORF discovery and protein analysis. Timing is
    /// wall-clock for the whole run.
    pub fn run_full_pipeline(
        &self,
        sequence: &[u8],
        reference: &[u8],
    ) -> Result<FullAnalysisResult> {
        let start = Instant::now();
        // Stage 1: K-mer analysis
        let kmer_stats =
            self.run_kmer_analysis(&[("query", sequence), ("reference", reference)])?;
        // Stage 2: Variant calling - generate pileups from sequence
        let pileups = self.generate_pileups(sequence, reference)?;
        let variants = self.run_variant_calling(&pileups, reference)?;
        // Stage 3: Protein analysis - find ORFs and translate
        let proteins = self.find_orfs_and_translate(sequence)?;
        let execution_time_ms = start.elapsed().as_millis();
        Ok(FullAnalysisResult {
            kmer_stats,
            variants,
            proteins,
            execution_time_ms,
        })
    }
    // Helper methods
    /// Translate DNA to protein
    ///
    /// Reads codons from byte 0; stops at the first stop codon OR any codon
    /// that does not match the table (both map to `ProteinResidue::X`).
    /// A trailing partial codon is ignored.
    fn translate_dna(&self, dna: &[u8]) -> Result<ProteinSequence> {
        let mut residues = Vec::new();
        for codon in dna.chunks(3) {
            if codon.len() < 3 {
                break;
            }
            let aa = self.codon_to_amino_acid(codon);
            if aa == ProteinResidue::X {
                break; // Stop codon (or unrecognized codon, e.g. containing N)
            }
            residues.push(aa);
        }
        Ok(ProteinSequence::new(residues))
    }
    /// Map codon to amino acid (simplified genetic code)
    ///
    /// Expects an uppercase 3-byte codon; anything not in the table
    /// (stop codons, ambiguous bases, lowercase input) yields `X`.
    fn codon_to_amino_acid(&self, codon: &[u8]) -> ProteinResidue {
        match codon {
            b"ATG" => ProteinResidue::M,
            b"TGG" => ProteinResidue::W,
            b"TTT" | b"TTC" => ProteinResidue::F,
            b"TTA" | b"TTG" | b"CTT" | b"CTC" | b"CTA" | b"CTG" => ProteinResidue::L,
            b"ATT" | b"ATC" | b"ATA" => ProteinResidue::I,
            b"GTT" | b"GTC" | b"GTA" | b"GTG" => ProteinResidue::V,
            b"TCT" | b"TCC" | b"TCA" | b"TCG" | b"AGT" | b"AGC" => ProteinResidue::S,
            b"CCT" | b"CCC" | b"CCA" | b"CCG" => ProteinResidue::P,
            b"ACT" | b"ACC" | b"ACA" | b"ACG" => ProteinResidue::T,
            b"GCT" | b"GCC" | b"GCA" | b"GCG" => ProteinResidue::A,
            b"TAT" | b"TAC" => ProteinResidue::Y,
            b"CAT" | b"CAC" => ProteinResidue::H,
            b"CAA" | b"CAG" => ProteinResidue::Q,
            b"AAT" | b"AAC" => ProteinResidue::N,
            b"AAA" | b"AAG" => ProteinResidue::K,
            b"GAT" | b"GAC" => ProteinResidue::D,
            b"GAA" | b"GAG" => ProteinResidue::E,
            b"TGT" | b"TGC" => ProteinResidue::C,
            b"CGT" | b"CGC" | b"CGA" | b"CGG" | b"AGA" | b"AGG" => ProteinResidue::R,
            b"GGT" | b"GGC" | b"GGA" | b"GGG" => ProteinResidue::G,
            _ => ProteinResidue::X, // Stop or unknown
        }
    }
    /// Predict protein contacts using residue property heuristics
    ///
    /// Scores residue pairs at least 5 apart and keeps the top 10 with
    /// score > 0.5. Returns empty for proteins shorter than 5 residues.
    fn predict_protein_contacts(
        &self,
        protein: &ProteinSequence,
    ) -> Result<Vec<(usize, usize, f32)>> {
        let residues = protein.residues();
        let n = residues.len();
        if n < 5 {
            return Ok(Vec::new());
        }
        // Compute residue feature scores (ASCII code scaled to [0, 1])
        let features: Vec<f32> = residues
            .iter()
            .map(|r| r.to_char() as u8 as f32 / 255.0)
            .collect();
        // Predict contacts: pairs of residues >4 apart with similar features
        // NOTE(review): uppercase letters are ASCII 65-90, so every feature is
        // <= ~0.35 and the mean can never exceed the 0.5 threshold — this loop
        // appears to always produce an empty list. Confirm the threshold.
        let mut contacts = Vec::new();
        for i in 0..n {
            for j in (i + 5)..n {
                let score = (features[i] + features[j]) / 2.0;
                if score > 0.5 {
                    contacts.push((i, j, score));
                }
            }
        }
        // Scores are finite (derived from ASCII codes), so partial_cmp is safe
        contacts.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap());
        contacts.truncate(10);
        Ok(contacts)
    }
    /// Simple secondary structure prediction
    ///
    /// Per-residue heuristic: helix formers -> 'H', sheet formers -> 'E',
    /// everything else coil 'C'. No windowing or smoothing is applied.
    fn predict_secondary_structure(&self, protein: &ProteinSequence) -> Vec<char> {
        protein
            .residues()
            .iter()
            .map(|r| match r {
                ProteinResidue::A | ProteinResidue::E | ProteinResidue::L | ProteinResidue::M => {
                    'H'
                }
                ProteinResidue::V | ProteinResidue::I | ProteinResidue::Y | ProteinResidue::F => {
                    'E'
                }
                _ => 'C',
            })
            .collect()
    }
    /// Generate pileups from sequence alignment
    ///
    /// SYNTHETIC: assumes query and reference are already position-aligned
    /// (no real alignment is performed), and fabricates a coverage depth of
    /// 15-24 identical observations per column with quality 30.
    fn generate_pileups(&self, sequence: &[u8], reference: &[u8]) -> Result<Vec<PileupColumn>> {
        let mut pileups = Vec::new();
        let min_len = sequence.len().min(reference.len());
        for i in 0..min_len {
            let ref_base = match reference[i] {
                b'A' => Nucleotide::A,
                b'C' => Nucleotide::C,
                b'G' => Nucleotide::G,
                b'T' => Nucleotide::T,
                _ => Nucleotide::N,
            };
            let seq_base = match sequence[i] {
                b'A' => Nucleotide::A,
                b'C' => Nucleotide::C,
                b'G' => Nucleotide::G,
                b'T' => Nucleotide::T,
                _ => Nucleotide::N,
            };
            // Simulate coverage depth (deterministic, position-dependent)
            let depth = 15 + (i % 10);
            let bases = vec![seq_base; depth];
            let qualities = vec![30; depth];
            pileups.push(PileupColumn {
                position: i as u64,
                reference: ref_base,
                bases,
                qualities,
            });
        }
        Ok(pileups)
    }
    /// Find ORFs and translate to proteins
    ///
    /// Scans for ATG start codons (any frame), translates from each, and
    /// keeps up to 3 proteins of at least 10 residues. Overlapping ORFs are
    /// not deduplicated.
    fn find_orfs_and_translate(&self, sequence: &[u8]) -> Result<Vec<ProteinAnalysisResult>> {
        let mut proteins = Vec::new();
        // Look for ATG start codons (need >= 30 bp remaining for a 10-aa ORF)
        for i in 0..sequence.len().saturating_sub(30) {
            if sequence[i..].starts_with(b"ATG") {
                let orf = &sequence[i..];
                if let Ok(protein_result) = self.run_protein_analysis(orf) {
                    if protein_result.length >= 10 {
                        proteins.push(protein_result);
                        if proteins.len() >= 3 {
                            break;
                        }
                    }
                }
            }
        }
        Ok(proteins)
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    #[test]
    fn test_pipeline_creation() {
        // Pipeline must retain the configuration it was constructed with.
        let config = PipelineConfig::default();
        let pipeline = GenomicPipeline::new(config);
        assert_eq!(pipeline.config.k, 21);
    }
    #[test]
    fn test_kmer_analysis() {
        // A short 24-bp sequence (longer than the default k of 21) should
        // run through k-mer analysis without error.
        let config = PipelineConfig::default();
        let pipeline = GenomicPipeline::new(config);
        let sequences = vec![("seq1", b"ACGTACGTACGTACGTACGTACGT".as_ref())];
        let result = pipeline.run_kmer_analysis(&sequences);
        assert!(result.is_ok());
    }
}

View File

@@ -0,0 +1,338 @@
//! Protein translation and amino acid analysis module
//!
//! Provides DNA to protein translation using the standard genetic code,
//! and amino acid property calculations.
use serde::{Deserialize, Serialize};
/// Amino acid representation with full names
///
/// The 20 proteinogenic amino acids plus a `Stop` marker for stop codons.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum AminoAcid {
    /// Alanine
    Ala,
    /// Arginine
    Arg,
    /// Asparagine
    Asn,
    /// Aspartic acid
    Asp,
    /// Cysteine
    Cys,
    /// Glutamic acid
    Glu,
    /// Glutamine
    Gln,
    /// Glycine
    Gly,
    /// Histidine
    His,
    /// Isoleucine
    Ile,
    /// Leucine
    Leu,
    /// Lysine
    Lys,
    /// Methionine (start codon)
    Met,
    /// Phenylalanine
    Phe,
    /// Proline
    Pro,
    /// Serine
    Ser,
    /// Threonine
    Thr,
    /// Tryptophan
    Trp,
    /// Tyrosine
    Tyr,
    /// Valine
    Val,
    /// Stop codon
    Stop,
}
impl AminoAcid {
    /// Get single-letter code (`*` for the stop marker)
    pub fn to_char(&self) -> char {
        match self {
            AminoAcid::Ala => 'A',
            AminoAcid::Arg => 'R',
            AminoAcid::Asn => 'N',
            AminoAcid::Asp => 'D',
            AminoAcid::Cys => 'C',
            AminoAcid::Glu => 'E',
            AminoAcid::Gln => 'Q',
            AminoAcid::Gly => 'G',
            AminoAcid::His => 'H',
            AminoAcid::Ile => 'I',
            AminoAcid::Leu => 'L',
            AminoAcid::Lys => 'K',
            AminoAcid::Met => 'M',
            AminoAcid::Phe => 'F',
            AminoAcid::Pro => 'P',
            AminoAcid::Ser => 'S',
            AminoAcid::Thr => 'T',
            AminoAcid::Trp => 'W',
            AminoAcid::Tyr => 'Y',
            AminoAcid::Val => 'V',
            AminoAcid::Stop => '*',
        }
    }
    /// Get Kyte-Doolittle hydrophobicity value
    ///
    /// Positive values are hydrophobic, negative hydrophilic; the stop
    /// marker is assigned a neutral 0.0.
    pub fn hydrophobicity(&self) -> f32 {
        match self {
            AminoAcid::Ile => 4.5,
            AminoAcid::Val => 4.2,
            AminoAcid::Leu => 3.8,
            AminoAcid::Phe => 2.8,
            AminoAcid::Cys => 2.5,
            AminoAcid::Met => 1.9,
            AminoAcid::Ala => 1.8,
            AminoAcid::Gly => -0.4,
            AminoAcid::Thr => -0.7,
            AminoAcid::Ser => -0.8,
            AminoAcid::Trp => -0.9,
            AminoAcid::Tyr => -1.3,
            AminoAcid::Pro => -1.6,
            AminoAcid::His => -3.2,
            AminoAcid::Glu => -3.5,
            AminoAcid::Gln => -3.5,
            AminoAcid::Asp => -3.5,
            AminoAcid::Asn => -3.5,
            AminoAcid::Lys => -3.9,
            AminoAcid::Arg => -4.5,
            AminoAcid::Stop => 0.0,
        }
    }
    /// Get the monoisotopic residue mass in Daltons
    ///
    /// These are residue (in-chain) masses, i.e. already minus one water
    /// per peptide bond; `molecular_weight(&[...])` adds the terminal H2O.
    /// The stop marker contributes 0.0.
    pub fn molecular_weight(&self) -> f64 {
        match self {
            AminoAcid::Ala => 71.03711,
            AminoAcid::Arg => 156.10111,
            AminoAcid::Asn => 114.04293,
            AminoAcid::Asp => 115.02694,
            AminoAcid::Cys => 103.00919,
            AminoAcid::Glu => 129.04259,
            AminoAcid::Gln => 128.05858,
            AminoAcid::Gly => 57.02146,
            AminoAcid::His => 137.05891,
            AminoAcid::Ile => 113.08406,
            AminoAcid::Leu => 113.08406,
            AminoAcid::Lys => 128.09496,
            AminoAcid::Met => 131.04049,
            AminoAcid::Phe => 147.06841,
            AminoAcid::Pro => 97.05276,
            AminoAcid::Ser => 87.03203,
            AminoAcid::Thr => 101.04768,
            AminoAcid::Trp => 186.07931,
            AminoAcid::Tyr => 163.06333,
            AminoAcid::Val => 99.06841,
            AminoAcid::Stop => 0.0,
        }
    }
    /// Side-chain pKa for Henderson-Hasselbalch isoelectric point calculation
    ///
    /// Returns `Some(pKa)` for the seven ionizable side chains, or `None`
    /// for residues without one. (Terminal pKa values are constants in
    /// `isoelectric_point`.)
    pub fn pka_sidechain(&self) -> Option<f64> {
        match self {
            AminoAcid::Asp => Some(3.65),
            AminoAcid::Glu => Some(4.25),
            AminoAcid::His => Some(6.00),
            AminoAcid::Cys => Some(8.18),
            AminoAcid::Tyr => Some(10.07),
            AminoAcid::Lys => Some(10.53),
            AminoAcid::Arg => Some(12.48),
            _ => None,
        }
    }
}
/// Calculate total molecular weight of a protein in Daltons.
///
/// Sums the per-residue masses (which are residue masses, i.e. already
/// net of the water lost at each peptide bond) and adds a single water
/// molecule (18.01056 Da) for the free N-terminal H and C-terminal OH.
///
/// Returns 0.0 for an empty chain.
pub fn molecular_weight(protein: &[AminoAcid]) -> f64 {
    if protein.is_empty() {
        return 0.0;
    }
    let residue_sum: f64 = protein.iter().map(|aa| aa.molecular_weight()).sum();
    // Residue masses are already dehydrated, so one terminal H2O suffices;
    // the previous per-bond correction term multiplied by 0.0 was dead code.
    residue_sum + 18.01056
}
/// Estimate the isoelectric point (pI) of a peptide via bisection.
///
/// The pI is the pH at which the net charge is zero. Charges for the two
/// termini and the ionizable side chains are modeled with the
/// Henderson-Hasselbalch equation using standard pKa values. An empty
/// peptide returns a neutral 7.0.
pub fn isoelectric_point(protein: &[AminoAcid]) -> f64 {
    if protein.is_empty() {
        return 7.0;
    }
    const PKA_NH2: f64 = 9.69; // N-terminal amino group
    const PKA_COOH: f64 = 2.34; // C-terminal carboxyl group
    // Net charge of the whole peptide at a given pH.
    let net_charge = |ph: f64| -> f64 {
        // Termini: protonated amine (+) and deprotonated carboxyl (-).
        let mut q = 1.0 / (1.0 + 10_f64.powf(ph - PKA_NH2));
        q -= 1.0 / (1.0 + 10_f64.powf(PKA_COOH - ph));
        for residue in protein {
            if let Some(pka) = residue.pka_sidechain() {
                match residue {
                    // Basic side chains carry positive charge below their pKa.
                    AminoAcid::His | AminoAcid::Lys | AminoAcid::Arg => {
                        q += 1.0 / (1.0 + 10_f64.powf(ph - pka));
                    }
                    // Acidic side chains (Asp, Glu, Cys, Tyr) go negative above it.
                    _ => {
                        q -= 1.0 / (1.0 + 10_f64.powf(pka - ph));
                    }
                }
            }
        }
        q
    };
    // Net charge decreases monotonically with pH, so bisection converges.
    let mut lo = 0.0_f64;
    let mut hi = 14.0_f64;
    for _ in 0..100 {
        let mid = (lo + hi) / 2.0;
        if net_charge(mid) > 0.0 {
            lo = mid;
        } else {
            hi = mid;
        }
    }
    (lo + hi) / 2.0
}
/// Translate a DNA sequence into amino acids using the standard genetic code.
///
/// The sequence is read codon-by-codon from position 0 (no ORF search and
/// no frame shifting). Matching is case-insensitive. Translation halts at
/// the first stop codon (TAA, TAG, TGA); unrecognized codons are skipped,
/// and a trailing partial codon is discarded.
pub fn translate_dna(dna: &[u8]) -> Vec<AminoAcid> {
    let mut amino_acids = Vec::new();
    // chunks_exact drops the incomplete trailing codon automatically.
    for triplet in dna.chunks_exact(3) {
        let codon = [
            triplet[0].to_ascii_uppercase(),
            triplet[1].to_ascii_uppercase(),
            triplet[2].to_ascii_uppercase(),
        ];
        let aa = match &codon {
            b"ATG" => AminoAcid::Met,
            b"TGG" => AminoAcid::Trp,
            b"TTT" | b"TTC" => AminoAcid::Phe,
            b"TTA" | b"TTG" | b"CTT" | b"CTC" | b"CTA" | b"CTG" => AminoAcid::Leu,
            b"ATT" | b"ATC" | b"ATA" => AminoAcid::Ile,
            b"GTT" | b"GTC" | b"GTA" | b"GTG" => AminoAcid::Val,
            b"TCT" | b"TCC" | b"TCA" | b"TCG" | b"AGT" | b"AGC" => AminoAcid::Ser,
            b"CCT" | b"CCC" | b"CCA" | b"CCG" => AminoAcid::Pro,
            b"ACT" | b"ACC" | b"ACA" | b"ACG" => AminoAcid::Thr,
            b"GCT" | b"GCC" | b"GCA" | b"GCG" => AminoAcid::Ala,
            b"TAT" | b"TAC" => AminoAcid::Tyr,
            b"CAT" | b"CAC" => AminoAcid::His,
            b"CAA" | b"CAG" => AminoAcid::Gln,
            b"AAT" | b"AAC" => AminoAcid::Asn,
            b"AAA" | b"AAG" => AminoAcid::Lys,
            b"GAT" | b"GAC" => AminoAcid::Asp,
            b"GAA" | b"GAG" => AminoAcid::Glu,
            b"TGT" | b"TGC" => AminoAcid::Cys,
            b"CGT" | b"CGC" | b"CGA" | b"CGG" | b"AGA" | b"AGG" => AminoAcid::Arg,
            b"GGT" | b"GGC" | b"GGA" | b"GGG" => AminoAcid::Gly,
            b"TAA" | b"TAG" | b"TGA" => break, // Stop codons end translation
            _ => continue,                     // Unknown codon (e.g. with N): skip
        };
        amino_acids.push(aa);
    }
    amino_acids
}
#[cfg(test)]
mod tests {
    use super::*;
    #[test]
    fn test_translate_basic() {
        // Three complete codons translate to three residues.
        let dna = b"ATGGCAGGT";
        let result = translate_dna(dna);
        assert_eq!(result.len(), 3);
        assert_eq!(result[0], AminoAcid::Met);
        assert_eq!(result[1], AminoAcid::Ala);
        assert_eq!(result[2], AminoAcid::Gly);
    }
    #[test]
    fn test_translate_stop_codon() {
        // Translation stops at TAA; only Met and Ala are emitted.
        let dna = b"ATGGCATAA"; // Met-Ala-Stop
        let result = translate_dna(dna);
        assert_eq!(result.len(), 2);
    }
    #[test]
    fn test_hydrophobicity() {
        // Kyte-Doolittle extremes: Ile most hydrophobic, Arg least.
        assert_eq!(AminoAcid::Ile.hydrophobicity(), 4.5);
        assert_eq!(AminoAcid::Arg.hydrophobicity(), -4.5);
    }
    #[test]
    fn test_molecular_weight() {
        let protein = vec![AminoAcid::Met, AminoAcid::Ala, AminoAcid::Gly];
        let mw = molecular_weight(&protein);
        // Met (131.04) + Ala (71.04) + Gly (57.02) + H2O (18.01) = ~277.11
        assert!(mw > 270.0 && mw < 290.0, "MW should be ~277: got {}", mw);
    }
    #[test]
    fn test_isoelectric_point() {
        // Hemoglobin beta N-terminus MVHLTPEEK has pI around 6.7
        let hbb_start = translate_dna(b"ATGGTGCATCTGACTCCTGAGGAGAAG");
        let pi = isoelectric_point(&hbb_start);
        assert!(pi > 4.0 && pi < 10.0, "pI should be reasonable: got {}", pi);
        // Lysine-rich peptide should have high pI
        let basic = vec![
            AminoAcid::Lys,
            AminoAcid::Lys,
            AminoAcid::Lys,
            AminoAcid::Arg,
        ];
        let pi_basic = isoelectric_point(&basic);
        assert!(
            pi_basic > 9.0,
            "Basic peptide pI should be >9: got {}",
            pi_basic
        );
        // Aspartate-rich peptide should have low pI
        let acidic = vec![
            AminoAcid::Asp,
            AminoAcid::Asp,
            AminoAcid::Glu,
            AminoAcid::Glu,
        ];
        let pi_acidic = isoelectric_point(&acidic);
        assert!(
            pi_acidic < 5.0,
            "Acidic peptide pI should be <5: got {}",
            pi_acidic
        );
    }
}

View File

@@ -0,0 +1,253 @@
//! Real DNA Reference Sequences from Public Databases
//!
//! Contains actual human gene sequences from NCBI GenBank / RefSeq.
//! All sequences are public domain reference data from the human genome (GRCh38).
/// Human Hemoglobin Subunit Beta (HBB) - Coding Sequence
///
/// Gene: HBB (hemoglobin subunit beta)
/// Accession: NM_000518.5 (RefSeq mRNA)
/// Organism: Homo sapiens
/// Location: Chromosome 11p15.4
/// CDS: 51..494 (444 bp coding for 147 amino acids + stop)
/// Protein: Hemoglobin beta chain (P68871)
///
/// This is the gene mutated in sickle cell disease (rs334, GAG→GTG at codon 6)
/// and beta-thalassemia. One of the most studied human genes.
///
/// NOTE(review): bases below should be re-verified against NM_000518.5
/// before any use beyond demos/benchmarks.
pub const HBB_CODING_SEQUENCE: &str = concat!(
    // Exon 1 (codons 1-30)
    "ATGGTGCATCTGACTCCTGAGGAGAAGTCTGCCGTTACTGCCCTGTGGGGCAAGGTG",
    // Exon 1 continued + Exon 2 (codons 31-104)
    "AACGTGGATGAAGTTGGTGGTGAGGCCCTGGGCAGGCTGCTGGTGGTCTACCCTTGG",
    "ACCCAGAGGTTCTTTGAGTCCTTTGGGGATCTGTCCACTCCTGATGCTGTTATGGGCA",
    "ACCCTAAGGTGAAGGCTCATGGCAAGAAAGTGCTCGGTGCCTTTAGTGATGGCCTGGC",
    "TCACCTGGACAACCTCAAGGGCACCTTTGCTCACTGCAGTGCCATGGGTGGACCCTTC",
    // Exon 3 (codons 105-146 + stop)
    "CTGGTGGCCTTGGACACCTTGGGCACCCTGCTCAATGACACCCTGGCAAACGCTGTCC",
    "TGGCTCACTTTAAAGCCACTGGCGATGCCACTCAGCTCAATGTGAAACTGGACTGTGT",
    "CCTCAAGGGCCTCTGATAAGAGCTAA",
);
/// Known variant positions in HBB coding sequence
///
/// Positions are 0-based offsets into `HBB_CODING_SEQUENCE`.
pub mod hbb_variants {
    /// Sickle cell variant: GAG→GTG at codon 6 (position 20 in CDS)
    /// rs334, pathogenic, causes HbS
    pub const SICKLE_CELL_POS: usize = 20;
    /// HbC variant: GAG→AAG at codon 6 (position 19 in CDS)
    pub const HBC_POS: usize = 19;
    /// Beta-thalassemia IVS-I-110: G→A (common Mediterranean mutation)
    // NOTE(review): IVS-I-110 is intronic; offset 110 into this CDS-only
    // constant is an approximation — confirm intended coordinate system.
    pub const THAL_IVS1_110: usize = 110;
}
/// Human TP53 (Tumor Protein p53) - Coding Sequence (partial, exons 5-8)
///
/// Gene: TP53 (tumor protein p53)
/// Accession: NM_000546.6 (RefSeq mRNA)
/// Organism: Homo sapiens
/// Location: Chromosome 17p13.1
/// Function: Tumor suppressor, "guardian of the genome"
///
/// Exons 5-8 contain the DNA-binding domain where >80% of cancer
/// mutations cluster (hotspot codons: 175, 245, 248, 249, 273, 282).
///
/// NOTE(review): bases below should be re-verified against NM_000546.6
/// before any use beyond demos/benchmarks.
pub const TP53_EXONS_5_8: &str = concat!(
    // Exon 5 (codons 126-186)
    "TACTCCCCTGCCCTCAACAAGATGTTTTGCCAACTGGCCAAGACCTGCCCTGTGCAGC",
    "TGTGGGTTGATTCCACACCCCCGCCCGGCACCCGCGTCCGCGCCATGGCCATCTACAA",
    "GCAGTCACAGCACATGACGGAGGTTGTGAGGCGCTGCCCCCACCATGAGCGCTGCTCA",
    // Exon 6 (codons 187-224)
    "GATAGCGATGGTCTGGCCCCTCCTCAGCATCTTATCCGAGTGGAAGGAAATTTGCGTG",
    "TGGAGTATTTGGATGACAGAAACACTTTTCGACATAGTGTGGTGGTGCCCTATGAGCC",
    // Exon 7 (codons 225-261)
    "GCCTGAGGTTGGCTCTGACTGTACCACCATCCACTACAACTACATGTGTAACAGTTCCT",
    "GCATGGGCGGCATGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCAG",
    // Exon 8 (codons 262-305)
    "TGGTAATCTACTGGGACGGAACAGCTTTGAGGTGCGTGTTTGTGCCTGTCCTGGGAGA",
    "GACCGGCGCACAGAGGAAGAGAATCTCCGCAAGAAAGGGGAGCCTCACCACGAGCTGC",
    "CCCCAGGGAGCACTAAGCGAGCACTG",
);
/// Known TP53 hotspot mutation positions (relative to exon 5 start)
///
/// Positions are 0-based offsets into `TP53_EXONS_5_8`.
pub mod tp53_variants {
    /// R175H: Most common p53 mutation in cancer (CGC→CAC)
    pub const R175H_POS: usize = 147;
    /// R248W: DNA contact mutation (CGG→TGG)
    pub const R248W_POS: usize = 366;
    /// R273H: DNA contact mutation (CGT→CAT)
    pub const R273H_POS: usize = 441;
}
/// Human BRCA1 - Exon 11 Fragment (ring domain)
///
/// Gene: BRCA1 (BRCA1 DNA repair associated)
/// Accession: NM_007294.4 (RefSeq mRNA)
/// Organism: Homo sapiens
/// Location: Chromosome 17q21.31
/// Function: DNA repair, tumor suppressor
///
/// Exon 11 is the largest exon (~3.4kb) encoding most of the protein.
/// This fragment covers the RING finger domain interaction region.
///
/// NOTE(review): bases below should be re-verified against NM_007294.4
/// before any use beyond demos/benchmarks.
pub const BRCA1_EXON11_FRAGMENT: &str = concat!(
    "GATTTATCTGCTCTTCGCGTTGAAGAAGTACAAAATGTCATTAATGCTATGCAGAAAA",
    "TCTTAGAGTGTCCCATCTGTCTGGAGTTGATCAAGGAACCTGTCTCCACAAAGTGTGA",
    "CCACATATTTTGCAAATTTTGCATGCTGAAACTTCTCAACCAGAAGAAAGGGCCTTCA",
    "CAGTGTCCTTTATGTAAGAATGATATAACCAAAAGGAGCCTACAAGAAAGTACGAGAT",
    "TTAGTCAACTTGTTGAAGAGCTATTGAAAATCATTTGTGCTTTTCAGCTTGACACAGG",
    "ATTTGGAAACTCAAAGAAACATCAATCCAAGAATATTGGAGAAAACAGAGGGAACTCAA",
    "TGATAAATGTTCAGTCTCCTGAAGATCTCCTGTGTTTCCAGCAGAAGAAGAAGCCATT",
    "AAGTATCTTACCTCTTCTAATGAAACTGGCTATCTGCATGAGGATATTGGATTCAGAG",
    "GAAACCCATTCTGGCTGCATTTTGCAGATCTTTTTCCCTTCTGTTAATATCCTGCTAC",
);
/// Human CYP2D6 - Coding Sequence
///
/// Gene: CYP2D6 (cytochrome P450 family 2 subfamily D member 6)
/// Accession: NM_000106.6 (RefSeq mRNA)
/// Organism: Homo sapiens
/// Location: Chromosome 22q13.2
/// Function: Drug metabolism enzyme
///
/// Key pharmacogenomic variants:
/// - *4 (rs3892097): G→A at splice site, abolishes enzyme function
/// - *10 (rs1065852): C→T (P34S), reduced activity (common in East Asian)
/// - *3 (rs35742686): Frameshift deletion
///
/// NOTE(review): bases below should be re-verified against NM_000106.6
/// before any use beyond demos/benchmarks.
pub const CYP2D6_CODING: &str = concat!(
    "ATGGGGCTAGAAGCACTGGTGCCCCTGGCCGTGATAGCCGCACTCCTCTGCCTCGCTC",
    "TGTCCACCTTGGCAACCGTGATACCCTCTGTCACTTTGATACTGATGTCCAAGAAGAGG",
    "CGCTTCTCCGTGTCCACCTTGCGCCCCTTCGGGGACGTGTTCAGCCTGCAGCTGGCCT",
    "GGAGCCCAGTGAAGGATGAGACCACAGGATTCCCAAGGCCCTGCTCAGTTCCAATGGA",
    "GAACTGAGCACATCCTCAGACTTTGACAAGTGGATCAAAGACTGCAAGGACAAGCCCG",
    "GGGCCCAGCTCACAAGCACAATCCCCAGGATGTACTTCGGGGCCACGGATCCCCACTC",
    "CTCCATCGCCCAGCAGGATGTAGAAACGGGCCAGGCCACCAAAGGTCCTGACTTCATT",
    "GACCCTTACGGGATGGGGCCTCATCCCCAGCGCAGCCTTCATCCTTACGCTGCCTGGC",
    "CTCCTGCTCATGATCTACCTGGCCGTCCCCATCTATGGCC",
);
/// Insulin (INS) gene coding sequence
///
/// Gene: INS (insulin)
/// Accession: NM_000207.3 (RefSeq mRNA)
/// Organism: Homo sapiens
/// Location: Chromosome 11p15.5
/// CDS: 60..392 (333 bp → 110 amino acids preproinsulin)
///
/// The insulin gene is critical for glucose metabolism.
/// Mutations cause neonatal diabetes.
///
/// NOTE(review): bases below should be re-verified against NM_000207.3
/// before any use beyond demos/benchmarks.
pub const INS_CODING: &str = concat!(
    "ATGGCCCTGTGGATGCGCCTCCTGCCCCTGCTGGCGCTGCTGGCCCTCTGGGGACCTG",
    "ACCCAGCCGCAGCCTTTGTGAACCAACACCTGTGCGGCTCACACCTGGTGGAAGCTCT",
    "CTACCTAGTGTGCGGGGAACGAGGCTTCTTCTACACACCCAAGACCCGCCGGGAGGCA",
    "GAGGACCTGCAGGTGGGGCAGGTGGAGCTGGGCGGGGGCCCTGGTGCAGGCAGCCTGC",
    "AGCCCTTGGCCCTGGAGGGGTCCCTGCAGAAGCGTGGCATTGTGGAACAATGCTGTAC",
    "CAGCATCTGCTCCCTCTACCAGCTGGAGAACTACTGCAACTAG",
);
/// Reference sequences for benchmarking (longer, more realistic)
pub mod benchmark {
    /// 1000bp synthetic reference from chr1:10000-11000 pattern
    /// This mimics a typical GC-balanced human genomic region
    pub fn chr1_reference_1kb() -> String {
        // Deterministic repeating motif; GC content ~42% typical of human genome
        let pattern = "ACGTGCATGCTAGCATGCATGCTAGCTAGCTAG\
                       GATCGATCGATCGATCGATCGATCGATCGATCG\
                       ATCGATCGATCGATCATGCATGCATGCATGCAT\
                       GCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAG";
        tile_to_length(pattern, 1000)
    }
    /// 10kb reference for larger benchmarks (ten exact copies of the 1kb one)
    pub fn reference_10kb() -> String {
        tile_to_length(&chr1_reference_1kb(), 10_000)
    }
    /// Tile `pattern` until at least `len` bytes, then trim to exactly `len`.
    ///
    /// `str::repeat` reserves the full size up front, so this allocates once
    /// (the previous push-until-long-enough loop overshot its capacity).
    fn tile_to_length(pattern: &str, len: usize) -> String {
        let copies = len / pattern.len() + 1;
        let mut tiled = pattern.repeat(copies);
        tiled.truncate(len);
        tiled
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::types::DnaSequence;
    #[test]
    fn test_hbb_sequence_valid() {
        // The constant must parse as valid DNA and have CDS-scale length.
        let seq = DnaSequence::from_str(HBB_CODING_SEQUENCE).unwrap();
        assert!(
            seq.len() > 400,
            "HBB CDS should be >400bp, got {}",
            seq.len()
        );
        // Should start with ATG (start codon)
        assert_eq!(seq.get(0), Some(crate::types::Nucleotide::A));
        assert_eq!(seq.get(1), Some(crate::types::Nucleotide::T));
        assert_eq!(seq.get(2), Some(crate::types::Nucleotide::G));
    }
    #[test]
    fn test_tp53_sequence_valid() {
        let seq = DnaSequence::from_str(TP53_EXONS_5_8).unwrap();
        assert!(
            seq.len() > 400,
            "TP53 exons 5-8 should be >400bp, got {}",
            seq.len()
        );
    }
    #[test]
    fn test_brca1_fragment_valid() {
        let seq = DnaSequence::from_str(BRCA1_EXON11_FRAGMENT).unwrap();
        assert!(
            seq.len() > 400,
            "BRCA1 fragment should be >400bp, got {}",
            seq.len()
        );
    }
    #[test]
    fn test_cyp2d6_valid() {
        let seq = DnaSequence::from_str(CYP2D6_CODING).unwrap();
        assert!(
            seq.len() > 400,
            "CYP2D6 should be >400bp, got {}",
            seq.len()
        );
        // Should start with ATG
        assert_eq!(seq.get(0), Some(crate::types::Nucleotide::A));
        assert_eq!(seq.get(1), Some(crate::types::Nucleotide::T));
        assert_eq!(seq.get(2), Some(crate::types::Nucleotide::G));
    }
    #[test]
    fn test_insulin_valid() {
        let seq = DnaSequence::from_str(INS_CODING).unwrap();
        assert!(seq.len() > 300, "INS should be >300bp, got {}", seq.len());
    }
    #[test]
    fn test_hbb_translates_to_hemoglobin() {
        // Round-trip through translation and spot-check the N-terminus.
        let seq = DnaSequence::from_str(HBB_CODING_SEQUENCE).unwrap();
        let protein = crate::protein::translate_dna(seq.to_string().as_bytes());
        // HBB protein starts with Met-Val-His-Leu-Thr-Pro-Glu-Glu-Lys
        assert_eq!(protein[0].to_char(), 'M'); // Methionine (start)
        assert_eq!(protein[1].to_char(), 'V'); // Valine
        assert_eq!(protein[2].to_char(), 'H'); // Histidine
        assert_eq!(protein[3].to_char(), 'L'); // Leucine
        assert!(protein.len() >= 100, "Should produce 100+ amino acids");
    }
    #[test]
    fn test_benchmark_reference_length() {
        // Both helpers must return exactly-sized strings.
        let ref1k = benchmark::chr1_reference_1kb();
        assert_eq!(ref1k.len(), 1000);
        let ref10k = benchmark::reference_10kb();
        assert_eq!(ref10k.len(), 10_000);
    }
}

1469
vendor/ruvector/examples/dna/src/rvdna.rs vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,736 @@
//! Core types for DNA analysis
use crate::error::{DnaError, Result};
use ruvector_core::{
types::{DbOptions, DistanceMetric, HnswConfig},
VectorDB,
};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::fmt;
/// DNA nucleotide base
///
/// Closed alphabet A/C/G/T plus `N` for unknown or ambiguous calls.
/// Declaration order fixes the `to_u8`/`from_u8` encoding (A=0 .. N=4).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum Nucleotide {
    /// Adenine
    A,
    /// Cytosine
    C,
    /// Guanine
    G,
    /// Thymine
    T,
    /// Unknown/ambiguous base
    N,
}
impl Nucleotide {
    /// Watson-Crick complement: A<->T and C<->G pair; `N` maps to itself.
    pub fn complement(&self) -> Self {
        match self {
            Nucleotide::A => Nucleotide::T,
            Nucleotide::T => Nucleotide::A,
            Nucleotide::C => Nucleotide::G,
            Nucleotide::G => Nucleotide::C,
            Nucleotide::N => Nucleotide::N,
        }
    }
    /// Compact encoding in 0..=4 (A=0, C=1, G=2, T=3, N=4).
    pub fn to_u8(&self) -> u8 {
        // Variants carry no payload, so the default discriminants
        // (declaration order) are exactly the 0..=4 encoding.
        *self as u8
    }
    /// Decode the value produced by `to_u8`; anything above 4 is rejected.
    pub fn from_u8(val: u8) -> Result<Self> {
        let base = match val {
            0 => Nucleotide::A,
            1 => Nucleotide::C,
            2 => Nucleotide::G,
            3 => Nucleotide::T,
            4 => Nucleotide::N,
            other => {
                return Err(DnaError::InvalidSequence(format!(
                    "Invalid nucleotide encoding: {}",
                    other
                )))
            }
        };
        Ok(base)
    }
}
impl fmt::Display for Nucleotide {
    /// Renders the base as its single-letter code (A, C, G, T, or N).
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let symbol = match self {
            Nucleotide::A => "A",
            Nucleotide::C => "C",
            Nucleotide::G => "G",
            Nucleotide::T => "T",
            Nucleotide::N => "N",
        };
        f.write_str(symbol)
    }
}
/// DNA sequence
///
/// Owned vector of `Nucleotide` bases in input order; construct via
/// `new` or parse text with `from_str`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct DnaSequence {
    // Private so all construction paths go through the validated APIs.
    bases: Vec<Nucleotide>,
}
impl DnaSequence {
    /// Create new DNA sequence from nucleotides
    ///
    /// Takes ownership of `bases`; no validation is needed because
    /// `Nucleotide` is already a closed alphabet. An empty vector is
    /// accepted here (unlike `from_str`).
    pub fn new(bases: Vec<Nucleotide>) -> Self {
        Self { bases }
    }
/// Create from string (ACGTN)
pub fn from_str(s: &str) -> Result<Self> {
let bases: Result<Vec<_>> = s
.chars()
.map(|c| match c.to_ascii_uppercase() {
'A' => Ok(Nucleotide::A),
'C' => Ok(Nucleotide::C),
'G' => Ok(Nucleotide::G),
'T' => Ok(Nucleotide::T),
'N' => Ok(Nucleotide::N),
_ => Err(DnaError::InvalidSequence(format!(
"Invalid character: {}",
c
))),
})
.collect();
let bases = bases?;
if bases.is_empty() {
return Err(DnaError::EmptySequence);
}
Ok(Self { bases })
}
/// Get complement sequence
pub fn complement(&self) -> Self {
Self {
bases: self.bases.iter().map(|b| b.complement()).collect(),
}
}
/// Get reverse complement
pub fn reverse_complement(&self) -> Self {
Self {
bases: self.bases.iter().rev().map(|b| b.complement()).collect(),
}
}
    /// Convert to k-mer frequency vector for indexing
    ///
    /// Uses rolling polynomial hash: O(1) per k-mer instead of O(k).
    ///
    /// Each k-mer is interpreted as a base-5 number (one digit per
    /// nucleotide via `to_u8`) and its count accumulated at
    /// `vector[hash % dims]`; the final vector is L2-normalized.
    /// Collisions are expected and acceptable for similarity indexing.
    ///
    /// Errors if `k` is 0 or > 15 (5^15 is the largest power fitting
    /// comfortably in u64) or if the sequence is shorter than `k`.
    pub fn to_kmer_vector(&self, k: usize, dims: usize) -> Result<Vec<f32>> {
        if k == 0 || k > 15 {
            return Err(DnaError::InvalidKmerSize(k));
        }
        if self.bases.len() < k {
            return Err(DnaError::InvalidSequence(
                "Sequence shorter than k-mer size".to_string(),
            ));
        }
        let mut vector = vec![0.0f32; dims];
        // Precompute 5^(k-1): the positional weight of the leading digit,
        // needed to subtract it out when the window slides.
        let base: u64 = 5;
        let pow_k = base.pow(k as u32 - 1);
        // Compute initial hash for first k-mer (Horner's method)
        let mut hash = self.bases[..k].iter().fold(0u64, |acc, &b| {
            acc.wrapping_mul(5).wrapping_add(b.to_u8() as u64)
        });
        vector[(hash as usize) % dims] += 1.0;
        // Rolling hash: remove leading nucleotide, add trailing
        for i in 1..=(self.bases.len() - k) {
            let old = self.bases[i - 1].to_u8() as u64;
            let new = self.bases[i + k - 1].to_u8() as u64;
            hash = hash
                .wrapping_sub(old.wrapping_mul(pow_k))
                .wrapping_mul(5)
                .wrapping_add(new);
            vector[(hash as usize) % dims] += 1.0;
        }
        // Normalize to unit vector (skip if all-zero to avoid div by zero)
        let magnitude: f32 = vector.iter().map(|x| x * x).sum::<f32>().sqrt();
        if magnitude > 0.0 {
            let inv = 1.0 / magnitude;
            for v in &mut vector {
                *v *= inv;
            }
        }
        Ok(vector)
    }
/// Get length
pub fn len(&self) -> usize {
self.bases.len()
}
/// Check if empty
pub fn is_empty(&self) -> bool {
self.bases.is_empty()
}
/// Get a nucleotide at a specific index
pub fn get(&self, index: usize) -> Option<Nucleotide> {
self.bases.get(index).copied()
}
/// Get bases
pub fn bases(&self) -> &[Nucleotide] {
&self.bases
}
/// Encode as one-hot vectors (4 floats per nucleotide: A, C, G, T)
pub fn encode_one_hot(&self) -> Vec<f32> {
let mut result = vec![0.0f32; self.bases.len() * 4];
for (i, base) in self.bases.iter().enumerate() {
let offset = i * 4;
match base {
Nucleotide::A => result[offset] = 1.0,
Nucleotide::C => result[offset + 1] = 1.0,
Nucleotide::G => result[offset + 2] = 1.0,
Nucleotide::T => result[offset + 3] = 1.0,
Nucleotide::N => {} // all zeros for N
}
}
result
}
    /// Translate DNA sequence to protein using standard genetic code
    ///
    /// Reads in-frame from the first base, three bases per codon. A trailing
    /// partial codon is ignored, translation stops at the first stop codon
    /// (TAA/TAG/TGA), and any codon not in the table — including codons
    /// containing `N` — maps to `ProteinResidue::X`.
    ///
    /// # Errors
    /// Returns `InvalidSequence` when fewer than 3 bases are available.
    pub fn translate(&self) -> Result<ProteinSequence> {
        if self.bases.len() < 3 {
            return Err(DnaError::InvalidSequence(
                "Sequence too short for translation".to_string(),
            ));
        }
        let mut residues = Vec::new();
        for chunk in self.bases.chunks(3) {
            // Drop the trailing partial codon, if any.
            if chunk.len() < 3 {
                break;
            }
            let codon = (chunk[0], chunk[1], chunk[2]);
            // Standard genetic code lookup; `_` wildcards cover the
            // four-fold degenerate third positions.
            let aa = match codon {
                (Nucleotide::A, Nucleotide::T, Nucleotide::G) => ProteinResidue::M, // Met (start)
                (Nucleotide::T, Nucleotide::G, Nucleotide::G) => ProteinResidue::W, // Trp
                (Nucleotide::T, Nucleotide::T, Nucleotide::T)
                | (Nucleotide::T, Nucleotide::T, Nucleotide::C) => ProteinResidue::F, // Phe
                (Nucleotide::T, Nucleotide::T, Nucleotide::A)
                | (Nucleotide::T, Nucleotide::T, Nucleotide::G)
                | (Nucleotide::C, Nucleotide::T, _) => ProteinResidue::L, // Leu
                (Nucleotide::A, Nucleotide::T, Nucleotide::T)
                | (Nucleotide::A, Nucleotide::T, Nucleotide::C)
                | (Nucleotide::A, Nucleotide::T, Nucleotide::A) => ProteinResidue::I, // Ile
                (Nucleotide::G, Nucleotide::T, _) => ProteinResidue::V, // Val
                (Nucleotide::T, Nucleotide::C, _)
                | (Nucleotide::A, Nucleotide::G, Nucleotide::T)
                | (Nucleotide::A, Nucleotide::G, Nucleotide::C) => ProteinResidue::S, // Ser
                (Nucleotide::C, Nucleotide::C, _) => ProteinResidue::P, // Pro
                (Nucleotide::A, Nucleotide::C, _) => ProteinResidue::T, // Thr
                (Nucleotide::G, Nucleotide::C, _) => ProteinResidue::A, // Ala
                (Nucleotide::T, Nucleotide::A, Nucleotide::T)
                | (Nucleotide::T, Nucleotide::A, Nucleotide::C) => ProteinResidue::Y, // Tyr
                (Nucleotide::C, Nucleotide::A, Nucleotide::T)
                | (Nucleotide::C, Nucleotide::A, Nucleotide::C) => ProteinResidue::H, // His
                (Nucleotide::C, Nucleotide::A, Nucleotide::A)
                | (Nucleotide::C, Nucleotide::A, Nucleotide::G) => ProteinResidue::Q, // Gln
                (Nucleotide::A, Nucleotide::A, Nucleotide::T)
                | (Nucleotide::A, Nucleotide::A, Nucleotide::C) => ProteinResidue::N, // Asn
                (Nucleotide::A, Nucleotide::A, Nucleotide::A)
                | (Nucleotide::A, Nucleotide::A, Nucleotide::G) => ProteinResidue::K, // Lys
                (Nucleotide::G, Nucleotide::A, Nucleotide::T)
                | (Nucleotide::G, Nucleotide::A, Nucleotide::C) => ProteinResidue::D, // Asp
                (Nucleotide::G, Nucleotide::A, Nucleotide::A)
                | (Nucleotide::G, Nucleotide::A, Nucleotide::G) => ProteinResidue::E, // Glu
                (Nucleotide::T, Nucleotide::G, Nucleotide::T)
                | (Nucleotide::T, Nucleotide::G, Nucleotide::C) => ProteinResidue::C, // Cys
                (Nucleotide::C, Nucleotide::G, _)
                | (Nucleotide::A, Nucleotide::G, Nucleotide::A)
                | (Nucleotide::A, Nucleotide::G, Nucleotide::G) => ProteinResidue::R, // Arg
                (Nucleotide::G, Nucleotide::G, _) => ProteinResidue::G, // Gly
                // Stop codons: `break` leaves the chunk loop, ending
                // translation here (remaining codons are discarded).
                (Nucleotide::T, Nucleotide::A, Nucleotide::A)
                | (Nucleotide::T, Nucleotide::A, Nucleotide::G)
                | (Nucleotide::T, Nucleotide::G, Nucleotide::A) => break,
                _ => ProteinResidue::X, // Unknown
            };
            residues.push(aa);
        }
        Ok(ProteinSequence::new(residues))
    }
/// Simple attention-based alignment against a reference sequence
///
/// Uses dot-product attention between one-hot encodings to find
/// the best alignment position.
pub fn align_with_attention(&self, reference: &DnaSequence) -> Result<AlignmentResult> {
if self.is_empty() || reference.is_empty() {
return Err(DnaError::AlignmentError(
"Cannot align empty sequences".to_string(),
));
}
let query_len = self.len();
let ref_len = reference.len();
// Compute dot-product attention scores at each offset
let mut best_score = i32::MIN;
let mut best_offset = 0;
for offset in 0..ref_len.saturating_sub(query_len / 2) {
let mut score: i32 = 0;
let overlap = query_len.min(ref_len - offset);
for i in 0..overlap {
if self.bases[i] == reference.bases[offset + i] {
score += 2; // match
} else {
score -= 1; // mismatch
}
}
if score > best_score {
best_score = score;
best_offset = offset;
}
}
// Build CIGAR string
let overlap = query_len.min(ref_len.saturating_sub(best_offset));
let mut cigar = Vec::new();
let mut match_run = 0;
for i in 0..overlap {
if self.bases[i] == reference.bases[best_offset + i] {
match_run += 1;
} else {
if match_run > 0 {
cigar.push(CigarOp::M(match_run));
match_run = 0;
}
cigar.push(CigarOp::M(1)); // mismatch also represented as M
}
}
if match_run > 0 {
cigar.push(CigarOp::M(match_run));
}
Ok(AlignmentResult {
score: best_score,
cigar,
mapped_position: GenomicPosition {
chromosome: 1,
position: best_offset as u64,
reference_allele: reference
.bases
.get(best_offset)
.copied()
.unwrap_or(Nucleotide::N),
alternate_allele: None,
},
mapping_quality: QualityScore::new(
((best_score.max(0) as f64 / overlap.max(1) as f64) * 60.0).min(60.0) as u8,
)
.unwrap_or(QualityScore(0)),
})
}
}
impl fmt::Display for DnaSequence {
    /// Write the sequence as an unseparated run of base letters.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        self.bases.iter().try_for_each(|base| write!(f, "{}", base))
    }
}
/// Genomic position with variant information
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct GenomicPosition {
    /// Chromosome number (1-22, X=23, Y=24, M=25)
    pub chromosome: u8,
    /// Position on chromosome (0-based)
    pub position: u64,
    /// Reference allele
    pub reference_allele: Nucleotide,
    /// Alternate allele (if variant; `None` for a reference call)
    pub alternate_allele: Option<Nucleotide>,
}
/// Quality score (Phred scale)
///
/// The inner value is kept in the 0-93 range (the printable Phred+33 range)
/// by the validating `QualityScore::new` constructor.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub struct QualityScore(u8);
impl QualityScore {
    /// Create new quality score (0-93, Phred+33)
    ///
    /// # Errors
    /// Returns `InvalidQuality` for scores above 93.
    pub fn new(score: u8) -> Result<Self> {
        match score {
            0..=93 => Ok(Self(score)),
            _ => Err(DnaError::InvalidQuality(score)),
        }
    }
    /// Get raw score
    pub fn value(&self) -> u8 {
        self.0
    }
    /// Convert to probability of error: P = 10^(-Q/10)
    pub fn to_error_probability(&self) -> f64 {
        let q = f64::from(self.0);
        10_f64.powf(-q / 10.0)
    }
}
/// Variant type
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum Variant {
    /// Single nucleotide polymorphism
    Snp {
        /// Location plus ref/alt alleles of the substitution
        position: GenomicPosition,
        /// Call confidence (Phred scale)
        quality: QualityScore,
    },
    /// Insertion
    Insertion {
        /// Location where the bases are inserted
        position: GenomicPosition,
        /// The inserted sequence
        inserted_bases: DnaSequence,
        /// Call confidence (Phred scale)
        quality: QualityScore,
    },
    /// Deletion
    Deletion {
        /// Location where the deletion begins
        position: GenomicPosition,
        /// Number of reference bases removed
        deleted_length: usize,
        /// Call confidence (Phred scale)
        quality: QualityScore,
    },
    /// Structural variant (large rearrangement)
    StructuralVariant {
        /// Chromosome number (same encoding as `GenomicPosition`)
        chromosome: u8,
        /// Start coordinate of the affected region
        start: u64,
        /// End coordinate of the affected region
        end: u64,
        /// Free-form variant class label (e.g. from the SV caller)
        variant_type: String,
        /// Call confidence (Phred scale)
        quality: QualityScore,
    },
}
/// CIGAR operation for alignment
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum CigarOp {
    /// Match/mismatch
    M(usize),
    /// Insertion to reference
    I(usize),
    /// Deletion from reference
    D(usize),
    /// Soft clipping (clipped sequence present in SEQ)
    S(usize),
    /// Hard clipping (clipped sequence NOT present in SEQ)
    H(usize),
}
/// Alignment result
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlignmentResult {
    /// Alignment score
    pub score: i32,
    /// CIGAR string
    pub cigar: Vec<CigarOp>,
    /// Mapped position
    pub mapped_position: GenomicPosition,
    /// Mapping quality
    pub mapping_quality: QualityScore,
}
/// Protein residue (amino acid)
///
/// Variants use the standard single-letter amino-acid codes.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum ProteinResidue {
    /// Alanine
    A,
    /// Cysteine
    C,
    /// Aspartate
    D,
    /// Glutamate
    E,
    /// Phenylalanine
    F,
    /// Glycine
    G,
    /// Histidine
    H,
    /// Isoleucine
    I,
    /// Lysine
    K,
    /// Leucine
    L,
    /// Methionine
    M,
    /// Asparagine
    N,
    /// Proline
    P,
    /// Glutamine
    Q,
    /// Arginine
    R,
    /// Serine
    S,
    /// Threonine
    T,
    /// Valine
    V,
    /// Tryptophan
    W,
    /// Tyrosine
    Y,
    /// Stop codon or unknown
    X,
}
impl ProteinResidue {
/// Get single-letter code
pub fn to_char(&self) -> char {
match self {
ProteinResidue::A => 'A',
ProteinResidue::C => 'C',
ProteinResidue::D => 'D',
ProteinResidue::E => 'E',
ProteinResidue::F => 'F',
ProteinResidue::G => 'G',
ProteinResidue::H => 'H',
ProteinResidue::I => 'I',
ProteinResidue::K => 'K',
ProteinResidue::L => 'L',
ProteinResidue::M => 'M',
ProteinResidue::N => 'N',
ProteinResidue::P => 'P',
ProteinResidue::Q => 'Q',
ProteinResidue::R => 'R',
ProteinResidue::S => 'S',
ProteinResidue::T => 'T',
ProteinResidue::V => 'V',
ProteinResidue::W => 'W',
ProteinResidue::Y => 'Y',
ProteinResidue::X => 'X',
}
}
}
/// Protein sequence
///
/// Ordered list of amino-acid residues, typically produced by
/// `DnaSequence::translate`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ProteinSequence {
    /// Residues in sequence order; private — read via `residues()`.
    residues: Vec<ProteinResidue>,
}
impl ProteinSequence {
/// Create new protein sequence
pub fn new(residues: Vec<ProteinResidue>) -> Self {
Self { residues }
}
/// Get residues
pub fn residues(&self) -> &[ProteinResidue] {
&self.residues
}
/// Get length
pub fn len(&self) -> usize {
self.residues.len()
}
/// Check if empty
pub fn is_empty(&self) -> bool {
self.residues.is_empty()
}
/// Build a simplified contact graph based on sequence distance
///
/// Residues within `distance_threshold` positions of each other
/// are considered potential contacts (simplified from 3D distance).
pub fn build_contact_graph(&self, distance_threshold: f32) -> Result<ContactGraph> {
if self.residues.is_empty() {
return Err(DnaError::InvalidSequence(
"Cannot build contact graph for empty protein".to_string(),
));
}
let n = self.residues.len();
let threshold = distance_threshold as usize;
let mut edges = Vec::new();
for i in 0..n {
for j in (i + 4)..n {
// Simplified: sequence separation as proxy for spatial distance
// In real structure prediction, this would use 3D coordinates
let seq_dist = j - i;
if seq_dist <= threshold {
// Closer in sequence = higher contact probability
let contact_prob = 1.0 / (1.0 + (seq_dist as f32 - 4.0) / threshold as f32);
edges.push((i, j, contact_prob));
}
}
}
Ok(ContactGraph {
num_residues: n,
distance_threshold,
edges,
})
}
/// Predict contacts from a contact graph using residue properties
///
/// Returns (residue_i, residue_j, confidence_score) tuples
pub fn predict_contacts(&self, graph: &ContactGraph) -> Result<Vec<(usize, usize, f32)>> {
let mut predictions: Vec<(usize, usize, f32)> = graph
.edges
.iter()
.map(|&(i, j, base_score)| {
// Boost score for hydrophobic-hydrophobic contacts (protein core)
let boost = if i < self.residues.len() && j < self.residues.len() {
let ri = &self.residues[i];
let rj = &self.residues[j];
// Hydrophobic residues tend to be in protein core
let hydrophobic = |r: &ProteinResidue| {
matches!(
r,
ProteinResidue::A
| ProteinResidue::V
| ProteinResidue::L
| ProteinResidue::I
| ProteinResidue::F
| ProteinResidue::W
| ProteinResidue::M
)
};
if hydrophobic(ri) && hydrophobic(rj) {
1.5
} else {
1.0
}
} else {
1.0
};
(i, j, (base_score * boost).min(1.0))
})
.collect();
// Sort by confidence descending
predictions.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap_or(std::cmp::Ordering::Equal));
Ok(predictions)
}
}
/// Contact graph for protein structure analysis
#[derive(Debug, Clone)]
pub struct ContactGraph {
    /// Number of residues
    pub num_residues: usize,
    /// Distance threshold used
    pub distance_threshold: f32,
    /// Edges: (residue_i, residue_j, contact_probability)
    ///
    /// The third element is the contact probability assigned by
    /// `build_contact_graph` (in (0, 1]), not a physical distance.
    pub edges: Vec<(usize, usize, f32)>,
}
/// K-mer index using RuVector HNSW
pub struct KmerIndex {
    /// Backing vector database holding the k-mer vectors.
    db: VectorDB,
    /// K-mer length the index was built for.
    k: usize,
    /// Dimensionality of the indexed vectors.
    dims: usize,
}
impl KmerIndex {
/// Create new k-mer index
pub fn new(k: usize, dims: usize, storage_path: &str) -> Result<Self> {
let options = DbOptions {
dimensions: dims,
distance_metric: DistanceMetric::Cosine,
storage_path: storage_path.to_string(),
hnsw_config: Some(HnswConfig {
m: 16,
ef_construction: 200,
ef_search: 100,
max_elements: 1_000_000,
}),
quantization: None,
};
let db = VectorDB::new(options)?;
Ok(Self { db, k, dims })
}
/// Get underlying VectorDB
pub fn db(&self) -> &VectorDB {
&self.db
}
/// Get k-mer size
pub fn k(&self) -> usize {
self.k
}
/// Get dimensions
pub fn dims(&self) -> usize {
self.dims
}
}
/// Analysis configuration
///
/// Bundles the tunables for the k-mer index, variant filtering, and the
/// aligner's scoring scheme.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnalysisConfig {
    /// K-mer size for indexing
    pub kmer_size: usize,
    /// Vector dimensions
    pub vector_dims: usize,
    /// Minimum quality score for variants (Phred scale)
    pub min_quality: u8,
    /// Alignment match score
    pub match_score: i32,
    /// Alignment mismatch penalty (negative value penalizes)
    pub mismatch_penalty: i32,
    /// Alignment gap open penalty (negative value penalizes)
    pub gap_open_penalty: i32,
    /// Alignment gap extend penalty (negative value penalizes)
    pub gap_extend_penalty: i32,
    /// Additional pipeline parameters (free-form, JSON-valued)
    pub parameters: HashMap<String, serde_json::Value>,
}
impl Default for AnalysisConfig {
fn default() -> Self {
Self {
kmer_size: 11,
vector_dims: 512,
min_quality: 20,
match_score: 2,
mismatch_penalty: -1,
gap_open_penalty: -3,
gap_extend_penalty: -1,
parameters: HashMap::new(),
}
}
}
#[cfg(test)]
mod tests {
    use super::*;
    #[test]
    fn test_nucleotide_complement() {
        assert_eq!(Nucleotide::A.complement(), Nucleotide::T);
        assert_eq!(Nucleotide::G.complement(), Nucleotide::C);
    }
    #[test]
    fn test_dna_sequence() {
        let seq = DnaSequence::from_str("ACGT").unwrap();
        assert_eq!(seq.len(), 4);
        assert_eq!(seq.to_string(), "ACGT");
        // Lower-case input is accepted and normalized.
        let seq = DnaSequence::from_str("acgt").unwrap();
        assert_eq!(seq.to_string(), "ACGT");
        // Invalid characters and empty input are rejected.
        assert!(DnaSequence::from_str("ACGX").is_err());
        assert!(DnaSequence::from_str("").is_err());
    }
    #[test]
    fn test_reverse_complement() {
        // "ACGT" is its own reverse complement, so it alone cannot catch a
        // reverse-only or complement-only bug; keep it but add a
        // non-palindromic case too.
        let seq = DnaSequence::from_str("ACGT").unwrap();
        assert_eq!(seq.reverse_complement().to_string(), "ACGT");
        let seq = DnaSequence::from_str("AACG").unwrap();
        assert_eq!(seq.reverse_complement().to_string(), "CGTT");
    }
}

View File

@@ -0,0 +1,319 @@
//! Variant calling module for DNA analysis
//!
//! Provides SNP and indel calling from pileup data.
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
/// Pileup column representing reads aligned at a single position
#[derive(Debug, Clone)]
pub struct PileupColumn {
    /// Observed bases from aligned reads; may also contain the indel
    /// markers `b'-'`/`b'*'` (deletion) and `b'+'` (insertion) consumed by
    /// `VariantCaller::call_indel`.
    pub bases: Vec<u8>,
    /// Quality scores for each base (parallel to `bases`)
    pub qualities: Vec<u8>,
    /// Genomic position
    pub position: u64,
    /// Chromosome number
    pub chromosome: u8,
}
/// Genotype classification
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum Genotype {
    /// Homozygous reference (0/0)
    HomRef,
    /// Heterozygous (0/1)
    Het,
    /// Homozygous alternate (1/1)
    HomAlt,
}
/// Variant filter status
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum FilterStatus {
    /// Passed all filters
    Pass,
    /// Failed quality filter (quality below the configured minimum)
    LowQuality,
    /// Failed depth filter (read depth below the configured minimum)
    LowDepth,
}
/// Called variant
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VariantCall {
    /// Chromosome number
    pub chromosome: u8,
    /// Genomic position
    pub position: u64,
    /// Reference allele (upper-cased base)
    pub ref_allele: u8,
    /// Alternate allele; `b'-'` marks a deletion call and `b'+'` an
    /// insertion call (see `VariantCaller::call_indel`)
    pub alt_allele: u8,
    /// Variant quality (Phred-scaled)
    pub quality: f64,
    /// Genotype call
    pub genotype: Genotype,
    /// Total read depth
    pub depth: usize,
    /// Alternate allele depth (reads supporting the alt allele)
    pub allele_depth: usize,
    /// Filter status
    pub filter_status: FilterStatus,
}
/// Variant caller configuration
#[derive(Debug, Clone)]
pub struct VariantCallerConfig {
    /// Minimum base quality to consider (bases below this are ignored)
    pub min_quality: u8,
    /// Minimum read depth required to emit a call
    pub min_depth: usize,
    /// Minimum alternate allele frequency for heterozygous call
    pub het_threshold: f64,
    /// Minimum alternate allele frequency for homozygous alt call
    pub hom_alt_threshold: f64,
}
impl Default for VariantCallerConfig {
fn default() -> Self {
Self {
min_quality: 20,
min_depth: 5,
het_threshold: 0.2,
hom_alt_threshold: 0.8,
}
}
}
/// Variant caller that processes pileup data to call SNPs
pub struct VariantCaller {
    /// Thresholds governing quality filtering and genotyping.
    config: VariantCallerConfig,
}
impl VariantCaller {
/// Create a new variant caller with the given configuration
pub fn new(config: VariantCallerConfig) -> Self {
Self { config }
}
/// Call a SNP at a single pileup position
///
/// Returns `Some(VariantCall)` if a variant is detected, `None` if all reads
/// match the reference or depth is insufficient.
pub fn call_snp(&self, pileup: &PileupColumn, reference_base: u8) -> Option<VariantCall> {
let ref_base = reference_base.to_ascii_uppercase();
// Count alleles (only high-quality bases)
let mut allele_counts: HashMap<u8, usize> = HashMap::new();
for (i, &base) in pileup.bases.iter().enumerate() {
let qual = pileup.qualities.get(i).copied().unwrap_or(0);
if qual >= self.config.min_quality {
*allele_counts.entry(base.to_ascii_uppercase()).or_insert(0) += 1;
}
}
let total_depth: usize = allele_counts.values().sum();
if total_depth < self.config.min_depth {
return None;
}
// Find the most common non-reference allele
let mut best_alt: Option<(u8, usize)> = None;
for (&allele, &count) in &allele_counts {
if allele != ref_base {
if best_alt.map_or(true, |(_, best_count)| count > best_count) {
best_alt = Some((allele, count));
}
}
}
let (alt_allele, alt_count) = best_alt?;
let alt_freq = alt_count as f64 / total_depth as f64;
if alt_freq < self.config.het_threshold {
return None;
}
let genotype = if alt_freq >= self.config.hom_alt_threshold {
Genotype::HomAlt
} else {
Genotype::Het
};
// Phred-scaled quality estimate
let quality = -10.0 * (1.0 - alt_freq).max(1e-10).log10() * (alt_count as f64);
Some(VariantCall {
chromosome: pileup.chromosome,
position: pileup.position,
ref_allele: ref_base,
alt_allele,
quality,
genotype,
depth: total_depth,
allele_depth: alt_count,
filter_status: FilterStatus::Pass,
})
}
/// Detect insertions/deletions from pileup data
///
/// Looks for gaps (represented as b'-') in the pileup bases that indicate
/// indels relative to the reference.
pub fn call_indel(
&self,
pileup: &PileupColumn,
reference_base: u8,
next_ref_bases: &[u8],
) -> Option<VariantCall> {
let ref_base = reference_base.to_ascii_uppercase();
let mut del_count = 0usize;
let mut ins_count = 0usize;
for (i, &base) in pileup.bases.iter().enumerate() {
let qual = pileup.qualities.get(i).copied().unwrap_or(0);
if qual < self.config.min_quality {
continue;
}
if base == b'-' || base == b'*' {
del_count += 1;
} else if base == b'+' {
ins_count += 1;
}
}
let total = pileup.bases.len();
if total < self.config.min_depth {
return None;
}
// Check for deletion
if del_count > 0 {
let del_freq = del_count as f64 / total as f64;
if del_freq >= self.config.het_threshold {
let genotype = if del_freq >= self.config.hom_alt_threshold {
Genotype::HomAlt
} else {
Genotype::Het
};
let quality = -10.0 * (1.0 - del_freq).max(1e-10).log10() * (del_count as f64);
return Some(VariantCall {
chromosome: pileup.chromosome,
position: pileup.position,
ref_allele: ref_base,
alt_allele: b'-',
quality,
genotype,
depth: total,
allele_depth: del_count,
filter_status: FilterStatus::Pass,
});
}
}
// Check for insertion
if ins_count > 0 {
let ins_freq = ins_count as f64 / total as f64;
if ins_freq >= self.config.het_threshold {
let genotype = if ins_freq >= self.config.hom_alt_threshold {
Genotype::HomAlt
} else {
Genotype::Het
};
let quality = -10.0 * (1.0 - ins_freq).max(1e-10).log10() * (ins_count as f64);
return Some(VariantCall {
chromosome: pileup.chromosome,
position: pileup.position,
ref_allele: ref_base,
alt_allele: b'+',
quality,
genotype,
depth: total,
allele_depth: ins_count,
filter_status: FilterStatus::Pass,
});
}
}
None
}
/// Apply quality and depth filters to a list of variant calls
pub fn filter_variants(&self, calls: &mut [VariantCall]) {
for call in calls.iter_mut() {
if call.quality < self.config.min_quality as f64 {
call.filter_status = FilterStatus::LowQuality;
} else if call.depth < self.config.min_depth {
call.filter_status = FilterStatus::LowDepth;
}
}
}
/// Generate VCF-formatted output for variant calls
pub fn to_vcf(&self, calls: &[VariantCall], sample_name: &str) -> String {
let mut vcf = String::new();
vcf.push_str("##fileformat=VCFv4.3\n");
vcf.push_str(&format!("##source=RuVectorDNA\n"));
vcf.push_str("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t");
vcf.push_str(sample_name);
vcf.push('\n');
for call in calls {
let filter = match call.filter_status {
FilterStatus::Pass => "PASS",
FilterStatus::LowQuality => "LowQual",
FilterStatus::LowDepth => "LowDepth",
};
let gt = match call.genotype {
Genotype::HomRef => "0/0",
Genotype::Het => "0/1",
Genotype::HomAlt => "1/1",
};
vcf.push_str(&format!(
"chr{}\t{}\t.\t{}\t{}\t{:.1}\t{}\tDP={};AF={:.3}\tGT:DP:AD\t{}:{}:{}\n",
call.chromosome,
call.position,
call.ref_allele as char,
call.alt_allele as char,
call.quality,
filter,
call.depth,
call.allele_depth as f64 / call.depth as f64,
gt,
call.depth,
call.allele_depth,
));
}
vcf
}
}
#[cfg(test)]
mod tests {
    use super::*;
    #[test]
    fn test_variant_caller_creation() {
        let config = VariantCallerConfig::default();
        let _caller = VariantCaller::new(config);
    }
    #[test]
    fn test_snp_calling() {
        let caller = VariantCaller::new(VariantCallerConfig::default());
        // 15 high-quality 'G' reads over an 'A' reference: unambiguous
        // homozygous-alt SNP.
        let pileup = PileupColumn {
            bases: vec![b'G'; 15],
            qualities: vec![40; 15],
            position: 1000,
            chromosome: 1,
        };
        let call = caller.call_snp(&pileup, b'A');
        assert!(call.is_some());
        let call = call.unwrap();
        assert_eq!(call.genotype, Genotype::HomAlt);
        assert_eq!(call.ref_allele, b'A');
        assert_eq!(call.alt_allele, b'G');
        assert_eq!(call.depth, 15);
        assert_eq!(call.allele_depth, 15);
        assert_eq!(call.filter_status, FilterStatus::Pass);
    }
    #[test]
    fn test_snp_none_when_all_reads_match_reference() {
        let caller = VariantCaller::new(VariantCallerConfig::default());
        let pileup = PileupColumn {
            bases: vec![b'A'; 10],
            qualities: vec![40; 10],
            position: 5,
            chromosome: 2,
        };
        assert!(caller.call_snp(&pileup, b'A').is_none());
    }
}