Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'
This commit is contained in:
338
vendor/ruvector/examples/dna/src/protein.rs
vendored
Normal file
338
vendor/ruvector/examples/dna/src/protein.rs
vendored
Normal file
@@ -0,0 +1,338 @@
|
||||
//! Protein translation and amino acid analysis module.
//!
//! Provides DNA-to-protein translation using the standard genetic code,
//! and amino acid property calculations.
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// Amino acid representation with full names
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
pub enum AminoAcid {
|
||||
/// Alanine
|
||||
Ala,
|
||||
/// Arginine
|
||||
Arg,
|
||||
/// Asparagine
|
||||
Asn,
|
||||
/// Aspartic acid
|
||||
Asp,
|
||||
/// Cysteine
|
||||
Cys,
|
||||
/// Glutamic acid
|
||||
Glu,
|
||||
/// Glutamine
|
||||
Gln,
|
||||
/// Glycine
|
||||
Gly,
|
||||
/// Histidine
|
||||
His,
|
||||
/// Isoleucine
|
||||
Ile,
|
||||
/// Leucine
|
||||
Leu,
|
||||
/// Lysine
|
||||
Lys,
|
||||
/// Methionine (start codon)
|
||||
Met,
|
||||
/// Phenylalanine
|
||||
Phe,
|
||||
/// Proline
|
||||
Pro,
|
||||
/// Serine
|
||||
Ser,
|
||||
/// Threonine
|
||||
Thr,
|
||||
/// Tryptophan
|
||||
Trp,
|
||||
/// Tyrosine
|
||||
Tyr,
|
||||
/// Valine
|
||||
Val,
|
||||
/// Stop codon
|
||||
Stop,
|
||||
}
|
||||
|
||||
impl AminoAcid {
|
||||
/// Get single-letter code
|
||||
pub fn to_char(&self) -> char {
|
||||
match self {
|
||||
AminoAcid::Ala => 'A',
|
||||
AminoAcid::Arg => 'R',
|
||||
AminoAcid::Asn => 'N',
|
||||
AminoAcid::Asp => 'D',
|
||||
AminoAcid::Cys => 'C',
|
||||
AminoAcid::Glu => 'E',
|
||||
AminoAcid::Gln => 'Q',
|
||||
AminoAcid::Gly => 'G',
|
||||
AminoAcid::His => 'H',
|
||||
AminoAcid::Ile => 'I',
|
||||
AminoAcid::Leu => 'L',
|
||||
AminoAcid::Lys => 'K',
|
||||
AminoAcid::Met => 'M',
|
||||
AminoAcid::Phe => 'F',
|
||||
AminoAcid::Pro => 'P',
|
||||
AminoAcid::Ser => 'S',
|
||||
AminoAcid::Thr => 'T',
|
||||
AminoAcid::Trp => 'W',
|
||||
AminoAcid::Tyr => 'Y',
|
||||
AminoAcid::Val => 'V',
|
||||
AminoAcid::Stop => '*',
|
||||
}
|
||||
}
|
||||
|
||||
/// Get Kyte-Doolittle hydrophobicity value
|
||||
pub fn hydrophobicity(&self) -> f32 {
|
||||
match self {
|
||||
AminoAcid::Ile => 4.5,
|
||||
AminoAcid::Val => 4.2,
|
||||
AminoAcid::Leu => 3.8,
|
||||
AminoAcid::Phe => 2.8,
|
||||
AminoAcid::Cys => 2.5,
|
||||
AminoAcid::Met => 1.9,
|
||||
AminoAcid::Ala => 1.8,
|
||||
AminoAcid::Gly => -0.4,
|
||||
AminoAcid::Thr => -0.7,
|
||||
AminoAcid::Ser => -0.8,
|
||||
AminoAcid::Trp => -0.9,
|
||||
AminoAcid::Tyr => -1.3,
|
||||
AminoAcid::Pro => -1.6,
|
||||
AminoAcid::His => -3.2,
|
||||
AminoAcid::Glu => -3.5,
|
||||
AminoAcid::Gln => -3.5,
|
||||
AminoAcid::Asp => -3.5,
|
||||
AminoAcid::Asn => -3.5,
|
||||
AminoAcid::Lys => -3.9,
|
||||
AminoAcid::Arg => -4.5,
|
||||
AminoAcid::Stop => 0.0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Get average molecular weight in Daltons (monoisotopic)
|
||||
pub fn molecular_weight(&self) -> f64 {
|
||||
match self {
|
||||
AminoAcid::Ala => 71.03711,
|
||||
AminoAcid::Arg => 156.10111,
|
||||
AminoAcid::Asn => 114.04293,
|
||||
AminoAcid::Asp => 115.02694,
|
||||
AminoAcid::Cys => 103.00919,
|
||||
AminoAcid::Glu => 129.04259,
|
||||
AminoAcid::Gln => 128.05858,
|
||||
AminoAcid::Gly => 57.02146,
|
||||
AminoAcid::His => 137.05891,
|
||||
AminoAcid::Ile => 113.08406,
|
||||
AminoAcid::Leu => 113.08406,
|
||||
AminoAcid::Lys => 128.09496,
|
||||
AminoAcid::Met => 131.04049,
|
||||
AminoAcid::Phe => 147.06841,
|
||||
AminoAcid::Pro => 97.05276,
|
||||
AminoAcid::Ser => 87.03203,
|
||||
AminoAcid::Thr => 101.04768,
|
||||
AminoAcid::Trp => 186.07931,
|
||||
AminoAcid::Tyr => 163.06333,
|
||||
AminoAcid::Val => 99.06841,
|
||||
AminoAcid::Stop => 0.0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Get pKa values for Henderson-Hasselbalch isoelectric point calculation
|
||||
/// Returns (pKa_amino, pKa_carboxyl, pKa_sidechain or None)
|
||||
pub fn pka_sidechain(&self) -> Option<f64> {
|
||||
match self {
|
||||
AminoAcid::Asp => Some(3.65),
|
||||
AminoAcid::Glu => Some(4.25),
|
||||
AminoAcid::His => Some(6.00),
|
||||
AminoAcid::Cys => Some(8.18),
|
||||
AminoAcid::Tyr => Some(10.07),
|
||||
AminoAcid::Lys => Some(10.53),
|
||||
AminoAcid::Arg => Some(12.48),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Calculate total molecular weight of a protein in Daltons
|
||||
///
|
||||
/// Accounts for water loss from peptide bond formation.
|
||||
pub fn molecular_weight(protein: &[AminoAcid]) -> f64 {
|
||||
if protein.is_empty() {
|
||||
return 0.0;
|
||||
}
|
||||
// Sum residue weights + water (18.01056 Da) - water for each peptide bond
|
||||
let residue_sum: f64 = protein.iter().map(|aa| aa.molecular_weight()).sum();
|
||||
// N-term H (1.00794) + C-term OH (17.00274) + residues - H2O per bond
|
||||
residue_sum + 18.01056 - (protein.len().saturating_sub(1) as f64 * 0.0) // Already accounted in residue weights
|
||||
}
|
||||
|
||||
/// Estimate isoelectric point (pI) using the bisection method
|
||||
///
|
||||
/// pI is the pH at which the net charge of the protein is zero.
|
||||
/// Uses Henderson-Hasselbalch equation with standard pKa values.
|
||||
pub fn isoelectric_point(protein: &[AminoAcid]) -> f64 {
|
||||
if protein.is_empty() {
|
||||
return 7.0;
|
||||
}
|
||||
|
||||
const PKA_NH2: f64 = 9.69; // N-terminal amino group
|
||||
const PKA_COOH: f64 = 2.34; // C-terminal carboxyl group
|
||||
|
||||
let charge_at_ph = |ph: f64| -> f64 {
|
||||
// N-terminal positive charge
|
||||
let mut charge = 1.0 / (1.0 + 10_f64.powf(ph - PKA_NH2));
|
||||
// C-terminal negative charge
|
||||
charge -= 1.0 / (1.0 + 10_f64.powf(PKA_COOH - ph));
|
||||
|
||||
for aa in protein {
|
||||
if let Some(pka) = aa.pka_sidechain() {
|
||||
match aa {
|
||||
// Positively charged at low pH: His, Lys, Arg
|
||||
AminoAcid::His | AminoAcid::Lys | AminoAcid::Arg => {
|
||||
charge += 1.0 / (1.0 + 10_f64.powf(ph - pka));
|
||||
}
|
||||
// Negatively charged at high pH: Asp, Glu, Cys, Tyr
|
||||
_ => {
|
||||
charge -= 1.0 / (1.0 + 10_f64.powf(pka - ph));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
charge
|
||||
};
|
||||
|
||||
// Bisection method to find pH where charge = 0
|
||||
let mut low = 0.0_f64;
|
||||
let mut high = 14.0_f64;
|
||||
|
||||
for _ in 0..100 {
|
||||
let mid = (low + high) / 2.0;
|
||||
let charge = charge_at_ph(mid);
|
||||
if charge > 0.0 {
|
||||
low = mid;
|
||||
} else {
|
||||
high = mid;
|
||||
}
|
||||
}
|
||||
|
||||
(low + high) / 2.0
|
||||
}
|
||||
|
||||
/// Translate a DNA sequence to a vector of amino acids using the standard genetic code.
|
||||
///
|
||||
/// Translation proceeds in triplets (codons) from the start of the sequence.
|
||||
/// Stop codons (TAA, TAG, TGA) terminate translation.
|
||||
/// Incomplete codons at the end are ignored.
|
||||
pub fn translate_dna(dna: &[u8]) -> Vec<AminoAcid> {
|
||||
let mut proteins = Vec::new();
|
||||
|
||||
for chunk in dna.chunks(3) {
|
||||
if chunk.len() < 3 {
|
||||
break;
|
||||
}
|
||||
|
||||
let codon = [
|
||||
chunk[0].to_ascii_uppercase(),
|
||||
chunk[1].to_ascii_uppercase(),
|
||||
chunk[2].to_ascii_uppercase(),
|
||||
];
|
||||
|
||||
let aa = match &codon {
|
||||
b"ATG" => AminoAcid::Met,
|
||||
b"TGG" => AminoAcid::Trp,
|
||||
b"TTT" | b"TTC" => AminoAcid::Phe,
|
||||
b"TTA" | b"TTG" | b"CTT" | b"CTC" | b"CTA" | b"CTG" => AminoAcid::Leu,
|
||||
b"ATT" | b"ATC" | b"ATA" => AminoAcid::Ile,
|
||||
b"GTT" | b"GTC" | b"GTA" | b"GTG" => AminoAcid::Val,
|
||||
b"TCT" | b"TCC" | b"TCA" | b"TCG" | b"AGT" | b"AGC" => AminoAcid::Ser,
|
||||
b"CCT" | b"CCC" | b"CCA" | b"CCG" => AminoAcid::Pro,
|
||||
b"ACT" | b"ACC" | b"ACA" | b"ACG" => AminoAcid::Thr,
|
||||
b"GCT" | b"GCC" | b"GCA" | b"GCG" => AminoAcid::Ala,
|
||||
b"TAT" | b"TAC" => AminoAcid::Tyr,
|
||||
b"CAT" | b"CAC" => AminoAcid::His,
|
||||
b"CAA" | b"CAG" => AminoAcid::Gln,
|
||||
b"AAT" | b"AAC" => AminoAcid::Asn,
|
||||
b"AAA" | b"AAG" => AminoAcid::Lys,
|
||||
b"GAT" | b"GAC" => AminoAcid::Asp,
|
||||
b"GAA" | b"GAG" => AminoAcid::Glu,
|
||||
b"TGT" | b"TGC" => AminoAcid::Cys,
|
||||
b"CGT" | b"CGC" | b"CGA" | b"CGG" | b"AGA" | b"AGG" => AminoAcid::Arg,
|
||||
b"GGT" | b"GGC" | b"GGA" | b"GGG" => AminoAcid::Gly,
|
||||
b"TAA" | b"TAG" | b"TGA" => break, // Stop codons
|
||||
_ => continue, // Unknown codon, skip
|
||||
};
|
||||
|
||||
proteins.push(aa);
|
||||
}
|
||||
|
||||
proteins
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Three codons with no stop translate to exactly three residues.
    #[test]
    fn test_translate_basic() {
        let result = translate_dna(b"ATGGCAGGT");
        assert_eq!(
            result,
            vec![AminoAcid::Met, AminoAcid::Ala, AminoAcid::Gly]
        );
    }

    /// A stop codon ends translation and is not itself emitted.
    #[test]
    fn test_translate_stop_codon() {
        let result = translate_dna(b"ATGGCATAA"); // Met-Ala-Stop
        assert_eq!(result, vec![AminoAcid::Met, AminoAcid::Ala]);
    }

    /// Spot-check both extremes of the hydrophobicity table.
    #[test]
    fn test_hydrophobicity() {
        assert_eq!(AminoAcid::Ile.hydrophobicity(), 4.5);
        assert_eq!(AminoAcid::Arg.hydrophobicity(), -4.5);
    }

    /// Peptide mass is the residue sum plus one water.
    #[test]
    fn test_molecular_weight() {
        let protein = vec![AminoAcid::Met, AminoAcid::Ala, AminoAcid::Gly];
        let mw = molecular_weight(&protein);
        // Met (131.04049) + Ala (71.03711) + Gly (57.02146) + H2O (18.01056)
        let expected = 277.10962;
        assert!(
            (mw - expected).abs() < 1e-6,
            "MW should be {}: got {}",
            expected,
            mw
        );

        assert_eq!(molecular_weight(&[]), 0.0);
    }

    /// pI lands where the charge model predicts for mixed, basic and acidic
    /// peptides.
    #[test]
    fn test_isoelectric_point() {
        // First nine residues of hemoglobin beta: MVHLTPEEK. With two Glu
        // against one His and one Lys, this model puts the pI of the short
        // peptide slightly above 5.
        let hbb_start = translate_dna(b"ATGGTGCATCTGACTCCTGAGGAGAAG");
        assert_eq!(hbb_start.len(), 9);
        let pi = isoelectric_point(&hbb_start);
        assert!(pi > 5.0 && pi < 5.6, "pI should be ~5.3: got {}", pi);

        // Lysine-rich peptide should have high pI
        let basic = vec![
            AminoAcid::Lys,
            AminoAcid::Lys,
            AminoAcid::Lys,
            AminoAcid::Arg,
        ];
        let pi_basic = isoelectric_point(&basic);
        assert!(
            pi_basic > 9.0,
            "Basic peptide pI should be >9: got {}",
            pi_basic
        );

        // Aspartate-rich peptide should have low pI
        let acidic = vec![
            AminoAcid::Asp,
            AminoAcid::Asp,
            AminoAcid::Glu,
            AminoAcid::Glu,
        ];
        let pi_acidic = isoelectric_point(&acidic);
        assert!(
            pi_acidic < 5.0,
            "Acidic peptide pI should be <5: got {}",
            pi_acidic
        );
    }
}
|
||||
Reference in New Issue
Block a user