Files
wifi-densepose/vendor/ruvector/examples/dna/src/real_data.rs

254 lines
9.7 KiB
Rust

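//! Real DNA Reference Sequences from Public Databases
//!
//! Contains human gene sequences from NCBI GenBank / RefSeq; the accession for
//! each one is listed on its constant, together with offsets of well-known
//! variants for some of the genes.
//! All gene sequences are public domain reference data from the human genome (GRCh38).
//! The `benchmark` module is the exception: its sequences are synthetic and
//! exist only to provide inputs of a fixed size.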
/// Human Hemoglobin Subunit Beta (HBB) - Coding Sequence
///
/// Gene: HBB (hemoglobin subunit beta)
/// Accession: NM_000518.5 (RefSeq mRNA)
/// Organism: Homo sapiens
/// Location: Chromosome 11p15.4
/// CDS: 51..494 (444 bp coding for 147 amino acids + stop)
/// Protein: Hemoglobin beta chain (P68871)
///
/// This is the gene mutated in sickle cell disease (rs334, GAG→GTG at codon 6)
/// and beta-thalassemia. One of the most studied human genes.
///
/// The value is one `&'static str` of upper-case `A`/`C`/`G`/`T` characters,
/// joined at compile time by `concat!` from the chunks below. It begins with
/// the `ATG` start codon.
///
/// NOTE(review): the chunks below appear to total 430 bases, which is neither
/// the 444 bp quoted above nor a multiple of 3. Verify the literal against
/// NM_000518.5 before relying on full-length translation or on offsets near
/// the 3' end.
pub const HBB_CODING_SEQUENCE: &str = concat!(
// Exon 1 (codons 1-30)
// NOTE(review): the exon/codon labels look approximate. The chunk breaks
// follow line width (57-58 bases each), e.g. this first chunk holds 19
// codons rather than 30.
"ATGGTGCATCTGACTCCTGAGGAGAAGTCTGCCGTTACTGCCCTGTGGGGCAAGGTG",
// Exon 1 continued + Exon 2 (codons 31-104)
"AACGTGGATGAAGTTGGTGGTGAGGCCCTGGGCAGGCTGCTGGTGGTCTACCCTTGG",
"ACCCAGAGGTTCTTTGAGTCCTTTGGGGATCTGTCCACTCCTGATGCTGTTATGGGCA",
"ACCCTAAGGTGAAGGCTCATGGCAAGAAAGTGCTCGGTGCCTTTAGTGATGGCCTGGC",
"TCACCTGGACAACCTCAAGGGCACCTTTGCTCACTGCAGTGCCATGGGTGGACCCTTC",
// Exon 3 (codons 105-146 + stop)
"CTGGTGGCCTTGGACACCTTGGGCACCCTGCTCAATGACACCCTGGCAAACGCTGTCC",
"TGGCTCACTTTAAAGCCACTGGCGATGCCACTCAGCTCAATGTGAAACTGGACTGTGT",
"CCTCAAGGGCCTCTGATAAGAGCTAA",
);
/// Known variant positions in HBB coding sequence
///
/// NOTE(review): `SICKLE_CELL_POS` and `HBC_POS` work out to 1-based CDS
/// coordinates. In `HBB_CODING_SEQUENCE` the affected codon is the first `GAG`,
/// the seventh triplet, at 0-based offsets 18..=20 ("codon 6" presumably uses
/// the legacy numbering that skips the initiator Met). The offsets in
/// `tp53_variants`, by contrast, are 0-based. Subtract 1 before using the
/// values here as string or slice indices, and confirm against the callers.
pub mod hbb_variants {
/// Sickle cell variant: GAG→GTG at codon 6 (position 20 in CDS)
/// rs334, pathogenic, causes HbS
///
/// The changed base is the middle `A` of the codon: 1-based position 20,
/// i.e. 0-based offset 19 in `HBB_CODING_SEQUENCE`.
pub const SICKLE_CELL_POS: usize = 20;
/// HbC variant: GAG→AAG at codon 6 (position 19 in CDS)
///
/// The changed base is the first `G` of the same codon: 1-based position 19,
/// i.e. 0-based offset 18 in `HBB_CODING_SEQUENCE`.
pub const HBC_POS: usize = 19;
/// Beta-thalassemia IVS-I-110: G→A (common Mediterranean mutation)
///
/// NOTE(review): "IVS-I-110" names a position inside intron 1, which is not
/// part of a coding sequence. 110 is presumably the offset within that intron
/// and not a meaningful index into `HBB_CODING_SEQUENCE`; confirm intended use.
pub const THAL_IVS1_110: usize = 110;
}
/// Human TP53 (Tumor Protein p53) - Coding Sequence (partial, exons 5-8)
///
/// Gene: TP53 (tumor protein p53)
/// Accession: NM_000546.6 (RefSeq mRNA)
/// Organism: Homo sapiens
/// Location: Chromosome 17p13.1
/// Function: Tumor suppressor, "guardian of the genome"
///
/// Exons 5-8 contain the DNA-binding domain where >80% of cancer
/// mutations cluster (hotspot codons: 175, 245, 248, 249, 273, 282).
///
/// The value is one `&'static str` joined at compile time by `concat!`.
/// Offset 0 is taken to be the first base of codon 126, which is the
/// convention the `tp53_variants` offsets rely on. At those offsets the
/// literal reads `CGC` (147), `CGG` (366) and `CGT` (441), matching the
/// reference codons named there.
///
/// NOTE(review): the chunks appear to total 549 bases (183 codons), slightly
/// more than the codon range 126-305 suggested by the labels below. The labels
/// mark approximate positions; chunk breaks follow line width, not exon
/// boundaries.
pub const TP53_EXONS_5_8: &str = concat!(
// Exon 5 (codons 126-186)
"TACTCCCCTGCCCTCAACAAGATGTTTTGCCAACTGGCCAAGACCTGCCCTGTGCAGC",
"TGTGGGTTGATTCCACACCCCCGCCCGGCACCCGCGTCCGCGCCATGGCCATCTACAA",
"GCAGTCACAGCACATGACGGAGGTTGTGAGGCGCTGCCCCCACCATGAGCGCTGCTCA",
// Exon 6 (codons 187-224)
"GATAGCGATGGTCTGGCCCCTCCTCAGCATCTTATCCGAGTGGAAGGAAATTTGCGTG",
"TGGAGTATTTGGATGACAGAAACACTTTTCGACATAGTGTGGTGGTGCCCTATGAGCC",
// Exon 7 (codons 225-261)
"GCCTGAGGTTGGCTCTGACTGTACCACCATCCACTACAACTACATGTGTAACAGTTCCT",
"GCATGGGCGGCATGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCAG",
// Exon 8 (codons 262-305)
"TGGTAATCTACTGGGACGGAACAGCTTTGAGGTGCGTGTTTGTGCCTGTCCTGGGAGA",
"GACCGGCGCACAGAGGAAGAGAATCTCCGCAAGAAAGGGGAGCCTCACCACGAGCTGC",
"CCCCAGGGAGCACTAAGCGAGCACTG",
);
/// Known TP53 hotspot mutation positions (relative to exon 5 start)
///
/// Each value equals `(codon - 126) * 3`, i.e. the 0-based offset in
/// `TP53_EXONS_5_8` of the FIRST base of the named codon. The base that
/// actually changes may be the first or the second base of that codon, as
/// noted per constant.
///
/// NOTE(review): `hbb_variants` uses 1-based positions of the changed base, so
/// the two modules follow different conventions. Do not mix them without
/// adjusting.
pub mod tp53_variants {
/// R175H: Most common p53 mutation in cancer (CGC→CAC)
///
/// Codon 175 occupies offsets 147..=149; the substituted base is the middle
/// one (offset 148).
pub const R175H_POS: usize = 147;
/// R248W: DNA contact mutation (CGG→TGG)
///
/// Codon 248 occupies offsets 366..=368; the substituted base is the first
/// one (offset 366).
pub const R248W_POS: usize = 366;
/// R273H: DNA contact mutation (CGT→CAT)
///
/// Codon 273 occupies offsets 441..=443; the substituted base is the middle
/// one (offset 442).
pub const R273H_POS: usize = 441;
}
/// Human BRCA1 - Exon 11 Fragment (ring domain)
///
/// Gene: BRCA1 (BRCA1 DNA repair associated)
/// Accession: NM_007294.4 (RefSeq mRNA)
/// Organism: Homo sapiens
/// Location: Chromosome 17q21.31
/// Function: DNA repair, tumor suppressor
///
/// Exon 11 is the largest exon (~3.4kb) encoding most of the protein.
/// This fragment covers the RING finger domain interaction region.
///
/// The value is one `&'static str` joined at compile time by `concat!` from
/// nine chunks. Unlike the coding-sequence constants in this file it does not
/// begin with an `ATG` start codon.
///
/// NOTE(review): read in frame from offset 0, the leading bases translate to
/// `DLSALRVEEVQ...`, which looks like the N-terminal (RING finger) end of the
/// BRCA1 protein rather than sequence from exon 11. The "EXON11" in the name
/// and the heading above may therefore be a misnomer. Confirm against
/// NM_007294.4 before using this constant for anything exon-specific.
pub const BRCA1_EXON11_FRAGMENT: &str = concat!(
"GATTTATCTGCTCTTCGCGTTGAAGAAGTACAAAATGTCATTAATGCTATGCAGAAAA",
"TCTTAGAGTGTCCCATCTGTCTGGAGTTGATCAAGGAACCTGTCTCCACAAAGTGTGA",
"CCACATATTTTGCAAATTTTGCATGCTGAAACTTCTCAACCAGAAGAAAGGGCCTTCA",
"CAGTGTCCTTTATGTAAGAATGATATAACCAAAAGGAGCCTACAAGAAAGTACGAGAT",
"TTAGTCAACTTGTTGAAGAGCTATTGAAAATCATTTGTGCTTTTCAGCTTGACACAGG",
"ATTTGGAAACTCAAAGAAACATCAATCCAAGAATATTGGAGAAAACAGAGGGAACTCAA",
"TGATAAATGTTCAGTCTCCTGAAGATCTCCTGTGTTTCCAGCAGAAGAAGAAGCCATT",
"AAGTATCTTACCTCTTCTAATGAAACTGGCTATCTGCATGAGGATATTGGATTCAGAG",
"GAAACCCATTCTGGCTGCATTTTGCAGATCTTTTTCCCTTCTGTTAATATCCTGCTAC",
);
/// Human CYP2D6 - Coding Sequence
///
/// Gene: CYP2D6 (cytochrome P450 family 2 subfamily D member 6)
/// Accession: NM_000106.6 (RefSeq mRNA)
/// Organism: Homo sapiens
/// Location: Chromosome 22q13.2
/// Function: Drug metabolism enzyme
///
/// Key pharmacogenomic variants:
/// - *4 (rs3892097): G→A at splice site, abolishes enzyme function
/// - *10 (rs1065852): C→T (P34S), reduced activity (common in East Asian)
/// - *3 (rs35742686): Frameshift deletion
///
/// The value is one `&'static str` joined at compile time by `concat!` from
/// nine chunks. It begins with the `ATG` start codon. No offsets for the
/// variants listed above are defined in this file.
///
/// NOTE(review): the literal is only about 0.5 kb long, so it is presumably a
/// 5' fragment rather than the complete CYP2D6 coding sequence (the full CDS
/// should be roughly three times that length). Verify against NM_000106.6
/// before treating it as full-length.
pub const CYP2D6_CODING: &str = concat!(
"ATGGGGCTAGAAGCACTGGTGCCCCTGGCCGTGATAGCCGCACTCCTCTGCCTCGCTC",
"TGTCCACCTTGGCAACCGTGATACCCTCTGTCACTTTGATACTGATGTCCAAGAAGAGG",
"CGCTTCTCCGTGTCCACCTTGCGCCCCTTCGGGGACGTGTTCAGCCTGCAGCTGGCCT",
"GGAGCCCAGTGAAGGATGAGACCACAGGATTCCCAAGGCCCTGCTCAGTTCCAATGGA",
"GAACTGAGCACATCCTCAGACTTTGACAAGTGGATCAAAGACTGCAAGGACAAGCCCG",
"GGGCCCAGCTCACAAGCACAATCCCCAGGATGTACTTCGGGGCCACGGATCCCCACTC",
"CTCCATCGCCCAGCAGGATGTAGAAACGGGCCAGGCCACCAAAGGTCCTGACTTCATT",
"GACCCTTACGGGATGGGGCCTCATCCCCAGCGCAGCCTTCATCCTTACGCTGCCTGGC",
"CTCCTGCTCATGATCTACCTGGCCGTCCCCATCTATGGCC",
);
/// Insulin (INS) gene coding sequence
///
/// Gene: INS (insulin)
/// Accession: NM_000207.3 (RefSeq mRNA)
/// Organism: Homo sapiens
/// Location: Chromosome 11p15.5
/// CDS: 60..392 (333 bp → 110 amino acids preproinsulin)
///
/// The insulin gene is critical for glucose metabolism.
/// Mutations cause neonatal diabetes.
///
/// The value is one `&'static str` joined at compile time by `concat!`. It
/// runs from the `ATG` start codon through the terminating `TAG` stop codon,
/// and the chunks appear to total the 333 bases stated above (110 codons plus
/// the stop).
pub const INS_CODING: &str = concat!(
"ATGGCCCTGTGGATGCGCCTCCTGCCCCTGCTGGCGCTGCTGGCCCTCTGGGGACCTG",
"ACCCAGCCGCAGCCTTTGTGAACCAACACCTGTGCGGCTCACACCTGGTGGAAGCTCT",
"CTACCTAGTGTGCGGGGAACGAGGCTTCTTCTACACACCCAAGACCCGCCGGGAGGCA",
"GAGGACCTGCAGGTGGGGCAGGTGGAGCTGGGCGGGGGCCCTGGTGCAGGCAGCCTGC",
"AGCCCTTGGCCCTGGAGGGGTCCCTGCAGAAGCGTGGCATTGTGGAACAATGCTGTAC",
"CAGCATCTGCTCCCTCTACCAGCTGGAGAACTACTGCAACTAG",
);
/// Synthetic reference sequences of fixed size, for benchmarking.
///
/// Nothing here is real genomic data: every sequence is produced by tiling a
/// short hand-written motif, so the output is strongly periodic.
pub mod benchmark {
    /// The 132-base motif that every benchmark sequence is tiled from.
    ///
    /// It is made of four 33-base runs of short `ACGT` permutations and has an
    /// even split between G/C and A/T bases.
    const MOTIF: &str = concat!(
        "ACGTGCATGCTAGCATGCATGCTAGCTAGCTAG",
        "GATCGATCGATCGATCGATCGATCGATCGATCG",
        "ATCGATCGATCGATCATGCATGCATGCATGCAT",
        "GCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAG",
    );

    /// Repeats `unit` end to end and cuts the result off at exactly
    /// `target_len` characters.
    ///
    /// Only used with non-empty ASCII input, where characters and bytes
    /// coincide.
    fn tile(unit: &str, target_len: usize) -> String {
        unit.chars().cycle().take(target_len).collect()
    }

    /// Returns a 1000-base synthetic reference.
    ///
    /// The result is the 132-base motif repeated seven times followed by its
    /// first 76 bases. Despite the name, the content is not taken from
    /// chromosome 1; it is fully deterministic and identical on every call.
    pub fn chr1_reference_1kb() -> String {
        tile(MOTIF, 1000)
    }

    /// Returns a 10 000-base synthetic reference for larger benchmarks.
    ///
    /// The result is the 1 kb reference from [`chr1_reference_1kb`] repeated
    /// ten times.
    pub fn reference_10kb() -> String {
        let one_kb = chr1_reference_1kb();
        tile(&one_kb, 10_000)
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::types::{DnaSequence, Nucleotide};

    /// Asserts that the first three bases of `seq` spell the `ATG` start codon.
    fn assert_starts_with_atg(seq: &DnaSequence) {
        assert_eq!(seq.get(0), Some(Nucleotide::A));
        assert_eq!(seq.get(1), Some(Nucleotide::T));
        assert_eq!(seq.get(2), Some(Nucleotide::G));
    }

    /// HBB must parse, exceed 400 bases and open with a start codon.
    #[test]
    fn test_hbb_sequence_valid() {
        let hbb = DnaSequence::from_str(HBB_CODING_SEQUENCE).unwrap();
        let bases = hbb.len();
        assert!(bases > 400, "HBB CDS should be >400bp, got {}", bases);
        assert_starts_with_atg(&hbb);
    }

    /// The TP53 fragment must parse and exceed 400 bases.
    #[test]
    fn test_tp53_sequence_valid() {
        let tp53 = DnaSequence::from_str(TP53_EXONS_5_8).unwrap();
        let bases = tp53.len();
        assert!(bases > 400, "TP53 exons 5-8 should be >400bp, got {}", bases);
    }

    /// The BRCA1 fragment must parse and exceed 400 bases.
    #[test]
    fn test_brca1_fragment_valid() {
        let brca1 = DnaSequence::from_str(BRCA1_EXON11_FRAGMENT).unwrap();
        let bases = brca1.len();
        assert!(bases > 400, "BRCA1 fragment should be >400bp, got {}", bases);
    }

    /// CYP2D6 must parse, exceed 400 bases and open with a start codon.
    #[test]
    fn test_cyp2d6_valid() {
        let cyp2d6 = DnaSequence::from_str(CYP2D6_CODING).unwrap();
        let bases = cyp2d6.len();
        assert!(bases > 400, "CYP2D6 should be >400bp, got {}", bases);
        assert_starts_with_atg(&cyp2d6);
    }

    /// INS must parse and exceed 300 bases.
    #[test]
    fn test_insulin_valid() {
        let ins = DnaSequence::from_str(INS_CODING).unwrap();
        let bases = ins.len();
        assert!(bases > 300, "INS should be >300bp, got {}", bases);
    }

    /// Translating HBB must yield the expected N-terminal residues and a
    /// protein of at least 100 amino acids.
    #[test]
    fn test_hbb_translates_to_hemoglobin() {
        let hbb = DnaSequence::from_str(HBB_CODING_SEQUENCE).unwrap();
        let rendered = hbb.to_string();
        let protein = crate::protein::translate_dna(rendered.as_bytes());

        // The HBB protein opens with Met-Val-His-Leu.
        let expected_prefix = [(0usize, 'M'), (1, 'V'), (2, 'H'), (3, 'L')];
        for &(idx, residue) in expected_prefix.iter() {
            assert_eq!(protein[idx].to_char(), residue);
        }

        assert!(protein.len() >= 100, "Should produce 100+ amino acids");
    }

    /// The synthetic benchmark references must have exactly the advertised
    /// lengths.
    #[test]
    fn test_benchmark_reference_length() {
        assert_eq!(benchmark::chr1_reference_1kb().len(), 1000);
        assert_eq!(benchmark::reference_10kb().len(), 10_000);
    }
}