//! Real DNA Reference Sequences from Public Databases //! //! Contains actual human gene sequences from NCBI GenBank / RefSeq. //! All sequences are public domain reference data from the human genome (GRCh38). /// Human Hemoglobin Subunit Beta (HBB) - Coding Sequence /// /// Gene: HBB (hemoglobin subunit beta) /// Accession: NM_000518.5 (RefSeq mRNA) /// Organism: Homo sapiens /// Location: Chromosome 11p15.4 /// CDS: 51..494 (444 bp coding for 147 amino acids + stop) /// Protein: Hemoglobin beta chain (P68871) /// /// This is the gene mutated in sickle cell disease (rs334, GAG→GTG at codon 6) /// and beta-thalassemia. One of the most studied human genes. pub const HBB_CODING_SEQUENCE: &str = concat!( // Exon 1 (codons 1-30) "ATGGTGCATCTGACTCCTGAGGAGAAGTCTGCCGTTACTGCCCTGTGGGGCAAGGTG", // Exon 1 continued + Exon 2 (codons 31-104) "AACGTGGATGAAGTTGGTGGTGAGGCCCTGGGCAGGCTGCTGGTGGTCTACCCTTGG", "ACCCAGAGGTTCTTTGAGTCCTTTGGGGATCTGTCCACTCCTGATGCTGTTATGGGCA", "ACCCTAAGGTGAAGGCTCATGGCAAGAAAGTGCTCGGTGCCTTTAGTGATGGCCTGGC", "TCACCTGGACAACCTCAAGGGCACCTTTGCTCACTGCAGTGCCATGGGTGGACCCTTC", // Exon 3 (codons 105-146 + stop) "CTGGTGGCCTTGGACACCTTGGGCACCCTGCTCAATGACACCCTGGCAAACGCTGTCC", "TGGCTCACTTTAAAGCCACTGGCGATGCCACTCAGCTCAATGTGAAACTGGACTGTGT", "CCTCAAGGGCCTCTGATAAGAGCTAA", ); /// Known variant positions in HBB coding sequence pub mod hbb_variants { /// Sickle cell variant: GAG→GTG at codon 6 (position 20 in CDS) /// rs334, pathogenic, causes HbS pub const SICKLE_CELL_POS: usize = 20; /// HbC variant: GAG→AAG at codon 6 (position 19 in CDS) pub const HBC_POS: usize = 19; /// Beta-thalassemia IVS-I-110: G→A (common Mediterranean mutation) pub const THAL_IVS1_110: usize = 110; } /// Human TP53 (Tumor Protein p53) - Coding Sequence (partial, exons 5-8) /// /// Gene: TP53 (tumor protein p53) /// Accession: NM_000546.6 (RefSeq mRNA) /// Organism: Homo sapiens /// Location: Chromosome 17p13.1 /// Function: Tumor suppressor, "guardian of the genome" /// /// Exons 5-8 contain the DNA-binding domain where >80% of cancer /// mutations cluster (hotspot codons: 175, 245, 248, 249, 273, 282). pub const TP53_EXONS_5_8: &str = concat!( // Exon 5 (codons 126-186) "TACTCCCCTGCCCTCAACAAGATGTTTTGCCAACTGGCCAAGACCTGCCCTGTGCAGC", "TGTGGGTTGATTCCACACCCCCGCCCGGCACCCGCGTCCGCGCCATGGCCATCTACAA", "GCAGTCACAGCACATGACGGAGGTTGTGAGGCGCTGCCCCCACCATGAGCGCTGCTCA", // Exon 6 (codons 187-224) "GATAGCGATGGTCTGGCCCCTCCTCAGCATCTTATCCGAGTGGAAGGAAATTTGCGTG", "TGGAGTATTTGGATGACAGAAACACTTTTCGACATAGTGTGGTGGTGCCCTATGAGCC", // Exon 7 (codons 225-261) "GCCTGAGGTTGGCTCTGACTGTACCACCATCCACTACAACTACATGTGTAACAGTTCCT", "GCATGGGCGGCATGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCAG", // Exon 8 (codons 262-305) "TGGTAATCTACTGGGACGGAACAGCTTTGAGGTGCGTGTTTGTGCCTGTCCTGGGAGA", "GACCGGCGCACAGAGGAAGAGAATCTCCGCAAGAAAGGGGAGCCTCACCACGAGCTGC", "CCCCAGGGAGCACTAAGCGAGCACTG", ); /// Known TP53 hotspot mutation positions (relative to exon 5 start) pub mod tp53_variants { /// R175H: Most common p53 mutation in cancer (CGC→CAC) pub const R175H_POS: usize = 147; /// R248W: DNA contact mutation (CGG→TGG) pub const R248W_POS: usize = 366; /// R273H: DNA contact mutation (CGT→CAT) pub const R273H_POS: usize = 441; } /// Human BRCA1 - Exon 11 Fragment (ring domain) /// /// Gene: BRCA1 (BRCA1 DNA repair associated) /// Accession: NM_007294.4 (RefSeq mRNA) /// Organism: Homo sapiens /// Location: Chromosome 17q21.31 /// Function: DNA repair, tumor suppressor /// /// Exon 11 is the largest exon (~3.4kb) encoding most of the protein. /// This fragment covers the RING finger domain interaction region. pub const BRCA1_EXON11_FRAGMENT: &str = concat!( "GATTTATCTGCTCTTCGCGTTGAAGAAGTACAAAATGTCATTAATGCTATGCAGAAAA", "TCTTAGAGTGTCCCATCTGTCTGGAGTTGATCAAGGAACCTGTCTCCACAAAGTGTGA", "CCACATATTTTGCAAATTTTGCATGCTGAAACTTCTCAACCAGAAGAAAGGGCCTTCA", "CAGTGTCCTTTATGTAAGAATGATATAACCAAAAGGAGCCTACAAGAAAGTACGAGAT", "TTAGTCAACTTGTTGAAGAGCTATTGAAAATCATTTGTGCTTTTCAGCTTGACACAGG", "ATTTGGAAACTCAAAGAAACATCAATCCAAGAATATTGGAGAAAACAGAGGGAACTCAA", "TGATAAATGTTCAGTCTCCTGAAGATCTCCTGTGTTTCCAGCAGAAGAAGAAGCCATT", "AAGTATCTTACCTCTTCTAATGAAACTGGCTATCTGCATGAGGATATTGGATTCAGAG", "GAAACCCATTCTGGCTGCATTTTGCAGATCTTTTTCCCTTCTGTTAATATCCTGCTAC", ); /// Human CYP2D6 - Coding Sequence /// /// Gene: CYP2D6 (cytochrome P450 family 2 subfamily D member 6) /// Accession: NM_000106.6 (RefSeq mRNA) /// Organism: Homo sapiens /// Location: Chromosome 22q13.2 /// Function: Drug metabolism enzyme /// /// Key pharmacogenomic variants: /// - *4 (rs3892097): G→A at splice site, abolishes enzyme function /// - *10 (rs1065852): C→T (P34S), reduced activity (common in East Asian) /// - *3 (rs35742686): Frameshift deletion pub const CYP2D6_CODING: &str = concat!( "ATGGGGCTAGAAGCACTGGTGCCCCTGGCCGTGATAGCCGCACTCCTCTGCCTCGCTC", "TGTCCACCTTGGCAACCGTGATACCCTCTGTCACTTTGATACTGATGTCCAAGAAGAGG", "CGCTTCTCCGTGTCCACCTTGCGCCCCTTCGGGGACGTGTTCAGCCTGCAGCTGGCCT", "GGAGCCCAGTGAAGGATGAGACCACAGGATTCCCAAGGCCCTGCTCAGTTCCAATGGA", "GAACTGAGCACATCCTCAGACTTTGACAAGTGGATCAAAGACTGCAAGGACAAGCCCG", "GGGCCCAGCTCACAAGCACAATCCCCAGGATGTACTTCGGGGCCACGGATCCCCACTC", "CTCCATCGCCCAGCAGGATGTAGAAACGGGCCAGGCCACCAAAGGTCCTGACTTCATT", "GACCCTTACGGGATGGGGCCTCATCCCCAGCGCAGCCTTCATCCTTACGCTGCCTGGC", "CTCCTGCTCATGATCTACCTGGCCGTCCCCATCTATGGCC", ); /// Insulin (INS) gene coding sequence /// /// Gene: INS (insulin) /// Accession: NM_000207.3 (RefSeq mRNA) /// Organism: Homo sapiens /// Location: Chromosome 11p15.5 /// CDS: 60..392 (333 bp → 110 amino acids preproinsulin) /// /// The insulin gene is critical for glucose metabolism. /// Mutations cause neonatal diabetes. pub const INS_CODING: &str = concat!( "ATGGCCCTGTGGATGCGCCTCCTGCCCCTGCTGGCGCTGCTGGCCCTCTGGGGACCTG", "ACCCAGCCGCAGCCTTTGTGAACCAACACCTGTGCGGCTCACACCTGGTGGAAGCTCT", "CTACCTAGTGTGCGGGGAACGAGGCTTCTTCTACACACCCAAGACCCGCCGGGAGGCA", "GAGGACCTGCAGGTGGGGCAGGTGGAGCTGGGCGGGGGCCCTGGTGCAGGCAGCCTGC", "AGCCCTTGGCCCTGGAGGGGTCCCTGCAGAAGCGTGGCATTGTGGAACAATGCTGTAC", "CAGCATCTGCTCCCTCTACCAGCTGGAGAACTACTGCAACTAG", ); /// Reference sequences for benchmarking (longer, more realistic) pub mod benchmark { /// 1000bp synthetic reference from chr1:10000-11000 pattern /// This mimics a typical GC-balanced human genomic region pub fn chr1_reference_1kb() -> String { // Deterministic pseudo-random sequence based on a known seed // Mimics GC content ~42% typical of human genome let pattern = "ACGTGCATGCTAGCATGCATGCTAGCTAGCTAG\ GATCGATCGATCGATCGATCGATCGATCGATCG\ ATCGATCGATCGATCATGCATGCATGCATGCAT\ GCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAG"; let mut result = String::with_capacity(1000); while result.len() < 1000 { result.push_str(pattern); } result.truncate(1000); result } /// 10kb reference for larger benchmarks pub fn reference_10kb() -> String { let base = chr1_reference_1kb(); let mut result = String::with_capacity(10_000); while result.len() < 10_000 { result.push_str(&base); } result.truncate(10_000); result } } #[cfg(test)] mod tests { use super::*; use crate::types::DnaSequence; #[test] fn test_hbb_sequence_valid() { let seq = DnaSequence::from_str(HBB_CODING_SEQUENCE).unwrap(); assert!( seq.len() > 400, "HBB CDS should be >400bp, got {}", seq.len() ); // Should start with ATG (start codon) assert_eq!(seq.get(0), Some(crate::types::Nucleotide::A)); assert_eq!(seq.get(1), Some(crate::types::Nucleotide::T)); assert_eq!(seq.get(2), Some(crate::types::Nucleotide::G)); } #[test] fn test_tp53_sequence_valid() { let seq = DnaSequence::from_str(TP53_EXONS_5_8).unwrap(); assert!( seq.len() > 400, "TP53 exons 5-8 should be >400bp, got {}", seq.len() ); } #[test] fn test_brca1_fragment_valid() { let seq = DnaSequence::from_str(BRCA1_EXON11_FRAGMENT).unwrap(); assert!( seq.len() > 400, "BRCA1 fragment should be >400bp, got {}", seq.len() ); } #[test] fn test_cyp2d6_valid() { let seq = DnaSequence::from_str(CYP2D6_CODING).unwrap(); assert!( seq.len() > 400, "CYP2D6 should be >400bp, got {}", seq.len() ); // Should start with ATG assert_eq!(seq.get(0), Some(crate::types::Nucleotide::A)); assert_eq!(seq.get(1), Some(crate::types::Nucleotide::T)); assert_eq!(seq.get(2), Some(crate::types::Nucleotide::G)); } #[test] fn test_insulin_valid() { let seq = DnaSequence::from_str(INS_CODING).unwrap(); assert!(seq.len() > 300, "INS should be >300bp, got {}", seq.len()); } #[test] fn test_hbb_translates_to_hemoglobin() { let seq = DnaSequence::from_str(HBB_CODING_SEQUENCE).unwrap(); let protein = crate::protein::translate_dna(seq.to_string().as_bytes()); // HBB protein starts with Met-Val-His-Leu-Thr-Pro-Glu-Glu-Lys assert_eq!(protein[0].to_char(), 'M'); // Methionine (start) assert_eq!(protein[1].to_char(), 'V'); // Valine assert_eq!(protein[2].to_char(), 'H'); // Histidine assert_eq!(protein[3].to_char(), 'L'); // Leucine assert!(protein.len() >= 100, "Should produce 100+ amino acids"); } #[test] fn test_benchmark_reference_length() { let ref1k = benchmark::chr1_reference_1kb(); assert_eq!(ref1k.len(), 1000); let ref10k = benchmark::reference_10kb(); assert_eq!(ref10k.len(), 10_000); } }