//! RVDNA - AI-Native Genomic File Format
//!
//! A binary format purpose-built for ultra-low-latency AI genomic analysis.
//! Unlike FASTA/BAM/VCF, which require re-encoding for every AI pipeline,
//! RVDNA stores pre-computed tensors, vector embeddings, and graph structures
//! alongside the raw sequence data.
//!
//! ## Format Structure
//!
//! ```text
//! ┌─────────────────────────────────┐
//! │ Header (64 B + section table)   │  Magic, version, section offsets, CRC32
//! ├─────────────────────────────────┤
//! │ Section 0: Sequence Data        │  2-bit packed nucleotides + quality
//! ├─────────────────────────────────┤
//! │ Section 1: K-mer Vectors        │  Pre-computed HNSW-ready embeddings
//! ├─────────────────────────────────┤
//! │ Section 2: Attention Weights    │  Sparse self-attention matrices
//! ├─────────────────────────────────┤
//! │ Section 3: Variant Tensor       │  Per-position genotype likelihoods
//! ├─────────────────────────────────┤
//! │ Section 4: Protein Embeddings   │  GNN node features + contact graph
//! ├─────────────────────────────────┤
//! │ Section 5: Epigenomic Tracks    │  Methylation betas + aging coeffs
//! ├─────────────────────────────────┤
//! │ Section 6: Metadata             │  JSON provenance + checksums
//! └─────────────────────────────────┘
//! ```
//!
//! ## Key Properties
//!
//! - **2-bit encoding**: 4 bases per byte (4x compression vs ASCII)
//! - **Zero-copy access**: Memory-mappable with aligned sections
//! - **Pre-indexed**: HNSW graph stored inline for instant similarity search
//! - **Tensor-native**: Attention weights and variant probabilities stored as
//!   sparse tensors in COO format for direct GPU/SIMD consumption
- **Streaming**: Chunked sections allow incremental read/write use crate::error::{DnaError, Result}; use crate::types::{DnaSequence, Nucleotide, QualityScore}; use serde::{Deserialize, Serialize}; use std::io::{Read, Write}; // ============================================================================ // Constants // ============================================================================ /// Magic bytes identifying an RVDNA file pub const MAGIC: [u8; 8] = *b"RVDNA\x01\x00\x00"; /// Current format version pub const FORMAT_VERSION: u16 = 1; /// Number of sections in the format pub const NUM_SECTIONS: usize = 7; /// Section alignment boundary (64 bytes for cache-line alignment) pub const SECTION_ALIGN: u64 = 64; /// Header size (fixed) pub const HEADER_SIZE: u64 = 64 + (NUM_SECTIONS as u64 * 16); // 64 base + 16 per section offset // ============================================================================ // Compression Codec // ============================================================================ /// Compression codec for section data #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] #[repr(u8)] pub enum Codec { /// No compression (zero-copy mmap friendly) None = 0, /// LZ4 fast compression (decode at ~4 GB/s) Lz4 = 1, /// Zstd balanced compression Zstd = 2, } impl Codec { fn from_u8(v: u8) -> Result { match v { 0 => Ok(Codec::None), 1 => Ok(Codec::Lz4), 2 => Ok(Codec::Zstd), _ => Err(DnaError::InvalidSequence(format!("Unknown codec: {}", v))), } } } // ============================================================================ // Section Types // ============================================================================ /// Section identifier #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] #[repr(u8)] pub enum SectionType { /// Raw sequence data (2-bit encoded) Sequence = 0, /// Pre-computed k-mer frequency vectors KmerVectors = 1, /// Sparse attention weight matrices AttentionWeights = 2, /// Per-position 
variant probability tensors VariantTensor = 3, /// Protein residue embeddings + contact graph ProteinEmbeddings = 4, /// Epigenomic tracks (methylation, chromatin) EpigenomicTracks = 5, /// JSON metadata and provenance Metadata = 6, } /// Section offset entry in the header #[derive(Debug, Clone, Copy, Serialize, Deserialize)] pub struct SectionEntry { /// Offset from file start (0 = section not present) pub offset: u64, /// Compressed size in bytes pub size: u64, } // ============================================================================ // File Header // ============================================================================ /// RVDNA file header (fixed-size, at byte 0) #[derive(Debug, Clone, Serialize, Deserialize)] pub struct RvdnaHeader { /// Format version pub version: u16, /// Compression codec used pub codec: Codec, /// Flags (bit 0: little-endian, bit 1: has quality scores) pub flags: u32, /// Total sequence length in bases pub sequence_length: u64, /// Number of contigs/chromosomes pub num_contigs: u32, /// Section offset table pub sections: [SectionEntry; NUM_SECTIONS], /// CRC32 checksum of header pub header_checksum: u32, } impl RvdnaHeader { /// Create a new empty header pub fn new(sequence_length: u64, codec: Codec) -> Self { Self { version: FORMAT_VERSION, codec, flags: 0x01, // little-endian by default sequence_length, num_contigs: 1, sections: [SectionEntry { offset: 0, size: 0 }; NUM_SECTIONS], header_checksum: 0, } } /// Set the has_quality flag pub fn with_quality(mut self) -> Self { self.flags |= 0x02; self } /// Check if quality scores are present pub fn has_quality(&self) -> bool { self.flags & 0x02 != 0 } /// Serialize header to bytes pub fn to_bytes(&self) -> Vec { let mut buf = Vec::with_capacity(HEADER_SIZE as usize); // Magic (8 bytes) buf.extend_from_slice(&MAGIC); // Version (2 bytes) buf.extend_from_slice(&self.version.to_le_bytes()); // Codec (1 byte) buf.push(self.codec as u8); // Padding (1 byte) buf.push(0); // Flags (4 
bytes) buf.extend_from_slice(&self.flags.to_le_bytes()); // Sequence length (8 bytes) buf.extend_from_slice(&self.sequence_length.to_le_bytes()); // Num contigs (4 bytes) buf.extend_from_slice(&self.num_contigs.to_le_bytes()); // Reserved (36 bytes to reach 64-byte base header) buf.extend_from_slice(&[0u8; 36]); // Section table (16 bytes per section: 8 offset + 8 size) for section in &self.sections { buf.extend_from_slice(§ion.offset.to_le_bytes()); buf.extend_from_slice(§ion.size.to_le_bytes()); } // Compute checksum over everything except the last 4 bytes let checksum = crc32_simple(&buf); buf.extend_from_slice(&checksum.to_le_bytes()); buf } /// Parse header from bytes pub fn from_bytes(data: &[u8]) -> Result { if data.len() < HEADER_SIZE as usize + 4 { return Err(DnaError::InvalidSequence("Header too short".to_string())); } // Verify magic if &data[0..8] != &MAGIC { return Err(DnaError::InvalidSequence( "Invalid RVDNA magic number".to_string(), )); } let version = u16::from_le_bytes([data[8], data[9]]); let codec = Codec::from_u8(data[10])?; let flags = u32::from_le_bytes([data[12], data[13], data[14], data[15]]); let sequence_length = u64::from_le_bytes(data[16..24].try_into().unwrap()); let num_contigs = u32::from_le_bytes(data[24..28].try_into().unwrap()); let mut sections = [SectionEntry { offset: 0, size: 0 }; NUM_SECTIONS]; let table_start = 64; for i in 0..NUM_SECTIONS { let base = table_start + i * 16; sections[i] = SectionEntry { offset: u64::from_le_bytes(data[base..base + 8].try_into().unwrap()), size: u64::from_le_bytes(data[base + 8..base + 16].try_into().unwrap()), }; } let checksum_offset = table_start + NUM_SECTIONS * 16; let header_checksum = u32::from_le_bytes( data[checksum_offset..checksum_offset + 4] .try_into() .unwrap(), ); // Verify checksum let computed = crc32_simple(&data[..checksum_offset]); if computed != header_checksum { return Err(DnaError::InvalidSequence(format!( "Header checksum mismatch: expected {:08x}, got {:08x}", 
header_checksum, computed ))); } Ok(Self { version, codec, flags, sequence_length, num_contigs, sections, header_checksum, }) } } // ============================================================================ // 2-Bit Sequence Encoding // ============================================================================ /// Encode nucleotides to 2-bit packed representation. /// /// Packing: 4 bases per byte, MSB first. /// A=00, C=01, G=10, T=11. N is encoded as 00 with a separate N-mask. /// /// Returns (packed_data, n_mask) where n_mask has 1-bits for N positions. pub fn encode_2bit(sequence: &[Nucleotide]) -> (Vec, Vec) { let num_bytes = (sequence.len() + 3) / 4; let mut packed = vec![0u8; num_bytes]; let mask_bytes = (sequence.len() + 7) / 8; let mut n_mask = vec![0u8; mask_bytes]; for (i, &base) in sequence.iter().enumerate() { let byte_idx = i / 4; let bit_offset = 6 - (i % 4) * 2; // MSB first: positions 6,4,2,0 let bits = match base { Nucleotide::A => 0b00, Nucleotide::C => 0b01, Nucleotide::G => 0b10, Nucleotide::T => 0b11, Nucleotide::N => { // Mark in N-mask n_mask[i / 8] |= 1 << (7 - i % 8); 0b00 // Encode as A, disambiguated by mask } }; packed[byte_idx] |= bits << bit_offset; } (packed, n_mask) } /// Decode 2-bit packed nucleotides back to sequence pub fn decode_2bit(packed: &[u8], n_mask: &[u8], length: usize) -> Vec { let mut sequence = Vec::with_capacity(length); for i in 0..length { let byte_idx = i / 4; let bit_offset = 6 - (i % 4) * 2; let bits = (packed[byte_idx] >> bit_offset) & 0b11; // Check N-mask let is_n = if i / 8 < n_mask.len() { (n_mask[i / 8] >> (7 - i % 8)) & 1 == 1 } else { false }; let base = if is_n { Nucleotide::N } else { match bits { 0b00 => Nucleotide::A, 0b01 => Nucleotide::C, 0b10 => Nucleotide::G, 0b11 => Nucleotide::T, _ => unreachable!(), } }; sequence.push(base); } sequence } /// Compress quality scores using 6-bit encoding (0-63 range, Phred capped) pub fn encode_quality(qualities: &[u8]) -> Vec { // Pack four 6-bit values 
into three bytes let mut encoded = Vec::with_capacity((qualities.len() * 6 + 7) / 8); let mut bit_buffer: u64 = 0; let mut bits_in_buffer = 0; for &q in qualities { let q6 = q.min(63) as u64; // Cap at 6 bits bit_buffer = (bit_buffer << 6) | q6; bits_in_buffer += 6; while bits_in_buffer >= 8 { bits_in_buffer -= 8; encoded.push((bit_buffer >> bits_in_buffer) as u8); bit_buffer &= (1 << bits_in_buffer) - 1; } } // Flush remaining bits if bits_in_buffer > 0 { encoded.push((bit_buffer << (8 - bits_in_buffer)) as u8); } encoded } /// Decode 6-bit compressed quality scores pub fn decode_quality(encoded: &[u8], count: usize) -> Vec { let mut qualities = Vec::with_capacity(count); let mut bit_buffer: u64 = 0; let mut bits_in_buffer = 0; let mut byte_idx = 0; for _ in 0..count { while bits_in_buffer < 6 && byte_idx < encoded.len() { bit_buffer = (bit_buffer << 8) | encoded[byte_idx] as u64; bits_in_buffer += 8; byte_idx += 1; } bits_in_buffer -= 6; let q = ((bit_buffer >> bits_in_buffer) & 0x3F) as u8; bit_buffer &= (1 << bits_in_buffer) - 1; qualities.push(q); } qualities } // ============================================================================ // Sparse Attention Matrix (COO Format) // ============================================================================ /// Sparse attention matrix stored in COO (Coordinate) format. /// Efficient for storing pre-computed attention weights between sequence positions. 
#[derive(Debug, Clone, Serialize, Deserialize)] pub struct SparseAttention { /// Row indices (query positions) pub rows: Vec, /// Column indices (key positions) pub cols: Vec, /// Attention weight values pub values: Vec, /// Matrix dimensions (rows, cols) pub shape: (u32, u32), /// Window size used for computation pub window_size: u32, } impl SparseAttention { /// Create from dense attention matrix, keeping only values above threshold pub fn from_dense(matrix: &[f32], rows: usize, cols: usize, threshold: f32) -> Self { let mut row_idx = Vec::new(); let mut col_idx = Vec::new(); let mut values = Vec::new(); for i in 0..rows { for j in 0..cols { let val = matrix[i * cols + j]; if val.abs() > threshold { row_idx.push(i as u32); col_idx.push(j as u32); values.push(val); } } } Self { rows: row_idx, cols: col_idx, values, shape: (rows as u32, cols as u32), window_size: cols as u32, } } /// Number of non-zero entries pub fn nnz(&self) -> usize { self.values.len() } /// Sparsity ratio (fraction of zeros) pub fn sparsity(&self) -> f64 { let total = self.shape.0 as f64 * self.shape.1 as f64; if total == 0.0 { return 1.0; } 1.0 - (self.nnz() as f64 / total) } /// Lookup attention weight at (row, col), returns 0.0 if not stored pub fn get(&self, row: u32, col: u32) -> f32 { for i in 0..self.values.len() { if self.rows[i] == row && self.cols[i] == col { return self.values[i]; } } 0.0 } /// Serialize to bytes (for file storage) pub fn to_bytes(&self) -> Vec { let mut buf = Vec::new(); // Shape (8 bytes) buf.extend_from_slice(&self.shape.0.to_le_bytes()); buf.extend_from_slice(&self.shape.1.to_le_bytes()); // Window size (4 bytes) buf.extend_from_slice(&self.window_size.to_le_bytes()); // NNZ count (4 bytes) let nnz = self.nnz() as u32; buf.extend_from_slice(&nnz.to_le_bytes()); // Row indices for &r in &self.rows { buf.extend_from_slice(&r.to_le_bytes()); } // Column indices for &c in &self.cols { buf.extend_from_slice(&c.to_le_bytes()); } // Values for &v in &self.values { 
buf.extend_from_slice(&v.to_le_bytes()); } buf } /// Deserialize from bytes pub fn from_bytes(data: &[u8]) -> Result { if data.len() < 20 { return Err(DnaError::InvalidSequence( "Attention data too short".to_string(), )); } let shape_0 = u32::from_le_bytes(data[0..4].try_into().unwrap()); let shape_1 = u32::from_le_bytes(data[4..8].try_into().unwrap()); let window_size = u32::from_le_bytes(data[8..12].try_into().unwrap()); let nnz = u32::from_le_bytes(data[12..16].try_into().unwrap()) as usize; let expected = 16 + nnz * 12; // 4 bytes row + 4 col + 4 value per entry if data.len() < expected { return Err(DnaError::InvalidSequence( "Attention data truncated".to_string(), )); } let mut offset = 16; let rows: Vec = (0..nnz) .map(|_| { let v = u32::from_le_bytes(data[offset..offset + 4].try_into().unwrap()); offset += 4; v }) .collect(); let cols: Vec = (0..nnz) .map(|_| { let v = u32::from_le_bytes(data[offset..offset + 4].try_into().unwrap()); offset += 4; v }) .collect(); let values: Vec = (0..nnz) .map(|_| { let v = f32::from_le_bytes(data[offset..offset + 4].try_into().unwrap()); offset += 4; v }) .collect(); Ok(Self { rows, cols, values, shape: (shape_0, shape_1), window_size, }) } } // ============================================================================ // Variant Tensor (Per-Position Genotype Likelihoods) // ============================================================================ /// Per-position variant probability tensor. /// Stores genotype likelihoods for each genomic position using f16-quantized values. 
#[derive(Debug, Clone, Serialize, Deserialize)] pub struct VariantTensor { /// Genomic positions with variant data pub positions: Vec, /// Reference alleles (2-bit encoded) pub ref_alleles: Vec, /// Alternate alleles (2-bit encoded) pub alt_alleles: Vec, /// Genotype likelihoods: [P(0/0), P(0/1), P(1/1)] per position (f16 as u16) pub likelihoods: Vec<[u16; 3]>, /// Quality scores (Phred-scaled) pub qualities: Vec, } impl VariantTensor { /// Create empty tensor pub fn new() -> Self { Self { positions: Vec::new(), ref_alleles: Vec::new(), alt_alleles: Vec::new(), likelihoods: Vec::new(), qualities: Vec::new(), } } /// Add a variant position with genotype likelihoods pub fn add_variant( &mut self, position: u64, ref_allele: Nucleotide, alt_allele: Nucleotide, gl_hom_ref: f32, gl_het: f32, gl_hom_alt: f32, quality: u8, ) { self.positions.push(position); self.ref_alleles.push(nucleotide_to_2bit(ref_allele)); self.alt_alleles.push(nucleotide_to_2bit(alt_allele)); self.likelihoods.push([ f32_to_f16(gl_hom_ref), f32_to_f16(gl_het), f32_to_f16(gl_hom_alt), ]); self.qualities.push(quality); } /// Number of variant positions pub fn len(&self) -> usize { self.positions.len() } /// Check if empty pub fn is_empty(&self) -> bool { self.positions.is_empty() } /// Get genotype likelihoods at a position (binary search) pub fn get_likelihoods(&self, position: u64) -> Option<[f32; 3]> { self.positions.binary_search(&position).ok().map(|idx| { [ f16_to_f32(self.likelihoods[idx][0]), f16_to_f32(self.likelihoods[idx][1]), f16_to_f32(self.likelihoods[idx][2]), ] }) } /// Serialize to bytes pub fn to_bytes(&self) -> Vec { let mut buf = Vec::new(); let count = self.len() as u32; buf.extend_from_slice(&count.to_le_bytes()); for &pos in &self.positions { buf.extend_from_slice(&pos.to_le_bytes()); } buf.extend_from_slice(&self.ref_alleles); buf.extend_from_slice(&self.alt_alleles); for &gl in &self.likelihoods { for &v in &gl { buf.extend_from_slice(&v.to_le_bytes()); } } 
buf.extend_from_slice(&self.qualities); buf } /// Deserialize from bytes pub fn from_bytes(data: &[u8]) -> Result { if data.len() < 4 { return Err(DnaError::InvalidSequence( "Variant tensor too short".to_string(), )); } let count = u32::from_le_bytes(data[0..4].try_into().unwrap()) as usize; let mut offset = 4; let positions: Vec = (0..count) .map(|_| { let v = u64::from_le_bytes(data[offset..offset + 8].try_into().unwrap()); offset += 8; v }) .collect(); let ref_alleles = data[offset..offset + count].to_vec(); offset += count; let alt_alleles = data[offset..offset + count].to_vec(); offset += count; let likelihoods: Vec<[u16; 3]> = (0..count) .map(|_| { let a = u16::from_le_bytes(data[offset..offset + 2].try_into().unwrap()); offset += 2; let b = u16::from_le_bytes(data[offset..offset + 2].try_into().unwrap()); offset += 2; let c = u16::from_le_bytes(data[offset..offset + 2].try_into().unwrap()); offset += 2; [a, b, c] }) .collect(); let qualities = data[offset..offset + count].to_vec(); Ok(Self { positions, ref_alleles, alt_alleles, likelihoods, qualities, }) } } // ============================================================================ // K-mer Vector Section // ============================================================================ /// Pre-computed k-mer vector block for HNSW-ready storage #[derive(Debug, Clone, Serialize, Deserialize)] pub struct KmerVectorBlock { /// K-mer size used pub k: u32, /// Vector dimensions pub dimensions: u32, /// Region start position in sequence pub start_pos: u64, /// Region length in bases pub region_len: u64, /// The k-mer frequency vector (f32) pub vector: Vec, /// Optional quantized vector (int8) for fast approximate search pub quantized: Option>, /// Quantization scale factor (to reconstruct f32 from int8) pub quant_scale: f32, } impl KmerVectorBlock { /// Create from a DnaSequence region pub fn from_sequence( sequence: &DnaSequence, start: u64, len: u64, k: u32, dimensions: u32, ) -> Result { let end = (start + 
len).min(sequence.len() as u64); let subseq_bases: Vec = (start as usize..end as usize) .map(|i| sequence.get(i).unwrap_or(Nucleotide::N)) .collect(); let subseq = DnaSequence::new(subseq_bases); let vector = subseq.to_kmer_vector(k as usize, dimensions as usize)?; // Quantize to int8 let max_abs = vector.iter().map(|v| v.abs()).fold(0.0f32, f32::max); let scale = if max_abs > 0.0 { max_abs / 127.0 } else { 1.0 }; let quantized: Vec = vector .iter() .map(|&v| (v / scale).round().max(-128.0).min(127.0) as i8) .collect(); Ok(Self { k, dimensions, start_pos: start, region_len: end - start, vector, quantized: Some(quantized), quant_scale: scale, }) } /// Cosine similarity between this block and another vector pub fn cosine_similarity(&self, other: &[f32]) -> f32 { if self.vector.len() != other.len() { return 0.0; } let dot: f32 = self.vector.iter().zip(other).map(|(a, b)| a * b).sum(); let mag_a: f32 = self.vector.iter().map(|a| a * a).sum::().sqrt(); let mag_b: f32 = other.iter().map(|b| b * b).sum::().sqrt(); if mag_a == 0.0 || mag_b == 0.0 { 0.0 } else { dot / (mag_a * mag_b) } } /// Fast approximate similarity using quantized vectors (4x less memory, ~3x faster) pub fn fast_similarity(&self, other_quantized: &[i8]) -> f32 { match &self.quantized { Some(q) if q.len() == other_quantized.len() => { let dot: i32 = q .iter() .zip(other_quantized) .map(|(&a, &b)| a as i32 * b as i32) .sum(); dot as f32 * self.quant_scale * self.quant_scale } _ => 0.0, } } } // ============================================================================ // RVDNA File Writer // ============================================================================ /// Builder for creating RVDNA files pub struct RvdnaWriter { header: RvdnaHeader, sequence_data: Option<(Vec, Vec)>, // (packed, n_mask) quality_data: Option>, kmer_blocks: Vec, attention: Option, variants: Option, metadata: Option, } impl RvdnaWriter { /// Create a new writer for a sequence pub fn new(sequence: &DnaSequence, codec: Codec) 
-> Self { let (packed, n_mask) = encode_2bit(sequence.bases()); Self { header: RvdnaHeader::new(sequence.len() as u64, codec), sequence_data: Some((packed, n_mask)), quality_data: None, kmer_blocks: Vec::new(), attention: None, variants: None, metadata: None, } } /// Add quality scores pub fn with_quality(mut self, qualities: &[u8]) -> Self { self.quality_data = Some(encode_quality(qualities)); self.header = self.header.with_quality(); self } /// Pre-compute and add k-mer vectors for the sequence pub fn with_kmer_vectors( mut self, sequence: &DnaSequence, k: u32, dimensions: u32, block_size: u64, ) -> Result { let seq_len = sequence.len() as u64; let mut pos = 0u64; while pos < seq_len { let len = block_size.min(seq_len - pos); if len >= k as u64 { let block = KmerVectorBlock::from_sequence(sequence, pos, len, k, dimensions)?; self.kmer_blocks.push(block); } pos += block_size; } Ok(self) } /// Add pre-computed attention weights pub fn with_attention(mut self, attention: SparseAttention) -> Self { self.attention = Some(attention); self } /// Add variant tensor pub fn with_variants(mut self, variants: VariantTensor) -> Self { self.variants = Some(variants); self } /// Add metadata pub fn with_metadata(mut self, metadata: serde_json::Value) -> Self { self.metadata = Some(metadata); self } /// Write the complete RVDNA file pub fn write(&mut self, writer: &mut W) -> Result { let mut sections_data: Vec> = vec![Vec::new(); NUM_SECTIONS]; // Section 0: Sequence data if let Some((ref packed, ref n_mask)) = self.sequence_data { let mut sec = Vec::new(); // Packed length (4 bytes) sec.extend_from_slice(&(packed.len() as u32).to_le_bytes()); // N-mask length (4 bytes) sec.extend_from_slice(&(n_mask.len() as u32).to_le_bytes()); // Packed data sec.extend_from_slice(packed); // N-mask sec.extend_from_slice(n_mask); // Quality (if present) if let Some(ref qual) = self.quality_data { sec.extend_from_slice(&(qual.len() as u32).to_le_bytes()); sec.extend_from_slice(qual); } 
sections_data[0] = sec; } // Section 1: K-mer vectors if !self.kmer_blocks.is_empty() { let mut sec = Vec::new(); sec.extend_from_slice(&(self.kmer_blocks.len() as u32).to_le_bytes()); for block in &self.kmer_blocks { let block_bytes = serde_json::to_vec(block) .map_err(|e| DnaError::PipelineError(e.to_string()))?; sec.extend_from_slice(&(block_bytes.len() as u32).to_le_bytes()); sec.extend_from_slice(&block_bytes); } sections_data[1] = sec; } // Section 2: Attention weights if let Some(ref attn) = self.attention { sections_data[2] = attn.to_bytes(); } // Section 3: Variant tensor if let Some(ref variants) = self.variants { sections_data[3] = variants.to_bytes(); } // Section 6: Metadata if let Some(ref meta) = self.metadata { let meta_bytes = serde_json::to_vec(meta).map_err(|e| DnaError::PipelineError(e.to_string()))?; sections_data[6] = meta_bytes; } // Calculate section offsets (align each section) let header_len = HEADER_SIZE + 4; // +4 for checksum let mut current_offset = align_up(header_len, SECTION_ALIGN); for i in 0..NUM_SECTIONS { if !sections_data[i].is_empty() { self.header.sections[i] = SectionEntry { offset: current_offset, size: sections_data[i].len() as u64, }; current_offset = align_up( current_offset + sections_data[i].len() as u64, SECTION_ALIGN, ); } } // Write header let header_bytes = self.header.to_bytes(); writer.write_all(&header_bytes).map_err(DnaError::IoError)?; // Pad to first section let pad_len = align_up(header_bytes.len() as u64, SECTION_ALIGN) - header_bytes.len() as u64; writer .write_all(&vec![0u8; pad_len as usize]) .map_err(DnaError::IoError)?; let mut total_written = header_bytes.len() + pad_len as usize; // Write sections for i in 0..NUM_SECTIONS { if !sections_data[i].is_empty() { // Pad to alignment let needed = self.header.sections[i].offset as usize - total_written; if needed > 0 { writer .write_all(&vec![0u8; needed]) .map_err(DnaError::IoError)?; total_written += needed; } writer .write_all(§ions_data[i]) 
.map_err(DnaError::IoError)?; total_written += sections_data[i].len(); } } Ok(total_written) } } // ============================================================================ // RVDNA File Reader // ============================================================================ /// Reader for RVDNA files pub struct RvdnaReader { /// Parsed file header pub header: RvdnaHeader, /// Raw file data (for section access) data: Vec, } impl RvdnaReader { /// Open and parse an RVDNA file from bytes pub fn from_bytes(data: Vec) -> Result { let header = RvdnaHeader::from_bytes(&data)?; Ok(Self { header, data }) } /// Read from a reader pub fn from_reader(reader: &mut R) -> Result { let mut data = Vec::new(); reader.read_to_end(&mut data).map_err(DnaError::IoError)?; Self::from_bytes(data) } /// Extract the DNA sequence pub fn read_sequence(&self) -> Result { let section = &self.header.sections[SectionType::Sequence as usize]; if section.size == 0 { return Err(DnaError::EmptySequence); } let start = section.offset as usize; let packed_len = u32::from_le_bytes(self.data[start..start + 4].try_into().unwrap()) as usize; let mask_len = u32::from_le_bytes(self.data[start + 4..start + 8].try_into().unwrap()) as usize; let packed = &self.data[start + 8..start + 8 + packed_len]; let n_mask = &self.data[start + 8 + packed_len..start + 8 + packed_len + mask_len]; let bases = decode_2bit(packed, n_mask, self.header.sequence_length as usize); Ok(DnaSequence::new(bases)) } /// Read k-mer vector blocks pub fn read_kmer_vectors(&self) -> Result> { let section = &self.header.sections[SectionType::KmerVectors as usize]; if section.size == 0 { return Ok(Vec::new()); } let start = section.offset as usize; let count = u32::from_le_bytes(self.data[start..start + 4].try_into().unwrap()) as usize; let mut blocks = Vec::with_capacity(count); let mut offset = start + 4; for _ in 0..count { let block_len = u32::from_le_bytes(self.data[offset..offset + 4].try_into().unwrap()) as usize; offset += 4; let 
block: KmerVectorBlock = serde_json::from_slice(&self.data[offset..offset + block_len]) .map_err(|e| DnaError::PipelineError(e.to_string()))?; blocks.push(block); offset += block_len; } Ok(blocks) } /// Read attention weights pub fn read_attention(&self) -> Result> { let section = &self.header.sections[SectionType::AttentionWeights as usize]; if section.size == 0 { return Ok(None); } let start = section.offset as usize; let end = start + section.size as usize; Ok(Some(SparseAttention::from_bytes(&self.data[start..end])?)) } /// Read variant tensor pub fn read_variants(&self) -> Result> { let section = &self.header.sections[SectionType::VariantTensor as usize]; if section.size == 0 { return Ok(None); } let start = section.offset as usize; let end = start + section.size as usize; Ok(Some(VariantTensor::from_bytes(&self.data[start..end])?)) } /// Read metadata pub fn read_metadata(&self) -> Result> { let section = &self.header.sections[SectionType::Metadata as usize]; if section.size == 0 { return Ok(None); } let start = section.offset as usize; let end = start + section.size as usize; let meta: serde_json::Value = serde_json::from_slice(&self.data[start..end]) .map_err(|e| DnaError::PipelineError(e.to_string()))?; Ok(Some(meta)) } /// Get file size statistics pub fn stats(&self) -> RvdnaStats { let mut section_sizes = [0u64; NUM_SECTIONS]; for i in 0..NUM_SECTIONS { section_sizes[i] = self.header.sections[i].size; } let total_size = self.data.len() as u64; let seq_len = self.header.sequence_length; let bits_per_base = if seq_len > 0 { (section_sizes[0] as f64 * 8.0) / seq_len as f64 } else { 0.0 }; RvdnaStats { total_size, sequence_length: seq_len, bits_per_base, section_sizes, compression_ratio: if seq_len > 0 { seq_len as f64 / total_size as f64 } else { 0.0 }, } } } /// File statistics #[derive(Debug, Clone)] pub struct RvdnaStats { /// Total file size in bytes pub total_size: u64, /// Sequence length in bases pub sequence_length: u64, /// Average bits per base 
for sequence section pub bits_per_base: f64, /// Size of each section in bytes pub section_sizes: [u64; NUM_SECTIONS], /// Bases per byte (overall compression) pub compression_ratio: f64, } // ============================================================================ // Conversion: FASTA → RVDNA // ============================================================================ /// Convert a FASTA-like string to RVDNA format with pre-computed AI features pub fn fasta_to_rvdna( sequence_str: &str, k: u32, vector_dims: u32, block_size: u64, ) -> Result> { let sequence = DnaSequence::from_str(sequence_str)?; let metadata = serde_json::json!({ "format": "RVDNA", "version": FORMAT_VERSION, "source": "fasta_conversion", "sequence_length": sequence.len(), "kmer_k": k, "vector_dimensions": vector_dims, "block_size": block_size, }); let mut writer = RvdnaWriter::new(&sequence, Codec::None) .with_kmer_vectors(&sequence, k, vector_dims, block_size)? .with_metadata(metadata); let mut output = Vec::new(); writer.write(&mut output)?; Ok(output) } // ============================================================================ // Utility Functions // ============================================================================ /// CRC32 lookup table (precomputed for IEEE polynomial 0xEDB88320) const CRC32_TABLE: [u32; 256] = { let mut table = [0u32; 256]; let mut i = 0u32; while i < 256 { let mut crc = i; let mut j = 0; while j < 8 { if crc & 1 != 0 { crc = (crc >> 1) ^ 0xEDB88320; } else { crc >>= 1; } j += 1; } table[i as usize] = crc; i += 1; } table }; /// CRC32 using precomputed lookup table (~8x faster than bit-by-bit) fn crc32_simple(data: &[u8]) -> u32 { let mut crc: u32 = 0xFFFFFFFF; for &byte in data { let idx = ((crc ^ byte as u32) & 0xFF) as usize; crc = CRC32_TABLE[idx] ^ (crc >> 8); } !crc } /// Align value up to boundary fn align_up(value: u64, alignment: u64) -> u64 { (value + alignment - 1) & !(alignment - 1) } /// Convert nucleotide to 2-bit encoding fn 
nucleotide_to_2bit(n: Nucleotide) -> u8 { match n { Nucleotide::A => 0, Nucleotide::C => 1, Nucleotide::G => 2, Nucleotide::T => 3, Nucleotide::N => 0, } } /// Simple f32 → f16 conversion (IEEE 754 half precision) fn f32_to_f16(value: f32) -> u16 { let bits = value.to_bits(); let sign = (bits >> 31) & 1; let exponent = ((bits >> 23) & 0xFF) as i32; let mantissa = bits & 0x7FFFFF; if exponent == 0xFF { // Inf/NaN return ((sign << 15) | 0x7C00 | (mantissa >> 13).min(1)) as u16; } let new_exp = exponent - 127 + 15; if new_exp >= 31 { // Overflow → Inf return ((sign << 15) | 0x7C00) as u16; } if new_exp <= 0 { // Underflow → 0 return (sign << 15) as u16; } ((sign << 15) | (new_exp as u32) << 10 | (mantissa >> 13)) as u16 } /// Simple f16 → f32 conversion fn f16_to_f32(half: u16) -> f32 { let sign = ((half >> 15) & 1) as u32; let exponent = ((half >> 10) & 0x1F) as u32; let mantissa = (half & 0x3FF) as u32; if exponent == 0x1F { // Inf/NaN let bits = (sign << 31) | 0x7F800000 | (mantissa << 13); return f32::from_bits(bits); } if exponent == 0 { if mantissa == 0 { return f32::from_bits(sign << 31); // +/- 0 } // Denormalized let bits = (sign << 31) | ((exponent + 127 - 15) << 23) | (mantissa << 13); return f32::from_bits(bits); } let bits = (sign << 31) | ((exponent + 127 - 15) << 23) | (mantissa << 13); f32::from_bits(bits) } // ============================================================================ // Tests // ============================================================================ #[cfg(test)] mod tests { use super::*; #[test] fn test_2bit_encoding_roundtrip() { let bases = vec![ Nucleotide::A, Nucleotide::C, Nucleotide::G, Nucleotide::T, Nucleotide::A, Nucleotide::N, Nucleotide::G, Nucleotide::T, Nucleotide::C, ]; let (packed, mask) = encode_2bit(&bases); let decoded = decode_2bit(&packed, &mask, bases.len()); assert_eq!(bases, decoded); } #[test] fn test_2bit_compression_ratio() { // 1000 bases should pack into 250 bytes let bases: Vec = (0..1000) .map(|i| 
match i % 4 { 0 => Nucleotide::A, 1 => Nucleotide::C, 2 => Nucleotide::G, _ => Nucleotide::T, }) .collect(); let (packed, _mask) = encode_2bit(&bases); assert_eq!(packed.len(), 250); // 4x compression vs 1 byte per base } #[test] fn test_quality_encoding_roundtrip() { let qualities: Vec = (0..100).map(|i| (i % 42) as u8).collect(); let encoded = encode_quality(&qualities); let decoded = decode_quality(&encoded, qualities.len()); assert_eq!(qualities, decoded); } #[test] fn test_quality_compression_ratio() { // 6-bit encoding: 100 values = 75 bytes (vs 100 bytes raw) let qualities: Vec = vec![30; 100]; let encoded = encode_quality(&qualities); assert!( encoded.len() <= 75, "6-bit should compress: {} bytes", encoded.len() ); } #[test] fn test_sparse_attention_roundtrip() { let dense = vec![ 0.0, 0.5, 0.0, 0.0, 0.3, 0.0, 0.0, 0.7, 0.0, 0.0, 0.9, 0.0, 0.0, 0.1, 0.0, 0.0, ]; let sparse = SparseAttention::from_dense(&dense, 4, 4, 0.05); assert_eq!(sparse.nnz(), 5); // 5 values > 0.05 assert!(sparse.sparsity() > 0.6); // Roundtrip through bytes let bytes = sparse.to_bytes(); let restored = SparseAttention::from_bytes(&bytes).unwrap(); assert_eq!(restored.nnz(), 5); assert!((restored.get(0, 1) - 0.5).abs() < 1e-6); assert!((restored.get(1, 3) - 0.7).abs() < 1e-6); } #[test] fn test_variant_tensor_binary_search() { let mut vt = VariantTensor::new(); vt.add_variant(100, Nucleotide::A, Nucleotide::G, 0.01, 0.99, 0.0, 40); vt.add_variant(200, Nucleotide::C, Nucleotide::T, 0.0, 0.5, 0.5, 35); vt.add_variant(300, Nucleotide::G, Nucleotide::A, 0.9, 0.1, 0.0, 50); let gl = vt.get_likelihoods(200).unwrap(); assert!(gl[1] > 0.4); // Het likelihood assert!(vt.get_likelihoods(150).is_none()); // Not found // Roundtrip let bytes = vt.to_bytes(); let restored = VariantTensor::from_bytes(&bytes).unwrap(); assert_eq!(restored.len(), 3); } #[test] fn test_f16_roundtrip() { for &val in &[0.0f32, 1.0, -1.0, 0.5, 0.001, 100.0] { let half = f32_to_f16(val); let back = f16_to_f32(half); let 
rel_err = if val.abs() > 0.0 { (back - val).abs() / val.abs() } else { back.abs() }; assert!( rel_err < 0.01, "f16 roundtrip failed for {}: got {}", val, back ); } } #[test] fn test_header_roundtrip() { let mut header = RvdnaHeader::new(10000, Codec::None).with_quality(); header.sections[0] = SectionEntry { offset: 192, size: 2500, }; header.sections[1] = SectionEntry { offset: 2752, size: 8192, }; let bytes = header.to_bytes(); let restored = RvdnaHeader::from_bytes(&bytes).unwrap(); assert_eq!(restored.version, FORMAT_VERSION); assert_eq!(restored.codec, Codec::None); assert!(restored.has_quality()); assert_eq!(restored.sequence_length, 10000); assert_eq!(restored.sections[0].offset, 192); assert_eq!(restored.sections[0].size, 2500); } #[test] fn test_full_rvdna_write_read() { // Create a sequence let bases: Vec = "ACGTACGTACGTACGTACGTACGTACGTACGT" .chars() .map(|c| match c { 'A' => Nucleotide::A, 'C' => Nucleotide::C, 'G' => Nucleotide::G, 'T' => Nucleotide::T, _ => Nucleotide::N, }) .collect(); let sequence = DnaSequence::new(bases); // Build RVDNA file let mut writer = RvdnaWriter::new(&sequence, Codec::None) .with_metadata(serde_json::json!({"sample": "test"})); let mut output = Vec::new(); let written = writer.write(&mut output).unwrap(); assert!(written > 0); // Read it back let reader = RvdnaReader::from_bytes(output).unwrap(); assert_eq!(reader.header.sequence_length, 32); assert_eq!(reader.header.codec, Codec::None); let restored_seq = reader.read_sequence().unwrap(); assert_eq!(restored_seq.len(), 32); assert_eq!(restored_seq.to_string(), sequence.to_string()); let meta = reader.read_metadata().unwrap().unwrap(); assert_eq!(meta["sample"], "test"); // Check stats let stats = reader.stats(); assert!( stats.bits_per_base < 8.0, "Should compress below 1 byte/base" ); } #[test] fn test_rvdna_with_kmer_vectors() { let seq = DnaSequence::from_str("ACGTACGTACGTACGTACGTACGTACGTACGT").unwrap(); let mut writer = RvdnaWriter::new(&seq, Codec::None) 
.with_kmer_vectors(&seq, 11, 256, 32) .unwrap(); let mut output = Vec::new(); writer.write(&mut output).unwrap(); let reader = RvdnaReader::from_bytes(output).unwrap(); let blocks = reader.read_kmer_vectors().unwrap(); assert!(!blocks.is_empty()); assert_eq!(blocks[0].k, 11); assert_eq!(blocks[0].dimensions, 256); assert!(blocks[0].quantized.is_some()); } #[test] fn test_fasta_to_rvdna_conversion() { let fasta_seq = "ACGTACGTACGTACGTACGTACGTACGTACGT"; let rvdna_bytes = fasta_to_rvdna(fasta_seq, 11, 256, 1000).unwrap(); let reader = RvdnaReader::from_bytes(rvdna_bytes).unwrap(); let restored = reader.read_sequence().unwrap(); assert_eq!(restored.to_string(), fasta_seq); let stats = reader.stats(); assert!(stats.total_size > 0); // Sequence section uses 2-bit encoding (~2 bits/base at scale) // For short sequences, overhead from length headers increases ratio // At 1000+ bases, this drops well below 3 bits/base assert!( stats.bits_per_base < 8.0, "Should beat 1-byte-per-base encoding, got {:.1} bits/base", stats.bits_per_base ); } #[test] fn test_kmer_vector_similarity() { let seq1 = DnaSequence::from_str("ACGTACGTACGTACGTACGTACGTACGTACGT").unwrap(); let seq2 = DnaSequence::from_str("ACGTACGTACGTACGTACGTACGTACGTACGG").unwrap(); // 1 base diff let seq3 = DnaSequence::from_str("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT").unwrap(); // very different let block1 = KmerVectorBlock::from_sequence(&seq1, 0, 32, 11, 256).unwrap(); let vec2 = seq2.to_kmer_vector(11, 256).unwrap(); let vec3 = seq3.to_kmer_vector(11, 256).unwrap(); let sim_similar = block1.cosine_similarity(&vec2); let sim_different = block1.cosine_similarity(&vec3); assert!( sim_similar > sim_different, "Similar sequence ({}) should score higher than different ({})", sim_similar, sim_different ); } }