//! INDEX_SEG encode/decode: varint delta encoding with restart points.
//!
//! Implements the binary layout from the RVF wire spec for INDEX_SEG payloads.

extern crate alloc;

use alloc::vec::Vec;

/// Default restart interval for varint delta encoding.
pub const DEFAULT_RESTART_INTERVAL: u32 = 64;

/// Every section of the payload (header, restart index, adjacency data) is
/// zero-padded to a multiple of this many bytes.
const SECTION_ALIGNMENT: usize = 64;

/// Index segment header (64-byte aligned).
#[derive(Clone, Debug, PartialEq)]
pub struct IndexSegHeader {
    /// 0 = HNSW, 1 = IVF, 2 = flat.
    pub index_type: u8,
    /// Layer level: 0 = A, 1 = B, 2 = C.
    pub layer_level: u8,
    /// HNSW max neighbors per layer.
    pub m: u16,
    /// ef_construction parameter.
    pub ef_construction: u32,
    /// Number of nodes in this segment.
    ///
    /// The encoder writes this value verbatim and the decoder reads exactly
    /// this many nodes, so it should equal `IndexSegData::nodes.len()`.
    pub node_count: u64,
}

/// Encoded adjacency data for a single node.
#[derive(Clone, Debug, PartialEq)]
pub struct NodeAdjacency {
    /// The node ID.
    ///
    /// This value is not written by [`encode_index_seg`]; [`decode_index_seg`]
    /// assigns each node its position in the segment instead.
    pub node_id: u64,
    /// Neighbor IDs per HNSW layer (index 0 = layer 0).
    pub layers: Vec<Vec<u64>>,
}

/// Full decoded index segment data.
#[derive(Clone, Debug, PartialEq)]
pub struct IndexSegData {
    /// Fixed-size segment header.
    pub header: IndexSegHeader,
    /// Every `restart_interval`-th node stores absolute neighbor IDs.
    pub restart_interval: u32,
    /// Adjacency lists, in segment order.
    pub nodes: Vec<NodeAdjacency>,
}

// ── Varint Encoding (LEB128) ─────────────────────────────────────

/// Encode a u64 as LEB128 varint, appending the bytes to `buf`.
///
/// Emits between 1 and 10 bytes; every byte except the last has its high bit
/// set.
pub fn encode_varint(mut value: u64, buf: &mut Vec<u8>) {
    loop {
        let mut byte = (value & 0x7F) as u8;
        value >>= 7;
        if value != 0 {
            // More payload follows: set the continuation bit.
            byte |= 0x80;
        }
        buf.push(byte);
        if value == 0 {
            break;
        }
    }
}

/// Decode a LEB128 varint from a byte slice. Returns `(value, bytes_consumed)`.
///
/// Returns `None` when the input ends before a terminating byte is seen, or
/// when the encoded value does not fit in a `u64`.
pub fn decode_varint(data: &[u8]) -> Option<(u64, usize)> {
    let mut value: u64 = 0;
    let mut shift: u32 = 0;
    for (i, &byte) in data.iter().enumerate() {
        if shift >= 64 {
            return None; // Overflow: more than ten bytes.
        }
        let payload = (byte & 0x7F) as u64;
        // Reject payload bits that would be shifted past bit 63. Only the
        // tenth byte (shift == 63) can trigger this; a plain `<<` would
        // silently discard those bits and return a wrong value.
        if payload > (u64::MAX >> shift) {
            return None;
        }
        value |= payload << shift;
        shift += 7;
        if byte & 0x80 == 0 {
            return Some((value, i + 1));
        }
    }
    None // Incomplete.
}

// ── Delta Encoding ───────────────────────────────────────────────

/// Delta-encode a sorted sequence of u64 values.
///
/// The first output element is the first ID; each later element is the
/// difference to its predecessor. Differences are computed with wrapping
/// subtraction, so unsorted input never panics and still round-trips through
/// [`delta_decode`].
pub fn delta_encode(sorted_ids: &[u64]) -> Vec<u64> {
    let mut deltas = Vec::with_capacity(sorted_ids.len());
    let mut prev = 0u64;
    for &id in sorted_ids {
        deltas.push(id.wrapping_sub(prev));
        prev = id;
    }
    deltas
}

/// Decode delta-encoded values back to absolute IDs.
///
/// Exact inverse of [`delta_encode`]. Sums are accumulated with wrapping
/// addition so that arbitrary input never panics.
pub fn delta_decode(deltas: &[u64]) -> Vec<u64> {
    let mut ids = Vec::with_capacity(deltas.len());
    let mut acc = 0u64;
    for &delta in deltas {
        acc = acc.wrapping_add(delta);
        ids.push(acc);
    }
    ids
}

// ── INDEX_SEG Encode ─────────────────────────────────────────────

/// Encode an INDEX_SEG payload.
///
/// Layout:
/// 1. Index header (padded to 64 bytes)
/// 2. Restart point index (padded to 64 bytes)
/// 3. Adjacency data with delta-encoded neighbor lists
///
/// Neighbor lists are sorted before encoding, so the original neighbor order
/// is not preserved. `node_id` values are not written.
pub fn encode_index_seg(data: &IndexSegData) -> Vec<u8> {
    let mut buf = Vec::new();

    // 1. Header (pad to 64 bytes).
    buf.push(data.header.index_type);
    buf.push(data.header.layer_level);
    buf.extend_from_slice(&data.header.m.to_le_bytes());
    buf.extend_from_slice(&data.header.ef_construction.to_le_bytes());
    buf.extend_from_slice(&data.header.node_count.to_le_bytes());
    pad_to_alignment(&mut buf, SECTION_ALIGNMENT);

    // 2. Encode adjacency data with restart points. This is built first
    //    because the restart index (written before it) needs its offsets.
    let restart_interval = data.restart_interval;
    let mut adj_buf = Vec::new();
    let mut restart_offsets: Vec<u32> = Vec::new();
    // Scratch buffer reused for every layer to avoid one allocation per list.
    let mut sorted: Vec<u64> = Vec::new();

    for (idx, node) in data.nodes.iter().enumerate() {
        let is_restart = is_restart_point(idx, restart_interval);
        if is_restart {
            // Offsets are u32 in the wire format.
            restart_offsets.push(adj_buf.len() as u32);
        }

        // Encode layer count.
        encode_varint(node.layers.len() as u64, &mut adj_buf);

        // Encode each layer's neighbors.
        for neighbors in &node.layers {
            encode_varint(neighbors.len() as u64, &mut adj_buf);

            sorted.clear();
            sorted.extend_from_slice(neighbors);
            sorted.sort_unstable();

            if is_restart {
                // At restart points, encode absolute IDs.
                for &nid in &sorted {
                    encode_varint(nid, &mut adj_buf);
                }
            } else {
                // Delta encode: first ID as-is, then gaps. The list is
                // sorted, so `nid >= prev` and the subtraction cannot wrap.
                let mut prev = 0u64;
                for &nid in &sorted {
                    encode_varint(nid - prev, &mut adj_buf);
                    prev = nid;
                }
            }
        }
    }

    // Write restart point index.
    buf.extend_from_slice(&restart_interval.to_le_bytes());
    let restart_count = restart_offsets.len() as u32;
    buf.extend_from_slice(&restart_count.to_le_bytes());
    for offset in &restart_offsets {
        buf.extend_from_slice(&offset.to_le_bytes());
    }
    pad_to_alignment(&mut buf, SECTION_ALIGNMENT);

    // Write adjacency data.
    buf.extend_from_slice(&adj_buf);
    pad_to_alignment(&mut buf, SECTION_ALIGNMENT);

    buf
}

// ── INDEX_SEG Decode ─────────────────────────────────────────────

/// Decode an INDEX_SEG payload.
///
/// Each decoded node gets `node_id` equal to its position in the segment.
/// The restart offset table is bounds-checked but otherwise skipped, because
/// decoding is sequential.
///
/// # Errors
///
/// * [`CodecError::TooShort`] if the header, the restart index, or the
///   padding that follows it is truncated.
/// * [`CodecError::InvalidVarint`] if a varint in the adjacency data is
///   malformed or missing, or if accumulating deltas overflows `u64`.
pub fn decode_index_seg(data: &[u8]) -> Result<IndexSegData, CodecError> {
    if data.len() < SECTION_ALIGNMENT {
        return Err(CodecError::TooShort);
    }

    // 1. Parse header.
    let header = IndexSegHeader {
        index_type: data[0],
        layer_level: data[1],
        m: u16::from_le_bytes([data[2], data[3]]),
        ef_construction: u32::from_le_bytes([data[4], data[5], data[6], data[7]]),
        node_count: u64::from_le_bytes([
            data[8], data[9], data[10], data[11], data[12], data[13], data[14], data[15],
        ]),
    };

    // Skip header padding.
    let mut pos = SECTION_ALIGNMENT;

    // 2. Parse restart point index.
    let restart_interval = read_u32_le(data, pos).ok_or(CodecError::TooShort)?;
    let restart_count = read_u32_le(data, pos + 4).ok_or(CodecError::TooShort)?;
    pos += 8;

    // The count is untrusted: verify the table fits in the input instead of
    // allocating `restart_count` entries up front.
    let table_end = (restart_count as usize)
        .checked_mul(4)
        .and_then(|table_len| pos.checked_add(table_len))
        .ok_or(CodecError::TooShort)?;
    if table_end > data.len() {
        return Err(CodecError::TooShort);
    }

    // Skip padding to 64-byte alignment.
    pos = align_up(table_end, SECTION_ALIGNMENT);

    // 3. Parse adjacency data. `get` rejects input whose padding was cut off.
    let adj_data = data.get(pos..).ok_or(CodecError::TooShort)?;
    let mut nodes = Vec::new();
    let mut adj_pos = 0usize;

    for node_idx in 0..header.node_count as usize {
        let is_restart = is_restart_point(node_idx, restart_interval);

        // Decode layer count.
        let layer_count = read_varint(adj_data, &mut adj_pos)?;
        // Every element occupies at least one byte, so the bytes left are an
        // upper bound on how many can really follow. Capping the capacity
        // keeps a hostile count from forcing a huge allocation.
        let mut layers = Vec::with_capacity(capped_len(layer_count, adj_data.len() - adj_pos));

        for _ in 0..layer_count {
            let neighbor_count = read_varint(adj_data, &mut adj_pos)?;
            let mut neighbor_ids =
                Vec::with_capacity(capped_len(neighbor_count, adj_data.len() - adj_pos));

            let mut prev = 0u64;
            for _ in 0..neighbor_count {
                let raw = read_varint(adj_data, &mut adj_pos)?;
                let id = if is_restart {
                    // Absolute IDs at restart points.
                    raw
                } else {
                    // Delta-encoded IDs. Valid encoder output never
                    // overflows here, so an overflow means corrupt input.
                    prev.checked_add(raw).ok_or(CodecError::InvalidVarint)?
                };
                prev = id;
                neighbor_ids.push(id);
            }

            layers.push(neighbor_ids);
        }

        nodes.push(NodeAdjacency {
            node_id: node_idx as u64,
            layers,
        });
    }

    Ok(IndexSegData {
        header,
        restart_interval,
        nodes,
    })
}

/// Errors that can occur during INDEX_SEG codec operations.
#[derive(Clone, Debug, PartialEq)]
pub enum CodecError {
    /// Input data is shorter than expected.
    TooShort,
    /// Invalid varint encountered.
    InvalidVarint,
    /// Unknown index type.
    ///
    /// Not returned by the functions visible in this file.
    UnknownIndexType(u8),
}

impl core::fmt::Display for CodecError {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        match self {
            Self::TooShort => write!(f, "input data too short"),
            Self::InvalidVarint => write!(f, "invalid varint encoding"),
            Self::UnknownIndexType(t) => write!(f, "unknown index type: {}", t),
        }
    }
}

// ── Helpers ──────────────────────────────────────────────────────

/// Whether the node at `node_idx` stores absolute (non-delta) neighbor IDs.
///
/// The index is truncated to `u32` to match the width of the stored
/// interval. An interval of 0 makes only node 0 a restart point. Encoder and
/// decoder must agree on this rule, so both call this helper.
fn is_restart_point(node_idx: usize, restart_interval: u32) -> bool {
    let idx = node_idx as u32;
    if restart_interval == 0 {
        idx == 0
    } else {
        idx % restart_interval == 0
    }
}

/// Read a little-endian `u32` at `pos`, or `None` if it would run past the end.
fn read_u32_le(data: &[u8], pos: usize) -> Option<u32> {
    let end = pos.checked_add(4)?;
    let bytes = data.get(pos..end)?;
    Some(u32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]))
}

/// Decode one varint at `*pos` and advance `*pos` past it.
fn read_varint(data: &[u8], pos: &mut usize) -> Result<u64, CodecError> {
    let rest = data.get(*pos..).ok_or(CodecError::InvalidVarint)?;
    let (value, consumed) = decode_varint(rest).ok_or(CodecError::InvalidVarint)?;
    *pos += consumed;
    Ok(value)
}

/// Clamp an untrusted element count to the number of bytes still available.
fn capped_len(count: u64, remaining_bytes: usize) -> usize {
    count.min(remaining_bytes as u64) as usize
}

/// Pad `buf` with zeros to the next multiple of `alignment`.
fn pad_to_alignment(buf: &mut Vec<u8>, alignment: usize) {
    let padded_len = align_up(buf.len(), alignment);
    buf.resize(padded_len, 0);
}

/// Round `offset` up to the next multiple of `alignment`.
///
/// `alignment` must be non-zero.
fn align_up(offset: usize, alignment: usize) -> usize {
    let rem = offset % alignment;
    if rem == 0 {
        offset
    } else {
        offset + (alignment - rem)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn varint_round_trip() {
        let values = [0, 1, 127, 128, 16383, 16384, 2097151, u64::MAX];
        for &val in &values {
            let mut buf = Vec::new();
            encode_varint(val, &mut buf);
            let (decoded, consumed) = decode_varint(&buf).unwrap();
            assert_eq!(decoded, val);
            assert_eq!(consumed, buf.len());
        }
    }

    #[test]
    fn varint_encoding_sizes() {
        let mut buf = Vec::new();
        encode_varint(0, &mut buf);
        assert_eq!(buf.len(), 1);

        buf.clear();
        encode_varint(127, &mut buf);
        assert_eq!(buf.len(), 1);

        buf.clear();
        encode_varint(128, &mut buf);
        assert_eq!(buf.len(), 2);

        buf.clear();
        encode_varint(16383, &mut buf);
        assert_eq!(buf.len(), 2);

        buf.clear();
        encode_varint(16384, &mut buf);
        assert_eq!(buf.len(), 3);
    }

    #[test]
    fn varint_rejects_value_wider_than_u64() {
        // Nine continuation bytes carry 63 bits; a tenth byte of 0x02 would
        // need bit 64.
        let mut bytes = vec![0xFFu8; 9];
        bytes.push(0x02);
        assert_eq!(decode_varint(&bytes), None);
    }

    #[test]
    fn delta_encode_decode_round_trip() {
        let ids = vec![100, 105, 108, 120, 200];
        let deltas = delta_encode(&ids);
        assert_eq!(deltas, vec![100, 5, 3, 12, 80]);
        let decoded = delta_decode(&deltas);
        assert_eq!(decoded, ids);
    }

    #[test]
    fn delta_encode_empty() {
        assert!(delta_encode(&[]).is_empty());
        assert!(delta_decode(&[]).is_empty());
    }

    #[test]
    fn index_seg_round_trip() {
        let data = IndexSegData {
            header: IndexSegHeader {
                index_type: 0,  // HNSW
                layer_level: 2, // Layer C
                m: 16,
                ef_construction: 200,
                node_count: 5,
            },
            restart_interval: 3,
            nodes: vec![
                NodeAdjacency {
                    node_id: 0,
                    layers: vec![vec![1, 2, 3], vec![1]],
                },
                NodeAdjacency {
                    node_id: 1,
                    layers: vec![vec![0, 2, 4]],
                },
                NodeAdjacency {
                    node_id: 2,
                    layers: vec![vec![0, 1, 3, 4]],
                },
                NodeAdjacency {
                    node_id: 3,
                    layers: vec![vec![0, 2, 4], vec![4]],
                },
                NodeAdjacency {
                    node_id: 4,
                    layers: vec![vec![1, 2, 3]],
                },
            ],
        };

        let encoded = encode_index_seg(&data);
        let decoded = decode_index_seg(&encoded).unwrap();

        assert_eq!(decoded.header, data.header);
        assert_eq!(decoded.restart_interval, data.restart_interval);
        assert_eq!(decoded.nodes.len(), data.nodes.len());

        // Verify each node's adjacency. Note: neighbors are sorted during encoding.
        for (orig, dec) in data.nodes.iter().zip(decoded.nodes.iter()) {
            assert_eq!(dec.node_id, orig.node_id);
            assert_eq!(dec.layers.len(), orig.layers.len());
            for (ol, dl) in orig.layers.iter().zip(dec.layers.iter()) {
                let mut sorted_orig = ol.clone();
                sorted_orig.sort();
                assert_eq!(*dl, sorted_orig);
            }
        }
    }

    #[test]
    fn index_seg_larger_with_restart() {
        // Test with enough nodes to exercise multiple restart groups.
        let num_nodes = 200;
        let restart_interval = 64;
        let nodes: Vec<NodeAdjacency> = (0..num_nodes)
            .map(|i| {
                let neighbors: Vec<u64> =
                    (0..8).map(|j| ((i + j + 1) % num_nodes) as u64).collect();
                NodeAdjacency {
                    node_id: i as u64,
                    layers: vec![neighbors],
                }
            })
            .collect();

        let data = IndexSegData {
            header: IndexSegHeader {
                index_type: 0,
                layer_level: 2,
                m: 16,
                ef_construction: 200,
                node_count: num_nodes as u64,
            },
            restart_interval,
            nodes,
        };

        let encoded = encode_index_seg(&data);
        let decoded = decode_index_seg(&encoded).unwrap();

        assert_eq!(decoded.header, data.header);
        assert_eq!(decoded.nodes.len(), data.nodes.len());
        for (orig, dec) in data.nodes.iter().zip(decoded.nodes.iter()) {
            assert_eq!(dec.layers.len(), orig.layers.len());
            for (ol, dl) in orig.layers.iter().zip(dec.layers.iter()) {
                let mut sorted_orig = ol.clone();
                sorted_orig.sort();
                assert_eq!(*dl, sorted_orig);
            }
        }
    }

    #[test]
    fn index_seg_truncated_padding_is_an_error() {
        // Header plus an empty restart index, but no padding after it.
        let buf = vec![0u8; 72];
        assert_eq!(decode_index_seg(&buf), Err(CodecError::TooShort));
    }

    #[test]
    fn delta_encoding_sorted_u64_sequences() {
        // Verify exact round-trip for various sorted u64 sequences.
        let sequences: Vec<Vec<u64>> = vec![
            vec![0, 1, 2, 3, 4],
            vec![1000, 2000, 3000, 4000],
            vec![0, 100, 200, 300, 400, 500],
            vec![
                u64::MAX - 4,
                u64::MAX - 3,
                u64::MAX - 2,
                u64::MAX - 1,
                u64::MAX,
            ],
        ];

        for seq in sequences {
            let deltas = delta_encode(&seq);
            let decoded = delta_decode(&deltas);
            assert_eq!(decoded, seq, "Failed for sequence: {:?}", seq);
        }
    }
}