Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'
This commit is contained in:
503
vendor/ruvector/crates/rvf/rvf-index/src/codec.rs
vendored
Normal file
503
vendor/ruvector/crates/rvf/rvf-index/src/codec.rs
vendored
Normal file
@@ -0,0 +1,503 @@
|
||||
//! INDEX_SEG encode/decode: varint delta encoding with restart points.
|
||||
//!
|
||||
//! Implements the binary layout from the RVF wire spec for INDEX_SEG payloads.
|
||||
|
||||
extern crate alloc;
|
||||
|
||||
use alloc::vec::Vec;
|
||||
|
||||
/// Default restart interval for varint delta encoding.
|
||||
pub const DEFAULT_RESTART_INTERVAL: u32 = 64;
|
||||
|
||||
/// Fixed-size header that opens every INDEX_SEG payload (64-byte aligned).
///
/// The fields are listed in the order the codec serializes them.
#[derive(Debug, Clone, PartialEq)]
pub struct IndexSegHeader {
    /// Kind of index stored in the segment: 0 = HNSW, 1 = IVF, 2 = flat.
    pub index_type: u8,
    /// Layer level of the segment: 0 = A, 1 = B, 2 = C.
    pub layer_level: u8,
    /// HNSW `M` parameter: maximum number of neighbors per layer.
    pub m: u16,
    /// HNSW `ef_construction` parameter.
    pub ef_construction: u32,
    /// Number of nodes stored in this segment.
    pub node_count: u64,
}
|
||||
|
||||
/// Adjacency lists of a single graph node.
#[derive(Debug, Clone, PartialEq)]
pub struct NodeAdjacency {
    /// Identifier of the node these lists belong to.
    pub node_id: u64,
    /// Neighbor IDs, one list per HNSW layer (`layers[0]` is layer 0).
    pub layers: Vec<Vec<u64>>,
}
|
||||
|
||||
/// Full decoded index segment data.
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
pub struct IndexSegData {
|
||||
pub header: IndexSegHeader,
|
||||
pub restart_interval: u32,
|
||||
pub nodes: Vec<NodeAdjacency>,
|
||||
}
|
||||
|
||||
// ── Varint Encoding (LEB128) ─────────────────────────────────────
|
||||
|
||||
/// Append `value` to `buf` as an unsigned LEB128 varint.
///
/// The value is emitted least-significant group first, seven bits per byte.
/// Every byte except the last has its high bit (`0x80`) set. Existing
/// contents of `buf` are left untouched.
pub fn encode_varint(mut value: u64, buf: &mut Vec<u8>) {
    // Groups that still have data above them carry the continuation bit.
    while value >= 0x80 {
        buf.push((value as u8 & 0x7F) | 0x80);
        value >>= 7;
    }
    // Final group: fits in seven bits, continuation bit clear.
    buf.push(value as u8);
}
|
||||
|
||||
/// Decode an unsigned LEB128 varint from the start of `data`.
///
/// Returns `Some((value, bytes_consumed))` on success. Bytes after the
/// terminating byte are ignored.
///
/// Returns `None` when:
/// * `data` ends before a byte with a clear continuation bit is seen
///   (this includes empty input), or
/// * the encoding does not fit in a `u64`: more than ten bytes, or a tenth
///   byte carrying payload bits above bit 63.
pub fn decode_varint(data: &[u8]) -> Option<(u64, usize)> {
    let mut value: u64 = 0;
    let mut shift: u32 = 0;
    for (i, &byte) in data.iter().enumerate() {
        if shift >= 64 {
            return None; // More than ten groups cannot fit in 64 bits.
        }
        let payload = u64::from(byte & 0x7F);
        // At shift 63 only the lowest payload bit lands inside the u64; any
        // higher bit would be shifted out silently, so reject it instead of
        // returning a truncated value.
        if shift == 63 && payload > 1 {
            return None;
        }
        value |= payload << shift;
        if byte & 0x80 == 0 {
            return Some((value, i + 1));
        }
        shift += 7;
    }
    None // Ran out of input before the terminating byte.
}
|
||||
|
||||
// ── Delta Encoding ───────────────────────────────────────────────
|
||||
|
||||
/// Turn an ascending sequence of IDs into "first value, then gaps" form.
///
/// The first output element is `sorted_ids[0]`; every following element is
/// the difference between an ID and its predecessor. Empty input yields an
/// empty vector. The input is expected to be sorted ascending, because the
/// gaps are computed with plain unsigned subtraction.
pub fn delta_encode(sorted_ids: &[u64]) -> Vec<u64> {
    let head = sorted_ids.first().copied();
    let gaps = sorted_ids.windows(2).map(|pair| pair[1] - pair[0]);
    head.into_iter().chain(gaps).collect()
}
|
||||
|
||||
/// Rebuild absolute IDs from a "first value, then gaps" sequence.
///
/// The first output element is `deltas[0]`; every following element is the
/// running sum of the deltas seen so far. Empty input yields an empty vector.
///
/// The running sum uses wrapping arithmetic. Deltas may come straight from an
/// untrusted payload, and a sum that exceeds `u64::MAX` must not panic in
/// builds with overflow checks while silently wrapping in builds without
/// them. Wrapping gives the same result in every build profile.
pub fn delta_decode(deltas: &[u64]) -> Vec<u64> {
    let mut running: u64 = 0;
    deltas
        .iter()
        .map(|&delta| {
            running = running.wrapping_add(delta);
            running
        })
        .collect()
}
|
||||
|
||||
// ── INDEX_SEG Encode ─────────────────────────────────────────────
|
||||
|
||||
/// Encode an INDEX_SEG payload.
|
||||
///
|
||||
/// Layout:
|
||||
/// 1. Index header (padded to 64 bytes)
|
||||
/// 2. Restart point index (padded to 64 bytes)
|
||||
/// 3. Adjacency data with delta-encoded neighbor lists
|
||||
pub fn encode_index_seg(data: &IndexSegData) -> Vec<u8> {
|
||||
let mut buf = Vec::new();
|
||||
|
||||
// 1. Header (pad to 64 bytes).
|
||||
buf.push(data.header.index_type);
|
||||
buf.push(data.header.layer_level);
|
||||
buf.extend_from_slice(&data.header.m.to_le_bytes());
|
||||
buf.extend_from_slice(&data.header.ef_construction.to_le_bytes());
|
||||
buf.extend_from_slice(&data.header.node_count.to_le_bytes());
|
||||
pad_to_alignment(&mut buf, 64);
|
||||
|
||||
// 2. Encode adjacency data with restart points.
|
||||
let restart_interval = data.restart_interval;
|
||||
let mut adj_buf = Vec::new();
|
||||
let mut restart_offsets: Vec<u32> = Vec::new();
|
||||
|
||||
for (idx, node) in data.nodes.iter().enumerate() {
|
||||
if (idx as u32).is_multiple_of(restart_interval) {
|
||||
restart_offsets.push(adj_buf.len() as u32);
|
||||
}
|
||||
|
||||
// Encode layer count.
|
||||
encode_varint(node.layers.len() as u64, &mut adj_buf);
|
||||
|
||||
// Encode each layer's neighbors.
|
||||
for neighbors in &node.layers {
|
||||
encode_varint(neighbors.len() as u64, &mut adj_buf);
|
||||
// Delta-encode sorted neighbor IDs.
|
||||
let mut sorted = neighbors.clone();
|
||||
sorted.sort();
|
||||
|
||||
let is_restart = (idx as u32).is_multiple_of(restart_interval);
|
||||
if is_restart {
|
||||
// At restart points, encode absolute IDs.
|
||||
for &nid in &sorted {
|
||||
encode_varint(nid, &mut adj_buf);
|
||||
}
|
||||
} else {
|
||||
// Delta encode.
|
||||
let deltas = delta_encode(&sorted);
|
||||
for &d in &deltas {
|
||||
encode_varint(d, &mut adj_buf);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Write restart point index.
|
||||
buf.extend_from_slice(&restart_interval.to_le_bytes());
|
||||
let restart_count = restart_offsets.len() as u32;
|
||||
buf.extend_from_slice(&restart_count.to_le_bytes());
|
||||
for offset in &restart_offsets {
|
||||
buf.extend_from_slice(&offset.to_le_bytes());
|
||||
}
|
||||
pad_to_alignment(&mut buf, 64);
|
||||
|
||||
// Write adjacency data.
|
||||
buf.extend_from_slice(&adj_buf);
|
||||
pad_to_alignment(&mut buf, 64);
|
||||
|
||||
buf
|
||||
}
|
||||
|
||||
/// Decode an INDEX_SEG payload.
|
||||
pub fn decode_index_seg(data: &[u8]) -> Result<IndexSegData, CodecError> {
|
||||
if data.len() < 64 {
|
||||
return Err(CodecError::TooShort);
|
||||
}
|
||||
|
||||
// 1. Parse header.
|
||||
let index_type = data[0];
|
||||
let layer_level = data[1];
|
||||
let m = u16::from_le_bytes([data[2], data[3]]);
|
||||
let ef_construction = u32::from_le_bytes([data[4], data[5], data[6], data[7]]);
|
||||
let node_count = u64::from_le_bytes([
|
||||
data[8], data[9], data[10], data[11], data[12], data[13], data[14], data[15],
|
||||
]);
|
||||
|
||||
let header = IndexSegHeader {
|
||||
index_type,
|
||||
layer_level,
|
||||
m,
|
||||
ef_construction,
|
||||
node_count,
|
||||
};
|
||||
|
||||
// Skip header padding.
|
||||
let mut pos = 64;
|
||||
|
||||
// 2. Parse restart point index.
|
||||
if pos + 8 > data.len() {
|
||||
return Err(CodecError::TooShort);
|
||||
}
|
||||
let restart_interval =
|
||||
u32::from_le_bytes([data[pos], data[pos + 1], data[pos + 2], data[pos + 3]]);
|
||||
pos += 4;
|
||||
let restart_count =
|
||||
u32::from_le_bytes([data[pos], data[pos + 1], data[pos + 2], data[pos + 3]]);
|
||||
pos += 4;
|
||||
|
||||
let mut restart_offsets = Vec::with_capacity(restart_count as usize);
|
||||
for _ in 0..restart_count {
|
||||
if pos + 4 > data.len() {
|
||||
return Err(CodecError::TooShort);
|
||||
}
|
||||
let offset = u32::from_le_bytes([data[pos], data[pos + 1], data[pos + 2], data[pos + 3]]);
|
||||
restart_offsets.push(offset);
|
||||
pos += 4;
|
||||
}
|
||||
|
||||
// Skip padding to 64-byte alignment.
|
||||
pos = align_up(pos, 64);
|
||||
|
||||
// 3. Parse adjacency data.
|
||||
let adj_start = pos;
|
||||
let adj_data = &data[adj_start..];
|
||||
|
||||
let mut nodes = Vec::new();
|
||||
let mut adj_pos = 0;
|
||||
|
||||
for node_idx in 0..node_count as usize {
|
||||
let is_restart = (node_idx as u32).is_multiple_of(restart_interval);
|
||||
|
||||
// Decode layer count.
|
||||
let (layer_count, consumed) =
|
||||
decode_varint(&adj_data[adj_pos..]).ok_or(CodecError::InvalidVarint)?;
|
||||
adj_pos += consumed;
|
||||
|
||||
let mut layers = Vec::with_capacity(layer_count as usize);
|
||||
|
||||
for _ in 0..layer_count {
|
||||
let (neighbor_count, consumed) =
|
||||
decode_varint(&adj_data[adj_pos..]).ok_or(CodecError::InvalidVarint)?;
|
||||
adj_pos += consumed;
|
||||
|
||||
let mut neighbor_ids = Vec::with_capacity(neighbor_count as usize);
|
||||
|
||||
if is_restart {
|
||||
// Absolute IDs at restart points.
|
||||
for _ in 0..neighbor_count {
|
||||
let (nid, consumed) =
|
||||
decode_varint(&adj_data[adj_pos..]).ok_or(CodecError::InvalidVarint)?;
|
||||
adj_pos += consumed;
|
||||
neighbor_ids.push(nid);
|
||||
}
|
||||
} else {
|
||||
// Delta-encoded IDs.
|
||||
let mut deltas = Vec::with_capacity(neighbor_count as usize);
|
||||
for _ in 0..neighbor_count {
|
||||
let (d, consumed) =
|
||||
decode_varint(&adj_data[adj_pos..]).ok_or(CodecError::InvalidVarint)?;
|
||||
adj_pos += consumed;
|
||||
deltas.push(d);
|
||||
}
|
||||
neighbor_ids = delta_decode(&deltas);
|
||||
}
|
||||
|
||||
layers.push(neighbor_ids);
|
||||
}
|
||||
|
||||
nodes.push(NodeAdjacency {
|
||||
node_id: node_idx as u64,
|
||||
layers,
|
||||
});
|
||||
}
|
||||
|
||||
Ok(IndexSegData {
|
||||
header,
|
||||
restart_interval,
|
||||
nodes,
|
||||
})
|
||||
}
|
||||
|
||||
/// Failure modes of the INDEX_SEG codec.
#[derive(Debug, Clone, PartialEq)]
pub enum CodecError {
    /// The input ended before a required section could be read.
    TooShort,
    /// A varint in the payload could not be decoded.
    InvalidVarint,
    /// The index type byte is not a known index type.
    UnknownIndexType(u8),
}
|
||||
|
||||
impl core::fmt::Display for CodecError {
|
||||
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
|
||||
match self {
|
||||
Self::TooShort => write!(f, "input data too short"),
|
||||
Self::InvalidVarint => write!(f, "invalid varint encoding"),
|
||||
Self::UnknownIndexType(t) => write!(f, "unknown index type: {}", t),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ── Helpers ──────────────────────────────────────────────────────
|
||||
|
||||
/// Append zero bytes to `buf` until its length is a multiple of `alignment`.
///
/// A buffer that is already aligned (including an empty one) is left as is.
/// `alignment` must be non-zero; the remainder computation panics otherwise.
fn pad_to_alignment(buf: &mut Vec<u8>, alignment: usize) {
    match buf.len() % alignment {
        0 => {}
        overshoot => {
            let padded_len = buf.len() + (alignment - overshoot);
            buf.resize(padded_len, 0);
        }
    }
}
|
||||
|
||||
/// Return the smallest multiple of `alignment` that is not below `offset`.
///
/// An `offset` that is already a multiple is returned unchanged.
/// `alignment` must be non-zero; the remainder computation panics otherwise.
fn align_up(offset: usize, alignment: usize) -> usize {
    // Distance to the next boundary; the outer `%` maps "exactly one full
    // alignment" back to zero for offsets that are already aligned.
    let padding = (alignment - offset % alignment) % alignment;
    offset + padding
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Encode `value` into a fresh buffer and return the bytes.
    fn varint_bytes(value: u64) -> Vec<u8> {
        let mut bytes = Vec::new();
        encode_varint(value, &mut bytes);
        bytes
    }

    /// Return an ascending copy of `ids`, matching what the encoder writes.
    fn sorted_copy(ids: &[u64]) -> Vec<u64> {
        let mut sorted = ids.to_vec();
        sorted.sort();
        sorted
    }

    /// Assert that `decoded` has the same layers as `original`, with each
    /// neighbor list sorted (the encoder sorts before writing).
    fn assert_layers_survive(original: &NodeAdjacency, decoded: &NodeAdjacency) {
        assert_eq!(decoded.layers.len(), original.layers.len());
        for (expected, actual) in original.layers.iter().zip(decoded.layers.iter()) {
            assert_eq!(*actual, sorted_copy(expected));
        }
    }

    /// Header shared by the segment round-trip tests: HNSW, layer C.
    fn hnsw_layer_c_header(node_count: u64) -> IndexSegHeader {
        IndexSegHeader {
            index_type: 0,
            layer_level: 2,
            m: 16,
            ef_construction: 200,
            node_count,
        }
    }

    #[test]
    fn varint_round_trip() {
        for &expected in &[0, 1, 127, 128, 16383, 16384, 2097151, u64::MAX] {
            let bytes = varint_bytes(expected);
            let (actual, used) = decode_varint(&bytes).unwrap();
            assert_eq!(actual, expected);
            assert_eq!(used, bytes.len());
        }
    }

    #[test]
    fn varint_encoding_sizes() {
        // (value, expected encoded length) on both sides of each 7-bit boundary.
        let cases: [(u64, usize); 5] = [(0, 1), (127, 1), (128, 2), (16383, 2), (16384, 3)];
        for &(value, expected_len) in &cases {
            assert_eq!(varint_bytes(value).len(), expected_len);
        }
    }

    #[test]
    fn delta_encode_decode_round_trip() {
        let ids: Vec<u64> = vec![100, 105, 108, 120, 200];
        let deltas = delta_encode(&ids);
        assert_eq!(deltas, vec![100, 5, 3, 12, 80]);
        assert_eq!(delta_decode(&deltas), ids);
    }

    #[test]
    fn delta_encode_empty() {
        assert!(delta_encode(&[]).is_empty());
        assert!(delta_decode(&[]).is_empty());
    }

    #[test]
    fn index_seg_round_trip() {
        // One entry per node; each entry lists that node's layers.
        let adjacency: Vec<Vec<Vec<u64>>> = vec![
            vec![vec![1, 2, 3], vec![1]],
            vec![vec![0, 2, 4]],
            vec![vec![0, 1, 3, 4]],
            vec![vec![0, 2, 4], vec![4]],
            vec![vec![1, 2, 3]],
        ];
        let nodes: Vec<NodeAdjacency> = adjacency
            .into_iter()
            .enumerate()
            .map(|(id, layers)| NodeAdjacency {
                node_id: id as u64,
                layers,
            })
            .collect();

        let data = IndexSegData {
            header: hnsw_layer_c_header(5),
            restart_interval: 3,
            nodes,
        };

        let decoded = decode_index_seg(&encode_index_seg(&data)).unwrap();

        assert_eq!(decoded.header, data.header);
        assert_eq!(decoded.restart_interval, data.restart_interval);
        assert_eq!(decoded.nodes.len(), data.nodes.len());

        for (original, restored) in data.nodes.iter().zip(decoded.nodes.iter()) {
            assert_eq!(restored.node_id, original.node_id);
            assert_layers_survive(original, restored);
        }
    }

    #[test]
    fn index_seg_larger_with_restart() {
        // Enough nodes to span several restart groups.
        let num_nodes: u64 = 200;
        let restart_interval = 64;

        let nodes: Vec<NodeAdjacency> = (0..num_nodes)
            .map(|i| NodeAdjacency {
                node_id: i,
                layers: vec![(0..8).map(|j| (i + j + 1) % num_nodes).collect()],
            })
            .collect();

        let data = IndexSegData {
            header: hnsw_layer_c_header(num_nodes),
            restart_interval,
            nodes,
        };

        let decoded = decode_index_seg(&encode_index_seg(&data)).unwrap();

        assert_eq!(decoded.header, data.header);
        assert_eq!(decoded.nodes.len(), data.nodes.len());

        for (original, restored) in data.nodes.iter().zip(decoded.nodes.iter()) {
            assert_layers_survive(original, restored);
        }
    }

    #[test]
    fn delta_encoding_sorted_u64_sequences() {
        // Exact round trip for several sorted sequences, including one that
        // ends at u64::MAX.
        let top_of_range: Vec<u64> = (0..5u64).rev().map(|back| u64::MAX - back).collect();
        let sequences: Vec<Vec<u64>> = vec![
            vec![0, 1, 2, 3, 4],
            vec![1000, 2000, 3000, 4000],
            vec![0, 100, 200, 300, 400, 500],
            top_of_range,
        ];

        for seq in sequences {
            let decoded = delta_decode(&delta_encode(&seq));
            assert_eq!(decoded, seq, "Failed for sequence: {:?}", seq);
        }
    }
}
|
||||
Reference in New Issue
Block a user