Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'

This commit is contained in:
ruv
2026-02-28 14:39:40 -05:00
7854 changed files with 3522914 additions and 0 deletions

View File

@@ -0,0 +1,503 @@
//! INDEX_SEG encode/decode: varint delta encoding with restart points.
//!
//! Implements the binary layout from the RVF wire spec for INDEX_SEG payloads.
extern crate alloc;
use alloc::vec::Vec;
/// Default restart interval for varint delta encoding.
///
/// NOTE(review): nothing visible in this file reads this constant; presumably callers use it to
/// populate `IndexSegData::restart_interval` — confirm against call sites.
pub const DEFAULT_RESTART_INTERVAL: u32 = 64;
/// Fixed-size fields at the start of an INDEX_SEG payload.
///
/// On the wire the fields are written in declaration order (multi-byte integers little-endian,
/// 16 bytes in total) and then zero-padded to a 64-byte boundary.
#[derive(Clone, Debug, PartialEq)]
pub struct IndexSegHeader {
    /// Index kind: 0 = HNSW, 1 = IVF, 2 = flat. Stored as a single byte at offset 0.
    pub index_type: u8,
    /// Layer level: 0 = A, 1 = B, 2 = C. Stored as a single byte at offset 1.
    pub layer_level: u8,
    /// HNSW maximum neighbors per layer (bytes 2..4, little-endian).
    pub m: u16,
    /// `ef_construction` build parameter (bytes 4..8, little-endian).
    pub ef_construction: u32,
    /// Number of nodes in this segment (bytes 8..16, little-endian).
    ///
    /// The decoder reads exactly this many adjacency records from the payload.
    pub node_count: u64,
}
/// Adjacency lists of one graph node, grouped by HNSW layer.
#[derive(Clone, Debug, PartialEq)]
pub struct NodeAdjacency {
    /// Identifier of the node.
    ///
    /// The encoder in this file does not write this value to the payload; on decode it is set
    /// to the node's position in the segment.
    pub node_id: u64,
    /// Neighbor IDs for each HNSW layer; element 0 is layer 0.
    pub layers: Vec<Vec<u64>>,
}
/// In-memory form of a complete INDEX_SEG payload.
#[derive(Clone, Debug, PartialEq)]
pub struct IndexSegData {
    /// Header fields written at the start of the payload.
    pub header: IndexSegHeader,
    /// Restart spacing: nodes whose position is a multiple of this value have their neighbor
    /// IDs stored as absolute values instead of deltas.
    pub restart_interval: u32,
    /// Per-node adjacency, in the order the nodes are encoded.
    pub nodes: Vec<NodeAdjacency>,
}
// ── Varint Encoding (LEB128) ─────────────────────────────────────
/// Append `value` to `buf` as an unsigned LEB128 varint.
///
/// Seven payload bits are emitted per byte, least-significant group first. Every byte except
/// the last has its high bit (`0x80`) set. Zero is encoded as the single byte `0x00`.
pub fn encode_varint(mut value: u64, buf: &mut Vec<u8>) {
    // Emit continuation bytes while more than seven bits remain.
    while value >= 0x80 {
        buf.push(((value as u8) & 0x7F) | 0x80);
        value >>= 7;
    }
    // The final group fits in seven bits, so the continuation flag stays clear.
    buf.push(value as u8);
}
/// Decode an unsigned LEB128 varint from the start of `data`.
///
/// Returns `Some((value, bytes_consumed))` on success. Bytes after the terminating byte are
/// ignored, so the caller can advance by `bytes_consumed` and continue decoding.
///
/// Returns `None` when:
/// - `data` is empty or ends before a byte without the continuation bit is seen,
/// - the encoding is longer than 10 bytes, or
/// - the 10th byte carries payload bits that do not fit in a `u64` (only its lowest bit does).
pub fn decode_varint(data: &[u8]) -> Option<(u64, usize)> {
    /// A `u64` needs at most ceil(64 / 7) = 10 LEB128 bytes.
    const MAX_VARINT_LEN: usize = 10;

    let mut value: u64 = 0;
    for (i, &byte) in data.iter().take(MAX_VARINT_LEN).enumerate() {
        let payload = u64::from(byte & 0x7F);
        // The last permitted byte is shifted by 63, so only bit 0 is representable. Anything
        // larger would be silently truncated by the shift, so reject it as an overflow.
        if i == MAX_VARINT_LEN - 1 && payload > 1 {
            return None;
        }
        value |= payload << (7 * i as u32);
        if byte & 0x80 == 0 {
            return Some((value, i + 1));
        }
    }
    // Either the input ran out or ten bytes all had the continuation bit set.
    None
}
// ── Delta Encoding ───────────────────────────────────────────────
/// Convert an ascending sequence of IDs into delta form.
///
/// The first output element is the first ID unchanged; each later element is the difference
/// between an ID and its predecessor. An empty input yields an empty vector.
///
/// The input must be sorted in non-decreasing order: a descending pair makes the subtraction
/// underflow.
pub fn delta_encode(sorted_ids: &[u64]) -> Vec<u64> {
    let Some(&first) = sorted_ids.first() else {
        return Vec::new();
    };
    let mut out = Vec::with_capacity(sorted_ids.len());
    out.push(first);
    // Each adjacent pair contributes one gap.
    out.extend(sorted_ids.windows(2).map(|pair| pair[1] - pair[0]));
    out
}
/// Reconstruct absolute IDs from a delta-encoded sequence.
///
/// The first element is taken as an absolute ID; every later element is added to the running
/// total. An empty input yields an empty vector.
///
/// Deltas may come from untrusted bytes, so the running sum uses wrapping (modulo 2^64)
/// arithmetic. A sum that exceeds `u64::MAX` wraps around instead of panicking in builds with
/// overflow checks enabled. Output for well-formed (non-overflowing) input is unaffected.
pub fn delta_decode(deltas: &[u64]) -> Vec<u64> {
    let mut running: u64 = 0;
    deltas
        .iter()
        .map(|&delta| {
            running = running.wrapping_add(delta);
            running
        })
        .collect()
}
// ── INDEX_SEG Encode ─────────────────────────────────────────────
/// Serialize `data` into an INDEX_SEG payload.
///
/// Output layout, each section zero-padded to a 64-byte boundary:
/// 1. Header: `index_type`, `layer_level`, `m`, `ef_construction`, `node_count`
///    (little-endian, written verbatim from `data.header`).
/// 2. Restart index: `restart_interval` (u32), restart count (u32), then one u32 byte offset
///    into the adjacency section per restart point.
/// 3. Adjacency data: per node, a varint layer count; per layer, a varint neighbor count
///    followed by the neighbor IDs as varints.
///
/// Neighbor IDs are sorted ascending before being written. Nodes whose position is a multiple
/// of `restart_interval` store absolute IDs; all other nodes store the delta form produced by
/// `delta_encode`. `NodeAdjacency::node_id` is not written.
pub fn encode_index_seg(data: &IndexSegData) -> Vec<u8> {
    let header = &data.header;
    let interval = data.restart_interval;

    // Build the adjacency stream first: the restart index needs its byte offsets.
    let mut adjacency: Vec<u8> = Vec::new();
    let mut restart_table: Vec<u32> = Vec::new();
    for (position, node) in data.nodes.iter().enumerate() {
        let at_restart = (position as u32).is_multiple_of(interval);
        if at_restart {
            restart_table.push(adjacency.len() as u32);
        }

        encode_varint(node.layers.len() as u64, &mut adjacency);
        for layer in &node.layers {
            encode_varint(layer.len() as u64, &mut adjacency);

            let mut ids = layer.clone();
            ids.sort();
            // Restart nodes keep absolute IDs; everything else is stored as gaps.
            let values = if at_restart { ids } else { delta_encode(&ids) };
            for value in values {
                encode_varint(value, &mut adjacency);
            }
        }
    }

    // Section 1: header.
    let mut out = Vec::new();
    out.push(header.index_type);
    out.push(header.layer_level);
    out.extend_from_slice(&header.m.to_le_bytes());
    out.extend_from_slice(&header.ef_construction.to_le_bytes());
    out.extend_from_slice(&header.node_count.to_le_bytes());
    pad_to_alignment(&mut out, 64);

    // Section 2: restart index.
    out.extend_from_slice(&interval.to_le_bytes());
    out.extend_from_slice(&(restart_table.len() as u32).to_le_bytes());
    out.extend(restart_table.iter().flat_map(|offset| offset.to_le_bytes()));
    pad_to_alignment(&mut out, 64);

    // Section 3: adjacency data.
    out.extend_from_slice(&adjacency);
    pad_to_alignment(&mut out, 64);

    out
}
/// Parse an INDEX_SEG payload into an [`IndexSegData`].
///
/// Expected layout: a 64-byte header block, a restart index (`restart_interval`, restart count
/// and that many u32 offsets) padded to a 64-byte boundary, then the varint adjacency stream.
///
/// The restart offsets are only checked to fit inside the input; their values are not used.
/// Each decoded node's `node_id` is its position in the stream. Nodes whose position is a
/// multiple of `restart_interval` are read as absolute IDs; all others are passed through
/// `delta_decode`.
///
/// The input is treated as untrusted: counts read from it are checked against the number of
/// bytes actually remaining before any buffer is sized from them, and every offset is
/// bounds-checked, so malformed input produces an error rather than a panic or an oversized
/// allocation.
///
/// # Errors
///
/// - [`CodecError::TooShort`] if the input is shorter than the 64-byte header block, the
///   restart index does not fit, or the aligned start of the adjacency data lies past the end
///   of the input.
/// - [`CodecError::InvalidVarint`] if a varint is malformed or truncated, or a layer/neighbor
///   count claims more elements than there are bytes left to hold them.
pub fn decode_index_seg(data: &[u8]) -> Result<IndexSegData, CodecError> {
    /// Size of the padded header block, and the alignment of every section.
    const SECTION_ALIGN: usize = 64;

    /// Reads a little-endian u32 at `at`, or fails with `TooShort` if it does not fit.
    fn read_u32_le(data: &[u8], at: usize) -> Result<u32, CodecError> {
        let end = at.checked_add(4).ok_or(CodecError::TooShort)?;
        let bytes = data.get(at..end).ok_or(CodecError::TooShort)?;
        Ok(u32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]))
    }

    /// Reads one varint at `*cursor` and advances the cursor past it.
    fn take_varint(buf: &[u8], cursor: &mut usize) -> Result<u64, CodecError> {
        let rest = buf.get(*cursor..).ok_or(CodecError::InvalidVarint)?;
        let (value, consumed) = decode_varint(rest).ok_or(CodecError::InvalidVarint)?;
        *cursor += consumed;
        Ok(value)
    }

    /// Reads an element count. Every element occupies at least one byte, so a count larger
    /// than the remaining input can never be satisfied and is rejected up front.
    fn take_count(buf: &[u8], cursor: &mut usize) -> Result<usize, CodecError> {
        let count = take_varint(buf, cursor)?;
        let remaining = buf.len() - *cursor;
        if count > remaining as u64 {
            return Err(CodecError::InvalidVarint);
        }
        Ok(count as usize)
    }

    if data.len() < SECTION_ALIGN {
        return Err(CodecError::TooShort);
    }

    // 1. Header: fixed little-endian fields in the first 16 bytes, rest is padding.
    let header = IndexSegHeader {
        index_type: data[0],
        layer_level: data[1],
        m: u16::from_le_bytes([data[2], data[3]]),
        ef_construction: u32::from_le_bytes([data[4], data[5], data[6], data[7]]),
        node_count: u64::from_le_bytes([
            data[8], data[9], data[10], data[11], data[12], data[13], data[14], data[15],
        ]),
    };

    // 2. Restart index: interval, count, then `count` u32 offsets (skipped, not stored).
    let index_start = SECTION_ALIGN;
    let restart_interval = read_u32_le(data, index_start)?;
    let restart_count = read_u32_le(data, index_start + 4)?;
    let table_bytes = (restart_count as usize)
        .checked_mul(4)
        .ok_or(CodecError::TooShort)?;
    let table_end = (index_start + 8)
        .checked_add(table_bytes)
        .ok_or(CodecError::TooShort)?;
    if table_end > data.len() {
        return Err(CodecError::TooShort);
    }

    // 3. Adjacency data starts at the next 64-byte boundary.
    let adj_start = align_up(table_end, SECTION_ALIGN);
    let adj_data = data.get(adj_start..).ok_or(CodecError::TooShort)?;

    let mut nodes = Vec::new();
    let mut adj_pos = 0usize;
    for node_idx in 0..header.node_count {
        // Same u32 truncation as the encoder, so both sides agree on restart positions.
        let is_restart = (node_idx as u32).is_multiple_of(restart_interval);

        let layer_count = take_count(adj_data, &mut adj_pos)?;
        let mut layers = Vec::with_capacity(layer_count);
        for _ in 0..layer_count {
            let neighbor_count = take_count(adj_data, &mut adj_pos)?;
            let mut raw = Vec::with_capacity(neighbor_count);
            for _ in 0..neighbor_count {
                raw.push(take_varint(adj_data, &mut adj_pos)?);
            }
            // Restart nodes store absolute IDs; the rest store deltas.
            layers.push(if is_restart { raw } else { delta_decode(&raw) });
        }

        nodes.push(NodeAdjacency {
            node_id: node_idx,
            layers,
        });
    }

    Ok(IndexSegData {
        header,
        restart_interval,
        nodes,
    })
}
/// Failure modes of the INDEX_SEG encoder/decoder.
#[derive(Clone, Debug, PartialEq)]
pub enum CodecError {
    /// The input ended before all expected bytes could be read.
    TooShort,
    /// A varint could not be decoded.
    InvalidVarint,
    /// The index type byte is not a recognised value; the payload is the offending byte.
    UnknownIndexType(u8),
}

impl core::fmt::Display for CodecError {
    /// Writes a short, lowercase, human-readable description of the error.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        match *self {
            CodecError::TooShort => f.write_str("input data too short"),
            CodecError::InvalidVarint => f.write_str("invalid varint encoding"),
            CodecError::UnknownIndexType(kind) => write!(f, "unknown index type: {}", kind),
        }
    }
}
// ── Helpers ──────────────────────────────────────────────────────
/// Append zero bytes to `buf` until its length is a multiple of `alignment`.
///
/// A buffer whose length is already aligned (including an empty one) is left unchanged.
/// `alignment` must be non-zero.
fn pad_to_alignment(buf: &mut Vec<u8>, alignment: usize) {
    let padded_len = buf.len().next_multiple_of(alignment);
    // `padded_len` is never smaller than the current length, so this only ever appends zeros.
    buf.resize(padded_len, 0);
}
/// Return the smallest multiple of `alignment` that is greater than or equal to `offset`.
///
/// `alignment` must be non-zero.
fn align_up(offset: usize, alignment: usize) -> usize {
    match offset % alignment {
        // Already on a boundary.
        0 => offset,
        // Advance by whatever is missing to reach the next boundary.
        rem => offset + (alignment - rem),
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Encodes `value` into a fresh buffer and returns the bytes.
    fn varint_bytes(value: u64) -> Vec<u8> {
        let mut bytes = Vec::new();
        encode_varint(value, &mut bytes);
        bytes
    }

    /// HNSW / layer-C header used by the segment round-trip tests.
    fn hnsw_layer_c_header(node_count: u64) -> IndexSegHeader {
        IndexSegHeader {
            index_type: 0,  // HNSW
            layer_level: 2, // Layer C
            m: 16,
            ef_construction: 200,
            node_count,
        }
    }

    /// Shorthand constructor for a node with the given per-layer neighbor lists.
    fn node(node_id: u64, layers: Vec<Vec<u64>>) -> NodeAdjacency {
        NodeAdjacency { node_id, layers }
    }

    /// Asserts that every decoded layer equals the original layer sorted ascending
    /// (the encoder sorts neighbor IDs before writing them).
    fn assert_layers_match_sorted(original: &[NodeAdjacency], decoded: &[NodeAdjacency]) {
        for (orig, dec) in original.iter().zip(decoded) {
            assert_eq!(dec.layers.len(), orig.layers.len());
            for (orig_layer, dec_layer) in orig.layers.iter().zip(&dec.layers) {
                let mut expected = orig_layer.clone();
                expected.sort();
                assert_eq!(*dec_layer, expected);
            }
        }
    }

    #[test]
    fn varint_round_trip() {
        let values = [0, 1, 127, 128, 16383, 16384, 2097151, u64::MAX];
        for val in values.iter().copied() {
            let buf = varint_bytes(val);
            let (decoded, consumed) = decode_varint(&buf).unwrap();
            assert_eq!(decoded, val);
            assert_eq!(consumed, buf.len());
        }
    }

    #[test]
    fn varint_encoding_sizes() {
        // (value, expected encoded length in bytes)
        let cases = [(0u64, 1usize), (127, 1), (128, 2), (16383, 2), (16384, 3)];
        for &(value, expected_len) in cases.iter() {
            assert_eq!(varint_bytes(value).len(), expected_len);
        }
    }

    #[test]
    fn delta_encode_decode_round_trip() {
        let ids = vec![100, 105, 108, 120, 200];
        let deltas = delta_encode(&ids);
        assert_eq!(deltas, vec![100, 5, 3, 12, 80]);
        assert_eq!(delta_decode(&deltas), ids);
    }

    #[test]
    fn delta_encode_empty() {
        assert!(delta_encode(&[]).is_empty());
        assert!(delta_decode(&[]).is_empty());
    }

    #[test]
    fn index_seg_round_trip() {
        let data = IndexSegData {
            header: hnsw_layer_c_header(5),
            restart_interval: 3,
            nodes: vec![
                node(0, vec![vec![1, 2, 3], vec![1]]),
                node(1, vec![vec![0, 2, 4]]),
                node(2, vec![vec![0, 1, 3, 4]]),
                node(3, vec![vec![0, 2, 4], vec![4]]),
                node(4, vec![vec![1, 2, 3]]),
            ],
        };

        let encoded = encode_index_seg(&data);
        let decoded = decode_index_seg(&encoded).unwrap();

        assert_eq!(decoded.header, data.header);
        assert_eq!(decoded.restart_interval, data.restart_interval);
        assert_eq!(decoded.nodes.len(), data.nodes.len());

        // Node IDs come back as positions, which match the IDs used above.
        for (orig, dec) in data.nodes.iter().zip(decoded.nodes.iter()) {
            assert_eq!(dec.node_id, orig.node_id);
        }
        // Neighbors are sorted during encoding.
        assert_layers_match_sorted(&data.nodes, &decoded.nodes);
    }

    #[test]
    fn index_seg_larger_with_restart() {
        // Enough nodes to span several restart groups.
        let num_nodes: u64 = 200;
        let restart_interval = 64;

        let mut nodes: Vec<NodeAdjacency> = Vec::new();
        for i in 0..num_nodes {
            let neighbors: Vec<u64> = (0..8u64).map(|j| (i + j + 1) % num_nodes).collect();
            nodes.push(node(i, vec![neighbors]));
        }

        let data = IndexSegData {
            header: hnsw_layer_c_header(num_nodes),
            restart_interval,
            nodes,
        };

        let encoded = encode_index_seg(&data);
        let decoded = decode_index_seg(&encoded).unwrap();

        assert_eq!(decoded.header, data.header);
        assert_eq!(decoded.nodes.len(), data.nodes.len());
        assert_layers_match_sorted(&data.nodes, &decoded.nodes);
    }

    #[test]
    fn delta_encoding_sorted_u64_sequences() {
        // Exact round-trip for several sorted u64 sequences, including the top of the range.
        let top_of_range: Vec<u64> = (0..5u64).rev().map(|back| u64::MAX - back).collect();
        let sequences: Vec<Vec<u64>> = vec![
            vec![0, 1, 2, 3, 4],
            vec![1000, 2000, 3000, 4000],
            vec![0, 100, 200, 300, 400, 500],
            top_of_range,
        ];
        for seq in sequences {
            let decoded = delta_decode(&delta_encode(&seq));
            assert_eq!(decoded, seq, "Failed for sequence: {:?}", seq);
        }
    }
}