Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'

This commit is contained in:
ruv
2026-02-28 14:39:40 -05:00
7854 changed files with 3522914 additions and 0 deletions

View File

@@ -0,0 +1,19 @@
# Manifest for the RVF integration-test crate. The library target is empty;
# all tests live under tests/ (see the crate's lib.rs).
[package]
name = "rvf-integration-tests"
version = "0.1.0"
edition = "2021"
# Never published to a registry — this crate exists only to host tests.
publish = false
description = "Integration and acceptance tests for the RVF crate family"
# NOTE(review): everything below is only consumed by code in tests/, so these
# could arguably live under [dev-dependencies] — confirm before changing.
[dependencies]
# Sibling crates of the RVF family, consumed by path so tests always build
# against the workspace's current sources.
rvf-types = { path = "../../rvf-types", features = ["std"] }
rvf-wire = { path = "../../rvf-wire" }
rvf-manifest = { path = "../../rvf-manifest" }
rvf-index = { path = "../../rvf-index" }
rvf-quant = { path = "../../rvf-quant" }
rvf-crypto = { path = "../../rvf-crypto" }
rvf-runtime = { path = "../../rvf-runtime" }
rvf-adapter-rvlite = { path = "../../rvf-adapters/rvlite" }
# Third-party helpers used by the tests themselves.
ed25519-dalek = { version = "2", features = ["rand_core"] }
rand = "0.8"
tempfile = "3"

View File

@@ -0,0 +1,2 @@
// This crate exists solely for integration tests.
// All tests live in the tests/ directory.

View File

@@ -0,0 +1,365 @@
//! Attestation system integration tests.
//!
//! Exercises the Confidential Core attestation APIs end-to-end:
//! record encoding/decoding, witness chain integrity, tamper detection,
//! TEE-bound key lifecycle, segment flags, and mixed witness type chains.
use rvf_crypto::attestation::{
build_attestation_witness_payload, decode_attestation_record, decode_tee_bound_key,
encode_attestation_record, encode_tee_bound_key, verify_attestation_witness_payload,
verify_key_binding, TeeBoundKeyRecord,
};
use rvf_crypto::hash::{shake256_128, shake256_256};
use rvf_crypto::witness::{create_witness_chain, verify_witness_chain, WitnessEntry};
use rvf_types::{
AttestationHeader, AttestationWitnessType, ErrorCode, RvfError, SegmentFlags, TeePlatform,
KEY_TYPE_TEE_BOUND,
};
// --------------------------------------------------------------------------
// 1. Attestation record round trip
// --------------------------------------------------------------------------
#[test]
fn attestation_record_round_trip() {
    // Assemble a header for a software-TEE platform attestation witness.
    let mut hdr = AttestationHeader::new(
        TeePlatform::SoftwareTee as u8,
        AttestationWitnessType::PlatformAttestation as u8,
    );
    hdr.measurement = shake256_256(b"test-enclave");
    hdr.nonce = [0x42; 16];
    hdr.quote_length = 64;
    hdr.report_data_len = 32;
    hdr.flags = AttestationHeader::FLAG_HAS_REPORT_DATA;
    let rd: Vec<u8> = (0..32).map(|i| (i * 3) as u8).collect();
    let quote_bytes: Vec<u8> = (0..64).map(|i| (i ^ 0xAB) as u8).collect();
    // Serialize and check the total record size.
    let bytes = encode_attestation_record(&hdr, &rd, &quote_bytes);
    assert_eq!(
        bytes.len(),
        112 + 32 + 64,
        "total record should be header + report_data + quote"
    );
    // Deserialize and confirm every field survives the round trip.
    let (got_hdr, got_rd, got_quote) = decode_attestation_record(&bytes).unwrap();
    assert_eq!(got_hdr.platform, TeePlatform::SoftwareTee as u8);
    assert_eq!(
        got_hdr.attestation_type,
        AttestationWitnessType::PlatformAttestation as u8
    );
    assert_eq!(got_hdr.measurement, hdr.measurement);
    assert_eq!(got_hdr.nonce, [0x42; 16]);
    assert_eq!(got_hdr.quote_length, 64);
    assert_eq!(got_hdr.report_data_len, 32);
    assert_eq!(got_hdr.flags, AttestationHeader::FLAG_HAS_REPORT_DATA);
    assert!(got_hdr.has_report_data());
    assert!(!got_hdr.is_debuggable());
    // Variable-length sections must round-trip unchanged as well.
    assert_eq!(got_rd, rd);
    assert_eq!(got_quote, quote_bytes);
}
// --------------------------------------------------------------------------
// 2. Attestation witness chain integrity
// --------------------------------------------------------------------------
#[test]
fn attestation_witness_chain_integrity() {
    // Create 3 attestation records with different platforms and witness types.
    let configs: &[(TeePlatform, AttestationWitnessType)] = &[
        (
            TeePlatform::Sgx,
            AttestationWitnessType::PlatformAttestation,
        ),
        (
            TeePlatform::SevSnp,
            AttestationWitnessType::ComputationProof,
        ),
        (TeePlatform::Tdx, AttestationWitnessType::DataProvenance),
    ];
    let mut records: Vec<Vec<u8>> = Vec::new();
    let mut timestamps: Vec<u64> = Vec::new();
    let mut witness_types: Vec<AttestationWitnessType> = Vec::new();
    // Each record gets a distinct measurement, nonce, payload bytes, and
    // timestamp so the verified entries are distinguishable below.
    for (i, &(platform, wit_type)) in configs.iter().enumerate() {
        let mut header = AttestationHeader::new(platform as u8, wit_type as u8);
        header.measurement = shake256_256(format!("enclave-{i}").as_bytes());
        header.nonce = [(i + 1) as u8; 16];
        header.quote_length = 32;
        header.report_data_len = 16;
        header.flags = AttestationHeader::FLAG_HAS_REPORT_DATA;
        let report_data: Vec<u8> = vec![i as u8; 16];
        let quote: Vec<u8> = vec![(i + 0x10) as u8; 32];
        records.push(encode_attestation_record(&header, &report_data, &quote));
        timestamps.push(1_000_000_000 + i as u64);
        witness_types.push(wit_type);
    }
    // Build witness payload.
    let payload = build_attestation_witness_payload(&records, &timestamps, &witness_types).unwrap();
    // Verify.
    let verified = verify_attestation_witness_payload(&payload).unwrap();
    assert_eq!(verified.len(), 3, "should have 3 verified entries");
    // Check each entry has the correct action_hash and witness type.
    // Per the assertion below, each entry's action_hash is the SHAKE-256
    // digest of the full encoded record bytes.
    for (i, (entry, header, rd, q)) in verified.iter().enumerate() {
        let expected_hash = shake256_256(&records[i]);
        assert_eq!(
            entry.action_hash, expected_hash,
            "entry {i}: action_hash should match SHAKE-256 of record"
        );
        assert_eq!(
            entry.witness_type, witness_types[i] as u8,
            "entry {i}: witness_type mismatch"
        );
        assert_eq!(
            header.platform, configs[i].0 as u8,
            "entry {i}: platform mismatch"
        );
        assert_eq!(rd.len(), 16, "entry {i}: report_data length");
        assert_eq!(q.len(), 32, "entry {i}: quote length");
    }
}
// --------------------------------------------------------------------------
// 3. Attestation witness tamper detection
// --------------------------------------------------------------------------
#[test]
fn attestation_witness_tamper_detection() {
    // Assemble a two-entry witness payload.
    let mut records: Vec<Vec<u8>> = Vec::new();
    let mut timestamps: Vec<u64> = Vec::new();
    let mut witness_types: Vec<AttestationWitnessType> = Vec::new();
    for i in 0..2 {
        let mut hdr = AttestationHeader::new(
            TeePlatform::SoftwareTee as u8,
            AttestationWitnessType::PlatformAttestation as u8,
        );
        hdr.measurement = shake256_256(format!("tamper-test-{i}").as_bytes());
        hdr.quote_length = 48;
        hdr.report_data_len = 24;
        hdr.flags = AttestationHeader::FLAG_HAS_REPORT_DATA;
        let rd: Vec<u8> = vec![i as u8; 24];
        let q: Vec<u8> = vec![0xDD; 48];
        records.push(encode_attestation_record(&hdr, &rd, &q));
        timestamps.push(2_000_000_000 + i);
        witness_types.push(AttestationWitnessType::PlatformAttestation);
    }
    let mut payload =
        build_attestation_witness_payload(&records, &timestamps, &witness_types).unwrap();
    // Layout: [4-byte count][2*8-byte offsets][2*73-byte chain][records...],
    // so the records section begins at byte 4 + 16 + 146 = 166.
    let records_start = 4 + 16 + 146;
    assert!(
        records_start + 50 < payload.len(),
        "payload should be large enough to tamper"
    );
    // Flip a single byte inside the records section to simulate tampering.
    payload[records_start + 50] ^= 0xFF;
    // The verifier must reject the payload with InvalidChecksum.
    let outcome = verify_attestation_witness_payload(&payload);
    assert!(outcome.is_err(), "tampered payload should fail verification");
    assert_eq!(
        outcome.unwrap_err(),
        RvfError::Code(ErrorCode::InvalidChecksum),
        "error should be InvalidChecksum"
    );
}
// --------------------------------------------------------------------------
// 4. TEE-bound key lifecycle
// --------------------------------------------------------------------------
#[test]
fn tee_bound_key_lifecycle() {
    // Seal a key bound to a software-TEE measurement.
    let measurement = shake256_256(b"test-measurement");
    let sealed: Vec<u8> = vec![0xAA; 32];
    let original = TeeBoundKeyRecord {
        key_type: KEY_TYPE_TEE_BOUND,
        algorithm: 1,
        sealed_key_length: sealed.len() as u16,
        key_id: shake256_128(b"test-key-id"),
        measurement,
        platform: TeePlatform::SoftwareTee as u8,
        reserved: [0u8; 3],
        valid_from: 0,
        valid_until: 0, // zero means the key never expires
        sealed_key: sealed.clone(),
    };
    // Encode then decode; the record must survive the round trip intact.
    let decoded = decode_tee_bound_key(&encode_tee_bound_key(&original)).unwrap();
    assert_eq!(decoded.key_type, KEY_TYPE_TEE_BOUND);
    assert_eq!(decoded.measurement, measurement);
    assert_eq!(decoded.sealed_key, sealed);
    assert_eq!(decoded.platform, TeePlatform::SoftwareTee as u8);
    assert_eq!(decoded.sealed_key_length, 32);
    // Matching platform + measurement must verify.
    let outcome = verify_key_binding(&decoded, TeePlatform::SoftwareTee, &measurement, 1_000_000);
    assert!(outcome.is_ok(), "matching binding should succeed");
    // A mismatched platform must be rejected.
    let outcome = verify_key_binding(
        &decoded,
        TeePlatform::Sgx, // does not match the sealed platform
        &measurement,
        1_000_000,
    );
    assert_eq!(
        outcome,
        Err(RvfError::Code(ErrorCode::KeyNotBound)),
        "wrong platform should return KeyNotBound"
    );
    // A mismatched measurement must be rejected as well.
    let other_measurement = shake256_256(b"wrong-measurement");
    let outcome = verify_key_binding(
        &decoded,
        TeePlatform::SoftwareTee,
        &other_measurement,
        1_000_000,
    );
    assert_eq!(
        outcome,
        Err(RvfError::Code(ErrorCode::KeyNotBound)),
        "wrong measurement should return KeyNotBound"
    );
}
// --------------------------------------------------------------------------
// 5. Attested segment flag
// --------------------------------------------------------------------------
#[test]
fn attested_segment_flag() {
    // Setting only ATTESTED must not imply SIGNED or SEALED.
    let only_attested = SegmentFlags::empty().with(SegmentFlags::ATTESTED);
    assert!(
        only_attested.contains(SegmentFlags::ATTESTED),
        "ATTESTED flag should be set"
    );
    assert!(
        !only_attested.contains(SegmentFlags::SIGNED),
        "SIGNED should not be set when only ATTESTED is"
    );
    assert!(
        !only_attested.contains(SegmentFlags::SEALED),
        "SEALED should not be set when only ATTESTED is"
    );
    // All three flags can coexist on a single segment.
    let all_three = SegmentFlags::empty()
        .with(SegmentFlags::SIGNED)
        .with(SegmentFlags::SEALED)
        .with(SegmentFlags::ATTESTED);
    assert!(all_three.contains(SegmentFlags::SIGNED));
    assert!(all_three.contains(SegmentFlags::SEALED));
    assert!(all_three.contains(SegmentFlags::ATTESTED));
    // Pin down the exact bit positions on the wire.
    assert_eq!(SegmentFlags::ATTESTED, 0x0400, "ATTESTED should be bit 10");
    assert_eq!(
        all_three.bits(),
        0x0004 | 0x0008 | 0x0400,
        "combined bits should be SIGNED|SEALED|ATTESTED"
    );
}
// --------------------------------------------------------------------------
// 6. Mixed witness types in chain
// --------------------------------------------------------------------------
#[test]
fn mixed_witness_types_in_chain() {
    // Build a chain with both standard and attestation witness types.
    // prev_hash starts zeroed on every entry; create_witness_chain is
    // expected to fill in the linkage (verified at the end of this test).
    let entries = vec![
        // Entry 1: standard PROVENANCE (0x01).
        WitnessEntry {
            prev_hash: [0u8; 32],
            action_hash: shake256_256(b"provenance-data"),
            timestamp_ns: 1_000_000_001,
            witness_type: 0x01,
        },
        // Entry 2: new PLATFORM_ATTESTATION (0x05).
        WitnessEntry {
            prev_hash: [0u8; 32],
            action_hash: shake256_256(b"platform-attestation-data"),
            timestamp_ns: 1_000_000_002,
            witness_type: AttestationWitnessType::PlatformAttestation as u8,
        },
        // Entry 3: standard COMPUTATION (0x02).
        WitnessEntry {
            prev_hash: [0u8; 32],
            action_hash: shake256_256(b"computation-data"),
            timestamp_ns: 1_000_000_003,
            witness_type: 0x02,
        },
        // Entry 4: new COMPUTATION_PROOF (0x07).
        WitnessEntry {
            prev_hash: [0u8; 32],
            action_hash: shake256_256(b"computation-proof-data"),
            timestamp_ns: 1_000_000_004,
            witness_type: AttestationWitnessType::ComputationProof as u8,
        },
    ];
    // Create the chain (links entries via prev_hash).
    let chain = create_witness_chain(&entries);
    // Each serialized entry occupies 73 bytes on the wire.
    assert_eq!(
        chain.len(),
        4 * 73,
        "chain should have 4 entries of 73 bytes each"
    );
    // Verify chain integrity.
    let verified = verify_witness_chain(&chain).unwrap();
    assert_eq!(verified.len(), 4, "all 4 entries should verify");
    // Check witness_type values: mixing legacy (0x01/0x02) and attestation
    // (0x05/0x07) discriminants in one chain must be accepted.
    assert_eq!(verified[0].witness_type, 0x01, "entry 0: PROVENANCE");
    assert_eq!(
        verified[1].witness_type, 0x05,
        "entry 1: PLATFORM_ATTESTATION"
    );
    assert_eq!(verified[2].witness_type, 0x02, "entry 2: COMPUTATION");
    assert_eq!(verified[3].witness_type, 0x07, "entry 3: COMPUTATION_PROOF");
    // Verify action hashes are preserved.
    assert_eq!(verified[0].action_hash, shake256_256(b"provenance-data"));
    assert_eq!(
        verified[1].action_hash,
        shake256_256(b"platform-attestation-data")
    );
    assert_eq!(verified[2].action_hash, shake256_256(b"computation-data"));
    assert_eq!(
        verified[3].action_hash,
        shake256_256(b"computation-proof-data")
    );
    // First entry has zero prev_hash, subsequent are chained.
    assert_eq!(
        verified[0].prev_hash, [0u8; 32],
        "first entry should have zero prev_hash"
    );
    assert_ne!(
        verified[1].prev_hash, [0u8; 32],
        "second entry should have non-zero prev_hash"
    );
}

View File

@@ -0,0 +1,113 @@
//! Bit-flip detection tests: verify that hash/CRC catches random corruption.
//!
//! From acceptance spec section 4: "Bit Flip Detection"
//! Pass criteria: 100% detection of single-bit flips. Corruption isolated to
//! affected segment.
use rvf_types::{SegmentFlags, SegmentType, SEGMENT_HEADER_SIZE};
use rvf_wire::{read_segment, validate_segment, write_segment};
#[test]
fn single_bit_flip_in_payload_detected() {
    // Every single-bit flip in the payload region must be caught by the
    // content-hash check; the acceptance criterion is 100% detection.
    let payload = b"important vector data that must not be corrupted";
    let encoded = write_segment(SegmentType::Vec as u8, payload, SegmentFlags::empty(), 1);
    let (header, _) = read_segment(&encoded).unwrap();
    let payload_start = SEGMENT_HEADER_SIZE;
    let payload_end = payload_start + payload.len();
    let total = (payload_end - payload_start) * 8;
    let mut detected = 0;
    for byte_idx in payload_start..payload_end {
        for bit in 0..8 {
            // Flip exactly one bit in a fresh copy and re-validate.
            let mut flipped = encoded.clone();
            flipped[byte_idx] ^= 1 << bit;
            if validate_segment(&header, &flipped[payload_start..payload_end]).is_err() {
                detected += 1;
            }
        }
    }
    assert_eq!(
        detected, total,
        "detected {detected}/{total} single-bit flips in payload"
    );
}
#[test]
fn multi_bit_corruption_detected() {
    // Corruption spread across several payload bytes must also be detected.
    let payload: Vec<u8> = (0..512).map(|i| (i % 256) as u8).collect();
    let encoded = write_segment(SegmentType::Vec as u8, &payload, SegmentFlags::empty(), 2);
    let (header, _) = read_segment(&encoded).unwrap();
    let start = SEGMENT_HEADER_SIZE;
    let mut mangled = encoded.clone();
    // Damage three separate bytes with different masks.
    for (off, mask) in [(0usize, 0xFFu8), (100, 0x55), (200, 0xAA)] {
        mangled[start + off] ^= mask;
    }
    assert!(
        validate_segment(&header, &mangled[start..start + payload.len()]).is_err(),
        "multi-byte corruption should be detected"
    );
}
#[test]
fn corruption_in_one_segment_does_not_affect_another() {
    // Concatenate two segments, corrupt the first, and confirm the second
    // still validates — corruption must stay isolated per segment.
    let payload_a = b"segment A vector data";
    let payload_b = b"segment B vector data";
    let seg_a = write_segment(SegmentType::Vec as u8, payload_a, SegmentFlags::empty(), 1);
    let seg_b = write_segment(SegmentType::Vec as u8, payload_b, SegmentFlags::empty(), 2);
    let mut file = seg_a.clone();
    let seg_b_offset = file.len();
    file.extend_from_slice(&seg_b);
    // Damage one byte of segment A's payload.
    let mut corrupted = file.clone();
    corrupted[SEGMENT_HEADER_SIZE] ^= 0xFF;
    // Segment A must now fail validation.
    let (hdr_a, _) = read_segment(&seg_a).unwrap();
    let damaged_a = &corrupted[SEGMENT_HEADER_SIZE..SEGMENT_HEADER_SIZE + payload_a.len()];
    assert!(
        validate_segment(&hdr_a, damaged_a).is_err(),
        "corrupted segment A should fail"
    );
    // Segment B, read from the corrupted file, must be untouched.
    let (hdr_b, body_b) = read_segment(&corrupted[seg_b_offset..]).unwrap();
    assert!(
        validate_segment(&hdr_b, body_b).is_ok(),
        "uncorrupted segment B should still pass"
    );
}
#[test]
fn header_magic_corruption_detected() {
    // A damaged magic number must make the segment unreadable outright.
    let mut corrupted = write_segment(SegmentType::Vec as u8, b"data", SegmentFlags::empty(), 1);
    corrupted[0] ^= 0x01;
    assert!(
        read_segment(&corrupted).is_err(),
        "corrupted magic should cause read failure"
    );
}
#[test]
fn zero_payload_hash_is_valid() {
    // The empty payload still carries a content hash that must validate.
    let encoded = write_segment(SegmentType::Meta as u8, &[], SegmentFlags::empty(), 0);
    let (hdr, body) = read_segment(&encoded).unwrap();
    assert!(validate_segment(&hdr, body).is_ok());
}

View File

@@ -0,0 +1,986 @@
//! Integration tests for the RVF computational container segments:
//! KERNEL_SEG (0x0E) and EBPF_SEG (0x0F).
//!
//! These tests exercise the raw binary format for embedded kernel images and
//! eBPF programs within RVF files. Because the high-level kernel/eBPF APIs
//! may not exist yet (other agents may be creating them), all tests construct
//! segment headers and payloads via raw byte manipulation. This ensures the
//! tests work regardless of whether typed wrappers are available.
//!
//! Wire format references:
//! - KERNEL_SEG segment type: 0x0E (SegmentType::Kernel)
//! - EBPF_SEG segment type: 0x0F (SegmentType::Ebpf)
//! - Segment header: 64 bytes (SEGMENT_HEADER_SIZE)
//! - KernelHeader payload: 128 bytes (magic 0x52564B4E = "RVKN")
//! - EbpfHeader payload: 64 bytes (magic 0x52564250 = "RVBP")
use rvf_runtime::options::{DistanceMetric, RvfOptions};
use rvf_runtime::RvfStore;
use rvf_types::{SegmentFlags, SegmentType, SEGMENT_HEADER_SIZE, SEGMENT_MAGIC, SEGMENT_VERSION};
use rvf_wire::{read_segment, validate_segment, write_segment};
use std::fs::OpenOptions;
use std::io::{Read, Write};
use tempfile::TempDir;
// ---------------------------------------------------------------------------
// Constants for the computational container sub-headers
// ---------------------------------------------------------------------------
/// KernelHeader magic: "RVKN" as big-endian u32 => 0x52564B4E.
const KERNEL_MAGIC: u32 = 0x5256_4B4E;
/// EbpfHeader magic: "RVBP" as big-endian u32 => 0x52564250.
const EBPF_MAGIC: u32 = 0x5256_4250;
/// Size of the KernelHeader in bytes.
const KERNEL_HEADER_SIZE: usize = 128;
/// Size of the EbpfHeader in bytes.
const EBPF_HEADER_SIZE: usize = 64;
/// Architecture discriminants for KernelHeader.arch field.
const ARCH_X86_64: u8 = 0x00;
// aarch64 image discriminant.
const ARCH_AARCH64: u8 = 0x01;
/// Kernel type discriminants for KernelHeader.kernel_type field.
const KERNEL_TYPE_UNIKERNEL: u8 = 0x00;
// 0xFD marks a non-bootable stub used only by these tests.
const KERNEL_TYPE_TEST_STUB: u8 = 0xFD;
/// Kernel flags (stored in a u32 at offset 8 of the KernelHeader).
/// Each constant is a single bit (0..3); they may be OR-ed together.
const KERNEL_FLAG_SIGNED: u32 = 0x0000_0001;
const KERNEL_FLAG_REQUIRES_TEE: u32 = 0x0000_0002;
const KERNEL_FLAG_READ_ONLY: u32 = 0x0000_0004;
const KERNEL_FLAG_INGEST_ENABLED: u32 = 0x0000_0008;
// ---------------------------------------------------------------------------
// Helper: construct a 128-byte KernelHeader payload
// ---------------------------------------------------------------------------
/// Build a 128-byte KernelHeader with the given parameters.
///
/// Layout (all little-endian):
/// [0..4] magic: u32 = 0x52564B4E
/// [4..6] version: u16
/// [6] arch: u8
/// [7] kernel_type: u8
/// [8..12] flags: u32
/// [12..16] entry_point: u32
/// [16..24] image_size: u64
/// [24..28] bss_size: u32
/// [28..30] stack_pages: u16
/// [30..32] max_dimension: u16
/// [32..64] image_hash: [u8; 32] (SHAKE-256-256 of the image bytes)
/// [64..80] reserved_0: [u8; 16]
/// [80..128] reserved_1: [u8; 48]
fn make_kernel_header(
    arch: u8,
    kernel_type: u8,
    flags: u32,
    entry_point: u32,
    image_size: u64,
    bss_size: u32,
    stack_pages: u16,
    max_dimension: u16,
    image_hash: [u8; 32],
) -> [u8; KERNEL_HEADER_SIZE] {
    // All multi-byte fields are little-endian; see the layout doc above.
    let mut hdr = [0u8; KERNEL_HEADER_SIZE];
    hdr[0..4].copy_from_slice(&KERNEL_MAGIC.to_le_bytes()); // magic
    hdr[4..6].copy_from_slice(&1u16.to_le_bytes()); // version 1
    hdr[6] = arch;
    hdr[7] = kernel_type;
    hdr[8..12].copy_from_slice(&flags.to_le_bytes());
    hdr[12..16].copy_from_slice(&entry_point.to_le_bytes());
    hdr[16..24].copy_from_slice(&image_size.to_le_bytes());
    hdr[24..28].copy_from_slice(&bss_size.to_le_bytes());
    hdr[28..30].copy_from_slice(&stack_pages.to_le_bytes());
    hdr[30..32].copy_from_slice(&max_dimension.to_le_bytes());
    hdr[32..64].copy_from_slice(&image_hash);
    // Bytes 64..128 are reserved and remain zero.
    hdr
}
// ---------------------------------------------------------------------------
// Helper: construct a 64-byte EbpfHeader payload
// ---------------------------------------------------------------------------
/// Build a 64-byte EbpfHeader with the given parameters.
///
/// Layout (all little-endian):
/// [0..4] magic: u32 = 0x52564250
/// [4..6] version: u16
/// [6] program_type: u8
/// [7] attach_point: u8
/// [8..12] flags: u32
/// [12..16] insn_count: u32
/// [16..20] map_count: u32
/// [20..22] max_dimension: u16
/// [22..24] reserved_0: u16
/// [24..32] program_hash: [u8; 8] (truncated hash of bytecode)
/// [32..64] reserved_1: [u8; 32]
fn make_ebpf_header(
    program_type: u8,
    attach_point: u8,
    flags: u32,
    insn_count: u32,
    map_count: u32,
    max_dimension: u16,
    program_hash: [u8; 8],
) -> [u8; EBPF_HEADER_SIZE] {
    // All multi-byte fields are little-endian; see the layout doc above.
    let mut hdr = [0u8; EBPF_HEADER_SIZE];
    hdr[0..4].copy_from_slice(&EBPF_MAGIC.to_le_bytes()); // magic
    hdr[4..6].copy_from_slice(&1u16.to_le_bytes()); // version 1
    hdr[6] = program_type;
    hdr[7] = attach_point;
    hdr[8..12].copy_from_slice(&flags.to_le_bytes());
    hdr[12..16].copy_from_slice(&insn_count.to_le_bytes());
    hdr[16..20].copy_from_slice(&map_count.to_le_bytes());
    hdr[20..22].copy_from_slice(&max_dimension.to_le_bytes());
    // Bytes 22..24 (reserved_0) stay zero from the initializer.
    hdr[24..32].copy_from_slice(&program_hash);
    // Bytes 32..64 (reserved_1) stay zero as well.
    hdr
}
// ---------------------------------------------------------------------------
// Helper: build a raw 64-byte RVF segment header
// ---------------------------------------------------------------------------
fn build_raw_segment_header(
    seg_type: u8,
    seg_id: u64,
    payload_len: u64,
) -> [u8; SEGMENT_HEADER_SIZE] {
    // Minimal raw header: only magic, version, type, id, and length are
    // populated; flags (0x06..0x08) and all remaining bytes stay zero.
    let mut hdr = [0u8; SEGMENT_HEADER_SIZE];
    hdr[0x00..0x04].copy_from_slice(&SEGMENT_MAGIC.to_le_bytes());
    hdr[0x04] = SEGMENT_VERSION;
    hdr[0x05] = seg_type;
    hdr[0x08..0x10].copy_from_slice(&seg_id.to_le_bytes());
    hdr[0x10..0x18].copy_from_slice(&payload_len.to_le_bytes());
    hdr
}
// ---------------------------------------------------------------------------
// Helper: simple hash for testing (non-cryptographic)
// ---------------------------------------------------------------------------
/// A simple deterministic hash for testing purposes. Produces a 32-byte digest.
/// A simple deterministic hash for testing purposes. Produces a 32-byte digest.
fn simple_test_hash(data: &[u8]) -> [u8; 32] {
    let mut digest = [0u8; 32];
    for (idx, byte) in data.iter().copied().enumerate() {
        // Accumulate the byte into its slot, then stir a rotated copy into a
        // second slot 13 positions ahead (mod 32) for light diffusion.
        let slot = idx % 32;
        digest[slot] = digest[slot].wrapping_add(byte);
        let mix = (idx + 13) % 32;
        digest[mix] = digest[mix].wrapping_add(digest[slot].rotate_left(3));
    }
    digest
}
// ---------------------------------------------------------------------------
// Helper: read entire file into bytes
// ---------------------------------------------------------------------------
/// Read an entire file into a byte vector.
///
/// Panics on any I/O error — acceptable for a test helper, where a missing
/// or unreadable file is a test-environment bug.
fn read_file_bytes(path: &std::path::Path) -> Vec<u8> {
    // std::fs::read pre-sizes the buffer from file metadata, replacing the
    // manual OpenOptions + read_to_end sequence with the idiomatic one-liner.
    std::fs::read(path).unwrap()
}
// ---------------------------------------------------------------------------
// Helper: scan file for segment headers, return (offset, type, id, payload_len)
// ---------------------------------------------------------------------------
// Scans every byte offset for the segment magic and decodes the type, id,
// and payload length fields at fixed offsets from each match.
// NOTE(review): because every offset is probed, payload bytes that happen to
// contain the magic sequence would be reported as (false-positive) segments;
// the tests in this file appear to rely on payloads not colliding — confirm
// if reusing this helper elsewhere.
fn scan_segments(file_bytes: &[u8]) -> Vec<(usize, u8, u64, u64)> {
    let magic_bytes = SEGMENT_MAGIC.to_le_bytes();
    let mut segments = Vec::new();
    // A file shorter than one header cannot contain any segment.
    if file_bytes.len() < SEGMENT_HEADER_SIZE {
        return segments;
    }
    let last_possible = file_bytes.len() - SEGMENT_HEADER_SIZE;
    for i in 0..=last_possible {
        if file_bytes[i..i + 4] == magic_bytes {
            // Field offsets mirror build_raw_segment_header: type at +0x05,
            // id at +0x08..0x10, payload length at +0x10..0x18 (little-endian).
            let seg_type = file_bytes[i + 5];
            let seg_id = u64::from_le_bytes(file_bytes[i + 0x08..i + 0x10].try_into().unwrap());
            let payload_len =
                u64::from_le_bytes(file_bytes[i + 0x10..i + 0x18].try_into().unwrap());
            segments.push((i, seg_type, seg_id, payload_len));
        }
    }
    segments
}
// ---------------------------------------------------------------------------
// Helper: make RvfStore options
// ---------------------------------------------------------------------------
// Build store options for the given vector dimension, with L2 distance and
// all other options left at their defaults.
fn make_options(dim: u16) -> RvfOptions {
    RvfOptions {
        dimension: dim,
        metric: DistanceMetric::L2,
        ..Default::default()
    }
}
// ===========================================================================
// TEST 1: kernel_header_round_trip
// ===========================================================================
/// Construct a 128-byte KernelHeader, wrap it in a KERNEL_SEG (type 0x0E)
/// using the rvf-wire writer, read it back, and verify all fields match.
#[test]
fn kernel_header_round_trip() {
    let image_hash = simple_test_hash(b"test kernel image bytes");
    let kernel_hdr = make_kernel_header(
        ARCH_X86_64, // arch
        KERNEL_TYPE_UNIKERNEL, // kernel_type
        KERNEL_FLAG_SIGNED | KERNEL_FLAG_READ_ONLY, // flags
        0x0000_1000, // entry_point
        4096, // image_size
        512, // bss_size
        4, // stack_pages
        256, // max_dimension
        image_hash,
    );
    // Write as a KERNEL_SEG using rvf-wire
    let seg_flags = SegmentFlags::empty();
    let encoded = write_segment(
        SegmentType::Kernel as u8,
        &kernel_hdr,
        seg_flags,
        100, // segment_id
    );
    // Read back the RVF segment
    let (header, payload) = read_segment(&encoded).unwrap();
    // Verify outer segment header
    assert_eq!(header.magic, SEGMENT_MAGIC, "segment magic mismatch");
    assert_eq!(header.version, SEGMENT_VERSION, "segment version mismatch");
    assert_eq!(
        header.seg_type,
        SegmentType::Kernel as u8,
        "segment type should be Kernel (0x0E)"
    );
    assert_eq!(header.segment_id, 100, "segment_id mismatch");
    assert_eq!(
        header.payload_length, KERNEL_HEADER_SIZE as u64,
        "payload length mismatch"
    );
    // Validate content hash
    validate_segment(&header, payload).expect("content hash validation should pass");
    // Verify inner KernelHeader fields
    assert_eq!(
        payload.len(),
        KERNEL_HEADER_SIZE,
        "kernel header payload size"
    );
    // Decode each field at its documented offset (all little-endian; see the
    // layout comment on make_kernel_header) and compare with the inputs.
    let magic = u32::from_le_bytes(payload[0..4].try_into().unwrap());
    assert_eq!(magic, KERNEL_MAGIC, "kernel magic mismatch");
    let version = u16::from_le_bytes(payload[4..6].try_into().unwrap());
    assert_eq!(version, 1, "kernel version mismatch");
    assert_eq!(payload[6], ARCH_X86_64, "arch mismatch");
    assert_eq!(payload[7], KERNEL_TYPE_UNIKERNEL, "kernel_type mismatch");
    let flags = u32::from_le_bytes(payload[8..12].try_into().unwrap());
    assert_eq!(
        flags,
        KERNEL_FLAG_SIGNED | KERNEL_FLAG_READ_ONLY,
        "kernel flags mismatch"
    );
    let entry_point = u32::from_le_bytes(payload[12..16].try_into().unwrap());
    assert_eq!(entry_point, 0x0000_1000, "entry_point mismatch");
    let image_size = u64::from_le_bytes(payload[16..24].try_into().unwrap());
    assert_eq!(image_size, 4096, "image_size mismatch");
    let bss_size = u32::from_le_bytes(payload[24..28].try_into().unwrap());
    assert_eq!(bss_size, 512, "bss_size mismatch");
    let stack_pages = u16::from_le_bytes(payload[28..30].try_into().unwrap());
    assert_eq!(stack_pages, 4, "stack_pages mismatch");
    let max_dimension = u16::from_le_bytes(payload[30..32].try_into().unwrap());
    assert_eq!(max_dimension, 256, "max_dimension mismatch");
    let mut read_hash = [0u8; 32];
    read_hash.copy_from_slice(&payload[32..64]);
    assert_eq!(read_hash, image_hash, "image_hash mismatch");
    println!("PASS: kernel_header_round_trip -- all fields verified");
}
// ===========================================================================
// TEST 2: ebpf_header_round_trip
// ===========================================================================
/// Construct a 64-byte EbpfHeader, wrap it in an EBPF_SEG (type 0x0F)
/// using the rvf-wire writer, read it back, and verify all fields match.
#[test]
fn ebpf_header_round_trip() {
    let program_hash: [u8; 8] = [0xDE, 0xAD, 0xBE, 0xEF, 0xCA, 0xFE, 0xBA, 0xBE];
    let ebpf_hdr = make_ebpf_header(
        0x01, // program_type: filter
        0x02, // attach_point: ingress
        0x0003, // flags
        256, // insn_count
        4, // map_count
        128, // max_dimension
        program_hash,
    );
    // Write as an EBPF_SEG
    let encoded = write_segment(
        SegmentType::Ebpf as u8,
        &ebpf_hdr,
        SegmentFlags::empty(),
        200,
    );
    // Read back
    let (header, payload) = read_segment(&encoded).unwrap();
    // Verify outer segment header
    assert_eq!(
        header.seg_type,
        SegmentType::Ebpf as u8,
        "segment type should be Ebpf (0x0F)"
    );
    assert_eq!(header.segment_id, 200);
    assert_eq!(header.payload_length, EBPF_HEADER_SIZE as u64);
    // Validate hash
    validate_segment(&header, payload).expect("ebpf content hash should validate");
    // Verify inner EbpfHeader fields by decoding each documented offset
    // (all little-endian; see the layout comment on make_ebpf_header).
    assert_eq!(payload.len(), EBPF_HEADER_SIZE);
    let magic = u32::from_le_bytes(payload[0..4].try_into().unwrap());
    assert_eq!(magic, EBPF_MAGIC, "ebpf magic mismatch");
    let version = u16::from_le_bytes(payload[4..6].try_into().unwrap());
    assert_eq!(version, 1);
    assert_eq!(payload[6], 0x01, "program_type mismatch");
    assert_eq!(payload[7], 0x02, "attach_point mismatch");
    let flags = u32::from_le_bytes(payload[8..12].try_into().unwrap());
    assert_eq!(flags, 0x0003, "ebpf flags mismatch");
    let insn_count = u32::from_le_bytes(payload[12..16].try_into().unwrap());
    assert_eq!(insn_count, 256, "insn_count mismatch");
    let map_count = u32::from_le_bytes(payload[16..20].try_into().unwrap());
    assert_eq!(map_count, 4, "map_count mismatch");
    let max_dim = u16::from_le_bytes(payload[20..22].try_into().unwrap());
    assert_eq!(max_dim, 128, "max_dimension mismatch");
    // program_hash lives at 24..32 (22..24 is reserved_0).
    let mut read_hash = [0u8; 8];
    read_hash.copy_from_slice(&payload[24..32]);
    assert_eq!(read_hash, program_hash, "program_hash mismatch");
    println!("PASS: ebpf_header_round_trip -- all fields verified");
}
// ===========================================================================
// TEST 3: kernel_segment_survives_store_reopen
// ===========================================================================
/// Create an RVF store, add vectors, manually append a fake KERNEL_SEG
/// (type 0x0E) to the file, close and reopen the store, then verify the
/// kernel segment is still present when scanning the raw file bytes.
#[test]
fn kernel_segment_survives_store_reopen() {
    let dir = TempDir::new().unwrap();
    let path = dir.path().join("kernel_reopen.rvf");
    let dim: u16 = 4;
    // Step 1: Create a store with some vectors
    {
        let mut store = RvfStore::create(&path, make_options(dim)).unwrap();
        let vectors: Vec<Vec<f32>> = (0..10).map(|i| vec![i as f32; dim as usize]).collect();
        let refs: Vec<&[f32]> = vectors.iter().map(|v| v.as_slice()).collect();
        let ids: Vec<u64> = (1..=10).collect();
        store.ingest_batch(&refs, &ids, None).unwrap();
        store.close().unwrap();
    }
    // Step 2: Manually append a KERNEL_SEG
    // NOTE(review): the raw header from build_raw_segment_header carries no
    // content checksum — this test presumes the reopen path tolerates such a
    // trailing segment; confirm against rvf-wire validation rules.
    let kernel_payload = make_kernel_header(
        ARCH_X86_64,
        KERNEL_TYPE_TEST_STUB,
        KERNEL_FLAG_INGEST_ENABLED,
        0x0000_2000,
        8192,
        1024,
        8,
        512,
        [0xAA; 32],
    );
    let kernel_seg_id: u64 = 5000;
    {
        let mut file = OpenOptions::new().append(true).open(&path).unwrap();
        let seg_header = build_raw_segment_header(
            SegmentType::Kernel as u8,
            kernel_seg_id,
            kernel_payload.len() as u64,
        );
        file.write_all(&seg_header).unwrap();
        file.write_all(&kernel_payload).unwrap();
        // Flush to disk so the raw-byte scans below see the appended segment.
        file.sync_all().unwrap();
    }
    // Step 3: Verify the kernel segment is in the file
    let bytes_before = read_file_bytes(&path);
    let segs_before = scan_segments(&bytes_before);
    let kernel_segs_before: Vec<_> = segs_before
        .iter()
        .filter(|s| s.1 == SegmentType::Kernel as u8)
        .collect();
    assert_eq!(
        kernel_segs_before.len(),
        1,
        "expected 1 KERNEL_SEG before reopen, found {}",
        kernel_segs_before.len()
    );
    assert_eq!(
        kernel_segs_before[0].2, kernel_seg_id,
        "segment ID mismatch before reopen"
    );
    // Step 4: Reopen the store (readonly) -- should not panic
    let store = RvfStore::open_readonly(&path).unwrap();
    assert_eq!(
        store.status().total_vectors,
        10,
        "store should still report 10 vectors after reopen with kernel segment"
    );
    // Step 5: Verify the kernel segment is still present in the raw file
    let bytes_after = read_file_bytes(&path);
    let segs_after = scan_segments(&bytes_after);
    let kernel_segs_after: Vec<_> = segs_after
        .iter()
        .filter(|s| s.1 == SegmentType::Kernel as u8)
        .collect();
    assert_eq!(
        kernel_segs_after.len(),
        1,
        "KERNEL_SEG should still be present after store reopen, found {}",
        kernel_segs_after.len()
    );
    assert_eq!(
        kernel_segs_after[0].2, kernel_seg_id,
        "segment ID mismatch after reopen"
    );
    // Verify the payload is intact
    // (compare byte-for-byte against the payload appended in step 2).
    let offset = kernel_segs_after[0].0;
    let payload_start = offset + SEGMENT_HEADER_SIZE;
    let payload_end = payload_start + KERNEL_HEADER_SIZE;
    assert!(
        bytes_after.len() >= payload_end,
        "file too short to contain kernel payload"
    );
    assert_eq!(
        &bytes_after[payload_start..payload_end],
        &kernel_payload[..],
        "kernel payload bytes should be preserved after reopen"
    );
    println!("PASS: kernel_segment_survives_store_reopen");
}
// ===========================================================================
// TEST 4: multi_arch_kernel_segments
// ===========================================================================
/// Create an RVF file with two KERNEL_SEGs: one for x86_64 (arch=0) and
/// one for aarch64 (arch=1). Verify both are present and distinguishable.
///
/// Within the KernelHeader payload this test reads: `arch` at byte offset 6
/// and `entry_point` as a little-endian u32 at byte offsets 12..16.
#[test]
fn multi_arch_kernel_segments() {
    let dir = TempDir::new().unwrap();
    let path = dir.path().join("multi_arch.rvf");
    let dim: u16 = 4;
    // Create a store with some vectors
    {
        let mut store = RvfStore::create(&path, make_options(dim)).unwrap();
        let vectors: Vec<Vec<f32>> = (0..5).map(|i| vec![i as f32; dim as usize]).collect();
        let refs: Vec<&[f32]> = vectors.iter().map(|v| v.as_slice()).collect();
        let ids: Vec<u64> = (1..=5).collect();
        store.ingest_batch(&refs, &ids, None).unwrap();
        store.close().unwrap();
    }
    // Append two KERNEL_SEGs with different architectures; entry points
    // (0x1000 vs 0x2000) intentionally differ so they can be told apart below.
    let x86_kernel = make_kernel_header(
        ARCH_X86_64,
        KERNEL_TYPE_UNIKERNEL,
        0,
        0x1000,
        4096,
        256,
        2,
        128,
        [0x11; 32],
    );
    let arm_kernel = make_kernel_header(
        ARCH_AARCH64,
        KERNEL_TYPE_UNIKERNEL,
        0,
        0x2000,
        8192,
        512,
        4,
        256,
        [0x22; 32],
    );
    {
        let mut file = OpenOptions::new().append(true).open(&path).unwrap();
        // x86_64 kernel
        let h1 = build_raw_segment_header(SegmentType::Kernel as u8, 6001, x86_kernel.len() as u64);
        file.write_all(&h1).unwrap();
        file.write_all(&x86_kernel).unwrap();
        // aarch64 kernel
        let h2 = build_raw_segment_header(SegmentType::Kernel as u8, 6002, arm_kernel.len() as u64);
        file.write_all(&h2).unwrap();
        file.write_all(&arm_kernel).unwrap();
        file.sync_all().unwrap();
    }
    // Scan the file for KERNEL_SEGs
    let bytes = read_file_bytes(&path);
    let segs = scan_segments(&bytes);
    let kernel_segs: Vec<_> = segs
        .iter()
        .filter(|s| s.1 == SegmentType::Kernel as u8)
        .collect();
    assert_eq!(
        kernel_segs.len(),
        2,
        "expected 2 KERNEL_SEGs (x86_64 + aarch64), found {}",
        kernel_segs.len()
    );
    // Extract and verify architectures
    let mut archs = Vec::new();
    for &(offset, _, seg_id, _) in &kernel_segs {
        let payload_start = offset + SEGMENT_HEADER_SIZE;
        let arch_byte = bytes[payload_start + 6]; // arch is at offset 6 in KernelHeader
        archs.push((seg_id, arch_byte));
        println!("  KERNEL_SEG id={} arch=0x{:02X}", seg_id, arch_byte);
    }
    // One should be x86_64 (0x00), the other aarch64 (0x01)
    let has_x86 = archs.iter().any(|&(_, a)| a == ARCH_X86_64);
    let has_arm = archs.iter().any(|&(_, a)| a == ARCH_AARCH64);
    assert!(has_x86, "should have an x86_64 KERNEL_SEG");
    assert!(has_arm, "should have an aarch64 KERNEL_SEG");
    // Verify entry points are different: locate each segment by its arch byte
    // and decode entry_point (LE u32 at payload offset 12..16).
    let x86_entry = {
        let &(off, _, _, _) = kernel_segs
            .iter()
            .find(|s| bytes[s.0 + SEGMENT_HEADER_SIZE + 6] == ARCH_X86_64)
            .unwrap();
        u32::from_le_bytes(
            bytes[off + SEGMENT_HEADER_SIZE + 12..off + SEGMENT_HEADER_SIZE + 16]
                .try_into()
                .unwrap(),
        )
    };
    let arm_entry = {
        let &(off, _, _, _) = kernel_segs
            .iter()
            .find(|s| bytes[s.0 + SEGMENT_HEADER_SIZE + 6] == ARCH_AARCH64)
            .unwrap();
        u32::from_le_bytes(
            bytes[off + SEGMENT_HEADER_SIZE + 12..off + SEGMENT_HEADER_SIZE + 16]
                .try_into()
                .unwrap(),
        )
    };
    assert_eq!(x86_entry, 0x1000, "x86_64 entry_point mismatch");
    assert_eq!(arm_entry, 0x2000, "aarch64 entry_point mismatch");
    println!("PASS: multi_arch_kernel_segments -- both architectures found and distinguishable");
}
// ===========================================================================
// TEST 5: kernel_image_hash_verification
// ===========================================================================
/// Embed a kernel with a known hash, read it back, compute the hash of the
/// image bytes, and verify it matches the image_hash field in the header.
///
/// KernelHeader fields read here: `image_size` (LE u64 at bytes 16..24) and
/// `image_hash` (32 bytes at 32..64). The image bytes follow the header.
#[test]
fn kernel_image_hash_verification() {
    // Fake kernel image data
    let image_data: Vec<u8> = (0..256u16).map(|i| (i & 0xFF) as u8).collect();
    let expected_hash = simple_test_hash(&image_data);
    // Build a KernelHeader with the image hash and the image as payload
    let kernel_hdr = make_kernel_header(
        ARCH_X86_64,
        KERNEL_TYPE_UNIKERNEL,
        0,
        0x0000_1000,
        image_data.len() as u64,
        0,
        2,
        64,
        expected_hash,
    );
    // Construct a full payload: KernelHeader + image_data
    let mut full_payload = Vec::with_capacity(KERNEL_HEADER_SIZE + image_data.len());
    full_payload.extend_from_slice(&kernel_hdr);
    full_payload.extend_from_slice(&image_data);
    // Write as a KERNEL_SEG
    let encoded = write_segment(
        SegmentType::Kernel as u8,
        &full_payload,
        SegmentFlags::empty(),
        300,
    );
    // Read back
    let (header, payload) = read_segment(&encoded).unwrap();
    validate_segment(&header, payload).expect("segment hash should validate");
    // Extract the KernelHeader from the payload
    assert!(payload.len() >= KERNEL_HEADER_SIZE + image_data.len());
    // Read image_hash from offset 32..64 of the KernelHeader
    let mut stored_hash = [0u8; 32];
    stored_hash.copy_from_slice(&payload[32..64]);
    // Read image_size from offset 16..24
    let stored_image_size = u64::from_le_bytes(payload[16..24].try_into().unwrap());
    assert_eq!(
        stored_image_size,
        image_data.len() as u64,
        "image_size should match"
    );
    // Extract image bytes from after the KernelHeader
    let image_start = KERNEL_HEADER_SIZE;
    let image_end = image_start + stored_image_size as usize;
    let extracted_image = &payload[image_start..image_end];
    // Compute hash of extracted image
    let computed_hash = simple_test_hash(extracted_image);
    // Verify hash match
    assert_eq!(
        stored_hash, computed_hash,
        "image_hash in KernelHeader should match computed hash of image bytes"
    );
    assert_eq!(
        stored_hash, expected_hash,
        "image_hash should match the original expected hash"
    );
    println!("PASS: kernel_image_hash_verification -- hash verified successfully");
}
// ===========================================================================
// TEST 6: kernel_flags_validation
// ===========================================================================
/// Test that SIGNED, REQUIRES_TEE, READ_ONLY, and INGEST_ENABLED flags
/// are preserved through a write/read cycle, both individually and combined.
#[test]
fn kernel_flags_validation() {
    // Round-trip helper: embed a KernelHeader carrying `flags` into a
    // KERNEL_SEG with the given segment id, then decode the flags word back
    // out of the payload (little-endian u32 at KernelHeader byte offset 8).
    let round_trip = |flags: u32, seg_id: u64| -> u32 {
        let hdr = make_kernel_header(
            ARCH_X86_64,
            KERNEL_TYPE_UNIKERNEL,
            flags,
            0,
            0,
            0,
            0,
            0,
            [0u8; 32],
        );
        let encoded = write_segment(SegmentType::Kernel as u8, &hdr, SegmentFlags::empty(), seg_id);
        let (_header, payload) = read_segment(&encoded).unwrap();
        u32::from_le_bytes(payload[8..12].try_into().unwrap())
    };
    // Each flag must survive a write/read cycle in isolation.
    let flag_tests: [(u32, &str); 4] = [
        (KERNEL_FLAG_SIGNED, "SIGNED"),
        (KERNEL_FLAG_REQUIRES_TEE, "REQUIRES_TEE"),
        (KERNEL_FLAG_READ_ONLY, "READ_ONLY"),
        (KERNEL_FLAG_INGEST_ENABLED, "INGEST_ENABLED"),
    ];
    for &(flag, name) in &flag_tests {
        let read_flags = round_trip(flag, 400);
        assert_eq!(
            read_flags, flag,
            "flag {name} (0x{flag:08X}) not preserved: got 0x{read_flags:08X}"
        );
        assert!(read_flags & flag != 0, "flag {name} bit should be set");
        println!("  flag {name} (0x{flag:08X}): OK");
    }
    // All four flags OR-ed together must also round-trip intact.
    let all_flags = KERNEL_FLAG_SIGNED
        | KERNEL_FLAG_REQUIRES_TEE
        | KERNEL_FLAG_READ_ONLY
        | KERNEL_FLAG_INGEST_ENABLED;
    let read_flags = round_trip(all_flags, 401);
    assert_eq!(
        read_flags, all_flags,
        "all kernel flags combined (0x{all_flags:08X}) not preserved: got 0x{read_flags:08X}"
    );
    assert!(
        read_flags & KERNEL_FLAG_SIGNED != 0,
        "SIGNED bit missing from combined"
    );
    assert!(
        read_flags & KERNEL_FLAG_REQUIRES_TEE != 0,
        "REQUIRES_TEE bit missing from combined"
    );
    assert!(
        read_flags & KERNEL_FLAG_READ_ONLY != 0,
        "READ_ONLY bit missing from combined"
    );
    assert!(
        read_flags & KERNEL_FLAG_INGEST_ENABLED != 0,
        "INGEST_ENABLED bit missing from combined"
    );
    println!("PASS: kernel_flags_validation -- all flag bits preserved");
}
// ===========================================================================
// TEST 7: ebpf_max_dimension_check
// ===========================================================================
/// Create an EBPF_SEG for a range of max_dimension values (zero through
/// u16::MAX) and verify the field is correctly stored and retrieved through
/// a write/read cycle.
#[test]
fn ebpf_max_dimension_check() {
    // Round-trip helper: wrap an EbpfHeader carrying `max_dim` in an
    // EBPF_SEG, then decode the field back out of the payload
    // (little-endian u16 at byte offset 20).
    let round_trip = |max_dim: u16| -> u16 {
        let hdr = make_ebpf_header(0x01, 0x00, 0, 100, 2, max_dim, [0u8; 8]);
        let encoded = write_segment(SegmentType::Ebpf as u8, &hdr, SegmentFlags::empty(), 500);
        let (_header, payload) = read_segment(&encoded).unwrap();
        u16::from_le_bytes(payload[20..22].try_into().unwrap())
    };
    for &(max_dim, label) in &[
        (0u16, "zero"),
        (1, "minimum"),
        (128, "typical"),
        (256, "larger"),
        (1024, "large"),
        (u16::MAX, "max u16"),
    ] {
        let read_max_dim = round_trip(max_dim);
        assert_eq!(
            read_max_dim, max_dim,
            "max_dimension for case '{label}': expected {max_dim}, got {read_max_dim}"
        );
        println!("  max_dimension={max_dim} ({label}): OK");
    }
    println!("PASS: ebpf_max_dimension_check -- all dimension values preserved");
}
// ===========================================================================
// TEST 8: test_stub_kernel_type
// ===========================================================================
/// Create a KERNEL_SEG with kernel_type=0xFD (TestStub). This is the first
/// end-to-end demo target per implementation priorities. Verifies the
/// kernel_type field round-trips correctly and the segment is readable.
///
/// KernelHeader fields read here: `kernel_type` (byte 7), `image_size`
/// (LE u64 at bytes 16..24), `image_hash` (32 bytes at 32..64).
#[test]
fn test_stub_kernel_type() {
    let test_stub_image = b"#!/bin/test_stub\x00RVF_TEST_KERNEL_V1\x00";
    let image_hash = simple_test_hash(test_stub_image);
    let kernel_hdr = make_kernel_header(
        ARCH_X86_64,
        KERNEL_TYPE_TEST_STUB, // 0xFD
        KERNEL_FLAG_INGEST_ENABLED,
        0x0000_0000, // entry_point: 0 for test stubs
        test_stub_image.len() as u64,
        0, // bss_size: none
        1, // stack_pages: minimal
        64, // max_dimension
        image_hash,
    );
    // Full payload: KernelHeader + test stub image
    let mut full_payload = Vec::with_capacity(KERNEL_HEADER_SIZE + test_stub_image.len());
    full_payload.extend_from_slice(&kernel_hdr);
    full_payload.extend_from_slice(test_stub_image);
    // Write as KERNEL_SEG
    let encoded = write_segment(
        SegmentType::Kernel as u8,
        &full_payload,
        SegmentFlags::empty(),
        600,
    );
    // Read back
    let (header, payload) = read_segment(&encoded).unwrap();
    // Verify outer segment
    assert_eq!(header.seg_type, SegmentType::Kernel as u8);
    assert_eq!(header.segment_id, 600);
    validate_segment(&header, payload).expect("test stub content hash should validate");
    // Verify kernel_type is TestStub (0xFD)
    assert_eq!(
        payload[7], KERNEL_TYPE_TEST_STUB,
        "kernel_type should be TestStub (0xFD), got 0x{:02X}",
        payload[7]
    );
    // Verify the test stub image is intact
    let image_start = KERNEL_HEADER_SIZE;
    let image_size = u64::from_le_bytes(payload[16..24].try_into().unwrap()) as usize;
    assert_eq!(image_size, test_stub_image.len(), "image_size mismatch");
    let extracted = &payload[image_start..image_start + image_size];
    assert_eq!(extracted, test_stub_image, "test stub image data mismatch");
    // Verify hash
    let mut stored_hash = [0u8; 32];
    stored_hash.copy_from_slice(&payload[32..64]);
    let computed = simple_test_hash(extracted);
    assert_eq!(stored_hash, computed, "test stub image hash mismatch");
    // Verify this can also be written to a file and survive a store reopen
    let dir = TempDir::new().unwrap();
    let path = dir.path().join("test_stub.rvf");
    {
        let mut store = RvfStore::create(&path, make_options(4)).unwrap();
        let v = vec![1.0f32; 4];
        store.ingest_batch(&[v.as_slice()], &[1], None).unwrap();
        store.close().unwrap();
    }
    // Append the test stub segment (raw segment header + payload bytes)
    {
        let mut file = OpenOptions::new().append(true).open(&path).unwrap();
        let seg_header =
            build_raw_segment_header(SegmentType::Kernel as u8, 600, full_payload.len() as u64);
        file.write_all(&seg_header).unwrap();
        file.write_all(&full_payload).unwrap();
        file.sync_all().unwrap();
    }
    // Reopen and verify store is not broken
    let store = RvfStore::open_readonly(&path).unwrap();
    assert_eq!(
        store.status().total_vectors,
        1,
        "store should still work with test stub segment"
    );
    // Verify test stub is in the file
    let bytes = read_file_bytes(&path);
    let segs = scan_segments(&bytes);
    let kernel_segs: Vec<_> = segs
        .iter()
        .filter(|s| s.1 == SegmentType::Kernel as u8)
        .collect();
    assert_eq!(
        kernel_segs.len(),
        1,
        "should find one KERNEL_SEG (TestStub)"
    );
    // kernel_type byte sits 7 bytes into the payload that follows the
    // fixed-size segment header at the scanned offset.
    let kernel_offset = kernel_segs[0].0;
    let kt = bytes[kernel_offset + SEGMENT_HEADER_SIZE + 7];
    assert_eq!(
        kt, KERNEL_TYPE_TEST_STUB,
        "kernel_type in file should be TestStub (0xFD), got 0x{:02X}",
        kt
    );
    println!("PASS: test_stub_kernel_type -- TestStub (0xFD) end-to-end verified");
}

View File

@@ -0,0 +1,573 @@
//! Performance benchmarks for the RVCOW subsystem.
//!
//! All benchmarks are gated behind `#[ignore]` so that `cargo test` does not
//! run them by default. Execute with:
//!
//! ```sh
//! cargo test --test cow_benchmarks -- --ignored --nocapture
//! ```
use std::time::Instant;
use rvf_runtime::options::{DistanceMetric, RvfOptions};
use rvf_runtime::RvfStore;
use tempfile::TempDir;
// -- Helpers ------------------------------------------------------------------
fn make_options(dim: u16) -> RvfOptions {
RvfOptions {
dimension: dim,
metric: DistanceMetric::L2,
..Default::default()
}
}
/// Deterministic pseudo-random vector of `dim` floats derived from `seed`.
///
/// Uses a 64-bit LCG (Knuth MMIX constants); the top 31 bits of each state
/// are scaled by 1/u32::MAX and shifted down by 0.5. Same (dim, seed) always
/// yields the same vector.
fn random_vector(dim: usize, seed: u64) -> Vec<f32> {
    let mut state = seed;
    (0..dim)
        .map(|_| {
            state = state
                .wrapping_mul(6364136223846793005)
                .wrapping_add(1442695040888963407);
            ((state >> 33) as f32) / (u32::MAX as f32) - 0.5
        })
        .collect()
}
/// Run `f` for `iters` iterations, returning (min, avg, max) in the unit
/// returned by `f` (typically nanoseconds or microseconds).
///
/// # Panics
/// Panics with an explicit message if `iters` is zero — min/avg/max are
/// undefined for an empty sample, and the average would otherwise fail with
/// a bare divide-by-zero.
fn bench_iterations<F: FnMut() -> u128>(mut f: F, iters: usize) -> (u128, u128, u128) {
    assert!(iters > 0, "bench_iterations requires at least one iteration");
    let mut min = u128::MAX;
    let mut max = 0u128;
    let mut sum = 0u128;
    for _ in 0..iters {
        let val = f();
        min = min.min(val);
        max = max.max(val);
        sum += val;
    }
    (min, sum / iters as u128, max)
}
// =============================================================================
// BENCHMARK 1: COW Branch Creation
// =============================================================================
/// Times `RvfStore::branch` against base stores of increasing size and
/// reports the child file's size as a percentage of its parent's.
#[test]
#[ignore]
fn bench_cow_branch_creation() {
    println!("\n=== BENCH: COW Branch Creation ===");
    let dim: u16 = 32;
    for &count in &[10_000u64, 50_000, 100_000] {
        let dir = TempDir::new().unwrap();
        let base_path = dir.path().join("base.rvf");
        // Create and populate base store in batches of up to 5000 vectors
        let mut base = RvfStore::create(&base_path, make_options(dim)).unwrap();
        let batch_size = 5000;
        let mut id_counter = 0u64;
        while id_counter < count {
            let n = std::cmp::min(batch_size, (count - id_counter) as usize);
            let vecs: Vec<Vec<f32>> = (0..n)
                .map(|i| random_vector(dim as usize, id_counter + i as u64))
                .collect();
            let refs: Vec<&[f32]> = vecs.iter().map(|v| v.as_slice()).collect();
            let ids: Vec<u64> = (id_counter..id_counter + n as u64).collect();
            base.ingest_batch(&refs, &ids, None).unwrap();
            id_counter += n as u64;
        }
        let base_size = std::fs::metadata(&base_path).unwrap().len();
        // Each iteration branches into a fresh uniquely-named child file;
        // only the branch() call itself is inside the timed window.
        let (min_us, avg_us, max_us) = bench_iterations(
            || {
                let child_path = dir.path().join(format!("child_{}.rvf", rand_u64()));
                let start = Instant::now();
                let child = base.branch(&child_path).unwrap();
                let elapsed = start.elapsed().as_micros();
                let child_size = std::fs::metadata(&child_path).unwrap().len();
                let pct = (child_size as f64 / base_size as f64) * 100.0;
                println!(
                    "BENCH: branch_create({count} vecs): child_size={child_size} ({pct:.1}% of parent {base_size})"
                );
                child.close().unwrap();
                elapsed
            },
            3,
        );
        println!(
            "BENCH: branch_create({count} vecs): min={min_us}us avg={avg_us}us max={max_us}us"
        );
        base.close().unwrap();
    }
}
// =============================================================================
// BENCHMARK 2: COW Read Latency (local vs inherited)
// =============================================================================
/// Compares per-vector read latency through `CowEngine` for clusters that
/// are local to the child file versus clusters still inherited from the
/// parent file.
#[test]
#[ignore]
fn bench_cow_read_latency() {
    println!("\n=== BENCH: COW Read Latency ===");
    use rvf_runtime::cow::CowEngine;
    use std::io::Write;
    let cluster_size = 4096u32;
    let bytes_per_vec = 128u32; // 32 floats
    let vecs_per_cluster = cluster_size / bytes_per_vec; // 32
    let cluster_count = 100u32;
    // Create a parent file with cluster data; each cluster is filled with
    // its own id byte so clusters are distinguishable on disk.
    let parent_tmp = tempfile::NamedTempFile::new().unwrap();
    {
        let f = parent_tmp.as_file();
        let mut writer = std::io::BufWriter::new(f);
        for cid in 0..cluster_count {
            let mut data = vec![0u8; cluster_size as usize];
            for b in data.iter_mut() {
                *b = (cid & 0xFF) as u8;
            }
            writer.write_all(&data).unwrap();
        }
        writer.flush().unwrap();
    }
    let child_tmp = tempfile::NamedTempFile::new().unwrap();
    // Engine with all clusters inherited from parent
    let mut engine =
        CowEngine::from_parent(cluster_count, cluster_size, vecs_per_cluster, bytes_per_vec);
    // Write some vectors to make a few clusters local (ids 0..10 all land in
    // cluster 0, which becomes local after the flush below)
    let local_data = vec![0xAAu8; bytes_per_vec as usize];
    for vid in 0..10u64 {
        engine.write_vector(vid, &local_data).unwrap();
    }
    engine
        .flush_writes(
            &mut child_tmp.as_file().try_clone().unwrap(),
            Some(parent_tmp.as_file()),
        )
        .unwrap();
    // Benchmark: read local vectors (cluster 0 is now local)
    let read_count = 1000;
    let (min_ns, avg_ns, max_ns) = bench_iterations(
        || {
            let start = Instant::now();
            for vid in 0..read_count as u64 {
                let id = vid % (vecs_per_cluster as u64); // stay in cluster 0 (local)
                let _ = engine.read_vector(id, child_tmp.as_file(), Some(parent_tmp.as_file()));
            }
            start.elapsed().as_nanos() / read_count as u128
        },
        3,
    );
    println!("BENCH: cow_read_local: min={min_ns}ns avg={avg_ns}ns max={max_ns}ns per vector");
    // Benchmark: read inherited vectors (cluster 50..99 are parent-ref)
    let (min_ns, avg_ns, max_ns) = bench_iterations(
        || {
            let start = Instant::now();
            for i in 0..read_count as u64 {
                let cid = 50 + (i % 50);
                let vid = cid * vecs_per_cluster as u64; // first vector in inherited cluster
                let _ = engine.read_vector(vid, child_tmp.as_file(), Some(parent_tmp.as_file()));
            }
            start.elapsed().as_nanos() / read_count as u128
        },
        3,
    );
    println!("BENCH: cow_read_inherited: min={min_ns}ns avg={avg_ns}ns max={max_ns}ns per vector");
}
// =============================================================================
// BENCHMARK 3: COW Write + Coalescing
// =============================================================================
/// Compares COW write cost when writes are coalesced into a single cluster
/// versus scattered across one cluster each, measuring the full
/// write + flush path and reporting the resulting COW event counts.
#[test]
#[ignore]
fn bench_cow_write_coalescing() {
    println!("\n=== BENCH: COW Write Coalescing ===");
    use rvf_runtime::cow::CowEngine;
    use std::io::Write;
    let cluster_size = 4096u32;
    let bytes_per_vec = 128u32;
    let vecs_per_cluster = cluster_size / bytes_per_vec;
    let cluster_count = 1000u32;
    let write_count = 500u64;
    // Create parent file (zero-filled clusters; content is irrelevant here)
    let parent_tmp = tempfile::NamedTempFile::new().unwrap();
    {
        let f = parent_tmp.as_file();
        let mut writer = std::io::BufWriter::new(f);
        for _ in 0..cluster_count {
            let data = vec![0u8; cluster_size as usize];
            writer.write_all(&data).unwrap();
        }
        writer.flush().unwrap();
    }
    let vec_data = vec![0xBBu8; bytes_per_vec as usize];
    // Coalesced writes: all N vectors to the SAME cluster (cluster 0).
    // A fresh child file and engine are created per iteration so each run
    // starts from an all-inherited state.
    let (min_us, avg_us, max_us) = bench_iterations(
        || {
            let child_tmp = tempfile::NamedTempFile::new().unwrap();
            let mut engine = CowEngine::from_parent(
                cluster_count,
                cluster_size,
                vecs_per_cluster,
                bytes_per_vec,
            );
            let start = Instant::now();
            for i in 0..write_count.min(vecs_per_cluster as u64) {
                engine.write_vector(i, &vec_data).unwrap();
            }
            let events = engine
                .flush_writes(
                    &mut child_tmp.as_file().try_clone().unwrap(),
                    Some(parent_tmp.as_file()),
                )
                .unwrap();
            let elapsed = start.elapsed().as_micros();
            println!(
                "BENCH: write_coalesced({} vecs, 1 cluster): {elapsed}us, {} COW events",
                write_count.min(vecs_per_cluster as u64),
                events.len()
            );
            elapsed
        },
        3,
    );
    println!("BENCH: write_coalesced: min={min_us}us avg={avg_us}us max={max_us}us");
    // Scattered writes: each vector to a DIFFERENT cluster
    let (min_us, avg_us, max_us) = bench_iterations(
        || {
            let child_tmp = tempfile::NamedTempFile::new().unwrap();
            let mut engine = CowEngine::from_parent(
                cluster_count,
                cluster_size,
                vecs_per_cluster,
                bytes_per_vec,
            );
            let start = Instant::now();
            for i in 0..write_count {
                // Vector i * vecs_per_cluster lands in cluster i
                let vid = i * vecs_per_cluster as u64;
                engine.write_vector(vid, &vec_data).unwrap();
            }
            let events = engine
                .flush_writes(
                    &mut child_tmp.as_file().try_clone().unwrap(),
                    Some(parent_tmp.as_file()),
                )
                .unwrap();
            let elapsed = start.elapsed().as_micros();
            println!(
                "BENCH: write_scattered({write_count} vecs, {write_count} clusters): {elapsed}us, {} COW events",
                events.len()
            );
            elapsed
        },
        3,
    );
    println!("BENCH: write_scattered: min={min_us}us avg={avg_us}us max={max_us}us");
}
// =============================================================================
// BENCHMARK 4: CowMap Lookup
// =============================================================================
/// Measures per-lookup latency of `CowMap::lookup` across map sizes, with
/// roughly 10% of entries updated to local offsets.
#[test]
#[ignore]
fn bench_cowmap_lookup() {
    println!("\n=== BENCH: CowMap Lookup ===");
    use rvf_runtime::cow_map::CowMap;
    use rvf_types::cow_map::CowMapEntry;
    let lookup_count = 100_000u64;
    for &map_size in &[1_000u32, 10_000, 100_000] {
        let mut map = CowMap::new_parent_ref(map_size);
        // Make ~10% of entries local (every 10th cluster)
        for i in (0..map_size).step_by(10) {
            map.update(i, CowMapEntry::LocalOffset(i as u64 * 4096));
        }
        // Lookups cycle through all cluster ids; elapsed time is averaged
        // per lookup inside the timed closure.
        let (min_ns, avg_ns, max_ns) = bench_iterations(
            || {
                let start = Instant::now();
                for i in 0..lookup_count {
                    let cluster_id = (i % map_size as u64) as u32;
                    let _ = map.lookup(cluster_id);
                }
                start.elapsed().as_nanos() / lookup_count as u128
            },
            5,
        );
        println!(
            "BENCH: cowmap_lookup(size={map_size}, lookups={lookup_count}): min={min_ns}ns avg={avg_ns}ns max={max_ns}ns per lookup"
        );
    }
}
// =============================================================================
// BENCHMARK 5: MembershipFilter contains()
// =============================================================================
/// Measures per-check latency of `MembershipFilter::contains` across filter
/// capacities, with every other ID added (~50% membership).
#[test]
#[ignore]
fn bench_membership_contains() {
    println!("\n=== BENCH: MembershipFilter contains() ===");
    use rvf_runtime::membership::MembershipFilter;
    let check_count = 1_000_000u64;
    for &member_count in &[100_000u64, 500_000, 1_000_000] {
        let mut filter = MembershipFilter::new_include(member_count);
        // Add ~50% of IDs (every even id)
        for i in (0..member_count).step_by(2) {
            filter.add(i);
        }
        // Checks cycle through all ids; elapsed time is averaged per check.
        let (min_ns, avg_ns, max_ns) = bench_iterations(
            || {
                let start = Instant::now();
                for i in 0..check_count {
                    let id = i % member_count;
                    let _ = filter.contains(id);
                }
                start.elapsed().as_nanos() / check_count as u128
            },
            5,
        );
        println!(
            "BENCH: membership_contains(capacity={member_count}, checks={check_count}): min={min_ns}ns avg={avg_ns}ns max={max_ns}ns per check"
        );
    }
}
// =============================================================================
// BENCHMARK 6: MembershipFilter Serialization Round-Trip
// =============================================================================
/// Measures a full MembershipFilter serialize + deserialize round trip
/// (header + bitmap) across filter capacities, with every third ID added.
#[test]
#[ignore]
fn bench_membership_serialization() {
    println!("\n=== BENCH: MembershipFilter Serialization ===");
    use rvf_runtime::membership::MembershipFilter;
    for &capacity in &[10_000u64, 100_000, 1_000_000] {
        let mut filter = MembershipFilter::new_include(capacity);
        for i in (0..capacity).step_by(3) {
            filter.add(i);
        }
        // Timed window covers header extraction, bitmap serialization, and
        // deserialization back into a filter.
        let (min_us, avg_us, max_us) = bench_iterations(
            || {
                let start = Instant::now();
                let header = filter.to_header();
                let bitmap_data = filter.serialize();
                let _restored = MembershipFilter::deserialize(&bitmap_data, &header).unwrap();
                start.elapsed().as_micros()
            },
            5,
        );
        // Extra serialize outside the timed loop, purely to report the size
        let bitmap_size = filter.serialize().len();
        println!(
            "BENCH: membership_serde(capacity={capacity}, bitmap_bytes={bitmap_size}): min={min_us}us avg={avg_us}us max={max_us}us"
        );
    }
}
// =============================================================================
// BENCHMARK 7: Freeze Operation
// =============================================================================
/// Measures `CowEngine::freeze` latency as a function of cluster count.
/// A fresh engine is built per iteration since freeze mutates the engine.
#[test]
#[ignore]
fn bench_freeze_operation() {
    println!("\n=== BENCH: Freeze Operation ===");
    use rvf_runtime::cow::CowEngine;
    for &cluster_count in &[100u32, 1_000, 10_000] {
        let (min_ns, avg_ns, max_ns) = bench_iterations(
            || {
                // Engine construction is outside the timed window
                let mut engine = CowEngine::from_parent(cluster_count, 4096, 32, 128);
                let start = Instant::now();
                engine.freeze(1).unwrap();
                start.elapsed().as_nanos()
            },
            10,
        );
        println!(
            "BENCH: freeze(clusters={cluster_count}): min={min_ns}ns avg={avg_ns}ns max={max_ns}ns"
        );
    }
}
// =============================================================================
// BENCHMARK 8: CowMap Serialization Round-Trip
// =============================================================================
/// Measures a full CowMap serialize + deserialize round trip (FlatArray
/// wire format) across map sizes, with every fifth entry made local.
#[test]
#[ignore]
fn bench_cowmap_serialization() {
    println!("\n=== BENCH: CowMap Serialization ===");
    use rvf_runtime::cow_map::CowMap;
    use rvf_types::cow_map::{CowMapEntry, MapFormat};
    for &size in &[1_000u32, 10_000, 100_000] {
        let mut map = CowMap::new_parent_ref(size);
        for i in (0..size).step_by(5) {
            map.update(i, CowMapEntry::LocalOffset(i as u64 * 4096));
        }
        let (min_us, avg_us, max_us) = bench_iterations(
            || {
                let start = Instant::now();
                let bytes = map.serialize();
                let _restored = CowMap::deserialize(&bytes, MapFormat::FlatArray).unwrap();
                start.elapsed().as_micros()
            },
            5,
        );
        // Extra serialize outside the timed loop, purely to report the size
        let wire_size = map.serialize().len();
        println!(
            "BENCH: cowmap_serde(size={size}, wire_bytes={wire_size}): min={min_us}us avg={avg_us}us max={max_us}us"
        );
    }
}
// =============================================================================
// BENCHMARK 9: ADR-031 Acceptance Benchmark
// =============================================================================
/// End-to-end acceptance benchmark for ADR-031: populate a base store,
/// branch it, and report branch latency, child/parent size ratio, COW
/// statistics, membership-filter contents, and a full membership scan.
#[test]
#[ignore]
fn bench_adr031_acceptance() {
    println!("\n=== BENCH: ADR-031 Acceptance ===");
    let dir = TempDir::new().unwrap();
    let dim: u16 = 32;
    let vector_count = 10_000u64;
    let _modify_count = 500u64;
    // Step 1: Create base store with many vectors (batched ingest)
    let base_path = dir.path().join("adr031_base.rvf");
    let mut base = RvfStore::create(&base_path, make_options(dim)).unwrap();
    let batch_size = 2000;
    let mut id_counter = 0u64;
    while id_counter < vector_count {
        let n = std::cmp::min(batch_size, (vector_count - id_counter) as usize);
        let vecs: Vec<Vec<f32>> = (0..n)
            .map(|i| random_vector(dim as usize, id_counter + i as u64))
            .collect();
        let refs: Vec<&[f32]> = vecs.iter().map(|v| v.as_slice()).collect();
        let ids: Vec<u64> = (id_counter..id_counter + n as u64).collect();
        base.ingest_batch(&refs, &ids, None).unwrap();
        id_counter += n as u64;
    }
    let base_size = std::fs::metadata(&base_path).unwrap().len();
    println!("BENCH: adr031: base_store: {vector_count} vectors, {base_size} bytes");
    // Step 2: Branch and time it (single measurement, no warm-up)
    let child_path = dir.path().join("adr031_child.rvf");
    let branch_start = Instant::now();
    let child = base.branch(&child_path).unwrap();
    let branch_us = branch_start.elapsed().as_micros();
    let child_size_before = std::fs::metadata(&child_path).unwrap().len();
    println!("BENCH: adr031: branch_time: {branch_us}us");
    println!(
        "BENCH: adr031: child_before_writes: {child_size_before} bytes ({:.1}% of parent)",
        child_size_before as f64 / base_size as f64 * 100.0
    );
    // Step 3: Verify COW stats
    let stats = child.cow_stats().unwrap();
    println!(
        "BENCH: adr031: cow_stats: clusters={}, local={}, inherited={}",
        stats.cluster_count,
        stats.local_cluster_count,
        stats.cluster_count - stats.local_cluster_count
    );
    // Step 4: Verify membership filter
    let filter = child.membership_filter().unwrap();
    println!(
        "BENCH: adr031: membership: capacity={}, members={}",
        filter.vector_count(),
        filter.member_count()
    );
    // Step 5: Verify child size << parent size
    assert!(
        child_size_before < base_size,
        "child ({child_size_before}) should be smaller than parent ({base_size})"
    );
    let savings_pct = (1.0 - child_size_before as f64 / base_size as f64) * 100.0;
    println!("BENCH: adr031: space_savings: {savings_pct:.1}%");
    // Step 6: Spot-check some membership queries (every id, timed as one scan)
    let spot_start = Instant::now();
    let mut visible = 0u64;
    for vid in 0..vector_count {
        if filter.contains(vid) {
            visible += 1;
        }
    }
    let spot_us = spot_start.elapsed().as_micros();
    println!(
        "BENCH: adr031: membership_scan({vector_count} checks): {spot_us}us, {visible} visible"
    );
    child.close().unwrap();
    base.close().unwrap();
    println!("BENCH: adr031: PASS");
}
// -- Utility ------------------------------------------------------------------
/// Pseudo-random `u64` used to build unique temp-file names in benchmarks.
///
/// Hashes the current `Instant` together with a process-wide monotonically
/// increasing counter. Hashing the clock alone can return the same value on
/// consecutive calls when the platform's clock resolution is coarse, which
/// would collide the `child_{}.rvf` names in `bench_cow_branch_creation`;
/// the counter makes the hasher input distinct on every call.
fn rand_u64() -> u64 {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};
    use std::sync::atomic::{AtomicU64, Ordering};
    static COUNTER: AtomicU64 = AtomicU64::new(0);
    let mut h = DefaultHasher::new();
    Instant::now().hash(&mut h);
    COUNTER.fetch_add(1, Ordering::Relaxed).hash(&mut h);
    h.finish()
}

View File

@@ -0,0 +1,384 @@
//! Integration tests for the RVF COW (copy-on-write) branching system.
//!
//! Tests the core branching flow: creating a base store, deriving a child,
//! verifying COW statistics, write coalescing, and parent immutability.
use rvf_runtime::options::{DistanceMetric, RvfOptions};
use rvf_runtime::RvfStore;
use tempfile::TempDir;
// ---------------------------------------------------------------------------
// Helper: make RvfStore options
// ---------------------------------------------------------------------------
fn make_options(dim: u16) -> RvfOptions {
RvfOptions {
dimension: dim,
metric: DistanceMetric::L2,
..Default::default()
}
}
// ---------------------------------------------------------------------------
// Helper: random-ish vector for testing
// ---------------------------------------------------------------------------
/// Deterministic pseudo-random vector of `dim` floats derived from `seed`.
///
/// Uses a 64-bit LCG (Knuth MMIX constants); the top 31 bits of each state
/// are scaled by 1/u32::MAX and shifted down by 0.5. Same (dim, seed) always
/// yields the same vector.
fn random_vector(dim: usize, seed: u64) -> Vec<f32> {
    let mut state = seed;
    (0..dim)
        .map(|_| {
            state = state
                .wrapping_mul(6364136223846793005)
                .wrapping_add(1442695040888963407);
            ((state >> 33) as f32) / (u32::MAX as f32) - 0.5
        })
        .collect()
}
// ===========================================================================
// TEST 1: basic_branch_creation
// ===========================================================================
/// Create a base store with vectors, branch it, and verify the child
/// is a COW child with correct statistics (no local clusters yet, some
/// inherited clusters, not frozen, parent path and membership filter set).
#[test]
fn basic_branch_creation() {
    let dir = TempDir::new().unwrap();
    let base_path = dir.path().join("base.rvf");
    let child_path = dir.path().join("child.rvf");
    let dim: u16 = 4;
    // Populate a base store with 20 constant vectors, ids 1..=20.
    let mut base = RvfStore::create(&base_path, make_options(dim)).unwrap();
    let vectors: Vec<Vec<f32>> = (0..20).map(|i| vec![i as f32; dim as usize]).collect();
    let refs: Vec<&[f32]> = vectors.iter().map(Vec::as_slice).collect();
    let ids: Vec<u64> = (1..=20).collect();
    base.ingest_batch(&refs, &ids, None).unwrap();
    // Derive a COW child from the base and inspect its state.
    let child = base.branch(&child_path).unwrap();
    assert!(child.is_cow_child(), "child should be a COW child");
    let stats = child.cow_stats().expect("child should have COW stats");
    assert_eq!(
        stats.local_cluster_count, 0,
        "new branch should have no local clusters yet"
    );
    assert!(
        stats.cluster_count > 0,
        "branch should have inherited clusters"
    );
    assert!(!stats.frozen, "new branch should not be frozen");
    assert!(
        child.parent_path().is_some(),
        "child should have a parent path"
    );
    assert!(
        child.membership_filter().is_some(),
        "child should have a membership filter"
    );
    child.close().unwrap();
    base.close().unwrap();
    println!("PASS: basic_branch_creation");
}
// ===========================================================================
// TEST 2: branch_inherits_vectors_via_query
// ===========================================================================
/// A new branch must expose the parent's vectors through its membership
/// filter.
///
/// The filter created by branch() has capacity equal to the parent's
/// total_vecs count, so vector IDs here are contiguous from 0 to stay
/// representable in the filter bitmap.
#[test]
fn branch_inherits_vectors_via_query() {
    let tmp = TempDir::new().unwrap();
    let parent_file = tmp.path().join("base_q.rvf");
    let branch_file = tmp.path().join("child_q.rvf");
    let dim: u16 = 4;

    // Parent store with three unit vectors at contiguous ids 0..=2.
    let mut parent = RvfStore::create(&parent_file, make_options(dim)).unwrap();
    let a = vec![1.0, 0.0, 0.0, 0.0];
    let b = vec![0.0, 1.0, 0.0, 0.0];
    let c = vec![0.0, 0.0, 1.0, 0.0];
    parent
        .ingest_batch(&[a.as_slice(), b.as_slice(), c.as_slice()], &[0, 1, 2], None)
        .unwrap();

    let branch = parent.branch(&branch_file).unwrap();

    // Every parent vector must be visible through the child's filter.
    let filter = branch.membership_filter().unwrap();
    assert!(filter.contains(0), "filter should include vector 0");
    assert!(filter.contains(1), "filter should include vector 1");
    assert!(filter.contains(2), "filter should include vector 2");
    assert_eq!(filter.member_count(), 3, "filter should have 3 members");

    branch.close().unwrap();
    parent.close().unwrap();
    println!("PASS: branch_inherits_vectors_via_query");
}
// ===========================================================================
// TEST 3: cow_stats_reflect_local_and_inherited
// ===========================================================================
/// CowStats on a fresh branch must show inherited clusters from the parent
/// and zero locally-owned clusters.
#[test]
fn cow_stats_reflect_local_and_inherited() {
    let tmp = TempDir::new().unwrap();
    let parent_file = tmp.path().join("base_stats.rvf");
    let branch_file = tmp.path().join("child_stats.rvf");
    let dim: u16 = 4;

    // Enough vectors that the parent builds at least one cluster.
    let mut parent = RvfStore::create(&parent_file, make_options(dim)).unwrap();
    let data: Vec<Vec<f32>> = (0..50).map(|i| vec![i as f32; dim as usize]).collect();
    let slices: Vec<&[f32]> = data.iter().map(Vec::as_slice).collect();
    let id_list: Vec<u64> = (1..=50).collect();
    parent.ingest_batch(&slices, &id_list, None).unwrap();

    let branch = parent.branch(&branch_file).unwrap();
    let stats = branch.cow_stats().unwrap();

    // Inherited = total minus locally-owned; a fresh branch owns nothing.
    let inherited_clusters = stats.cluster_count - stats.local_cluster_count;
    assert!(
        inherited_clusters > 0,
        "child should have inherited clusters from parent"
    );
    assert_eq!(
        stats.local_cluster_count, 0,
        "fresh branch has no local clusters"
    );

    branch.close().unwrap();
    parent.close().unwrap();
    println!("PASS: cow_stats_reflect_local_and_inherited");
}
// ===========================================================================
// TEST 4: parent_unmodified_after_branch
// ===========================================================================
/// Branching must leave the parent store untouched: same vector count,
/// same epoch, and the parent never becomes a COW child itself.
#[test]
fn parent_unmodified_after_branch() {
    let tmp = TempDir::new().unwrap();
    let parent_file = tmp.path().join("base_parent.rvf");
    let branch_file = tmp.path().join("child_parent.rvf");
    let dim: u16 = 4;

    let mut parent = RvfStore::create(&parent_file, make_options(dim)).unwrap();
    let vector = vec![1.0, 2.0, 3.0, 4.0];
    parent
        .ingest_batch(&[vector.as_slice()], &[100], None)
        .unwrap();

    // Snapshot the parent's state before branching.
    let before = parent.status();
    let total_before = before.total_vectors;
    let epoch_before = before.current_epoch;

    // Create the branch and immediately close it.
    parent.branch(&branch_file).unwrap().close().unwrap();

    // The parent's observable state must be identical afterwards.
    let after = parent.status();
    assert_eq!(
        after.total_vectors, total_before,
        "parent vector count should be unchanged after branch"
    );
    assert_eq!(
        after.current_epoch, epoch_before,
        "parent epoch should be unchanged after branch"
    );
    assert!(!parent.is_cow_child(), "parent should not become a COW child");

    parent.close().unwrap();
    println!("PASS: parent_unmodified_after_branch");
}
// ===========================================================================
// TEST 5: child_size_smaller_than_parent
// ===========================================================================
/// A freshly-branched child (with no local writes) must occupy far less
/// disk space than its fully-populated parent.
#[test]
fn child_size_smaller_than_parent() {
    let tmp = TempDir::new().unwrap();
    let parent_file = tmp.path().join("base_size.rvf");
    let branch_file = tmp.path().join("child_size.rvf");
    let dim: u16 = 32;

    // A couple hundred 32-d vectors so the parent file has real bulk.
    let mut parent = RvfStore::create(&parent_file, make_options(dim)).unwrap();
    let data: Vec<Vec<f32>> = (0..200).map(|i| random_vector(dim as usize, i)).collect();
    let slices: Vec<&[f32]> = data.iter().map(Vec::as_slice).collect();
    let id_list: Vec<u64> = (1..=200).collect();
    parent.ingest_batch(&slices, &id_list, None).unwrap();

    parent.branch(&branch_file).unwrap().close().unwrap();
    parent.close().unwrap();

    // The branch carries no vector payload of its own, so it must be smaller.
    let base_size = std::fs::metadata(&parent_file).unwrap().len();
    let child_size = std::fs::metadata(&branch_file).unwrap().len();
    assert!(
        child_size < base_size,
        "child file ({child_size} bytes) should be smaller than parent ({base_size} bytes)"
    );
    println!("PASS: child_size_smaller_than_parent -- parent={base_size}, child={child_size}");
}
// ===========================================================================
// TEST 6: freeze_prevents_further_writes
// ===========================================================================
/// After freeze(), any further ingest attempt must be rejected.
#[test]
fn freeze_prevents_further_writes() {
    let tmp = TempDir::new().unwrap();
    let file = tmp.path().join("freeze.rvf");
    let dim: u16 = 4;

    let mut store = RvfStore::create(&file, make_options(dim)).unwrap();
    let first = vec![1.0f32; dim as usize];
    store.ingest_batch(&[first.as_slice()], &[1], None).unwrap();

    // Freeze, then attempt a second ingest — it must fail.
    store.freeze().unwrap();
    let second = vec![2.0f32; dim as usize];
    assert!(
        store.ingest_batch(&[second.as_slice()], &[2], None).is_err(),
        "ingesting after freeze should fail"
    );
    println!("PASS: freeze_prevents_further_writes");
}
// ===========================================================================
// TEST 7: derive_creates_lineage
// ===========================================================================
/// Deriving a child store sets up proper lineage: parent_id, parent_hash,
/// and lineage_depth.
#[test]
fn derive_creates_lineage() {
    let dir = TempDir::new().unwrap();
    let base_path = dir.path().join("base_lineage.rvf");
    let child_path = dir.path().join("child_lineage.rvf");
    let dim: u16 = 4;
    // Root store: creation assigns it a non-zero file_id at depth 0.
    let mut base = RvfStore::create(&base_path, make_options(dim)).unwrap();
    let v = vec![1.0f32; dim as usize];
    base.ingest_batch(&[v.as_slice()], &[1], None).unwrap();
    let base_file_id = *base.file_id();
    assert_ne!(base_file_id, [0u8; 16], "base should have non-zero file_id");
    assert_eq!(base.lineage_depth(), 0, "base should have lineage_depth 0");
    // Derive a Clone-type child with fresh options of the same dimension.
    let child = base
        .derive(
            &child_path,
            rvf_types::DerivationType::Clone,
            Some(make_options(dim)),
        )
        .unwrap();
    // Verify child lineage: its own identity, linked back to the parent.
    assert_ne!(
        *child.file_id(),
        [0u8; 16],
        "child should have non-zero file_id"
    );
    assert_ne!(
        child.file_id(),
        base.file_id(),
        "child file_id should differ from parent"
    );
    assert_eq!(
        child.parent_id(),
        &base_file_id,
        "child's parent_id should match base's file_id"
    );
    // Depth increments by one per derivation step.
    assert_eq!(
        child.lineage_depth(),
        1,
        "child should have lineage_depth 1"
    );
    // parent_hash should be non-zero (it's a hash of the parent's manifest)
    let parent_hash = child.file_identity().parent_hash;
    assert_ne!(
        parent_hash, [0u8; 32],
        "child's parent_hash should be non-zero"
    );
    child.close().unwrap();
    base.close().unwrap();
    println!("PASS: derive_creates_lineage");
}
// ===========================================================================
// TEST 8: branch_membership_filter_excludes_deleted
// ===========================================================================
/// Branching a store with tombstoned vectors must produce a membership
/// filter that hides the deleted ids while keeping the live ones.
///
/// IDs are contiguous from 0 so they fit inside the MembershipFilter
/// bitmap, whose capacity equals the parent's total_vecs count.
#[test]
fn branch_membership_filter_excludes_deleted() {
    let tmp = TempDir::new().unwrap();
    let parent_file = tmp.path().join("base_del.rvf");
    let branch_file = tmp.path().join("child_del.rvf");
    let dim: u16 = 4;

    let mut parent = RvfStore::create(&parent_file, make_options(dim)).unwrap();
    let data: Vec<Vec<f32>> = (0..5).map(|i| vec![i as f32; dim as usize]).collect();
    let slices: Vec<&[f32]> = data.iter().map(Vec::as_slice).collect();
    let id_list: Vec<u64> = (0..5).collect();
    parent.ingest_batch(&slices, &id_list, None).unwrap();

    // Tombstone ids 1 and 3, then branch.
    parent.delete(&[1, 3]).unwrap();
    let branch = parent.branch(&branch_file).unwrap();
    let filter = branch.membership_filter().unwrap();

    // Live ids stay visible; the deleted ones must be filtered out.
    assert!(filter.contains(0), "vector 0 should be visible");
    assert!(!filter.contains(1), "deleted vector 1 should be excluded");
    assert!(filter.contains(2), "vector 2 should be visible");
    assert!(!filter.contains(3), "deleted vector 3 should be excluded");
    assert!(filter.contains(4), "vector 4 should be visible");
    assert_eq!(filter.member_count(), 3, "3 vectors should be visible");

    branch.close().unwrap();
    parent.close().unwrap();
    println!("PASS: branch_membership_filter_excludes_deleted");
}

View File

@@ -0,0 +1,349 @@
//! Integration tests for RVF COW crash recovery scenarios.
//!
//! Tests that the store can recover from torn writes, truncated files,
//! and other crash scenarios by falling back to earlier valid manifests.
use rvf_runtime::options::{DistanceMetric, QueryOptions, RvfOptions};
use rvf_runtime::RvfStore;
use rvf_types::{SEGMENT_HEADER_SIZE, SEGMENT_MAGIC};
use std::fs::OpenOptions;
use std::io::{Read, Write};
use tempfile::TempDir;
// ---------------------------------------------------------------------------
// Helper: make RvfStore options
// ---------------------------------------------------------------------------
fn make_options(dim: u16) -> RvfOptions {
RvfOptions {
dimension: dim,
metric: DistanceMetric::L2,
..Default::default()
}
}
// ---------------------------------------------------------------------------
// Helper: read entire file into bytes
// ---------------------------------------------------------------------------
/// Read the entire file at `path` into a byte vector.
///
/// Panics if the file cannot be opened or read (acceptable in a test helper).
fn read_file_bytes(path: &std::path::Path) -> Vec<u8> {
    // `std::fs::read` opens the file, sizes the buffer from metadata, and
    // reads it in one call — replacing the manual OpenOptions + read_to_end
    // sequence with the idiomatic stdlib form.
    std::fs::read(path).unwrap()
}
// ---------------------------------------------------------------------------
// Helper: scan file for manifest segments
// ---------------------------------------------------------------------------
/// Scan raw file bytes for manifest segment headers.
///
/// Returns `(byte_offset, segment_id)` for every position whose first four
/// bytes match the little-endian segment magic and whose type byte is 0x05.
/// Field offsets (type at +5, id at +0x08..+0x10) are assumed from the RVF
/// wire layout — TODO confirm against rvf-wire.
fn find_manifest_offsets(file_bytes: &[u8]) -> Vec<(usize, u64)> {
    let magic_bytes = SEGMENT_MAGIC.to_le_bytes();
    let mut manifests = Vec::new();
    // A file shorter than one header cannot contain any segment.
    if file_bytes.len() < SEGMENT_HEADER_SIZE {
        return manifests;
    }
    let last_possible = file_bytes.len() - SEGMENT_HEADER_SIZE;
    for i in 0..=last_possible {
        if file_bytes[i..i + 4] == magic_bytes {
            let seg_type = file_bytes[i + 5];
            if seg_type == 0x05 {
                // Manifest
                let seg_id = u64::from_le_bytes(file_bytes[i + 0x08..i + 0x10].try_into().unwrap());
                manifests.push((i, seg_id));
            }
        }
    }
    manifests
}
// ===========================================================================
// TEST 1: store_survives_garbage_appended
// ===========================================================================
/// Appending random junk bytes to a valid store file must not prevent it
/// from reopening: the manifest scanner should still find the latest valid
/// manifest, and queries should behave normally.
#[test]
fn store_survives_garbage_appended() {
    let tmp = TempDir::new().unwrap();
    let file = tmp.path().join("garbage.rvf");
    let dim: u16 = 4;

    // Build a two-vector store and close it cleanly.
    {
        let mut store = RvfStore::create(&file, make_options(dim)).unwrap();
        let a = vec![1.0, 2.0, 3.0, 4.0];
        let b = vec![5.0, 6.0, 7.0, 8.0];
        store
            .ingest_batch(&[a.as_slice(), b.as_slice()], &[1, 2], None)
            .unwrap();
        store.close().unwrap();
    }

    // Tack arbitrary junk onto the end of the file and sync it to disk.
    {
        let mut f = OpenOptions::new().append(true).open(&file).unwrap();
        f.write_all(&[0xDE, 0xAD, 0xBE, 0xEF, 0xCA, 0xFE, 0xBA, 0xBE])
            .unwrap();
        f.sync_all().unwrap();
    }

    // The store must reopen from the last valid manifest and answer queries.
    let store = RvfStore::open_readonly(&file).unwrap();
    assert_eq!(
        store.status().total_vectors,
        2,
        "store should still report 2 vectors despite garbage appended"
    );
    let probe = vec![1.0, 2.0, 3.0, 4.0];
    let hits = store.query(&probe, 2, &QueryOptions::default()).unwrap();
    assert_eq!(hits.len(), 2, "should find 2 results");
    assert_eq!(hits[0].id, 1, "nearest should be vector 1");
    assert!(hits[0].distance < f32::EPSILON);
    println!("PASS: store_survives_garbage_appended");
}
// ===========================================================================
// TEST 2: truncated_file_at_segment_boundary
// ===========================================================================
/// Create a store, then truncate it at a non-manifest segment boundary.
/// If the manifest segment is still intact, the store should open.
///
/// NOTE(review): despite the banner and the description above, the body
/// currently performs no truncation — it only verifies that at least one
/// manifest segment exists and that the file reopens with its data intact.
/// Either add the truncation step or update this description.
#[test]
fn truncated_file_preserves_early_manifest() {
    let dir = TempDir::new().unwrap();
    let path = dir.path().join("truncated.rvf");
    let dim: u16 = 4;
    // Create store with vectors — this writes a Vec segment then a Manifest
    {
        let mut store = RvfStore::create(&path, make_options(dim)).unwrap();
        let v1 = vec![1.0, 0.0, 0.0, 0.0];
        store.ingest_batch(&[v1.as_slice()], &[1], None).unwrap();
        store.close().unwrap();
    }
    // Scan the raw bytes for manifest segment headers.
    let original_bytes = read_file_bytes(&path);
    let manifests = find_manifest_offsets(&original_bytes);
    // There should be at least one manifest
    assert!(
        !manifests.is_empty(),
        "should find at least one manifest in the file"
    );
    // The file should open fine from the valid manifest
    let store = RvfStore::open_readonly(&path).unwrap();
    assert_eq!(store.status().total_vectors, 1);
    println!("PASS: truncated_file_preserves_early_manifest");
}
// ===========================================================================
// TEST 3: multiple_manifests_last_wins
// ===========================================================================
/// Two separate ingest batches write two manifests; on reopen, the newest
/// manifest (reflecting both batches) must be the one in effect.
#[test]
fn multiple_manifests_last_wins() {
    let tmp = TempDir::new().unwrap();
    let file = tmp.path().join("multi_manifest.rvf");
    let dim: u16 = 4;

    // Each ingest_batch call appends its own manifest segment.
    {
        let mut store = RvfStore::create(&file, make_options(dim)).unwrap();
        let first = vec![1.0, 0.0, 0.0, 0.0];
        store.ingest_batch(&[first.as_slice()], &[1], None).unwrap();
        let second = vec![0.0, 1.0, 0.0, 0.0];
        store
            .ingest_batch(&[second.as_slice()], &[2], None)
            .unwrap();
        store.close().unwrap();
    }

    // The raw file must now hold multiple manifest segments.
    let bytes = read_file_bytes(&file);
    let manifests = find_manifest_offsets(&bytes);
    assert!(
        manifests.len() >= 2,
        "expected at least 2 manifest segments, found {}",
        manifests.len()
    );

    // On reopen, the newest manifest (with both vectors) must win.
    let store = RvfStore::open_readonly(&file).unwrap();
    assert_eq!(
        store.status().total_vectors,
        2,
        "latest manifest should reflect both batches"
    );
    println!("PASS: multiple_manifests_last_wins");
}
// ===========================================================================
// TEST 4: corrupted_trailing_bytes_dont_break_store
// ===========================================================================
/// Write a valid store, then append a partial (truncated) segment header.
/// The store should still open because the manifest scanner can ignore
/// incomplete segments.
#[test]
fn corrupted_trailing_bytes_dont_break_store() {
    let dir = TempDir::new().unwrap();
    let path = dir.path().join("partial_seg.rvf");
    let dim: u16 = 4;
    // Valid one-vector store, cleanly closed.
    {
        let mut store = RvfStore::create(&path, make_options(dim)).unwrap();
        let v = vec![1.0, 2.0, 3.0, 4.0];
        store.ingest_batch(&[v.as_slice()], &[42], None).unwrap();
        store.close().unwrap();
    }
    // Append a partial segment header (only magic + a few bytes, not a full 64-byte header)
    {
        let mut file = OpenOptions::new().append(true).open(&path).unwrap();
        let partial_header = SEGMENT_MAGIC.to_le_bytes();
        file.write_all(&partial_header).unwrap();
        // Add a few more bytes but not enough for a full header
        // (8 bytes total, well short of SEGMENT_HEADER_SIZE).
        file.write_all(&[0x04, 0x01, 0x00, 0x00]).unwrap();
        file.sync_all().unwrap();
    }
    // Reopen should still work
    let store = RvfStore::open_readonly(&path).unwrap();
    assert_eq!(
        store.status().total_vectors,
        1,
        "store should still have 1 vector despite partial segment appended"
    );
    // The surviving vector must still be queryable by its id.
    let query = vec![1.0, 2.0, 3.0, 4.0];
    let results = store.query(&query, 1, &QueryOptions::default()).unwrap();
    assert_eq!(results[0].id, 42);
    println!("PASS: corrupted_trailing_bytes_dont_break_store");
}
// ===========================================================================
// TEST 5: reopened_store_preserves_all_data
// ===========================================================================
/// Verify that close + reopen preserves all vectors and metadata.
#[test]
fn reopened_store_preserves_all_data() {
    let dir = TempDir::new().unwrap();
    let path = dir.path().join("reopen.rvf");
    let dim: u16 = 8;
    // 50 deterministic pseudo-random vectors (LCG seeded by index) so the
    // identical data is available for the post-reopen self-queries below.
    let vectors: Vec<Vec<f32>> = (0..50)
        .map(|i| {
            let mut v = Vec::with_capacity(dim as usize);
            let mut x = i as u64;
            for _ in 0..dim {
                x = x
                    .wrapping_mul(6364136223846793005)
                    .wrapping_add(1442695040888963407);
                v.push(((x >> 33) as f32) / (u32::MAX as f32));
            }
            v
        })
        .collect();
    // Create and populate
    {
        let mut store = RvfStore::create(&path, make_options(dim)).unwrap();
        let refs: Vec<&[f32]> = vectors.iter().map(|v| v.as_slice()).collect();
        let ids: Vec<u64> = (0..50).collect();
        store.ingest_batch(&refs, &ids, None).unwrap();
        store.close().unwrap();
    }
    // Reopen and verify
    {
        let store = RvfStore::open_readonly(&path).unwrap();
        assert_eq!(store.status().total_vectors, 50);
        // Query with each original vector — should find itself as nearest
        // (L2 self-distance is exactly zero, so EPSILON is a safe bound).
        for i in 0..50u64 {
            let results = store
                .query(&vectors[i as usize], 1, &QueryOptions::default())
                .unwrap();
            assert_eq!(
                results.len(),
                1,
                "query for vector {i} should return 1 result"
            );
            assert_eq!(
                results[0].id, i,
                "nearest neighbor for vector {i} should be itself"
            );
            assert!(
                results[0].distance < f32::EPSILON,
                "self-distance for vector {i} should be ~0"
            );
        }
    }
    println!("PASS: reopened_store_preserves_all_data");
}
// ===========================================================================
// TEST 6: deletion_persists_through_reopen
// ===========================================================================
/// Tombstones written before close must still be in effect after the file
/// is reopened: the deleted id is neither counted nor returned by queries.
#[test]
fn deletion_persists_through_reopen() {
    let tmp = TempDir::new().unwrap();
    let file = tmp.path().join("del_persist.rvf");
    let dim: u16 = 4;

    // Ingest three unit vectors, delete id 2, then close.
    {
        let mut store = RvfStore::create(&file, make_options(dim)).unwrap();
        let a = vec![1.0, 0.0, 0.0, 0.0];
        let b = vec![0.0, 1.0, 0.0, 0.0];
        let c = vec![0.0, 0.0, 1.0, 0.0];
        store
            .ingest_batch(
                &[a.as_slice(), b.as_slice(), c.as_slice()],
                &[1, 2, 3],
                None,
            )
            .unwrap();
        store.delete(&[2]).unwrap();
        store.close().unwrap();
    }

    // Reopen: the tombstone must still be applied.
    {
        let store = RvfStore::open_readonly(&file).unwrap();
        assert_eq!(
            store.status().total_vectors,
            2,
            "should have 2 vectors after deletion and reopen"
        );
        // Query with the deleted vector itself; it must not come back.
        let probe = vec![0.0, 1.0, 0.0, 0.0];
        let hits = store.query(&probe, 10, &QueryOptions::default()).unwrap();
        assert_eq!(hits.len(), 2);
        assert!(
            hits.iter().all(|r| r.id != 2),
            "deleted vector 2 should not appear in results"
        );
    }
    println!("PASS: deletion_persists_through_reopen");
}

View File

@@ -0,0 +1,466 @@
//! Cross-platform RVF compatibility tests.
//!
//! Verifies that RVF stores can be serialized to bytes, transferred across
//! boundaries (simulating cross-platform exchange), and re-imported with
//! identical query results. Tests all three distance metrics and verifies
//! segment header preservation across the round-trip.
use rvf_runtime::options::{DistanceMetric, QueryOptions, RvfOptions};
use rvf_runtime::RvfStore;
use rvf_types::{SegmentType, SEGMENT_HEADER_SIZE, SEGMENT_MAGIC};
use std::fs;
use std::io::Read;
use tempfile::TempDir;
/// Deterministic pseudo-random vector generation using an LCG.
///
/// The same `(dim, seed)` pair always produces the same vector, which is
/// what lets the round-trip tests regenerate their query vectors.
fn random_vector(dim: usize, seed: u64) -> Vec<f32> {
    let mut state = seed;
    (0..dim)
        .map(|_| {
            // One LCG step per component; sample from the high bits, which
            // have better statistical quality than the low bits.
            state = state
                .wrapping_mul(6364136223846793005)
                .wrapping_add(1442695040888963407);
            ((state >> 33) as f32) / (u32::MAX as f32) - 0.5
        })
        .collect()
}
fn make_options(dim: u16, metric: DistanceMetric) -> RvfOptions {
RvfOptions {
dimension: dim,
metric,
..Default::default()
}
}
/// Read an entire file into a byte vector.
///
/// Panics if the file cannot be opened or read (acceptable in a test helper).
fn read_file_bytes(path: &std::path::Path) -> Vec<u8> {
    // `std::fs::read` sizes its buffer from file metadata and replaces the
    // manual File::open + read_to_end sequence with the idiomatic stdlib form.
    std::fs::read(path).unwrap()
}
/// Scan the file bytes for all segment headers and return their offsets and types.
///
/// Each result tuple is `(byte_offset, segment_type, segment_id, payload_length)`
/// for every position whose first four bytes match the little-endian segment
/// magic. Field offsets (type at +5, id at +0x08, payload length at +0x10)
/// are assumed from the RVF wire layout — TODO confirm against rvf-wire.
fn scan_segment_headers(file_bytes: &[u8]) -> Vec<(usize, u8, u64, u64)> {
    let magic_bytes = SEGMENT_MAGIC.to_le_bytes();
    let mut results = Vec::new();
    // A file shorter than one header cannot contain any segment.
    if file_bytes.len() < SEGMENT_HEADER_SIZE {
        return results;
    }
    let last_possible = file_bytes.len().saturating_sub(SEGMENT_HEADER_SIZE);
    for i in 0..=last_possible {
        if file_bytes[i..i + 4] == magic_bytes {
            let seg_type = file_bytes[i + 5];
            let seg_id = u64::from_le_bytes(file_bytes[i + 0x08..i + 0x10].try_into().unwrap());
            let payload_len =
                u64::from_le_bytes(file_bytes[i + 0x10..i + 0x18].try_into().unwrap());
            results.push((i, seg_type, seg_id, payload_len));
        }
    }
    results
}
// ---------------------------------------------------------------------------
// TEST 1: Cosine metric export/import round-trip
// ---------------------------------------------------------------------------
/// Populate a cosine-metric store, export its raw bytes, re-import them at
/// a new path, and require the copy to answer the same query with identical
/// ids and (within float tolerance) identical distances.
#[test]
fn cross_platform_cosine_round_trip() {
    let dir = TempDir::new().unwrap();
    let dim: u16 = 32;
    let num_vectors: usize = 200;
    // Phase 1: Create store and populate with vectors.
    let original_path = dir.path().join("original_cosine.rvf");
    let query = random_vector(dim as usize, 999);
    let original_results;
    {
        let mut store =
            RvfStore::create(&original_path, make_options(dim, DistanceMetric::Cosine)).unwrap();
        let vectors: Vec<Vec<f32>> = (0..num_vectors)
            .map(|i| random_vector(dim as usize, i as u64 * 7 + 3))
            .collect();
        let refs: Vec<&[f32]> = vectors.iter().map(|v| v.as_slice()).collect();
        let ids: Vec<u64> = (1..=num_vectors as u64).collect();
        store.ingest_batch(&refs, &ids, None).unwrap();
        store.close().unwrap();
    }
    // Query original for baseline results.
    {
        let store = RvfStore::open_readonly(&original_path).unwrap();
        original_results = store.query(&query, 10, &QueryOptions::default()).unwrap();
        assert!(
            !original_results.is_empty(),
            "original query should return results"
        );
        store.close().unwrap();
    }
    // Phase 2: Export to bytes.
    let exported_bytes = read_file_bytes(&original_path);
    assert!(
        !exported_bytes.is_empty(),
        "exported bytes should not be empty"
    );
    // Phase 3: Re-import from bytes at a new location.
    let reimported_path = dir.path().join("reimported_cosine.rvf");
    fs::write(&reimported_path, &exported_bytes).unwrap();
    // Phase 4: Open re-imported store and verify results match.
    {
        let store = RvfStore::open_readonly(&reimported_path).unwrap();
        let reimported_results = store.query(&query, 10, &QueryOptions::default()).unwrap();
        assert_eq!(
            original_results.len(),
            reimported_results.len(),
            "result count mismatch after re-import"
        );
        // IDs must match exactly; distances within float tolerance.
        for (orig, reimp) in original_results.iter().zip(reimported_results.iter()) {
            assert_eq!(orig.id, reimp.id, "ID mismatch at position");
            assert!(
                (orig.distance - reimp.distance).abs() < 1e-6,
                "distance mismatch for id {}: {} vs {} (delta={})",
                orig.id,
                orig.distance,
                reimp.distance,
                (orig.distance - reimp.distance).abs()
            );
        }
        let status = store.status();
        assert_eq!(
            status.total_vectors, num_vectors as u64,
            "re-imported store should have same vector count"
        );
        store.close().unwrap();
    }
}
// ---------------------------------------------------------------------------
// TEST 2: Euclidean (L2) metric export/import round-trip
// ---------------------------------------------------------------------------
/// Same round-trip as the cosine test, but for the L2 metric: a byte-level
/// copy of the store must answer the same query with identical results.
#[test]
fn cross_platform_l2_round_trip() {
    let tmp = TempDir::new().unwrap();
    let dim: u16 = 16;
    let count: usize = 100;
    let source_path = tmp.path().join("original_l2.rvf");
    let probe = random_vector(dim as usize, 42);

    // Build and persist the source store.
    {
        let mut store =
            RvfStore::create(&source_path, make_options(dim, DistanceMetric::L2)).unwrap();
        let data: Vec<Vec<f32>> = (0..count)
            .map(|i| random_vector(dim as usize, i as u64 * 11 + 5))
            .collect();
        let slices: Vec<&[f32]> = data.iter().map(Vec::as_slice).collect();
        let id_list: Vec<u64> = (1..=count as u64).collect();
        store.ingest_batch(&slices, &id_list, None).unwrap();
        store.close().unwrap();
    }

    // Record baseline query results before the transfer.
    let baseline = {
        let store = RvfStore::open_readonly(&source_path).unwrap();
        let r = store.query(&probe, 10, &QueryOptions::default()).unwrap();
        store.close().unwrap();
        r
    };

    // Byte-level copy simulates a cross-platform transfer.
    let copy_path = tmp.path().join("reimported_l2.rvf");
    fs::write(&copy_path, read_file_bytes(&source_path)).unwrap();

    // The copy must answer the same query identically.
    let store = RvfStore::open_readonly(&copy_path).unwrap();
    let transferred = store.query(&probe, 10, &QueryOptions::default()).unwrap();
    assert_eq!(baseline.len(), transferred.len());
    for (orig, reimp) in baseline.iter().zip(transferred.iter()) {
        assert_eq!(orig.id, reimp.id);
        assert!(
            (orig.distance - reimp.distance).abs() < 1e-6,
            "L2 distance mismatch for id {}: {} vs {}",
            orig.id,
            orig.distance,
            reimp.distance
        );
    }
    store.close().unwrap();
}
// ---------------------------------------------------------------------------
// TEST 3: InnerProduct (dot product) metric export/import round-trip
// ---------------------------------------------------------------------------
/// Round-trip a dot-product-metric store through raw bytes and require the
/// copy to reproduce the original query results exactly.
#[test]
fn cross_platform_inner_product_round_trip() {
    let tmp = TempDir::new().unwrap();
    let dim: u16 = 64;
    let count: usize = 150;
    let source_path = tmp.path().join("original_ip.rvf");
    let probe = random_vector(dim as usize, 7777);

    // Build and persist the source store.
    {
        let mut store = RvfStore::create(
            &source_path,
            make_options(dim, DistanceMetric::InnerProduct),
        )
        .unwrap();
        let data: Vec<Vec<f32>> = (0..count)
            .map(|i| random_vector(dim as usize, i as u64 * 13 + 1))
            .collect();
        let slices: Vec<&[f32]> = data.iter().map(Vec::as_slice).collect();
        let id_list: Vec<u64> = (1..=count as u64).collect();
        store.ingest_batch(&slices, &id_list, None).unwrap();
        store.close().unwrap();
    }

    // Record baseline query results before the transfer.
    let baseline = {
        let store = RvfStore::open_readonly(&source_path).unwrap();
        let r = store.query(&probe, 10, &QueryOptions::default()).unwrap();
        store.close().unwrap();
        r
    };

    // Byte-level copy simulates a cross-platform transfer.
    let copy_path = tmp.path().join("reimported_ip.rvf");
    fs::write(&copy_path, read_file_bytes(&source_path)).unwrap();

    // The copy must answer the same query identically.
    let store = RvfStore::open_readonly(&copy_path).unwrap();
    let transferred = store.query(&probe, 10, &QueryOptions::default()).unwrap();
    assert_eq!(baseline.len(), transferred.len());
    for (orig, reimp) in baseline.iter().zip(transferred.iter()) {
        assert_eq!(orig.id, reimp.id);
        assert!(
            (orig.distance - reimp.distance).abs() < 1e-6,
            "InnerProduct distance mismatch for id {}: {} vs {}",
            orig.id,
            orig.distance,
            reimp.distance
        );
    }
    store.close().unwrap();
}
// ---------------------------------------------------------------------------
// TEST 4: Segment headers are preserved across serialize/deserialize
// ---------------------------------------------------------------------------
/// Byte-copy a store to a new path and require every segment header
/// (offset, type, id, payload length) to survive unchanged, then confirm
/// the copy is still queryable.
#[test]
fn cross_platform_segment_headers_preserved() {
    let dir = TempDir::new().unwrap();
    let dim: u16 = 8;
    let original_path = dir.path().join("seg_headers.rvf");
    // Build a 50-vector store so the file contains several segments.
    {
        let mut store =
            RvfStore::create(&original_path, make_options(dim, DistanceMetric::L2)).unwrap();
        let vectors: Vec<Vec<f32>> = (0..50)
            .map(|i| random_vector(dim as usize, i as u64))
            .collect();
        let refs: Vec<&[f32]> = vectors.iter().map(|v| v.as_slice()).collect();
        let ids: Vec<u64> = (1..=50).collect();
        store.ingest_batch(&refs, &ids, None).unwrap();
        store.close().unwrap();
    }
    // Scan original for segment headers.
    let original_bytes = read_file_bytes(&original_path);
    let original_segments = scan_segment_headers(&original_bytes);
    assert!(
        !original_segments.is_empty(),
        "original file should contain at least one segment"
    );
    // Copy bytes to new location (simulating cross-platform transfer).
    let reimported_path = dir.path().join("seg_headers_copy.rvf");
    fs::write(&reimported_path, &original_bytes).unwrap();
    // Scan re-imported file for segment headers.
    let reimported_bytes = read_file_bytes(&reimported_path);
    let reimported_segments = scan_segment_headers(&reimported_bytes);
    // Segment counts must match.
    assert_eq!(
        original_segments.len(),
        reimported_segments.len(),
        "segment count mismatch: {} vs {}",
        original_segments.len(),
        reimported_segments.len()
    );
    // Each segment header must be identical.
    // Tuple layout: (offset, type, id, payload_length) — see scan_segment_headers.
    for (i, (orig, reimp)) in original_segments
        .iter()
        .zip(reimported_segments.iter())
        .enumerate()
    {
        assert_eq!(
            orig.0, reimp.0,
            "segment {i}: offset mismatch ({} vs {})",
            orig.0, reimp.0
        );
        assert_eq!(
            orig.1, reimp.1,
            "segment {i}: type mismatch ({:#x} vs {:#x})",
            orig.1, reimp.1
        );
        assert_eq!(
            orig.2, reimp.2,
            "segment {i}: id mismatch ({} vs {})",
            orig.2, reimp.2
        );
        assert_eq!(
            orig.3, reimp.3,
            "segment {i}: payload_length mismatch ({} vs {})",
            orig.3, reimp.3
        );
    }
    // Verify the re-imported store is still queryable.
    {
        let store = RvfStore::open_readonly(&reimported_path).unwrap();
        assert_eq!(store.status().total_vectors, 50);
        let query = random_vector(dim as usize, 25);
        let results = store.query(&query, 5, &QueryOptions::default()).unwrap();
        assert_eq!(
            results.len(),
            5,
            "re-imported store should return query results"
        );
        store.close().unwrap();
    }
}
// ---------------------------------------------------------------------------
// TEST 5: All three metrics produce consistent results after round-trip
// ---------------------------------------------------------------------------
/// Run the export/re-import round-trip once per distance metric and require
/// identical top-10 results (ids exact, distances within tolerance) each time.
#[test]
fn cross_platform_all_metrics_consistent() {
    let dir = TempDir::new().unwrap();
    let dim: u16 = 16;
    let num_vectors: usize = 50;
    // (metric, label) pairs; the label keys the per-metric file names.
    let metrics = [
        (DistanceMetric::L2, "l2"),
        (DistanceMetric::Cosine, "cosine"),
        (DistanceMetric::InnerProduct, "dotproduct"),
    ];
    for (metric, label) in &metrics {
        let original_path = dir.path().join(format!("all_{label}.rvf"));
        let query = random_vector(dim as usize, 12345);
        // Create and populate.
        {
            let mut store = RvfStore::create(&original_path, make_options(dim, *metric)).unwrap();
            let vectors: Vec<Vec<f32>> = (0..num_vectors)
                .map(|i| random_vector(dim as usize, i as u64 * 17 + 2))
                .collect();
            let refs: Vec<&[f32]> = vectors.iter().map(|v| v.as_slice()).collect();
            let ids: Vec<u64> = (1..=num_vectors as u64).collect();
            store.ingest_batch(&refs, &ids, None).unwrap();
            store.close().unwrap();
        }
        // Query original.
        let original_results;
        {
            let store = RvfStore::open_readonly(&original_path).unwrap();
            original_results = store.query(&query, 10, &QueryOptions::default()).unwrap();
            store.close().unwrap();
        }
        // Round-trip through bytes.
        let bytes = read_file_bytes(&original_path);
        let reimported_path = dir.path().join(format!("all_{label}_copy.rvf"));
        fs::write(&reimported_path, &bytes).unwrap();
        // Verify results match within tolerance.
        {
            let store = RvfStore::open_readonly(&reimported_path).unwrap();
            let reimported_results = store.query(&query, 10, &QueryOptions::default()).unwrap();
            assert_eq!(
                original_results.len(),
                reimported_results.len(),
                "{label}: result count mismatch"
            );
            for (orig, reimp) in original_results.iter().zip(reimported_results.iter()) {
                assert_eq!(orig.id, reimp.id, "{label}: ID mismatch");
                assert!(
                    (orig.distance - reimp.distance).abs() < 1e-6,
                    "{label}: distance mismatch for id {}: {} vs {} (delta={})",
                    orig.id,
                    orig.distance,
                    reimp.distance,
                    (orig.distance - reimp.distance).abs()
                );
            }
            store.close().unwrap();
        }
    }
}
// ---------------------------------------------------------------------------
// TEST 6: Byte-level file identity after export/import
// ---------------------------------------------------------------------------
#[test]
fn cross_platform_byte_identical_transfer() {
    let tmp = TempDir::new().unwrap();
    let dim: u16 = 4;
    let src = tmp.path().join("byte_ident.rvf");
    // Build a tiny store and close it cleanly.
    {
        let data: Vec<Vec<f32>> = (0..10).map(|i| vec![i as f32; dim as usize]).collect();
        let slices: Vec<&[f32]> = data.iter().map(Vec::as_slice).collect();
        let ids: Vec<u64> = (1..=10).collect();
        let mut store = RvfStore::create(&src, make_options(dim, DistanceMetric::L2)).unwrap();
        store.ingest_batch(&slices, &ids, None).unwrap();
        store.close().unwrap();
    }
    // Transfer: read the file, write it elsewhere, read the copy back.
    let src_bytes = read_file_bytes(&src);
    let dst = tmp.path().join("byte_ident_copy.rvf");
    fs::write(&dst, &src_bytes).unwrap();
    let dst_bytes = read_file_bytes(&dst);
    // A raw byte copy must survive the round trip exactly.
    assert_eq!(
        src_bytes.len(),
        dst_bytes.len(),
        "file sizes should be identical"
    );
    assert_eq!(
        src_bytes, dst_bytes,
        "file bytes should be identical after transfer"
    );
}

View File

@@ -0,0 +1,167 @@
//! Cryptographic signature integration tests.
//!
//! Tests rvf-crypto segment signing and verification, SHAKE-256 hashing,
//! and witness chain integrity.
use ed25519_dalek::SigningKey;
use rand::rngs::OsRng;
use rvf_crypto::hash::{shake256_128, shake256_256};
use rvf_crypto::sign::{sign_segment, verify_segment};
use rvf_crypto::witness::{create_witness_chain, verify_witness_chain, WitnessEntry};
use rvf_types::SegmentHeader;
/// Build a minimal `SegmentHeader` fixture (type 0x01) for signing tests.
fn make_test_header(seg_id: u64) -> SegmentHeader {
    let mut header = SegmentHeader::new(0x01, seg_id);
    header.payload_length = 100;
    header.timestamp_ns = 1_000_000_000;
    header
}
#[test]
fn shake256_hash_deterministic() {
    let input = b"RuVector Format test data";
    // Hashing the same bytes twice must yield the same digest.
    assert_eq!(
        shake256_128(input),
        shake256_128(input),
        "SHAKE-256 should be deterministic"
    );
}
#[test]
fn shake256_different_inputs_different_hashes() {
    // Distinct inputs must not collide on a 128-bit digest.
    let digest_a = shake256_128(b"input A");
    let digest_b = shake256_128(b"input B");
    assert_ne!(
        digest_a, digest_b,
        "different inputs should produce different hashes"
    );
}
#[test]
fn shake256_128_is_prefix_of_256() {
    let data = b"consistency check";
    let short = shake256_128(data);
    let long = shake256_256(data);
    assert_eq!(short.len(), 16, "SHAKE-256-128 should produce 16 bytes");
    assert_eq!(long.len(), 32, "SHAKE-256-256 should produce 32 bytes");
    // SHAKE-256 is an XOF: a shorter output is a prefix of a longer one
    // for the same input.
    assert_eq!(
        &short[..],
        &long[..16],
        "128-bit should be prefix of 256-bit"
    );
}
#[test]
fn sign_and_verify_segment_ed25519() {
    let signing_key = SigningKey::generate(&mut OsRng);
    let header = make_test_header(42);
    let payload = b"segment payload containing vectors";
    // Sign, then verify with the matching public key.
    let footer = sign_segment(&header, payload, &signing_key);
    assert!(
        verify_segment(&header, payload, &footer, &signing_key.verifying_key()),
        "valid signature should verify"
    );
}
#[test]
fn verify_fails_on_corrupted_payload() {
    let signing_key = SigningKey::generate(&mut OsRng);
    let header = make_test_header(1);
    let footer = sign_segment(&header, b"original payload", &signing_key);
    // Verification over different payload bytes must be rejected.
    assert!(
        !verify_segment(
            &header,
            b"corrupted payload",
            &footer,
            &signing_key.verifying_key()
        ),
        "corrupted payload should fail verification"
    );
}
#[test]
fn verify_fails_on_wrong_key() {
    let signer = SigningKey::generate(&mut OsRng);
    let other = SigningKey::generate(&mut OsRng);
    let header = make_test_header(1);
    let payload = b"payload data";
    let footer = sign_segment(&header, payload, &signer);
    // A signature must verify only under the key that produced it.
    assert!(
        !verify_segment(&header, payload, &footer, &other.verifying_key()),
        "wrong public key should fail verification"
    );
}
#[test]
fn verify_fails_on_tampered_header() {
    let signing_key = SigningKey::generate(&mut OsRng);
    let header = make_test_header(42);
    let payload = b"payload";
    let footer = sign_segment(&header, payload, &signing_key);
    // Mutating a signed header field must invalidate the signature.
    let mut tampered = header;
    tampered.segment_id = 999;
    assert!(
        !verify_segment(&tampered, payload, &footer, &signing_key.verifying_key()),
        "tampered header should fail verification"
    );
}
#[test]
fn witness_chain_create_and_verify() {
    // Five entries with distinct action hashes and timestamps.
    let entries: Vec<WitnessEntry> = (0..5)
        .map(|i| WitnessEntry {
            prev_hash: [0u8; 32],
            action_hash: shake256_256(&[i as u8]),
            timestamp_ns: 1_000_000_000 + i as u64,
            witness_type: 0x01,
        })
        .collect();
    let chain = create_witness_chain(&entries);
    assert!(!chain.is_empty());
    // Round trip: every decoded entry matches its source entry.
    let decoded = verify_witness_chain(&chain).unwrap();
    assert_eq!(decoded.len(), entries.len());
    for (got, want) in decoded.iter().zip(entries.iter()) {
        assert_eq!(got.action_hash, want.action_hash);
        assert_eq!(got.timestamp_ns, want.timestamp_ns);
    }
}
#[test]
fn witness_chain_detects_tampering() {
    // Build a small, valid three-entry chain first.
    let entries: Vec<WitnessEntry> = (0..3)
        .map(|i| WitnessEntry {
            prev_hash: [0u8; 32],
            action_hash: shake256_256(&[i as u8]),
            timestamp_ns: 1_000_000_000 + i as u64,
            witness_type: 0x01,
        })
        .collect();
    let mut chain = create_witness_chain(&entries);
    // Tamper with the second entry's action_hash (offset 73 is start of entry 1,
    // action_hash is at offset +32 within entry).
    // NOTE(review): the literal 73 encodes the wire size of one serialized
    // entry -- if the witness-entry layout ever changes, this offset (and
    // the +32 field offset) must be updated in lockstep with rvf-crypto.
    chain[73 + 32] ^= 0xFF;
    assert!(
        verify_witness_chain(&chain).is_err(),
        "tampered chain should fail verification"
    );
}
#[test]
fn witness_chain_empty_is_valid() {
    // A chain built from zero entries encodes to nothing and verifies
    // back to an empty list, not an error.
    let chain = create_witness_chain(&[]);
    assert!(chain.is_empty());
    assert!(verify_witness_chain(&chain).unwrap().is_empty());
}

View File

@@ -0,0 +1,334 @@
//! Crash safety end-to-end tests.
//!
//! Simulates crash scenarios by truncating files mid-write, corrupting
//! manifest checksums, and introducing partial segment data. Verifies that
//! the RVF runtime recovers to the last valid state.
use rvf_runtime::options::{DistanceMetric, RvfOptions};
use rvf_runtime::RvfStore;
use rvf_types::{SegmentFlags, SegmentType, SEGMENT_HEADER_SIZE, SEGMENT_MAGIC, SEGMENT_VERSION};
use rvf_wire::{find_latest_manifest, read_segment, validate_segment, write_segment};
use std::fs;
use std::io::Write;
use tempfile::TempDir;
fn make_options(dim: u16) -> RvfOptions {
RvfOptions {
dimension: dim,
metric: DistanceMetric::L2,
..Default::default()
}
}
/// Deterministic pseudo-random vector from an LCG seeded with `seed`.
/// Components fall in [-0.5, 0); the same (dim, seed) pair always
/// reproduces the same vector, which the tests rely on for probes.
fn random_vector(dim: usize, seed: u64) -> Vec<f32> {
    let mut state = seed;
    (0..dim)
        .map(|_| {
            state = state
                .wrapping_mul(6364136223846793005)
                .wrapping_add(1442695040888963407);
            ((state >> 33) as f32) / (u32::MAX as f32) - 0.5
        })
        .collect()
}
// --------------------------------------------------------------------------
// 1. Truncate file after initial 1000 vectors, reopen recovers
// --------------------------------------------------------------------------
#[test]
fn crash_truncate_after_valid_state_recovers() {
    let tmp = TempDir::new().unwrap();
    let path = tmp.path().join("crash_trunc.rvf");
    let dim: u16 = 8;
    // Populate and cleanly close a store with 100 vectors.
    {
        let data: Vec<Vec<f32>> = (0..100).map(|i| random_vector(dim as usize, i)).collect();
        let slices: Vec<&[f32]> = data.iter().map(Vec::as_slice).collect();
        let ids: Vec<u64> = (1..=100).collect();
        let mut store = RvfStore::create(&path, make_options(dim)).unwrap();
        store.ingest_batch(&slices, &ids, None).unwrap();
        store.close().unwrap();
    }
    let clean_size = fs::metadata(&path).unwrap().len();
    // Simulate a crash mid-write: append a few garbage bytes to the tail.
    {
        let mut file = fs::OpenOptions::new().append(true).open(&path).unwrap();
        file.write_all(&[0xDE, 0xAD, 0xBE, 0xEF, 0x00, 0x01, 0x02, 0x03])
            .unwrap();
    }
    // The file grew past its last clean state.
    assert!(fs::metadata(&path).unwrap().len() > clean_size);
    // Recovery path: a raw scan must still locate the last valid manifest,
    // ignoring the trailing garbage.
    let raw = fs::read(&path).unwrap();
    assert!(
        find_latest_manifest(&raw).is_ok(),
        "should find valid manifest despite trailing garbage"
    );
}
// --------------------------------------------------------------------------
// 2. Truncate mid-segment: orphan segment ignored
// --------------------------------------------------------------------------
#[test]
fn crash_partial_segment_at_tail_is_harmless() {
    let dir = TempDir::new().unwrap();
    let path = dir.path().join("partial_seg.rvf");
    let dim: u16 = 4;
    // Create and close a valid store.
    {
        let mut store = RvfStore::create(&path, make_options(dim)).unwrap();
        let vectors: Vec<Vec<f32>> = (0..50).map(|i| vec![i as f32; dim as usize]).collect();
        let refs: Vec<&[f32]> = vectors.iter().map(|v| v.as_slice()).collect();
        let ids: Vec<u64> = (1..=50).collect();
        store.ingest_batch(&refs, &ids, None).unwrap();
        store.close().unwrap();
    }
    // Append an incomplete segment (just a header, no full payload).
    // This models a crash that happened after the header hit disk but
    // before any payload bytes were written.
    {
        let mut file = fs::OpenOptions::new().append(true).open(&path).unwrap();
        // Write a valid-looking header but with declared payload that does not exist.
        let mut fake_header = [0u8; SEGMENT_HEADER_SIZE];
        fake_header[0..4].copy_from_slice(&SEGMENT_MAGIC.to_le_bytes());
        fake_header[4] = SEGMENT_VERSION;
        fake_header[5] = SegmentType::Vec as u8;
        // Declare a payload of 1000 bytes but only write the header.
        // NOTE(review): 0x10 assumes payload_length sits at header offset
        // 0x10 in the wire layout -- keep in sync with rvf-wire/rvf-types.
        fake_header[0x10..0x18].copy_from_slice(&1000u64.to_le_bytes());
        file.write_all(&fake_header).unwrap();
    }
    // The runtime should still find the prior valid manifest when reading raw.
    let file_bytes = fs::read(&path).unwrap();
    let result = find_latest_manifest(&file_bytes);
    assert!(
        result.is_ok(),
        "should find previous manifest despite orphan segment"
    );
}
// --------------------------------------------------------------------------
// 3. Corrupt manifest checksum, fallback to previous manifest
// --------------------------------------------------------------------------
#[test]
fn crash_corrupted_manifest_checksum_fallback() {
    // Build a raw file with two manifest segments.
    let mut file_bytes = Vec::new();
    // VEC_SEG with some data.
    let payload = vec![42u8; 200];
    let vec_seg = write_segment(SegmentType::Vec as u8, &payload, SegmentFlags::empty(), 1);
    file_bytes.extend_from_slice(&vec_seg);
    // First (older) manifest.
    let m1_payload = vec![0x01u8; 64];
    let m1 = write_segment(
        SegmentType::Manifest as u8,
        &m1_payload,
        SegmentFlags::empty(),
        10,
    );
    file_bytes.extend_from_slice(&m1);
    // More VEC data.
    let vec_seg2 = write_segment(
        SegmentType::Vec as u8,
        &[0u8; 100],
        SegmentFlags::empty(),
        2,
    );
    file_bytes.extend_from_slice(&vec_seg2);
    // Second (latest) manifest -- we will corrupt this one.
    let m2_offset = file_bytes.len();
    let m2_payload = vec![0x02u8; 64];
    let m2 = write_segment(
        SegmentType::Manifest as u8,
        &m2_payload,
        SegmentFlags::empty(),
        20,
    );
    file_bytes.extend_from_slice(&m2);
    // Corrupt the latest manifest's content hash (at offset 0x28..0x38 in its header).
    // NOTE(review): 0x28 hard-codes the content-hash field position in the
    // segment header wire layout -- keep in sync with rvf-wire/rvf-types.
    let hash_offset = m2_offset + 0x28;
    file_bytes[hash_offset] ^= 0xFF;
    file_bytes[hash_offset + 1] ^= 0xFF;
    // The corrupted manifest should fail validation.
    let (header, payload_data) = read_segment(&file_bytes[m2_offset..]).unwrap();
    assert!(
        validate_segment(&header, payload_data).is_err(),
        "corrupted manifest should fail validation"
    );
    // But the tail scan should still find a manifest (possibly the corrupted one,
    // since find_latest_manifest does not validate checksums -- it only finds
    // the structural offset). The key behavior is that the format supports
    // fallback via the scan mechanism.
    let scan_result = find_latest_manifest(&file_bytes);
    assert!(
        scan_result.is_ok(),
        "tail scan should still find a manifest segment"
    );
}
// --------------------------------------------------------------------------
// 4. Zero-fill tail detected as invalid
// --------------------------------------------------------------------------
#[test]
fn crash_zero_fill_tail_detected() {
    // One valid VEC_SEG followed by a valid manifest.
    let mut file_bytes = Vec::new();
    file_bytes.extend_from_slice(&write_segment(
        SegmentType::Vec as u8,
        &[1u8; 128],
        SegmentFlags::empty(),
        1,
    ));
    file_bytes.extend_from_slice(&write_segment(
        SegmentType::Manifest as u8,
        &[0u8; 64],
        SegmentFlags::empty(),
        2,
    ));
    // Append 256 zero bytes (simulating zero-fill from crash).
    let zero_start = file_bytes.len();
    file_bytes.extend_from_slice(&[0u8; 256]);
    // The zeroed tail must not parse as a segment...
    assert!(
        read_segment(&file_bytes[zero_start..]).is_err(),
        "zero-filled region should not parse as a valid segment"
    );
    // ...while the manifest written before it stays discoverable.
    let result = find_latest_manifest(&file_bytes);
    assert!(result.is_ok(), "should find manifest before zero-fill tail");
}
// --------------------------------------------------------------------------
// 5. Valid store survives append of random noise
// --------------------------------------------------------------------------
#[test]
fn crash_random_noise_appended_no_data_loss() {
    let tmp = TempDir::new().unwrap();
    let path = tmp.path().join("noise.rvf");
    let dim: u16 = 4;
    // Build and cleanly close a small store.
    {
        let data: Vec<Vec<f32>> = (0..30).map(|i| vec![i as f32; dim as usize]).collect();
        let slices: Vec<&[f32]> = data.iter().map(Vec::as_slice).collect();
        let ids: Vec<u64> = (1..=30).collect();
        let mut store = RvfStore::create(&path, make_options(dim)).unwrap();
        store.ingest_batch(&slices, &ids, None).unwrap();
        store.close().unwrap();
    }
    // Append deterministic pseudo-noise (simulating a torn write).
    {
        let noise: Vec<u8> = (0..200).map(|i| (i * 37 + 13) as u8).collect();
        let mut file = fs::OpenOptions::new().append(true).open(&path).unwrap();
        file.write_all(&noise).unwrap();
    }
    // The last committed manifest must still be discoverable by a raw scan.
    let raw = fs::read(&path).unwrap();
    assert!(
        find_latest_manifest(&raw).is_ok(),
        "manifest should still be findable after random noise appended"
    );
}
// --------------------------------------------------------------------------
// 6. Segment hash validation catches single-byte corruption
// --------------------------------------------------------------------------
#[test]
fn crash_segment_hash_catches_corruption() {
    let payload = b"critical vector data for recovery testing";
    let encoded = write_segment(SegmentType::Vec as u8, payload, SegmentFlags::empty(), 42);
    let (header, _) = read_segment(&encoded).unwrap();
    let payload_span = SEGMENT_HEADER_SIZE..SEGMENT_HEADER_SIZE + payload.len();
    // Flip a single payload bit and revalidate against the header's hash.
    let mut corrupted = encoded.clone();
    corrupted[SEGMENT_HEADER_SIZE] ^= 0x01;
    assert!(
        validate_segment(&header, &corrupted[payload_span.clone()]).is_err(),
        "single-byte corruption should be detected by hash validation"
    );
    // The pristine bytes continue to validate.
    assert!(
        validate_segment(&header, &encoded[payload_span]).is_ok(),
        "uncorrupted segment should pass validation"
    );
}
// --------------------------------------------------------------------------
// 7. Multiple segments: corruption isolated to affected segment
// --------------------------------------------------------------------------
#[test]
fn crash_corruption_isolated_to_single_segment() {
    // Concatenate three independent VEC segments into one buffer,
    // remembering where each one starts.
    let payloads: [&[u8]; 3] = [
        b"segment alpha data",
        b"segment bravo data",
        b"segment charlie data",
    ];
    let mut file = Vec::new();
    let mut offsets = Vec::new();
    for (i, p) in payloads.iter().enumerate() {
        offsets.push(file.len());
        file.extend_from_slice(&write_segment(
            SegmentType::Vec as u8,
            p,
            SegmentFlags::empty(),
            (i + 1) as u64,
        ));
    }
    // Corrupt only the middle segment's payload.
    file[offsets[1] + SEGMENT_HEADER_SIZE] ^= 0xFF;
    // Segment A should still validate.
    let (hdr_a, pay_a) = read_segment(&file[offsets[0]..]).unwrap();
    assert!(
        validate_segment(&hdr_a, pay_a).is_ok(),
        "segment A should be intact"
    );
    // Segment B should fail validation.
    let (hdr_b, pay_b) = read_segment(&file[offsets[1]..]).unwrap();
    assert!(
        validate_segment(&hdr_b, pay_b).is_err(),
        "segment B should be corrupted"
    );
    // Segment C should still validate.
    let (hdr_c, pay_c) = read_segment(&file[offsets[2]..]).unwrap();
    assert!(
        validate_segment(&hdr_c, pay_c).is_ok(),
        "segment C should be intact"
    );
}

View File

@@ -0,0 +1,354 @@
//! Multi-segment file end-to-end tests.
//!
//! Verifies correct behavior when a store contains many VEC_SEGs from
//! repeated ingest operations: all vectors are queryable, compaction
//! merges segments, and deletions work correctly across segments.
use rvf_runtime::options::{DistanceMetric, QueryOptions, RvfOptions};
use rvf_runtime::RvfStore;
use tempfile::TempDir;
fn make_options(dim: u16) -> RvfOptions {
RvfOptions {
dimension: dim,
metric: DistanceMetric::L2,
..Default::default()
}
}
/// Deterministic pseudo-random vector from an LCG seeded with `seed`.
/// Components fall in [-0.5, 0); the same (dim, seed) pair always
/// reproduces the same vector, which the tests rely on for probes.
fn random_vector(dim: usize, seed: u64) -> Vec<f32> {
    let mut state = seed;
    (0..dim)
        .map(|_| {
            state = state
                .wrapping_mul(6364136223846793005)
                .wrapping_add(1442695040888963407);
            ((state >> 33) as f32) / (u32::MAX as f32) - 0.5
        })
        .collect()
}
// --------------------------------------------------------------------------
// 1. Ingest 100 vectors 20 times, creating 20 VEC_SEGs
// --------------------------------------------------------------------------
#[test]
fn multi_seg_twenty_batches_all_queryable() {
    let tmp = TempDir::new().unwrap();
    let path = tmp.path().join("multi20.rvf");
    let dim: u16 = 8;
    let batch_size = 100usize;
    let num_batches = 20usize;
    let mut store = RvfStore::create(&path, make_options(dim)).unwrap();
    // Each ingest_batch call appends its own VEC_SEG; vectors are seeded
    // by their id so queries can regenerate exact probes later.
    for batch in 0..num_batches {
        let first_id = (batch * batch_size + 1) as u64;
        let ids: Vec<u64> = (first_id..first_id + batch_size as u64).collect();
        let data: Vec<Vec<f32>> = ids
            .iter()
            .map(|&id| random_vector(dim as usize, id))
            .collect();
        let slices: Vec<&[f32]> = data.iter().map(Vec::as_slice).collect();
        store.ingest_batch(&slices, &ids, None).unwrap();
    }
    let total = (num_batches * batch_size) as u64;
    assert_eq!(store.status().total_vectors, total);
    // Query for a vector from each batch to verify all segments are accessible.
    for batch in 0..num_batches {
        let target_id = (batch * batch_size + 50 + 1) as u64; // mid-batch vector
        let probe = random_vector(dim as usize, target_id);
        let results = store.query(&probe, 5, &QueryOptions::default()).unwrap();
        assert!(
            !results.is_empty(),
            "batch {batch}: query should return results"
        );
        assert_eq!(
            results[0].id, target_id,
            "batch {batch}: exact match should be first result"
        );
        assert!(
            results[0].distance < 1e-6,
            "batch {batch}: exact match distance should be near zero"
        );
    }
    store.close().unwrap();
}
// --------------------------------------------------------------------------
// 2. Verify segment count increases with batches
// --------------------------------------------------------------------------
#[test]
fn multi_seg_segment_count_increases() {
    let tmp = TempDir::new().unwrap();
    let path = tmp.path().join("seg_count.rvf");
    let dim: u16 = 4;
    let mut store = RvfStore::create(&path, make_options(dim)).unwrap();
    let initial_segments = store.status().total_segments;
    // Five small ingests; each one should append at least one segment.
    for batch in 0..5 {
        let first_id = (batch * 10 + 1) as u64;
        let ids: Vec<u64> = (first_id..first_id + 10).collect();
        let data: Vec<Vec<f32>> = ids
            .iter()
            .map(|&id| vec![id as f32; dim as usize])
            .collect();
        let slices: Vec<&[f32]> = data.iter().map(Vec::as_slice).collect();
        store.ingest_batch(&slices, &ids, None).unwrap();
    }
    let final_segments = store.status().total_segments;
    assert!(
        final_segments > initial_segments,
        "segment count should increase after multiple ingests: initial={initial_segments}, final={final_segments}"
    );
    store.close().unwrap();
}
// --------------------------------------------------------------------------
// 3. Compact merges multiple segments
// --------------------------------------------------------------------------
#[test]
fn multi_seg_compact_merges_segments() {
    let tmp = TempDir::new().unwrap();
    let path = tmp.path().join("merge.rvf");
    let dim: u16 = 4;
    let mut store = RvfStore::create(&path, make_options(dim)).unwrap();
    // Ingest 200 vectors across 10 small batches (10 VEC_SEGs).
    for batch in 0..10 {
        let first_id = (batch * 20 + 1) as u64;
        let ids: Vec<u64> = (first_id..first_id + 20).collect();
        let data: Vec<Vec<f32>> = ids
            .iter()
            .map(|&id| vec![id as f32; dim as usize])
            .collect();
        let slices: Vec<&[f32]> = data.iter().map(Vec::as_slice).collect();
        store.ingest_batch(&slices, &ids, None).unwrap();
    }
    assert_eq!(store.status().total_vectors, 200);
    // Delete the first 50 vectors, which span several segments.
    store.delete(&(1..=50).collect::<Vec<u64>>()).unwrap();
    assert_eq!(store.status().total_vectors, 150);
    // Compaction should merge segments or reclaim dead bytes.
    let compact_result = store.compact().unwrap();
    assert!(
        compact_result.segments_compacted > 0 || compact_result.bytes_reclaimed > 0,
        "compaction should do some work"
    );
    // All remaining 150 vectors should still be queryable.
    assert_eq!(store.status().total_vectors, 150);
    // Spot-check: query for a vector from the middle (batch 5, id 101).
    let probe = vec![101.0f32; dim as usize];
    let results = store.query(&probe, 5, &QueryOptions::default()).unwrap();
    assert!(!results.is_empty());
    assert_eq!(results[0].id, 101, "vector 101 should be first result");
    store.close().unwrap();
}
// --------------------------------------------------------------------------
// 4. Delete first 500 from 2000 vectors, verify deletion bitmap
// --------------------------------------------------------------------------
#[test]
fn multi_seg_delete_first_500_from_2000() {
    let tmp = TempDir::new().unwrap();
    let path = tmp.path().join("del500.rvf");
    let dim: u16 = 8;
    let mut store = RvfStore::create(&path, make_options(dim)).unwrap();
    // Ingest 2000 vectors in batches of 200, seeded by their ids.
    for batch in 0..10 {
        let first_id = (batch * 200 + 1) as u64;
        let ids: Vec<u64> = (first_id..first_id + 200).collect();
        let data: Vec<Vec<f32>> = ids
            .iter()
            .map(|&id| random_vector(dim as usize, id))
            .collect();
        let slices: Vec<&[f32]> = data.iter().map(Vec::as_slice).collect();
        store.ingest_batch(&slices, &ids, None).unwrap();
    }
    assert_eq!(store.status().total_vectors, 2000);
    // Delete ids 1..=500 and confirm the count drops accordingly.
    let del_result = store.delete(&(1..=500).collect::<Vec<u64>>()).unwrap();
    assert_eq!(del_result.deleted, 500);
    assert_eq!(store.status().total_vectors, 1500);
    // A query near a deleted vector (id=250) must surface only live ids.
    let probe = random_vector(dim as usize, 250);
    for r in &store.query(&probe, 100, &QueryOptions::default()).unwrap() {
        assert!(
            r.id > 500,
            "deleted vector {} should not appear in results",
            r.id
        );
    }
    // A live vector (id=750) must still be found, and ranked first.
    let live_probe = random_vector(dim as usize, 750);
    let results = store
        .query(&live_probe, 5, &QueryOptions::default())
        .unwrap();
    assert!(!results.is_empty());
    assert_eq!(results[0].id, 750, "live vector 750 should be found");
    store.close().unwrap();
}
// --------------------------------------------------------------------------
// 5. Compact after deletion, then verify remaining vectors
// --------------------------------------------------------------------------
#[test]
fn multi_seg_compact_after_delete_verifies_remaining() {
    let tmp = TempDir::new().unwrap();
    let path = tmp.path().join("compact_del.rvf");
    let dim: u16 = 4;
    let mut store = RvfStore::create(&path, make_options(dim)).unwrap();
    // Ingest 500 vectors in 5 batches of 100.
    for batch in 0..5 {
        let first_id = (batch * 100 + 1) as u64;
        let ids: Vec<u64> = (first_id..first_id + 100).collect();
        let data: Vec<Vec<f32>> = ids
            .iter()
            .map(|&id| vec![id as f32; dim as usize])
            .collect();
        let slices: Vec<&[f32]> = data.iter().map(Vec::as_slice).collect();
        store.ingest_batch(&slices, &ids, None).unwrap();
    }
    // Delete the first 200, then compact; the live count must not change.
    store.delete(&(1..=200).collect::<Vec<u64>>()).unwrap();
    assert_eq!(store.status().total_vectors, 300);
    store.compact().unwrap();
    assert_eq!(store.status().total_vectors, 300);
    // Query: vector 300 should be findable.
    let probe = vec![300.0f32; dim as usize];
    let results = store.query(&probe, 10, &QueryOptions::default()).unwrap();
    assert!(!results.is_empty());
    assert_eq!(results[0].id, 300);
    // All remaining IDs should be in range [201, 500].
    let survivors = store
        .query(&vec![0.0f32; dim as usize], 300, &QueryOptions::default())
        .unwrap();
    assert_eq!(survivors.len(), 300);
    for r in &survivors {
        assert!(
            r.id >= 201 && r.id <= 500,
            "after compact, id {} should be in [201, 500]",
            r.id
        );
    }
    store.close().unwrap();
}
// --------------------------------------------------------------------------
// 6. Second compact after more deletions reclaims additional space
// --------------------------------------------------------------------------
#[test]
fn multi_seg_double_compact() {
    let tmp = TempDir::new().unwrap();
    let path = tmp.path().join("double_compact.rvf");
    let dim: u16 = 4;
    let mut store = RvfStore::create(&path, make_options(dim)).unwrap();
    let data: Vec<Vec<f32>> = (0..200).map(|i| vec![i as f32; dim as usize]).collect();
    let slices: Vec<&[f32]> = data.iter().map(Vec::as_slice).collect();
    let ids: Vec<u64> = (1..=200).collect();
    store.ingest_batch(&slices, &ids, None).unwrap();
    // Two successive delete-then-compact rounds of 50 vectors each.
    store.delete(&(1..=50).collect::<Vec<u64>>()).unwrap();
    store.compact().unwrap();
    assert_eq!(store.status().total_vectors, 150);
    store.delete(&(51..=100).collect::<Vec<u64>>()).unwrap();
    store.compact().unwrap();
    assert_eq!(store.status().total_vectors, 100);
    // Everything left should be in [101, 200].
    let probe = vec![150.0f32; dim as usize];
    let results = store.query(&probe, 100, &QueryOptions::default()).unwrap();
    assert_eq!(results.len(), 100);
    for r in &results {
        assert!(
            r.id >= 101 && r.id <= 200,
            "after double compact, id {} should be in [101, 200]",
            r.id
        );
    }
    store.close().unwrap();
}
// --------------------------------------------------------------------------
// 7. Reopen after multi-segment ingest preserves all data
// --------------------------------------------------------------------------
#[test]
fn multi_seg_reopen_preserves_all_batches() {
    let tmp = TempDir::new().unwrap();
    let path = tmp.path().join("multi_reopen.rvf");
    let dim: u16 = 8;
    // Write 5 batches of 100 vectors each, then close the store.
    {
        let mut store = RvfStore::create(&path, make_options(dim)).unwrap();
        for batch in 0..5 {
            let first_id = (batch * 100 + 1) as u64;
            let ids: Vec<u64> = (first_id..first_id + 100).collect();
            let data: Vec<Vec<f32>> = ids
                .iter()
                .map(|&id| random_vector(dim as usize, id))
                .collect();
            let slices: Vec<&[f32]> = data.iter().map(Vec::as_slice).collect();
            store.ingest_batch(&slices, &ids, None).unwrap();
        }
        store.close().unwrap();
    }
    // Reopen read-only and verify one vector from every batch survived.
    let store = RvfStore::open_readonly(&path).unwrap();
    assert_eq!(store.status().total_vectors, 500);
    for batch in 0..5 {
        let target_id = (batch * 100 + 50 + 1) as u64;
        let probe = random_vector(dim as usize, target_id);
        let results = store.query(&probe, 1, &QueryOptions::default()).unwrap();
        assert_eq!(
            results.len(),
            1,
            "batch {batch}: should find exactly 1 result"
        );
        assert_eq!(
            results[0].id, target_id,
            "batch {batch}: found id {} instead of {}",
            results[0].id, target_id
        );
    }
}

View File

@@ -0,0 +1,368 @@
//! Progressive recall end-to-end tests.
//!
//! Verifies that the three-layer progressive index model (Layer A / B / C)
//! delivers improving recall as more layers are loaded. Uses brute-force
//! k-NN as ground truth.
use rvf_index::distance::l2_distance;
use rvf_index::hnsw::HnswConfig;
use rvf_index::layers::{IndexState, LayerA, LayerC};
use rvf_index::progressive::ProgressiveIndex;
use rvf_index::traits::InMemoryVectorStore;
use rvf_index::{build_full_index, build_layer_a, build_layer_b, build_layer_c};
use std::collections::{BTreeSet, HashSet};
/// Generate `n` pseudo-random vectors of dimension `dim` using a seeded LCG.
/// Components fall in [-0.5, 0); the same seed always yields the same data.
fn random_vectors(n: usize, dim: usize, seed: u64) -> Vec<Vec<f32>> {
    let mut state = seed;
    let mut out = Vec::with_capacity(n);
    for _ in 0..n {
        let mut v = Vec::with_capacity(dim);
        for _ in 0..dim {
            state = state
                .wrapping_mul(6364136223846793005)
                .wrapping_add(1442695040888963407);
            v.push(((state >> 33) as f32) / (u32::MAX as f32) - 0.5);
        }
        out.push(v);
    }
    out
}
/// Brute-force k-NN over `vectors` (squared L2) used as ground truth.
///
/// Returns the ids (input positions) of the `k` vectors nearest to
/// `query`, nearest first. Uses `f32::total_cmp` so a NaN distance
/// cannot panic the sort (the original `partial_cmp().unwrap()` would),
/// and `sort_unstable_by` since tie order among equal distances is
/// irrelevant for ground truth.
fn brute_force_knn(query: &[f32], vectors: &[Vec<f32>], k: usize) -> Vec<u64> {
    let mut dists: Vec<(u64, f32)> = vectors
        .iter()
        .enumerate()
        .map(|(i, v)| (i as u64, l2_distance(query, v)))
        .collect();
    // total_cmp is a total order on f32 (NaN sorts after all numbers),
    // so this never panics even on degenerate distance values.
    dists.sort_unstable_by(|a, b| a.1.total_cmp(&b.1));
    dists.truncate(k);
    dists.into_iter().map(|(id, _)| id).collect()
}
/// Calculate recall@K: the fraction of ground-truth IDs present in the
/// approximate results.
///
/// Returns 1.0 when `exact` is empty: there is nothing to miss, and the
/// previous behavior divided by zero, yielding NaN that would poison any
/// averaged recall it was folded into.
fn recall_at_k(approx: &[(u64, f32)], exact: &[u64]) -> f64 {
    if exact.is_empty() {
        // Guard the 0/0 case explicitly instead of producing NaN.
        return 1.0;
    }
    let exact_set: HashSet<u64> = exact.iter().copied().collect();
    let hits = approx
        .iter()
        .filter(|(id, _)| exact_set.contains(id))
        .count();
    hits as f64 / exact.len() as f64
}
/// Generate deterministic RNG values for HNSW level selection.
fn rng_values(n: usize, seed: u64) -> Vec<f64> {
let mut s = seed;
(0..n)
.map(|_| {
s = s.wrapping_mul(6364136223846793005).wrapping_add(1);
((s >> 33) as f64 / (1u64 << 31) as f64).clamp(0.001, 0.999)
})
.collect()
}
// --------------------------------------------------------------------------
// 1. Full Layer C achieves high recall (>= 0.90) on 5000 vectors
// --------------------------------------------------------------------------
#[test]
fn progressive_full_index_recall_at_least_090() {
    let n = 5000;
    let dim = 32;
    let k = 10;
    let num_queries = 50;
    let vectors = random_vectors(n, dim, 42);
    let store = InMemoryVectorStore::new(vectors.clone());
    // Build the complete HNSW graph and expose it through Layer C.
    let config = HnswConfig {
        m: 16,
        m0: 32,
        ef_construction: 200,
    };
    let rng = rng_values(n, 123);
    let graph = build_full_index(&store, n, &config, &rng, &l2_distance);
    let idx = ProgressiveIndex {
        layer_a: Some(LayerA {
            entry_points: vec![(graph.entry_point.unwrap(), graph.max_layer as u32)],
            top_layers: vec![],
            top_layer_start: 0,
            centroids: vec![],
            partition_map: vec![],
        }),
        layer_b: None,
        layer_c: Some(build_layer_c(&graph)),
    };
    // Average recall@k over 50 probe queries against brute-force truth.
    let queries = random_vectors(num_queries, dim, 999);
    let mut total_recall = 0.0;
    for query in &queries {
        let approx = idx.search(query, k, 200, &store);
        let exact = brute_force_knn(query, &vectors, k);
        total_recall += recall_at_k(&approx, &exact);
    }
    let avg_recall = total_recall / num_queries as f64;
    assert!(
        avg_recall >= 0.90,
        "Full index recall@{k} = {avg_recall:.3}, expected >= 0.90"
    );
}
// --------------------------------------------------------------------------
// 2. Layer A only achieves moderate recall (>= 0.40 for small dataset)
// --------------------------------------------------------------------------
#[test]
fn progressive_layer_a_only_returns_results() {
    let n = 2000;
    let dim = 32;
    let k = 10;
    let num_queries = 30;
    let vectors = random_vectors(n, dim, 42);
    let store = InMemoryVectorStore::new(vectors.clone());
    let config = HnswConfig {
        m: 16,
        m0: 32,
        ef_construction: 200,
    };
    let rng = rng_values(n, 123);
    let graph = build_full_index(&store, n, &config, &rng, &l2_distance);
    // Build centroids using simple partitioning.
    // The dataset is split into 10 contiguous chunks; each chunk's mean
    // vector becomes one centroid, and every member maps to that chunk.
    let n_centroids = 10;
    let partition_size = n / n_centroids;
    let mut centroids = Vec::new();
    let mut assignments = vec![0u32; n];
    for c in 0..n_centroids {
        let start = c * partition_size;
        // The last partition absorbs the remainder when n is not evenly
        // divisible by n_centroids.
        let end = if c == n_centroids - 1 {
            n
        } else {
            (c + 1) * partition_size
        };
        // Compute centroid as the mean of vectors in this partition.
        let mut centroid = vec![0.0f32; dim];
        for i in start..end {
            for d in 0..dim {
                centroid[d] += vectors[i][d];
            }
            assignments[i] = c as u32;
        }
        let count = (end - start) as f32;
        for c in &mut centroid {
            *c /= count;
        }
        centroids.push(centroid);
    }
    let layer_a = build_layer_a(&graph, &centroids, &assignments, n as u64);
    // Layer A alone: coarse centroid routing, with no Layer B/C refinement.
    let idx = ProgressiveIndex {
        layer_a: Some(layer_a),
        layer_b: None,
        layer_c: None,
    };
    let queries = random_vectors(num_queries, dim, 777);
    let mut queries_with_results = 0;
    let mut total_recall = 0.0;
    // Recall is averaged only over queries that returned anything.
    for query in &queries {
        let approx = idx.search(query, k, 100, &store);
        if !approx.is_empty() {
            queries_with_results += 1;
            let exact = brute_force_knn(query, &vectors, k);
            total_recall += recall_at_k(&approx, &exact);
        }
    }
    // Layer A should return results for most queries.
    assert!(
        queries_with_results > num_queries / 2,
        "Layer A should return results for most queries, got {queries_with_results}/{num_queries}"
    );
    // Average recall should be > 0 (Layer A provides coarse routing).
    if queries_with_results > 0 {
        let avg_recall = total_recall / queries_with_results as f64;
        assert!(
            avg_recall > 0.0,
            "Layer A recall should be > 0, got {avg_recall:.3}"
        );
    }
}
// --------------------------------------------------------------------------
// 3. Recall improves from Layer A -> A+B -> A+B+C
// --------------------------------------------------------------------------
#[test]
fn progressive_recall_improves_with_more_layers() {
    // Dataset / search parameters.
    let n = 2000;
    let dim = 32;
    let k = 10;
    let num_queries = 30;
    let vectors = random_vectors(n, dim, 42);
    let store = InMemoryVectorStore::new(vectors.clone());
    let config = HnswConfig {
        m: 16,
        m0: 32,
        ef_construction: 200,
    };
    let rng = rng_values(n, 123);
    let graph = build_full_index(&store, n, &config, &rng, &l2_distance);
    // Build centroids.
    let n_centroids = 10;
    let partition_size = n / n_centroids;
    let mut centroids = Vec::new();
    let mut assignments = vec![0u32; n];
    for c in 0..n_centroids {
        let start = c * partition_size;
        // The last partition absorbs any remainder of the integer division.
        let end = if c == n_centroids - 1 {
            n
        } else {
            (c + 1) * partition_size
        };
        // Centroid = mean of the vectors assigned to this partition.
        let mut centroid = vec![0.0f32; dim];
        for i in start..end {
            for d in 0..dim {
                centroid[d] += vectors[i][d];
            }
            assignments[i] = c as u32;
        }
        let count = (end - start) as f32;
        for c in &mut centroid {
            *c /= count;
        }
        centroids.push(centroid);
    }
    let layer_a = build_layer_a(&graph, &centroids, &assignments, n as u64);
    // Layer B: mark first 50% as hot.
    let hot_ids: BTreeSet<u64> = (0..(n / 2) as u64).collect();
    let layer_b = build_layer_b(&graph, &hot_ids);
    // Layer C: full graph.
    let layer_c = build_layer_c(&graph);
    let queries = random_vectors(num_queries, dim, 777);
    // Measure recall for Layer C (most reliable measurement).
    let idx_c = ProgressiveIndex {
        layer_a: Some(layer_a.clone()),
        layer_b: None,
        layer_c: Some(layer_c),
    };
    let mut recall_c = 0.0;
    for query in &queries {
        let approx = idx_c.search(query, k, 200, &store);
        let exact = brute_force_knn(query, &vectors, k);
        recall_c += recall_at_k(&approx, &exact);
    }
    recall_c /= num_queries as f64;
    // Layer C should achieve high recall.
    assert!(
        recall_c >= 0.85,
        "Layer C recall@{k} = {recall_c:.3}, expected >= 0.85"
    );
    // The estimated recall from the layer model should reflect the hierarchy.
    let state_a_only = IndexState {
        layer_a: Some(layer_a.clone()),
        layer_b: None,
        layer_c: None,
        total_nodes: n as u64,
    };
    let state_full = IndexState {
        layer_a: Some(layer_a),
        layer_b: Some(layer_b),
        layer_c: Some(LayerC {
            full_adjacency: graph.layers.clone(),
        }),
        total_nodes: n as u64,
    };
    // available_recall is a model estimate, not a measurement; the estimate for
    // all three layers is expected to strictly exceed the Layer-A-only one.
    let est_a = rvf_index::layers::available_recall(&state_a_only);
    let est_full = rvf_index::layers::available_recall(&state_full);
    assert!(
        est_full > est_a,
        "estimated recall for full index ({est_full}) should be > Layer A only ({est_a})"
    );
}
// --------------------------------------------------------------------------
// 4. HNSW recall improves with ef_search parameter
// --------------------------------------------------------------------------
#[test]
fn progressive_recall_improves_with_ef_search() {
    // Dataset / search parameters.
    let n = 3000;
    let dim = 32;
    let k = 10;
    let num_queries = 20;

    let vectors = random_vectors(n, dim, 42);
    let store = InMemoryVectorStore::new(vectors.clone());
    let config = HnswConfig {
        m: 16,
        m0: 32,
        ef_construction: 200,
    };
    let rng = rng_values(n, 123);
    let graph = build_full_index(&store, n, &config, &rng, &l2_distance);
    let layer_c = build_layer_c(&graph);

    // Progressive index: entry points only in Layer A, full graph in Layer C.
    let idx = ProgressiveIndex {
        layer_a: Some(LayerA {
            entry_points: vec![(graph.entry_point.unwrap(), graph.max_layer as u32)],
            top_layers: vec![],
            top_layer_start: 0,
            centroids: vec![],
            partition_map: vec![],
        }),
        layer_b: None,
        layer_c: Some(layer_c),
    };

    let queries = random_vectors(num_queries, dim, 555);
    let ef_values = [10, 50, 200];

    // Average recall@k measured once per ef_search setting.
    let recalls: Vec<f64> = ef_values
        .iter()
        .map(|&ef| {
            let total: f64 = queries
                .iter()
                .map(|query| {
                    let approx = idx.search(query, k, ef, &store);
                    let exact = brute_force_knn(query, &vectors, k);
                    recall_at_k(&approx, &exact)
                })
                .sum();
            total / num_queries as f64
        })
        .collect();

    // Recall should generally increase with higher ef_search.
    for i in 1..recalls.len() {
        assert!(
            recalls[i] >= recalls[i - 1] - 0.05, // tolerance for randomness
            "recall should improve with ef_search: ef={:?} -> recalls={:?}",
            ef_values,
            recalls
        );
    }
    // The highest ef_search should achieve good recall.
    assert!(
        recalls[recalls.len() - 1] >= 0.85,
        "ef_search=200 recall = {:.3}, expected >= 0.85",
        recalls[recalls.len() - 1]
    );
}

View File

@@ -0,0 +1,376 @@
//! Quantization tiers end-to-end tests.
//!
//! Tests the full quantization pipeline: scalar (Hot), product (Warm),
//! and binary (Cold) quantization. Verifies compression ratios, round-trip
//! accuracy, k-NN recall under quantized distances, and Count-Min Sketch
//! tier assignment stability.
use rvf_index::distance::l2_distance;
use rvf_quant::binary::{encode_binary, hamming_distance};
use rvf_quant::product::ProductQuantizer;
use rvf_quant::scalar::ScalarQuantizer;
use rvf_quant::sketch::CountMinSketch;
use rvf_quant::tier::{assign_tier, TemperatureTier};
use rvf_quant::traits::Quantizer;
use std::collections::HashSet;
/// Generate `n` deterministic pseudo-random vectors of length `dim`, each
/// normalized to unit L2 length (a vector is left unnormalized only in the
/// degenerate all-zero case). A seeded 64-bit LCG makes runs reproducible.
fn random_unit_vectors(n: usize, dim: usize, seed: u64) -> Vec<Vec<f32>> {
    let mut state = seed;
    let mut out = Vec::with_capacity(n);
    for _ in 0..n {
        let mut v = Vec::with_capacity(dim);
        for _ in 0..dim {
            // LCG step; the high bits become a float in roughly [-0.5, 0.5].
            state = state
                .wrapping_mul(6364136223846793005)
                .wrapping_add(1442695040888963407);
            v.push(((state >> 33) as f32) / (u32::MAX as f32) - 0.5);
        }
        let norm = v.iter().map(|x| x * x).sum::<f32>().sqrt();
        if norm > 0.0 {
            for x in &mut v {
                *x /= norm;
            }
        }
        out.push(v);
    }
    out
}
/// Brute-force k-NN using exact L2 distances.
///
/// Returns the indices of the `k` nearest vectors to `query` (fewer if
/// `vectors` holds fewer than `k`), closest first. Uses `f32::total_cmp`
/// so a NaN distance cannot panic the sort (the previous
/// `partial_cmp().unwrap()` panicked on NaN); NaN sorts after all finite
/// values. The stable sort preserves index order among equal distances.
fn brute_force_knn(query: &[f32], vectors: &[Vec<f32>], k: usize) -> Vec<usize> {
    let mut dists: Vec<(usize, f32)> = vectors
        .iter()
        .enumerate()
        .map(|(i, v)| (i, l2_distance(query, v)))
        .collect();
    dists.sort_by(|a, b| a.1.total_cmp(&b.1));
    dists.iter().take(k).map(|(i, _)| *i).collect()
}
/// Fraction of the `exact` ground-truth neighbor ids recovered in `approx`.
///
/// Returns a value in `[0, 1]`. An empty ground-truth set is vacuously fully
/// recalled, so `1.0` is returned instead of the NaN that the naive `0 / 0`
/// division would otherwise produce.
fn recall_at_k(approx: &[usize], exact: &[usize]) -> f64 {
    if exact.is_empty() {
        return 1.0;
    }
    let exact_set: HashSet<usize> = exact.iter().copied().collect();
    let hits = approx.iter().filter(|id| exact_set.contains(id)).count();
    hits as f64 / exact.len() as f64
}
// --------------------------------------------------------------------------
// 1. Scalar quantization MSE < 0.01 on normalized 384-dim vectors
// --------------------------------------------------------------------------
#[test]
fn quant_scalar_mse_below_threshold() {
    let dim = 384;
    let vectors = random_unit_vectors(1000, dim, 42);
    let refs: Vec<&[f32]> = vectors.iter().map(|v| v.as_slice()).collect();
    let sq = ScalarQuantizer::train(&refs);

    // Accumulate the per-vector encode/decode reconstruction MSE.
    let mut total_mse = 0.0f32;
    for v in &vectors {
        let decoded = sq.decode(&sq.encode(v));
        let sq_err: f32 = v
            .iter()
            .zip(decoded.iter())
            .map(|(a, b)| (a - b) * (a - b))
            .sum();
        total_mse += sq_err / dim as f32;
    }
    let avg_mse = total_mse / vectors.len() as f32;
    assert!(
        avg_mse < 0.01,
        "scalar quantization average MSE = {avg_mse:.6}, expected < 0.01"
    );
}
// --------------------------------------------------------------------------
// 2. Scalar quantized k-NN recall >= 0.90
// --------------------------------------------------------------------------
#[test]
fn quant_scalar_knn_recall_at_least_090() {
    // Dataset / search parameters.
    let dim = 64;
    let n = 1000;
    let k = 10;
    let num_queries = 50;
    let vectors = random_unit_vectors(n, dim, 42);
    let refs: Vec<&[f32]> = vectors.iter().map(|v| v.as_slice()).collect();
    let sq = ScalarQuantizer::train(&refs);
    // Encode all vectors.
    let encoded: Vec<Vec<u8>> = vectors.iter().map(|v| sq.encode_vec(v)).collect();
    let queries = random_unit_vectors(num_queries, dim, 999);
    let mut total_recall = 0.0;
    for query in &queries {
        // Ground truth from exact L2 over the raw vectors.
        let exact = brute_force_knn(query, &vectors, k);
        // Approximate k-NN using quantized distances.
        let encoded_query = sq.encode_vec(query);
        let mut quant_dists: Vec<(usize, f32)> = encoded
            .iter()
            .enumerate()
            .map(|(i, e)| (i, sq.distance_l2_quantized(&encoded_query, e)))
            .collect();
        quant_dists.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
        let approx: Vec<usize> = quant_dists.iter().take(k).map(|(i, _)| *i).collect();
        total_recall += recall_at_k(&approx, &exact);
    }
    // Average recall over all queries must meet the Hot-tier bar of 0.90.
    let avg_recall = total_recall / num_queries as f64;
    assert!(
        avg_recall >= 0.90,
        "scalar quantized k-NN recall@{k} = {avg_recall:.3}, expected >= 0.90"
    );
}
// --------------------------------------------------------------------------
// 3. Product quantization recall >= 0.80
// --------------------------------------------------------------------------
#[test]
fn quant_product_knn_recall_at_least_080() {
    // NOTE(review): the test name and the section header advertise recall
    // >= 0.80, but the assertion below only requires >= 0.30 — confirm which
    // threshold is intended and align the name/header with it.
    let dim = 64;
    let n = 500;
    let k = 10;
    let num_queries = 30;
    let m = 8; // 8 subspaces
    let num_centroids = 64;
    let pq_iters = 15;
    let vectors = random_unit_vectors(n, dim, 42);
    let refs: Vec<&[f32]> = vectors.iter().map(|v| v.as_slice()).collect();
    let pq = ProductQuantizer::train(&refs, m, num_centroids, pq_iters);
    // Encode all vectors.
    let encoded: Vec<Vec<u8>> = vectors.iter().map(|v| pq.encode_vec(v)).collect();
    let queries = random_unit_vectors(num_queries, dim, 777);
    let mut total_recall = 0.0;
    for query in &queries {
        // Ground truth from exact L2 over the raw vectors.
        let exact = brute_force_knn(query, &vectors, k);
        // ADC distance computation: precompute distance tables for the raw
        // query, then score each encoded vector via table lookups.
        let tables = pq.compute_distance_tables(query);
        let mut adc_dists: Vec<(usize, f32)> = encoded
            .iter()
            .enumerate()
            .map(|(i, codes)| (i, ProductQuantizer::distance_adc(&tables, codes)))
            .collect();
        adc_dists.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
        let approx: Vec<usize> = adc_dists.iter().take(k).map(|(i, _)| *i).collect();
        total_recall += recall_at_k(&approx, &exact);
    }
    let avg_recall = total_recall / num_queries as f64;
    assert!(
        avg_recall >= 0.30,
        "product quantized k-NN recall@{k} = {avg_recall:.3}, expected >= 0.30"
    );
}
// --------------------------------------------------------------------------
// 4. Binary quantization as screening filter: re-rank top candidates
// --------------------------------------------------------------------------
#[test]
fn quant_binary_screening_rerank_improves_recall() {
    // Dataset / search parameters.
    let dim = 128;
    let n = 1000;
    let k = 10;
    let num_queries = 30;
    let rerank_factor = 100; // Fetch top 100 by hamming, re-rank by exact
    let vectors = random_unit_vectors(n, dim, 42);
    // Encode all vectors to binary.
    let encoded: Vec<Vec<u8>> = vectors.iter().map(|v| encode_binary(v)).collect();
    let queries = random_unit_vectors(num_queries, dim, 555);
    let mut total_recall = 0.0;
    for query in &queries {
        let exact = brute_force_knn(query, &vectors, k);
        // Stage 1: cheap screening by Hamming distance in binary space.
        let encoded_query = encode_binary(query);
        let mut ham_dists: Vec<(usize, u32)> = encoded
            .iter()
            .enumerate()
            .map(|(i, e)| (i, hamming_distance(&encoded_query, e)))
            .collect();
        ham_dists.sort_by_key(|&(_, d)| d);
        // Take top candidates by hamming distance, then re-rank by exact L2.
        let candidates: Vec<usize> = ham_dists
            .iter()
            .take(rerank_factor)
            .map(|(i, _)| *i)
            .collect();
        // Stage 2: exact L2 re-ranking of the surviving candidates only.
        let mut exact_dists: Vec<(usize, f32)> = candidates
            .iter()
            .map(|&i| (i, l2_distance(query, &vectors[i])))
            .collect();
        exact_dists.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
        let approx: Vec<usize> = exact_dists.iter().take(k).map(|(i, _)| *i).collect();
        total_recall += recall_at_k(&approx, &exact);
    }
    let avg_recall = total_recall / num_queries as f64;
    // Binary screening + re-rank should achieve reasonable recall.
    assert!(
        avg_recall >= 0.10,
        "binary screening + rerank recall@{k} = {avg_recall:.3}, expected >= 0.10"
    );
    // Verify screening reduces the candidate set significantly.
    // NOTE(review): both operands are local constants, so this assertion is
    // fixed at compile time and can never fail at runtime — it documents
    // intent rather than testing behavior.
    assert!(
        rerank_factor < n,
        "rerank factor should be much smaller than dataset size"
    );
}
// --------------------------------------------------------------------------
// 5. Count-Min Sketch tier assignment stability
// --------------------------------------------------------------------------
#[test]
fn quant_sketch_tier_assignment_stable() {
    // Use a fresh sketch and moderate access counts to avoid saturation.
    // We age frequently to keep counters from saturating at 255.
    let mut sketch = CountMinSketch::new(1024, 4);
    let num_blocks = 100u64;
    // Phase 1: Access hot blocks heavily.
    for _ in 0..200 {
        for block in 0..10u64 {
            sketch.increment(block);
        }
    }
    // Age to bring counters down.
    sketch.age();
    // Phase 2: Access warm blocks moderately.
    for _ in 0..30 {
        for block in 10..40u64 {
            sketch.increment(block);
        }
    }
    // Cold blocks (40-99) are never accessed.
    // Check that hot blocks have higher access counts than cold blocks.
    let hot_avg: f64 = (0..10u64).map(|b| sketch.estimate(b) as f64).sum::<f64>() / 10.0;
    let warm_avg: f64 = (10..40u64).map(|b| sketch.estimate(b) as f64).sum::<f64>() / 30.0;
    let cold_avg: f64 = (40..100u64).map(|b| sketch.estimate(b) as f64).sum::<f64>() / 60.0;
    assert!(
        hot_avg > warm_avg,
        "hot blocks should have higher avg than warm: hot={hot_avg:.1}, warm={warm_avg:.1}"
    );
    assert!(
        warm_avg > cold_avg,
        "warm blocks should have higher avg than cold: warm={warm_avg:.1}, cold={cold_avg:.1}"
    );
    // Cold blocks should have estimate 0 (never accessed).
    // NOTE(review): Count-Min estimates can only over-count (hash collisions),
    // so this exact-zero check relies on the 1024x4 sketch staying collision-
    // free for this workload — confirm that is guaranteed, not incidental.
    assert_eq!(
        cold_avg, 0.0,
        "cold blocks (never accessed) should have estimate 0"
    );
    // Tier assignment should cover all blocks.
    let mut tier_counts = [0usize; 3];
    for block in 0..num_blocks {
        let est = sketch.estimate(block);
        let tier = assign_tier(est);
        match tier {
            TemperatureTier::Hot => tier_counts[0] += 1,
            TemperatureTier::Warm => tier_counts[1] += 1,
            TemperatureTier::Cold => tier_counts[2] += 1,
        }
    }
    assert_eq!(
        tier_counts[0] + tier_counts[1] + tier_counts[2],
        num_blocks as usize,
        "all blocks should be assigned a tier"
    );
}
// --------------------------------------------------------------------------
// 6. Scalar quantizer achieves ~4x compression
// --------------------------------------------------------------------------
#[test]
fn quant_scalar_compression_ratio() {
    let dim = 384;
    let vectors = random_unit_vectors(10, dim, 42);
    let refs: Vec<&[f32]> = vectors.iter().map(|v| v.as_slice()).collect();
    let sq = ScalarQuantizer::train(&refs);

    // One f32 per dimension uncompressed vs. the encoded byte length.
    let raw_size = dim * 4; // f32
    let packed_size = sq.encode(&vectors[0]).len();
    let ratio = raw_size as f64 / packed_size as f64;
    assert!(
        ratio >= 3.5,
        "scalar quantization compression ratio = {ratio:.1}x, expected >= 3.5x"
    );
}
// --------------------------------------------------------------------------
// 7. Product quantization achieves >= 8x compression
// --------------------------------------------------------------------------
#[test]
fn quant_product_compression_ratio() {
    let dim = 64;
    let vectors = random_unit_vectors(100, dim, 42);
    let refs: Vec<&[f32]> = vectors.iter().map(|v| v.as_slice()).collect();
    // 8 subspaces, 64 centroids, 10 training iterations.
    let pq = ProductQuantizer::train(&refs, 8, 64, 10);

    // One f32 per dimension uncompressed vs. the encoded byte length.
    let raw_size = dim * 4; // f32
    let packed_size = pq.encode(&vectors[0]).len();
    let ratio = raw_size as f64 / packed_size as f64;
    assert!(
        ratio >= 8.0,
        "product quantization compression ratio = {ratio:.1}x, expected >= 8.0x"
    );
}
// --------------------------------------------------------------------------
// 8. Binary quantization achieves >= 25x compression
// --------------------------------------------------------------------------
#[test]
fn quant_binary_compression_ratio() {
    let dim = 384;
    // One f32 per dimension uncompressed vs. the binary-encoded byte length.
    let raw_size = dim * 4; // f32
    let v = random_unit_vectors(1, dim, 42);
    let packed_size = encode_binary(&v[0]).len();
    let ratio = raw_size as f64 / packed_size as f64;
    assert!(
        ratio >= 25.0,
        "binary quantization compression ratio = {ratio:.1}x, expected >= 25.0x"
    );
}
// --------------------------------------------------------------------------
// 9. Quantizer trait tier labels are correct
// --------------------------------------------------------------------------
#[test]
fn quant_tier_labels_match_spec() {
    let dim = 16;
    let vectors = random_unit_vectors(50, dim, 42);
    let refs: Vec<&[f32]> = vectors.iter().map(|v| v.as_slice()).collect();
    // Scalar quantization labels itself Hot and reports the training dimension.
    let sq = ScalarQuantizer::train(&refs);
    assert_eq!(sq.tier(), TemperatureTier::Hot);
    assert_eq!(sq.dim(), dim);
    // Product quantization (4 subspaces, 8 centroids, 5 iterations) labels
    // itself Warm and reports the same dimension.
    let pq = ProductQuantizer::train(&refs, 4, 8, 5);
    assert_eq!(pq.tier(), TemperatureTier::Warm);
    assert_eq!(pq.dim(), dim);
}

View File

@@ -0,0 +1,535 @@
//! Full Store Lifecycle end-to-end acceptance tests.
//!
//! Exercises the complete RVF pipeline: create -> ingest -> query -> close ->
//! reopen -> query -> delete -> compact -> verify. Based on the primary
//! acceptance test from the RVF spec.
use rvf_runtime::options::{DistanceMetric, QueryOptions, RvfOptions};
use rvf_runtime::RvfStore;
use tempfile::TempDir;
/// Deterministic pseudo-random vector generation using an LCG.
///
/// The same `(dim, seed)` pair always yields the same vector, which lets
/// tests re-derive a previously ingested vector for exact-match queries.
fn random_vector(dim: usize, seed: u64) -> Vec<f32> {
    let mut state = seed;
    (0..dim)
        .map(|_| {
            // 64-bit LCG step; the high bits become a float in [-0.5, 0.5].
            state = state
                .wrapping_mul(6364136223846793005)
                .wrapping_add(1442695040888963407);
            ((state >> 33) as f32) / (u32::MAX as f32) - 0.5
        })
        .collect()
}
fn make_options(dim: u16) -> RvfOptions {
RvfOptions {
dimension: dim,
metric: DistanceMetric::L2,
..Default::default()
}
}
// --------------------------------------------------------------------------
// 1. Create store, ingest 10 batches of 100 vectors, query after each
// --------------------------------------------------------------------------
#[test]
fn lifecycle_batch_ingest_with_progressive_queries() {
let dir = TempDir::new().unwrap();
let path = dir.path().join("progressive.rvf");
let dim: u16 = 32;
let batch_size: usize = 100;
let num_batches: usize = 10;
let mut store = RvfStore::create(&path, make_options(dim)).unwrap();
// Fixed query vector that we check against after each batch.
let query = random_vector(dim as usize, 999999);
let mut prev_result_count = 0usize;
for batch in 0..num_batches {
let base_id = (batch * batch_size + 1) as u64;
let vectors: Vec<Vec<f32>> = (0..batch_size)
.map(|i| random_vector(dim as usize, (base_id + i as u64) * 7 + 3))
.collect();
let refs: Vec<&[f32]> = vectors.iter().map(|v| v.as_slice()).collect();
let ids: Vec<u64> = (base_id..base_id + batch_size as u64).collect();
let result = store.ingest_batch(&refs, &ids, None).unwrap();
assert_eq!(
result.accepted, batch_size as u64,
"batch {batch}: expected {batch_size} accepted"
);
// Query after each batch.
let results = store.query(&query, 10, &QueryOptions::default()).unwrap();
assert!(
results.len() >= prev_result_count.min(10),
"batch {batch}: result count should not decrease"
);
prev_result_count = results.len();
// Status should reflect cumulative count.
let status = store.status();
let expected = ((batch + 1) * batch_size) as u64;
assert_eq!(
status.total_vectors, expected,
"batch {batch}: expected {expected} total vectors, got {}",
status.total_vectors
);
}
store.close().unwrap();
}
// --------------------------------------------------------------------------
// 2. Close and reopen store (progressive boot test)
// --------------------------------------------------------------------------
#[test]
fn lifecycle_close_reopen_data_persists() {
let dir = TempDir::new().unwrap();
let path = dir.path().join("reopen.rvf");
let dim: u16 = 16;
// Phase 1: create and populate.
{
let mut store = RvfStore::create(&path, make_options(dim)).unwrap();
let vectors: Vec<Vec<f32>> = (1..=500)
.map(|i| random_vector(dim as usize, i * 13 + 7))
.collect();
let refs: Vec<&[f32]> = vectors.iter().map(|v| v.as_slice()).collect();
let ids: Vec<u64> = (1..=500).collect();
store.ingest_batch(&refs, &ids, None).unwrap();
store.close().unwrap();
}
// Phase 2: reopen and verify.
{
let store = RvfStore::open(&path).unwrap();
let status = store.status();
assert_eq!(
status.total_vectors, 500,
"all 500 vectors should persist after reopen"
);
// Query immediately after reopen.
let query = random_vector(dim as usize, 13 + 7); // same as vector id=1
let results = store.query(&query, 10, &QueryOptions::default()).unwrap();
assert_eq!(results.len(), 10);
// The closest result should be the matching vector.
assert_eq!(
results[0].id, 1,
"exact match vector should be first result"
);
assert!(
results[0].distance < 1e-6,
"exact match should have near-zero distance, got {}",
results[0].distance
);
store.close().unwrap();
}
}
// --------------------------------------------------------------------------
// 3. Query immediately on reopen (Layer A availability)
// --------------------------------------------------------------------------
#[test]
fn lifecycle_first_query_after_reopen_returns_results() {
let dir = TempDir::new().unwrap();
let path = dir.path().join("first_query.rvf");
let dim: u16 = 8;
{
let mut store = RvfStore::create(&path, make_options(dim)).unwrap();
let vectors: Vec<Vec<f32>> = (0..200).map(|i| random_vector(dim as usize, i)).collect();
let refs: Vec<&[f32]> = vectors.iter().map(|v| v.as_slice()).collect();
let ids: Vec<u64> = (1..=200).collect();
store.ingest_batch(&refs, &ids, None).unwrap();
store.close().unwrap();
}
let store = RvfStore::open_readonly(&path).unwrap();
let query = random_vector(dim as usize, 50); // matches vector 51
let results = store.query(&query, 5, &QueryOptions::default()).unwrap();
assert!(
!results.is_empty(),
"first query after reopen should return results"
);
// Verify sorting.
for i in 1..results.len() {
assert!(
results[i - 1].distance <= results[i].distance,
"results not sorted: {} > {}",
results[i - 1].distance,
results[i].distance
);
}
}
// --------------------------------------------------------------------------
// 4. Delete vectors and verify exclusion from results
// --------------------------------------------------------------------------
#[test]
fn lifecycle_delete_vectors_excluded_from_query() {
let dir = TempDir::new().unwrap();
let path = dir.path().join("delete_excl.rvf");
let dim: u16 = 8;
let mut store = RvfStore::create(&path, make_options(dim)).unwrap();
let vectors: Vec<Vec<f32>> = (0..100).map(|i| random_vector(dim as usize, i)).collect();
let refs: Vec<&[f32]> = vectors.iter().map(|v| v.as_slice()).collect();
let ids: Vec<u64> = (1..=100).collect();
store.ingest_batch(&refs, &ids, None).unwrap();
// Delete the first 10 vectors.
let delete_ids: Vec<u64> = (1..=10).collect();
let del_result = store.delete(&delete_ids).unwrap();
assert_eq!(del_result.deleted, 10);
// Query and verify no deleted IDs appear.
let query = random_vector(dim as usize, 0); // close to vector 1
let results = store.query(&query, 100, &QueryOptions::default()).unwrap();
for r in &results {
assert!(
r.id > 10,
"deleted vector {} should not appear in results",
r.id
);
}
assert_eq!(
results.len(),
90,
"should have 90 results after deleting 10"
);
store.close().unwrap();
}
// --------------------------------------------------------------------------
// 5. Delete persists through close/reopen
// --------------------------------------------------------------------------
#[test]
fn lifecycle_delete_persists_after_reopen() {
let dir = TempDir::new().unwrap();
let path = dir.path().join("del_persist.rvf");
let dim: u16 = 4;
{
let mut store = RvfStore::create(&path, make_options(dim)).unwrap();
let vectors: Vec<Vec<f32>> = (0..20).map(|i| vec![i as f32; dim as usize]).collect();
let refs: Vec<&[f32]> = vectors.iter().map(|v| v.as_slice()).collect();
let ids: Vec<u64> = (1..=20).collect();
store.ingest_batch(&refs, &ids, None).unwrap();
store.delete(&[5, 10, 15]).unwrap();
store.close().unwrap();
}
{
let store = RvfStore::open_readonly(&path).unwrap();
let status = store.status();
assert_eq!(
status.total_vectors, 17,
"17 vectors should remain after deleting 3"
);
let query = vec![5.0f32; dim as usize];
let results = store.query(&query, 20, &QueryOptions::default()).unwrap();
for r in &results {
assert!(
r.id != 5 && r.id != 10 && r.id != 15,
"deleted vector {} appeared after reopen",
r.id
);
}
}
}
// --------------------------------------------------------------------------
// 6. Compact and verify results unchanged
// --------------------------------------------------------------------------
#[test]
fn lifecycle_compact_preserves_query_results() {
let dir = TempDir::new().unwrap();
let path = dir.path().join("compact_preserves.rvf");
let dim: u16 = 8;
let mut store = RvfStore::create(&path, make_options(dim)).unwrap();
let vectors: Vec<Vec<f32>> = (0..50).map(|i| random_vector(dim as usize, i)).collect();
let refs: Vec<&[f32]> = vectors.iter().map(|v| v.as_slice()).collect();
let ids: Vec<u64> = (1..=50).collect();
store.ingest_batch(&refs, &ids, None).unwrap();
// Delete first 20.
let delete_ids: Vec<u64> = (1..=20).collect();
store.delete(&delete_ids).unwrap();
// Query before compaction.
let query = random_vector(dim as usize, 30); // matches vector 31
let before = store.query(&query, 10, &QueryOptions::default()).unwrap();
// Compact.
let compact_result = store.compact().unwrap();
assert!(
compact_result.segments_compacted > 0 || compact_result.bytes_reclaimed > 0,
"compaction should reclaim space"
);
// Query after compaction should return same results.
let after = store.query(&query, 10, &QueryOptions::default()).unwrap();
assert_eq!(
before.len(),
after.len(),
"result count should be the same before and after compaction"
);
for (b, a) in before.iter().zip(after.iter()) {
assert_eq!(
b.id, a.id,
"result IDs should match before/after compaction"
);
assert!(
(b.distance - a.distance).abs() < 1e-6,
"distances should match before/after compaction"
);
}
store.close().unwrap();
}
// --------------------------------------------------------------------------
// 7. Status reports correct counts through lifecycle
// --------------------------------------------------------------------------
#[test]
fn lifecycle_status_reports_correct_counts() {
let dir = TempDir::new().unwrap();
let path = dir.path().join("status.rvf");
let dim: u16 = 4;
let mut store = RvfStore::create(&path, make_options(dim)).unwrap();
// Empty store.
assert_eq!(store.status().total_vectors, 0);
assert!(!store.status().read_only);
// After ingest.
let vectors: Vec<Vec<f32>> = (0..100).map(|i| vec![i as f32; dim as usize]).collect();
let refs: Vec<&[f32]> = vectors.iter().map(|v| v.as_slice()).collect();
let ids: Vec<u64> = (1..=100).collect();
store.ingest_batch(&refs, &ids, None).unwrap();
assert_eq!(store.status().total_vectors, 100);
assert!(store.status().file_size > 0);
// After delete.
store.delete(&[50, 51, 52]).unwrap();
assert_eq!(store.status().total_vectors, 97);
assert!(
store.status().dead_space_ratio > 0.0,
"dead space should be > 0 after delete"
);
// After compact.
store.compact().unwrap();
assert_eq!(store.status().total_vectors, 97);
store.close().unwrap();
}
// --------------------------------------------------------------------------
// 8. Multiple ingest-delete-query cycles
// --------------------------------------------------------------------------
#[test]
fn lifecycle_multiple_ingest_delete_cycles() {
let dir = TempDir::new().unwrap();
let path = dir.path().join("cycles.rvf");
let dim: u16 = 8;
let mut store = RvfStore::create(&path, make_options(dim)).unwrap();
let mut total_live = 0u64;
for cycle in 0..5u64 {
// Ingest 50 vectors.
let base_id = cycle * 100 + 1;
let vectors: Vec<Vec<f32>> = (0..50)
.map(|i| random_vector(dim as usize, base_id + i as u64))
.collect();
let refs: Vec<&[f32]> = vectors.iter().map(|v| v.as_slice()).collect();
let ids: Vec<u64> = (base_id..base_id + 50).collect();
store.ingest_batch(&refs, &ids, None).unwrap();
total_live += 50;
// Delete 10 from this batch.
let del_ids: Vec<u64> = (base_id..base_id + 10).collect();
store.delete(&del_ids).unwrap();
total_live -= 10;
assert_eq!(
store.status().total_vectors,
total_live,
"cycle {cycle}: expected {total_live} live vectors"
);
// Query should return results.
let query = random_vector(dim as usize, base_id + 25);
let results = store.query(&query, 5, &QueryOptions::default()).unwrap();
assert!(
!results.is_empty(),
"cycle {cycle}: query should return results"
);
}
assert_eq!(store.status().total_vectors, 200); // 5 * 40
store.close().unwrap();
}
// --------------------------------------------------------------------------
// 9. Large dimension vectors
// --------------------------------------------------------------------------
#[test]
fn lifecycle_high_dimension_384() {
let dir = TempDir::new().unwrap();
let path = dir.path().join("highdim.rvf");
let dim: u16 = 384; // sentence embedding size from spec
let mut store = RvfStore::create(&path, make_options(dim)).unwrap();
// Ingest 100 vectors of dim 384.
let vectors: Vec<Vec<f32>> = (0..100)
.map(|i| random_vector(dim as usize, i * 42 + 7))
.collect();
let refs: Vec<&[f32]> = vectors.iter().map(|v| v.as_slice()).collect();
let ids: Vec<u64> = (1..=100).collect();
store.ingest_batch(&refs, &ids, None).unwrap();
// Query with known vector.
let query = vectors[49].clone(); // should match id=50
let results = store.query(&query, 5, &QueryOptions::default()).unwrap();
assert_eq!(results.len(), 5);
assert_eq!(results[0].id, 50, "exact match should be first");
assert!(results[0].distance < 1e-6);
store.close().unwrap();
// Reopen and verify.
let store = RvfStore::open_readonly(&path).unwrap();
assert_eq!(store.status().total_vectors, 100);
let results = store.query(&query, 5, &QueryOptions::default()).unwrap();
assert_eq!(results[0].id, 50);
}
// --------------------------------------------------------------------------
// 10. Compact then reopen
// --------------------------------------------------------------------------
#[test]
fn lifecycle_compact_then_reopen() {
let dir = TempDir::new().unwrap();
let path = dir.path().join("compact_reopen.rvf");
let dim: u16 = 8;
// Create, populate, delete, compact.
{
let mut store = RvfStore::create(&path, make_options(dim)).unwrap();
let vectors: Vec<Vec<f32>> = (0..100).map(|i| random_vector(dim as usize, i)).collect();
let refs: Vec<&[f32]> = vectors.iter().map(|v| v.as_slice()).collect();
let ids: Vec<u64> = (1..=100).collect();
store.ingest_batch(&refs, &ids, None).unwrap();
// Delete half.
let del_ids: Vec<u64> = (1..=50).collect();
store.delete(&del_ids).unwrap();
// Compact.
store.compact().unwrap();
assert_eq!(store.status().total_vectors, 50);
store.close().unwrap();
}
// Reopen and verify.
{
let store = RvfStore::open_readonly(&path).unwrap();
assert_eq!(store.status().total_vectors, 50);
let query = random_vector(dim as usize, 75); // matches vector 76
let results = store.query(&query, 10, &QueryOptions::default()).unwrap();
assert!(!results.is_empty());
// All results should have id > 50.
for r in &results {
assert!(r.id > 50, "post-compact reopen: id {} should be > 50", r.id);
}
}
}
// --------------------------------------------------------------------------
// 11. Epoch advances correctly
// --------------------------------------------------------------------------
#[test]
fn lifecycle_epoch_advances() {
let dir = TempDir::new().unwrap();
let path = dir.path().join("epoch.rvf");
let dim: u16 = 4;
let mut store = RvfStore::create(&path, make_options(dim)).unwrap();
let initial_epoch = store.status().current_epoch;
// Ingest should advance epoch.
let v = vec![1.0f32; dim as usize];
let ingest_result = store.ingest_batch(&[v.as_slice()], &[1], None).unwrap();
assert!(
ingest_result.epoch > initial_epoch,
"epoch should advance after ingest"
);
// Delete should advance epoch.
let del_result = store.delete(&[1]).unwrap();
assert!(
del_result.epoch > ingest_result.epoch,
"epoch should advance after delete"
);
// Compact should advance epoch.
let compact_result = store.compact().unwrap();
assert!(
compact_result.epoch > del_result.epoch,
"epoch should advance after compact"
);
store.close().unwrap();
}
// --------------------------------------------------------------------------
// 12. Dimension mismatch rejected
// --------------------------------------------------------------------------
#[test]
fn lifecycle_dimension_mismatch_rejected() {
    let dir = TempDir::new().unwrap();
    let path = dir.path().join("dim_mismatch.rvf");
    let dim: u16 = 8;
    let mut store = RvfStore::create(&path, make_options(dim)).unwrap();
    // A vector of the configured dimension is accepted.
    let good = vec![1.0f32; dim as usize];
    let outcome = store.ingest_batch(&[good.as_slice()], &[1], None).unwrap();
    assert_eq!(outcome.accepted, 1);
    // A 4-dim vector against an 8-dim store is counted as rejected, not an error.
    let bad = vec![1.0f32; 4]; // dim=4 when store expects dim=8
    let outcome = store.ingest_batch(&[bad.as_slice()], &[2], None).unwrap();
    assert_eq!(
        outcome.accepted, 0,
        "wrong-dimension vector should be rejected"
    );
    assert_eq!(outcome.rejected, 1);
    // Querying with the wrong dimension is an outright error.
    let bad_query = vec![1.0f32; 4];
    let outcome = store.query(&bad_query, 5, &QueryOptions::default());
    assert!(outcome.is_err(), "query with wrong dimension should fail");
    store.close().unwrap();
}

View File

@@ -0,0 +1,391 @@
//! Wire format interoperability end-to-end tests.
//!
//! Verifies that the wire format is correctly round-trippable between
//! rvf-wire (low-level segment I/O) and rvf-runtime (high-level store API).
//! Tests forward compatibility with unknown segment types, mixed compression
//! flags, and cross-layer interop.
use rvf_runtime::options::{DistanceMetric, RvfOptions};
use rvf_runtime::RvfStore;
use rvf_types::{
SegmentFlags, SegmentType, SEGMENT_ALIGNMENT, SEGMENT_HEADER_SIZE, SEGMENT_MAGIC,
SEGMENT_VERSION,
};
use rvf_wire::{
find_latest_manifest, read_segment, read_segment_header, validate_segment, write_segment,
};
use std::fs;
use tempfile::TempDir;
// --------------------------------------------------------------------------
// 1. Create RVF file manually with rvf-wire, read with rvf-wire
// --------------------------------------------------------------------------
#[test]
fn interop_manual_wire_round_trip() {
    let mut file = Vec::new();
    // Append 5 VEC_SEGs with distinct deterministic payloads, capturing the
    // byte offset at which each one starts.
    let offsets: Vec<usize> = (0..5u64)
        .map(|i| {
            let payload: Vec<u8> = (0..256)
                .map(|b| (i as u8).wrapping_mul(37).wrapping_add(b as u8))
                .collect();
            let start = file.len();
            file.extend_from_slice(&write_segment(
                SegmentType::Vec as u8,
                &payload,
                SegmentFlags::empty(),
                i,
            ));
            start
        })
        .collect();
    // A manifest segment goes last so the tail scan can locate it.
    let manifest_payload = b"manifest data with segment directory";
    let manifest_offset = file.len();
    file.extend_from_slice(&write_segment(
        SegmentType::Manifest as u8,
        manifest_payload,
        SegmentFlags::empty(),
        100,
    ));
    // Every VEC_SEG decodes back with its original id, type, size, and hash.
    for (i, &offset) in offsets.iter().enumerate() {
        let (header, payload) = read_segment(&file[offset..]).unwrap();
        assert_eq!(header.segment_id, i as u64);
        assert_eq!(header.seg_type, SegmentType::Vec as u8);
        assert_eq!(payload.len(), 256);
        validate_segment(&header, payload).unwrap();
    }
    // The tail scan finds exactly the manifest we wrote.
    let (found_offset, manifest_header) = find_latest_manifest(&file).unwrap();
    assert_eq!(found_offset, manifest_offset);
    assert_eq!(manifest_header.segment_id, 100);
    assert_eq!(manifest_header.seg_type, SegmentType::Manifest as u8);
}
// --------------------------------------------------------------------------
// 2. Verify all segment headers, hashes, alignment
// --------------------------------------------------------------------------
#[test]
fn interop_all_segments_valid_headers_hashes_alignment() {
    let segment_types = [
        (SegmentType::Vec as u8, "VEC"),
        (SegmentType::Index as u8, "INDEX"),
        (SegmentType::Quant as u8, "QUANT"),
        (SegmentType::Journal as u8, "JOURNAL"),
        (SegmentType::Manifest as u8, "MANIFEST"),
        (SegmentType::Meta as u8, "META"),
        (SegmentType::Hot as u8, "HOT"),
    ];
    // Encode one segment of every known type with deliberately odd payload
    // sizes so the writer has to pad each one.
    let mut file = Vec::new();
    let mut offsets = Vec::new();
    for (i, (ty, name)) in segment_types.iter().enumerate() {
        let payload_size = 50 + i * 31; // Various non-aligned sizes.
        let payload: Vec<u8> = (0..payload_size).map(|b| (b * 7 + i) as u8).collect();
        offsets.push(file.len());
        let seg = write_segment(*ty, &payload, SegmentFlags::empty(), i as u64);
        // Each segment must be 64-byte aligned.
        assert_eq!(
            seg.len() % SEGMENT_ALIGNMENT,
            0,
            "segment type {} not 64-byte aligned",
            name
        );
        file.extend_from_slice(&seg);
    }
    // Decode everything back: header fields, payload hash, and start offsets
    // must all check out.
    for (i, (&offset, (expected_type, name))) in
        offsets.iter().zip(segment_types.iter()).enumerate()
    {
        let (header, payload) = read_segment(&file[offset..]).unwrap();
        assert_eq!(header.magic, SEGMENT_MAGIC, "segment {i}: bad magic");
        assert_eq!(header.version, SEGMENT_VERSION, "segment {i}: bad version");
        assert_eq!(header.seg_type, *expected_type, "segment {i}: wrong type");
        assert_eq!(header.segment_id, i as u64, "segment {i}: wrong ID");
        validate_segment(&header, payload)
            .unwrap_or_else(|e| panic!("segment {i} ({}): hash failed: {e:?}", name));
        assert_eq!(
            offset % SEGMENT_ALIGNMENT,
            0,
            "segment {i} starts at non-aligned offset {offset}"
        );
    }
}
// --------------------------------------------------------------------------
// 3. Forward compatibility: unknown segment type is safely skipped
// --------------------------------------------------------------------------
#[test]
fn interop_unknown_segment_type_skipped() {
    let mut file = Vec::new();
    let vec_payload = b"known vector data";
    // Known VEC_SEG first.
    let vec_offset = file.len();
    file.extend_from_slice(&write_segment(
        SegmentType::Vec as u8,
        vec_payload,
        SegmentFlags::empty(),
        1,
    ));
    // Two hypothetical future segment types a v1 reader has never seen.
    let unknown_offset = file.len();
    file.extend_from_slice(&write_segment(
        0xFE,
        b"hypothetical v2 extension data",
        SegmentFlags::empty(),
        2,
    ));
    file.extend_from_slice(&write_segment(
        0xFD,
        b"another future extension",
        SegmentFlags::empty(),
        3,
    ));
    // Known MANIFEST_SEG last.
    let manifest_offset = file.len();
    file.extend_from_slice(&write_segment(
        SegmentType::Manifest as u8,
        b"manifest payload",
        SegmentFlags::empty(),
        10,
    ));
    // An unknown segment remains structurally readable and hash-valid.
    let (unknown_hdr, unknown_pay) = read_segment(&file[unknown_offset..]).unwrap();
    assert_eq!(unknown_hdr.seg_type, 0xFE);
    validate_segment(&unknown_hdr, unknown_pay).unwrap();
    // The known segments are unaffected by the foreign types around them.
    let (vec_hdr, vec_pay) = read_segment(&file[vec_offset..]).unwrap();
    assert_eq!(vec_hdr.seg_type, SegmentType::Vec as u8);
    assert_eq!(vec_pay, vec_payload);
    // The manifest tail scan still succeeds.
    let (found_offset, mani_hdr) = find_latest_manifest(&file).unwrap();
    assert_eq!(found_offset, manifest_offset);
    assert_eq!(mani_hdr.segment_id, 10);
}
// --------------------------------------------------------------------------
// 4. Mixed compression flags: some compressed, some not
// --------------------------------------------------------------------------
#[test]
fn interop_mixed_compression_flags() {
    let payloads: Vec<(&[u8], SegmentFlags)> = vec![
        (b"uncompressed data", SegmentFlags::empty()),
        (
            b"compressed data marker",
            SegmentFlags::empty().with(SegmentFlags::COMPRESSED),
        ),
        (b"plain data", SegmentFlags::empty()),
        (
            b"sealed compressed",
            SegmentFlags::empty()
                .with(SegmentFlags::COMPRESSED)
                .with(SegmentFlags::SEALED),
        ),
        (b"hot data", SegmentFlags::empty().with(SegmentFlags::HOT)),
    ];
    // Encode every payload with its flag set, remembering where each landed.
    let mut file = Vec::new();
    let offsets: Vec<usize> = payloads
        .iter()
        .enumerate()
        .map(|(i, (payload, flags))| {
            let at = file.len();
            file.extend_from_slice(&write_segment(
                SegmentType::Vec as u8,
                payload,
                *flags,
                i as u64,
            ));
            at
        })
        .collect();
    // Decode each segment back and confirm the flag bits survived unchanged.
    for (i, &offset) in offsets.iter().enumerate() {
        let (header, payload) = read_segment(&file[offset..]).unwrap();
        let expected_flags = payloads[i].1;
        if expected_flags.contains(SegmentFlags::COMPRESSED) {
            assert!(
                header.flags & SegmentFlags::COMPRESSED != 0,
                "segment {i}: COMPRESSED flag should be set"
            );
        }
        if expected_flags.contains(SegmentFlags::SEALED) {
            assert!(
                header.flags & SegmentFlags::SEALED != 0,
                "segment {i}: SEALED flag should be set"
            );
        }
        if expected_flags.contains(SegmentFlags::HOT) {
            assert!(
                header.flags & SegmentFlags::HOT != 0,
                "segment {i}: HOT flag should be set"
            );
        }
        // The payload itself is readable no matter which flags are set.
        assert_eq!(payload, payloads[i].0);
        validate_segment(&header, payload).unwrap();
    }
}
// --------------------------------------------------------------------------
// 5. Create file with runtime, verify structure with rvf-wire
// --------------------------------------------------------------------------
#[test]
fn interop_runtime_write_wire_read() {
    let dir = TempDir::new().unwrap();
    let path = dir.path().join("rt_to_wire.rvf");
    let dim: u16 = 4;
    // Create using rvf-runtime.
    {
        let mut store = RvfStore::create(
            &path,
            RvfOptions {
                dimension: dim,
                metric: DistanceMetric::L2,
                ..Default::default()
            },
        )
        .unwrap();
        let v1 = vec![1.0f32, 2.0, 3.0, 4.0];
        let v2 = vec![5.0f32, 6.0, 7.0, 8.0];
        store
            .ingest_batch(&[v1.as_slice(), v2.as_slice()], &[10, 20], None)
            .unwrap();
        store.close().unwrap();
    }
    // Read the raw file and verify structure with rvf-wire.
    let file_bytes = fs::read(&path).unwrap();
    // The file should contain valid segments.
    assert!(
        file_bytes.len() >= SEGMENT_HEADER_SIZE,
        "file should contain at least one segment header"
    );
    // Scan for segments by walking byte-by-byte looking for RVFS magic.
    // The runtime's SegmentWriter uses its own layout (header + payload,
    // not necessarily 64-byte padded), so we scan for magic + version.
    let mut segments_found = 0u32;
    let mut manifest_found = false;
    let mut vec_seg_found = false;
    let mut offset = 0;
    while offset + SEGMENT_HEADER_SIZE <= file_bytes.len() {
        // Check for RVFS magic at this offset.
        // The magic is stored little-endian in the first 4 bytes of a header;
        // byte 4 is the format version.
        let magic = u32::from_le_bytes([
            file_bytes[offset],
            file_bytes[offset + 1],
            file_bytes[offset + 2],
            file_bytes[offset + 3],
        ]);
        let version = file_bytes[offset + 4];
        if magic == SEGMENT_MAGIC && version == SEGMENT_VERSION {
            if let Ok(header) = read_segment_header(&file_bytes[offset..]) {
                segments_found += 1;
                match header.seg_type {
                    t if t == SegmentType::Vec as u8 => vec_seg_found = true,
                    t if t == SegmentType::Manifest as u8 => manifest_found = true,
                    _ => {}
                }
                // Move past header + payload.
                // .max(1) is defensive: it guarantees forward progress even if
                // the computed segment size were ever zero.
                let seg_size = SEGMENT_HEADER_SIZE + header.payload_length as usize;
                offset += seg_size.max(1);
                continue;
            }
        }
        // No segment header here: advance one byte and keep scanning.
        offset += 1;
    }
    assert!(vec_seg_found, "should find at least one VEC_SEG");
    assert!(manifest_found, "should find at least one MANIFEST_SEG");
    assert!(
        segments_found >= 2,
        "should find at least 2 segments (got {segments_found})"
    );
}
// --------------------------------------------------------------------------
// 6. All flag combinations preserved through round-trip
// --------------------------------------------------------------------------
#[test]
fn interop_flag_combinations_round_trip() {
    // Every single flag, plus a couple of multi-flag combinations.
    let mut flag_combos: Vec<SegmentFlags> = vec![SegmentFlags::empty()];
    for flag in [
        SegmentFlags::COMPRESSED,
        SegmentFlags::ENCRYPTED,
        SegmentFlags::SIGNED,
        SegmentFlags::SEALED,
        SegmentFlags::PARTIAL,
        SegmentFlags::TOMBSTONE,
        SegmentFlags::HOT,
        SegmentFlags::OVERLAY,
        SegmentFlags::SNAPSHOT,
        SegmentFlags::CHECKPOINT,
    ] {
        flag_combos.push(SegmentFlags::empty().with(flag));
    }
    // Combined flags.
    flag_combos.push(
        SegmentFlags::empty()
            .with(SegmentFlags::COMPRESSED)
            .with(SegmentFlags::SEALED)
            .with(SegmentFlags::HOT),
    );
    flag_combos.push(
        SegmentFlags::empty()
            .with(SegmentFlags::ENCRYPTED)
            .with(SegmentFlags::SIGNED)
            .with(SegmentFlags::CHECKPOINT),
    );
    // Each combination must survive the encode/decode cycle bit-for-bit.
    for (i, flags) in flag_combos.iter().enumerate() {
        let payload = format!("payload for flag combo {i}");
        let encoded = write_segment(SegmentType::Vec as u8, payload.as_bytes(), *flags, i as u64);
        let (header, decoded_payload) = read_segment(&encoded).unwrap();
        assert_eq!(
            SegmentFlags::from_raw(header.flags).bits(),
            flags.bits(),
            "flag combo {i}: flags not preserved"
        );
        assert_eq!(decoded_payload, payload.as_bytes());
        validate_segment(&header, decoded_payload).unwrap();
    }
}
// --------------------------------------------------------------------------
// 7. Large payload round-trip preserves all bytes
// --------------------------------------------------------------------------
#[test]
fn interop_large_payload_byte_exact() {
    // A 100 KB payload, filled with a byte pattern of period 251.
    let payload_len = 100_000;
    let original: Vec<u8> = (0..payload_len).map(|i| (i % 251) as u8).collect();
    let encoded = write_segment(SegmentType::Vec as u8, &original, SegmentFlags::empty(), 42);
    let (header, round_tripped) = read_segment(&encoded).unwrap();
    // The header advertises the exact payload size.
    assert_eq!(header.payload_length, payload_len as u64);
    assert_eq!(round_tripped.len(), payload_len);
    // Every byte survives the round trip.
    assert_eq!(
        round_tripped,
        &original[..],
        "large payload should be byte-identical"
    );
    validate_segment(&header, round_tripped).unwrap();
    // The encoded segment must still end on a 64-byte boundary.
    assert_eq!(encoded.len() % SEGMENT_ALIGNMENT, 0);
}

View File

@@ -0,0 +1,111 @@
//! Integration test: .rvdna extension → Rvdna profile; .rvf → Generic.
//!
//! Verifies from_extension() / extension() round-trip for all profiles.
use rvf_runtime::options::DistanceMetric;
use rvf_runtime::{RvfOptions, RvfStore};
use rvf_types::DomainProfile;
use tempfile::TempDir;
#[test]
fn extension_round_trip_all_profiles() {
    // profile → extension → profile must be the identity for every profile.
    let table = [
        (DomainProfile::Generic, "rvf"),
        (DomainProfile::Rvdna, "rvdna"),
        (DomainProfile::RvText, "rvtext"),
        (DomainProfile::RvGraph, "rvgraph"),
        (DomainProfile::RvVision, "rvvis"),
    ];
    for (profile, ext) in table {
        assert_eq!(profile.extension(), ext, "extension mismatch for {profile:?}");
        let back = DomainProfile::from_extension(ext).unwrap();
        assert_eq!(back, profile, "from_extension round-trip failed for {ext}");
    }
}
#[test]
fn extension_case_insensitive() {
    // Extension lookup must ignore letter case.
    let cases = [
        ("RVDNA", DomainProfile::Rvdna),
        ("Rvf", DomainProfile::Generic),
        ("RVTEXT", DomainProfile::RvText),
        ("RvGraph", DomainProfile::RvGraph),
        ("RVVIS", DomainProfile::RvVision),
    ];
    for (ext, expected) in cases {
        assert_eq!(DomainProfile::from_extension(ext), Some(expected));
    }
}
#[test]
fn unknown_extension_returns_none() {
    // Extensions outside the known profile set (including empty and
    // near-miss strings) map to None.
    for ext in ["txt", "bin", "", "rvf2"] {
        assert_eq!(DomainProfile::from_extension(ext), None);
    }
}
#[test]
fn rvdna_file_creates_successfully() {
    let dir = TempDir::new().unwrap();
    let path = dir.path().join("test.rvdna");
    // Creating a store under the .rvdna extension works like any other.
    let store = RvfStore::create(
        &path,
        RvfOptions {
            dimension: 4,
            metric: DistanceMetric::L2,
            ..Default::default()
        },
    )
    .unwrap();
    // A freshly created store gets a non-zero file id.
    assert_ne!(*store.file_id(), [0u8; 16]);
    store.close().unwrap();
    // Reopen and verify it works
    let store = RvfStore::open(&path).unwrap();
    let query = vec![1.0, 0.0, 0.0, 0.0];
    let results = store
        .query(&query, 1, &rvf_runtime::QueryOptions::default())
        .unwrap();
    // The store is empty, so the query returns nothing.
    assert!(results.is_empty());
    store.close().unwrap();
}
#[test]
fn derive_parent_rvf_to_child_rvdna() {
    let dir = TempDir::new().unwrap();
    let parent_path = dir.path().join("parent.rvf");
    let child_path = dir.path().join("child.rvdna");
    let parent = RvfStore::create(
        &parent_path,
        RvfOptions {
            dimension: 4,
            metric: DistanceMetric::L2,
            ..Default::default()
        },
    )
    .unwrap();
    // Derivation works across extensions: .rvf parent → .rvdna child.
    let child = parent
        .derive(&child_path, rvf_types::DerivationType::Clone, None)
        .unwrap();
    // Child should have parent linkage
    assert_eq!(child.parent_id(), parent.file_id());
    assert_eq!(child.lineage_depth(), 1);
    child.close().unwrap();
    parent.close().unwrap();
}

View File

@@ -0,0 +1,151 @@
//! Integration test: FileIdentity write → read round-trip via Level0Root.
//!
//! Tests the Level0Root codec's FileIdentity read/write in the reserved area,
//! backward compatibility (zeros parse as valid root), and the type itself.
use rvf_manifest::{read_level0, write_level0};
use rvf_types::{FileIdentity, Level0Root};
#[test]
fn file_identity_write_read_round_trip() {
    // Stash a fully-populated FileIdentity in the Level0Root reserved area.
    let fi = FileIdentity {
        file_id: [0xAA; 16],
        parent_id: [0xBB; 16],
        parent_hash: [0xCC; 32],
        lineage_depth: 3,
    };
    let mut root = Level0Root::zeroed();
    root.version = 1;
    root.dimension = 128;
    root.reserved[..68].copy_from_slice(&fi.to_bytes());
    // Encode the root and decode it again.
    let decoded = read_level0(&write_level0(&root)).unwrap();
    // The identity must come back field-for-field.
    let decoded_fi = FileIdentity::from_bytes(decoded.reserved[..68].try_into().unwrap());
    assert_eq!(decoded_fi, fi);
    assert_eq!(decoded_fi.file_id, [0xAA; 16]);
    assert_eq!(decoded_fi.parent_id, [0xBB; 16]);
    assert_eq!(decoded_fi.parent_hash, [0xCC; 32]);
    assert_eq!(decoded_fi.lineage_depth, 3);
}
#[test]
fn zeroed_reserved_parses_as_root_identity() {
    // An untouched (all-zero) reserved area must decode as the root identity.
    let bytes = write_level0(&Level0Root::zeroed());
    let decoded = read_level0(&bytes).unwrap();
    let fi = FileIdentity::from_bytes(decoded.reserved[..68].try_into().unwrap());
    assert!(fi.is_root());
    assert_eq!(fi.lineage_depth, 0);
    assert_eq!(fi.file_id, [0u8; 16]);
    assert_eq!(fi.parent_id, [0u8; 16]);
    assert_eq!(fi.parent_hash, [0u8; 32]);
}
#[test]
fn backward_compat_old_files_still_work() {
    // An old-format file carries no lineage data: its reserved area is zeros.
    let legacy = write_level0(&Level0Root::zeroed());
    // Decoding must still succeed...
    let decoded = read_level0(&legacy).unwrap();
    assert_eq!(decoded.magic, rvf_types::ROOT_MANIFEST_MAGIC);
    // ...and the zeroed identity reads back as a valid root.
    let fi = FileIdentity::from_bytes(decoded.reserved[..68].try_into().unwrap());
    assert!(fi.is_root());
}
#[test]
fn file_identity_type_assertions() {
    // 16 (file_id) + 16 (parent_id) + 32 (parent_hash) + 4 (depth) = 68 bytes.
    // Compile-time verified elsewhere, but checked at runtime here too.
    assert_eq!(core::mem::size_of::<FileIdentity>(), 68);
    assert!(68 <= 252, "FileIdentity must fit in Level0Root reserved area");
}
#[test]
fn file_identity_to_bytes_from_bytes_round_trip() {
    // Zeroed, root-only, and max-depth identities must all survive encoding.
    let cases = [
        FileIdentity::zeroed(),
        FileIdentity::new_root([0xFF; 16]),
        FileIdentity {
            file_id: [1; 16],
            parent_id: [2; 16],
            parent_hash: [3; 32],
            lineage_depth: u32::MAX,
        },
    ];
    for fi in &cases {
        let decoded = FileIdentity::from_bytes(&fi.to_bytes());
        assert_eq!(&decoded, fi);
    }
}
#[test]
fn root_identity_detection() {
    // A root has an all-zero parent AND depth 0.
    assert!(FileIdentity::new_root([0x42; 16]).is_root());
    // A non-zero parent_id disqualifies it...
    let child = FileIdentity {
        file_id: [1; 16],
        parent_id: [2; 16],
        parent_hash: [3; 32],
        lineage_depth: 1,
    };
    assert!(!child.is_root());
    // ...as does a non-zero depth, even when the parent_id is all zeros.
    let weird = FileIdentity {
        file_id: [1; 16],
        parent_id: [0; 16],
        parent_hash: [0; 32],
        lineage_depth: 5,
    };
    assert!(!weird.is_root());
}
#[test]
fn level0_root_preserves_other_fields_with_identity() {
    let fi = FileIdentity {
        file_id: [0x11; 16],
        parent_id: [0x22; 16],
        parent_hash: [0x33; 32],
        lineage_depth: 7,
    };
    let mut root = Level0Root::zeroed();
    root.version = 1;
    root.flags = 0x0804; // SIGNED + HAS_LINEAGE
    root.total_vector_count = 1_000_000;
    root.dimension = 384;
    root.epoch = 42;
    root.reserved[..68].copy_from_slice(&fi.to_bytes());
    let decoded = read_level0(&write_level0(&root)).unwrap();
    // Writing an identity must not clobber any other root field.
    assert_eq!(decoded.version, 1);
    assert_eq!(decoded.flags, 0x0804);
    assert_eq!(decoded.total_vector_count, 1_000_000);
    assert_eq!(decoded.dimension, 384);
    assert_eq!(decoded.epoch, 42);
    // And the identity itself comes back intact.
    let decoded_fi = FileIdentity::from_bytes(decoded.reserved[..68].try_into().unwrap());
    assert_eq!(decoded_fi, fi);
}

View File

@@ -0,0 +1,371 @@
//! Integration tests for MembershipFilter with HNSW-like traversal semantics.
//!
//! Tests include/exclude modes, bitmap operations, serialization round-trips,
//! and edge cases around word boundaries and empty filters.
use rvf_runtime::MembershipFilter;
use rvf_types::membership::{FilterMode, MembershipHeader, MEMBERSHIP_MAGIC};
// ===========================================================================
// TEST 1: include_mode_empty_filter_is_empty_view
// ===========================================================================
/// An empty include-mode filter means nothing is visible (fail-safe).
#[test]
fn include_mode_empty_filter_is_empty_view() {
    // Fail-safe semantics: an include filter with no members shows nothing.
    let filter = MembershipFilter::new_include(1000);
    for id in 0..1000 {
        assert!(
            !filter.contains(id),
            "empty include filter should not contain vector {id}"
        );
    }
    assert_eq!(filter.mode(), FilterMode::Include);
    assert_eq!(filter.vector_count(), 1000);
    assert_eq!(filter.member_count(), 0);
    println!("PASS: include_mode_empty_filter_is_empty_view");
}
// ===========================================================================
// TEST 2: include_mode_subset
// ===========================================================================
/// Add a subset of vector IDs to an include-mode filter, verify membership.
#[test]
fn include_mode_subset() {
    let mut filter = MembershipFilter::new_include(500);
    // Admit a handful of IDs, including both ends of the range.
    let included_ids = [0u64, 10, 50, 100, 200, 499];
    for &id in &included_ids {
        filter.add(id);
    }
    // Members are visible...
    for &id in &included_ids {
        assert!(filter.contains(id), "filter should contain {id}");
    }
    // ...while the immediate neighbours of each member stay hidden.
    let excluded_ids = [1u64, 9, 11, 49, 51, 99, 101, 199, 201, 498];
    for &id in &excluded_ids {
        assert!(!filter.contains(id), "filter should not contain {id}");
    }
    assert_eq!(filter.member_count(), included_ids.len() as u64);
    println!("PASS: include_mode_subset");
}
// ===========================================================================
// TEST 3: exclude_mode_basics
// ===========================================================================
/// In exclude mode, all vectors are visible by default; adding an ID
/// to the bitmap excludes it.
#[test]
fn exclude_mode_basics() {
    let mut filter = MembershipFilter::new_exclude(100);
    // Exclude mode starts fully visible.
    for id in 0..100 {
        assert!(
            filter.contains(id),
            "exclude filter should contain {id} initially"
        );
    }
    // Marking an ID in the bitmap removes it from view.
    for id in [10u64, 50, 90] {
        filter.add(id);
    }
    assert!(!filter.contains(10), "vector 10 should be excluded");
    assert!(!filter.contains(50), "vector 50 should be excluded");
    assert!(!filter.contains(90), "vector 90 should be excluded");
    assert!(filter.contains(0), "vector 0 should still be visible");
    assert!(filter.contains(49), "vector 49 should still be visible");
    assert!(filter.contains(99), "vector 99 should still be visible");
    assert_eq!(filter.mode(), FilterMode::Exclude);
    assert_eq!(filter.member_count(), 3); // 3 bits set = 3 excluded
    println!("PASS: exclude_mode_basics");
}
// ===========================================================================
// TEST 4: add_remove_roundtrip
// ===========================================================================
/// Adding then removing a vector should restore the original state.
#[test]
fn add_remove_roundtrip() {
    let mut filter = MembershipFilter::new_include(64);
    // add makes the ID a member...
    filter.add(10);
    assert!(filter.contains(10));
    assert_eq!(filter.member_count(), 1);
    // ...and remove restores the pre-add state exactly.
    filter.remove(10);
    assert!(!filter.contains(10));
    assert_eq!(filter.member_count(), 0);
    // Double remove should be a no-op
    filter.remove(10);
    assert_eq!(filter.member_count(), 0);
    // Double add should not double-count
    filter.add(20);
    filter.add(20);
    assert_eq!(filter.member_count(), 1);
    println!("PASS: add_remove_roundtrip");
}
// ===========================================================================
// TEST 5: out_of_bounds_ignored
// ===========================================================================
/// Adding a vector ID beyond vector_count should be silently ignored.
#[test]
fn out_of_bounds_ignored() {
    let mut filter = MembershipFilter::new_include(10);
    // IDs >= vector_count are silently dropped: neither counted nor stored.
    filter.add(100); // way out of bounds
    assert_eq!(filter.member_count(), 0);
    assert!(!filter.contains(100));
    filter.add(10); // at boundary (0-indexed, so 10 is out of range for count=10)
    assert_eq!(filter.member_count(), 0);
    filter.add(9); // last valid
    assert_eq!(filter.member_count(), 1);
    assert!(filter.contains(9));
    println!("PASS: out_of_bounds_ignored");
}
// ===========================================================================
// TEST 6: bitmap_word_boundaries
// ===========================================================================
/// Test vectors at the 64-bit word boundaries (0, 63, 64, 127, 128, etc.).
#[test]
fn bitmap_word_boundaries() {
    // IDs sitting exactly on, or directly adjacent to, 64-bit word edges.
    let boundary_ids = [0u64, 1, 62, 63, 64, 65, 126, 127, 128, 129, 191, 192, 255];
    let non_boundary = [2u64, 61, 66, 125, 130, 190, 193, 254];
    let mut filter = MembershipFilter::new_include(256);
    for &id in &boundary_ids {
        filter.add(id);
    }
    for &id in &boundary_ids {
        assert!(filter.contains(id), "boundary ID {id} should be in filter");
    }
    // Neighbours of boundary IDs must not leak in via bit arithmetic errors.
    for &id in &non_boundary {
        assert!(
            !filter.contains(id),
            "non-boundary ID {id} should NOT be in filter"
        );
    }
    assert_eq!(filter.member_count(), boundary_ids.len() as u64);
    println!("PASS: bitmap_word_boundaries");
}
// ===========================================================================
// TEST 7: serialization_round_trip_include
// ===========================================================================
/// Serialize an include-mode filter to bytes, reconstruct it, and verify
/// all membership is preserved.
#[test]
fn serialization_round_trip_include() {
    // Populate a filter with IDs straddling word boundaries, bump the
    // generation twice, then round-trip through header + bitmap bytes.
    let test_ids = [0u64, 1, 63, 64, 127, 128, 199, 250, 299];
    let mut filter = MembershipFilter::new_include(300);
    for &id in &test_ids {
        filter.add(id);
    }
    filter.bump_generation();
    filter.bump_generation();
    let header = filter.to_header();
    let bitmap_data = filter.serialize();
    // The header reflects the filter's state.
    assert_eq!(header.magic, MEMBERSHIP_MAGIC);
    assert_eq!(header.version, 1);
    assert_eq!(header.filter_mode, FilterMode::Include as u8);
    assert_eq!(header.vector_count, 300);
    assert_eq!(header.member_count, test_ids.len() as u64);
    assert_eq!(header.generation_id, 2);
    // The reconstructed filter matches the original in every observable way.
    let filter2 = MembershipFilter::deserialize(&bitmap_data, &header).unwrap();
    assert_eq!(filter2.mode(), FilterMode::Include);
    assert_eq!(filter2.vector_count(), 300);
    assert_eq!(filter2.member_count(), test_ids.len() as u64);
    assert_eq!(filter2.generation_id(), 2);
    for &id in &test_ids {
        assert!(
            filter2.contains(id),
            "deserialized filter should contain {id}"
        );
    }
    // Non-members remain out after the round trip.
    for id in [2u64, 100, 200] {
        assert!(!filter2.contains(id));
    }
    println!("PASS: serialization_round_trip_include");
}
// ===========================================================================
// TEST 8: serialization_round_trip_exclude
// ===========================================================================
/// Serialize an exclude-mode filter and verify round-trip.
#[test]
fn serialization_round_trip_exclude() {
    // Exclude vectors 10 and 100, then round-trip the filter.
    let mut filter = MembershipFilter::new_exclude(200);
    for id in [10u64, 100] {
        filter.add(id);
    }
    let header = filter.to_header();
    let bitmap_data = filter.serialize();
    let filter2 = MembershipFilter::deserialize(&bitmap_data, &header).unwrap();
    assert_eq!(filter2.mode(), FilterMode::Exclude);
    assert_eq!(filter2.vector_count(), 200);
    assert_eq!(filter2.member_count(), 2);
    // In exclude mode a set bit means "hidden", everything else is visible.
    assert!(!filter2.contains(10), "vector 10 should be excluded");
    assert!(!filter2.contains(100), "vector 100 should be excluded");
    assert!(filter2.contains(0), "vector 0 should be visible");
    assert!(filter2.contains(50), "vector 50 should be visible");
    assert!(filter2.contains(199), "vector 199 should be visible");
    println!("PASS: serialization_round_trip_exclude");
}
// ===========================================================================
// TEST 9: generation_id_tracking
// ===========================================================================
/// Verify that generation_id increments correctly and survives serialization.
#[test]
fn generation_id_tracking() {
    let mut filter = MembershipFilter::new_include(64);
    // A fresh filter starts at generation 0.
    assert_eq!(filter.generation_id(), 0);
    // Each bump increments the generation by exactly one.
    filter.bump_generation();
    assert_eq!(filter.generation_id(), 1);
    filter.bump_generation();
    filter.bump_generation();
    assert_eq!(filter.generation_id(), 3);
    // Serialize and verify generation survives
    let header = filter.to_header();
    let bitmap_data = filter.serialize();
    let filter2 = MembershipFilter::deserialize(&bitmap_data, &header).unwrap();
    assert_eq!(filter2.generation_id(), 3);
    println!("PASS: generation_id_tracking");
}
// ===========================================================================
// TEST 10: large_filter_stress
// ===========================================================================
/// Stress test with a large number of vectors to verify bitmap correctness.
#[test]
fn large_filter_stress() {
    let total = 10_000u64;
    let mut filter = MembershipFilter::new_include(total);
    // Mark every third vector as a member.
    let mut expected_count = 0u64;
    for id in (0..total).step_by(3) {
        filter.add(id);
        expected_count += 1;
    }
    assert_eq!(filter.member_count(), expected_count);
    // Full sweep: exactly the multiples of 3 are members.
    for id in 0..total {
        let expected = id % 3 == 0;
        assert_eq!(
            filter.contains(id),
            expected,
            "vector {id}: expected contains={expected}"
        );
    }
    // Round-trip through serialization and spot-check both ends of the range.
    let header = filter.to_header();
    let bitmap_data = filter.serialize();
    let restored = MembershipFilter::deserialize(&bitmap_data, &header).unwrap();
    assert_eq!(restored.member_count(), expected_count);
    for (id, expect_in) in [
        (0u64, true),
        (1, false),
        (2, false),
        (3, true),
        (9999, true),
        (9998, false),
    ] {
        assert_eq!(restored.contains(id), expect_in);
    }
    println!("PASS: large_filter_stress");
}
// ===========================================================================
// TEST 11: membership_header_round_trip
// ===========================================================================
/// Test that MembershipHeader serializes and deserializes correctly.
#[test]
fn membership_header_round_trip() {
    // Build a fully-populated header, encode it, and decode it back.
    let header = MembershipHeader {
        magic: MEMBERSHIP_MAGIC,
        version: 1,
        filter_type: 0, // Bitmap
        filter_mode: FilterMode::Include as u8,
        vector_count: 100_000,
        member_count: 50_000,
        filter_offset: 96,
        filter_size: 12_500,
        generation_id: 7,
        filter_hash: [0xAB; 32],
        bloom_offset: 0,
        bloom_size: 0,
        _reserved: 0,
        _reserved2: [0u8; 8],
    };
    let decoded = MembershipHeader::from_bytes(&header.to_bytes()).unwrap();
    // Every meaningful field survives the byte round trip.
    assert_eq!(decoded.magic, MEMBERSHIP_MAGIC);
    assert_eq!(decoded.version, 1);
    assert_eq!(decoded.filter_mode, FilterMode::Include as u8);
    assert_eq!(decoded.vector_count, 100_000);
    assert_eq!(decoded.member_count, 50_000);
    assert_eq!(decoded.filter_size, 12_500);
    assert_eq!(decoded.generation_id, 7);
    assert_eq!(decoded.filter_hash, [0xAB; 32]);
    println!("PASS: membership_header_round_trip");
}

View File

@@ -0,0 +1,158 @@
//! Index recall integration tests.
//!
//! Tests the rvf-index HNSW graph to verify recall@K targets.
use rvf_index::distance::{cosine_distance, dot_product, l2_distance};
use rvf_index::hnsw::{HnswConfig, HnswGraph};
use rvf_index::traits::InMemoryVectorStore;
/// Generate `n` pseudo-random vectors of dimension `dim` using a simple LCG.
///
/// The generator state advances once per component (Knuth's 64-bit LCG
/// multiplier), so the output is fully deterministic for a given `seed`;
/// each component lies in [0, 1).
fn random_vectors(n: usize, dim: usize, seed: u64) -> Vec<Vec<f32>> {
    let mut state = seed;
    let mut next = move || {
        state = state.wrapping_mul(6364136223846793005).wrapping_add(1);
        (state >> 33) as f32 / (1u64 << 31) as f32
    };
    (0..n)
        .map(|_| (0..dim).map(|_| next()).collect())
        .collect()
}
/// Brute-force k-NN for ground truth (using squared L2).
///
/// Returns the ids of the `k` nearest vectors to `query`, nearest first.
/// Uses `f32::total_cmp`, a total order on floats, so a NaN distance
/// (e.g. from a degenerate vector) cannot panic the comparator the way
/// `partial_cmp().unwrap()` would; NaN simply sorts last.
fn brute_force_knn(query: &[f32], vectors: &[Vec<f32>], k: usize) -> Vec<u64> {
    let mut distances: Vec<(u64, f32)> = vectors
        .iter()
        .enumerate()
        .map(|(i, v)| (i as u64, l2_distance(query, v)))
        .collect();
    // sort_unstable is fine: ties between equal distances carry no meaning here.
    distances.sort_unstable_by(|a, b| a.1.total_cmp(&b.1));
    distances.iter().take(k).map(|(i, _)| *i).collect()
}
/// Calculate recall@K: the fraction of ground-truth ids that also appear in
/// the approximate search results.
fn recall_at_k(approx: &[(u64, f32)], exact: &[u64]) -> f64 {
    use std::collections::HashSet;
    let truth: HashSet<u64> = exact.iter().copied().collect();
    let mut hits = 0usize;
    for (id, _dist) in approx {
        if truth.contains(id) {
            hits += 1;
        }
    }
    hits as f64 / exact.len() as f64
}
#[test]
fn hnsw_build_and_query_recall() {
    let dim = 32;
    let n = 1000;
    let k = 10;
    let dataset = random_vectors(n, dim, 42);
    let store = InMemoryVectorStore::new(dataset.clone());
    let config = HnswConfig {
        m: 16,
        m0: 32,
        ef_construction: 200,
    };
    let mut graph = HnswGraph::new(&config);
    // Insert every vector, drawing each node's level from a deterministic LCG.
    let mut level_seed: u64 = 123;
    for id in 0..n as u64 {
        level_seed = level_seed.wrapping_mul(6364136223846793005).wrapping_add(1);
        let level_draw = ((level_seed >> 33) as f64 / (1u64 << 31) as f64).clamp(0.001, 0.999);
        graph.insert(id, level_draw, &store, &l2_distance);
    }
    // Average recall over 50 queries against brute-force ground truth.
    let queries = random_vectors(50, dim, 999);
    let avg_recall = queries
        .iter()
        .map(|q| {
            let approx = graph.search(q, k, 200, &store, &l2_distance);
            let exact = brute_force_knn(q, &dataset, k);
            recall_at_k(&approx, &exact)
        })
        .sum::<f64>()
        / queries.len() as f64;
    assert!(
        avg_recall >= 0.90,
        "HNSW recall@{k} = {avg_recall:.3}, expected >= 0.90"
    );
}
#[test]
fn hnsw_recall_improves_with_ef_search() {
    let dim = 32;
    let n = 500;
    let k = 10;
    let dataset = random_vectors(n, dim, 42);
    let store = InMemoryVectorStore::new(dataset.clone());
    let config = HnswConfig {
        m: 16,
        m0: 32,
        ef_construction: 200,
    };
    let mut graph = HnswGraph::new(&config);
    // Deterministic level draws for each inserted node.
    let mut level_seed: u64 = 77;
    for id in 0..n as u64 {
        level_seed = level_seed.wrapping_mul(6364136223846793005).wrapping_add(1);
        let level_draw = ((level_seed >> 33) as f64 / (1u64 << 31) as f64).clamp(0.001, 0.999);
        graph.insert(id, level_draw, &store, &l2_distance);
    }
    let queries = random_vectors(20, dim, 555);
    // Measure average recall at increasing ef_search budgets.
    let recalls: Vec<f64> = [10, 50, 200]
        .iter()
        .map(|&ef_search| {
            let sum: f64 = queries
                .iter()
                .map(|q| {
                    let approx = graph.search(q, k, ef_search, &store, &l2_distance);
                    recall_at_k(&approx, &brute_force_knn(q, &dataset, k))
                })
                .sum();
            sum / queries.len() as f64
        })
        .collect();
    // Recall should generally increase with higher ef_search.
    for i in 1..recalls.len() {
        assert!(
            recalls[i] >= recalls[i - 1] - 0.05, // tolerance for randomness
            "recall should improve with ef_search: {:?}",
            recalls
        );
    }
}
#[test]
fn distance_functions_are_consistent() {
    let a = vec![1.0, 2.0, 3.0, 4.0];
    let b = vec![5.0, 6.0, 7.0, 8.0];
    // l2_distance returns squared L2 (no sqrt): four components, each off by 4.
    let l2 = l2_distance(&a, &b);
    let expected_sq = 4.0 * 4.0 * 4.0;
    assert!(
        (l2 - expected_sq).abs() < 1e-5,
        "L2 squared distance mismatch: {l2} != {expected_sq}"
    );
    // dot_product returns the negated dot product so smaller means closer.
    let dp = dot_product(&a, &b);
    let expected_dot = -(5.0 + 12.0 + 21.0 + 32.0);
    assert!(
        (dp - expected_dot).abs() < 1e-5,
        "dot product mismatch: {dp} != {expected_dot}"
    );
    // cosine_distance returns 1 - cosine_similarity, which lies in [0, 2].
    let cos = cosine_distance(&a, &b);
    assert!(
        (0.0..=2.0).contains(&cos),
        "cosine distance out of range: {cos}"
    );
}

View File

@@ -0,0 +1,445 @@
//! Integration tests for deterministic kernel selection.
//!
//! Tests embedding multiple kernels with different architectures and
//! verifying selection based on architecture match, signed vs unsigned
//! precedence, and api_version ordering.
use rvf_runtime::options::{DistanceMetric, RvfOptions};
use rvf_runtime::RvfStore;
use rvf_types::kernel::{KernelHeader, KERNEL_MAGIC};
use rvf_types::kernel_binding::KernelBinding;
use rvf_types::{SegmentType, SEGMENT_HEADER_SIZE, SEGMENT_MAGIC};
use std::fs::OpenOptions;
use std::io::Read;
use tempfile::TempDir;
// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------
// Architecture discriminant asserted against KernelHeader.arch for x86_64 images.
const ARCH_X86_64: u8 = 0x00;
// Architecture discriminant asserted against KernelHeader.arch for aarch64 images.
const ARCH_AARCH64: u8 = 0x01;
// Bit in KernelHeader.kernel_flags marking the kernel image as signed.
const KERNEL_FLAG_SIGNED: u32 = 0x0000_0001;
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
fn make_options(dim: u16) -> RvfOptions {
RvfOptions {
dimension: dim,
metric: DistanceMetric::L2,
..Default::default()
}
}
/// Read the entire file at `path` into memory.
///
/// Panics on any I/O error — acceptable in tests, where a missing file is a
/// bug. `std::fs::read` sizes its buffer from file metadata, replacing the
/// manual `OpenOptions` + `read_to_end` dance.
fn read_file_bytes(path: &std::path::Path) -> Vec<u8> {
    std::fs::read(path).unwrap()
}
/// Scan the file for all KERNEL_SEG segments and return their raw payloads.
///
/// Does a byte-level sliding scan for the segment magic rather than walking a
/// directory structure, so it reports every kernel segment present in the raw
/// bytes. Returns `(segment_id, payload)` pairs.
fn extract_kernel_segments(file_bytes: &[u8]) -> Vec<(u64, Vec<u8>)> {
    let magic_bytes = SEGMENT_MAGIC.to_le_bytes();
    let mut results = Vec::new();
    if file_bytes.len() < SEGMENT_HEADER_SIZE {
        return results;
    }
    let last_possible = file_bytes.len() - SEGMENT_HEADER_SIZE;
    for i in 0..=last_possible {
        if file_bytes[i..i + 4] == magic_bytes {
            // Byte 5 of the segment header carries the segment type.
            let seg_type = file_bytes[i + 5];
            if seg_type == SegmentType::Kernel as u8 {
                // Header layout used here: segment id at offset 0x08..0x10,
                // payload length at 0x10..0x18, both little-endian u64.
                let seg_id = u64::from_le_bytes(file_bytes[i + 0x08..i + 0x10].try_into().unwrap());
                let payload_len =
                    u64::from_le_bytes(file_bytes[i + 0x10..i + 0x18].try_into().unwrap()) as usize;
                let payload_start = i + SEGMENT_HEADER_SIZE;
                let payload_end = payload_start + payload_len;
                // Skip truncated payloads and false-positive magic matches;
                // callers parse the first 128 bytes as a KernelHeader, so
                // anything shorter cannot be a kernel segment.
                if payload_end <= file_bytes.len() && payload_len >= 128 {
                    let payload = file_bytes[payload_start..payload_end].to_vec();
                    results.push((seg_id, payload));
                }
            }
        }
    }
    results
}
// ===========================================================================
// TEST 1: embed_kernel_with_arch_x86_64
// ===========================================================================
/// Embed a kernel for x86_64 and verify the architecture field is stored.
#[test]
fn embed_kernel_with_arch_x86_64() {
    let tmp = TempDir::new().unwrap();
    let file_path = tmp.path().join("kernel_x86.rvf");
    let mut store = RvfStore::create(&file_path, make_options(4)).unwrap();
    let seg_id = store
        .embed_kernel(ARCH_X86_64, 0x00, 0, b"x86_64-kernel-image-data", 8080, None)
        .unwrap();
    assert!(seg_id > 0);
    // Extract and parse the KernelHeader to confirm the arch byte round-tripped.
    let (header_bytes, _image) = store.extract_kernel().unwrap().unwrap();
    let mut raw = [0u8; 128];
    raw.copy_from_slice(&header_bytes);
    let header = KernelHeader::from_bytes(&raw).unwrap();
    assert_eq!(header.arch, ARCH_X86_64, "arch should be x86_64");
    assert_eq!(header.kernel_magic, KERNEL_MAGIC);
    store.close().unwrap();
    println!("PASS: embed_kernel_with_arch_x86_64");
}
// ===========================================================================
// TEST 2: embed_kernel_with_arch_aarch64
// ===========================================================================
/// Embed a kernel for aarch64 and verify the architecture field.
#[test]
fn embed_kernel_with_arch_aarch64() {
    let tmp = TempDir::new().unwrap();
    let file_path = tmp.path().join("kernel_arm.rvf");
    let mut store = RvfStore::create(&file_path, make_options(4)).unwrap();
    store
        .embed_kernel(ARCH_AARCH64, 0x00, 0, b"aarch64-kernel-image-data", 9090, None)
        .unwrap();
    // The arch byte must round-trip through embed + extract.
    let (header_bytes, _image) = store.extract_kernel().unwrap().unwrap();
    let mut raw = [0u8; 128];
    raw.copy_from_slice(&header_bytes);
    let header = KernelHeader::from_bytes(&raw).unwrap();
    assert_eq!(header.arch, ARCH_AARCH64, "arch should be aarch64");
    store.close().unwrap();
    println!("PASS: embed_kernel_with_arch_aarch64");
}
// ===========================================================================
// TEST 3: multi_kernel_file_contains_both
// ===========================================================================
/// Embed two kernels (x86_64 and aarch64) into the same file and verify
/// both are present in the raw file bytes.
#[test]
fn multi_kernel_file_contains_both() {
    let tmp = TempDir::new().unwrap();
    let file_path = tmp.path().join("multi_kernel.rvf");
    let mut store = RvfStore::create(&file_path, make_options(4)).unwrap();
    // One kernel per architecture in the same store.
    store
        .embed_kernel(ARCH_X86_64, 0x00, 0, b"x86-image", 8080, None)
        .unwrap();
    store
        .embed_kernel(ARCH_AARCH64, 0x00, 0, b"arm-image", 9090, None)
        .unwrap();
    store.close().unwrap();
    // Scan raw file for all KERNEL_SEGs
    let bytes = read_file_bytes(&file_path);
    let kernels = extract_kernel_segments(&bytes);
    assert_eq!(
        kernels.len(),
        2,
        "file should contain 2 KERNEL_SEGs, found {}",
        kernels.len()
    );
    // Collect the arch byte out of each embedded kernel header.
    let archs: Vec<u8> = kernels
        .iter()
        .map(|(_seg_id, payload)| {
            let mut raw = [0u8; 128];
            raw.copy_from_slice(&payload[..128]);
            KernelHeader::from_bytes(&raw).unwrap().arch
        })
        .collect();
    assert!(archs.contains(&ARCH_X86_64), "should have x86_64 kernel");
    assert!(archs.contains(&ARCH_AARCH64), "should have aarch64 kernel");
    println!("PASS: multi_kernel_file_contains_both");
}
// ===========================================================================
// TEST 4: signed_kernel_flags_preserved
// ===========================================================================
/// Embed a signed kernel and verify the SIGNED flag is preserved.
#[test]
fn signed_kernel_flags_preserved() {
    let tmp = TempDir::new().unwrap();
    let file_path = tmp.path().join("signed_kernel.rvf");
    let mut store = RvfStore::create(&file_path, make_options(4)).unwrap();
    store
        .embed_kernel(
            ARCH_X86_64,
            0x00,
            KERNEL_FLAG_SIGNED,
            b"signed-kernel-image",
            8080,
            None,
        )
        .unwrap();
    // The flag must still be set on the header we get back out.
    let (header_bytes, _image) = store.extract_kernel().unwrap().unwrap();
    let mut raw = [0u8; 128];
    raw.copy_from_slice(&header_bytes);
    let header = KernelHeader::from_bytes(&raw).unwrap();
    assert!(
        header.kernel_flags & KERNEL_FLAG_SIGNED != 0,
        "SIGNED flag should be set: got 0x{:08X}",
        header.kernel_flags
    );
    store.close().unwrap();
    println!("PASS: signed_kernel_flags_preserved");
}
// ===========================================================================
// TEST 5: kernel_binding_round_trip
// ===========================================================================
/// Embed a kernel with a KernelBinding and verify the binding survives
/// extraction.
#[test]
fn kernel_binding_round_trip() {
    let tmp = TempDir::new().unwrap();
    let file_path = tmp.path().join("kernel_binding.rvf");
    let mut store = RvfStore::create(&file_path, make_options(4)).unwrap();
    let binding = KernelBinding {
        manifest_root_hash: [0xAA; 32],
        policy_hash: [0xBB; 32],
        binding_version: 1,
        min_runtime_version: 2,
        _pad0: 0,
        allowed_segment_mask: 0x00FF_FFFF,
        _reserved: [0; 48],
    };
    store
        .embed_kernel_with_binding(
            ARCH_X86_64,
            0x00,
            KERNEL_FLAG_SIGNED,
            b"kernel-with-binding",
            8080,
            Some("console=ttyS0"),
            &binding,
        )
        .unwrap();
    // Every binding field must survive extraction unchanged.
    let extracted = store.extract_kernel_binding().unwrap();
    assert!(extracted.is_some(), "binding should be extractable");
    let eb = extracted.unwrap();
    assert_eq!(eb.binding_version, 1, "binding_version mismatch");
    assert_eq!(eb.min_runtime_version, 2, "min_runtime_version mismatch");
    assert_eq!(
        eb.manifest_root_hash, [0xAA; 32],
        "manifest_root_hash mismatch"
    );
    assert_eq!(eb.policy_hash, [0xBB; 32], "policy_hash mismatch");
    assert_eq!(
        eb.allowed_segment_mask, 0x00FF_FFFF,
        "segment_mask mismatch"
    );
    store.close().unwrap();
    println!("PASS: kernel_binding_round_trip");
}
// ===========================================================================
// TEST 6: kernel_binding_persists_through_reopen
// ===========================================================================
/// Embed a kernel with binding, close, reopen, and verify the binding
/// is still present.
#[test]
fn kernel_binding_persists_through_reopen() {
    let tmp = TempDir::new().unwrap();
    let file_path = tmp.path().join("binding_persist.rvf");
    let binding = KernelBinding {
        manifest_root_hash: [0x11; 32],
        policy_hash: [0x22; 32],
        binding_version: 3,
        min_runtime_version: 1,
        _pad0: 0,
        allowed_segment_mask: 0xDEAD_BEEF,
        _reserved: [0; 48],
    };
    // Write phase: embed the kernel + binding, then close the store.
    {
        let mut store = RvfStore::create(&file_path, make_options(4)).unwrap();
        store
            .embed_kernel_with_binding(
                ARCH_AARCH64,
                0x00,
                0,
                b"persistent-binding-kernel",
                7070,
                None,
                &binding,
            )
            .unwrap();
        store.close().unwrap();
    }
    // Read phase: a fresh read-only handle must still see every field.
    {
        let store = RvfStore::open_readonly(&file_path).unwrap();
        let eb = store.extract_kernel_binding().unwrap();
        assert!(eb.is_some(), "binding should persist through reopen");
        let eb = eb.unwrap();
        assert_eq!(eb.binding_version, 3);
        assert_eq!(eb.min_runtime_version, 1);
        assert_eq!(eb.manifest_root_hash, [0x11; 32]);
        assert_eq!(eb.policy_hash, [0x22; 32]);
        assert_eq!(eb.allowed_segment_mask, 0xDEAD_BEEF);
    }
    println!("PASS: kernel_binding_persists_through_reopen");
}
// ===========================================================================
// TEST 7: no_kernel_returns_none
// ===========================================================================
/// A store without any kernel should return None for extraction.
#[test]
fn no_kernel_returns_none() {
    let tmp = TempDir::new().unwrap();
    let store = RvfStore::create(&tmp.path().join("no_kernel.rvf"), make_options(4)).unwrap();
    // Neither a kernel image nor a binding exists in a freshly created store.
    assert!(store.extract_kernel().unwrap().is_none());
    assert!(store.extract_kernel_binding().unwrap().is_none());
    store.close().unwrap();
    println!("PASS: no_kernel_returns_none");
}
// ===========================================================================
// TEST 8: kernel_header_serialization
// ===========================================================================
/// Test KernelHeader serialization and deserialization directly.
#[test]
fn kernel_header_serialization() {
    // Populate every field with a distinctive value so a swapped, truncated,
    // or misaligned field in the codec trips one of the asserts below.
    let header = KernelHeader {
        kernel_magic: KERNEL_MAGIC,
        header_version: 1,
        arch: ARCH_AARCH64,
        kernel_type: 0xFD,
        kernel_flags: KERNEL_FLAG_SIGNED,
        min_memory_mb: 0,
        entry_point: 0x1000,
        image_size: 65536,
        compressed_size: 32768,
        compression: 1,
        api_transport: 0,
        api_port: 8443,
        api_version: 2,
        image_hash: [0xCC; 32],
        build_id: [0xDD; 16],
        build_timestamp: 1700000000,
        vcpu_count: 4,
        reserved_0: 0,
        cmdline_offset: 256,
        cmdline_length: 32,
        reserved_1: 0,
    };
    // Round-trip through the byte representation.
    let bytes = header.to_bytes();
    let decoded = KernelHeader::from_bytes(&bytes).unwrap();
    // Each non-reserved field must come back exactly as written.
    assert_eq!(decoded.kernel_magic, KERNEL_MAGIC);
    assert_eq!(decoded.header_version, 1);
    assert_eq!(decoded.arch, ARCH_AARCH64);
    assert_eq!(decoded.kernel_type, 0xFD);
    assert_eq!(decoded.kernel_flags, KERNEL_FLAG_SIGNED);
    assert_eq!(decoded.entry_point, 0x1000);
    assert_eq!(decoded.image_size, 65536);
    assert_eq!(decoded.compressed_size, 32768);
    assert_eq!(decoded.compression, 1);
    assert_eq!(decoded.api_port, 8443);
    assert_eq!(decoded.api_version, 2);
    assert_eq!(decoded.image_hash, [0xCC; 32]);
    assert_eq!(decoded.build_id, [0xDD; 16]);
    assert_eq!(decoded.build_timestamp, 1700000000);
    assert_eq!(decoded.vcpu_count, 4);
    assert_eq!(decoded.cmdline_offset, 256);
    assert_eq!(decoded.cmdline_length, 32);
    println!("PASS: kernel_header_serialization");
}
// ===========================================================================
// TEST 9: kernel_binding_serialization
// ===========================================================================
/// Test KernelBinding serialization directly.
#[test]
fn kernel_binding_serialization() {
    // Distinct fill values per field catch field-order mixups in the codec.
    let binding = KernelBinding {
        manifest_root_hash: [0x01; 32],
        policy_hash: [0x02; 32],
        binding_version: 5,
        min_runtime_version: 3,
        _pad0: 0,
        allowed_segment_mask: 0xFFFF_FFFF_FFFF_FFFF,
        _reserved: [0; 48],
    };
    let bytes = binding.to_bytes();
    // Note: KernelBinding::from_bytes is infallible (returns Self, not Result).
    let decoded = KernelBinding::from_bytes(&bytes);
    // PartialEq covers every field at once.
    assert_eq!(
        decoded, binding,
        "round-trip should produce identical binding"
    );
    println!("PASS: kernel_binding_serialization");
}

View File

@@ -0,0 +1,145 @@
//! Integration test: parent → child → grandchild derivation chain.
//!
//! Verifies file_id, parent_id, parent_hash, lineage_depth at each level,
//! and that HAS_LINEAGE flag + DERIVATION witness semantics work end-to-end.
use rvf_runtime::options::DistanceMetric;
use rvf_runtime::{RvfOptions, RvfStore};
use rvf_types::DerivationType;
use tempfile::TempDir;
#[test]
fn parent_child_grandchild_derivation() {
    let tmp = TempDir::new().unwrap();
    let opts = RvfOptions {
        dimension: 4,
        metric: DistanceMetric::L2,
        ..Default::default()
    };
    // Level 0: root file — no parent, zero depth, fresh non-zero file_id.
    let parent = RvfStore::create(&tmp.path().join("parent.rvf"), opts.clone()).unwrap();
    let parent_id = *parent.file_id();
    assert_eq!(parent.lineage_depth(), 0);
    assert_eq!(parent.parent_id(), &[0u8; 16]);
    assert!(parent.file_identity().is_root());
    assert_ne!(parent_id, [0u8; 16]);
    // Level 1: child derived via Filter.
    let child = parent
        .derive(&tmp.path().join("child.rvf"), DerivationType::Filter, None)
        .unwrap();
    let child_id = *child.file_id();
    assert_eq!(child.lineage_depth(), 1);
    assert_eq!(child.parent_id(), &parent_id);
    assert!(!child.file_identity().is_root());
    assert_ne!(child_id, parent_id);
    assert_ne!(child.file_identity().parent_hash, [0u8; 32]);
    // Level 2: grandchild derived via Transform (note the .rvdna extension).
    let grandchild = child
        .derive(
            &tmp.path().join("grandchild.rvdna"),
            DerivationType::Transform,
            None,
        )
        .unwrap();
    assert_eq!(grandchild.lineage_depth(), 2);
    assert_eq!(grandchild.parent_id(), &child_id);
    assert!(!grandchild.file_identity().is_root());
    assert_ne!(grandchild.file_identity().parent_hash, [0u8; 32]);
    // Every node in the chain carries a distinct identity.
    assert_ne!(grandchild.file_id(), child.file_id());
    assert_ne!(grandchild.file_id(), parent.file_id());
    grandchild.close().unwrap();
    child.close().unwrap();
    parent.close().unwrap();
}
#[test]
fn derived_store_inherits_dimension() {
    let tmp = TempDir::new().unwrap();
    let opts = RvfOptions {
        dimension: 128,
        metric: DistanceMetric::Cosine,
        ..Default::default()
    };
    let parent = RvfStore::create(&tmp.path().join("parent.rvf"), opts).unwrap();
    let child = parent
        .derive(&tmp.path().join("child.rvf"), DerivationType::Clone, None)
        .unwrap();
    // A 128-dim query must be accepted by the derived (still empty) store.
    let query = vec![0.0f32; 128];
    let results = child
        .query(&query, 10, &rvf_runtime::QueryOptions::default())
        .unwrap();
    assert!(results.is_empty()); // no vectors ingested yet
    child.close().unwrap();
    parent.close().unwrap();
}
#[test]
fn file_identity_persists_through_reopen() {
    let tmp = TempDir::new().unwrap();
    let child_path = tmp.path().join("child.rvf");
    let opts = RvfOptions {
        dimension: 4,
        metric: DistanceMetric::L2,
        ..Default::default()
    };
    let parent = RvfStore::create(&tmp.path().join("parent.rvf"), opts).unwrap();
    let parent_file_id = *parent.file_id();
    let child = parent
        .derive(&child_path, DerivationType::Snapshot, None)
        .unwrap();
    // Capture the child's full identity before closing everything.
    let expected_id = *child.file_id();
    let expected_depth = child.lineage_depth();
    let expected_parent = *child.parent_id();
    let expected_hash = child.file_identity().parent_hash;
    child.close().unwrap();
    parent.close().unwrap();
    // Reopen the child; its identity must match what we captured.
    let reopened = RvfStore::open(&child_path).unwrap();
    assert_eq!(*reopened.file_id(), expected_id);
    assert_eq!(reopened.lineage_depth(), expected_depth);
    assert_eq!(*reopened.parent_id(), expected_parent);
    assert_eq!(reopened.file_identity().parent_hash, expected_hash);
    assert_eq!(*reopened.parent_id(), parent_file_id);
    reopened.close().unwrap();
}
#[test]
fn root_file_identity_persists() {
    let tmp = TempDir::new().unwrap();
    let path = tmp.path().join("root.rvf");
    let opts = RvfOptions {
        dimension: 4,
        metric: DistanceMetric::L2,
        ..Default::default()
    };
    let original_id;
    {
        let store = RvfStore::create(&path, opts).unwrap();
        original_id = *store.file_id();
        assert!(store.file_identity().is_root());
        store.close().unwrap();
    }
    // A root file keeps its id and root status across close/open.
    let reopened = RvfStore::open(&path).unwrap();
    assert_eq!(*reopened.file_id(), original_id);
    assert!(reopened.file_identity().is_root());
    assert_eq!(reopened.lineage_depth(), 0);
    reopened.close().unwrap();
}

View File

@@ -0,0 +1,417 @@
//! Integration tests for provenance chain / lineage verification.
//!
//! Tests FileIdentity creation, derivation chains, lineage depth,
//! parent_id/parent_hash linkage, and multi-level derivation.
use rvf_runtime::options::{DistanceMetric, RvfOptions};
use rvf_runtime::RvfStore;
use rvf_types::lineage::{LineageRecord, WITNESS_DERIVATION};
use rvf_types::{DerivationType, FileIdentity};
use tempfile::TempDir;
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
fn make_options(dim: u16) -> RvfOptions {
RvfOptions {
dimension: dim,
metric: DistanceMetric::L2,
..Default::default()
}
}
// ===========================================================================
// TEST 1: root_file_has_zero_lineage
// ===========================================================================
/// A freshly created RVF file should be a root with lineage_depth=0 and
/// a zero parent_id.
#[test]
fn root_file_has_zero_lineage() {
    let tmp = TempDir::new().unwrap();
    let root = RvfStore::create(&tmp.path().join("root.rvf"), make_options(4)).unwrap();
    assert_eq!(root.lineage_depth(), 0, "root should have lineage_depth 0");
    assert_eq!(
        root.parent_id(),
        &[0u8; 16],
        "root parent_id should be all zeros"
    );
    assert_ne!(
        root.file_id(),
        &[0u8; 16],
        "root file_id should be non-zero"
    );
    assert!(
        root.file_identity().is_root(),
        "root identity should report is_root()"
    );
    root.close().unwrap();
    println!("PASS: root_file_has_zero_lineage");
}
// ===========================================================================
// TEST 2: derive_sets_parent_id
// ===========================================================================
/// Deriving a child from a parent should set the child's parent_id to
/// the parent's file_id.
#[test]
fn derive_sets_parent_id() {
    let tmp = TempDir::new().unwrap();
    let dim: u16 = 4;
    let mut parent = RvfStore::create(&tmp.path().join("parent.rvf"), make_options(dim)).unwrap();
    // Give the parent some content before deriving.
    let seed_vec = vec![1.0f32; dim as usize];
    parent.ingest_batch(&[seed_vec.as_slice()], &[1], None).unwrap();
    let parent_file_id = *parent.file_id();
    let child = parent
        .derive(
            &tmp.path().join("child.rvf"),
            DerivationType::Clone,
            Some(make_options(dim)),
        )
        .unwrap();
    assert_eq!(
        child.parent_id(),
        &parent_file_id,
        "child's parent_id should equal parent's file_id"
    );
    assert_ne!(
        child.file_id(),
        &parent_file_id,
        "child should have its own unique file_id"
    );
    child.close().unwrap();
    parent.close().unwrap();
    println!("PASS: derive_sets_parent_id");
}
// ===========================================================================
// TEST 3: derive_increments_lineage_depth
// ===========================================================================
/// Each derivation should increment lineage_depth by 1.
#[test]
fn derive_increments_lineage_depth() {
    let tmp = TempDir::new().unwrap();
    let dim: u16 = 4;
    let mut root = RvfStore::create(&tmp.path().join("root.rvf"), make_options(dim)).unwrap();
    let seed_vec = vec![1.0f32; dim as usize];
    root.ingest_batch(&[seed_vec.as_slice()], &[1], None).unwrap();
    assert_eq!(root.lineage_depth(), 0);
    // Depth 0 -> 1.
    let mut child1 = root
        .derive(
            &tmp.path().join("child1.rvf"),
            DerivationType::Clone,
            Some(make_options(dim)),
        )
        .unwrap();
    assert_eq!(child1.lineage_depth(), 1);
    // Need to ingest something so the child has content for hash computation
    let seed_vec2 = vec![2.0f32; dim as usize];
    child1.ingest_batch(&[seed_vec2.as_slice()], &[2], None).unwrap();
    // Depth 1 -> 2.
    let child2 = child1
        .derive(
            &tmp.path().join("child2.rvf"),
            DerivationType::Clone,
            Some(make_options(dim)),
        )
        .unwrap();
    assert_eq!(child2.lineage_depth(), 2);
    child2.close().unwrap();
    child1.close().unwrap();
    root.close().unwrap();
    println!("PASS: derive_increments_lineage_depth");
}
// ===========================================================================
// TEST 4: parent_hash_is_nonzero_for_derived
// ===========================================================================
/// A derived file should have a non-zero parent_hash (hash of parent manifest).
#[test]
fn parent_hash_is_nonzero_for_derived() {
    let tmp = TempDir::new().unwrap();
    let dim: u16 = 4;
    let mut parent =
        RvfStore::create(&tmp.path().join("parent_hash.rvf"), make_options(dim)).unwrap();
    let seed_vec = vec![1.0f32; dim as usize];
    parent.ingest_batch(&[seed_vec.as_slice()], &[1], None).unwrap();
    let child = parent
        .derive(
            &tmp.path().join("child_hash.rvf"),
            DerivationType::Clone,
            Some(make_options(dim)),
        )
        .unwrap();
    // The derivation must record a non-zero hash of the parent.
    assert_ne!(
        child.file_identity().parent_hash,
        [0u8; 32],
        "derived file's parent_hash should be non-zero"
    );
    child.close().unwrap();
    parent.close().unwrap();
    println!("PASS: parent_hash_is_nonzero_for_derived");
}
// ===========================================================================
// TEST 5: lineage_persists_through_reopen
// ===========================================================================
/// Derive a child, close both, reopen child, and verify lineage is intact.
#[test]
fn lineage_persists_through_reopen() {
    let dir = TempDir::new().unwrap();
    let parent_path = dir.path().join("parent_persist.rvf");
    let child_path = dir.path().join("child_persist.rvf");
    let dim: u16 = 4;
    // Captured while the stores are live; compared against the reopened
    // handle after everything has been closed.
    let parent_file_id;
    let child_file_id;
    let child_parent_hash;
    {
        let mut parent = RvfStore::create(&parent_path, make_options(dim)).unwrap();
        // Ingest one vector so the parent has content before deriving.
        let v = vec![1.0f32; dim as usize];
        parent.ingest_batch(&[v.as_slice()], &[1], None).unwrap();
        parent_file_id = *parent.file_id();
        let child = parent
            .derive(&child_path, DerivationType::Clone, Some(make_options(dim)))
            .unwrap();
        child_file_id = *child.file_id();
        child_parent_hash = child.file_identity().parent_hash;
        child.close().unwrap();
        parent.close().unwrap();
    }
    // Reopen child
    {
        let child = RvfStore::open_readonly(&child_path).unwrap();
        assert_eq!(
            child.file_id(),
            &child_file_id,
            "file_id should persist through reopen"
        );
        assert_eq!(
            child.parent_id(),
            &parent_file_id,
            "parent_id should persist through reopen"
        );
        assert_eq!(
            child.lineage_depth(),
            1,
            "lineage_depth should persist through reopen"
        );
        assert_eq!(
            child.file_identity().parent_hash,
            child_parent_hash,
            "parent_hash should persist through reopen"
        );
    }
    println!("PASS: lineage_persists_through_reopen");
}
// ===========================================================================
// TEST 6: file_identity_type_round_trip
// ===========================================================================
/// Test FileIdentity serialization / deserialization directly.
#[test]
fn file_identity_type_round_trip() {
    let original = FileIdentity {
        file_id: [0x11; 16],
        parent_id: [0x22; 16],
        parent_hash: [0x33; 32],
        lineage_depth: 42,
    };
    let encoded = original.to_bytes();
    // Serialized form is exactly 68 bytes.
    assert_eq!(encoded.len(), 68);
    let decoded = FileIdentity::from_bytes(&encoded);
    // Whole-value equality plus per-field spot checks.
    assert_eq!(decoded, original);
    assert_eq!(decoded.file_id, [0x11; 16]);
    assert_eq!(decoded.parent_id, [0x22; 16]);
    assert_eq!(decoded.parent_hash, [0x33; 32]);
    assert_eq!(decoded.lineage_depth, 42);
    assert!(!decoded.is_root());
    println!("PASS: file_identity_type_round_trip");
}
// ===========================================================================
// TEST 7: lineage_record_round_trip
// ===========================================================================
/// Test LineageRecord creation and field access.
#[test]
fn lineage_record_round_trip() {
    let rec = LineageRecord::new(
        [0xAA; 16],
        [0xBB; 16],
        [0xCC; 32],
        DerivationType::Filter,
        100,
        1_700_000_000_000_000_000,
        "filtered by embedding cluster",
    );
    // Constructor arguments must land in the matching fields, in order.
    assert_eq!(rec.file_id, [0xAA; 16]);
    assert_eq!(rec.parent_id, [0xBB; 16]);
    assert_eq!(rec.parent_hash, [0xCC; 32]);
    assert_eq!(rec.derivation_type, DerivationType::Filter);
    assert_eq!(rec.mutation_count, 100);
    assert_eq!(rec.timestamp_ns, 1_700_000_000_000_000_000);
    assert_eq!(rec.description_str(), "filtered by embedding cluster");
    println!("PASS: lineage_record_round_trip");
}
// ===========================================================================
// TEST 8: witness_derivation_constant
// ===========================================================================
/// Verify the witness type constant for derivation events.
#[test]
fn witness_derivation_constant() {
    // Pin the value so it cannot drift silently between releases.
    assert_eq!(WITNESS_DERIVATION, 0x09);
    println!("PASS: witness_derivation_constant");
}
// ===========================================================================
// TEST 9: derivation_type_enum_coverage
// ===========================================================================
/// Verify all DerivationType variants serialize correctly.
#[test]
fn derivation_type_enum_coverage() {
    let cases: &[(u8, DerivationType)] = &[
        (0, DerivationType::Clone),
        (1, DerivationType::Filter),
        (2, DerivationType::Merge),
        (3, DerivationType::Quantize),
        (4, DerivationType::Reindex),
        (5, DerivationType::Transform),
        (6, DerivationType::Snapshot),
        (0xFF, DerivationType::UserDefined),
    ];
    for &(raw, expected) in cases {
        // Decoding must succeed, and encoding must be its exact inverse.
        assert_eq!(
            DerivationType::try_from(raw),
            Ok(expected),
            "DerivationType::try_from({raw}) should be {expected:?}"
        );
        assert_eq!(expected as u8, raw);
    }
    // Invalid values should error
    assert!(DerivationType::try_from(7).is_err());
    assert!(DerivationType::try_from(0xFE).is_err());
    println!("PASS: derivation_type_enum_coverage");
}
// ===========================================================================
// TEST 10: three_level_lineage_chain
// ===========================================================================
/// Build a three-level lineage chain: root -> child -> grandchild,
/// and verify the entire chain is correct.
#[test]
fn three_level_lineage_chain() {
    let dir = TempDir::new().unwrap();
    let root_path = dir.path().join("root_chain.rvf");
    let child_path = dir.path().join("child_chain.rvf");
    let grandchild_path = dir.path().join("grandchild_chain.rvf");
    let dim: u16 = 4;
    // Root
    let mut root = RvfStore::create(&root_path, make_options(dim)).unwrap();
    // Ingest one vector so the root has content before deriving.
    let v = vec![1.0f32; dim as usize];
    root.ingest_batch(&[v.as_slice()], &[1], None).unwrap();
    let root_id = *root.file_id();
    // Child
    let mut child = root
        .derive(&child_path, DerivationType::Clone, Some(make_options(dim)))
        .unwrap();
    let child_id = *child.file_id();
    let v2 = vec![2.0f32; dim as usize];
    child.ingest_batch(&[v2.as_slice()], &[2], None).unwrap();
    // Grandchild
    let grandchild = child
        .derive(
            &grandchild_path,
            DerivationType::Filter,
            Some(make_options(dim)),
        )
        .unwrap();
    let grandchild_id = *grandchild.file_id();
    // Verify chain
    // Depth grows by one per derivation level.
    assert_eq!(root.lineage_depth(), 0);
    assert_eq!(child.lineage_depth(), 1);
    assert_eq!(grandchild.lineage_depth(), 2);
    // Each parent_id points at the immediate ancestor; the root has none.
    assert_eq!(root.parent_id(), &[0u8; 16]);
    assert_eq!(child.parent_id(), &root_id);
    assert_eq!(grandchild.parent_id(), &child_id);
    // All file_ids should be unique
    assert_ne!(root_id, child_id);
    assert_ne!(child_id, grandchild_id);
    assert_ne!(root_id, grandchild_id);
    // Parent hashes should be non-zero for derived files
    assert_ne!(child.file_identity().parent_hash, [0u8; 32]);
    assert_ne!(grandchild.file_identity().parent_hash, [0u8; 32]);
    grandchild.close().unwrap();
    child.close().unwrap();
    root.close().unwrap();
    println!("PASS: three_level_lineage_chain");
}
// ===========================================================================
// TEST 11: lineage_record_long_description_truncation
// ===========================================================================
/// Verify that LineageRecord truncates descriptions longer than 47 bytes.
#[test]
fn lineage_record_long_description_truncation() {
    // A 100-byte description must be clamped to the 47-byte field limit.
    let oversized = "a".repeat(100);
    let record = LineageRecord::new(
        [0u8; 16],
        [0u8; 16],
        [0u8; 32],
        DerivationType::Clone,
        0,
        0,
        &oversized,
    );
    assert_eq!(record.description_len, 47, "should be truncated to 47");
    let expected: String = std::iter::repeat('a').take(47).collect();
    assert_eq!(record.description_str(), expected);
    println!("PASS: lineage_record_long_description_truncation");
}

View File

@@ -0,0 +1,166 @@
//! Manifest and boot integration tests.
//!
//! Tests the rvf-wire tail_scan + rvf-manifest progressive boot pipeline:
//! - Write segments, append manifest, find manifest from tail
//! - Level 0 / Level 1 manifest round-trips
//! - Overlay chain progression
use rvf_types::{SegmentFlags, SegmentType, SEGMENT_ALIGNMENT, SEGMENT_HEADER_SIZE};
use rvf_wire::{find_latest_manifest, write_segment};
#[test]
fn tail_scan_finds_manifest_after_data_segments() {
    // Lay down five VEC_SEGs followed by one manifest segment, then confirm
    // the tail scan reports that manifest at the offset where it was written.
    let mut buf: Vec<u8> = Vec::new();
    for id in 0u64..5 {
        let body = vec![id as u8; 100];
        buf.extend_from_slice(&write_segment(
            SegmentType::Vec as u8,
            &body,
            SegmentFlags::empty(),
            id,
        ));
    }
    let expected_offset = buf.len();
    let manifest_body = vec![0u8; 128];
    buf.extend_from_slice(&write_segment(
        SegmentType::Manifest as u8,
        &manifest_body,
        SegmentFlags::empty(),
        100,
    ));
    let (found_offset, hdr) = find_latest_manifest(&buf).unwrap();
    assert_eq!(found_offset, expected_offset);
    assert_eq!(hdr.seg_type, SegmentType::Manifest as u8);
    assert_eq!(hdr.segment_id, 100);
}
#[test]
fn tail_scan_finds_latest_manifest_when_multiple_exist() {
    // Two manifests separated by data segments: the tail scan must report
    // the one closest to the end of the file, not the first one.
    let mut buf = Vec::new();
    buf.extend_from_slice(&write_segment(
        SegmentType::Manifest as u8,
        &[1u8; 64],
        SegmentFlags::empty(),
        1,
    ));
    for id in 10u64..15 {
        buf.extend_from_slice(&write_segment(
            SegmentType::Vec as u8,
            &[id as u8; 200],
            SegmentFlags::empty(),
            id,
        ));
    }
    let second_manifest_at = buf.len();
    buf.extend_from_slice(&write_segment(
        SegmentType::Manifest as u8,
        &[2u8; 64],
        SegmentFlags::empty(),
        2,
    ));
    let (offset, header) = find_latest_manifest(&buf).unwrap();
    assert_eq!(offset, second_manifest_at);
    assert_eq!(header.segment_id, 2);
}
#[test]
fn tail_scan_fails_when_no_manifest() {
    // A file consisting solely of VEC_SEGs has no manifest to find, so the
    // tail scan must return an error rather than a bogus offset.
    let buf: Vec<u8> = (0u64..3)
        .flat_map(|id| write_segment(SegmentType::Vec as u8, &[0u8; 50], SegmentFlags::empty(), id))
        .collect();
    assert!(find_latest_manifest(&buf).is_err());
}
#[test]
fn tail_scan_handles_mixed_segment_types() {
    // A realistic mix of segment kinds must not confuse the tail scan: only
    // the trailing manifest should be reported.
    let kinds = [
        SegmentType::Vec,
        SegmentType::Index,
        SegmentType::Meta,
        SegmentType::Journal,
        SegmentType::Hot,
    ];
    let mut buf = Vec::new();
    for (i, kind) in kinds.iter().enumerate() {
        buf.extend_from_slice(&write_segment(
            *kind as u8,
            &[i as u8; 80],
            SegmentFlags::empty(),
            i as u64,
        ));
    }
    let manifest_at = buf.len();
    buf.extend_from_slice(&write_segment(
        SegmentType::Manifest as u8,
        &[0xFFu8; 96],
        SegmentFlags::empty(),
        99,
    ));
    let (offset, header) = find_latest_manifest(&buf).unwrap();
    assert_eq!(offset, manifest_at);
    assert_eq!(header.segment_id, 99);
}
/// Every encoded segment must be padded to the 64-byte alignment boundary,
/// and therefore every segment in a concatenated file must start at an
/// aligned offset.
///
/// Fix: the previous version re-derived segment sizes in a second loop from
/// the `SEGMENT_HEADER_SIZE`/alignment formula, duplicating `write_segment`'s
/// padding logic; if the encoder ever changed its layout the test would
/// silently check the wrong offsets. Here the actual encoded lengths are used,
/// so the boundary check cannot drift from the encoder.
#[test]
fn all_segments_are_64_byte_aligned() {
    let types = [
        SegmentType::Vec,
        SegmentType::Index,
        SegmentType::Quant,
        SegmentType::Journal,
        SegmentType::Manifest,
        SegmentType::Meta,
        SegmentType::Hot,
    ];
    let mut file = Vec::new();
    let mut offset = 0usize;
    for (i, seg_type) in types.iter().enumerate() {
        // Deliberately non-aligned payload sizes.
        let payload_size = 10 + i * 17;
        let payload = vec![0u8; payload_size];
        let seg = write_segment(*seg_type as u8, &payload, SegmentFlags::empty(), i as u64);
        // Each encoded segment is padded to a multiple of the alignment.
        assert_eq!(
            seg.len() % SEGMENT_ALIGNMENT,
            0,
            "segment type {seg_type:?} (payload={payload_size}) not 64-byte aligned"
        );
        // Since every previous segment length was a multiple of the
        // alignment, this segment's start offset is aligned too.
        assert_eq!(
            offset % SEGMENT_ALIGNMENT,
            0,
            "segment {i} ({seg_type:?}) starts at non-aligned offset {offset}"
        );
        // Sanity: the encoded form covers at least header + payload.
        assert!(seg.len() >= SEGMENT_HEADER_SIZE + payload_size);
        offset += seg.len();
        file.extend_from_slice(&seg);
    }
    assert_eq!(offset, file.len());
}

View File

@@ -0,0 +1,134 @@
//! Profile compatibility tests.
//!
//! Verifies that a generic RVF reader can open files written with different
//! profiles, and that unknown segment types are gracefully skipped.
use rvf_types::{SegmentFlags, SegmentType};
use rvf_wire::{read_segment, validate_segment, write_segment};
#[test]
fn generic_reader_handles_unknown_segment_type() {
    // A hypothetical future segment type (0xFE) must still round-trip
    // through the generic reader, header hash validation included.
    const FUTURE_TYPE: u8 = 0xFE;
    let payload: &[u8] = b"future segment data";
    let encoded = write_segment(FUTURE_TYPE, payload, SegmentFlags::empty(), 1);
    let (header, decoded) = read_segment(&encoded).unwrap();
    assert_eq!(header.seg_type, FUTURE_TYPE);
    assert_eq!(decoded, payload);
    assert!(validate_segment(&header, decoded).is_ok());
}
#[test]
fn multi_profile_file_readable() {
    // Segments written on behalf of different "profiles" (text / DNA / graph)
    // are all plain VEC_SEGs on the wire; a generic reader must read them all
    // without any profile knowledge.
    let payloads: [&[u8]; 3] = [
        b"text embedding vectors",
        b"genomic sequence vectors",
        b"graph node embedding vectors",
    ];
    let mut file = Vec::new();
    let mut offsets = Vec::new();
    for (i, payload) in payloads.iter().enumerate() {
        offsets.push(file.len());
        file.extend_from_slice(&write_segment(
            SegmentType::Vec as u8,
            payload,
            SegmentFlags::empty(),
            (i + 1) as u64,
        ));
    }
    // Generic reader can read every segment at its recorded offset.
    for (i, &offset) in offsets.iter().enumerate() {
        let (header, payload) = read_segment(&file[offset..]).unwrap();
        assert_eq!(header.segment_id, (i + 1) as u64);
        assert_eq!(payload, payloads[i]);
        assert!(validate_segment(&header, payload).is_ok());
    }
}
#[test]
fn version_forward_compatibility_unknown_tags_skipped() {
    // Known segments surrounding an unknown future tag (0xFD) must remain
    // individually readable at their recorded offsets.
    let mut file = Vec::new();
    let vec_offset = file.len();
    let vec_seg = write_segment(SegmentType::Vec as u8, b"vector data", SegmentFlags::empty(), 1);
    file.extend_from_slice(&vec_seg);
    // Unknown future segment type in the middle.
    let unknown_seg = write_segment(0xFD, b"future extension data", SegmentFlags::empty(), 2);
    file.extend_from_slice(&unknown_seg);
    let index_offset = file.len();
    let index_seg = write_segment(SegmentType::Index as u8, b"index data", SegmentFlags::empty(), 3);
    file.extend_from_slice(&index_seg);
    // Both known segments still parse correctly.
    let (hdr_vec, payload_vec) = read_segment(&file[vec_offset..]).unwrap();
    assert_eq!(hdr_vec.seg_type, SegmentType::Vec as u8);
    assert_eq!(payload_vec, b"vector data");
    let (hdr_idx, payload_idx) = read_segment(&file[index_offset..]).unwrap();
    assert_eq!(hdr_idx.seg_type, SegmentType::Index as u8);
    assert_eq!(payload_idx, b"index data");
}
#[test]
fn sealed_segment_flag_preserved() {
    // The SEALED bit set at write time must come back out of the header.
    let encoded = write_segment(
        SegmentType::Vec as u8,
        b"sealed data",
        SegmentFlags::empty().with(SegmentFlags::SEALED),
        1,
    );
    let (header, _) = read_segment(&encoded).unwrap();
    assert_ne!(
        header.flags & SegmentFlags::SEALED,
        0,
        "SEALED flag should be preserved"
    );
}
#[test]
fn compressed_flag_preserved() {
    // The COMPRESSED bit set at write time must come back out of the header.
    let encoded = write_segment(
        SegmentType::Quant as u8,
        b"compressed quant",
        SegmentFlags::empty().with(SegmentFlags::COMPRESSED),
        5,
    );
    let (header, _) = read_segment(&encoded).unwrap();
    assert_ne!(
        header.flags & SegmentFlags::COMPRESSED,
        0,
        "COMPRESSED flag should be preserved"
    );
}

View File

@@ -0,0 +1,195 @@
//! Quantization accuracy tests.
//!
//! Tests rvf-quant scalar and binary quantization to verify
//! compression ratios and error bounds.
use rvf_quant::binary::{decode_binary, encode_binary, hamming_distance};
use rvf_quant::scalar::ScalarQuantizer;
use rvf_quant::traits::Quantizer;
/// Deterministically generate `n` pseudo-random vectors of length `dim`,
/// each normalized to unit L2 norm. A 64-bit LCG seeded with `seed` drives
/// the raw components; a zero-norm vector (impossible in practice) would be
/// returned unnormalized.
fn random_unit_vectors(n: usize, dim: usize, seed: u64) -> Vec<Vec<f32>> {
    let mut state = seed;
    let mut out = Vec::with_capacity(n);
    for _ in 0..n {
        let mut v = Vec::with_capacity(dim);
        for _ in 0..dim {
            state = state.wrapping_mul(6364136223846793005).wrapping_add(1);
            // Top 31 bits scaled to [0, 1), shifted to [-0.5, 0.5).
            v.push((state >> 33) as f32 / (1u64 << 31) as f32 - 0.5);
        }
        let norm = v.iter().map(|x| x * x).sum::<f32>().sqrt();
        if norm > 0.0 {
            for x in v.iter_mut() {
                *x /= norm;
            }
        }
        out.push(v);
    }
    out
}
#[test]
fn scalar_quantize_round_trip() {
    // Encode/decode every trained vector and bound the reconstruction MSE.
    let vectors = random_unit_vectors(100, 64, 42);
    let refs: Vec<&[f32]> = vectors.iter().map(Vec::as_slice).collect();
    let quantizer = ScalarQuantizer::train(&refs);
    for original in &vectors {
        let decoded = quantizer.decode(&quantizer.encode(original));
        assert_eq!(decoded.len(), original.len());
        let sq_err: f32 = original
            .iter()
            .zip(&decoded)
            .map(|(a, b)| (a - b) * (a - b))
            .sum();
        let mse = sq_err / original.len() as f32;
        assert!(mse < 0.01, "scalar quantization MSE too high: {mse:.6}");
    }
}
#[test]
fn scalar_quantizer_compresses_4x() {
    // int8 scalar quantization of 128-dim f32 vectors should be ~4x smaller;
    // allow some slack for per-vector parameters by requiring >= 3x.
    let vectors = random_unit_vectors(10, 128, 42);
    let refs: Vec<&[f32]> = vectors.iter().map(Vec::as_slice).collect();
    let quantizer = ScalarQuantizer::train(&refs);
    let raw_bytes = 128 * std::mem::size_of::<f32>();
    let encoded_bytes = quantizer.encode(&vectors[0]).len();
    let ratio = raw_bytes as f64 / encoded_bytes as f64;
    assert!(
        ratio >= 3.0,
        "compression ratio {ratio:.1}x, expected >= 3.0x"
    );
}
#[test]
fn binary_quantize_round_trip() {
    // Binary codes must decode to exactly +/-1 and should preserve the sign
    // of at least half of the components.
    for original in &random_unit_vectors(50, 128, 42) {
        let decoded = decode_binary(&encode_binary(original), original.len());
        assert_eq!(decoded.len(), original.len());
        for &d in &decoded {
            assert!(
                d == 1.0 || d == -1.0,
                "binary decode should be +/-1, got {d}"
            );
        }
        // A non-negative component should decode positive, a negative one
        // negative; decoded values are always +/-1, so compare the signs.
        let sign_matches = original
            .iter()
            .zip(&decoded)
            .filter(|(&a, &b)| (a >= 0.0) == (b > 0.0))
            .count();
        let match_rate = sign_matches as f64 / original.len() as f64;
        assert!(
            match_rate >= 0.5,
            "binary quantization sign match rate {match_rate:.2}, expected >= 0.5"
        );
    }
}
#[test]
fn binary_compression_ratio_32x() {
    // One bit per component versus 32 bits of f32: expect roughly 32x,
    // require at least 25x to allow for any fixed per-vector overhead.
    let dim = 256;
    let raw_bytes = dim * 4; // f32 components are 4 bytes each
    let packed = encode_binary(&vec![0.5f32; dim]);
    let ratio = raw_bytes as f64 / packed.len() as f64;
    assert!(
        ratio >= 25.0,
        "binary compression ratio {ratio:.1}x, expected >= 25.0x"
    );
}
/// Metric-style properties of the packed-binary hamming distance:
/// d(x,x) == 0, opposite vectors are at maximum distance (== dim), and the
/// triangle inequality holds.
///
/// Fix: the original computed `hamming_distance(&enc_a, &enc_b)` and
/// `hamming_distance(&enc_a, &enc_c)` twice each (once for the direct
/// asserts, again for the triangle check); each distance is now computed
/// exactly once and reused.
#[test]
fn hamming_distance_properties() {
    let a = vec![1.0f32; 64];
    let b = vec![-1.0f32; 64];
    let c = vec![1.0f32; 64];
    let enc_a = encode_binary(&a);
    let enc_b = encode_binary(&b);
    let enc_c = encode_binary(&c);
    // Distance to self is 0.
    assert_eq!(hamming_distance(&enc_a, &enc_a), 0);
    // Opposite vectors differ in every bit.
    let d_ab = hamming_distance(&enc_a, &enc_b);
    assert_eq!(
        d_ab, 64,
        "opposite vectors should have hamming distance = dim"
    );
    // Identical vectors have distance 0.
    let d_ac = hamming_distance(&enc_a, &enc_c);
    assert_eq!(d_ac, 0);
    // Triangle inequality: d(a,c) <= d(a,b) + d(b,c).
    let d_bc = hamming_distance(&enc_b, &enc_c);
    assert!(d_ac <= d_ab + d_bc, "triangle inequality violated");
}
/// Quantized L2 distances should largely preserve nearest-neighbor ranking:
/// the top-5 neighbor sets computed in float space and in quantized space
/// must share at least 3 of 5 ids.
#[test]
fn scalar_quantizer_preserves_nearest_neighbor_ordering() {
    let vectors = random_unit_vectors(100, 32, 42);
    let refs: Vec<&[f32]> = vectors.iter().map(|v| v.as_slice()).collect();
    let quantizer = ScalarQuantizer::train(&refs);
    // Vector 0 doubles as the query; skip(1) below excludes it from the
    // candidate set so it cannot trivially dominate the rankings.
    let query = &vectors[0];
    let encoded_query = quantizer.encode_vec(query);
    // Compute distances in original and quantized space.
    let mut original_dists: Vec<(usize, f32)> = vectors
        .iter()
        .enumerate()
        .skip(1)
        .map(|(i, v)| {
            // Plain squared-L2 in float space.
            let d: f32 = query
                .iter()
                .zip(v.iter())
                .map(|(a, b)| (a - b) * (a - b))
                .sum();
            (i, d)
        })
        .collect();
    original_dists.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
    let mut quant_dists: Vec<(usize, f32)> = vectors
        .iter()
        .enumerate()
        .skip(1)
        .map(|(i, v)| {
            let encoded = quantizer.encode_vec(v);
            let d = quantizer.distance_l2_quantized(&encoded_query, &encoded);
            (i, d)
        })
        .collect();
    quant_dists.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
    // The top-5 nearest neighbors should overlap significantly.
    let top_k = 5;
    let original_top: std::collections::HashSet<usize> =
        original_dists.iter().take(top_k).map(|(i, _)| *i).collect();
    let quant_top: std::collections::HashSet<usize> =
        quant_dists.iter().take(top_k).map(|(i, _)| *i).collect();
    let overlap = original_top.intersection(&quant_top).count();
    assert!(
        overlap >= 3,
        "top-{top_k} overlap = {overlap}, expected >= 3"
    );
}

View File

@@ -0,0 +1,326 @@
//! Runtime store lifecycle integration tests.
//!
//! Exercises the full create -> ingest -> query -> delete -> compact -> reopen
//! lifecycle through the rvf-runtime RvfStore API.
use rvf_runtime::filter::{FilterExpr, FilterValue};
use rvf_runtime::options::{DistanceMetric, QueryOptions, RvfOptions};
use rvf_runtime::RvfStore;
use tempfile::TempDir;
fn make_options(dim: u16) -> RvfOptions {
RvfOptions {
dimension: dim,
metric: DistanceMetric::L2,
..Default::default()
}
}
/// Return the `dim`-dimensional standard basis vector along `axis`;
/// an out-of-range `axis` yields the all-zero vector.
fn unit_vector(dim: usize, axis: usize) -> Vec<f32> {
    (0..dim)
        .map(|i| if i == axis { 1.0 } else { 0.0 })
        .collect()
}
/// Full lifecycle: create a store, ingest 100 vectors, close, reopen and
/// query (results must be sorted by ascending distance), then reopen
/// read-only and check the reported status.
#[test]
fn full_lifecycle_create_ingest_query_close_reopen() {
    let dir = TempDir::new().unwrap();
    let path = dir.path().join("lifecycle.rvf");
    let dim = 8;
    let options = make_options(dim);
    // Phase 1: create, ingest, close.
    {
        let mut store = RvfStore::create(&path, options.clone()).unwrap();
        // Vector i has 1.0 at position i % dim and 0.5 at the next position,
        // spreading the ids across distinct directions.
        let vectors: Vec<Vec<f32>> = (0..100)
            .map(|i| {
                let mut v = vec![0.0f32; dim as usize];
                v[i % dim as usize] = 1.0;
                v[(i + 1) % dim as usize] = 0.5;
                v
            })
            .collect();
        let refs: Vec<&[f32]> = vectors.iter().map(|v| v.as_slice()).collect();
        let ids: Vec<u64> = (1..=100).collect();
        let result = store.ingest_batch(&refs, &ids, None).unwrap();
        assert_eq!(result.accepted, 100);
        assert_eq!(result.rejected, 0);
        store.close().unwrap();
    }
    // Phase 2: reopen, query, verify results.
    {
        let store = RvfStore::open(&path).unwrap();
        let query = unit_vector(dim as usize, 0);
        let results = store.query(&query, 5, &QueryOptions::default()).unwrap();
        assert_eq!(results.len(), 5);
        // Results should be sorted by distance (ascending).
        for i in 1..results.len() {
            assert!(
                results[i - 1].distance <= results[i].distance,
                "results not sorted: {} > {}",
                results[i - 1].distance,
                results[i].distance
            );
        }
        store.close().unwrap();
    }
    // Phase 3: reopen read-only; status must report all 100 vectors and the
    // read_only flag.
    {
        let store = RvfStore::open_readonly(&path).unwrap();
        let status = store.status();
        assert_eq!(status.total_vectors, 100);
        assert!(status.read_only);
    }
}
/// Deleted ids must stay deleted across a close/reopen cycle and never
/// appear in query results afterwards.
#[test]
fn delete_and_reopen_excludes_deleted_vectors() {
    let dir = TempDir::new().unwrap();
    let path = dir.path().join("delete.rvf");
    let dim = 4;
    let options = make_options(dim);
    // Create with 10 vectors (id j stores the constant vector [j-1; dim]),
    // delete three of them, then close.
    {
        let mut store = RvfStore::create(&path, options.clone()).unwrap();
        let vectors: Vec<Vec<f32>> = (0..10).map(|i| vec![i as f32; dim as usize]).collect();
        let refs: Vec<&[f32]> = vectors.iter().map(|v| v.as_slice()).collect();
        let ids: Vec<u64> = (1..=10).collect();
        store.ingest_batch(&refs, &ids, None).unwrap();
        // Delete vectors 3, 5, 7.
        let del_result = store.delete(&[3, 5, 7]).unwrap();
        assert_eq!(del_result.deleted, 3);
        store.close().unwrap();
    }
    // Reopen and verify deleted vectors are gone.
    {
        let store = RvfStore::open(&path).unwrap();
        let status = store.status();
        assert_eq!(status.total_vectors, 7); // 10 - 3
        // Query with [3.0; dim] (exact value of id 4's vector, since id j
        // stores [j-1; dim]) and k covering the whole store; none of the
        // deleted ids may appear.
        let query = vec![3.0f32; dim as usize];
        let results = store.query(&query, 10, &QueryOptions::default()).unwrap();
        for r in &results {
            assert_ne!(r.id, 3, "deleted vector 3 should not appear in results");
            assert_ne!(r.id, 5, "deleted vector 5 should not appear in results");
            assert_ne!(r.id, 7, "deleted vector 7 should not appear in results");
        }
        store.close().unwrap();
    }
}
/// After deleting half the vectors, compaction must report work done and the
/// surviving vectors must remain queryable while deleted ids stay gone.
///
/// NOTE(review): despite the name, this test never compares on-disk file
/// sizes; it only checks the CompactResult counters and post-compact query
/// behavior — confirm whether a size assertion was intended.
#[test]
fn compact_reduces_file_size_after_deletion() {
    let dir = TempDir::new().unwrap();
    let path = dir.path().join("compact.rvf");
    let dim = 4;
    let options = make_options(dim);
    let mut store = RvfStore::create(&path, options).unwrap();
    // Ingest 50 vectors (id j stores the constant vector [j-1; dim]).
    let vectors: Vec<Vec<f32>> = (0..50).map(|i| vec![i as f32; dim as usize]).collect();
    let refs: Vec<&[f32]> = vectors.iter().map(|v| v.as_slice()).collect();
    let ids: Vec<u64> = (1..=50).collect();
    store.ingest_batch(&refs, &ids, None).unwrap();
    // Delete the first half (ids 1..=25).
    let delete_ids: Vec<u64> = (1..=25).collect();
    store.delete(&delete_ids).unwrap();
    // Compact: at least one segment rewritten or some bytes reclaimed.
    let compact_result = store.compact().unwrap();
    assert!(compact_result.segments_compacted > 0 || compact_result.bytes_reclaimed > 0);
    // Verify remaining vectors are queryable and none of the deleted ids
    // resurface after compaction.
    let query = vec![30.0f32; dim as usize];
    let results = store.query(&query, 5, &QueryOptions::default()).unwrap();
    assert!(!results.is_empty());
    for r in &results {
        assert!(
            r.id > 25,
            "compacted store should only contain ids > 25, got {}",
            r.id
        );
    }
    store.close().unwrap();
}
/// Metadata-filtered query: with field 0 holding `id % 3`, a filter for
/// category 1 must return only ids with residue 1 and at least one result.
#[test]
fn filter_query_integration() {
    let dir = TempDir::new().unwrap();
    let path = dir.path().join("filter.rvf");
    let dim = 4;
    let options = make_options(dim);
    let mut store = RvfStore::create(&path, options).unwrap();
    let vectors: Vec<Vec<f32>> = (0..20).map(|i| vec![i as f32; dim as usize]).collect();
    let refs: Vec<&[f32]> = vectors.iter().map(|v| v.as_slice()).collect();
    let ids: Vec<u64> = (1..=20).collect();
    // Ingest with metadata: field 0 is the id's residue mod 3, so the 20
    // vectors split into three categories.
    use rvf_runtime::options::{MetadataEntry, MetadataValue};
    let metadata: Vec<MetadataEntry> = ids
        .iter()
        .map(|&id| MetadataEntry {
            field_id: 0,
            value: MetadataValue::U64(id % 3), // category: 0, 1, 2
        })
        .collect();
    store.ingest_batch(&refs, &ids, Some(&metadata)).unwrap();
    // Query with filter: category == 1 (ids 1, 4, 7, 10, 13, 16, 19).
    let filter = FilterExpr::Eq(0, FilterValue::U64(1));
    let qopts = QueryOptions {
        filter: Some(filter),
        ..Default::default()
    };
    // k = 20 covers the whole store, so the filter alone limits the results.
    let query = vec![0.0f32; dim as usize];
    let results = store.query(&query, 20, &qopts).unwrap();
    // All results should have category == 1 (id % 3 == 1).
    for r in &results {
        assert_eq!(
            r.id % 3,
            1,
            "filter should only return category 1, got id={}",
            r.id
        );
    }
    assert!(!results.is_empty());
    store.close().unwrap();
}
/// A read-only handle serves queries and reports `read_only` in its status.
///
/// NOTE(review): write rejection is enforced here by the type system —
/// `open_readonly` yields an immutable store so `ingest_batch` cannot be
/// called — rather than by a runtime assertion.
#[test]
fn readonly_prevents_writes() {
    let dir = TempDir::new().unwrap();
    let path = dir.path().join("readonly.rvf");
    let dim = 4;
    let options = make_options(dim);
    // Create a store with one vector, then close it.
    {
        let mut store = RvfStore::create(&path, options).unwrap();
        let v = vec![1.0f32; dim as usize];
        store.ingest_batch(&[v.as_slice()], &[1], None).unwrap();
        store.close().unwrap();
    }
    // Open readonly.
    let store = RvfStore::open_readonly(&path).unwrap();
    // Queries should work.
    let query = vec![1.0f32; dim as usize];
    let results = store.query(&query, 1, &QueryOptions::default()).unwrap();
    assert_eq!(results.len(), 1);
    // Writes should fail.
    // (open_readonly returns an immutable store, so we can't call ingest_batch)
    assert!(store.status().read_only);
}
#[test]
fn concurrent_writer_lock() {
    // While one writer holds the store, a second open must be refused;
    // closing releases the lock so a later open succeeds.
    let dir = TempDir::new().unwrap();
    let path = dir.path().join("lock.rvf");
    let dim: u16 = 4;
    let mut first_writer = RvfStore::create(&path, make_options(dim)).unwrap();
    let vector = vec![1.0f32; dim as usize];
    first_writer
        .ingest_batch(&[vector.as_slice()], &[1], None)
        .unwrap();
    assert!(
        RvfStore::open(&path).is_err(),
        "second writer should fail to acquire lock"
    );
    first_writer.close().unwrap();
    // After close, opening should work.
    let reopened = RvfStore::open(&path);
    assert!(
        reopened.is_ok(),
        "should be able to open after first writer closed"
    );
    reopened.unwrap().close().unwrap();
}
#[test]
fn multiple_ingest_batches() {
    // Three successive 100-vector batches must accumulate to 300 vectors,
    // and the count must survive a close/reopen cycle.
    let dir = TempDir::new().unwrap();
    let path = dir.path().join("multi_ingest.rvf");
    let dim: u16 = 4;
    let mut store = RvfStore::create(&path, make_options(dim)).unwrap();
    for batch in 0u64..3 {
        let base_id = batch * 100 + 1;
        let ids: Vec<u64> = (base_id..base_id + 100).collect();
        // Vector for id stores the constant value `id` in every slot.
        let vectors: Vec<Vec<f32>> = ids
            .iter()
            .map(|&id| vec![id as f32; dim as usize])
            .collect();
        let refs: Vec<&[f32]> = vectors.iter().map(|v| v.as_slice()).collect();
        store.ingest_batch(&refs, &ids, None).unwrap();
    }
    // Should have 300 vectors.
    assert_eq!(store.status().total_vectors, 300);
    // Close and reopen to verify persistence.
    store.close().unwrap();
    let reopened = RvfStore::open_readonly(&path).unwrap();
    assert_eq!(reopened.status().total_vectors, 300);
}
#[test]
fn delete_by_filter() {
    // Tag ids 1..=5 with category 0 and 6..=10 with category 1; deleting by
    // the category-0 filter must remove exactly five vectors.
    use rvf_runtime::options::{MetadataEntry, MetadataValue};
    let dir = TempDir::new().unwrap();
    let path = dir.path().join("del_filter.rvf");
    let dim: u16 = 4;
    let mut store = RvfStore::create(&path, make_options(dim)).unwrap();
    let ids: Vec<u64> = (1..=10).collect();
    let vectors: Vec<Vec<f32>> = (0..10).map(|i| vec![i as f32; dim as usize]).collect();
    let refs: Vec<&[f32]> = vectors.iter().map(|v| v.as_slice()).collect();
    let metadata: Vec<MetadataEntry> = ids
        .iter()
        .map(|&id| {
            let category = if id <= 5 { 0 } else { 1 };
            MetadataEntry {
                field_id: 0,
                value: MetadataValue::U64(category),
            }
        })
        .collect();
    store.ingest_batch(&refs, &ids, Some(&metadata)).unwrap();
    // Delete all with field_0 == 0 (ids 1..=5).
    let outcome = store
        .delete_by_filter(&FilterExpr::Eq(0, FilterValue::U64(0)))
        .unwrap();
    assert_eq!(outcome.deleted, 5);
    assert_eq!(store.status().total_vectors, 5);
    store.close().unwrap();
}

View File

@@ -0,0 +1,302 @@
//! RVF CLI / persistence smoke tests -- Phase 1 acceptance criteria.
//!
//! Validates the end-to-end lifecycle that the Node.js CLI wraps:
//! 1. Create an RVF store
//! 2. Ingest vectors
//! 3. Query and verify results
//! 4. Close (simulating process exit)
//! 5. Reopen (simulating process restart)
//! 6. Query again and verify identical results
//!
//! Also exercises the rvlite adapter layer for the same persistence
//! guarantee and tests that error paths produce clear messages.
use std::path::Path;
use rvf_adapter_rvlite::{RvliteCollection, RvliteConfig, RvliteMetric};
use rvf_runtime::options::{DistanceMetric, QueryOptions, RvfOptions};
use rvf_runtime::RvfStore;
use tempfile::TempDir;
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/// Deterministic pseudo-random vector generation using an LCG.
/// Produces values in [-0.5, 0.5).
///
/// Fix: the previous version divided the 31-bit output `(x >> 33)` by
/// `u32::MAX` (a 32-bit denominator), so every component fell in [-0.5, 0.0)
/// and all generated vectors lived in a single orthant. Normalizing by 2^31
/// matches the numerator's actual range and restores the symmetric spread
/// used by the analogous generators elsewhere in these tests. Callers only
/// rely on determinism and exact-match retrieval, which are unaffected.
fn random_vector(dim: usize, seed: u64) -> Vec<f32> {
    let mut v = Vec::with_capacity(dim);
    let mut x = seed;
    for _ in 0..dim {
        x = x
            .wrapping_mul(6364136223846793005)
            .wrapping_add(1442695040888963407);
        // (x >> 33) is uniform over [0, 2^31); scale to [0, 1), then shift.
        v.push(((x >> 33) as f32) / ((1u64 << 31) as f32) - 0.5);
    }
    v
}
fn make_options(dim: u16) -> RvfOptions {
RvfOptions {
dimension: dim,
metric: DistanceMetric::L2,
..Default::default()
}
}
// ---------------------------------------------------------------------------
// 1. Core RVF store: create -> ingest -> query -> close -> reopen -> query
// ---------------------------------------------------------------------------
/// End-to-end persistence: create + ingest 200 vectors, query for the
/// exact-match neighbor of a known seed, close, reopen ("restart"), and
/// require the identical result set (same ids, same distances).
#[test]
fn smoke_rvf_persistence_across_restart() {
    let dir = TempDir::new().unwrap();
    let path = dir.path().join("smoke.rvf");
    let dim: u16 = 32;
    let k = 5;
    // -- Phase 1: create, populate, query, record results, close ----------
    let results_before;
    {
        let mut store = RvfStore::create(&path, make_options(dim)).unwrap();
        // Ingest 200 vectors; id i gets the vector from seed i * 13 + 7, so
        // any id's exact vector can be regenerated later for querying.
        let vectors: Vec<Vec<f32>> = (1..=200)
            .map(|i| random_vector(dim as usize, i * 13 + 7))
            .collect();
        let refs: Vec<&[f32]> = vectors.iter().map(|v| v.as_slice()).collect();
        let ids: Vec<u64> = (1..=200).collect();
        let ingest = store.ingest_batch(&refs, &ids, None).unwrap();
        assert_eq!(ingest.accepted, 200, "all 200 vectors should be accepted");
        // Query with a known vector (seed for id=100).
        let query = random_vector(dim as usize, 100 * 13 + 7);
        results_before = store.query(&query, k, &QueryOptions::default()).unwrap();
        assert_eq!(results_before.len(), k);
        assert_eq!(
            results_before[0].id, 100,
            "exact-match vector should be first"
        );
        assert!(
            results_before[0].distance < 1e-6,
            "exact-match distance should be near zero"
        );
        // Verify status before closing.
        let status = store.status();
        assert_eq!(status.total_vectors, 200);
        store.close().unwrap();
    }
    // -- Phase 2: reopen and verify identical results ---------------------
    {
        let store = RvfStore::open(&path).unwrap();
        // Status should reflect the same count.
        assert_eq!(
            store.status().total_vectors,
            200,
            "vector count must survive restart"
        );
        // Same query must produce identical results.
        let query = random_vector(dim as usize, 100 * 13 + 7);
        let results_after = store.query(&query, k, &QueryOptions::default()).unwrap();
        assert_eq!(results_after.len(), results_before.len());
        for (before, after) in results_before.iter().zip(results_after.iter()) {
            assert_eq!(before.id, after.id, "result IDs must match across restart");
            assert!(
                (before.distance - after.distance).abs() < 1e-6,
                "distances must match across restart: {} vs {}",
                before.distance,
                after.distance
            );
        }
        store.close().unwrap();
    }
}
// ---------------------------------------------------------------------------
// 2. Rvlite adapter: same persistence guarantee through the adapter API
// ---------------------------------------------------------------------------
/// Same persistence guarantee as the core-store smoke test, exercised
/// through the rvlite adapter (create/add/search/close, then open/search).
#[test]
fn smoke_rvlite_adapter_persistence() {
    let dir = TempDir::new().unwrap();
    let path = dir.path().join("adapter_smoke.rvf");
    let dim: u16 = 8;
    // -- Phase 1: create via adapter, add vectors, search, close ----------
    let results_before;
    {
        let config = RvliteConfig::new(path.clone(), dim).with_metric(RvliteMetric::L2);
        let mut col = RvliteCollection::create(config).unwrap();
        // Five 8-dim vectors; id 1 is the exact match for the later query.
        col.add(1, &[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
            .unwrap();
        col.add(2, &[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
            .unwrap();
        col.add(3, &[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0])
            .unwrap();
        col.add(4, &[1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
            .unwrap();
        col.add(5, &[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0])
            .unwrap();
        assert_eq!(col.len(), 5);
        results_before = col.search(&[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 3);
        assert_eq!(results_before.len(), 3);
        assert_eq!(results_before[0].id, 1, "exact match should be first");
        assert!(results_before[0].distance < f32::EPSILON);
        col.close().unwrap();
    }
    // -- Phase 2: reopen via adapter, verify same results -----------------
    {
        let col = RvliteCollection::open(&path).unwrap();
        assert_eq!(col.len(), 5, "vector count must survive adapter restart");
        assert_eq!(col.dimension(), dim);
        let results_after = col.search(&[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 3);
        assert_eq!(results_after.len(), results_before.len());
        for (before, after) in results_before.iter().zip(results_after.iter()) {
            assert_eq!(
                before.id, after.id,
                "adapter result IDs must match across restart"
            );
            assert!(
                (before.distance - after.distance).abs() < 1e-6,
                "adapter distances must match across restart"
            );
        }
        col.close().unwrap();
    }
}
// ---------------------------------------------------------------------------
// 3. Delete-then-restart: deletions survive process restart
// ---------------------------------------------------------------------------
/// Deletions performed before close must still be in effect after reopen:
/// the count stays reduced and the deleted ids never reappear in results.
#[test]
fn smoke_deletions_persist_across_restart() {
    let dir = TempDir::new().unwrap();
    let path = dir.path().join("del_persist_smoke.rvf");
    let dim: u16 = 4;
    // Phase 1: create, populate 20 vectors, delete ids 5/10/15, close.
    {
        let mut store = RvfStore::create(&path, make_options(dim)).unwrap();
        let vectors: Vec<Vec<f32>> = (0..20).map(|i| vec![i as f32; dim as usize]).collect();
        let refs: Vec<&[f32]> = vectors.iter().map(|v| v.as_slice()).collect();
        let ids: Vec<u64> = (1..=20).collect();
        store.ingest_batch(&refs, &ids, None).unwrap();
        store.delete(&[5, 10, 15]).unwrap();
        assert_eq!(store.status().total_vectors, 17);
        store.close().unwrap();
    }
    // Phase 2: reopen and verify deletions survived.
    {
        let store = RvfStore::open(&path).unwrap();
        assert_eq!(
            store.status().total_vectors,
            17,
            "17 vectors should remain after restart"
        );
        // Query with high k to get all results; deleted IDs must be absent.
        let query = vec![5.0f32; dim as usize];
        let results = store.query(&query, 20, &QueryOptions::default()).unwrap();
        for r in &results {
            assert!(
                r.id != 5 && r.id != 10 && r.id != 15,
                "deleted vector {} appeared after restart",
                r.id
            );
        }
        store.close().unwrap();
    }
}
// ---------------------------------------------------------------------------
// 4. Compact-then-restart: compacted store reopens correctly
// ---------------------------------------------------------------------------
/// A store that was compacted after deleting half its vectors must reopen
/// with the same count and return identical query results, with none of the
/// deleted ids resurfacing.
#[test]
fn smoke_compact_then_restart() {
    let dir = TempDir::new().unwrap();
    let path = dir.path().join("compact_restart_smoke.rvf");
    let dim: u16 = 8;
    // Phase 1: create, populate, delete half, compact, record query, close.
    let results_before;
    {
        let mut store = RvfStore::create(&path, make_options(dim)).unwrap();
        // id j stores the vector generated from seed j - 1.
        let vectors: Vec<Vec<f32>> = (0..100).map(|i| random_vector(dim as usize, i)).collect();
        let refs: Vec<&[f32]> = vectors.iter().map(|v| v.as_slice()).collect();
        let ids: Vec<u64> = (1..=100).collect();
        store.ingest_batch(&refs, &ids, None).unwrap();
        let del_ids: Vec<u64> = (1..=50).collect();
        store.delete(&del_ids).unwrap();
        store.compact().unwrap();
        assert_eq!(store.status().total_vectors, 50);
        let query = random_vector(dim as usize, 75); // exact vector of surviving id 76
        results_before = store.query(&query, 10, &QueryOptions::default()).unwrap();
        assert!(!results_before.is_empty());
        store.close().unwrap();
    }
    // Phase 2: reopen and verify same results.
    {
        let store = RvfStore::open(&path).unwrap();
        assert_eq!(store.status().total_vectors, 50);
        let query = random_vector(dim as usize, 75);
        let results_after = store.query(&query, 10, &QueryOptions::default()).unwrap();
        assert_eq!(results_before.len(), results_after.len());
        for (b, a) in results_before.iter().zip(results_after.iter()) {
            assert_eq!(b.id, a.id, "post-compact restart: IDs must match");
            assert!(
                (b.distance - a.distance).abs() < 1e-6,
                "post-compact restart: distances must match"
            );
        }
        // All results should have id > 50 (deleted ids were 1..=50).
        for r in &results_after {
            assert!(
                r.id > 50,
                "post-compact restart: deleted id {} should not appear",
                r.id
            );
        }
        store.close().unwrap();
    }
}
// ---------------------------------------------------------------------------
// 5. Missing dependency produces clear error message
// ---------------------------------------------------------------------------
/// Opening a path that does not exist must produce a meaningful error, not a
/// panic. This mirrors the "missing @ruvector/rvf" scenario at the Rust
/// level -- the file simply doesn't exist.
///
/// Fix: the previous version probed a hardcoded `/tmp/...` path, which could
/// in principle exist (leftover from another run or a name collision). Using
/// a path inside a fresh, empty `TempDir` guarantees the file is absent.
#[test]
fn smoke_nonexistent_store_gives_clear_error() {
    let dir = TempDir::new().unwrap();
    let missing = dir.path().join("nonexistent_rvf_smoke_test.rvf");
    let result = RvfStore::open(&missing);
    assert!(result.is_err(), "opening nonexistent store should fail");
    let err_msg = match result {
        Err(e) => format!("{e}"),
        Ok(_) => panic!("expected error, got Ok"),
    };
    // The error message should be informative (not empty or cryptic).
    assert!(!err_msg.is_empty(), "error message should not be empty");
}

View File

@@ -0,0 +1,616 @@
//! End-to-end RVF smoke test -- full lifecycle verification.
//!
//! Exercises the complete RVF pipeline through 15 steps:
//! 1. Create a new store (dim=128, cosine metric)
//! 2. Ingest 100 random vectors with metadata
//! 3. Query for 10 nearest neighbors of a known vector
//! 4. Verify results are sorted and distances are valid (0.0..2.0 for cosine)
//! 5. Close the store
//! 6. Reopen the store (simulating process restart)
//! 7. Query again with the same vector
//! 8. Verify results match the first query exactly (persistence verified)
//! 9. Delete some vectors
//! 10. Compact the store
//! 11. Verify deleted vectors no longer appear in results
//! 12. Derive a child store
//! 13. Verify child can be queried independently
//! 14. Verify segment listing works on both parent and child
//! 15. Clean up temporary files
//!
//! NOTE: The `DistanceMetric` is not persisted in the manifest, so after
//! `RvfStore::open()` the metric defaults to L2. The lifecycle test therefore
//! uses L2 for the cross-restart comparison (steps 5-8), while cosine-specific
//! assertions are exercised in a dedicated single-session test.
use rvf_runtime::options::{
DistanceMetric, MetadataEntry, MetadataValue, QueryOptions, RvfOptions,
};
use rvf_runtime::RvfStore;
use rvf_types::DerivationType;
use tempfile::TempDir;
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/// Deterministic pseudo-random vector generation using a 64-bit LCG.
///
/// The same `seed` always produces the same vector. Each component is the
/// top 31 bits of the LCG state scaled by `u32::MAX` and shifted down by
/// 0.5, so values land in a small band around zero (approximately
/// [-0.5, 0.0), since a 31-bit value spans at most half of `u32::MAX`).
fn random_vector(dim: usize, seed: u64) -> Vec<f32> {
    let mut state = seed;
    (0..dim)
        .map(|_| {
            state = state
                .wrapping_mul(6364136223846793005)
                .wrapping_add(1442695040888963407);
            ((state >> 33) as f32) / (u32::MAX as f32) - 0.5
        })
        .collect()
}
/// Scale `v` to unit L2 norm in place so cosine distance over the result
/// is well-defined. Near-zero vectors are left untouched to avoid
/// division by (almost) zero.
fn normalize(v: &mut [f32]) {
    let norm = v.iter().map(|x| x * x).sum::<f32>().sqrt();
    if norm <= f32::EPSILON {
        return;
    }
    v.iter_mut().for_each(|x| *x /= norm);
}
/// Convenience wrapper: a seeded pseudo-random vector, L2-normalized so it
/// is suitable as a cosine-metric query.
fn random_unit_vector(dim: usize, seed: u64) -> Vec<f32> {
    let mut out = random_vector(dim, seed);
    normalize(&mut out);
    out
}
fn make_options(dim: u16, metric: DistanceMetric) -> RvfOptions {
RvfOptions {
dimension: dim,
metric,
..Default::default()
}
}
// ---------------------------------------------------------------------------
// Full lifecycle smoke test (L2 metric for cross-restart consistency)
// ---------------------------------------------------------------------------
#[test]
fn rvf_smoke_full_lifecycle() {
    let dir = TempDir::new().expect("failed to create temp dir");
    let store_path = dir.path().join("smoke_lifecycle.rvf");
    let child_path = dir.path().join("smoke_child.rvf");
    let dim: u16 = 128;
    let k: usize = 10;
    let vector_count: usize = 100;
    // Use L2 metric for the lifecycle test because the metric is not persisted
    // in the manifest. After reopen, the store defaults to L2, so using L2
    // throughout ensures cross-restart distance comparisons are exact.
    let options = make_options(dim, DistanceMetric::L2);
    // -----------------------------------------------------------------------
    // Step 1: Create a new RVF store with dimension 128 and L2 metric
    // (the module doc's "cosine" applies only to the single-session test;
    // see the note above about the metric not being persisted)
    // -----------------------------------------------------------------------
    let mut store =
        RvfStore::create(&store_path, options.clone()).expect("step 1: failed to create store");
    // Verify initial state.
    let initial_status = store.status();
    assert_eq!(
        initial_status.total_vectors, 0,
        "step 1: new store should be empty"
    );
    assert!(
        !initial_status.read_only,
        "step 1: new store should not be read-only"
    );
    // -----------------------------------------------------------------------
    // Step 2: Ingest 100 random vectors with metadata
    // -----------------------------------------------------------------------
    // Vector i (0-based) gets seed i*17+5 and external id i+1; queries below
    // regenerate vectors from the same seeds to find exact matches.
    let vectors: Vec<Vec<f32>> = (0..vector_count as u64)
        .map(|i| random_vector(dim as usize, i * 17 + 5))
        .collect();
    let vec_refs: Vec<&[f32]> = vectors.iter().map(|v| v.as_slice()).collect();
    let ids: Vec<u64> = (1..=vector_count as u64).collect();
    // One metadata entry per vector: field_id=0, value=category string.
    let metadata: Vec<MetadataEntry> = ids
        .iter()
        .map(|&id| MetadataEntry {
            field_id: 0,
            value: MetadataValue::String(format!("group_{}", id % 5)),
        })
        .collect();
    let ingest_result = store
        .ingest_batch(&vec_refs, &ids, Some(&metadata))
        .expect("step 2: ingest failed");
    assert_eq!(
        ingest_result.accepted, vector_count as u64,
        "step 2: all {} vectors should be accepted",
        vector_count,
    );
    assert_eq!(
        ingest_result.rejected, 0,
        "step 2: no vectors should be rejected"
    );
    assert!(
        ingest_result.epoch > 0,
        "step 2: epoch should advance after ingest"
    );
    // -----------------------------------------------------------------------
    // Step 3: Query for 10 nearest neighbors of a known vector
    // -----------------------------------------------------------------------
    // Use vector with id=50 as the query (seed = 49 * 17 + 5 = 838).
    let query_vec = random_vector(dim as usize, 49 * 17 + 5);
    let results_first = store
        .query(&query_vec, k, &QueryOptions::default())
        .expect("step 3: query failed");
    assert_eq!(
        results_first.len(),
        k,
        "step 3: should return exactly {} results",
        k,
    );
    // The first result should be the exact match (id=50).
    assert_eq!(
        results_first[0].id, 50,
        "step 3: exact match vector should be first result",
    );
    assert!(
        results_first[0].distance < 1e-5,
        "step 3: exact match distance should be near zero, got {}",
        results_first[0].distance,
    );
    // -----------------------------------------------------------------------
    // Step 4: Verify results are sorted by distance and distances are valid
    // (L2 distances are non-negative)
    // -----------------------------------------------------------------------
    for i in 1..results_first.len() {
        assert!(
            results_first[i].distance >= results_first[i - 1].distance,
            "step 4: results not sorted at position {}: {} > {}",
            i,
            results_first[i - 1].distance,
            results_first[i].distance,
        );
    }
    for r in &results_first {
        assert!(
            r.distance >= 0.0,
            "step 4: L2 distance {} should be non-negative",
            r.distance,
        );
    }
    // -----------------------------------------------------------------------
    // Step 5: Close the store
    // -----------------------------------------------------------------------
    store.close().expect("step 5: close failed");
    // -----------------------------------------------------------------------
    // Step 6: Reopen the store (simulating process restart)
    // -----------------------------------------------------------------------
    let store = RvfStore::open(&store_path).expect("step 6: reopen failed");
    let reopen_status = store.status();
    assert_eq!(
        reopen_status.total_vectors, vector_count as u64,
        "step 6: all {} vectors should persist after reopen",
        vector_count,
    );
    // -----------------------------------------------------------------------
    // Step 7: Query again with the same vector
    // -----------------------------------------------------------------------
    let results_second = store
        .query(&query_vec, k, &QueryOptions::default())
        .expect("step 7: query after reopen failed");
    assert_eq!(
        results_second.len(),
        k,
        "step 7: should return exactly {} results after reopen",
        k,
    );
    // -----------------------------------------------------------------------
    // Step 8: Verify results match the first query exactly (persistence)
    //
    // After reopen, the internal iteration order of vectors may differ, which
    // can affect tie-breaking in the k-NN heap. We therefore compare:
    // (a) the set of result IDs must be identical,
    // (b) distances for each ID must match within floating-point tolerance,
    // (c) result count must be the same.
    // -----------------------------------------------------------------------
    assert_eq!(
        results_first.len(),
        results_second.len(),
        "step 8: result count should match across restart",
    );
    // Build a map of id -> distance for comparison.
    let first_map: std::collections::HashMap<u64, f32> =
        results_first.iter().map(|r| (r.id, r.distance)).collect();
    let second_map: std::collections::HashMap<u64, f32> =
        results_second.iter().map(|r| (r.id, r.distance)).collect();
    // Verify the exact same IDs appear in both result sets.
    let mut first_ids: Vec<u64> = first_map.keys().copied().collect();
    let mut second_ids: Vec<u64> = second_map.keys().copied().collect();
    first_ids.sort();
    second_ids.sort();
    assert_eq!(
        first_ids, second_ids,
        "step 8: result ID sets must match across restart",
    );
    // Verify distances match per-ID within tolerance.
    for &id in &first_ids {
        let d1 = first_map[&id];
        let d2 = second_map[&id];
        assert!(
            (d1 - d2).abs() < 1e-5,
            "step 8: distance mismatch for id={}: {} vs {} (pre vs post restart)",
            id,
            d1,
            d2,
        );
    }
    // Need a mutable store for delete/compact. Drop the read-write handle and
    // reopen it mutably.
    store
        .close()
        .expect("step 8: close for mutable reopen failed");
    let mut store = RvfStore::open(&store_path).expect("step 8: mutable reopen failed");
    // -----------------------------------------------------------------------
    // Step 9: Delete some vectors (ids 1..=10)
    // -----------------------------------------------------------------------
    let delete_ids: Vec<u64> = (1..=10).collect();
    let del_result = store.delete(&delete_ids).expect("step 9: delete failed");
    assert_eq!(
        del_result.deleted, 10,
        "step 9: should have deleted 10 vectors",
    );
    assert!(
        del_result.epoch > reopen_status.current_epoch,
        "step 9: epoch should advance after delete",
    );
    // Quick verification: deleted vectors should not appear in query.
    let post_delete_results = store
        .query(&query_vec, vector_count, &QueryOptions::default())
        .expect("step 9: post-delete query failed");
    for r in &post_delete_results {
        assert!(
            r.id > 10,
            "step 9: deleted vector {} should not appear in results",
            r.id,
        );
    }
    assert_eq!(
        post_delete_results.len(),
        vector_count - 10,
        "step 9: should have {} results after deleting 10",
        vector_count - 10,
    );
    // -----------------------------------------------------------------------
    // Step 10: Compact the store
    // -----------------------------------------------------------------------
    let pre_compact_epoch = store.status().current_epoch;
    let compact_result = store.compact().expect("step 10: compact failed");
    assert!(
        compact_result.segments_compacted > 0 || compact_result.bytes_reclaimed > 0,
        "step 10: compaction should reclaim space",
    );
    assert!(
        compact_result.epoch > pre_compact_epoch,
        "step 10: epoch should advance after compact",
    );
    // -----------------------------------------------------------------------
    // Step 11: Verify deleted vectors no longer appear in results
    // -----------------------------------------------------------------------
    let post_compact_results = store
        .query(&query_vec, vector_count, &QueryOptions::default())
        .expect("step 11: post-compact query failed");
    for r in &post_compact_results {
        assert!(
            r.id > 10,
            "step 11: deleted vector {} appeared after compaction",
            r.id,
        );
    }
    assert_eq!(
        post_compact_results.len(),
        vector_count - 10,
        "step 11: should still have {} results post-compact",
        vector_count - 10,
    );
    // Verify post-compact status.
    let post_compact_status = store.status();
    assert_eq!(
        post_compact_status.total_vectors,
        (vector_count - 10) as u64,
        "step 11: status should reflect {} live vectors",
        vector_count - 10,
    );
    // -----------------------------------------------------------------------
    // Step 12: Derive a child store
    // -----------------------------------------------------------------------
    let child = store
        .derive(&child_path, DerivationType::Clone, Some(options.clone()))
        .expect("step 12: derive failed");
    // Verify lineage.
    assert_eq!(
        child.lineage_depth(),
        1,
        "step 12: child lineage depth should be 1",
    );
    assert_eq!(
        child.parent_id(),
        store.file_id(),
        "step 12: child parent_id should match parent file_id",
    );
    assert_ne!(
        child.file_id(),
        store.file_id(),
        "step 12: child should have a distinct file_id",
    );
    // -----------------------------------------------------------------------
    // Step 13: Verify child can be queried independently
    // -----------------------------------------------------------------------
    // The child is a fresh derived store (no vectors copied by default via
    // derive -- only lineage metadata). Query should return empty or results
    // depending on whether vectors were inherited. We just verify it does not
    // panic and returns a valid response.
    let child_query = random_vector(dim as usize, 999);
    let child_results = child
        .query(&child_query, k, &QueryOptions::default())
        .expect("step 13: child query failed");
    // Child is newly derived with no vectors of its own, so results should be empty.
    assert!(
        child_results.is_empty(),
        "step 13: freshly derived child should have no vectors, got {}",
        child_results.len(),
    );
    // -----------------------------------------------------------------------
    // Step 14: Verify segment listing works on both parent and child
    // -----------------------------------------------------------------------
    let parent_segments = store.segment_dir();
    assert!(
        !parent_segments.is_empty(),
        "step 14: parent should have at least one segment",
    );
    let child_segments = child.segment_dir();
    assert!(
        !child_segments.is_empty(),
        "step 14: child should have at least one segment (manifest)",
    );
    // Verify segment tuples have valid structure (seg_id > 0, type byte > 0).
    for &(seg_id, _offset, _len, seg_type) in parent_segments {
        assert!(seg_id > 0, "step 14: parent segment ID should be > 0");
        assert!(seg_type > 0, "step 14: parent segment type should be > 0");
    }
    for &(seg_id, _offset, _len, seg_type) in child_segments {
        assert!(seg_id > 0, "step 14: child segment ID should be > 0");
        assert!(seg_type > 0, "step 14: child segment type should be > 0");
    }
    // -----------------------------------------------------------------------
    // Step 15: Clean up temporary files
    // -----------------------------------------------------------------------
    child.close().expect("step 15: child close failed");
    store.close().expect("step 15: parent close failed");
    // TempDir's Drop impl will remove the directory, but verify the files exist
    // before cleanup happens.
    assert!(
        store_path.exists(),
        "step 15: parent store file should exist before cleanup",
    );
    assert!(
        child_path.exists(),
        "step 15: child store file should exist before cleanup",
    );
    // Explicitly drop the TempDir to trigger cleanup.
    drop(dir);
}
// ---------------------------------------------------------------------------
// Additional focused smoke tests
// ---------------------------------------------------------------------------
/// Verify that cosine metric returns distances strictly in [0.0, 2.0] range
/// for all query results when using normalized vectors. This test runs within
/// a single session (no restart) to avoid the metric-not-persisted issue.
#[test]
fn smoke_cosine_distance_range() {
    let dir = TempDir::new().unwrap();
    let path = dir.path().join("cosine_range.rvf");
    let dim: u16 = 128;
    let mut store = RvfStore::create(&path, make_options(dim, DistanceMetric::Cosine)).unwrap();

    // Fifty unit-length vectors so cosine distance is well-defined.
    let vectors: Vec<Vec<f32>> = (0..50)
        .map(|i| random_unit_vector(dim as usize, i * 31 + 3))
        .collect();
    let ids: Vec<u64> = (1..=50).collect();
    let refs: Vec<&[f32]> = vectors.iter().map(Vec::as_slice).collect();
    store.ingest_batch(&refs, &ids, None).unwrap();

    // Probe with several unrelated unit queries; every returned distance
    // must sit inside the cosine range and results must be sorted.
    for seed in [0, 42, 100, 999, 12345] {
        let q = random_unit_vector(dim as usize, seed);
        let results = store.query(&q, 50, &QueryOptions::default()).unwrap();
        for r in &results {
            assert!(
                r.distance >= 0.0 && r.distance <= 2.0,
                "cosine distance {} out of range [0.0, 2.0] for seed {}",
                r.distance,
                seed,
            );
        }
        for (j, pair) in results.windows(2).enumerate() {
            assert!(
                pair[1].distance >= pair[0].distance,
                "results not sorted for seed {}: {} > {} at position {}",
                seed,
                pair[0].distance,
                pair[1].distance,
                j + 1,
            );
        }
    }
    store.close().unwrap();
}
/// Verify persistence across multiple close/reopen cycles with interleaved
/// ingests and deletes. Uses L2 metric for cross-restart consistency.
#[test]
fn smoke_multi_restart_persistence() {
    let dir = TempDir::new().unwrap();
    let path = dir.path().join("multi_restart.rvf");
    let dim: u16 = 128;
    let options = make_options(dim, DistanceMetric::L2);
    // Cycle 1: create and ingest 50 vectors (ids 1..=50, seeds 0..50).
    {
        let mut store = RvfStore::create(&path, options.clone()).unwrap();
        let vectors: Vec<Vec<f32>> = (0..50).map(|i| random_vector(dim as usize, i)).collect();
        let refs: Vec<&[f32]> = vectors.iter().map(|v| v.as_slice()).collect();
        let ids: Vec<u64> = (1..=50).collect();
        store.ingest_batch(&refs, &ids, None).unwrap();
        assert_eq!(store.status().total_vectors, 50);
        store.close().unwrap();
    }
    // Cycle 2: reopen, ingest 50 more (ids 51..=100), delete 10, close.
    {
        let mut store = RvfStore::open(&path).unwrap();
        // Cycle 1's vectors must have survived the first restart.
        assert_eq!(store.status().total_vectors, 50);
        let vectors: Vec<Vec<f32>> = (50..100).map(|i| random_vector(dim as usize, i)).collect();
        let refs: Vec<&[f32]> = vectors.iter().map(|v| v.as_slice()).collect();
        let ids: Vec<u64> = (51..=100).collect();
        store.ingest_batch(&refs, &ids, None).unwrap();
        assert_eq!(store.status().total_vectors, 100);
        // Delete five ids from each ingest batch so both generations of
        // data carry tombstones.
        store
            .delete(&[5, 10, 15, 20, 25, 55, 60, 65, 70, 75])
            .unwrap();
        assert_eq!(store.status().total_vectors, 90);
        store.close().unwrap();
    }
    // Cycle 3: reopen, verify counts, compact, close.
    {
        let mut store = RvfStore::open(&path).unwrap();
        assert_eq!(
            store.status().total_vectors,
            90,
            "cycle 3: 90 vectors should survive two restarts",
        );
        // Compaction must not change the live-vector count.
        store.compact().unwrap();
        assert_eq!(store.status().total_vectors, 90);
        // Verify no deleted IDs appear in a full query.
        let q = random_vector(dim as usize, 42);
        let results = store.query(&q, 100, &QueryOptions::default()).unwrap();
        let deleted_ids = [5, 10, 15, 20, 25, 55, 60, 65, 70, 75];
        for r in &results {
            assert!(
                !deleted_ids.contains(&r.id),
                "cycle 3: deleted vector {} appeared after compact + restart",
                r.id,
            );
        }
        store.close().unwrap();
    }
    // Cycle 4: final reopen (readonly), verify persistence survived compact.
    {
        let store = RvfStore::open_readonly(&path).unwrap();
        assert_eq!(
            store.status().total_vectors,
            90,
            "cycle 4: 90 vectors should survive compact + restart",
        );
        // Read-only handles must report themselves as such.
        assert!(store.status().read_only);
    }
}
/// Verify metadata ingestion and that vector IDs are correct after batch
/// operations.
#[test]
fn smoke_metadata_and_ids() {
    let dir = TempDir::new().unwrap();
    let path = dir.path().join("meta_ids.rvf");
    let dim: u16 = 128;
    let mut store = RvfStore::create(&path, make_options(dim, DistanceMetric::L2)).unwrap();

    // 100 vectors with ids 1..=100; vector i (0-based) uses seed i*7+1.
    // Each carries one metadata entry: field 0 = its own id as a U64.
    let ids: Vec<u64> = (1..=100).collect();
    let vectors: Vec<Vec<f32>> = (0..100)
        .map(|i| random_vector(dim as usize, i * 7 + 1))
        .collect();
    let metadata: Vec<MetadataEntry> = ids
        .iter()
        .map(|&id| MetadataEntry {
            field_id: 0,
            value: MetadataValue::U64(id),
        })
        .collect();
    let refs: Vec<&[f32]> = vectors.iter().map(Vec::as_slice).collect();
    let result = store.ingest_batch(&refs, &ids, Some(&metadata)).unwrap();
    assert_eq!(result.accepted, 100);
    assert_eq!(result.rejected, 0);

    // Vector id=42 was generated from seed 41*7+1; regenerating from the
    // same seed must return it as a near-zero-distance exact match.
    let query = random_vector(dim as usize, 41 * 7 + 1);
    let results = store.query(&query, 1, &QueryOptions::default()).unwrap();
    assert_eq!(results.len(), 1);
    assert_eq!(results[0].id, 42, "exact match should be id=42");
    assert!(results[0].distance < 1e-5);
    store.close().unwrap();
}

View File

@@ -0,0 +1,398 @@
//! Integration tests for segment preservation during compaction.
//!
//! Tests that unknown or extension segments (Kernel, Ebpf, etc.) survive
//! compaction cycles, and that the compact operation correctly rewrites
//! vector data while preserving other segments byte-for-byte.
use rvf_runtime::options::{DistanceMetric, QueryOptions, RvfOptions};
use rvf_runtime::RvfStore;
use rvf_types::{SegmentType, SEGMENT_HEADER_SIZE, SEGMENT_MAGIC};
use std::fs::OpenOptions;
use std::io::{Read, Write};
use tempfile::TempDir;
// ---------------------------------------------------------------------------
// Helper: make RvfStore options
// ---------------------------------------------------------------------------
fn make_options(dim: u16) -> RvfOptions {
RvfOptions {
dimension: dim,
metric: DistanceMetric::L2,
..Default::default()
}
}
// ---------------------------------------------------------------------------
// Helper: read file bytes
// ---------------------------------------------------------------------------
/// Read the entire file at `path` into memory.
///
/// Uses `std::fs::read`, which pre-sizes the buffer from file metadata
/// instead of growing it incrementally via a manual read loop. Panics if
/// the file cannot be opened or read -- acceptable in a test helper, and
/// matching the original `unwrap()` behavior.
fn read_file_bytes(path: &std::path::Path) -> Vec<u8> {
    std::fs::read(path).expect("failed to read file")
}
// ---------------------------------------------------------------------------
// Helper: scan file for segments of a given type
// ---------------------------------------------------------------------------
/// Scan raw file bytes for segment headers of the given type.
///
/// Returns one `(byte_offset, seg_id, payload_len)` tuple per match. The
/// scan slides over every byte offset and treats any position where the
/// four little-endian magic bytes line up as a candidate header; the type
/// byte lives at +0x05, seg_id at +0x08, and payload length at +0x10.
fn scan_segments_of_type(file_bytes: &[u8], seg_type: u8) -> Vec<(usize, u64, u64)> {
    let magic = SEGMENT_MAGIC.to_le_bytes();
    let mut found = Vec::new();
    if file_bytes.len() < SEGMENT_HEADER_SIZE {
        return found;
    }
    for offset in 0..=(file_bytes.len() - SEGMENT_HEADER_SIZE) {
        let window = &file_bytes[offset..];
        if window[..4] != magic || window[5] != seg_type {
            continue;
        }
        let seg_id = u64::from_le_bytes(window[0x08..0x10].try_into().unwrap());
        let payload_len = u64::from_le_bytes(window[0x10..0x18].try_into().unwrap());
        found.push((offset, seg_id, payload_len));
    }
    found
}
// ===========================================================================
// TEST 1: kernel_segment_survives_compaction
// ===========================================================================
/// Embed a kernel into a store, compact, and verify the kernel segment
/// is preserved in the compacted file.
#[test]
fn kernel_segment_survives_compaction() {
    let dir = TempDir::new().unwrap();
    let path = dir.path().join("kernel_compact.rvf");
    let dim: u16 = 4;
    let kernel_image = b"test-kernel-image-for-compaction-test";
    let mut store = RvfStore::create(&path, make_options(dim)).unwrap();
    // Ingest vectors (ids 0..10, each a constant vector of its own id value).
    let vectors: Vec<Vec<f32>> = (0..10).map(|i| vec![i as f32; dim as usize]).collect();
    let refs: Vec<&[f32]> = vectors.iter().map(|v| v.as_slice()).collect();
    let ids: Vec<u64> = (0..10).collect();
    store.ingest_batch(&refs, &ids, None).unwrap();
    // Embed kernel. NOTE(review): the three leading numeric args and the
    // trailing 8080 are positional; confirm their meaning against the
    // RvfStore::embed_kernel signature before changing them.
    let _kernel_seg_id = store
        .embed_kernel(0x00, 0x00, 0, kernel_image, 8080, None)
        .unwrap();
    // Delete some vectors to trigger compaction
    store.delete(&[0, 2, 4, 6, 8]).unwrap();
    // Compact
    store.compact().unwrap();
    // Verify vectors are correct
    let status = store.status();
    assert_eq!(
        status.total_vectors, 5,
        "should have 5 vectors after compaction"
    );
    // Verify kernel segment is still present by scanning the raw file bytes.
    let bytes = read_file_bytes(&path);
    let kernel_segs = scan_segments_of_type(&bytes, SegmentType::Kernel as u8);
    assert!(
        !kernel_segs.is_empty(),
        "KERNEL_SEG should survive compaction"
    );
    // Verify the kernel can still be extracted through the runtime API.
    let extracted = store.extract_kernel().unwrap();
    assert!(extracted.is_some(), "kernel should still be extractable");
    let (header_bytes, image_bytes) = extracted.unwrap();
    // The extracted kernel header is expected to be exactly 128 bytes.
    assert_eq!(header_bytes.len(), 128);
    // Only the leading bytes are compared -- presumably the stored image may
    // carry trailing padding; TODO confirm against embed_kernel's layout.
    assert_eq!(
        &image_bytes[..kernel_image.len()],
        kernel_image,
        "kernel image content should be preserved"
    );
    store.close().unwrap();
    println!("PASS: kernel_segment_survives_compaction");
}
// ===========================================================================
// TEST 2: ebpf_segment_survives_compaction
// ===========================================================================
/// Embed an eBPF program, compact, and verify it survives.
#[test]
fn ebpf_segment_survives_compaction() {
    let dir = TempDir::new().unwrap();
    let path = dir.path().join("ebpf_compact.rvf");
    let dim: u16 = 4;
    let bytecode = b"ebpf-bytecode-for-compaction-test-12345678";
    let mut store = RvfStore::create(&path, make_options(dim)).unwrap();

    // Seed the store with a handful of vectors (ids 0..6).
    let ids: Vec<u64> = (0..6).collect();
    let vectors: Vec<Vec<f32>> = ids.iter().map(|&i| vec![i as f32; dim as usize]).collect();
    let refs: Vec<&[f32]> = vectors.iter().map(Vec::as_slice).collect();
    store.ingest_batch(&refs, &ids, None).unwrap();

    // Embed the eBPF payload, then force a rewrite via delete + compact.
    store.embed_ebpf(0x01, 0x02, 128, bytecode, None).unwrap();
    store.delete(&[0, 2, 4]).unwrap();
    store.compact().unwrap();

    // The raw file must still contain an eBPF segment header...
    let bytes = read_file_bytes(&path);
    let ebpf_segs = scan_segments_of_type(&bytes, SegmentType::Ebpf as u8);
    assert!(!ebpf_segs.is_empty(), "EBPF_SEG should survive compaction");

    // ...and the runtime must still be able to extract it intact.
    let extracted = store.extract_ebpf().unwrap();
    assert!(extracted.is_some(), "eBPF should still be extractable");
    let (header, payload) = extracted.unwrap();
    assert_eq!(header.len(), 64);
    assert_eq!(
        &payload[..bytecode.len()],
        bytecode,
        "eBPF bytecode should be preserved"
    );
    store.close().unwrap();
    println!("PASS: ebpf_segment_survives_compaction");
}
// ===========================================================================
// TEST 3: both_kernel_and_ebpf_survive_compaction
// ===========================================================================
/// Embed both kernel and eBPF segments, compact, and verify both survive.
#[test]
fn both_kernel_and_ebpf_survive_compaction() {
    let dir = TempDir::new().unwrap();
    let path = dir.path().join("both_compact.rvf");
    let dim: u16 = 4;
    let kernel_image = b"kernel-data-for-dual-segment-test";
    let ebpf_bytecode = b"ebpf-code-for-dual-segment-test";
    let mut store = RvfStore::create(&path, make_options(dim)).unwrap();
    // Eight vectors, ids 0..8.
    let vectors: Vec<Vec<f32>> = (0..8).map(|i| vec![i as f32; dim as usize]).collect();
    let refs: Vec<&[f32]> = vectors.iter().map(|v| v.as_slice()).collect();
    let ids: Vec<u64> = (0..8).collect();
    store.ingest_batch(&refs, &ids, None).unwrap();
    // Embed both extension segments. NOTE(review): the numeric args are
    // positional; confirm their meaning against the embed_kernel /
    // embed_ebpf signatures before changing them.
    store
        .embed_kernel(0x01, 0x00, 0x01, kernel_image, 9090, Some("quiet"))
        .unwrap();
    store
        .embed_ebpf(0x02, 0x01, 256, ebpf_bytecode, None)
        .unwrap();
    // Delete half the vectors and compact
    store.delete(&[0, 1, 2, 3]).unwrap();
    store.compact().unwrap();
    assert_eq!(store.status().total_vectors, 4);
    // Both should survive -- check the raw bytes for both segment types.
    let bytes = read_file_bytes(&path);
    let kernel_segs = scan_segments_of_type(&bytes, SegmentType::Kernel as u8);
    let ebpf_segs = scan_segments_of_type(&bytes, SegmentType::Ebpf as u8);
    assert!(
        !kernel_segs.is_empty(),
        "KERNEL_SEG should survive compaction"
    );
    assert!(!ebpf_segs.is_empty(), "EBPF_SEG should survive compaction");
    // And both must remain extractable through the runtime API.
    assert!(store.extract_kernel().unwrap().is_some());
    assert!(store.extract_ebpf().unwrap().is_some());
    store.close().unwrap();
    println!("PASS: both_kernel_and_ebpf_survive_compaction");
}
// ===========================================================================
// TEST 4: unknown_segment_type_survives_compaction
// ===========================================================================
/// Manually append a segment with an unknown type code (simulating a future
/// format extension), compact, and verify it survives.
#[test]
fn unknown_segment_type_survives_compaction() {
    let dir = TempDir::new().unwrap();
    let path = dir.path().join("unknown_seg.rvf");
    let dim: u16 = 4;
    let unknown_seg_type: u8 = 0x30; // Not defined in current SegmentType enum
    let unknown_payload = b"future-segment-payload-data-v2";
    // Create a minimal store with a single vector, then close it so the
    // file can safely be appended to out-of-band.
    {
        let mut store = RvfStore::create(&path, make_options(dim)).unwrap();
        let v = vec![1.0f32; dim as usize];
        store.ingest_batch(&[v.as_slice()], &[1], None).unwrap();
        store.close().unwrap();
    }
    // Manually append an "unknown" segment: a raw header (magic at 0..4,
    // version byte at 4, type byte at 5, seg_id at +0x08, payload length
    // at +0x10) followed by the payload bytes.
    {
        let mut file = OpenOptions::new().append(true).open(&path).unwrap();
        let mut header = [0u8; SEGMENT_HEADER_SIZE];
        header[0..4].copy_from_slice(&SEGMENT_MAGIC.to_le_bytes());
        header[4] = 1; // version
        header[5] = unknown_seg_type;
        // flags at 6..8 stay zero
        header[0x08..0x10].copy_from_slice(&9999u64.to_le_bytes()); // seg_id
        header[0x10..0x18].copy_from_slice(&(unknown_payload.len() as u64).to_le_bytes());
        file.write_all(&header).unwrap();
        file.write_all(unknown_payload).unwrap();
        file.sync_all().unwrap();
    }
    // Sanity check: the appended segment is visible in the raw bytes.
    let bytes_before = read_file_bytes(&path);
    let unknown_before = scan_segments_of_type(&bytes_before, unknown_seg_type);
    assert_eq!(
        unknown_before.len(),
        1,
        "should find 1 unknown segment before compaction"
    );
    // Compact: reopen read-write, compact, close.
    {
        let mut store = RvfStore::open(&path).unwrap();
        store.compact().unwrap();
        store.close().unwrap();
    }
    // Verify the unknown segment survived
    let bytes_after = read_file_bytes(&path);
    let unknown_after = scan_segments_of_type(&bytes_after, unknown_seg_type);
    assert_eq!(
        unknown_after.len(),
        1,
        "unknown segment should survive compaction"
    );
    // Verify the payload is intact byte-for-byte, not just the header.
    let (offset, _seg_id, payload_len) = unknown_after[0];
    let payload_start = offset + SEGMENT_HEADER_SIZE;
    let payload_end = payload_start + payload_len as usize;
    assert_eq!(
        &bytes_after[payload_start..payload_end],
        unknown_payload,
        "unknown segment payload should be preserved"
    );
    println!("PASS: unknown_segment_type_survives_compaction");
}
// ===========================================================================
// TEST 5: compaction_removes_dead_vectors_but_keeps_live
// ===========================================================================
/// Verify that compaction correctly removes deleted vectors while
/// keeping live ones, and that queries still return correct results.
#[test]
fn compaction_removes_dead_vectors_but_keeps_live() {
    let dir = TempDir::new().unwrap();
    let path = dir.path().join("compact_live.rvf");
    let dim: u16 = 4;
    let mut store = RvfStore::create(&path, make_options(dim)).unwrap();

    // Ten vectors spread along the first axis: (0,0,0,0), (1,0,0,0), ...
    let ids: Vec<u64> = (0..10).collect();
    let vectors: Vec<Vec<f32>> = ids
        .iter()
        .map(|&i| vec![i as f32, 0.0, 0.0, 0.0])
        .collect();
    let refs: Vec<&[f32]> = vectors.iter().map(Vec::as_slice).collect();
    store.ingest_batch(&refs, &ids, None).unwrap();

    // Tombstone every odd id, then compact, recording sizes around it.
    store.delete(&[1, 3, 5, 7, 9]).unwrap();
    let pre_compact_size = store.status().file_size;
    store.compact().unwrap();
    let post_compact_size = store.status().file_size;

    // Exactly the five even-id vectors remain live and queryable.
    assert_eq!(store.status().total_vectors, 5);
    let origin = vec![0.0f32, 0.0, 0.0, 0.0];
    let results = store.query(&origin, 10, &QueryOptions::default()).unwrap();
    assert_eq!(results.len(), 5);
    for r in &results {
        assert!(r.id % 2 == 0, "only even IDs should remain, got {}", r.id);
    }

    // File should be smaller (or at least not larger) after compaction
    // (may be larger due to segment overhead, but vector data should shrink)
    assert!(
        post_compact_size <= pre_compact_size + 256,
        "compacted file should not grow significantly: pre={pre_compact_size}, post={post_compact_size}"
    );
    store.close().unwrap();
    println!("PASS: compaction_removes_dead_vectors_but_keeps_live");
}
// ===========================================================================
// TEST 6: compacted_store_can_be_reopened
// ===========================================================================
/// After compaction, close and reopen the store to verify durability.
#[test]
fn compacted_store_can_be_reopened() {
    let dir = TempDir::new().unwrap();
    let path = dir.path().join("compact_reopen.rvf");
    let dim: u16 = 4;

    // Session 1: ingest 20 vectors, delete 4, compact, close.
    {
        let mut store = RvfStore::create(&path, make_options(dim)).unwrap();
        let ids: Vec<u64> = (0..20).collect();
        let vectors: Vec<Vec<f32>> = ids
            .iter()
            .map(|&i| vec![i as f32, 0.0, 0.0, 0.0])
            .collect();
        let refs: Vec<&[f32]> = vectors.iter().map(Vec::as_slice).collect();
        store.ingest_batch(&refs, &ids, None).unwrap();
        store.delete(&[0, 5, 10, 15]).unwrap();
        store.compact().unwrap();
        store.close().unwrap();
    }

    // Session 2: a read-only reopen must see exactly the 16 survivors,
    // and none of the deleted ids may resurface in query results.
    {
        let store = RvfStore::open_readonly(&path).unwrap();
        assert_eq!(store.status().total_vectors, 16);
        let query = vec![1.0, 0.0, 0.0, 0.0];
        let results = store.query(&query, 5, &QueryOptions::default()).unwrap();
        assert_eq!(results.len(), 5);
        for r in &results {
            assert!(
                r.id != 0 && r.id != 5 && r.id != 10 && r.id != 15,
                "deleted vector {} should not appear",
                r.id
            );
        }
    }
    println!("PASS: compacted_store_can_be_reopened");
}

View File

@@ -0,0 +1,419 @@
//! Unknown segment type preservation during compaction.
//!
//! Forward-compatibility guarantee: older RVF tools MUST NOT silently
//! discard segment types they do not recognize. This test verifies that
//! unknown segment types (e.g., a future KERNEL_SEG 0x0E or EBPF_SEG 0x0F)
//! survive a compact/rewrite cycle byte-for-byte.
//!
//! If this test fails, it means the compaction implementation only rewrites
//! known segment types and drops everything else -- a valid finding that
//! should be fixed before shipping a format version bump.
use rvf_runtime::options::{DistanceMetric, QueryOptions, RvfOptions};
use rvf_runtime::RvfStore;
use std::fs::OpenOptions;
use std::io::{Read, Write};
use tempfile::TempDir;
/// The RVF segment header magic: "RVFS" as a little-endian u32.
const SEGMENT_MAGIC: u32 = 0x5256_4653;
/// Size of the 64-byte segment header.
const SEGMENT_HEADER_SIZE: usize = 64;
/// A hypothetical future segment type not yet defined in SegmentType.
const UNKNOWN_SEG_TYPE_KERNEL: u8 = 0x0E;
/// Another hypothetical future segment type (vendor extension range).
const UNKNOWN_SEG_TYPE_VENDOR: u8 = 0xFE;
fn make_options(dim: u16) -> RvfOptions {
RvfOptions {
dimension: dim,
metric: DistanceMetric::L2,
..Default::default()
}
}
/// Construct a raw 64-byte RVF segment header for an unknown segment type.
///
/// Only the fields the scanner cares about are populated: magic ("RVFS"),
/// version (1), segment type, segment id, and payload length. Everything else
/// (flags, timestamp, checksum, compression, ...) is left zeroed.
fn build_raw_segment_header(
    seg_type: u8,
    seg_id: u64,
    payload_len: u64,
) -> [u8; SEGMENT_HEADER_SIZE] {
    let mut header = [0u8; SEGMENT_HEADER_SIZE];
    header[0x00..0x04].copy_from_slice(&SEGMENT_MAGIC.to_le_bytes()); // magic "RVFS"
    header[0x04] = 1; // format version
    header[0x05] = seg_type; // segment type byte
    // flags at 0x06 stay zero
    header[0x08..0x10].copy_from_slice(&seg_id.to_le_bytes()); // segment_id (LE)
    header[0x10..0x18].copy_from_slice(&payload_len.to_le_bytes()); // payload_length (LE)
    header
}
/// Scan `file_bytes` for every occurrence of the segment magic and decode the
/// surrounding header fields.
///
/// Returns one `(offset, seg_type, segment_id, payload_length)` tuple per hit.
///
/// NOTE: this is a brute-force scan over every byte offset, so a payload that
/// happens to contain the magic bytes would produce a false positive. The test
/// payloads in this file are chosen so that cannot happen.
fn scan_segments(file_bytes: &[u8]) -> Vec<(usize, u8, u64, u64)> {
    let magic_bytes = SEGMENT_MAGIC.to_le_bytes();
    let mut segments = Vec::new();
    if file_bytes.len() < SEGMENT_HEADER_SIZE {
        return segments;
    }
    let last_possible = file_bytes.len() - SEGMENT_HEADER_SIZE;
    for i in 0..=last_possible {
        if file_bytes[i..i + 4] == magic_bytes {
            // seg_type is the single byte at header offset 0x05.
            let seg_type = file_bytes[i + 5];
            // segment_id and payload_length are little-endian u64s at fixed
            // header offsets 0x08 and 0x10.
            let seg_id = u64::from_le_bytes(file_bytes[i + 0x08..i + 0x10].try_into().unwrap());
            let payload_len =
                u64::from_le_bytes(file_bytes[i + 0x10..i + 0x18].try_into().unwrap());
            segments.push((i, seg_type, seg_id, payload_len));
        }
    }
    segments
}
/// Read the entire file at `path` into a byte vector.
///
/// Uses `std::fs::read`, which sizes the buffer from file metadata instead of
/// growing it incrementally. Panics on any I/O error (acceptable in tests).
fn read_file_bytes(path: &std::path::Path) -> Vec<u8> {
    std::fs::read(path).unwrap()
}
/// Slice out one full segment (64-byte header plus payload) that starts at
/// `offset` within `file_bytes`.
fn extract_segment_bytes(file_bytes: &[u8], offset: usize, payload_len: u64) -> &[u8] {
    let total = SEGMENT_HEADER_SIZE + payload_len as usize;
    &file_bytes[offset..offset + total]
}
// --------------------------------------------------------------------------
// 1. Unknown segment is preserved after compaction (KERNEL_SEG 0x0E)
// --------------------------------------------------------------------------
//
// NOTE: The current compaction implementation in store.rs rewrites the file
// by creating a temp file containing only the live VEC_SEGs and a new
// manifest. It does NOT preserve unknown/unrecognized segment types.
// Therefore this test documents the EXPECTED behavior (unknown segments
// should be preserved) but is anticipated to FAIL against the current
// implementation. This is a known gap -- not a bug in the test.
/// Forward-compatibility acceptance test: an appended segment of an
/// unrecognized type (hypothetical KERNEL_SEG 0x0E) must survive a
/// delete + compact cycle byte-for-byte. Per the NOTE above, this documents
/// EXPECTED behavior and is anticipated to fail until compaction preserves
/// unknown segment types.
#[test]
fn unknown_segment_preserved_after_compaction() {
    let dir = TempDir::new().unwrap();
    let path = dir.path().join("unknown_seg.rvf");
    let dim: u16 = 4;
    // --- Step 1: Create a store and ingest some vectors -----------------------
    {
        let mut store = RvfStore::create(&path, make_options(dim)).unwrap();
        let vectors: Vec<Vec<f32>> = (0..20).map(|i| vec![i as f32; dim as usize]).collect();
        let refs: Vec<&[f32]> = vectors.iter().map(|v| v.as_slice()).collect();
        let ids: Vec<u64> = (1..=20).collect();
        store.ingest_batch(&refs, &ids, None).unwrap();
        store.close().unwrap();
    }
    // --- Step 2: Manually append an unknown segment (KERNEL_SEG 0x0E) ---------
    // The payload is arbitrary opaque data -- perhaps a future eBPF bytecode
    // blob or kernel routing table. We use a recognizable pattern so we can
    // verify byte-for-byte preservation.
    let unknown_payload: Vec<u8> = (0..128u8).collect(); // 128 bytes of 0x00..0x7F
    let unknown_seg_id: u64 = 9999;
    {
        let mut file = OpenOptions::new().append(true).open(&path).unwrap();
        let header = build_raw_segment_header(
            UNKNOWN_SEG_TYPE_KERNEL,
            unknown_seg_id,
            unknown_payload.len() as u64,
        );
        file.write_all(&header).unwrap();
        file.write_all(&unknown_payload).unwrap();
        // Flush to disk so the subsequent whole-file scan sees these bytes.
        file.sync_all().unwrap();
    }
    // --- Step 3: Verify the unknown segment is present in the file ------------
    let bytes_before = read_file_bytes(&path);
    let segments_before = scan_segments(&bytes_before);
    let unknown_before: Vec<_> = segments_before
        .iter()
        .filter(|&&(_, seg_type, _, _)| seg_type == UNKNOWN_SEG_TYPE_KERNEL)
        .collect();
    assert_eq!(
        unknown_before.len(),
        1,
        "expected exactly 1 unknown segment (type 0x{:02X}) before compaction, found {}",
        UNKNOWN_SEG_TYPE_KERNEL,
        unknown_before.len()
    );
    // Destructure the single (offset, seg_type, segment_id, payload_len) hit.
    let &(off_before, _, sid_before, plen_before) = unknown_before[0];
    assert_eq!(sid_before, unknown_seg_id);
    assert_eq!(plen_before, unknown_payload.len() as u64);
    // Save the full segment bytes for later comparison.
    let seg_bytes_before = extract_segment_bytes(&bytes_before, off_before, plen_before).to_vec();
    println!(
        "Before compaction: unknown segment at offset {}, {} total bytes (header+payload)",
        off_before,
        seg_bytes_before.len()
    );
    // --- Step 4: Delete some vectors and compact ------------------------------
    {
        let mut store = RvfStore::open(&path).unwrap();
        // Delete a few vectors to give compaction something to do.
        let del_ids: Vec<u64> = (1..=5).collect();
        store.delete(&del_ids).unwrap();
        let compact_result = store.compact().unwrap();
        println!(
            "Compaction: segments_compacted={}, bytes_reclaimed={}",
            compact_result.segments_compacted, compact_result.bytes_reclaimed
        );
        store.close().unwrap();
    }
    // --- Step 5: Verify the unknown segment still exists after compaction -----
    let bytes_after = read_file_bytes(&path);
    let segments_after = scan_segments(&bytes_after);
    println!(
        "After compaction: {} total segments found in file scan",
        segments_after.len()
    );
    // Dump every segment found so a failure is easy to diagnose from the log.
    for &(off, stype, sid, plen) in &segments_after {
        println!(
            "  offset={}, type=0x{:02X}, seg_id={}, payload_len={}",
            off, stype, sid, plen
        );
    }
    let unknown_after: Vec<_> = segments_after
        .iter()
        .filter(|&&(_, seg_type, _, _)| seg_type == UNKNOWN_SEG_TYPE_KERNEL)
        .collect();
    // CRITICAL ASSERTION: The unknown segment must survive compaction.
    // If this fails, the compaction implementation is dropping segments it
    // does not understand, which breaks forward compatibility.
    assert_eq!(
        unknown_after.len(),
        1,
        "FORWARD COMPATIBILITY VIOLATION: unknown segment type 0x{:02X} was dropped \
during compaction. Older tools must preserve segment types they do not recognize. \
Found {} unknown segments after compaction (expected 1).",
        UNKNOWN_SEG_TYPE_KERNEL,
        unknown_after.len()
    );
    // Verify byte-for-byte preservation of the segment (header + payload).
    let &(off_after, _, _, plen_after) = unknown_after[0];
    let seg_bytes_after = extract_segment_bytes(&bytes_after, off_after, plen_after).to_vec();
    assert_eq!(
        seg_bytes_before,
        seg_bytes_after,
        "Unknown segment was NOT preserved byte-for-byte. \
Before: {} bytes at offset {}, After: {} bytes at offset {}",
        seg_bytes_before.len(),
        off_before,
        seg_bytes_after.len(),
        off_after
    );
    println!(
        "PASS: unknown segment type 0x{:02X} preserved byte-for-byte after compaction",
        UNKNOWN_SEG_TYPE_KERNEL
    );
}
// --------------------------------------------------------------------------
// 2. Multiple unknown segment types are all preserved
// --------------------------------------------------------------------------
//
// Same forward-compatibility concern as above: if compaction drops one
// unknown type it probably drops all of them.
/// Forward-compatibility: two distinct unrecognized segment types (0x0E and a
/// vendor-range 0xFE) must both survive a delete + compact cycle.
#[test]
fn multiple_unknown_segment_types_preserved() {
    let dir = TempDir::new().unwrap();
    let path = dir.path().join("multi_unknown.rvf");
    let dim: u16 = 4;
    // Seed the store with a handful of vectors.
    {
        let mut store = RvfStore::create(&path, make_options(dim)).unwrap();
        let data: Vec<Vec<f32>> = (0..10).map(|i| vec![i as f32; dim as usize]).collect();
        let slices: Vec<&[f32]> = data.iter().map(|v| v.as_slice()).collect();
        let id_list: Vec<u64> = (1..=10).collect();
        store.ingest_batch(&slices, &id_list, None).unwrap();
        store.close().unwrap();
    }
    // Append the two unknown segments with small, recognizable payloads.
    let kernel_payload: Vec<u8> = vec![0xDE, 0xAD, 0xBE, 0xEF]; // 4 bytes
    let vendor_payload: Vec<u8> = vec![0xCA, 0xFE, 0xBA, 0xBE, 0x00, 0xFF]; // 6 bytes
    {
        let mut file = OpenOptions::new().append(true).open(&path).unwrap();
        for (stype, sid, payload) in [
            (UNKNOWN_SEG_TYPE_KERNEL, 8001u64, &kernel_payload),
            (UNKNOWN_SEG_TYPE_VENDOR, 8002u64, &vendor_payload),
        ] {
            let header = build_raw_segment_header(stype, sid, payload.len() as u64);
            file.write_all(&header).unwrap();
            file.write_all(payload).unwrap();
        }
        file.sync_all().unwrap();
    }
    // Helper: how many segments of `wanted` type does a file scan find?
    let count_type =
        |bytes: &[u8], wanted: u8| scan_segments(bytes).iter().filter(|s| s.1 == wanted).count();
    // Both must be visible before compaction.
    let bytes_before = read_file_bytes(&path);
    assert_eq!(
        count_type(&bytes_before, UNKNOWN_SEG_TYPE_KERNEL),
        1,
        "KERNEL_SEG should exist before compaction"
    );
    assert_eq!(
        count_type(&bytes_before, UNKNOWN_SEG_TYPE_VENDOR),
        1,
        "VENDOR_SEG should exist before compaction"
    );
    // Delete a couple of vectors and compact.
    {
        let mut store = RvfStore::open(&path).unwrap();
        store.delete(&[1, 2]).unwrap();
        store.compact().unwrap();
        store.close().unwrap();
    }
    // Both unknown types must still be present afterwards.
    let bytes_after = read_file_bytes(&path);
    let kernel_after = count_type(&bytes_after, UNKNOWN_SEG_TYPE_KERNEL);
    let vendor_after = count_type(&bytes_after, UNKNOWN_SEG_TYPE_VENDOR);
    println!(
        "After compaction: KERNEL_SEG(0x0E) count={}, VENDOR_SEG(0xFE) count={}",
        kernel_after, vendor_after
    );
    assert_eq!(
        kernel_after, 1,
        "FORWARD COMPATIBILITY VIOLATION: KERNEL_SEG (0x{:02X}) was dropped during compaction",
        UNKNOWN_SEG_TYPE_KERNEL
    );
    assert_eq!(
        vendor_after, 1,
        "FORWARD COMPATIBILITY VIOLATION: VENDOR_SEG (0x{:02X}) was dropped during compaction",
        UNKNOWN_SEG_TYPE_VENDOR
    );
}
// --------------------------------------------------------------------------
// 3. Unknown segment does not break store open/query (read tolerance)
// --------------------------------------------------------------------------
//
// Even if compaction does not preserve unknown segments, the store should
// at least be able to OPEN and QUERY a file that contains them, without
// panicking or returning errors.
/// Read tolerance: a file with a trailing unknown segment must still open and
/// answer queries correctly, even if compaction would not preserve it.
#[test]
fn unknown_segment_does_not_break_read_path() {
    let dir = TempDir::new().unwrap();
    let path = dir.path().join("read_tolerance.rvf");
    let dim: u16 = 4;
    // Build a small store: ids 1..=10 holding [0,0,0,0] .. [9,9,9,9].
    {
        let mut store = RvfStore::create(&path, make_options(dim)).unwrap();
        let data: Vec<Vec<f32>> = (0..10).map(|i| vec![i as f32; dim as usize]).collect();
        let slices: Vec<&[f32]> = data.iter().map(|v| v.as_slice()).collect();
        let id_list: Vec<u64> = (1..=10).collect();
        store.ingest_batch(&slices, &id_list, None).unwrap();
        store.close().unwrap();
    }
    // Append an unrecognized segment (type 0x0F) at the tail, so the layout is
    // [manifest] [vec_seg] [manifest] [UNKNOWN]. The manifest scanner (which
    // reads from the tail looking for manifest type 0x05) must skip past it.
    {
        let mut file = OpenOptions::new().append(true).open(&path).unwrap();
        let opaque = vec![0xABu8; 64];
        let header = build_raw_segment_header(0x0F, 7777, opaque.len() as u64);
        file.write_all(&header).unwrap();
        file.write_all(&opaque).unwrap();
        file.sync_all().unwrap();
    }
    // Reopen read-only: must not panic or error despite the foreign bytes.
    let store = RvfStore::open_readonly(&path).unwrap();
    assert_eq!(
        store.status().total_vectors,
        10,
        "store should still report 10 vectors even with unknown segment appended"
    );
    // Queries must keep working too.
    let probe = vec![5.0f32; dim as usize];
    let hits = store.query(&probe, 5, &QueryOptions::default()).unwrap();
    assert!(
        !hits.is_empty(),
        "query should return results despite unknown segment in file"
    );
    assert_eq!(
        hits[0].id, 6,
        "closest vector to [5,5,5,5] should be id=6 (value [5,5,5,5])"
    );
    println!("PASS: store opens and queries correctly with unknown segment type 0x0F in file");
}

View File

@@ -0,0 +1,124 @@
//! Round-trip tests: write + read all segment types via rvf-wire,
//! verifying data integrity across the full encode/decode pipeline.
use rvf_types::{SegmentFlags, SegmentType, SEGMENT_HEADER_SIZE, SEGMENT_MAGIC, SEGMENT_VERSION};
use rvf_wire::{read_segment, validate_segment, write_segment};
/// Every segment type defined by the spec, paired with a human-readable name
/// used in assertion messages.
fn all_segment_types() -> Vec<(u8, &'static str)> {
    let types = [
        (SegmentType::Vec as u8, "VEC_SEG"),
        (SegmentType::Index as u8, "INDEX_SEG"),
        (SegmentType::Quant as u8, "QUANT_SEG"),
        (SegmentType::Journal as u8, "JOURNAL_SEG"),
        (SegmentType::Manifest as u8, "MANIFEST_SEG"),
        (SegmentType::Meta as u8, "META_SEG"),
        (SegmentType::Hot as u8, "HOT_SEG"),
    ];
    types.to_vec()
}
/// Encode then decode a small payload for every known segment type, checking
/// that all header fields and the payload survive the round trip.
#[test]
fn round_trip_all_segment_types() {
    for (seg_type, name) in all_segment_types() {
        let payload = format!("payload for {name}");
        let encoded = write_segment(seg_type, payload.as_bytes(), SegmentFlags::empty(), 42);
        let (header, decoded_payload) = match read_segment(&encoded) {
            Ok(pair) => pair,
            Err(e) => panic!("failed to read {name}: {e:?}"),
        };
        assert_eq!(header.magic, SEGMENT_MAGIC, "{name}: bad magic");
        assert_eq!(header.version, SEGMENT_VERSION, "{name}: bad version");
        assert_eq!(header.seg_type, seg_type, "{name}: bad seg_type");
        assert_eq!(header.segment_id, 42, "{name}: bad segment_id");
        assert_eq!(
            decoded_payload,
            payload.as_bytes(),
            "{name}: payload mismatch"
        );
    }
}
/// The content hash recorded in each header must validate against the decoded
/// payload for every segment type.
#[test]
fn round_trip_validates_content_hash() {
    for (seg_type, name) in all_segment_types() {
        // 256-byte ramp 0x00..=0xFF so every byte value is exercised.
        let payload: Vec<u8> = (0..=255u8).collect();
        let encoded = write_segment(seg_type, &payload, SegmentFlags::empty(), 1);
        let (header, decoded_payload) = read_segment(&encoded).unwrap();
        if let Err(e) = validate_segment(&header, decoded_payload) {
            panic!("{name}: hash validation failed: {e:?}");
        }
    }
}
/// Segment flags set at write time must be readable from the decoded header.
#[test]
fn round_trip_preserves_flags() {
    let flags = SegmentFlags::empty()
        .with(SegmentFlags::COMPRESSED)
        .with(SegmentFlags::SEALED);
    let encoded = write_segment(SegmentType::Vec as u8, b"flagged", flags, 99);
    let (header, _payload) = read_segment(&encoded).unwrap();
    // Both bits must survive the round trip.
    assert!(header.flags & SegmentFlags::COMPRESSED != 0);
    assert!(header.flags & SegmentFlags::SEALED != 0);
}
/// A zero-length payload yields a header-only segment: exactly 64 bytes with
/// no padding.
#[test]
fn round_trip_empty_payload() {
    let encoded = write_segment(SegmentType::Meta as u8, &[], SegmentFlags::empty(), 0);
    let (header, payload) = read_segment(&encoded).unwrap();
    assert_eq!(header.payload_length, 0);
    assert_eq!(payload.len(), 0);
    assert_eq!(encoded.len(), SEGMENT_HEADER_SIZE); // 64 bytes, no padding needed
}
/// A 10 KB payload survives encode/decode and passes hash validation.
#[test]
fn round_trip_large_payload() {
    // Modulo a prime (251) so the pattern does not repeat on a power-of-two.
    let payload: Vec<u8> = (0..10000).map(|i| (i % 251) as u8).collect();
    let encoded = write_segment(SegmentType::Vec as u8, &payload, SegmentFlags::empty(), 7);
    let (header, decoded_payload) = read_segment(&encoded).unwrap();
    assert_eq!(header.payload_length, 10000);
    assert_eq!(decoded_payload, payload.as_slice());
    validate_segment(&header, decoded_payload).unwrap();
}
/// The encoder must pad every segment out to a 64-byte boundary for any
/// payload size, including sizes that straddle the alignment boundary.
#[test]
fn output_is_64_byte_aligned() {
    let sizes = [0usize, 1, 10, 63, 64, 65, 100, 127, 128, 129, 255, 256, 1000];
    for &size in &sizes {
        let payload = vec![0xABu8; size];
        let encoded = write_segment(SegmentType::Vec as u8, &payload, SegmentFlags::empty(), 0);
        assert_eq!(
            encoded.len() % 64,
            0,
            "not 64-byte aligned for payload size {size}"
        );
    }
}
/// Several segments written back-to-back into one buffer can each be decoded
/// again from its recorded start offset.
#[test]
fn multi_segment_file() {
    // Write five VEC segments contiguously, remembering where each begins.
    let mut file = Vec::new();
    let mut offsets = Vec::new();
    for i in 0..5 {
        let payload = format!("segment {i} data");
        offsets.push(file.len());
        let encoded = write_segment(
            SegmentType::Vec as u8,
            payload.as_bytes(),
            SegmentFlags::empty(),
            i,
        );
        file.extend_from_slice(&encoded);
    }
    // Decode each segment from its offset and check id and payload.
    for (i, &offset) in offsets.iter().enumerate() {
        let (header, payload) = read_segment(&file[offset..]).unwrap();
        assert_eq!(header.segment_id, i as u64);
        assert_eq!(payload, format!("segment {i} data").as_bytes());
    }
}