Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'

This commit is contained in:
ruv
2026-02-28 14:39:40 -05:00
7854 changed files with 3522914 additions and 0 deletions

View File

@@ -0,0 +1,57 @@
[package]
name = "ruvector-delta-index"
version = "0.1.0"
edition = "2021"
description = "Delta-aware HNSW index with incremental updates and repair strategies"
license = "MIT OR Apache-2.0"
repository = "https://github.com/ruvnet/ruvector"
keywords = ["hnsw", "delta", "index", "incremental", "vector"]
categories = ["data-structures", "algorithms"]
[features]
default = ["parallel"]
parallel = ["rayon"]
simd = ["simsimd"]
persistence = ["bincode"]
[dependencies]
# Core delta library
ruvector-delta-core = { path = "../ruvector-delta-core" }
# Error handling
thiserror = "2.0"
# Data structures
parking_lot = "0.12"
dashmap = "6.0"
smallvec = { version = "1.13", features = ["union"] }
# Priority queue for HNSW
priority-queue = "2.0"
# Random number generation
rand = "0.8"
rand_xorshift = "0.3"
# Optional parallelism
rayon = { version = "1.10", optional = true }
# Optional SIMD
simsimd = { version = "5.9", optional = true }
# Optional serialization
bincode = { version = "2.0.0-rc.3", optional = true }
serde = { version = "1.0", features = ["derive"], optional = true }
[dev-dependencies]
criterion = "0.6"
proptest = "1.4"
# Benchmarks will be added later
# [[bench]]
# name = "incremental_update"
# harness = false
#
# [[bench]]
# name = "repair_strategies"
# harness = false

View File

@@ -0,0 +1,65 @@
//! Error types for delta index operations
use std::fmt;
/// Shorthand for results whose error type is [`IndexError`].
pub type Result<T> = std::result::Result<T, IndexError>;

/// Failure modes reported by delta index operations.
#[derive(Debug, Clone)]
pub enum IndexError {
    /// A vector's length differs from the dimensionality the index expects.
    DimensionMismatch {
        /// Dimensionality the index expects.
        expected: usize,
        /// Dimensionality that was actually supplied.
        actual: usize,
    },
    /// An element with this ID is already present.
    DuplicateId(String),
    /// No element with this ID exists.
    NotFound(String),
    /// Applying a delta failed; the payload is the underlying error text.
    DeltaError(String),
    /// The index cannot accept more elements.
    IndexFull {
        /// Capacity limit that was reached.
        max: usize,
    },
    /// A configuration value was rejected.
    InvalidConfig(String),
    /// The graph structure was found to be inconsistent.
    GraphCorruption(String),
    /// Encoding or decoding the index failed.
    SerializationError(String),
}

impl fmt::Display for IndexError {
    /// Renders a one-line, human-readable message for each variant.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            IndexError::DimensionMismatch { expected, actual } => {
                write!(f, "Dimension mismatch: expected {expected}, got {actual}")
            }
            IndexError::DuplicateId(id) => write!(f, "Duplicate ID: {id}"),
            IndexError::NotFound(id) => write!(f, "ID not found: {id}"),
            IndexError::DeltaError(msg) => write!(f, "Delta error: {msg}"),
            IndexError::IndexFull { max } => write!(f, "Index full (max {max})"),
            IndexError::InvalidConfig(msg) => write!(f, "Invalid config: {msg}"),
            IndexError::GraphCorruption(msg) => write!(f, "Graph corruption: {msg}"),
            IndexError::SerializationError(msg) => write!(f, "Serialization error: {msg}"),
        }
    }
}

impl std::error::Error for IndexError {}

View File

@@ -0,0 +1,237 @@
//! Incremental index updates
//!
//! Provides efficient strategies for updating the index without full rebuild.
use std::collections::HashMap;
use ruvector_delta_core::{Delta, VectorDelta};
use crate::{DeltaHnsw, Result, SearchResult};
/// Tuning knobs for the incremental update path.
#[derive(Debug, Clone)]
pub struct IncrementalConfig {
    /// Delta magnitude (L2 norm) at which an update counts as needing reconnection.
    pub reconnect_threshold: f32,
    /// Number of distinct pending IDs at which a flush is requested.
    pub batch_threshold: usize,
    /// When `true`, deltas below `reconnect_threshold` skip graph repair entirely.
    pub lazy_reconnect: bool,
}

impl Default for IncrementalConfig {
    /// Defaults: threshold `0.1`, batches of `100`, lazy reconnection enabled.
    fn default() -> Self {
        IncrementalConfig {
            lazy_reconnect: true,
            batch_threshold: 100,
            reconnect_threshold: 0.1,
        }
    }
}
/// Handles incremental updates to the HNSW index
pub struct IncrementalUpdater {
config: IncrementalConfig,
pending_updates: HashMap<String, VectorDelta>,
total_updates: usize,
}
impl IncrementalUpdater {
/// Create a new incremental updater
pub fn new(config: IncrementalConfig) -> Self {
Self {
config,
pending_updates: HashMap::new(),
total_updates: 0,
}
}
/// Queue an update for batch processing
pub fn queue_update(&mut self, id: String, delta: VectorDelta) {
self.pending_updates
.entry(id)
.and_modify(|existing| {
*existing = existing.clone().compose(delta.clone());
})
.or_insert(delta);
self.total_updates += 1;
}
/// Check if batch processing is needed
pub fn needs_flush(&self) -> bool {
self.pending_updates.len() >= self.config.batch_threshold
}
/// Flush pending updates to the index
pub fn flush(&mut self, index: &mut DeltaHnsw) -> Result<FlushResult> {
let mut applied = 0;
let mut reconnected = 0;
let mut errors = Vec::new();
let updates: Vec<_> = self.pending_updates.drain().collect();
for (id, delta) in updates {
match index.apply_delta(&id, &delta) {
Ok(()) => {
applied += 1;
// Check if reconnection is needed
if delta.l2_norm() > self.config.reconnect_threshold {
reconnected += 1;
}
}
Err(e) => {
errors.push((id, e.to_string()));
}
}
}
Ok(FlushResult {
applied,
reconnected,
errors,
})
}
/// Get number of pending updates
pub fn pending_count(&self) -> usize {
self.pending_updates.len()
}
/// Get total updates processed
pub fn total_updates(&self) -> usize {
self.total_updates
}
/// Clear pending updates without applying
pub fn clear_pending(&mut self) {
self.pending_updates.clear();
}
}
/// Outcome of one flush of pending updates.
#[derive(Debug)]
pub struct FlushResult {
    /// How many deltas were applied successfully.
    pub applied: usize,
    /// How many applied deltas were counted as reconnections.
    pub reconnected: usize,
    /// `(id, message)` pairs for deltas that could not be applied.
    pub errors: Vec<(String, String)>,
}
/// How much graph maintenance accompanies a vector update.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum UpdateStrategy {
    /// Update the stored vector only; leave the graph untouched.
    DeltaOnly,
    /// Update the vector and refresh the links to nearby nodes.
    LocalRepair,
    /// Update the vector and rebuild all of the node's connections.
    FullReconnect,
    /// Postpone the update so it is handled as part of a batch.
    Deferred,
}
/// Pick an [`UpdateStrategy`] from the L2 norm of `delta`.
///
/// With `t = config.reconnect_threshold`:
/// - norm below `0.1 * t`: `DeltaOnly`
/// - norm below `t`: `DeltaOnly` when `lazy_reconnect` is set, else `LocalRepair`
/// - norm below `5 * t`: `LocalRepair`
/// - anything else: `FullReconnect`
pub fn select_strategy(delta: &VectorDelta, config: &IncrementalConfig) -> UpdateStrategy {
    let threshold = config.reconnect_threshold;
    let norm = delta.l2_norm();

    if norm < threshold * 0.1 {
        return UpdateStrategy::DeltaOnly;
    }
    if norm < threshold {
        return match config.lazy_reconnect {
            true => UpdateStrategy::DeltaOnly,
            false => UpdateStrategy::LocalRepair,
        };
    }
    if norm < threshold * 5.0 {
        return UpdateStrategy::LocalRepair;
    }
    UpdateStrategy::FullReconnect
}
/// Running counters describing the updates that have been recorded.
#[derive(Debug, Clone, Default)]
pub struct UpdateStats {
    /// Number of updates recorded.
    pub total_applied: usize,
    /// Number of recorded updates flagged as reconnections.
    pub reconnections: usize,
    /// Number of recorded updates not flagged as reconnections.
    pub delta_only: usize,
    /// Running mean of the recorded delta magnitudes.
    pub avg_magnitude: f32,
    /// Largest delta magnitude recorded.
    pub max_magnitude: f32,
}
impl UpdateStats {
    /// Fold one applied delta into the statistics.
    ///
    /// `reconnected` selects which of the two counters is bumped; the mean
    /// and maximum are updated from the delta's L2 norm.
    pub fn record(&mut self, delta: &VectorDelta, reconnected: bool) {
        let magnitude = delta.l2_norm();
        self.total_applied += 1;

        let bucket = if reconnected {
            &mut self.reconnections
        } else {
            &mut self.delta_only
        };
        *bucket += 1;

        // Incremental mean: weight the previous mean by (n - 1) / n and add
        // the new sample's share.
        let count = self.total_applied as f32;
        let previous_weight = (count - 1.0) / count;
        self.avg_magnitude = self.avg_magnitude * previous_weight + magnitude / count;
        self.max_magnitude = self.max_magnitude.max(magnitude);
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Queuing one delta leaves one pending entry and bumps the lifetime counter.
    #[test]
    fn test_incremental_updater() {
        let mut updater = IncrementalUpdater::new(IncrementalConfig::default());
        updater.queue_update(
            "test".to_string(),
            VectorDelta::from_dense(vec![0.1, 0.2, 0.3]),
        );

        assert_eq!(updater.pending_count(), 1);
        assert_eq!(updater.total_updates(), 1);
    }

    /// Two deltas for the same ID collapse into a single pending entry.
    #[test]
    fn test_delta_composition() {
        let mut updater = IncrementalUpdater::new(IncrementalConfig::default());
        let first = VectorDelta::from_dense(vec![1.0, 0.0, 0.0]);
        let second = VectorDelta::from_dense(vec![0.0, 1.0, 0.0]);

        updater.queue_update("test".to_string(), first);
        updater.queue_update("test".to_string(), second);

        assert_eq!(updater.pending_count(), 1);
    }

    /// Tiny deltas stay delta-only; very large ones force a full reconnect.
    #[test]
    fn test_strategy_selection() {
        let config = IncrementalConfig {
            reconnect_threshold: 0.5,
            ..Default::default()
        };

        let tiny = VectorDelta::from_dense(vec![0.01, 0.01, 0.01]);
        assert_eq!(select_strategy(&tiny, &config), UpdateStrategy::DeltaOnly);

        let huge = VectorDelta::from_dense(vec![10.0, 10.0, 10.0]);
        assert_eq!(
            select_strategy(&huge, &config),
            UpdateStrategy::FullReconnect
        );
    }
}

View File

@@ -0,0 +1,774 @@
//! # RuVector Delta Index
//!
//! Delta-aware HNSW index with incremental updates and repair strategies.
//! Optimized for scenarios with frequent small changes to vector embeddings.
//!
//! ## Key Features
//!
//! - Incremental index updates without full rebuild
//! - Repair strategies for maintaining graph quality
//! - Recall quality monitoring
//! - Delta-based versioning
//!
//! ## Example
//!
//! ```rust,ignore
//! use ruvector_delta_index::{DeltaHnsw, DeltaHnswConfig, RepairStrategy};
//! use ruvector_delta_core::VectorDelta;
//!
//! let config = DeltaHnswConfig::default();
//! let mut index = DeltaHnsw::new(384, config);
//!
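//! // Insert vectors (mutating calls return a `Result`)
//! index.insert("vec1", vec![1.0; 384])?;
//!
//! // Apply delta update; the node is repaired once its accumulated change
//! // exceeds `repair_threshold`
//! let delta = VectorDelta::from_dense(vec![0.1; 384]);
//! index.apply_delta("vec1", &delta)?;
//!
//! // Search (uses repaired graph)
//! let query = vec![1.0; 384];
//! let results = index.search(&query, 10)?;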
//! ```
#![warn(missing_docs)]
#![warn(clippy::all)]
pub mod error;
pub mod incremental;
pub mod quality;
pub mod repair;
use std::collections::HashMap;
use std::sync::Arc;
use dashmap::DashMap;
use parking_lot::RwLock;
use priority_queue::PriorityQueue;
use rand::SeedableRng;
use rand_xorshift::XorShiftRng;
use smallvec::SmallVec;
use ruvector_delta_core::{Delta, DeltaStream, VectorDelta};
pub use error::{IndexError, Result};
pub use incremental::IncrementalUpdater;
pub use quality::{QualityMetrics, QualityMonitor, RecallEstimate};
pub use repair::{GraphRepairer, RepairConfig, RepairStrategy};
/// Construction, search and maintenance parameters for [`DeltaHnsw`].
#[derive(Debug, Clone)]
pub struct DeltaHnswConfig {
    /// Connection limit per node on layers above 0.
    pub m: usize,
    /// Connection limit per node on layer 0.
    pub m0: usize,
    /// Candidate budget used while linking a node into the graph.
    pub ef_construction: usize,
    /// Candidate budget used while answering a query.
    pub ef_search: usize,
    /// Maximum number of elements the index is sized for.
    pub max_elements: usize,
    /// Scale applied to `-ln(r)` when drawing a node's top layer.
    pub level_mult: f64,
    /// Accumulated delta magnitude above which a node is repaired.
    pub repair_threshold: f32,
    /// Maximum deltas before compaction.
    pub max_deltas: usize,
    /// Whether a quality monitor is attached when the index is created.
    pub auto_monitor: bool,
}

impl Default for DeltaHnswConfig {
    /// Defaults built around `m = 16`: `m0 = 2 * m` and `level_mult = 1 / ln(m)`.
    fn default() -> Self {
        let m: usize = 16;
        DeltaHnswConfig {
            m,
            m0: 2 * m,
            ef_construction: 200,
            ef_search: 100,
            max_elements: 1_000_000,
            level_mult: 1.0 / (m as f64).ln(),
            repair_threshold: 0.5,
            max_deltas: 100,
            auto_monitor: true,
        }
    }
}
/// One stored vector together with its per-layer adjacency lists.
#[derive(Clone)]
struct HnswNode {
    /// External ID of the vector.
    id: String,
    /// Current vector data.
    vector: Vec<f32>,
    /// Adjacency lists indexed by layer: `neighbors[l]` holds the node
    /// indices linked to this node on layer `l`.
    neighbors: Vec<SmallVec<[u32; 32]>>,
    /// Highest layer this node participates in.
    level: usize,
    /// Deltas applied to this node, as recorded by the index.
    delta_stream: DeltaStream<VectorDelta>,
}

impl HnswNode {
    /// Build an unconnected node with one empty adjacency list per layer
    /// `0..=level`.
    fn new(id: String, vector: Vec<f32>, level: usize) -> Self {
        // Read the length before `vector` is moved into the node, so the
        // owned buffer does not have to be cloned.
        let delta_stream = DeltaStream::for_vectors(vector.len());
        Self {
            id,
            vector,
            neighbors: vec![SmallVec::new(); level + 1],
            level,
            delta_stream,
        }
    }
}
/// Where graph traversal begins: a node index and the layer to start on.
#[derive(Clone)]
struct EntryPoint {
    /// Index of the entry node in the node table.
    node_idx: u32,
    /// Layer from which descent starts.
    level: usize,
}
/// HNSW index whose vectors can be updated in place through deltas.
pub struct DeltaHnsw {
    /// Parameters the index was created with.
    config: DeltaHnswConfig,
    /// Length every stored vector and every query must have.
    dimensions: usize,
    /// Node table; a node's position is its `u32` index. Deleted nodes stay
    /// in place with their contents cleared.
    nodes: Vec<RwLock<HnswNode>>,
    /// Maps an external ID to its position in `nodes`.
    id_to_idx: DashMap<String, u32>,
    /// Node and layer where searches begin; `None` until the first insert.
    entry_point: RwLock<Option<EntryPoint>>,
    /// Generator used to draw node levels.
    rng: RwLock<XorShiftRng>,
    /// Present only when `auto_monitor` was enabled at construction.
    quality_monitor: Option<QualityMonitor>,
    /// Graph repairer
    repairer: GraphRepairer,
}
impl DeltaHnsw {
/// Create an empty index for vectors of length `dimensions`.
///
/// A quality monitor is attached only when `config.auto_monitor` is set. The
/// repairer is created with a fixed lazy configuration and the level
/// generator with a fixed seed.
// NOTE(review): capacity for `max_elements` nodes is reserved up front; with
// the default of one million that is a large reservation — confirm intended.
pub fn new(dimensions: usize, config: DeltaHnswConfig) -> Self {
    let quality_monitor = config
        .auto_monitor
        .then(|| QualityMonitor::new(dimensions));

    let repairer = GraphRepairer::new(RepairConfig {
        strategy: RepairStrategy::Lazy,
        batch_size: 100,
        quality_threshold: 0.95,
    });

    let nodes = Vec::with_capacity(config.max_elements);

    Self {
        config,
        dimensions,
        nodes,
        id_to_idx: DashMap::new(),
        entry_point: RwLock::new(None),
        rng: RwLock::new(XorShiftRng::seed_from_u64(42)),
        quality_monitor,
        repairer,
    }
}
/// Borrow the configuration this index was created with.
pub fn config(&self) -> &DeltaHnswConfig {
    &self.config
}

/// Length that every stored vector and query must have.
pub fn dimensions(&self) -> usize {
    self.dimensions
}
/// Number of live elements in the index.
///
/// Deleted nodes keep their slot in the node table, so the count comes from
/// the ID map, which `delete` shrinks and `insert` grows.
pub fn len(&self) -> usize {
    self.id_to_idx.len()
}

/// Returns `true` when the index holds no live elements.
pub fn is_empty(&self) -> bool {
    self.id_to_idx.is_empty()
}
/// Insert `vector` under `id` and link it into the graph.
///
/// The new node becomes the entry point when the index was empty or when its
/// level is higher than the current entry point's.
///
/// # Errors
///
/// - [`IndexError::DimensionMismatch`] if `vector.len()` differs from the
///   index dimensionality.
/// - [`IndexError::DuplicateId`] if `id` is already present.
/// - [`IndexError::IndexFull`] if the index already holds
///   `config.max_elements` live elements, or the node table can no longer be
///   addressed with a `u32` index.
pub fn insert(&mut self, id: &str, vector: Vec<f32>) -> Result<()> {
    if vector.len() != self.dimensions {
        return Err(IndexError::DimensionMismatch {
            expected: self.dimensions,
            actual: vector.len(),
        });
    }
    if self.id_to_idx.contains_key(id) {
        return Err(IndexError::DuplicateId(id.to_string()));
    }
    if self.id_to_idx.len() >= self.config.max_elements {
        return Err(IndexError::IndexFull {
            max: self.config.max_elements,
        });
    }

    // Node indices are stored as u32 throughout the graph; refuse to wrap.
    let node_idx = u32::try_from(self.nodes.len()).map_err(|_| IndexError::IndexFull {
        max: u32::MAX as usize,
    })?;

    let level = self.random_level();

    // The node owns one copy of the vector; `connect_node` needs `&mut self`,
    // so it works from the caller's copy rather than borrowing from the table.
    let node = HnswNode::new(id.to_string(), vector.clone(), level);
    self.nodes.push(RwLock::new(node));
    self.id_to_idx.insert(id.to_string(), node_idx);

    self.connect_node(node_idx, &vector, level)?;

    let mut entry = self.entry_point.write();
    if entry.as_ref().map_or(true, |current| level > current.level) {
        *entry = Some(EntryPoint { node_idx, level });
    }
    Ok(())
}
/// Apply `delta` to the vector stored under `id`.
///
/// The delta is recorded in the node's delta stream; when the node's
/// accumulated change then exceeds `config.repair_threshold`, the node is
/// repaired before returning.
///
/// # Errors
///
/// - [`IndexError::NotFound`] if `id` is unknown.
/// - [`IndexError::DeltaError`] if the delta cannot be applied.
/// - Any error raised while repairing the node.
pub fn apply_delta(&mut self, id: &str, delta: &VectorDelta) -> Result<()> {
    let node_idx = match self.id_to_idx.get(id) {
        Some(found) => *found,
        None => return Err(IndexError::NotFound(id.to_string())),
    };

    // Keep the write guard inside this scope so it is released before the
    // repair below takes its own locks.
    let needs_repair = {
        let mut node = self.nodes[node_idx as usize].write();
        delta
            .apply(&mut node.vector)
            .map_err(|e| IndexError::DeltaError(format!("{:?}", e)))?;
        node.delta_stream.push(delta.clone());

        let accumulated = self.estimate_cumulative_change(&node);
        accumulated > self.config.repair_threshold
    };

    if needs_repair {
        self.repair_node(node_idx)?;
    }
    Ok(())
}
/// Apply a batch of `(id, delta)` updates, then repair every node whose
/// accumulated change exceeded `config.repair_threshold`.
///
/// Returns the indices of the repaired nodes, each listed once, in the order
/// they first crossed the threshold.
///
/// # Errors
///
/// - [`IndexError::NotFound`] if an ID is unknown.
/// - [`IndexError::DeltaError`] if a delta cannot be applied.
/// - Any error raised while repairing a node.
///
/// An error aborts the batch at that point: deltas applied before it remain
/// applied, and repairs that had not yet run are skipped.
pub fn apply_deltas_batch(&mut self, updates: &[(String, VectorDelta)]) -> Result<Vec<u32>> {
    use std::collections::HashSet;

    let mut repaired = Vec::new();
    // Guards against repairing the same node more than once when several
    // updates in the batch target it.
    let mut queued = HashSet::new();

    for (id, delta) in updates {
        let node_idx = *self
            .id_to_idx
            .get(id)
            .ok_or_else(|| IndexError::NotFound(id.clone()))?;

        let mut node = self.nodes[node_idx as usize].write();
        delta
            .apply(&mut node.vector)
            .map_err(|e| IndexError::DeltaError(format!("{:?}", e)))?;
        node.delta_stream.push(delta.clone());

        let change = self.estimate_cumulative_change(&node);
        if change > self.config.repair_threshold && queued.insert(node_idx) {
            repaired.push(node_idx);
        }
    }

    // All per-node guards were released when their loop iteration ended.
    for &node_idx in &repaired {
        self.repair_node(node_idx)?;
    }
    Ok(repaired)
}
/// Return up to `k` stored vectors nearest to `query`, closest first.
///
/// Descends greedily from the entry point down to layer 1, then runs a
/// bounded search on layer 0 with `config.ef_search` candidates. Returns an
/// empty list when the index has no entry point. Deleted nodes are never
/// returned. When a quality monitor is attached, the search is reported to it.
///
/// # Errors
///
/// [`IndexError::DimensionMismatch`] if `query.len()` differs from the index
/// dimensionality.
pub fn search(&self, query: &[f32], k: usize) -> Result<Vec<SearchResult>> {
    if query.len() != self.dimensions {
        return Err(IndexError::DimensionMismatch {
            expected: self.dimensions,
            actual: query.len(),
        });
    }

    let (entry_idx, top_level) = match self.entry_point.read().as_ref() {
        Some(entry) => (entry.node_idx, entry.level),
        None => return Ok(Vec::new()),
    };

    let mut current_node = entry_idx;
    for level in (1..=top_level).rev() {
        current_node = self.greedy_search(query, current_node, level);
    }

    let mut candidates = self.search_layer(query, current_node, 0, self.config.ef_search);
    // Order nearest-first here so the `take(k)` below keeps the k closest
    // candidates regardless of the order the layer search produced them in.
    candidates.sort_by(|a, b| a.1.total_cmp(&b.1));

    let results: Vec<SearchResult> = candidates
        .into_iter()
        .filter_map(|(idx, dist)| {
            let node = self.nodes[idx as usize].read();
            // A cleared vector marks a deleted node.
            if node.vector.is_empty() {
                return None;
            }
            Some(SearchResult {
                id: node.id.clone(),
                distance: dist,
                vector: Some(node.vector.clone()),
            })
        })
        .take(k)
        .collect();

    if let Some(monitor) = &self.quality_monitor {
        monitor.record_search(query, &results);
    }
    Ok(results)
}
/// Snapshot of the quality metrics, or `None` when no monitor is attached.
pub fn quality_metrics(&self) -> Option<QualityMetrics> {
    match &self.quality_monitor {
        Some(monitor) => Some(monitor.metrics()),
        None => None,
    }
}
/// Repair every node in the table and return how many were actually repaired.
///
/// # Errors
///
/// Stops at, and returns, the first error raised by a node repair.
pub fn force_repair(&mut self) -> Result<usize> {
    let total = self.nodes.len();
    let mut rebuilt = 0;
    for slot in 0..total {
        // `repair_node` reports `false` for slots it skipped.
        rebuilt += usize::from(self.repair_node(slot as u32)?);
    }
    Ok(rebuilt)
}
/// Delete a vector by ID
pub fn delete(&mut self, id: &str) -> Result<bool> {
let node_idx = match self.id_to_idx.remove(id) {
Some((_, idx)) => idx,
None => return Ok(false),
};
// Mark node as deleted (we don't physically remove to preserve indices)
let mut node = self.nodes[node_idx as usize].write();
node.id = String::new();
node.vector.clear();
node.neighbors.clear();
// Remove from other nodes' neighbor lists
for i in 0..self.nodes.len() {
if i == node_idx as usize {
continue;
}
let mut other = self.nodes[i].write();
for level_neighbors in &mut other.neighbors {
level_neighbors.retain(|n| *n != node_idx);
}
}
Ok(true)
}
/// Compact every node's delta stream and return the summed compaction counts.
///
/// A node whose compaction fails contributes `0` to the total.
pub fn compact_deltas(&mut self) -> usize {
    self.nodes
        .iter()
        .map(|slot| slot.write().delta_stream.compact().unwrap_or(0))
        .sum::<usize>()
}
// Private methods
/// Draw the top layer for a new node as `floor(-ln(r) * level_mult)` for a
/// random `r`.
fn random_level(&self) -> usize {
    let mut rng = self.rng.write();
    let r: f64 = rand::Rng::gen(&mut *rng);
    // A draw of exactly 0.0 would give ln(0) = -inf and therefore a level of
    // usize::MAX, which overflows `level + 1` when the node is built.
    let r = r.max(f64::MIN_POSITIVE);
    (-r.ln() * self.config.level_mult).floor() as usize
}
/// Link a freshly added node into the graph.
///
/// Descends greedily from the entry point to the node's top layer, then on
/// each layer from there down to 0 picks the closest candidates as
/// neighbours, stores them on the node, and adds the reverse links (pruning
/// any neighbour list that grows past its limit). Does nothing when the graph
/// has no entry point yet.
///
/// # Errors
///
/// Currently never fails.
fn connect_node(&mut self, node_idx: u32, vector: &[f32], level: usize) -> Result<()> {
    let entry = match self.entry_point.read().clone() {
        Some(entry) => entry,
        None => return Ok(()),
    };

    let mut current = entry.node_idx;

    // Layers above the node's own level are only used for navigation.
    for l in (level + 1..=entry.level).rev() {
        current = self.greedy_search(vector, current, l);
    }

    for l in (0..=level.min(entry.level)).rev() {
        let mut candidates = self.search_layer(vector, current, l, self.config.ef_construction);
        // Order nearest-first so truncating to `max_conn` keeps the closest.
        candidates.sort_by(|a, b| a.1.total_cmp(&b.1));

        let max_conn = if l == 0 {
            self.config.m0
        } else {
            self.config.m
        };

        let selected: Vec<u32> = candidates
            .into_iter()
            .take(max_conn)
            .map(|(idx, _)| idx)
            .collect();

        {
            let mut node = self.nodes[node_idx as usize].write();
            if l < node.neighbors.len() {
                node.neighbors[l] = selected.iter().copied().collect();
            }
        }

        for &neighbor_idx in &selected {
            let mut neighbor = self.nodes[neighbor_idx as usize].write();
            if l < neighbor.neighbors.len() {
                neighbor.neighbors[l].push(node_idx);

                if neighbor.neighbors[l].len() > max_conn {
                    // Read the vector through the guard already held: taking a
                    // second lock on this node from the same thread would
                    // block forever.
                    let anchor = neighbor.vector.clone();
                    self.prune_neighbors(&mut neighbor.neighbors[l], &anchor, max_conn);
                }
            }
        }

        // Continue the descent from the closest neighbour found on this layer.
        if let Some(&closest) = selected.first() {
            current = closest;
        }
    }

    Ok(())
}
/// Walk layer `level` from `start`, repeatedly moving to the neighbour
/// closest to `query`, until no neighbour is closer than the current node or
/// the current node has no list for that layer. Returns the node reached.
fn greedy_search(&self, query: &[f32], start: u32, level: usize) -> u32 {
    let mut best_idx = start;
    let mut best_dist = self.distance(query, best_idx);

    loop {
        let guard = self.nodes[best_idx as usize].read();
        let links = match guard.neighbors.get(level) {
            Some(links) => links,
            None => break,
        };

        // Closest neighbour that strictly beats the current best, if any.
        let step = links
            .iter()
            .fold(None, |found: Option<(u32, f32)>, &candidate| {
                let dist = self.distance(query, candidate);
                let bar = found.map_or(best_dist, |(_, d)| d);
                if dist < bar {
                    Some((candidate, dist))
                } else {
                    found
                }
            });

        match step {
            Some((idx, dist)) => {
                best_idx = idx;
                best_dist = dist;
            }
            None => break,
        }
    }

    best_idx
}
/// Bounded best-first search on one layer.
///
/// Starting from `start`, expands the closest unexpanded candidate until it
/// is farther from `query` than the worst result kept so far, retaining at
/// most `ef` results (never fewer than one).
///
/// Returns `(node index, distance)` pairs sorted by ascending distance, so
/// callers can truncate the list to keep the nearest entries.
fn search_layer(&self, query: &[f32], start: u32, level: usize, ef: usize) -> Vec<(u32, f32)> {
    use std::cmp::{Ordering, Reverse};
    use std::collections::{BinaryHeap, HashSet};

    /// A node index with its distance to the query, ordered by distance.
    #[derive(Clone, Copy)]
    struct Scored {
        dist: f32,
        idx: u32,
    }
    impl PartialEq for Scored {
        fn eq(&self, other: &Self) -> bool {
            self.cmp(other) == Ordering::Equal
        }
    }
    impl Eq for Scored {}
    impl PartialOrd for Scored {
        fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
            Some(self.cmp(other))
        }
    }
    impl Ord for Scored {
        fn cmp(&self, other: &Self) -> Ordering {
            // Total order, so a NaN distance cannot corrupt the heaps.
            self.dist.total_cmp(&other.dist)
        }
    }

    let seed = Scored {
        dist: self.distance(query, start),
        idx: start,
    };

    // `frontier` pops the closest unexpanded candidate; `best` exposes the
    // farthest retained result at its top.
    let mut frontier = BinaryHeap::new();
    let mut best = BinaryHeap::new();
    let mut visited = HashSet::new();
    frontier.push(Reverse(seed));
    best.push(seed);
    visited.insert(start);

    while let Some(Reverse(current)) = frontier.pop() {
        // Every remaining candidate is at least this far away, so once the
        // closest one cannot beat the worst result the search is done.
        if let Some(worst) = best.peek() {
            if current.dist > worst.dist {
                break;
            }
        }

        let node = self.nodes[current.idx as usize].read();
        let links = match node.neighbors.get(level) {
            Some(links) => links,
            None => continue,
        };

        for &neighbor in links.iter() {
            if !visited.insert(neighbor) {
                continue;
            }

            let dist = self.distance(query, neighbor);
            let admit = best.len() < ef || best.peek().map_or(true, |worst| dist < worst.dist);
            if admit {
                let scored = Scored {
                    dist,
                    idx: neighbor,
                };
                frontier.push(Reverse(scored));
                best.push(scored);
                if best.len() > ef {
                    best.pop();
                }
            }
        }
    }

    // Plain heap iteration yields arbitrary order; sort so the nearest come first.
    best.into_sorted_vec()
        .into_iter()
        .map(|c| (c.idx, c.dist))
        .collect()
}
/// Euclidean (L2) distance between `query` and the vector stored at
/// `node_idx`.
///
/// A node whose vector is empty yields `f32::MAX`. Components are paired
/// positionally; if the lengths differ, the extra components are ignored.
fn distance(&self, query: &[f32], node_idx: u32) -> f32 {
    let guard = self.nodes[node_idx as usize].read();
    let stored = &guard.vector;
    if stored.is_empty() {
        return f32::MAX;
    }

    let squared_sum = query
        .iter()
        .zip(stored.iter())
        .map(|(q, s)| (q - s).powi(2))
        .sum::<f32>();
    squared_sum.sqrt()
}
/// Shrink `neighbors` to the `max` entries closest to `node_vec`.
///
/// Leaves the list untouched when it already fits. Afterwards the list is
/// ordered by ascending distance.
fn prune_neighbors(&self, neighbors: &mut SmallVec<[u32; 32]>, node_vec: &[f32], max: usize) {
    if neighbors.len() <= max {
        return;
    }

    let mut ranked: Vec<(u32, f32)> = neighbors
        .iter()
        .map(|&n| (n, self.distance(node_vec, n)))
        .collect();
    // `total_cmp` gives a total order, so a NaN distance cannot panic the sort.
    ranked.sort_by(|a, b| a.1.total_cmp(&b.1));

    neighbors.clear();
    neighbors.extend(ranked.into_iter().take(max).map(|(idx, _)| idx));
}
/// Sum of the L2 norms of every delta currently in the node's delta stream.
///
/// This adds magnitudes, so deltas that cancel each other out still count.
fn estimate_cumulative_change(&self, node: &HnswNode) -> f32 {
    node.delta_stream
        .iter()
        .fold(0.0f32, |running, (_, delta)| running + delta.l2_norm())
}
/// Rebuild a node's links from its current vector and compact its delta
/// stream.
///
/// Returns `Ok(false)` without touching anything when the node's vector is
/// empty, `Ok(true)` otherwise. A failed compaction is ignored.
///
/// # Errors
///
/// Propagates any error from reconnecting the node.
fn repair_node(&mut self, node_idx: u32) -> Result<bool> {
    let slot = node_idx as usize;

    // Copy what reconnection needs, then let the read guard go before the
    // write locks taken below.
    let (vector, level) = {
        let node = self.nodes[slot].read();
        if node.vector.is_empty() {
            return Ok(false);
        }
        let snapshot = (node.vector.clone(), node.level);
        snapshot
    };

    self.reconnect_node(node_idx, &vector, level)?;

    self.nodes[slot].write().delta_stream.compact().ok();

    Ok(true)
}
/// Recompute a node's outgoing links from `vector`.
///
/// Descends greedily from the entry point to the node's top layer, then on
/// each layer from there down to 0 replaces the node's neighbour list with
/// the closest candidates found (excluding the node itself). Links that other
/// nodes hold to this node are left as they are. Does nothing when the graph
/// has no entry point.
///
/// # Errors
///
/// Currently never fails.
fn reconnect_node(&mut self, node_idx: u32, vector: &[f32], level: usize) -> Result<()> {
    let entry = match self.entry_point.read().clone() {
        Some(entry) => entry,
        None => return Ok(()),
    };

    let mut current = entry.node_idx;

    for l in (level + 1..=entry.level).rev() {
        current = self.greedy_search(vector, current, l);
    }

    for l in (0..=level.min(entry.level)).rev() {
        let mut candidates = self.search_layer(vector, current, l, self.config.ef_construction);
        // Order nearest-first so truncating to `max_conn` keeps the closest.
        candidates.sort_by(|a, b| a.1.total_cmp(&b.1));

        let max_conn = if l == 0 {
            self.config.m0
        } else {
            self.config.m
        };

        let selected: Vec<u32> = candidates
            .into_iter()
            .filter(|(idx, _)| *idx != node_idx)
            .take(max_conn)
            .map(|(idx, _)| idx)
            .collect();

        {
            let mut node = self.nodes[node_idx as usize].write();
            if l < node.neighbors.len() {
                node.neighbors[l] = selected.iter().copied().collect();
            }
        }

        // Continue the descent from the closest neighbour found on this layer.
        if let Some(&closest) = selected.first() {
            current = closest;
        }
    }

    Ok(())
}
}
/// One hit returned by a nearest-neighbour query.
#[derive(Debug, Clone)]
pub struct SearchResult {
    /// ID of the matching vector.
    pub id: String,
    /// Distance between the query and the matching vector.
    pub distance: f32,
    /// Copy of the matching vector, when included.
    pub vector: Option<Vec<f32>>,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a vector of `dim` random components.
    fn random_vector(dim: usize) -> Vec<f32> {
        use rand::Rng;
        let mut source = rand::thread_rng();
        let mut out = Vec::with_capacity(dim);
        for _ in 0..dim {
            out.push(source.gen());
        }
        out
    }

    /// Inserting 100 vectors makes them all countable and searchable.
    #[test]
    fn test_insert_and_search() {
        let mut index = DeltaHnsw::new(128, DeltaHnswConfig::default());

        for i in 0..100 {
            let id = format!("vec_{}", i);
            index.insert(&id, random_vector(128)).unwrap();
        }
        assert_eq!(index.len(), 100);

        let query = random_vector(128);
        let hits = index.search(&query, 10).unwrap();
        assert_eq!(hits.len(), 10);
    }

    /// A vector remains findable after a delta has been applied to it.
    #[test]
    fn test_delta_update() {
        let mut index = DeltaHnsw::new(4, DeltaHnswConfig::default());
        index.insert("test", vec![1.0, 2.0, 3.0, 4.0]).unwrap();

        let delta = VectorDelta::from_dense(vec![0.5, 0.0, -0.5, 0.0]);
        index.apply_delta("test", &delta).unwrap();

        let hits = index.search(&[1.5, 2.0, 2.5, 4.0], 1).unwrap();
        assert_eq!(hits.len(), 1);
        assert_eq!(hits[0].id, "test");
    }

    /// Deleted vectors stop appearing in results; unknown IDs report `false`.
    #[test]
    fn test_delete() {
        let mut index = DeltaHnsw::new(4, DeltaHnswConfig::default());
        index.insert("a", vec![1.0, 0.0, 0.0, 0.0]).unwrap();
        index.insert("b", vec![0.0, 1.0, 0.0, 0.0]).unwrap();
        index.insert("c", vec![0.0, 0.0, 1.0, 0.0]).unwrap();

        assert!(index.delete("b").unwrap());
        assert!(!index.delete("nonexistent").unwrap());

        let hits = index.search(&[0.0, 1.0, 0.0, 0.0], 10).unwrap();
        assert!(hits.iter().all(|hit| hit.id != "b"));
    }
}

View File

@@ -0,0 +1,341 @@
//! Quality monitoring for delta-aware HNSW
//!
//! Monitors recall quality and detects when repair is needed.
use std::collections::VecDeque;
use std::sync::atomic::{AtomicU64, Ordering};
use parking_lot::RwLock;
use crate::SearchResult;
/// Settings for the search-quality monitor.
#[derive(Debug, Clone)]
pub struct QualityConfig {
    /// Maximum number of search samples retained.
    pub window_size: usize,
    /// Number of random samples for estimation
    pub sample_count: usize,
    /// Estimated recall below which a repair is considered necessary.
    pub recall_threshold: f32,
    /// A sample is taken once every this many searches.
    pub check_interval: usize,
}

impl Default for QualityConfig {
    /// Defaults: window of `1000`, `100` samples, recall threshold `0.9`,
    /// one sample per `100` searches.
    fn default() -> Self {
        QualityConfig {
            check_interval: 100,
            recall_threshold: 0.9,
            sample_count: 100,
            window_size: 1000,
        }
    }
}
/// Snapshot of the figures tracked by the quality monitor.
#[derive(Debug, Clone, Default)]
pub struct QualityMetrics {
    /// Estimated recall.
    pub recall: f32,
    /// Average number of distance computations.
    // NOTE(review): the monitor in this file feeds the mean result distance
    // of sampled searches into this field — confirm which meaning is intended.
    pub avg_distance_comps: f32,
    /// Average search latency, in nanoseconds.
    pub avg_latency_ns: f64,
    /// Number of searches performed.
    pub total_searches: u64,
    /// Number of searches since the last repair.
    pub searches_since_repair: u64,
}
/// A recall estimate together with its confidence interval.
#[derive(Debug, Clone)]
pub struct RecallEstimate {
    /// Point estimate of recall.
    pub recall: f32,
    /// Lower end of the 95% confidence interval.
    pub lower_bound: f32,
    /// Upper end of the 95% confidence interval.
    pub upper_bound: f32,
    /// Number of samples available when the estimate was made.
    pub samples: usize,
}
/// One recorded search, kept for quality estimation.
struct SearchSample {
    /// Hash of the query vector (the vector itself is not stored).
    query_hash: u64,
    /// IDs of the returned results, in result order.
    result_ids: Vec<String>,
    /// Distances of the returned results, in result order.
    distances: Vec<f32>,
    /// Time the sample was taken, in nanoseconds.
    timestamp_ns: u64,
}
/// Samples searches and derives quality figures from them.
pub struct QualityMonitor {
    /// Sampling and threshold settings.
    config: QualityConfig,
    /// Most recent samples, oldest first, capped at `config.window_size`.
    samples: RwLock<VecDeque<SearchSample>>,
    /// Figures maintained from sampled searches.
    metrics: RwLock<QualityMetrics>,
    /// Number of searches recorded since creation or the last `clear`.
    search_count: AtomicU64,
    /// Value of `search_count` when `on_repair` was last called.
    searches_at_last_repair: AtomicU64,
    /// Dimensionality of the monitored index.
    dimensions: usize,
}

impl QualityMonitor {
    /// Create a monitor with the default [`QualityConfig`].
    pub fn new(dimensions: usize) -> Self {
        Self::with_config(dimensions, QualityConfig::default())
    }

    /// Create a monitor with a custom configuration.
    pub fn with_config(dimensions: usize, config: QualityConfig) -> Self {
        Self {
            config,
            samples: RwLock::new(VecDeque::with_capacity(1000)),
            metrics: RwLock::new(QualityMetrics::default()),
            search_count: AtomicU64::new(0),
            searches_at_last_repair: AtomicU64::new(0),
            dimensions,
        }
    }

    /// Record one search.
    ///
    /// Every call is counted. Only one search per `check_interval` is stored
    /// as a sample and folded into the running mean of result distances; an
    /// interval of `0` is treated as `1` (sample every search).
    pub fn record_search(&self, query: &[f32], results: &[SearchResult]) {
        let count = self.search_count.fetch_add(1, Ordering::Relaxed);

        // Clamp so a zero interval cannot cause a division by zero below.
        let interval = (self.config.check_interval as u64).max(1);
        if count % interval != 0 {
            return;
        }

        let sample = SearchSample {
            query_hash: hash_vector(query),
            result_ids: results.iter().map(|r| r.id.clone()).collect(),
            distances: results.iter().map(|r| r.distance).collect(),
            timestamp_ns: current_time_ns(),
        };

        {
            let mut samples = self.samples.write();
            samples.push_back(sample);
            while samples.len() > self.config.window_size {
                samples.pop_front();
            }
        }

        if !results.is_empty() {
            let avg_dist = results.iter().map(|r| r.distance).sum::<f32>() / results.len() as f32;
            // Samples are taken at counts 0, interval, 2 * interval, ..., so
            // this is the ordinal of the current sample. The mean has to be
            // weighted by the number of samples, not by the number of searches.
            let n = (count / interval + 1) as f32;
            let mut metrics = self.metrics.write();
            metrics.avg_distance_comps =
                metrics.avg_distance_comps * ((n - 1.0) / n) + avg_dist / n;
        }
    }

    /// Snapshot of the current metrics.
    ///
    /// `total_searches` and `searches_since_repair` are taken from the search
    /// counter, so they cover every recorded search, sampled or not.
    pub fn metrics(&self) -> QualityMetrics {
        let mut snapshot = self.metrics.read().clone();
        let total = self.search_count.load(Ordering::Relaxed);
        let baseline = self.searches_at_last_repair.load(Ordering::Relaxed);
        snapshot.total_searches = total;
        snapshot.searches_since_repair = total.saturating_sub(baseline);
        snapshot
    }

    /// Estimate recall from the consistency of consecutive samples.
    ///
    /// For each pair of consecutive samples whose query hashes are judged
    /// similar, the pair counts as consistent when their result IDs overlap.
    /// The estimate is the consistent fraction (1.0 when no pair qualifies),
    /// with a Wilson 95% interval. With no samples at all the result is
    /// recall 1.0 with bounds `[0, 1]`.
    pub fn estimate_recall(&self) -> RecallEstimate {
        let samples = self.samples.read();

        if samples.is_empty() {
            return RecallEstimate {
                recall: 1.0,
                lower_bound: 0.0,
                upper_bound: 1.0,
                samples: 0,
            };
        }

        let mut consistent = 0usize;
        let mut total = 0usize;
        for (prev, curr) in samples.iter().zip(samples.iter().skip(1)) {
            if similar_queries(prev.query_hash, curr.query_hash) {
                total += 1;
                if count_overlap(&prev.result_ids, &curr.result_ids) > 0 {
                    consistent += 1;
                }
            }
        }

        let recall = if total > 0 {
            consistent as f32 / total as f32
        } else {
            1.0
        };

        // Wilson score interval; n is clamped to 1 to avoid dividing by zero.
        let n = total.max(1) as f32;
        let z = 1.96; // 95% CI
        let center = (recall + z * z / (2.0 * n)) / (1.0 + z * z / n);
        let width =
            z * (recall * (1.0 - recall) / n + z * z / (4.0 * n * n)).sqrt() / (1.0 + z * z / n);

        RecallEstimate {
            recall,
            lower_bound: (center - width).max(0.0),
            upper_bound: (center + width).min(1.0),
            samples: samples.len(),
        }
    }

    /// Returns `true` when the estimated recall is below
    /// `config.recall_threshold`.
    pub fn needs_repair(&self) -> bool {
        self.estimate_recall().recall < self.config.recall_threshold
    }

    /// Restart the "searches since repair" count from now.
    pub fn on_repair(&self) {
        let now = self.search_count.load(Ordering::Relaxed);
        self.searches_at_last_repair.store(now, Ordering::Relaxed);
        self.metrics.write().searches_since_repair = 0;
    }

    /// Drop all samples and reset every counter and metric.
    pub fn clear(&self) {
        self.samples.write().clear();
        *self.metrics.write() = QualityMetrics::default();
        self.search_count.store(0, Ordering::Relaxed);
        self.searches_at_last_repair.store(0, Ordering::Relaxed);
    }
}
/// Hash the bit patterns of the first 16 components of `v` (fewer if `v` is
/// shorter); components beyond the 16th do not affect the result.
fn hash_vector(v: &[f32]) -> u64 {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    let mut state = DefaultHasher::new();
    v.iter()
        .take(16)
        .for_each(|component| component.to_bits().hash(&mut state));
    state.finish()
}
/// Treat two query hashes as similar when fewer than 32 of their 64 bits
/// differ.
// NOTE(review): if the inputs come from a general-purpose hash, the number of
// differing bits says little about how close the original queries were — confirm.
fn similar_queries(h1: u64, h2: u64) -> bool {
    let differing_bits = (h1 ^ h2).count_ones();
    !(differing_bits >= 32)
}
/// Number of entries of `a` (repeats included) that also occur in `b`.
fn count_overlap(a: &[String], b: &[String]) -> usize {
    let mut shared = 0;
    for id in a {
        if b.contains(id) {
            shared += 1;
        }
    }
    shared
}
/// Nanoseconds elapsed since the Unix epoch according to the system clock, or
/// `0` if the clock reads earlier than the epoch.
fn current_time_ns() -> u64 {
    use std::time::SystemTime;

    match SystemTime::now().duration_since(SystemTime::UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_nanos() as u64,
        Err(_) => 0,
    }
}
/// Summary statistics for a set of distances.
#[derive(Debug, Clone, Default)]
pub struct DistanceStats {
    /// Arithmetic mean.
    pub mean: f32,
    /// Population standard deviation.
    pub std_dev: f32,
    /// Smallest distance.
    pub min: f32,
    /// Largest distance.
    pub max: f32,
    /// Median: the middle value, or the mean of the two middle values when
    /// the number of distances is even.
    pub median: f32,
}

impl DistanceStats {
    /// Compute the statistics for `distances`.
    ///
    /// An empty slice yields the all-zero default. NaN inputs do not panic;
    /// they are ordered by `f32::total_cmp` and propagate into the mean and
    /// standard deviation.
    pub fn from_distances(distances: &[f32]) -> Self {
        if distances.is_empty() {
            return Self::default();
        }

        let n = distances.len() as f32;
        let mean = distances.iter().sum::<f32>() / n;
        let variance = distances.iter().map(|d| (d - mean).powi(2)).sum::<f32>() / n;

        let mut sorted = distances.to_vec();
        sorted.sort_by(|a, b| a.total_cmp(b));

        let mid = sorted.len() / 2;
        let median = if sorted.len() % 2 == 0 {
            // Even count: there is no single middle element.
            (sorted[mid - 1] + sorted[mid]) / 2.0
        } else {
            sorted[mid]
        };

        Self {
            mean,
            std_dev: variance.sqrt(),
            min: sorted[0],
            max: sorted[sorted.len() - 1],
            median,
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A fresh monitor has recorded no searches.
    #[test]
    fn test_quality_monitor_creation() {
        let fresh = QualityMonitor::new(128).metrics();
        assert_eq!(fresh.total_searches, 0);
    }

    /// With no samples the estimate is a recall of 1.0 over zero samples.
    #[test]
    fn test_recall_estimation() {
        let estimate = QualityMonitor::new(128).estimate_recall();

        assert!((estimate.recall - 1.0).abs() < 1e-6);
        assert_eq!(estimate.samples, 0);
    }

    /// Equal vectors hash equally; different vectors hash differently.
    #[test]
    fn test_hash_vector() {
        let base = vec![1.0f32, 2.0, 3.0, 4.0];
        let same = base.clone();
        let other = vec![5.0f32, 6.0, 7.0, 8.0];

        assert_eq!(hash_vector(&base), hash_vector(&same));
        assert_ne!(hash_vector(&base), hash_vector(&other));
    }

    /// Mean, extremes and median of 1..=5.
    #[test]
    fn test_distance_stats() {
        let stats = DistanceStats::from_distances(&[1.0f32, 2.0, 3.0, 4.0, 5.0]);

        let close = |got: f32, want: f32| (got - want).abs() < 1e-6;
        assert!(close(stats.mean, 3.0));
        assert!(close(stats.min, 1.0));
        assert!(close(stats.max, 5.0));
        assert!(close(stats.median, 3.0));
    }
}

View File

@@ -0,0 +1,213 @@
//! Graph repair strategies
//!
//! Provides strategies for maintaining HNSW graph quality after delta updates.
use std::collections::HashSet;
/// Policy describing when graph repair should happen after delta updates.
///
/// The chosen variant is stored in `RepairConfig::strategy` and reported by
/// `GraphRepairer::strategy`; `RepairConfig::default()` selects `Lazy`.
///
/// NOTE(review): nothing in this module branches on the variant — presumably
/// the index that owns the repairer does; confirm against the caller.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RepairStrategy {
/// No automatic repair
None,
/// Repair only when explicitly triggered
Lazy,
/// Immediate repair on every update
Eager,
/// Batch repair at intervals
/// (presumably sized by `RepairConfig::batch_size` — TODO confirm)
Batched,
/// Adaptive based on quality monitoring
/// (presumably driven by `RepairConfig::quality_threshold` — TODO confirm)
Adaptive,
}
/// Settings that control how graph repair is scheduled.
#[derive(Debug, Clone)]
pub struct RepairConfig {
    /// Which repair strategy to apply
    pub strategy: RepairStrategy,
    /// Number of pending nodes that makes up one batch for batched repair
    pub batch_size: usize,
    /// Quality level below which a repair is triggered
    pub quality_threshold: f32,
}

impl Default for RepairConfig {
    /// Lazy repair, batches of 100 nodes, quality threshold of 0.95.
    fn default() -> Self {
        let strategy = RepairStrategy::Lazy;
        let batch_size = 100;
        let quality_threshold = 0.95;

        Self {
            strategy,
            batch_size,
            quality_threshold,
        }
    }
}
/// Bookkeeping for graph repair: which nodes are waiting for repair and how
/// many repairs have been recorded so far.
///
/// This type does not touch the graph itself; it tracks pending node indices
/// and counters that the caller can act on.
#[derive(Debug)]
pub struct GraphRepairer {
    /// Strategy and thresholds supplied at construction
    config: RepairConfig,
    /// Indices of nodes marked for repair; a set, so marking a node twice
    /// keeps a single entry
    pending_repairs: HashSet<u32>,
    /// Running total of repairs reported through `record_repair`
    repair_count: usize,
}

impl GraphRepairer {
    /// Create a new graph repairer with no pending nodes and a zero repair count.
    pub fn new(config: RepairConfig) -> Self {
        Self {
            config,
            pending_repairs: HashSet::new(),
            repair_count: 0,
        }
    }

    /// Mark a node as needing repair.
    ///
    /// Marking a node that is already pending has no further effect.
    pub fn mark_for_repair(&mut self, node_idx: u32) {
        self.pending_repairs.insert(node_idx);
    }

    /// Check if batch repair is needed.
    ///
    /// Returns `true` once at least `batch_size` distinct nodes are pending.
    /// The configured strategy is not consulted here.
    pub fn needs_batch_repair(&self) -> bool {
        // Without the emptiness check a `batch_size` of 0 would request a
        // repair even though there is nothing to repair.
        !self.pending_repairs.is_empty()
            && self.pending_repairs.len() >= self.config.batch_size
    }

    /// Get nodes pending repair.
    ///
    /// The order of the returned indices is unspecified because the pending
    /// set is a `HashSet`.
    pub fn pending_nodes(&self) -> Vec<u32> {
        self.pending_repairs.iter().copied().collect()
    }

    /// Clear pending repairs without changing the repair count.
    pub fn clear_pending(&mut self) {
        self.pending_repairs.clear();
    }

    /// Record completed repair by adding `count` to the running total.
    pub fn record_repair(&mut self, count: usize) {
        self.repair_count += count;
    }

    /// Get total repairs performed, as accumulated by `record_repair`.
    pub fn total_repairs(&self) -> usize {
        self.repair_count
    }

    /// Get repair strategy this repairer was configured with.
    pub fn strategy(&self) -> RepairStrategy {
        self.config.strategy
    }
}
/// Result of a repair operation
///
/// Plain data summary: how much of the graph was touched and the quality
/// measured on either side of the repair.
///
/// NOTE(review): nothing in this module constructs this type; presumably the
/// code that rewires the graph fills it in — confirm against the caller.
#[derive(Debug, Clone)]
pub struct RepairResult {
/// Number of nodes repaired
pub nodes_repaired: usize,
/// Number of edges updated
pub edges_updated: usize,
/// Quality before repair
/// (presumably on the same scale as `RepairConfig::quality_threshold` — TODO confirm)
pub quality_before: f32,
/// Quality after repair
/// (presumably the same measure as `quality_before` — TODO confirm)
pub quality_after: f32,
}
/// Repair scope
///
/// Selects how much of the graph a repair pass should cover, from a single
/// node up to the whole graph.
///
/// NOTE(review): nothing in this module consumes this type; the `u32` values
/// are presumably node indices like those passed to
/// `GraphRepairer::mark_for_repair` — confirm against the caller.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RepairScope {
/// Single node
Node(u32),
/// Neighborhood of a node (N-hop): `center` is the node the neighborhood is
/// built around and `hops` is N
Neighborhood { center: u32, hops: usize },
/// Specific level of the graph
Level(usize),
/// Full graph
Full,
}
/// Local repair operations for a single node.
///
/// Collects the edge removals and additions planned for one node so they can
/// be inspected or applied together.
#[derive(Debug, Clone)]
pub struct LocalRepair {
    /// Node being repaired
    pub node_idx: u32,
    /// Old neighbors to remove, as `(level, neighbor_idx)` pairs
    pub remove: Vec<(usize, u32)>,
    /// New neighbors to add (presumably the same `(level, neighbor_idx)`
    /// layout as `remove` — TODO confirm)
    pub add: Vec<(usize, u32)>,
}

impl LocalRepair {
    /// Create a new local repair for `node_idx` with no planned changes.
    pub fn new(node_idx: u32) -> Self {
        Self {
            node_idx,
            remove: Vec::new(),
            add: Vec::new(),
        }
    }

    /// Check if repair is empty, i.e. has neither removals nor additions.
    pub fn is_empty(&self) -> bool {
        self.remove.is_empty() && self.add.is_empty()
    }

    /// Get total changes: the number of removals plus the number of additions.
    pub fn change_count(&self) -> usize {
        self.remove.len() + self.add.len()
    }
}
/// Determines repair priority for nodes.
///
/// Computed as `delta_magnitude * (1 + ln(max(neighbor_count, 1)))`, so the
/// priority grows with the size of the delta and, logarithmically, with the
/// number of neighbors.
///
/// A node with no neighbors is weighted like a node with one neighbor, so
/// its priority equals `delta_magnitude`.
pub fn repair_priority(delta_magnitude: f32, neighbor_count: usize) -> f32 {
    // `ln(0)` is negative infinity, which would give an isolated node a
    // priority of -inf (or NaN when the delta is 0). Clamping to 1 keeps the
    // result finite and leaves every `neighbor_count >= 1` result unchanged.
    let neighbor_weight = 1.0 + (neighbor_count.max(1) as f32).ln();
    delta_magnitude * neighbor_weight
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A default-configured repairer is lazy and has performed no repairs.
    #[test]
    fn test_repairer_creation() {
        let repairer = GraphRepairer::new(RepairConfig::default());

        assert_eq!(repairer.strategy(), RepairStrategy::Lazy);
        assert_eq!(repairer.total_repairs(), 0);
    }

    /// Batch repair is requested only once enough nodes are pending.
    #[test]
    fn test_mark_for_repair() {
        let config = RepairConfig {
            batch_size: 5,
            ..Default::default()
        };
        let mut repairer = GraphRepairer::new(config);

        (0..3).for_each(|node| repairer.mark_for_repair(node));
        assert!(!repairer.needs_batch_repair());
        assert_eq!(repairer.pending_nodes().len(), 3);

        (3..10).for_each(|node| repairer.mark_for_repair(node));
        assert!(repairer.needs_batch_repair());
    }

    /// Additions and removals both count towards the change total.
    #[test]
    fn test_local_repair() {
        let mut plan = LocalRepair::new(0);
        assert!(plan.is_empty());

        plan.add.push((0, 1));
        plan.remove.push((0, 2));

        assert!(!plan.is_empty());
        assert_eq!(plan.change_count(), 2);
    }

    /// Priority rises with both delta magnitude and neighbor count.
    #[test]
    fn test_repair_priority() {
        let larger_delta = repair_priority(1.0, 10);
        let baseline = repair_priority(0.5, 10);
        let more_neighbors = repair_priority(0.5, 20);

        // Higher delta = higher priority
        assert!(larger_delta > baseline);
        // More neighbors = higher priority
        assert!(more_neighbors > baseline);
    }
}