Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'
This commit is contained in:
57
vendor/ruvector/crates/ruvector-delta-index/Cargo.toml
vendored
Normal file
57
vendor/ruvector/crates/ruvector-delta-index/Cargo.toml
vendored
Normal file
@@ -0,0 +1,57 @@
[package]
name = "ruvector-delta-index"
version = "0.1.0"
edition = "2021"
description = "Delta-aware HNSW index with incremental updates and repair strategies"
license = "MIT OR Apache-2.0"
repository = "https://github.com/ruvnet/ruvector"
keywords = ["hnsw", "delta", "index", "incremental", "vector"]
categories = ["data-structures", "algorithms"]

[features]
default = ["parallel"]
parallel = ["rayon"]
simd = ["simsimd"]
persistence = ["bincode"]

[dependencies]
# Core delta library
ruvector-delta-core = { path = "../ruvector-delta-core" }

# Error handling
thiserror = "2.0"

# Data structures
parking_lot = "0.12"
dashmap = "6.0"
smallvec = { version = "1.13", features = ["union"] }

# Priority queue for HNSW
priority-queue = "2.0"

# Random number generation
rand = "0.8"
rand_xorshift = "0.3"

# Optional parallelism
rayon = { version = "1.10", optional = true }

# Optional SIMD
simsimd = { version = "5.9", optional = true }

# Optional serialization
bincode = { version = "2.0.0-rc.3", optional = true }
serde = { version = "1.0", features = ["derive"], optional = true }

[dev-dependencies]
criterion = "0.6"
proptest = "1.4"

# Benchmarks will be added later
# [[bench]]
# name = "incremental_update"
# harness = false
#
# [[bench]]
# name = "repair_strategies"
# harness = false
65
vendor/ruvector/crates/ruvector-delta-index/src/error.rs
vendored
Normal file
65
vendor/ruvector/crates/ruvector-delta-index/src/error.rs
vendored
Normal file
@@ -0,0 +1,65 @@
|
||||
//! Error types for delta index operations

use std::fmt;

/// Result type for index operations
pub type Result<T> = std::result::Result<T, IndexError>;

/// Errors that can occur during index operations
#[derive(Debug, Clone)]
pub enum IndexError {
    /// A vector's length did not match the index's configured dimensionality
    DimensionMismatch {
        /// Expected dimension
        expected: usize,
        /// Actual dimension
        actual: usize,
    },

    /// An insert used an ID that is already present
    DuplicateId(String),

    /// The requested ID is not in the index
    NotFound(String),

    /// A delta could not be applied
    DeltaError(String),

    /// The index has reached its configured capacity
    IndexFull {
        /// Maximum capacity
        max: usize,
    },

    /// The supplied configuration is invalid
    InvalidConfig(String),

    /// The neighbor graph was found to be inconsistent
    GraphCorruption(String),

    /// Persistence (de)serialization failed
    SerializationError(String),
}

impl fmt::Display for IndexError {
    fn fmt(&self, out: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::DimensionMismatch { expected, actual } => {
                write!(out, "Dimension mismatch: expected {}, got {}", expected, actual)
            }
            Self::DuplicateId(id) => write!(out, "Duplicate ID: {}", id),
            Self::NotFound(id) => write!(out, "ID not found: {}", id),
            Self::DeltaError(msg) => write!(out, "Delta error: {}", msg),
            Self::IndexFull { max } => write!(out, "Index full (max {})", max),
            Self::InvalidConfig(msg) => write!(out, "Invalid config: {}", msg),
            Self::GraphCorruption(msg) => write!(out, "Graph corruption: {}", msg),
            Self::SerializationError(msg) => write!(out, "Serialization error: {}", msg),
        }
    }
}

impl std::error::Error for IndexError {}
|
||||
237
vendor/ruvector/crates/ruvector-delta-index/src/incremental.rs
vendored
Normal file
237
vendor/ruvector/crates/ruvector-delta-index/src/incremental.rs
vendored
Normal file
@@ -0,0 +1,237 @@
|
||||
//! Incremental index updates
|
||||
//!
|
||||
//! Provides efficient strategies for updating the index without full rebuild.
|
||||
|
||||
use std::collections::HashMap;
|
||||
|
||||
use ruvector_delta_core::{Delta, VectorDelta};
|
||||
|
||||
use crate::{DeltaHnsw, Result, SearchResult};
|
||||
|
||||
/// Configuration for incremental updates
#[derive(Debug, Clone)]
pub struct IncrementalConfig {
    /// Minimum delta magnitude to trigger reconnection
    pub reconnect_threshold: f32,
    /// Maximum pending updates before batch processing
    pub batch_threshold: usize,
    /// Whether to use lazy reconnection
    pub lazy_reconnect: bool,
}

impl Default for IncrementalConfig {
    fn default() -> Self {
        // Defaults favor throughput: buffer up to 100 pending deltas and
        // defer graph reconnection whenever possible.
        IncrementalConfig {
            reconnect_threshold: 0.1,
            batch_threshold: 100,
            lazy_reconnect: true,
        }
    }
}
|
||||
|
||||
/// Handles incremental updates to the HNSW index
|
||||
pub struct IncrementalUpdater {
|
||||
config: IncrementalConfig,
|
||||
pending_updates: HashMap<String, VectorDelta>,
|
||||
total_updates: usize,
|
||||
}
|
||||
|
||||
impl IncrementalUpdater {
|
||||
/// Create a new incremental updater
|
||||
pub fn new(config: IncrementalConfig) -> Self {
|
||||
Self {
|
||||
config,
|
||||
pending_updates: HashMap::new(),
|
||||
total_updates: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Queue an update for batch processing
|
||||
pub fn queue_update(&mut self, id: String, delta: VectorDelta) {
|
||||
self.pending_updates
|
||||
.entry(id)
|
||||
.and_modify(|existing| {
|
||||
*existing = existing.clone().compose(delta.clone());
|
||||
})
|
||||
.or_insert(delta);
|
||||
|
||||
self.total_updates += 1;
|
||||
}
|
||||
|
||||
/// Check if batch processing is needed
|
||||
pub fn needs_flush(&self) -> bool {
|
||||
self.pending_updates.len() >= self.config.batch_threshold
|
||||
}
|
||||
|
||||
/// Flush pending updates to the index
|
||||
pub fn flush(&mut self, index: &mut DeltaHnsw) -> Result<FlushResult> {
|
||||
let mut applied = 0;
|
||||
let mut reconnected = 0;
|
||||
let mut errors = Vec::new();
|
||||
|
||||
let updates: Vec<_> = self.pending_updates.drain().collect();
|
||||
|
||||
for (id, delta) in updates {
|
||||
match index.apply_delta(&id, &delta) {
|
||||
Ok(()) => {
|
||||
applied += 1;
|
||||
|
||||
// Check if reconnection is needed
|
||||
if delta.l2_norm() > self.config.reconnect_threshold {
|
||||
reconnected += 1;
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
errors.push((id, e.to_string()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(FlushResult {
|
||||
applied,
|
||||
reconnected,
|
||||
errors,
|
||||
})
|
||||
}
|
||||
|
||||
/// Get number of pending updates
|
||||
pub fn pending_count(&self) -> usize {
|
||||
self.pending_updates.len()
|
||||
}
|
||||
|
||||
/// Get total updates processed
|
||||
pub fn total_updates(&self) -> usize {
|
||||
self.total_updates
|
||||
}
|
||||
|
||||
/// Clear pending updates without applying
|
||||
pub fn clear_pending(&mut self) {
|
||||
self.pending_updates.clear();
|
||||
}
|
||||
}
|
||||
|
||||
/// Result of flushing updates
#[derive(Debug)]
pub struct FlushResult {
    /// How many updates were applied successfully
    pub applied: usize,
    /// How many nodes were counted as reconnected
    pub reconnected: usize,
    /// `(id, message)` pairs for updates that failed
    pub errors: Vec<(String, String)>,
}
|
||||
|
||||
/// Strategies for handling vector updates
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum UpdateStrategy {
    /// Apply the delta without touching the graph
    DeltaOnly,
    /// Apply the delta and refresh the node's immediate neighborhood
    LocalRepair,
    /// Apply the delta and rebuild the node's connections from scratch
    FullReconnect,
    /// Queue the update for later batch processing
    Deferred,
}
|
||||
|
||||
/// Determine the best update strategy based on delta magnitude
|
||||
pub fn select_strategy(delta: &VectorDelta, config: &IncrementalConfig) -> UpdateStrategy {
|
||||
let magnitude = delta.l2_norm();
|
||||
|
||||
if magnitude < config.reconnect_threshold * 0.1 {
|
||||
UpdateStrategy::DeltaOnly
|
||||
} else if magnitude < config.reconnect_threshold {
|
||||
if config.lazy_reconnect {
|
||||
UpdateStrategy::DeltaOnly
|
||||
} else {
|
||||
UpdateStrategy::LocalRepair
|
||||
}
|
||||
} else if magnitude < config.reconnect_threshold * 5.0 {
|
||||
UpdateStrategy::LocalRepair
|
||||
} else {
|
||||
UpdateStrategy::FullReconnect
|
||||
}
|
||||
}
|
||||
|
||||
/// Statistics about incremental updates
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub struct UpdateStats {
|
||||
/// Total updates applied
|
||||
pub total_applied: usize,
|
||||
/// Updates that triggered reconnection
|
||||
pub reconnections: usize,
|
||||
/// Updates that were delta-only
|
||||
pub delta_only: usize,
|
||||
/// Average delta magnitude
|
||||
pub avg_magnitude: f32,
|
||||
/// Maximum delta magnitude
|
||||
pub max_magnitude: f32,
|
||||
}
|
||||
|
||||
impl UpdateStats {
|
||||
/// Record an update
|
||||
pub fn record(&mut self, delta: &VectorDelta, reconnected: bool) {
|
||||
let mag = delta.l2_norm();
|
||||
|
||||
self.total_applied += 1;
|
||||
if reconnected {
|
||||
self.reconnections += 1;
|
||||
} else {
|
||||
self.delta_only += 1;
|
||||
}
|
||||
|
||||
// Update running average
|
||||
let n = self.total_applied as f32;
|
||||
self.avg_magnitude = self.avg_magnitude * ((n - 1.0) / n) + mag / n;
|
||||
self.max_magnitude = self.max_magnitude.max(mag);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_incremental_updater() {
        let mut updater = IncrementalUpdater::new(IncrementalConfig::default());

        updater.queue_update("test".to_string(), VectorDelta::from_dense(vec![0.1, 0.2, 0.3]));

        assert_eq!(updater.pending_count(), 1);
        assert_eq!(updater.total_updates(), 1);
    }

    #[test]
    fn test_delta_composition() {
        let mut updater = IncrementalUpdater::new(IncrementalConfig::default());

        updater.queue_update("test".to_string(), VectorDelta::from_dense(vec![1.0, 0.0, 0.0]));
        updater.queue_update("test".to_string(), VectorDelta::from_dense(vec![0.0, 1.0, 0.0]));

        // Both updates target the same id, so they compose into one entry.
        assert_eq!(updater.pending_count(), 1);
    }

    #[test]
    fn test_strategy_selection() {
        let config = IncrementalConfig {
            reconnect_threshold: 0.5,
            ..Default::default()
        };

        // Tiny magnitude falls in the delta-only band.
        let small = VectorDelta::from_dense(vec![0.01, 0.01, 0.01]);
        assert_eq!(select_strategy(&small, &config), UpdateStrategy::DeltaOnly);

        // Huge magnitude forces a full reconnect.
        let large = VectorDelta::from_dense(vec![10.0, 10.0, 10.0]);
        assert_eq!(select_strategy(&large, &config), UpdateStrategy::FullReconnect);
    }
}
|
||||
774
vendor/ruvector/crates/ruvector-delta-index/src/lib.rs
vendored
Normal file
774
vendor/ruvector/crates/ruvector-delta-index/src/lib.rs
vendored
Normal file
@@ -0,0 +1,774 @@
|
||||
//! # RuVector Delta Index
|
||||
//!
|
||||
//! Delta-aware HNSW index with incremental updates and repair strategies.
|
||||
//! Optimized for scenarios with frequent small changes to vector embeddings.
|
||||
//!
|
||||
//! ## Key Features
|
||||
//!
|
||||
//! - Incremental index updates without full rebuild
|
||||
//! - Repair strategies for maintaining graph quality
|
||||
//! - Recall quality monitoring
|
||||
//! - Delta-based versioning
|
||||
//!
|
||||
//! ## Example
|
||||
//!
|
||||
//! ```rust,ignore
|
||||
//! use ruvector_delta_index::{DeltaHnsw, DeltaHnswConfig, RepairStrategy};
|
||||
//! use ruvector_delta_core::VectorDelta;
|
||||
//!
|
||||
//! let config = DeltaHnswConfig::default();
|
||||
//! let mut index = DeltaHnsw::new(384, config);
|
||||
//!
|
||||
//! // Insert vectors
|
||||
//! index.insert("vec1", vec![1.0; 384]);
|
||||
//!
|
||||
//! // Apply delta update
|
||||
//! let delta = VectorDelta::from_dense(vec![0.1; 384]);
|
||||
//! index.apply_delta("vec1", &delta);
|
||||
//!
|
||||
//! // Search (uses repaired graph)
|
||||
//! let results = index.search(&query, 10);
|
||||
//! ```
|
||||
|
||||
#![warn(missing_docs)]
|
||||
#![warn(clippy::all)]
|
||||
|
||||
pub mod error;
|
||||
pub mod incremental;
|
||||
pub mod quality;
|
||||
pub mod repair;
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
|
||||
use dashmap::DashMap;
|
||||
use parking_lot::RwLock;
|
||||
use priority_queue::PriorityQueue;
|
||||
use rand::SeedableRng;
|
||||
use rand_xorshift::XorShiftRng;
|
||||
use smallvec::SmallVec;
|
||||
|
||||
use ruvector_delta_core::{Delta, DeltaStream, VectorDelta};
|
||||
|
||||
pub use error::{IndexError, Result};
|
||||
pub use incremental::IncrementalUpdater;
|
||||
pub use quality::{QualityMetrics, QualityMonitor, RecallEstimate};
|
||||
pub use repair::{GraphRepairer, RepairConfig, RepairStrategy};
|
||||
|
||||
/// Configuration for Delta HNSW index
#[derive(Debug, Clone)]
pub struct DeltaHnswConfig {
    /// Number of connections per node
    pub m: usize,
    /// Maximum connections per node at layer 0
    pub m0: usize,
    /// Construction ef (neighbor search budget)
    pub ef_construction: usize,
    /// Search ef (query-time search budget)
    pub ef_search: usize,
    /// Maximum elements
    pub max_elements: usize,
    /// Level multiplier for layer assignment
    pub level_mult: f64,
    /// Delta threshold for triggering repair
    pub repair_threshold: f32,
    /// Maximum deltas before compaction
    pub max_deltas: usize,
    /// Enable automatic quality monitoring
    pub auto_monitor: bool,
}

impl Default for DeltaHnswConfig {
    fn default() -> Self {
        DeltaHnswConfig {
            m: 16,
            m0: 32,
            ef_construction: 200,
            ef_search: 100,
            max_elements: 1_000_000,
            // Standard HNSW heuristic: 1 / ln(M).
            level_mult: 1.0 / (16.0_f64).ln(),
            repair_threshold: 0.5,
            max_deltas: 100,
            auto_monitor: true,
        }
    }
}
|
||||
|
||||
/// A node in the HNSW graph
|
||||
#[derive(Clone)]
|
||||
struct HnswNode {
|
||||
/// Vector ID
|
||||
id: String,
|
||||
/// Vector data
|
||||
vector: Vec<f32>,
|
||||
/// Neighbors at each level (level -> neighbors)
|
||||
neighbors: Vec<SmallVec<[u32; 32]>>,
|
||||
/// Maximum level for this node
|
||||
level: usize,
|
||||
/// Delta stream for this node
|
||||
delta_stream: DeltaStream<VectorDelta>,
|
||||
}
|
||||
|
||||
impl HnswNode {
|
||||
fn new(id: String, vector: Vec<f32>, level: usize) -> Self {
|
||||
Self {
|
||||
id,
|
||||
vector: vector.clone(),
|
||||
neighbors: vec![SmallVec::new(); level + 1],
|
||||
level,
|
||||
delta_stream: DeltaStream::for_vectors(vector.len()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Entry point for the HNSW graph: the node to start descending from,
/// together with the topmost level it participates in.
#[derive(Clone)]
struct EntryPoint {
    node_idx: u32,
    level: usize,
}
|
||||
|
||||
/// Delta-aware HNSW index
|
||||
pub struct DeltaHnsw {
|
||||
/// Configuration
|
||||
config: DeltaHnswConfig,
|
||||
/// Vector dimensions
|
||||
dimensions: usize,
|
||||
/// All nodes
|
||||
nodes: Vec<RwLock<HnswNode>>,
|
||||
/// ID to node index mapping
|
||||
id_to_idx: DashMap<String, u32>,
|
||||
/// Entry point
|
||||
entry_point: RwLock<Option<EntryPoint>>,
|
||||
/// Random number generator for level assignment
|
||||
rng: RwLock<XorShiftRng>,
|
||||
/// Quality monitor
|
||||
quality_monitor: Option<QualityMonitor>,
|
||||
/// Graph repairer
|
||||
repairer: GraphRepairer,
|
||||
}
|
||||
|
||||
impl DeltaHnsw {
    /// Create a new Delta HNSW index for vectors of length `dimensions`.
    pub fn new(dimensions: usize, config: DeltaHnswConfig) -> Self {
        // Quality monitoring is optional; it adds per-search bookkeeping.
        let quality_monitor = if config.auto_monitor {
            Some(QualityMonitor::new(dimensions))
        } else {
            None
        };

        let repairer = GraphRepairer::new(RepairConfig {
            strategy: RepairStrategy::Lazy,
            batch_size: 100,
            quality_threshold: 0.95,
        });

        // Read the capacity before moving `config` into the struct; the
        // original code cloned the whole config just for this.
        let capacity = config.max_elements;

        Self {
            config,
            dimensions,
            nodes: Vec::with_capacity(capacity),
            id_to_idx: DashMap::new(),
            entry_point: RwLock::new(None),
            // Fixed seed keeps level assignment deterministic across runs.
            rng: RwLock::new(XorShiftRng::seed_from_u64(42)),
            quality_monitor,
            repairer,
        }
    }

    /// Get configuration
    pub fn config(&self) -> &DeltaHnswConfig {
        &self.config
    }

    /// Get dimensions
    pub fn dimensions(&self) -> usize {
        self.dimensions
    }

    /// Get number of elements (including tombstoned slots)
    pub fn len(&self) -> usize {
        self.nodes.len()
    }

    /// Check if empty
    pub fn is_empty(&self) -> bool {
        self.nodes.is_empty()
    }
|
||||
|
||||
/// Insert a new vector
|
||||
pub fn insert(&mut self, id: &str, vector: Vec<f32>) -> Result<()> {
|
||||
if vector.len() != self.dimensions {
|
||||
return Err(IndexError::DimensionMismatch {
|
||||
expected: self.dimensions,
|
||||
actual: vector.len(),
|
||||
});
|
||||
}
|
||||
|
||||
if self.id_to_idx.contains_key(id) {
|
||||
return Err(IndexError::DuplicateId(id.to_string()));
|
||||
}
|
||||
|
||||
// Assign level
|
||||
let level = self.random_level();
|
||||
let node_idx = self.nodes.len() as u32;
|
||||
|
||||
// Create node
|
||||
let node = HnswNode::new(id.to_string(), vector.clone(), level);
|
||||
self.nodes.push(RwLock::new(node));
|
||||
self.id_to_idx.insert(id.to_string(), node_idx);
|
||||
|
||||
// Connect to graph
|
||||
self.connect_node(node_idx, &vector, level)?;
|
||||
|
||||
// Update entry point if needed
|
||||
let mut entry = self.entry_point.write();
|
||||
if entry.is_none() || level > entry.as_ref().unwrap().level {
|
||||
*entry = Some(EntryPoint { node_idx, level });
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Apply a delta update to a vector
|
||||
pub fn apply_delta(&mut self, id: &str, delta: &VectorDelta) -> Result<()> {
|
||||
let node_idx = *self
|
||||
.id_to_idx
|
||||
.get(id)
|
||||
.ok_or_else(|| IndexError::NotFound(id.to_string()))?;
|
||||
|
||||
let mut node = self.nodes[node_idx as usize].write();
|
||||
|
||||
// Apply delta to vector
|
||||
delta
|
||||
.apply(&mut node.vector)
|
||||
.map_err(|e| IndexError::DeltaError(format!("{:?}", e)))?;
|
||||
|
||||
// Record in stream
|
||||
node.delta_stream.push(delta.clone());
|
||||
|
||||
// Check if repair is needed
|
||||
let cumulative_change = self.estimate_cumulative_change(&node);
|
||||
if cumulative_change > self.config.repair_threshold {
|
||||
drop(node);
|
||||
self.repair_node(node_idx)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Batch apply deltas
|
||||
pub fn apply_deltas_batch(&mut self, updates: &[(String, VectorDelta)]) -> Result<Vec<u32>> {
|
||||
let mut repaired = Vec::new();
|
||||
|
||||
for (id, delta) in updates {
|
||||
let node_idx = *self
|
||||
.id_to_idx
|
||||
.get(id)
|
||||
.ok_or_else(|| IndexError::NotFound(id.clone()))?;
|
||||
|
||||
let mut node = self.nodes[node_idx as usize].write();
|
||||
delta
|
||||
.apply(&mut node.vector)
|
||||
.map_err(|e| IndexError::DeltaError(format!("{:?}", e)))?;
|
||||
node.delta_stream.push(delta.clone());
|
||||
|
||||
let change = self.estimate_cumulative_change(&node);
|
||||
if change > self.config.repair_threshold {
|
||||
repaired.push(node_idx);
|
||||
}
|
||||
}
|
||||
|
||||
// Batch repair
|
||||
for node_idx in &repaired {
|
||||
drop(self.nodes[*node_idx as usize].write());
|
||||
self.repair_node(*node_idx)?;
|
||||
}
|
||||
|
||||
Ok(repaired)
|
||||
}
|
||||
|
||||
/// Search for k nearest neighbors
|
||||
pub fn search(&self, query: &[f32], k: usize) -> Result<Vec<SearchResult>> {
|
||||
if query.len() != self.dimensions {
|
||||
return Err(IndexError::DimensionMismatch {
|
||||
expected: self.dimensions,
|
||||
actual: query.len(),
|
||||
});
|
||||
}
|
||||
|
||||
let entry = self.entry_point.read();
|
||||
if entry.is_none() {
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
|
||||
let entry = entry.as_ref().unwrap();
|
||||
let mut current_node = entry.node_idx;
|
||||
|
||||
// Greedy search from top to layer 1
|
||||
for level in (1..=entry.level).rev() {
|
||||
current_node = self.greedy_search(query, current_node, level);
|
||||
}
|
||||
|
||||
// Layer 0: ef_search neighbors
|
||||
let candidates = self.search_layer(query, current_node, 0, self.config.ef_search);
|
||||
|
||||
// Take top-k
|
||||
let results: Vec<SearchResult> = candidates
|
||||
.into_iter()
|
||||
.take(k)
|
||||
.map(|(idx, dist)| {
|
||||
let node = self.nodes[idx as usize].read();
|
||||
SearchResult {
|
||||
id: node.id.clone(),
|
||||
distance: dist,
|
||||
vector: Some(node.vector.clone()),
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Update quality monitor
|
||||
if let Some(monitor) = &self.quality_monitor {
|
||||
monitor.record_search(query, &results);
|
||||
}
|
||||
|
||||
Ok(results)
|
||||
}
|
||||
|
||||
/// Get current quality metrics
|
||||
pub fn quality_metrics(&self) -> Option<QualityMetrics> {
|
||||
self.quality_monitor.as_ref().map(|m| m.metrics())
|
||||
}
|
||||
|
||||
/// Force repair of entire graph
|
||||
pub fn force_repair(&mut self) -> Result<usize> {
|
||||
let node_count = self.nodes.len();
|
||||
let mut repaired = 0;
|
||||
|
||||
for idx in 0..node_count {
|
||||
if self.repair_node(idx as u32)? {
|
||||
repaired += 1;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(repaired)
|
||||
}
|
||||
|
||||
/// Delete a vector by ID
|
||||
pub fn delete(&mut self, id: &str) -> Result<bool> {
|
||||
let node_idx = match self.id_to_idx.remove(id) {
|
||||
Some((_, idx)) => idx,
|
||||
None => return Ok(false),
|
||||
};
|
||||
|
||||
// Mark node as deleted (we don't physically remove to preserve indices)
|
||||
let mut node = self.nodes[node_idx as usize].write();
|
||||
node.id = String::new();
|
||||
node.vector.clear();
|
||||
node.neighbors.clear();
|
||||
|
||||
// Remove from other nodes' neighbor lists
|
||||
for i in 0..self.nodes.len() {
|
||||
if i == node_idx as usize {
|
||||
continue;
|
||||
}
|
||||
|
||||
let mut other = self.nodes[i].write();
|
||||
for level_neighbors in &mut other.neighbors {
|
||||
level_neighbors.retain(|n| *n != node_idx);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(true)
|
||||
}
|
||||
|
||||
/// Compact delta streams for all nodes
|
||||
pub fn compact_deltas(&mut self) -> usize {
|
||||
let mut total_compacted = 0;
|
||||
|
||||
for node in &self.nodes {
|
||||
let mut node = node.write();
|
||||
total_compacted += node.delta_stream.compact().unwrap_or(0);
|
||||
}
|
||||
|
||||
total_compacted
|
||||
}
|
||||
|
||||
// Private methods
|
||||
|
||||
fn random_level(&self) -> usize {
|
||||
let mut rng = self.rng.write();
|
||||
let r: f64 = rand::Rng::gen(&mut *rng);
|
||||
(-r.ln() * self.config.level_mult).floor() as usize
|
||||
}
|
||||
|
||||
fn connect_node(&mut self, node_idx: u32, vector: &[f32], level: usize) -> Result<()> {
|
||||
let entry = self.entry_point.read().clone();
|
||||
|
||||
if entry.is_none() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let entry = entry.unwrap();
|
||||
let mut current = entry.node_idx;
|
||||
|
||||
// Navigate from top level
|
||||
for l in (level + 1..=entry.level).rev() {
|
||||
current = self.greedy_search(vector, current, l);
|
||||
}
|
||||
|
||||
// Connect at each level
|
||||
for l in (0..=level.min(entry.level)).rev() {
|
||||
let neighbors = self.search_layer(vector, current, l, self.config.ef_construction);
|
||||
|
||||
let max_conn = if l == 0 {
|
||||
self.config.m0
|
||||
} else {
|
||||
self.config.m
|
||||
};
|
||||
|
||||
// Select best neighbors
|
||||
let selected: Vec<u32> = neighbors
|
||||
.into_iter()
|
||||
.take(max_conn)
|
||||
.map(|(idx, _)| idx)
|
||||
.collect();
|
||||
|
||||
// Update node's neighbors
|
||||
{
|
||||
let mut node = self.nodes[node_idx as usize].write();
|
||||
if l < node.neighbors.len() {
|
||||
node.neighbors[l] = selected.iter().cloned().collect();
|
||||
}
|
||||
}
|
||||
|
||||
// Add reverse connections
|
||||
for &neighbor_idx in &selected {
|
||||
let mut neighbor = self.nodes[neighbor_idx as usize].write();
|
||||
if l < neighbor.neighbors.len() {
|
||||
neighbor.neighbors[l].push(node_idx);
|
||||
|
||||
// Prune if over limit
|
||||
if neighbor.neighbors[l].len() > max_conn {
|
||||
let node_vec = self.nodes[neighbor_idx as usize].read().vector.clone();
|
||||
self.prune_neighbors(&mut neighbor.neighbors[l], &node_vec, max_conn);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !selected.is_empty() {
|
||||
current = selected[0];
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn greedy_search(&self, query: &[f32], start: u32, level: usize) -> u32 {
|
||||
let mut current = start;
|
||||
let mut current_dist = self.distance(query, current);
|
||||
|
||||
loop {
|
||||
let node = self.nodes[current as usize].read();
|
||||
if level >= node.neighbors.len() {
|
||||
break;
|
||||
}
|
||||
|
||||
let mut improved = false;
|
||||
|
||||
for &neighbor in &node.neighbors[level] {
|
||||
let dist = self.distance(query, neighbor);
|
||||
if dist < current_dist {
|
||||
current = neighbor;
|
||||
current_dist = dist;
|
||||
improved = true;
|
||||
}
|
||||
}
|
||||
|
||||
if !improved {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
current
|
||||
}
|
||||
|
||||
fn search_layer(&self, query: &[f32], start: u32, level: usize, ef: usize) -> Vec<(u32, f32)> {
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::BinaryHeap;
|
||||
use std::collections::HashSet;
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
struct Candidate {
|
||||
idx: u32,
|
||||
dist: f32,
|
||||
}
|
||||
|
||||
impl PartialEq for Candidate {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.dist == other.dist
|
||||
}
|
||||
}
|
||||
|
||||
impl Eq for Candidate {}
|
||||
|
||||
impl PartialOrd for Candidate {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl Ord for Candidate {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
// Min-heap by distance
|
||||
other
|
||||
.dist
|
||||
.partial_cmp(&self.dist)
|
||||
.unwrap_or(Ordering::Equal)
|
||||
}
|
||||
}
|
||||
|
||||
let start_dist = self.distance(query, start);
|
||||
let mut candidates = BinaryHeap::new();
|
||||
let mut results = BinaryHeap::new();
|
||||
let mut visited = HashSet::new();
|
||||
|
||||
candidates.push(Candidate {
|
||||
idx: start,
|
||||
dist: start_dist,
|
||||
});
|
||||
results.push(Candidate {
|
||||
idx: start,
|
||||
dist: -start_dist, // Max-heap for worst result
|
||||
});
|
||||
visited.insert(start);
|
||||
|
||||
while let Some(current) = candidates.pop() {
|
||||
// Check if we can stop
|
||||
if !results.is_empty() {
|
||||
let worst = results.peek().unwrap();
|
||||
if current.dist > -worst.dist {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let node = self.nodes[current.idx as usize].read();
|
||||
if level >= node.neighbors.len() {
|
||||
continue;
|
||||
}
|
||||
|
||||
for &neighbor in &node.neighbors[level] {
|
||||
if visited.contains(&neighbor) {
|
||||
continue;
|
||||
}
|
||||
visited.insert(neighbor);
|
||||
|
||||
let dist = self.distance(query, neighbor);
|
||||
|
||||
let should_add = results.len() < ef || dist < -results.peek().unwrap().dist;
|
||||
|
||||
if should_add {
|
||||
candidates.push(Candidate {
|
||||
idx: neighbor,
|
||||
dist,
|
||||
});
|
||||
results.push(Candidate {
|
||||
idx: neighbor,
|
||||
dist: -dist,
|
||||
});
|
||||
|
||||
if results.len() > ef {
|
||||
results.pop();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
results.into_iter().map(|c| (c.idx, -c.dist)).collect()
|
||||
}
|
||||
|
||||
fn distance(&self, query: &[f32], node_idx: u32) -> f32 {
|
||||
let node = self.nodes[node_idx as usize].read();
|
||||
if node.vector.is_empty() {
|
||||
return f32::MAX;
|
||||
}
|
||||
|
||||
// L2 distance squared
|
||||
query
|
||||
.iter()
|
||||
.zip(node.vector.iter())
|
||||
.map(|(a, b)| (a - b).powi(2))
|
||||
.sum::<f32>()
|
||||
.sqrt()
|
||||
}
|
||||
|
||||
fn prune_neighbors(&self, neighbors: &mut SmallVec<[u32; 32]>, node_vec: &[f32], max: usize) {
|
||||
if neighbors.len() <= max {
|
||||
return;
|
||||
}
|
||||
|
||||
// Sort by distance and keep closest
|
||||
let mut with_dist: Vec<(u32, f32)> = neighbors
|
||||
.iter()
|
||||
.map(|&n| (n, self.distance(node_vec, n)))
|
||||
.collect();
|
||||
|
||||
with_dist.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
|
||||
|
||||
neighbors.clear();
|
||||
for (idx, _) in with_dist.into_iter().take(max) {
|
||||
neighbors.push(idx);
|
||||
}
|
||||
}
|
||||
|
||||
fn estimate_cumulative_change(&self, node: &HnswNode) -> f32 {
|
||||
// Estimate change based on delta stream
|
||||
let mut total_change = 0.0f32;
|
||||
|
||||
for (_, delta) in node.delta_stream.iter() {
|
||||
total_change += delta.l2_norm();
|
||||
}
|
||||
|
||||
total_change
|
||||
}
|
||||
|
||||
fn repair_node(&mut self, node_idx: u32) -> Result<bool> {
|
||||
let node = self.nodes[node_idx as usize].read();
|
||||
if node.vector.is_empty() {
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
let vector = node.vector.clone();
|
||||
let level = node.level;
|
||||
drop(node);
|
||||
|
||||
// Reconnect based on current vector
|
||||
self.reconnect_node(node_idx, &vector, level)?;
|
||||
|
||||
// Compact delta stream
|
||||
{
|
||||
let mut node = self.nodes[node_idx as usize].write();
|
||||
node.delta_stream.compact().ok();
|
||||
}
|
||||
|
||||
Ok(true)
|
||||
}
|
||||
|
||||
fn reconnect_node(&mut self, node_idx: u32, vector: &[f32], level: usize) -> Result<()> {
|
||||
// Find new neighbors at each level
|
||||
let entry = self.entry_point.read().clone();
|
||||
if entry.is_none() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let entry = entry.unwrap();
|
||||
let mut current = entry.node_idx;
|
||||
|
||||
for l in (level + 1..=entry.level).rev() {
|
||||
current = self.greedy_search(vector, current, l);
|
||||
}
|
||||
|
||||
for l in (0..=level.min(entry.level)).rev() {
|
||||
let neighbors = self.search_layer(vector, current, l, self.config.ef_construction);
|
||||
|
||||
let max_conn = if l == 0 {
|
||||
self.config.m0
|
||||
} else {
|
||||
self.config.m
|
||||
};
|
||||
|
||||
// Filter out self
|
||||
let selected: Vec<u32> = neighbors
|
||||
.into_iter()
|
||||
.filter(|(idx, _)| *idx != node_idx)
|
||||
.take(max_conn)
|
||||
.map(|(idx, _)| idx)
|
||||
.collect();
|
||||
|
||||
// Update neighbors
|
||||
{
|
||||
let mut node = self.nodes[node_idx as usize].write();
|
||||
if l < node.neighbors.len() {
|
||||
node.neighbors[l] = selected.iter().cloned().collect();
|
||||
}
|
||||
}
|
||||
|
||||
if !selected.is_empty() {
|
||||
current = selected[0];
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// A single k-NN search hit.
#[derive(Debug, Clone)]
pub struct SearchResult {
    /// Vector ID
    pub id: String,
    /// Distance to query
    pub distance: f32,
    /// Optional vector data
    pub vector: Option<Vec<f32>>,
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a uniformly random vector of the given dimensionality.
    fn random_vector(dim: usize) -> Vec<f32> {
        use rand::Rng;
        let mut rng = rand::thread_rng();
        (0..dim).map(|_| rng.gen()).collect()
    }

    #[test]
    fn test_insert_and_search() {
        let mut index = DeltaHnsw::new(128, DeltaHnswConfig::default());

        for i in 0..100 {
            index
                .insert(&format!("vec_{}", i), random_vector(128))
                .unwrap();
        }
        assert_eq!(index.len(), 100);

        let results = index.search(&random_vector(128), 10).unwrap();
        assert_eq!(results.len(), 10);
    }

    #[test]
    fn test_delta_update() {
        let mut index = DeltaHnsw::new(4, DeltaHnswConfig::default());
        index.insert("test", vec![1.0, 2.0, 3.0, 4.0]).unwrap();

        let delta = VectorDelta::from_dense(vec![0.5, 0.0, -0.5, 0.0]);
        index.apply_delta("test", &delta).unwrap();

        // The index must remain searchable after a delta update.
        let results = index.search(&[1.5, 2.0, 2.5, 4.0], 1).unwrap();
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].id, "test");
    }

    #[test]
    fn test_delete() {
        let mut index = DeltaHnsw::new(4, DeltaHnswConfig::default());
        index.insert("a", vec![1.0, 0.0, 0.0, 0.0]).unwrap();
        index.insert("b", vec![0.0, 1.0, 0.0, 0.0]).unwrap();
        index.insert("c", vec![0.0, 0.0, 1.0, 0.0]).unwrap();

        assert!(index.delete("b").unwrap());
        assert!(!index.delete("nonexistent").unwrap());

        // Deleted nodes must never show up in search results.
        let results = index.search(&[0.0, 1.0, 0.0, 0.0], 10).unwrap();
        assert!(results.iter().all(|r| r.id != "b"));
    }
}
|
||||
341
vendor/ruvector/crates/ruvector-delta-index/src/quality.rs
vendored
Normal file
341
vendor/ruvector/crates/ruvector-delta-index/src/quality.rs
vendored
Normal file
@@ -0,0 +1,341 @@
|
||||
//! Quality monitoring for delta-aware HNSW
|
||||
//!
|
||||
//! Monitors recall quality and detects when repair is needed.
|
||||
|
||||
use std::collections::VecDeque;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
|
||||
use parking_lot::RwLock;
|
||||
|
||||
use crate::SearchResult;
|
||||
|
||||
/// Configuration for quality monitoring
#[derive(Debug, Clone)]
pub struct QualityConfig {
    /// Size of the sample window
    pub window_size: usize,
    /// Number of random samples for estimation
    pub sample_count: usize,
    /// Recall threshold below which repair is triggered
    pub recall_threshold: f32,
    /// How often to run quality checks (in search count)
    pub check_interval: usize,
}

impl Default for QualityConfig {
    /// Defaults: a 1000-sample window, 100 estimation samples, repair
    /// triggered below 0.9 recall, and a quality check every 100 searches.
    fn default() -> Self {
        QualityConfig {
            check_interval: 100,
            recall_threshold: 0.9,
            sample_count: 100,
            window_size: 1000,
        }
    }
}
|
||||
|
||||
/// Aggregate quality metrics maintained by the quality monitor.
#[derive(Debug, Clone, Default)]
pub struct QualityMetrics {
    /// Estimated recall
    pub recall: f32,
    /// Average number of distance computations
    // NOTE(review): record_search folds in the mean result *distance*,
    // not a computation count — confirm the intended semantics.
    pub avg_distance_comps: f32,
    /// Average search latency (ns)
    pub avg_latency_ns: f64,
    /// Total searches performed
    pub total_searches: u64,
    /// Searches since last repair
    pub searches_since_repair: u64,
}
|
||||
|
||||
/// Recall estimate with confidence
#[derive(Debug, Clone)]
pub struct RecallEstimate {
    /// Point estimate
    pub recall: f32,
    /// Lower bound (95% CI, Wilson interval)
    pub lower_bound: f32,
    /// Upper bound (95% CI, Wilson interval)
    pub upper_bound: f32,
    /// Number of samples the estimate was computed from
    pub samples: usize,
}
|
||||
|
||||
/// A sampled search, retained in a sliding window for recall estimation.
struct SearchSample {
    /// Fingerprint of the query vector (via `hash_vector`) — the full
    /// query itself is not stored.
    query_hash: u64,
    /// Returned result IDs
    result_ids: Vec<String>,
    /// Distances of the returned results
    distances: Vec<f32>,
    /// Wall-clock capture time (ns since the Unix epoch)
    timestamp_ns: u64,
}
|
||||
|
||||
/// Quality monitor for the index
pub struct QualityMonitor {
    /// Monitoring parameters (window size, thresholds, sampling interval).
    config: QualityConfig,
    /// Sliding window of sampled searches, bounded by `config.window_size`.
    samples: RwLock<VecDeque<SearchSample>>,
    /// Aggregate metrics, updated when a search is sampled.
    metrics: RwLock<QualityMetrics>,
    /// Lock-free count of `record_search` calls.
    search_count: AtomicU64,
    /// Vector dimensionality — not read by any visible method; presumably
    /// kept for future validation. TODO(review): confirm or drop.
    dimensions: usize,
}
|
||||
|
||||
impl QualityMonitor {
|
||||
/// Create a new quality monitor
|
||||
pub fn new(dimensions: usize) -> Self {
|
||||
Self {
|
||||
config: QualityConfig::default(),
|
||||
samples: RwLock::new(VecDeque::with_capacity(1000)),
|
||||
metrics: RwLock::new(QualityMetrics::default()),
|
||||
search_count: AtomicU64::new(0),
|
||||
dimensions,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create with custom configuration
|
||||
pub fn with_config(dimensions: usize, config: QualityConfig) -> Self {
|
||||
Self {
|
||||
config,
|
||||
samples: RwLock::new(VecDeque::with_capacity(1000)),
|
||||
metrics: RwLock::new(QualityMetrics::default()),
|
||||
search_count: AtomicU64::new(0),
|
||||
dimensions,
|
||||
}
|
||||
}
|
||||
|
||||
/// Record a search for quality monitoring
|
||||
pub fn record_search(&self, query: &[f32], results: &[SearchResult]) {
|
||||
let count = self.search_count.fetch_add(1, Ordering::Relaxed);
|
||||
|
||||
// Only sample periodically
|
||||
if count % (self.config.check_interval as u64) != 0 {
|
||||
return;
|
||||
}
|
||||
|
||||
let sample = SearchSample {
|
||||
query_hash: hash_vector(query),
|
||||
result_ids: results.iter().map(|r| r.id.clone()).collect(),
|
||||
distances: results.iter().map(|r| r.distance).collect(),
|
||||
timestamp_ns: current_time_ns(),
|
||||
};
|
||||
|
||||
let mut samples = self.samples.write();
|
||||
samples.push_back(sample);
|
||||
|
||||
// Maintain window size
|
||||
while samples.len() > self.config.window_size {
|
||||
samples.pop_front();
|
||||
}
|
||||
|
||||
// Update metrics
|
||||
let mut metrics = self.metrics.write();
|
||||
metrics.total_searches = count + 1;
|
||||
metrics.searches_since_repair += 1;
|
||||
|
||||
// Update average distance
|
||||
if !results.is_empty() {
|
||||
let avg_dist = results.iter().map(|r| r.distance).sum::<f32>() / results.len() as f32;
|
||||
let n = metrics.total_searches as f32;
|
||||
metrics.avg_distance_comps =
|
||||
metrics.avg_distance_comps * ((n - 1.0) / n) + avg_dist / n;
|
||||
}
|
||||
}
|
||||
|
||||
/// Get current metrics
|
||||
pub fn metrics(&self) -> QualityMetrics {
|
||||
self.metrics.read().clone()
|
||||
}
|
||||
|
||||
/// Estimate current recall
|
||||
pub fn estimate_recall(&self) -> RecallEstimate {
|
||||
let samples = self.samples.read();
|
||||
|
||||
if samples.is_empty() {
|
||||
return RecallEstimate {
|
||||
recall: 1.0,
|
||||
lower_bound: 0.0,
|
||||
upper_bound: 1.0,
|
||||
samples: 0,
|
||||
};
|
||||
}
|
||||
|
||||
// Estimate based on distance consistency
|
||||
let mut consistent = 0;
|
||||
let mut total = 0;
|
||||
|
||||
for i in 1..samples.len() {
|
||||
let prev = &samples[i - 1];
|
||||
let curr = &samples[i];
|
||||
|
||||
// If queries are similar, results should overlap
|
||||
if similar_queries(prev.query_hash, curr.query_hash) {
|
||||
total += 1;
|
||||
let overlap = count_overlap(&prev.result_ids, &curr.result_ids);
|
||||
if overlap > 0 {
|
||||
consistent += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let recall = if total > 0 {
|
||||
consistent as f32 / total as f32
|
||||
} else {
|
||||
1.0
|
||||
};
|
||||
|
||||
// Wilson confidence interval
|
||||
let n = total.max(1) as f32;
|
||||
let z = 1.96; // 95% CI
|
||||
let center = (recall + z * z / (2.0 * n)) / (1.0 + z * z / n);
|
||||
let width =
|
||||
z * (recall * (1.0 - recall) / n + z * z / (4.0 * n * n)).sqrt() / (1.0 + z * z / n);
|
||||
|
||||
RecallEstimate {
|
||||
recall,
|
||||
lower_bound: (center - width).max(0.0),
|
||||
upper_bound: (center + width).min(1.0),
|
||||
samples: samples.len(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Check if repair is needed based on quality
|
||||
pub fn needs_repair(&self) -> bool {
|
||||
let estimate = self.estimate_recall();
|
||||
estimate.recall < self.config.recall_threshold
|
||||
}
|
||||
|
||||
/// Reset counters after repair
|
||||
pub fn on_repair(&self) {
|
||||
let mut metrics = self.metrics.write();
|
||||
metrics.searches_since_repair = 0;
|
||||
}
|
||||
|
||||
/// Clear all samples
|
||||
pub fn clear(&self) {
|
||||
self.samples.write().clear();
|
||||
*self.metrics.write() = QualityMetrics::default();
|
||||
self.search_count.store(0, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
/// Fingerprint a vector for cheap comparison.
///
/// Only the first 16 components (fewer for shorter vectors) are fed into
/// the hasher bit-exactly, so this is a fast prefix fingerprint rather
/// than a full content hash.
fn hash_vector(v: &[f32]) -> u64 {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    let mut hasher = DefaultHasher::new();

    // Hash a bounded prefix so long vectors stay cheap to fingerprint.
    for &x in v.iter().take(16) {
        x.to_bits().hash(&mut hasher);
    }

    hasher.finish()
}
|
||||
|
||||
/// Two query fingerprints are considered similar when their hashes differ
/// in fewer than half of their 64 bits.
fn similar_queries(h1: u64, h2: u64) -> bool {
    (h1 ^ h2).count_ones() < 32
}
|
||||
|
||||
/// Count how many IDs in `a` also appear in `b`.
fn count_overlap(a: &[String], b: &[String]) -> usize {
    let mut shared = 0;
    for id in a {
        if b.iter().any(|other| other == id) {
            shared += 1;
        }
    }
    shared
}
|
||||
|
||||
/// Current wall-clock time in nanoseconds since the Unix epoch.
///
/// Returns 0 if the system clock reports a time before the epoch.
fn current_time_ns() -> u64 {
    use std::time::{SystemTime, UNIX_EPOCH};
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_nanos() as u64,
        Err(_) => 0,
    }
}
|
||||
|
||||
/// Distance distribution statistics
#[derive(Debug, Clone, Default)]
pub struct DistanceStats {
    /// Mean distance
    pub mean: f32,
    /// Standard deviation (population, i.e. divided by n)
    pub std_dev: f32,
    /// Minimum distance
    pub min: f32,
    /// Maximum distance
    pub max: f32,
    /// Median distance (upper-middle element for even-length input)
    pub median: f32,
}

impl DistanceStats {
    /// Calculate summary statistics from a list of distances.
    ///
    /// Returns the all-zero `Default` for an empty slice. Floats are
    /// ordered with `f32::total_cmp`, so NaN values sort to the end
    /// instead of panicking as `partial_cmp().unwrap()` would.
    pub fn from_distances(distances: &[f32]) -> Self {
        if distances.is_empty() {
            return Self::default();
        }

        let n = distances.len() as f32;
        let mean = distances.iter().sum::<f32>() / n;
        // Population variance: mean of squared deviations.
        let variance = distances.iter().map(|d| (d - mean).powi(2)).sum::<f32>() / n;

        let mut sorted = distances.to_vec();
        // total_cmp is a total order over f32 (NaN-safe); the previous
        // partial_cmp().unwrap() panicked on NaN input.
        sorted.sort_by(|a, b| a.total_cmp(b));

        Self {
            mean,
            std_dev: variance.sqrt(),
            min: *sorted.first().unwrap_or(&0.0),
            max: *sorted.last().unwrap_or(&0.0),
            median: sorted[sorted.len() / 2],
        }
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_quality_monitor_creation() {
        // A fresh monitor starts with zeroed metrics.
        let monitor = QualityMonitor::new(128);
        assert_eq!(monitor.metrics().total_searches, 0);
    }

    #[test]
    fn test_recall_estimation() {
        let monitor = QualityMonitor::new(128);

        // With no samples recorded, recall is optimistically 1.0.
        let estimate = monitor.estimate_recall();
        assert_eq!(estimate.samples, 0);
        assert!((estimate.recall - 1.0).abs() < 1e-6);
    }

    #[test]
    fn test_hash_vector() {
        let a = vec![1.0f32, 2.0, 3.0, 4.0];
        let b = a.clone();
        let c = vec![5.0f32, 6.0, 7.0, 8.0];

        // Equal content hashes equally; different content differs.
        assert_eq!(hash_vector(&a), hash_vector(&b));
        assert_ne!(hash_vector(&a), hash_vector(&c));
    }

    #[test]
    fn test_distance_stats() {
        let stats = DistanceStats::from_distances(&[1.0f32, 2.0, 3.0, 4.0, 5.0]);

        for (actual, expected) in [
            (stats.mean, 3.0),
            (stats.min, 1.0),
            (stats.max, 5.0),
            (stats.median, 3.0),
        ] {
            assert!((actual - expected).abs() < 1e-6);
        }
    }
}
|
||||
213
vendor/ruvector/crates/ruvector-delta-index/src/repair.rs
vendored
Normal file
213
vendor/ruvector/crates/ruvector-delta-index/src/repair.rs
vendored
Normal file
@@ -0,0 +1,213 @@
|
||||
//! Graph repair strategies
|
||||
//!
|
||||
//! Provides strategies for maintaining HNSW graph quality after delta updates.
|
||||
|
||||
use std::collections::HashSet;
|
||||
|
||||
/// Repair strategy
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RepairStrategy {
    /// No automatic repair
    None,
    /// Repair only when explicitly triggered
    Lazy,
    /// Immediate repair on every update
    Eager,
    /// Batch repair at intervals
    Batched,
    /// Adaptive based on quality monitoring
    Adaptive,
}

/// Configuration for graph repair
#[derive(Debug, Clone)]
pub struct RepairConfig {
    /// Repair strategy to use
    pub strategy: RepairStrategy,
    /// Batch size for batched repair
    pub batch_size: usize,
    /// Quality threshold below which repair is triggered
    pub quality_threshold: f32,
}

impl Default for RepairConfig {
    /// Defaults to lazy repair in batches of 100, triggered below 0.95 quality.
    fn default() -> Self {
        RepairConfig {
            quality_threshold: 0.95,
            batch_size: 100,
            strategy: RepairStrategy::Lazy,
        }
    }
}
|
||||
|
||||
/// Handles graph repair operations
|
||||
pub struct GraphRepairer {
|
||||
config: RepairConfig,
|
||||
pending_repairs: HashSet<u32>,
|
||||
repair_count: usize,
|
||||
}
|
||||
|
||||
impl GraphRepairer {
|
||||
/// Create a new graph repairer
|
||||
pub fn new(config: RepairConfig) -> Self {
|
||||
Self {
|
||||
config,
|
||||
pending_repairs: HashSet::new(),
|
||||
repair_count: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Mark a node as needing repair
|
||||
pub fn mark_for_repair(&mut self, node_idx: u32) {
|
||||
self.pending_repairs.insert(node_idx);
|
||||
}
|
||||
|
||||
/// Check if batch repair is needed
|
||||
pub fn needs_batch_repair(&self) -> bool {
|
||||
self.pending_repairs.len() >= self.config.batch_size
|
||||
}
|
||||
|
||||
/// Get nodes pending repair
|
||||
pub fn pending_nodes(&self) -> Vec<u32> {
|
||||
self.pending_repairs.iter().cloned().collect()
|
||||
}
|
||||
|
||||
/// Clear pending repairs
|
||||
pub fn clear_pending(&mut self) {
|
||||
self.pending_repairs.clear();
|
||||
}
|
||||
|
||||
/// Record completed repair
|
||||
pub fn record_repair(&mut self, count: usize) {
|
||||
self.repair_count += count;
|
||||
}
|
||||
|
||||
/// Get total repairs performed
|
||||
pub fn total_repairs(&self) -> usize {
|
||||
self.repair_count
|
||||
}
|
||||
|
||||
/// Get repair strategy
|
||||
pub fn strategy(&self) -> RepairStrategy {
|
||||
self.config.strategy
|
||||
}
|
||||
}
|
||||
|
||||
/// Result of a repair operation
#[derive(Debug, Clone)]
pub struct RepairResult {
    /// Number of nodes repaired
    pub nodes_repaired: usize,
    /// Number of edges updated
    pub edges_updated: usize,
    /// Quality before repair
    pub quality_before: f32,
    /// Quality after repair
    pub quality_after: f32,
}
|
||||
|
||||
/// How much of the graph a repair pass should touch.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RepairScope {
    /// Single node
    Node(u32),
    /// Neighborhood of a node (the `hops`-hop ball around `center`)
    Neighborhood { center: u32, hops: usize },
    /// Specific level of the graph
    Level(usize),
    /// Full graph
    Full,
}
|
||||
|
||||
/// Local repair operations for a single node
pub struct LocalRepair {
    /// Node being repaired
    pub node_idx: u32,
    /// Old neighbors to remove
    pub remove: Vec<(usize, u32)>, // (level, neighbor_idx)
    /// New neighbors to add
    pub add: Vec<(usize, u32)>,
}

impl LocalRepair {
    /// Create an empty repair plan for `node_idx`.
    pub fn new(node_idx: u32) -> Self {
        LocalRepair {
            node_idx,
            add: Vec::new(),
            remove: Vec::new(),
        }
    }

    /// True when the plan contains no edge changes at all.
    pub fn is_empty(&self) -> bool {
        self.add.is_empty() && self.remove.is_empty()
    }

    /// Total number of edge additions plus removals in the plan.
    pub fn change_count(&self) -> usize {
        self.add.len() + self.remove.len()
    }
}
|
||||
|
||||
/// Priority score for repairing a node: larger deltas and better-connected
/// nodes are repaired first.
pub fn repair_priority(delta_magnitude: f32, neighbor_count: usize) -> f32 {
    let connectivity_boost = 1.0 + (neighbor_count as f32).ln();
    delta_magnitude * connectivity_boost
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_repairer_creation() {
        let repairer = GraphRepairer::new(RepairConfig::default());

        // Defaults: lazy strategy, nothing repaired yet.
        assert_eq!(repairer.strategy(), RepairStrategy::Lazy);
        assert_eq!(repairer.total_repairs(), 0);
    }

    #[test]
    fn test_mark_for_repair() {
        let config = RepairConfig {
            batch_size: 5,
            ..Default::default()
        };
        let mut repairer = GraphRepairer::new(config);

        // Below the batch threshold: no batch repair yet.
        (0..3).for_each(|i| repairer.mark_for_repair(i));
        assert_eq!(repairer.pending_nodes().len(), 3);
        assert!(!repairer.needs_batch_repair());

        // Crossing the threshold flips the flag.
        (3..10).for_each(|i| repairer.mark_for_repair(i));
        assert!(repairer.needs_batch_repair());
    }

    #[test]
    fn test_local_repair() {
        let mut repair = LocalRepair::new(0);
        assert!(repair.is_empty());

        repair.add.push((0, 1));
        repair.remove.push((0, 2));

        assert_eq!(repair.change_count(), 2);
        assert!(!repair.is_empty());
    }

    #[test]
    fn test_repair_priority() {
        // Larger deltas win at equal connectivity.
        assert!(repair_priority(1.0, 10) > repair_priority(0.5, 10));
        // More neighbors win at equal delta.
        assert!(repair_priority(0.5, 20) > repair_priority(0.5, 10));
    }
}
|
||||
Reference in New Issue
Block a user