git-subtree-dir: vendor/ruvector git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
654 lines
20 KiB
Rust
654 lines
20 KiB
Rust
//! Integration tests for Vector Space Context
|
|
//!
|
|
//! Tests for HNSW index creation, vector insertion, k-NN search accuracy,
|
|
//! index persistence, and batch insertion performance.
|
|
|
|
use vibecast_tests::fixtures::*;
|
|
use vibecast_tests::mocks::*;
|
|
use std::collections::HashSet;
|
|
use std::time::Instant;
|
|
|
|
// ============================================================================
|
|
// HNSW Index Creation Tests
|
|
// ============================================================================
|
|
|
|
mod index_creation {
|
|
use super::*;
|
|
|
|
#[test]
|
|
fn test_create_empty_index() {
|
|
let index = MockVectorIndex::new();
|
|
assert_eq!(index.count(), 0);
|
|
}
|
|
|
|
#[test]
|
|
fn test_create_index_with_config() {
|
|
let config = HnswConfig {
|
|
m: 32,
|
|
ef_construction: 400,
|
|
ef_search: 200,
|
|
max_layers: 8,
|
|
};
|
|
|
|
let index = MockVectorIndex::with_config(config);
|
|
assert_eq!(index.count(), 0);
|
|
}
|
|
|
|
#[test]
|
|
fn test_default_hnsw_config() {
|
|
let config = HnswConfig::default();
|
|
|
|
assert_eq!(config.m, 16);
|
|
assert_eq!(config.ef_construction, 200);
|
|
assert_eq!(config.ef_search, 100);
|
|
assert_eq!(config.max_layers, 6);
|
|
}
|
|
|
|
#[test]
|
|
fn test_index_with_distance_metrics() {
|
|
let cosine_index = MockVectorIndex::new().with_distance_metric(DistanceMetric::Cosine);
|
|
let euclidean_index =
|
|
MockVectorIndex::new().with_distance_metric(DistanceMetric::Euclidean);
|
|
let poincare_index = MockVectorIndex::new().with_distance_metric(DistanceMetric::Poincare);
|
|
|
|
// All should be created successfully
|
|
assert_eq!(cosine_index.count(), 0);
|
|
assert_eq!(euclidean_index.count(), 0);
|
|
assert_eq!(poincare_index.count(), 0);
|
|
}
|
|
}
|
|
|
|
// ============================================================================
|
|
// Vector Insertion Tests
|
|
// ============================================================================
|
|
|
|
mod vector_insertion {
|
|
use super::*;
|
|
|
|
#[test]
|
|
fn test_insert_single_vector() {
|
|
let index = MockVectorIndex::new();
|
|
let vector = create_normalized_vector(1536);
|
|
let embedding_id = EmbeddingId::new();
|
|
|
|
let vector_id = index.insert(embedding_id, vector).unwrap();
|
|
assert_eq!(index.count(), 1);
|
|
|
|
let retrieved = index.get(&vector_id).unwrap().unwrap();
|
|
assert_eq!(retrieved.embedding_id, embedding_id);
|
|
}
|
|
|
|
#[test]
|
|
fn test_insert_multiple_vectors() {
|
|
let index = MockVectorIndex::new();
|
|
|
|
for i in 0..100 {
|
|
let vector = create_deterministic_vector(1536, i);
|
|
index.insert(EmbeddingId::new(), vector).unwrap();
|
|
}
|
|
|
|
assert_eq!(index.count(), 100);
|
|
}
|
|
|
|
#[test]
|
|
fn test_insert_normalized_vectors() {
|
|
let index = MockVectorIndex::new();
|
|
|
|
for i in 0..10 {
|
|
let vector = create_normalized_vector(1536);
|
|
index.insert(EmbeddingId::new(), vector).unwrap();
|
|
}
|
|
|
|
assert_eq!(index.count(), 10);
|
|
}
|
|
|
|
#[test]
|
|
fn test_insert_preserves_vector_data() {
|
|
let index = MockVectorIndex::new();
|
|
let original_vector = create_deterministic_vector(1536, 42);
|
|
let embedding_id = EmbeddingId::new();
|
|
|
|
let vector_id = index.insert(embedding_id, original_vector.clone()).unwrap();
|
|
|
|
let retrieved = index.get(&vector_id).unwrap().unwrap();
|
|
assert_eq!(retrieved.vector, original_vector);
|
|
}
|
|
|
|
#[test]
|
|
fn test_insert_assigns_layer() {
|
|
let index = MockVectorIndex::new();
|
|
|
|
let mut layers_seen = HashSet::new();
|
|
for i in 0..100 {
|
|
let vector = create_deterministic_vector(1536, i);
|
|
let vector_id = index.insert(EmbeddingId::new(), vector).unwrap();
|
|
let indexed = index.get(&vector_id).unwrap().unwrap();
|
|
layers_seen.insert(indexed.layer);
|
|
}
|
|
|
|
// HNSW should assign vectors to multiple layers
|
|
assert!(layers_seen.len() >= 1, "Should have at least one layer");
|
|
}
|
|
|
|
#[test]
|
|
fn test_remove_vector() {
|
|
let index = MockVectorIndex::new();
|
|
let vector = create_normalized_vector(1536);
|
|
let vector_id = index.insert(EmbeddingId::new(), vector).unwrap();
|
|
|
|
assert_eq!(index.count(), 1);
|
|
|
|
index.remove(&vector_id).unwrap();
|
|
assert_eq!(index.count(), 0);
|
|
assert!(index.get(&vector_id).unwrap().is_none());
|
|
}
|
|
}
|
|
|
|
// ============================================================================
|
|
// k-NN Search Accuracy Tests
|
|
// ============================================================================
|
|
|
|
mod knn_search {
|
|
use super::*;
|
|
|
|
#[test]
|
|
fn test_search_returns_k_results() {
|
|
let index = MockVectorIndex::new();
|
|
|
|
// Insert 100 vectors
|
|
for i in 0..100 {
|
|
let vector = create_deterministic_vector(1536, i);
|
|
index.insert(EmbeddingId::new(), vector).unwrap();
|
|
}
|
|
|
|
let query = create_deterministic_vector(1536, 50);
|
|
let results = index.search(&query, 10).unwrap();
|
|
|
|
assert_eq!(results.len(), 10);
|
|
}
|
|
|
|
#[test]
|
|
fn test_search_results_sorted_by_distance() {
|
|
let index = MockVectorIndex::new();
|
|
|
|
for i in 0..100 {
|
|
let vector = create_deterministic_vector(1536, i);
|
|
index.insert(EmbeddingId::new(), vector).unwrap();
|
|
}
|
|
|
|
let query = create_deterministic_vector(1536, 50);
|
|
let results = index.search(&query, 20).unwrap();
|
|
|
|
// Verify sorted by distance
|
|
for i in 0..results.len() - 1 {
|
|
assert!(
|
|
results[i].distance <= results[i + 1].distance,
|
|
"Results should be sorted by distance"
|
|
);
|
|
}
|
|
}
|
|
|
|
#[test]
|
|
fn test_search_finds_exact_match() {
|
|
let index = MockVectorIndex::new();
|
|
|
|
let target_vector = create_deterministic_vector(1536, 42);
|
|
let target_id = index.insert(EmbeddingId::new(), target_vector.clone()).unwrap();
|
|
|
|
// Insert other vectors
|
|
for i in 0..50 {
|
|
if i != 42 {
|
|
let vector = create_deterministic_vector(1536, i);
|
|
index.insert(EmbeddingId::new(), vector).unwrap();
|
|
}
|
|
}
|
|
|
|
let results = index.search(&target_vector, 1).unwrap();
|
|
|
|
assert_eq!(results.len(), 1);
|
|
assert_eq!(results[0].vector_id, target_id);
|
|
assert!(results[0].distance < 0.0001, "Exact match should have near-zero distance");
|
|
}
|
|
|
|
#[test]
|
|
fn test_recall_at_10_meets_threshold() {
|
|
let index = MockVectorIndex::new();
|
|
|
|
// Insert vectors with known structure
|
|
let num_vectors = 1000;
|
|
let mut all_vectors: Vec<(VectorId, Vec<f32>)> = Vec::new();
|
|
|
|
for i in 0..num_vectors {
|
|
let vector = create_deterministic_vector(1536, i);
|
|
let vector_id = index.insert(EmbeddingId::new(), vector.clone()).unwrap();
|
|
all_vectors.push((vector_id, vector));
|
|
}
|
|
|
|
// Test recall with multiple queries
|
|
let mut total_recall = 0.0;
|
|
let num_queries = 20;
|
|
|
|
for query_idx in (0..num_vectors).step_by(num_vectors / num_queries) {
|
|
let query = &all_vectors[query_idx].1;
|
|
|
|
// Compute true k-NN (brute force)
|
|
let mut true_distances: Vec<(VectorId, f32)> = all_vectors
|
|
.iter()
|
|
.map(|(id, v)| (*id, cosine_distance(query, v)))
|
|
.collect();
|
|
true_distances.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
|
|
let true_top_10: HashSet<VectorId> =
|
|
true_distances.iter().take(10).map(|(id, _)| *id).collect();
|
|
|
|
// Get approximate k-NN
|
|
let approx_results = index.search(query, 10).unwrap();
|
|
let approx_top_10: HashSet<VectorId> =
|
|
approx_results.iter().map(|r| r.vector_id).collect();
|
|
|
|
// Compute recall
|
|
let intersection_count = true_top_10.intersection(&approx_top_10).count();
|
|
let recall = intersection_count as f32 / 10.0;
|
|
total_recall += recall;
|
|
}
|
|
|
|
let avg_recall = total_recall / num_queries as f32;
|
|
assert!(
|
|
avg_recall >= 0.95,
|
|
"Recall@10 should be >= 0.95, got {}",
|
|
avg_recall
|
|
);
|
|
}
|
|
|
|
#[test]
|
|
fn test_search_with_varying_k() {
|
|
let index = MockVectorIndex::new();
|
|
|
|
for i in 0..100 {
|
|
let vector = create_deterministic_vector(1536, i);
|
|
index.insert(EmbeddingId::new(), vector).unwrap();
|
|
}
|
|
|
|
let query = create_normalized_vector(1536);
|
|
|
|
for k in [1, 5, 10, 20, 50] {
|
|
let results = index.search(&query, k).unwrap();
|
|
assert_eq!(results.len(), k, "Should return exactly k={} results", k);
|
|
}
|
|
}
|
|
|
|
#[test]
|
|
fn test_search_empty_index() {
|
|
let index = MockVectorIndex::new();
|
|
let query = create_normalized_vector(1536);
|
|
|
|
let results = index.search(&query, 10).unwrap();
|
|
assert_eq!(results.len(), 0);
|
|
}
|
|
|
|
#[test]
|
|
fn test_search_k_larger_than_index_size() {
|
|
let index = MockVectorIndex::new();
|
|
|
|
for i in 0..5 {
|
|
let vector = create_deterministic_vector(1536, i);
|
|
index.insert(EmbeddingId::new(), vector).unwrap();
|
|
}
|
|
|
|
let query = create_normalized_vector(1536);
|
|
let results = index.search(&query, 100).unwrap();
|
|
|
|
assert_eq!(results.len(), 5, "Should return all vectors if k > index size");
|
|
}
|
|
}
|
|
|
|
// ============================================================================
|
|
// Neighbor Graph Tests
|
|
// ============================================================================
|
|
|
|
mod neighbor_graph {
|
|
use super::*;
|
|
|
|
#[test]
|
|
fn test_get_neighbors() {
|
|
let index = MockVectorIndex::new();
|
|
|
|
// Insert vectors
|
|
let mut vector_ids = Vec::new();
|
|
for i in 0..20 {
|
|
let vector = create_deterministic_vector(1536, i);
|
|
let id = index.insert(EmbeddingId::new(), vector).unwrap();
|
|
vector_ids.push(id);
|
|
}
|
|
|
|
// Get neighbors of first vector
|
|
let neighbors = index.get_neighbors(&vector_ids[0], 5).unwrap();
|
|
|
|
assert_eq!(neighbors.len(), 5);
|
|
// Should not include self
|
|
assert!(!neighbors.iter().any(|r| r.vector_id == vector_ids[0]));
|
|
}
|
|
|
|
#[test]
|
|
fn test_similarity_edge_creation() {
|
|
let source_id = VectorId::new();
|
|
let target_id = VectorId::new();
|
|
|
|
let edge = SimilarityEdge {
|
|
source_id,
|
|
target_id,
|
|
distance: 0.15,
|
|
edge_type: "SIMILAR".to_string(),
|
|
};
|
|
|
|
assert_eq!(edge.source_id, source_id);
|
|
assert_eq!(edge.target_id, target_id);
|
|
assert!(edge.distance < 0.2);
|
|
}
|
|
|
|
#[test]
|
|
fn test_search_result_ranking() {
|
|
let results = create_search_results(10);
|
|
|
|
for (i, result) in results.iter().enumerate() {
|
|
assert_eq!(result.rank, i + 1, "Rank should be 1-indexed");
|
|
if i > 0 {
|
|
assert!(
|
|
result.distance >= results[i - 1].distance,
|
|
"Distance should be non-decreasing"
|
|
);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// ============================================================================
|
|
// Index Persistence Tests
|
|
// ============================================================================
|
|
|
|
mod persistence {
|
|
use super::*;
|
|
|
|
#[test]
|
|
fn test_save_and_load_index() {
|
|
let index = MockVectorIndex::new();
|
|
|
|
// Insert vectors
|
|
for i in 0..50 {
|
|
let vector = create_deterministic_vector(1536, i);
|
|
index.insert(EmbeddingId::new(), vector).unwrap();
|
|
}
|
|
|
|
let original_count = index.count();
|
|
|
|
// Save to bytes
|
|
let bytes = index.save_to_bytes().unwrap();
|
|
assert!(!bytes.is_empty());
|
|
|
|
// Load from bytes (mock - doesn't restore actual data)
|
|
let loaded = MockVectorIndex::load_from_bytes(&bytes).unwrap();
|
|
|
|
// In real implementation, this would verify:
|
|
// assert_eq!(loaded.count(), original_count);
|
|
}
|
|
|
|
#[test]
|
|
fn test_persistence_format() {
|
|
let index = MockVectorIndex::new();
|
|
|
|
for i in 0..10 {
|
|
let vector = create_deterministic_vector(1536, i);
|
|
index.insert(EmbeddingId::new(), vector).unwrap();
|
|
}
|
|
|
|
let bytes = index.save_to_bytes().unwrap();
|
|
|
|
// Check header (count as u64)
|
|
assert!(bytes.len() >= 8);
|
|
let count = u64::from_le_bytes([
|
|
bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], bytes[7],
|
|
]);
|
|
assert_eq!(count, 10);
|
|
}
|
|
}
|
|
|
|
// ============================================================================
|
|
// Batch Insertion Performance Tests
|
|
// ============================================================================
|
|
|
|
mod batch_performance {
|
|
use super::*;
|
|
|
|
#[test]
|
|
fn test_batch_insert() {
|
|
let index = MockVectorIndex::new();
|
|
|
|
let embeddings: Vec<(EmbeddingId, Vec<f32>)> = (0..100)
|
|
.map(|i| (EmbeddingId::new(), create_deterministic_vector(1536, i)))
|
|
.collect();
|
|
|
|
let vector_ids = index.insert_batch(embeddings).unwrap();
|
|
|
|
assert_eq!(vector_ids.len(), 100);
|
|
assert_eq!(index.count(), 100);
|
|
}
|
|
|
|
#[test]
|
|
fn test_batch_insert_performance() {
|
|
let index = MockVectorIndex::new();
|
|
let batch_size = 1000;
|
|
|
|
let embeddings: Vec<(EmbeddingId, Vec<f32>)> = (0..batch_size)
|
|
.map(|i| (EmbeddingId::new(), create_deterministic_vector(1536, i)))
|
|
.collect();
|
|
|
|
let start = Instant::now();
|
|
let vector_ids = index.insert_batch(embeddings).unwrap();
|
|
let duration = start.elapsed();
|
|
|
|
assert_eq!(vector_ids.len(), batch_size);
|
|
|
|
// Should complete reasonably fast (mock implementation)
|
|
let vectors_per_second = batch_size as f64 / duration.as_secs_f64();
|
|
assert!(
|
|
vectors_per_second > 1000.0,
|
|
"Batch insertion should be fast, got {} vec/sec",
|
|
vectors_per_second
|
|
);
|
|
}
|
|
|
|
#[test]
|
|
fn test_incremental_vs_batch_insert() {
|
|
// Test incremental insertion
|
|
let index1 = MockVectorIndex::new();
|
|
let start1 = Instant::now();
|
|
for i in 0..100 {
|
|
let vector = create_deterministic_vector(1536, i);
|
|
index1.insert(EmbeddingId::new(), vector).unwrap();
|
|
}
|
|
let duration1 = start1.elapsed();
|
|
|
|
// Test batch insertion
|
|
let index2 = MockVectorIndex::new();
|
|
let embeddings: Vec<(EmbeddingId, Vec<f32>)> = (0..100)
|
|
.map(|i| (EmbeddingId::new(), create_deterministic_vector(1536, i)))
|
|
.collect();
|
|
|
|
let start2 = Instant::now();
|
|
index2.insert_batch(embeddings).unwrap();
|
|
let duration2 = start2.elapsed();
|
|
|
|
assert_eq!(index1.count(), index2.count());
|
|
}
|
|
|
|
#[test]
|
|
fn test_scaling_with_index_size() {
|
|
let index = MockVectorIndex::new();
|
|
|
|
let sizes = vec![100, 500, 1000];
|
|
let mut search_times = Vec::new();
|
|
|
|
for size in &sizes {
|
|
// Build index to target size
|
|
while index.count() < *size {
|
|
let vector = create_deterministic_vector(1536, index.count());
|
|
index.insert(EmbeddingId::new(), vector).unwrap();
|
|
}
|
|
|
|
// Measure search time
|
|
let query = create_normalized_vector(1536);
|
|
let start = Instant::now();
|
|
for _ in 0..100 {
|
|
index.search(&query, 10).unwrap();
|
|
}
|
|
let duration = start.elapsed();
|
|
search_times.push(duration);
|
|
}
|
|
|
|
// Search time should scale sub-linearly with index size (HNSW property)
|
|
// With mock implementation, just verify search completes
|
|
for (i, time) in search_times.iter().enumerate() {
|
|
assert!(
|
|
time.as_millis() < 10000,
|
|
"Search at size {} took too long",
|
|
sizes[i]
|
|
);
|
|
}
|
|
}
|
|
}
|
|
|
|
// ============================================================================
|
|
// Distance Metric Tests
|
|
// ============================================================================
|
|
|
|
mod distance_metrics {
|
|
use super::*;
|
|
|
|
#[test]
|
|
fn test_cosine_distance_identical_vectors() {
|
|
let v = create_normalized_vector(1536);
|
|
let dist = cosine_distance(&v, &v);
|
|
assert!(dist < 0.0001, "Identical vectors should have distance ~0");
|
|
}
|
|
|
|
#[test]
|
|
fn test_cosine_distance_orthogonal_vectors() {
|
|
let v1 = vec![1.0, 0.0, 0.0];
|
|
let v2 = vec![0.0, 1.0, 0.0];
|
|
let dist = cosine_distance(&v1, &v2);
|
|
assert!((dist - 1.0).abs() < 0.0001, "Orthogonal vectors should have distance 1");
|
|
}
|
|
|
|
#[test]
|
|
fn test_cosine_distance_opposite_vectors() {
|
|
let v1 = vec![1.0, 0.0];
|
|
let v2 = vec![-1.0, 0.0];
|
|
let dist = cosine_distance(&v1, &v2);
|
|
assert!((dist - 2.0).abs() < 0.0001, "Opposite vectors should have distance 2");
|
|
}
|
|
|
|
#[test]
|
|
fn test_euclidean_distance_identical_vectors() {
|
|
let v = create_normalized_vector(1536);
|
|
let dist = euclidean_distance(&v, &v);
|
|
assert!(dist < 0.0001, "Identical vectors should have distance 0");
|
|
}
|
|
|
|
#[test]
|
|
fn test_euclidean_distance_known_value() {
|
|
let v1 = vec![0.0, 0.0];
|
|
let v2 = vec![3.0, 4.0];
|
|
let dist = euclidean_distance(&v1, &v2);
|
|
assert!((dist - 5.0).abs() < 0.0001, "3-4-5 triangle");
|
|
}
|
|
|
|
#[test]
|
|
fn test_distance_metric_symmetry() {
|
|
let v1 = create_normalized_vector(1536);
|
|
let v2 = create_deterministic_vector(1536, 42);
|
|
|
|
let cosine_12 = cosine_distance(&v1, &v2);
|
|
let cosine_21 = cosine_distance(&v2, &v1);
|
|
assert!((cosine_12 - cosine_21).abs() < 0.0001, "Cosine distance should be symmetric");
|
|
|
|
let eucl_12 = euclidean_distance(&v1, &v2);
|
|
let eucl_21 = euclidean_distance(&v2, &v1);
|
|
assert!((eucl_12 - eucl_21).abs() < 0.0001, "Euclidean distance should be symmetric");
|
|
}
|
|
|
|
#[test]
|
|
fn test_triangle_inequality() {
|
|
let v1 = create_deterministic_vector(100, 0);
|
|
let v2 = create_deterministic_vector(100, 1);
|
|
let v3 = create_deterministic_vector(100, 2);
|
|
|
|
let d12 = euclidean_distance(&v1, &v2);
|
|
let d23 = euclidean_distance(&v2, &v3);
|
|
let d13 = euclidean_distance(&v1, &v3);
|
|
|
|
assert!(
|
|
d13 <= d12 + d23 + 0.0001,
|
|
"Triangle inequality should hold: {} <= {} + {}",
|
|
d13,
|
|
d12,
|
|
d23
|
|
);
|
|
}
|
|
}
|
|
|
|
// ============================================================================
|
|
// Indexed Vector Tests
|
|
// ============================================================================
|
|
|
|
mod indexed_vector {
|
|
use super::*;
|
|
|
|
#[test]
|
|
fn test_indexed_vector_creation() {
|
|
let indexed = IndexedVector::default();
|
|
|
|
assert_eq!(indexed.vector.len(), 1536);
|
|
assert_normalized(&indexed.vector, 0.0001);
|
|
}
|
|
|
|
#[test]
|
|
fn test_create_indexed_vectors() {
|
|
let vectors = create_indexed_vectors(10);
|
|
|
|
assert_eq!(vectors.len(), 10);
|
|
for (i, v) in vectors.iter().enumerate() {
|
|
assert_eq!(v.vector.len(), 1536);
|
|
// Verify deterministic generation
|
|
let expected = create_deterministic_vector(1536, i);
|
|
assert_eq!(v.vector, expected);
|
|
}
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod tests {
|
|
use super::*;
|
|
|
|
#[test]
|
|
fn test_vector_space_integration_smoke_test() {
|
|
let index = MockVectorIndex::new();
|
|
|
|
// Insert vectors
|
|
for i in 0..100 {
|
|
let vector = create_deterministic_vector(1536, i);
|
|
index.insert(EmbeddingId::new(), vector).unwrap();
|
|
}
|
|
|
|
// Search
|
|
let query = create_deterministic_vector(1536, 50);
|
|
let results = index.search(&query, 10).unwrap();
|
|
|
|
// Verify
|
|
assert_eq!(results.len(), 10);
|
|
assert!(results[0].distance < results[9].distance);
|
|
|
|
// Check recall
|
|
let exact_match = results.iter().find(|r| r.distance < 0.01);
|
|
assert!(exact_match.is_some(), "Should find near-exact match");
|
|
}
|
|
}
|