// git-subtree-dir: vendor/ruvector
// git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
// Source listing: 813 lines, 25 KiB, Rust
//! Integration tests for Analysis Context
//!
//! Tests for HDBSCAN clustering, cluster assignment, motif detection,
//! entropy calculation, and transition matrix operations.

use vibecast_tests::fixtures::*;
|
|
use vibecast_tests::mocks::*;
|
|
use std::collections::{HashMap, HashSet};

// ============================================================================
// HDBSCAN Clustering Tests
// ============================================================================

mod hdbscan_clustering {
    use super::*;

    /// Clustering two groups of 15 perturbed embeddings must produce at
    /// least one cluster.
    #[test]
    fn test_cluster_with_clear_groups() {
        let clusterer = MockClusteringService::with_params(5, 3);

        // Base vectors for the two well-separated groups.
        let first_base = create_deterministic_vector(1536, 0);
        let second_base = create_deterministic_vector(1536, 1000);

        let mut points: Vec<Embedding> = Vec::new();

        // Group 1: small uniform offsets applied to the first base vector.
        points.extend((0..15).map(|step| {
            let shifted: Vec<f32> = first_base
                .iter()
                .map(|v| v + (step as f32 * 0.001))
                .collect();
            create_test_embedding_with_vector(l2_normalize(&shifted))
        }));

        // Group 2: the same offsets applied to the second base vector.
        points.extend((0..15).map(|step| {
            let shifted: Vec<f32> = second_base
                .iter()
                .map(|v| v + (step as f32 * 0.001))
                .collect();
            create_test_embedding_with_vector(l2_normalize(&shifted))
        }));

        let found = clusterer.cluster_hdbscan(&points).unwrap();

        assert!(!found.is_empty(), "Should find at least one cluster");
    }

    /// With fewer points than the configured minimum, no cluster is formed.
    #[test]
    fn test_cluster_with_insufficient_data() {
        let clusterer = MockClusteringService::with_params(10, 5);

        // Only 3 embeddings - below min_cluster_size
        let too_few: Vec<Embedding> = (0..3).map(|_| create_test_embedding()).collect();

        let found = clusterer.cluster_hdbscan(&too_few).unwrap();

        assert_eq!(found.len(), 0, "Should not form clusters with too few points");
    }

    /// The fixture cluster is tagged with the HDBSCAN method.
    #[test]
    fn test_cluster_method_assignment() {
        let fixture = create_test_cluster();
        assert_eq!(fixture.method, ClusteringMethod::Hdbscan);
    }

    /// Cohesion and separation of the fixture cluster both lie in [0, 1].
    #[test]
    fn test_cluster_cohesion_in_valid_range() {
        let fixture = create_test_cluster();
        let unit_interval = 0.0..=1.0;

        assert!(unit_interval.contains(&fixture.cohesion));
        assert!(unit_interval.contains(&fixture.separation));
    }

    /// A cluster built with 20 members reports 20 ids and a non-empty centroid.
    #[test]
    fn test_cluster_has_members() {
        let fixture = create_test_cluster_with_members(20);

        assert_eq!(fixture.member_ids.len(), 20);
        assert!(!fixture.centroid.is_empty());
    }

    /// The fixture centroid has unit L2 norm (within 1e-4).
    #[test]
    fn test_cluster_centroid_is_normalized() {
        let fixture = create_test_cluster_with_members(10);

        let squared_sum = fixture.centroid.iter().map(|x| x * x).sum::<f32>();
        let norm: f32 = squared_sum.sqrt();
        assert!((norm - 1.0).abs() < 0.0001, "Centroid should be normalized");
    }

    /// Requesting five clusters yields five clusters with distinct ids.
    #[test]
    fn test_multiple_clusters() {
        let batch = create_test_clusters(5);

        assert_eq!(batch.len(), 5);

        // Each cluster should have unique ID
        let distinct_ids: HashSet<_> = batch.iter().map(|c| c.id.0).collect();
        assert_eq!(distinct_ids.len(), 5, "All cluster IDs should be unique");
    }
}

// ============================================================================
// Cluster Assignment Tests
// ============================================================================

mod cluster_assignment {
    use super::*;

    /// An embedding that equals a cluster's centroid is assigned to that
    /// cluster with positive confidence.
    #[test]
    fn test_assign_embedding_to_nearest_cluster() {
        let assigner = MockClusteringService::new();

        // Create clusters with known centroids
        let candidates = create_test_clusters(3);

        // Create embedding similar to first cluster's centroid
        let probe = create_test_embedding_with_vector(candidates[0].centroid.clone());

        let outcome = assigner.assign_to_cluster(&probe, &candidates).unwrap();

        assert!(outcome.is_some(), "Should assign to a cluster");
        let chosen = outcome.unwrap();
        assert_eq!(chosen.cluster_id, candidates[0].id);
        assert!(chosen.confidence > 0.0);
    }

    /// A point on the centroid gets strictly higher confidence than a point
    /// shifted away from it.
    #[test]
    fn test_assignment_confidence_based_on_distance() {
        let assigner = MockClusteringService::new();
        let candidates = create_test_clusters(2);

        // Very close to centroid
        let near_point = create_test_embedding_with_vector(candidates[0].centroid.clone());
        let near_result = assigner
            .assign_to_cluster(&near_point, &candidates)
            .unwrap()
            .unwrap();

        // Farther from centroid: every component shifted by 0.5, then re-normalized.
        let shifted: Vec<f32> = candidates[0].centroid.iter().map(|v| v + 0.5).collect();
        let distant_point = create_test_embedding_with_vector(l2_normalize(&shifted));
        let distant_result = assigner
            .assign_to_cluster(&distant_point, &candidates)
            .unwrap()
            .unwrap();

        assert!(
            near_result.confidence > distant_result.confidence,
            "Closer embeddings should have higher confidence"
        );
    }

    /// With no clusters to choose from, the assignment is `None`.
    #[test]
    fn test_no_assignment_to_empty_clusters() {
        let assigner = MockClusteringService::new();
        let probe = create_test_embedding();
        let no_clusters: Vec<Cluster> = Vec::new();

        let outcome = assigner.assign_to_cluster(&probe, &no_clusters).unwrap();
        assert!(outcome.is_none());
    }

    /// The reported distance to the centroid is never negative.
    #[test]
    fn test_assignment_includes_distance_to_centroid() {
        let assigner = MockClusteringService::new();
        let candidates = create_test_clusters(1);
        let probe = create_test_embedding();

        let result = assigner
            .assign_to_cluster(&probe, &candidates)
            .unwrap()
            .unwrap();

        assert!(
            result.distance_to_centroid >= 0.0,
            "Distance should be non-negative"
        );
    }

    /// An embedding near the boundary between two clusters has lower confidence.
    #[test]
    fn test_soft_assignment_concept() {
        let assigner = MockClusteringService::new();

        // Create two clusters
        let first_base = create_deterministic_vector(1536, 0);
        let second_base = create_deterministic_vector(1536, 100);

        let first_cluster = Cluster {
            id: ClusterId::new(),
            method: ClusteringMethod::Hdbscan,
            member_ids: vec![],
            centroid: l2_normalize(&first_base),
            cohesion: 0.8,
            separation: 0.6,
        };
        let second_cluster = Cluster {
            id: ClusterId::new(),
            method: ClusteringMethod::Hdbscan,
            member_ids: vec![],
            centroid: l2_normalize(&second_base),
            cohesion: 0.8,
            separation: 0.6,
        };
        let candidates = vec![first_cluster, second_cluster];

        // Point exactly between clusters: component-wise mean of the two bases.
        let halfway: Vec<f32> = first_base
            .iter()
            .zip(second_base.iter())
            .map(|(a, b)| (a + b) / 2.0)
            .collect();
        let boundary_point = create_test_embedding_with_vector(l2_normalize(&halfway));

        let result = assigner
            .assign_to_cluster(&boundary_point, &candidates)
            .unwrap()
            .unwrap();

        // Confidence should reflect uncertainty
        assert!(
            result.confidence < 0.9,
            "Boundary point should have lower confidence"
        );
    }
}

// ============================================================================
// Motif Detection Tests
// ============================================================================

mod motif_detection {
    use super::*;

    /// Sequences that repeat the same run of cluster ids must yield at least
    /// one motif whose pattern has two or more elements.
    #[test]
    fn test_detect_motifs_in_sequences() {
        let detector = MockMotifDetectionService::new();

        // Create sequences with repeating patterns
        let ids: Vec<ClusterId> = (0..5).map(|_| ClusterId::new()).collect();

        // Builds a sequence of cluster ids from positions in `ids`.
        let sequence_of = |indices: &[usize]| -> Vec<ClusterId> {
            indices.iter().map(|&i| ids[i]).collect()
        };

        let sequences: Vec<Vec<ClusterId>> = vec![
            sequence_of(&[0, 1, 2, 0, 1, 2]),
            sequence_of(&[0, 1, 2, 3]),
            sequence_of(&[2, 0, 1, 2]),
        ];

        let motifs = detector.detect_motifs(&sequences).unwrap();

        // Should find the [0,1,2] pattern that appears multiple times
        assert!(
            motifs.iter().any(|m| m.pattern.len() >= 2),
            "Should find at least one motif"
        );
    }

    /// The fixture motif has a positive occurrence count and a 3-element pattern.
    #[test]
    fn test_motif_occurrence_count() {
        let motif = create_test_motif();

        assert!(motif.occurrence_count > 0);
        assert_eq!(motif.pattern.len(), 3);
    }

    /// The fixture motif's confidence lies in [0, 1].
    #[test]
    fn test_motif_confidence_calculation() {
        let motif = create_test_motif();

        assert!(
            (0.0..=1.0).contains(&motif.confidence),
            "Confidence should be in [0, 1]"
        );
    }

    /// Detection over sequences of freshly created cluster ids must complete
    /// without returning an error.
    ///
    /// No assertion is made on the motifs themselves: random sequences are
    /// unlikely to contain recurring motifs, but the mock implementation may
    /// still report some.
    #[test]
    fn test_no_motifs_in_random_sequences() {
        let detector = MockMotifDetectionService::new();

        // Create completely random sequences with no patterns
        let sequences: Vec<Vec<ClusterId>> = (0..5)
            .map(|_| (0..10).map(|_| ClusterId::new()).collect())
            .collect();

        // Underscore-prefixed on purpose: only the `unwrap` is under test here.
        let _motifs = detector.detect_motifs(&sequences).unwrap();
    }

    /// An empty list of sequences yields no motifs.
    #[test]
    fn test_empty_sequence_handling() {
        let detector = MockMotifDetectionService::new();
        let no_sequences: Vec<Vec<ClusterId>> = Vec::new();

        let motifs = detector.detect_motifs(&no_sequences).unwrap();
        assert!(motifs.is_empty());
    }

    /// The fixture motif's average duration is at least 5000 ms.
    #[test]
    fn test_motif_duration_estimation() {
        let motif = create_test_motif();

        // 3-element motif at 5s per segment
        assert!(motif.avg_duration_ms >= 5000);
    }
}

// ============================================================================
// Entropy Calculation Tests
// ============================================================================

mod entropy_calculation {
    use super::*;

    /// A uniform transition matrix over four states has entropy rate close
    /// to log2(4) = 2 bits.
    #[test]
    fn test_entropy_rate_uniform_distribution() {
        // Create transition matrix with uniform distribution
        let states = 4;
        let ids: Vec<ClusterId> = (0..states).map(|_| ClusterId::new()).collect();
        let even_share = 1.0 / states as f32;

        let matrix = TransitionMatrix {
            cluster_ids: ids.clone(),
            probabilities: vec![vec![even_share; states]; states],
            observations: vec![vec![10; states]; states],
        };

        let measured = compute_entropy_rate(&matrix);

        // Maximum entropy for uniform distribution = log2(n) = 2 bits for n=4
        let ceiling = (states as f32).log2();
        assert!(
            (measured - ceiling).abs() < 0.1,
            "Uniform distribution should have maximum entropy: {} vs {}",
            measured,
            ceiling
        );
    }

    /// A matrix where every state moves to exactly one successor has
    /// near-zero entropy rate.
    #[test]
    fn test_entropy_rate_deterministic() {
        // Create transition matrix with deterministic transitions
        let states = 4;
        let ids: Vec<ClusterId> = (0..states).map(|_| ClusterId::new()).collect();

        // Each state always transitions to the next state
        let mut cycle = vec![vec![0.0; states]; states];
        for (from, row) in cycle.iter_mut().enumerate() {
            row[(from + 1) % states] = 1.0;
        }

        let matrix = TransitionMatrix {
            cluster_ids: ids,
            probabilities: cycle,
            observations: vec![vec![10; states]; states],
        };

        let measured = compute_entropy_rate(&matrix);

        // Deterministic transitions should have zero entropy
        assert!(
            measured < 0.1,
            "Deterministic transitions should have near-zero entropy: {}",
            measured
        );
    }

    /// The entropy rate of the fixture matrix is never negative, checked
    /// over ten freshly created matrices.
    #[test]
    fn test_entropy_rate_non_negative() {
        for _ in 0..10 {
            let matrix = create_test_transition_matrix(5);
            let measured = compute_entropy_rate(&matrix);

            assert!(
                measured >= 0.0,
                "Entropy should never be negative: {}",
                measured
            );
        }
    }

    /// A uniform matrix has strictly higher entropy rate than one dominated
    /// by self-loops.
    #[test]
    fn test_entropy_increases_with_randomness() {
        let states = 4;
        let ids: Vec<ClusterId> = (0..states).map(|_| ClusterId::new()).collect();

        // Low entropy (predictable): 0.8 on the diagonal, the remaining 0.2
        // split evenly across the other states.
        let sticky_rows: Vec<Vec<f32>> = (0..states)
            .map(|from| {
                (0..states)
                    .map(|to| {
                        if from == to {
                            0.8 // High self-loop probability
                        } else {
                            0.2 / (states - 1) as f32
                        }
                    })
                    .collect()
            })
            .collect();

        let predictable = TransitionMatrix {
            cluster_ids: ids.clone(),
            probabilities: sticky_rows,
            observations: vec![vec![10; states]; states],
        };

        // High entropy (uniform)
        let even_share = 1.0 / states as f32;
        let unpredictable = TransitionMatrix {
            cluster_ids: ids,
            probabilities: vec![vec![even_share; states]; states],
            observations: vec![vec![10; states]; states],
        };

        let low_entropy = compute_entropy_rate(&predictable);
        let high_entropy = compute_entropy_rate(&unpredictable);

        assert!(
            high_entropy > low_entropy,
            "More uniform distribution should have higher entropy: {} vs {}",
            high_entropy,
            low_entropy
        );
    }

    /// A matrix with no states has an entropy rate of exactly zero.
    #[test]
    fn test_empty_matrix_entropy() {
        let empty = TransitionMatrix {
            cluster_ids: vec![],
            probabilities: vec![],
            observations: vec![],
        };

        assert_eq!(compute_entropy_rate(&empty), 0.0);
    }
}

// ============================================================================
// Transition Matrix Tests
// ============================================================================

mod transition_matrix {
    use super::*;

    /// The fixture matrix for five states is 5x5 and lists five cluster ids.
    #[test]
    fn test_create_transition_matrix() {
        let matrix = create_test_transition_matrix(5);

        assert_eq!(matrix.cluster_ids.len(), 5);
        assert_eq!(matrix.probabilities.len(), 5);
        assert_eq!(matrix.probabilities[0].len(), 5);
    }

    /// Every probability row of the fixture matrix sums to 1 (within 1e-4).
    #[test]
    fn test_transition_matrix_rows_sum_to_one() {
        let matrix = create_test_transition_matrix(5);

        for (i, row) in matrix.probabilities.iter().enumerate() {
            let row_sum: f32 = row.iter().copied().sum();
            assert!(
                (row_sum - 1.0).abs() < 0.0001,
                "Row {} should sum to 1.0, got {}",
                i,
                row_sum
            );
        }
    }

    /// No entry of the fixture probability matrix is negative.
    #[test]
    fn test_transition_matrix_probabilities_non_negative() {
        let matrix = create_test_transition_matrix(5);

        for (i, row) in matrix.probabilities.iter().enumerate() {
            for (j, prob) in row.iter().copied().enumerate() {
                assert!(
                    prob >= 0.0,
                    "Probability at ({}, {}) should be non-negative: {}",
                    i,
                    j,
                    prob
                );
            }
        }
    }

    /// The fixture observation matrix is square and every count is positive.
    #[test]
    fn test_observations_matrix() {
        let matrix = create_test_transition_matrix(4);

        assert_eq!(matrix.observations.len(), 4);
        assert_eq!(matrix.observations[0].len(), 4);

        // All observations should be positive
        for &count in matrix.observations.iter().flatten() {
            assert!(count > 0);
        }
    }

    /// Counting adjacent pairs of a known sequence reproduces the expected
    /// transition counts.
    ///
    /// Relies on `ClusterId::new()` returning distinct ids, since states are
    /// looked up by equality.
    #[test]
    fn test_build_transition_matrix_from_sequence() {
        let ids: Vec<ClusterId> = (0..3).map(|_| ClusterId::new()).collect();
        let sequence = vec![ids[0], ids[1], ids[0], ids[2], ids[1], ids[0]];

        // Position of a cluster id within `ids`; every id in `sequence` comes from `ids`.
        let index_of = |id: &ClusterId| ids.iter().position(|c| c == id).unwrap();

        // Count transitions
        let mut counts: HashMap<(usize, usize), u32> = HashMap::new();
        for pair in sequence.windows(2) {
            let key = (index_of(&pair[0]), index_of(&pair[1]));
            *counts.entry(key).or_insert(0) += 1;
        }

        let count_of = |from: usize, to: usize| counts.get(&(from, to)).copied().unwrap_or(0);

        // Sequence: [0, 1, 0, 2, 1, 0]
        // Transitions: 0->1 (1x), 1->0 (2x), 0->2 (1x), 2->1 (1x)
        assert_eq!(count_of(0, 1), 1);
        assert_eq!(count_of(1, 0), 2);
        assert_eq!(count_of(0, 2), 1);
        assert_eq!(count_of(2, 1), 1);

        // Six elements give five adjacent pairs spread over four distinct transitions.
        assert_eq!(counts.len(), 4);
        assert_eq!(counts.values().sum::<u32>(), 5);
    }
}

// ============================================================================
// Sequence Analysis Tests
// ============================================================================

mod sequence_analysis {
    use super::*;

    /// Each segment must end no later than the next one starts.
    #[test]
    fn test_sequence_segment_ordering() {
        let segments = create_segment_sequence(10, 500);

        // `windows(2)` yields nothing for fewer than two segments, so an empty
        // sequence cannot underflow the way `0..len - 1` would.
        for pair in segments.windows(2) {
            assert!(
                pair[0].end_ms <= pair[1].start_ms,
                "Segments should be in temporal order"
            );
        }
    }

    /// A deterministic cycle has lower entropy rate than a uniform matrix.
    #[test]
    fn test_stereotypy_calculation() {
        // Stereotypy = measure of how predictable transitions are
        // High stereotypy = consistent patterns
        // Low stereotypy = varied patterns

        let states = 4;
        let ids: Vec<ClusterId> = (0..states).map(|_| ClusterId::new()).collect();

        // Highly stereotyped (deterministic cycle)
        let mut cycle = vec![vec![0.0; states]; states];
        for (from, row) in cycle.iter_mut().enumerate() {
            row[(from + 1) % states] = 1.0;
        }

        // Low stereotypy (uniform)
        let even_share = 1.0 / states as f32;

        let stereotyped_matrix = TransitionMatrix {
            cluster_ids: ids.clone(),
            probabilities: cycle,
            observations: vec![vec![10; states]; states],
        };

        let varied_matrix = TransitionMatrix {
            cluster_ids: ids,
            probabilities: vec![vec![even_share; states]; states],
            observations: vec![vec![10; states]; states],
        };

        let stereotyped_entropy = compute_entropy_rate(&stereotyped_matrix);
        let varied_entropy = compute_entropy_rate(&varied_matrix);

        // Stereotyped should have lower entropy (more predictable)
        assert!(stereotyped_entropy < varied_entropy);
    }

    /// 60 motif segments out of 100 give a density of 0.6.
    #[test]
    fn test_motif_density() {
        // Motif density = ratio of segments that are part of motifs

        let total_segments = 100;
        let motif_segments = 60;

        let density = motif_segments as f32 / total_segments as f32;
        assert!((density - 0.6).abs() < 0.001);
    }
}

// ============================================================================
// Anomaly Detection Tests
// ============================================================================

mod anomaly_detection {
|
|
use super::*;
|
|
|
|
fn compute_local_outlier_factor(
|
|
embedding: &Embedding,
|
|
neighbors: &[Embedding],
|
|
) -> f32 {
|
|
if neighbors.is_empty() {
|
|
return 1.0;
|
|
}
|
|
|
|
// Compute average distance to neighbors
|
|
let avg_distance: f32 = neighbors
|
|
.iter()
|
|
.map(|n| cosine_distance(&embedding.vector, &n.vector))
|
|
.sum::<f32>()
|
|
/ neighbors.len() as f32;
|
|
|
|
// LOF > 1 indicates anomaly
|
|
// This is simplified; real LOF compares local density to neighbors' densities
|
|
avg_distance * 10.0 // Scale factor for detection
|
|
}
|
|
|
|
#[test]
|
|
fn test_detect_outlier_embedding() {
|
|
// Create cluster of normal embeddings
|
|
let base = create_deterministic_vector(1536, 0);
|
|
let normal_embeddings: Vec<Embedding> = (0..20)
|
|
.map(|i| {
|
|
let noisy: Vec<f32> = base.iter().map(|v| v + (i as f32 * 0.001)).collect();
|
|
create_test_embedding_with_vector(l2_normalize(&noisy))
|
|
})
|
|
.collect();
|
|
|
|
// Create outlier (very different)
|
|
let outlier_base = create_deterministic_vector(1536, 1000);
|
|
let outlier = create_test_embedding_with_vector(l2_normalize(&outlier_base));
|
|
|
|
// Compute LOF for outlier
|
|
let lof = compute_local_outlier_factor(&outlier, &normal_embeddings);
|
|
|
|
// LOF should be high for outlier
|
|
assert!(lof > 1.0, "Outlier should have high LOF: {}", lof);
|
|
}
|
|
|
|
#[test]
|
|
fn test_normal_embedding_not_anomalous() {
|
|
let base = create_deterministic_vector(1536, 0);
|
|
let embeddings: Vec<Embedding> = (0..20)
|
|
.map(|i| {
|
|
let noisy: Vec<f32> = base.iter().map(|v| v + (i as f32 * 0.001)).collect();
|
|
create_test_embedding_with_vector(l2_normalize(&noisy))
|
|
})
|
|
.collect();
|
|
|
|
// Check LOF for a normal point
|
|
let test_point = &embeddings[10];
|
|
let neighbors: Vec<Embedding> = embeddings
|
|
.iter()
|
|
.filter(|e| e.id != test_point.id)
|
|
.cloned()
|
|
.collect();
|
|
|
|
let lof = compute_local_outlier_factor(test_point, &neighbors);
|
|
|
|
// Should be relatively low for normal point
|
|
assert!(
|
|
lof < 5.0,
|
|
"Normal point should have low LOF: {}",
|
|
lof
|
|
);
|
|
}
|
|
}

// ============================================================================
// Cluster Validation Tests
// ============================================================================

mod cluster_validation {
|
|
use super::*;
|
|
|
|
fn compute_silhouette_score(
|
|
embedding: &Embedding,
|
|
own_cluster_members: &[Embedding],
|
|
other_cluster_members: &[Embedding],
|
|
) -> f32 {
|
|
if own_cluster_members.is_empty() {
|
|
return 0.0;
|
|
}
|
|
|
|
// a = average distance to own cluster members
|
|
let a: f32 = own_cluster_members
|
|
.iter()
|
|
.filter(|e| e.id != embedding.id)
|
|
.map(|e| cosine_distance(&embedding.vector, &e.vector))
|
|
.sum::<f32>()
|
|
/ (own_cluster_members.len() - 1).max(1) as f32;
|
|
|
|
// b = average distance to nearest other cluster
|
|
let b: f32 = if other_cluster_members.is_empty() {
|
|
1.0
|
|
} else {
|
|
other_cluster_members
|
|
.iter()
|
|
.map(|e| cosine_distance(&embedding.vector, &e.vector))
|
|
.sum::<f32>()
|
|
/ other_cluster_members.len() as f32
|
|
};
|
|
|
|
// Silhouette = (b - a) / max(a, b)
|
|
let max_ab = a.max(b);
|
|
if max_ab > 0.0 {
|
|
(b - a) / max_ab
|
|
} else {
|
|
0.0
|
|
}
|
|
}
|
|
|
|
#[test]
|
|
fn test_silhouette_score_well_separated_clusters() {
|
|
// Create well-separated clusters
|
|
let base1 = create_deterministic_vector(1536, 0);
|
|
let base2 = create_deterministic_vector(1536, 1000);
|
|
|
|
let cluster1: Vec<Embedding> = (0..10)
|
|
.map(|i| {
|
|
let noisy: Vec<f32> = base1.iter().map(|v| v + (i as f32 * 0.001)).collect();
|
|
create_test_embedding_with_vector(l2_normalize(&noisy))
|
|
})
|
|
.collect();
|
|
|
|
let cluster2: Vec<Embedding> = (0..10)
|
|
.map(|i| {
|
|
let noisy: Vec<f32> = base2.iter().map(|v| v + (i as f32 * 0.001)).collect();
|
|
create_test_embedding_with_vector(l2_normalize(&noisy))
|
|
})
|
|
.collect();
|
|
|
|
// Compute silhouette for point in cluster 1
|
|
let score = compute_silhouette_score(&cluster1[5], &cluster1, &cluster2);
|
|
|
|
// Should be positive (closer to own cluster)
|
|
assert!(
|
|
score > 0.0,
|
|
"Well-separated clusters should have positive silhouette: {}",
|
|
score
|
|
);
|
|
}
|
|
|
|
#[test]
|
|
fn test_silhouette_score_range() {
|
|
let embeddings = create_embedding_batch(20);
|
|
|
|
// Split into two arbitrary clusters
|
|
let cluster1: Vec<Embedding> = embeddings[0..10].to_vec();
|
|
let cluster2: Vec<Embedding> = embeddings[10..20].to_vec();
|
|
|
|
for emb in &cluster1 {
|
|
let score = compute_silhouette_score(emb, &cluster1, &cluster2);
|
|
assert!(
|
|
score >= -1.0 && score <= 1.0,
|
|
"Silhouette should be in [-1, 1]: {}",
|
|
score
|
|
);
|
|
}
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// End-to-end smoke test: clustering, entropy and motif detection run in
    /// sequence without returning an error, and the entropy is non-negative.
    #[test]
    fn test_analysis_integration_smoke_test() {
        // Create embeddings
        let batch = create_embedding_batch(50);

        // Run clustering
        let clusterer = MockClusteringService::with_params(5, 3);
        let found = clusterer.cluster_hdbscan(&batch).unwrap();

        // Create transition matrix with at least three states, even when
        // fewer clusters were found.
        let state_count = found.len().max(3);
        let matrix = create_test_transition_matrix(state_count);

        // Compute entropy
        let entropy = compute_entropy_rate(&matrix);
        assert!(entropy >= 0.0);

        // Detect motifs: one sequence per cluster, repeating its id three times.
        let detector = MockMotifDetectionService::new();
        let sequences: Vec<Vec<ClusterId>> = found
            .iter()
            .map(|cluster| vec![cluster.id, cluster.id, cluster.id])
            .collect();
        let _motifs = detector.detect_motifs(&sequences).unwrap();
    }
}
|