Files
wifi-densepose/crates/ruvector-nervous-system/src/hdc/similarity.rs
ruv d803bfe2b1 Squashed 'vendor/ruvector/' content from commit b64c2172
git-subtree-dir: vendor/ruvector
git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
2026-02-28 14:39:40 -05:00

397 lines
11 KiB
Rust

//! Similarity and distance metrics for hypervectors
use super::vector::Hypervector;
use super::HYPERVECTOR_BITS;
/// Computes Hamming distance between two hypervectors
///
/// Returns the number of bits that differ between the two vectors.
///
/// # Performance
///
/// <100ns with SIMD popcount instruction
///
/// # Example
///
/// ```rust
/// use ruvector_nervous_system::hdc::{Hypervector, hamming_distance};
///
/// let a = Hypervector::random();
/// let b = Hypervector::random();
/// let dist = hamming_distance(&a, &b);
/// assert!(dist > 0);
/// ```
#[inline]
pub fn hamming_distance(v1: &Hypervector, v2: &Hypervector) -> u32 {
v1.hamming_distance(v2)
}
/// Computes cosine similarity approximation for binary hypervectors
///
/// For binary vectors, cosine similarity ≈ 1 - 2*hamming_distance/dimension
///
/// Returns a value in [0.0, 1.0] where:
/// - 1.0 = identical vectors
/// - 0.5 = orthogonal/random vectors
/// - 0.0 = opposite vectors
///
/// # Example
///
/// ```rust
/// use ruvector_nervous_system::hdc::{Hypervector, cosine_similarity};
///
/// let a = Hypervector::random();
/// let b = a.clone();
/// let sim = cosine_similarity(&a, &b);
/// assert!((sim - 1.0).abs() < 0.001);
/// ```
#[inline]
pub fn cosine_similarity(v1: &Hypervector, v2: &Hypervector) -> f32 {
v1.similarity(v2)
}
/// Computes normalized Hamming similarity [0.0, 1.0]
///
/// This is equivalent to `1.0 - (hamming_distance / dimension)`
///
/// # Example
///
/// ```rust
/// use ruvector_nervous_system::hdc::{Hypervector, normalized_hamming};
///
/// let a = Hypervector::random();
/// let sim = normalized_hamming(&a, &a);
/// assert!((sim - 1.0).abs() < 0.001);
/// ```
pub fn normalized_hamming(v1: &Hypervector, v2: &Hypervector) -> f32 {
let hamming = v1.hamming_distance(v2);
1.0 - (hamming as f32 / HYPERVECTOR_BITS as f32)
}
/// Computes Jaccard similarity coefficient
///
/// Jaccard = |intersection| / |union| for binary vectors
///
/// # Example
///
/// ```rust
/// use ruvector_nervous_system::hdc::{Hypervector, jaccard_similarity};
///
/// let a = Hypervector::random();
/// let b = Hypervector::random();
/// let sim = jaccard_similarity(&a, &b);
/// assert!(sim >= 0.0 && sim <= 1.0);
/// ```
pub fn jaccard_similarity(v1: &Hypervector, v2: &Hypervector) -> f32 {
let mut intersection = 0u32;
let mut union = 0u32;
let bits1 = v1.bits();
let bits2 = v2.bits();
for i in 0..bits1.len() {
let and = bits1[i] & bits2[i];
let or = bits1[i] | bits2[i];
intersection += and.count_ones();
union += or.count_ones();
}
if union == 0 {
1.0 // Both vectors are zero
} else {
intersection as f32 / union as f32
}
}
/// Finds the k most similar vectors from a set
///
/// Returns indices and similarities of top-k matches, sorted by similarity (descending).
///
/// # Example
///
/// ```rust
/// use ruvector_nervous_system::hdc::{Hypervector, top_k_similar};
///
/// let query = Hypervector::random();
/// let candidates: Vec<_> = (0..10).map(|_| Hypervector::random()).collect();
///
/// let top3 = top_k_similar(&query, &candidates, 3);
/// assert_eq!(top3.len(), 3);
/// assert!(top3[0].1 >= top3[1].1); // Sorted descending
/// ```
pub fn top_k_similar(
query: &Hypervector,
candidates: &[Hypervector],
k: usize,
) -> Vec<(usize, f32)> {
let mut similarities: Vec<_> = candidates
.iter()
.enumerate()
.map(|(idx, candidate)| (idx, query.similarity(candidate)))
.collect();
// Partial sort to get top k (NaN-safe)
similarities.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Less));
similarities.into_iter().take(k).collect()
}
/// Computes pairwise similarity matrix
///
/// Returns NxN matrix where result\[i\]\[j\] = similarity(vectors\[i\], vectors\[j\])
///
/// # Example
///
/// ```rust
/// use ruvector_nervous_system::hdc::{Hypervector, pairwise_similarities};
///
/// let vectors: Vec<_> = (0..5).map(|_| Hypervector::random()).collect();
/// let matrix = pairwise_similarities(&vectors);
///
/// assert_eq!(matrix.len(), 5);
/// assert_eq!(matrix[0].len(), 5);
/// assert!((matrix[0][0] - 1.0).abs() < 0.001); // Diagonal is 1.0
/// ```
pub fn pairwise_similarities(vectors: &[Hypervector]) -> Vec<Vec<f32>> {
let n = vectors.len();
let mut matrix = vec![vec![0.0; n]; n];
for i in 0..n {
matrix[i][i] = 1.0; // Diagonal
for j in (i + 1)..n {
let sim = vectors[i].similarity(&vectors[j]);
matrix[i][j] = sim;
matrix[j][i] = sim; // Symmetric
}
}
matrix
}
/// Computes batch similarities of query against all candidates
///
/// Optimized for computing one-to-many similarities efficiently.
/// Uses loop unrolling for better CPU pipeline utilization.
///
/// # Performance
///
/// ~20ns per similarity (amortized over batch)
///
/// # Example
///
/// ```rust
/// use ruvector_nervous_system::hdc::{Hypervector, batch_similarities};
///
/// let query = Hypervector::random();
/// let candidates: Vec<_> = (0..100).map(|_| Hypervector::random()).collect();
///
/// let sims = batch_similarities(&query, &candidates);
/// assert_eq!(sims.len(), 100);
/// ```
#[inline]
pub fn batch_similarities(query: &Hypervector, candidates: &[Hypervector]) -> Vec<f32> {
let n = candidates.len();
let mut results = Vec::with_capacity(n);
// Process in chunks of 4 for better cache utilization
let chunks = n / 4;
let remainder = n % 4;
for i in 0..chunks {
let base = i * 4;
results.push(query.similarity(&candidates[base]));
results.push(query.similarity(&candidates[base + 1]));
results.push(query.similarity(&candidates[base + 2]));
results.push(query.similarity(&candidates[base + 3]));
}
// Handle remainder
let base = chunks * 4;
for i in 0..remainder {
results.push(query.similarity(&candidates[base + i]));
}
results
}
/// Finds indices of all vectors with similarity above threshold
///
/// # Example
///
/// ```rust
/// use ruvector_nervous_system::hdc::{Hypervector, find_similar};
///
/// let query = Hypervector::from_seed(42);
/// let candidates: Vec<_> = (0..100).map(|i| Hypervector::from_seed(i)).collect();
///
/// let matches = find_similar(&query, &candidates, 0.9);
/// assert!(matches.contains(&42)); // Should find itself
/// ```
pub fn find_similar(query: &Hypervector, candidates: &[Hypervector], threshold: f32) -> Vec<usize> {
candidates
.iter()
.enumerate()
.filter_map(|(idx, candidate)| {
if query.similarity(candidate) >= threshold {
Some(idx)
} else {
None
}
})
.collect()
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_hamming_distance_identical() {
let v = Hypervector::random();
assert_eq!(hamming_distance(&v, &v), 0);
}
#[test]
fn test_hamming_distance_random() {
let v1 = Hypervector::random();
let v2 = Hypervector::random();
let dist = hamming_distance(&v1, &v2);
// Random vectors should differ in ~50% of bits
assert!(dist > 4000 && dist < 6000, "distance: {}", dist);
}
#[test]
fn test_cosine_similarity_identical() {
let v = Hypervector::random();
let sim = cosine_similarity(&v, &v);
assert!((sim - 1.0).abs() < 0.001);
}
#[test]
fn test_cosine_similarity_bounds() {
let v1 = Hypervector::random();
let v2 = Hypervector::random();
let sim = cosine_similarity(&v1, &v2);
// Cosine similarity for binary vectors: 1 - 2*hamming/dim gives [-1, 1]
assert!(
sim >= -1.0 && sim <= 1.0,
"similarity out of bounds: {}",
sim
);
}
#[test]
fn test_normalized_hamming_identical() {
let v = Hypervector::random();
let sim = normalized_hamming(&v, &v);
assert!((sim - 1.0).abs() < 0.001);
}
#[test]
fn test_normalized_hamming_random() {
let v1 = Hypervector::random();
let v2 = Hypervector::random();
let sim = normalized_hamming(&v1, &v2);
// Random vectors should have ~0.5 similarity
assert!(sim > 0.3 && sim < 0.7, "similarity: {}", sim);
}
#[test]
fn test_jaccard_identical() {
let v = Hypervector::random();
let sim = jaccard_similarity(&v, &v);
assert!((sim - 1.0).abs() < 0.001);
}
#[test]
fn test_jaccard_zero_vectors() {
let v1 = Hypervector::zero();
let v2 = Hypervector::zero();
let sim = jaccard_similarity(&v1, &v2);
assert!((sim - 1.0).abs() < 0.001);
}
#[test]
fn test_jaccard_bounds() {
let v1 = Hypervector::random();
let v2 = Hypervector::random();
let sim = jaccard_similarity(&v1, &v2);
assert!(sim >= 0.0 && sim <= 1.0);
}
#[test]
fn test_top_k_similar() {
let query = Hypervector::from_seed(0);
let candidates: Vec<_> = (1..11).map(|i| Hypervector::from_seed(i)).collect();
let top3 = top_k_similar(&query, &candidates, 3);
assert_eq!(top3.len(), 3);
// Should be sorted descending
assert!(top3[0].1 >= top3[1].1);
assert!(top3[1].1 >= top3[2].1);
}
#[test]
fn test_top_k_more_than_candidates() {
let query = Hypervector::random();
let candidates: Vec<_> = (0..5).map(|_| Hypervector::random()).collect();
let top10 = top_k_similar(&query, &candidates, 10);
// Should return all 5, not 10
assert_eq!(top10.len(), 5);
}
#[test]
fn test_pairwise_similarities_diagonal() {
let vectors: Vec<_> = (0..5).map(|i| Hypervector::from_seed(i)).collect();
let matrix = pairwise_similarities(&vectors);
assert_eq!(matrix.len(), 5);
for i in 0..5 {
assert!((matrix[i][i] - 1.0).abs() < 0.001);
}
}
#[test]
fn test_pairwise_similarities_symmetric() {
let vectors: Vec<_> = (0..5).map(|i| Hypervector::from_seed(i)).collect();
let matrix = pairwise_similarities(&vectors);
for i in 0..5 {
for j in 0..5 {
assert!((matrix[i][j] - matrix[j][i]).abs() < 0.001);
}
}
}
#[test]
fn test_pairwise_similarities_bounds() {
let vectors: Vec<_> = (0..5).map(|_| Hypervector::random()).collect();
let matrix = pairwise_similarities(&vectors);
for row in &matrix {
for &sim in row {
// Similarity range is [-1, 1] for cosine similarity
assert!(
sim >= -1.0 && sim <= 1.0,
"similarity out of bounds: {}",
sim
);
}
}
}
}