Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'

This commit is contained in:
ruv
2026-02-28 14:39:40 -05:00
7854 changed files with 3522914 additions and 0 deletions

View File

@@ -0,0 +1,34 @@
use env_logger::Builder;
use anndists::dist::DistL1;
use hnsw_rs::hnsw::Hnsw;
// A test program to see if memory from insertions gets deallocated.
// This program sets up a process that iteratively builds a new model and lets it go out of scope.
// Since the models go out of scope, the desired behavior is that memory consumption is constant while this program is running.
fn main() {
    // Configure logging from the RUST_LOG environment variable.
    Builder::from_default_env().init();
    // Test vectors of varying dimension, inserted under ids 0..=5.
    let samples: [&[f32]; 6] = [
        &[1.0, 0.0, 0.0, 0.0],
        &[0.0, 1.0, 1.0],
        &[0.0, 0.0, 1.0],
        &[1.0, 0.0, 0.0, 1.0],
        &[1.0, 1.0, 1.0],
        &[1.0, -1.0, 1.0],
    ];
    let mut counter: usize = 0;
    loop {
        // Build a fresh index each pass; it goes out of scope at the end of the
        // iteration, so memory use should stay flat while this program runs.
        let hnsw: Hnsw<f32, DistL1> = Hnsw::new(15, 100_000, 20, 500_000, DistL1 {});
        for (id, &sample) in samples.iter().enumerate() {
            hnsw.insert_slice((sample, id));
        }
        // Periodic progress marker so we can see the loop is alive.
        if counter % 1_000_000 == 0 {
            println!("counter : {}", counter)
        }
        counter += 1;
    }
}

View File

@@ -0,0 +1,266 @@
#![allow(clippy::needless_range_loop)]
#![allow(clippy::range_zip_with_len)]
use anndists::dist::*;
use hnsw_rs::prelude::*;
use rand::{Rng, distr::Uniform};
use std::iter;
#[allow(unused)]
fn log_init_test() {
    // Install the test logger; `.ok()` discards the error raised when a
    // previous test has already registered a global logger.
    env_logger::builder().is_test(true).try_init().ok();
}
// Shows two ways to do filtering, by a sorted vector or with a closure
// We build a hnsw-index over a dictionary of random words
// Only ids within 300-400 should be in the result-set
// Used to create a random string
fn generate_random_string(len: usize) -> String {
const CHARSET: &[u8] = b"abcdefghij";
let mut rng = rand::rng();
let one_char = || CHARSET[rng.random_range(0..CHARSET.len())] as char;
iter::repeat_with(one_char).take(len).collect()
}
// this function uses a sorted vector as a filter
// Demonstrates filtering via a closure that consults a sorted id vector:
// only ids present in `filter_vector` may appear in the result set.
fn search_closure_filter(
    word: &str,
    hns: &Hnsw<u16, DistLevenshtein>,
    words: &[String],
    filter_vector: &[usize],
) {
    // Encode the query as u16 character codes, matching the index contents.
    let query: Vec<u16> = word.chars().map(|c| c as u16).collect();
    // The closure admits an id only if the sorted vector contains it; of
    // course arbitrary, more advanced filter logic could be plugged in here.
    let filter = |id: &usize| -> bool { filter_vector.binary_search(id).is_ok() };
    // Run the search through the closure; ids rejected by it are excluded.
    println!("========== Search with closure filter");
    let ef_search = 30;
    let neighbours = hns.search_possible_filter(&query, 10, ef_search, Some(&filter));
    for neighbour in &neighbours {
        println!(
            "Word: {:?} Id: {:?} Distance: {:?}",
            words[neighbour.d_id], neighbour.d_id, neighbour.distance
        );
    }
}
#[test]
fn filter_levenstein() {
    // Indexes random words under Levenshtein distance, then compares three
    // ways of restricting results to ids 300..400:
    //   1. `search_filter` with a sorted id vector on the full index,
    //   2. a plain `search` on a second index holding only those ids,
    //   3. a closure filter (see search_closure_filter).
    let nb_elem = 500000; // number of possible words in the dictionary
    let max_nb_connection = 15;
    let nb_layer = 16.min((nb_elem as f32).ln().trunc() as usize);
    let ef_c = 200;
    let hns = Hnsw::<u16, DistLevenshtein>::new(
        max_nb_connection,
        nb_elem,
        nb_layer,
        ef_c,
        DistLevenshtein {},
    );
    // Generate 999 random 8-character words as the dictionary.
    let mut words = vec![];
    for _n in 1..1000 {
        let tw = generate_random_string(8);
        words.push(tw);
    }
    // Insert every word, encoded as a vector of u16 character codes; the
    // word's position in `words` serves as its id.
    for (i, w) in words.iter().enumerate() {
        let vec: Vec<u16> = w.chars().map(|c| c as u16).collect();
        hns.insert((&vec, i));
        if i % 1000 == 0 {
            println!("Inserting: {:?}", i);
        }
    }
    // Create a sorted vector of ids
    // the ids in the vector will be used as a filter
    let filtered_hns = Hnsw::<u16, DistLevenshtein>::new(
        max_nb_connection,
        nb_elem,
        nb_layer,
        ef_c,
        DistLevenshtein {},
    );
    // Build the restricted index in parallel: it keeps the original ids
    // 300..400 so its results are directly comparable to the filtered search.
    let mut filter_vector: Vec<usize> = Vec::new();
    for i in 300..400 {
        filter_vector.push(i);
        let v: Vec<u16> = words[i].chars().map(|c| c as u16).collect();
        filtered_hns.insert((&v, i));
    }
    //
    let ef_search = 30;
    let tosearch = "abcdefg";
    let knbn = 10;
    let vec_tosearch: Vec<u16> = tosearch.chars().map(|c| c as u16).collect();
    //
    // Way 1: filtered search over the full index.
    println!("========== Search in full hns with filter");
    let vec_res = hns.search_filter(&vec_tosearch, knbn, ef_search, Some(&filter_vector));
    for r in &vec_res {
        println!(
            "Word: {:?} Id: {:?} Distance: {:?}",
            words[r.d_id], r.d_id, r.distance
        );
    }
    //
    // Way 2: unfiltered search over the restricted index.
    println!("========== Search in restricted_hns but without filter");
    //
    let vec: Vec<u16> = tosearch.chars().map(|c| c as u16).collect();
    let res: Vec<Neighbour> = filtered_hns.search(&vec, knbn, ef_search);
    for r in &res {
        println!(
            "Word: {:?} Id: {:?} Distance: {:?}",
            words[r.d_id], r.d_id, r.distance
        );
    }
    //
    // Way 3: search with a closure filter.
    println!("========== Search in full hns with closure filter");
    search_closure_filter(tosearch, &hns, &words, &filter_vector);
    //
    // now with vector filter and estimate recall
    //
    println!("========== Search in full hns with vector filter");
    let filter_vec_res = hns.search_filter(&vec_tosearch, knbn, ef_search, Some(&filter_vector));
    for r in &filter_vec_res {
        println!(
            "Word: {:?} Id: {:?} Distance: {:?}",
            words[r.d_id], r.d_id, r.distance
        );
    }
    // how many neighbours in res are in filter_vec_res; for any id found in
    // both result sets, its reported distance must be identical.
    let mut nb_found: usize = 0;
    for n in &res {
        let found = filter_vec_res.iter().find(|&&m| m.d_id == n.d_id);
        if found.is_some() {
            nb_found += 1;
            assert_eq!(n.distance, found.unwrap().distance);
        }
    }
    println!(" recall : {}", nb_found as f32 / res.len() as f32);
    println!(
        " last distances ratio : {} ",
        res.last().unwrap().distance / filter_vec_res.last().unwrap().distance
    );
}
// A test with random uniform data vectors and L2 distance
// We compare a search of a random vector in hnsw structure with a filter to a filtered_hnsw
// containing only the data fitting the filter
#[test]
fn filter_l2() {
    // Indexes random uniform vectors under L2 distance, then checks that a
    // filtered search of the full index agrees (up to float tolerance) with a
    // plain search of a second index restricted to the filtered ids.
    let nb_elem = 5000;
    let dim = 25;
    // generate nb_elem column vectors of dimension dim
    let mut rng = rand::rng();
    let unif = Uniform::<f32>::new(0., 1.).unwrap();
    let mut data = Vec::with_capacity(nb_elem);
    for _ in 0..nb_elem {
        let column = (0..dim).map(|_| rng.sample(unif)).collect::<Vec<f32>>();
        data.push(column);
    }
    // give an id to each data
    let data_with_id = data.iter().zip(0..data.len()).collect::<Vec<_>>();
    let ef_c = 200;
    let max_nb_connection = 15;
    let nb_layer = 16.min((nb_elem as f32).ln().trunc() as usize);
    let hnsw = Hnsw::<f32, DistL2>::new(max_nb_connection, nb_elem, nb_layer, ef_c, DistL2 {});
    hnsw.parallel_insert(&data_with_id);
    //
    let ef_search = 30;
    let knbn = 10;
    let vec_tosearch = (0..dim).map(|_| rng.sample(unif)).collect::<Vec<f32>>();
    //
    // Create a sorted vector of ids
    // the ids in the vector will be used as a filter
    let filtered_hns =
        Hnsw::<f32, DistL2>::new(max_nb_connection, nb_elem, nb_layer, ef_c, DistL2 {});
    // The restricted index keeps the original ids 300..400 so its results are
    // directly comparable to the filtered search on the full index.
    let mut filter_vector: Vec<usize> = Vec::new();
    for i in 300..400 {
        filter_vector.push(i);
        filtered_hns.insert((&data[i], i));
    }
    //
    println!("========== Search in full hnsw with filter");
    let filter_vec_res = hnsw.search_filter(&vec_tosearch, knbn, ef_search, Some(&filter_vector));
    for r in &filter_vec_res {
        println!("Id: {:?} Distance: {:?}", r.d_id, r.distance);
    }
    //
    println!("========== Search in restricted_hns but without filter");
    let res: Vec<Neighbour> = filtered_hns.search(&vec_tosearch, knbn, ef_search);
    for r in &res {
        println!("Id: {:?} Distance: {:?}", r.d_id, r.distance);
    }
    // how many neighbours in res are in filter_vec_res and what is the distance gap
    let mut nb_found: usize = 0;
    for n in &res {
        let found = filter_vec_res.iter().find(|&&m| m.d_id == n.d_id);
        if found.is_some() {
            nb_found += 1;
            // Same id in both result sets must carry the same distance,
            // up to floating-point tolerance.
            assert!((1. - n.distance / found.unwrap().distance).abs() < 1.0e-5);
        }
    }
    println!(" recall : {}", nb_found as f32 / res.len() as f32);
    println!(
        " last distances ratio : {} ",
        res.last().unwrap().distance / filter_vec_res.last().unwrap().distance
    );
} // end of filter_l2
//
use std::collections::HashMap;
#[test]
fn filter_villsnow() {
    // Regression test (presumably named after the reporter): a filtered
    // search must never return points rejected by the filter, even when very
    // few — or zero — points satisfy it.
    println!("\n\n in test villsnow");
    log_init_test();
    //
    let grid_size = 100;
    let mut hnsw = Hnsw::<f64, DistL2>::new(4, grid_size * grid_size, 16, 100, DistL2::default());
    let mut points = HashMap::new();
    {
        // Insert the cell centers of a grid_size x grid_size grid over the
        // unit square, keeping a side map from id to coordinates.
        for (id, (i, j)) in itertools::iproduct!(0..grid_size, 0..grid_size,).enumerate() {
            let data = [
                (i as f64 + 0.5) / grid_size as f64,
                (j as f64 + 0.5) / grid_size as f64,
            ];
            hnsw.insert((&data, id));
            points.insert(id, data);
        }
        hnsw.set_searching_mode(true);
    }
    {
        println!("first case");
        // first case: the filter only accepts points very close to (1,1)
        // while the query sits at the opposite corner (0,0).
        let filter = |id: &usize| DistL2::default().eval(&points[id], &[1.0, 1.0]) < 1e-2;
        dbg!(points.keys().filter(|x| filter(x)).count()); // -> 1
        let hit = hnsw.search_filter(&[0.0, 0.0], 10, 4, Some(&filter));
        if !hit.is_empty() {
            log::info!("got point : {:?}", points.get(&hit[0].d_id));
            log::info!("got {:?}, must be true", filter(&hit[0].d_id)); // -> sometimes false
        } else {
            log::info!("found no point");
        }
        assert!(hit.len() <= 1);
    }
    {
        println!("second case");
        // second case: a filter that rejects everything must yield no hits.
        let filter = |_id: &usize| false;
        dbg!(points.keys().filter(|x| filter(x)).count()); // -> 0, obviously
        let hit = hnsw.search_filter(&[0.0, 0.0], 10, 64, Some(&filter));
        println!("villsnow , {:?}", hit.len());
        log::info!("got {:?}, must be 0", hit.len()); // -> 1
        assert_eq!(hit.len(), 0);
    }
}

View File

@@ -0,0 +1,328 @@
#![allow(clippy::range_zip_with_len)]
//! some testing utilities.
//! run with to get output statistics : cargo test --release -- --nocapture --test test_parallel.
//! serial test corresponds to random-10nn-euclidean(k=10)
//! parallel test corresponds to random data in 25 dimensions k = 10, dist Cosine
use rand::distr::Uniform;
use rand::prelude::*;
use skiplist::OrderedSkipList;
use anndists::dist;
use hnsw_rs::prelude::*;
use serde::{de::DeserializeOwned, Serialize};
/// Returns a vector of `nbrow` f32 values drawn uniformly from [0, 1).
pub fn gen_random_vector_f32(nbrow: usize) -> Vec<f32> {
    let mut rng = rand::rng();
    let unif = Uniform::<f32>::new(0., 1.).unwrap();
    let mut v = Vec::with_capacity(nbrow);
    for _ in 0..nbrow {
        v.push(rng.sample(unif));
    }
    v
}
/// return nbcolumn vectors of dimension nbrow
/// return nbcolumn vectors of dimension nbrow, entries uniform in [0, 1)
pub fn gen_random_matrix_f32(nbrow: usize, nbcolumn: usize) -> Vec<Vec<f32>> {
    let mut rng = rand::rng();
    let unif = Uniform::<f32>::new(0., 1.).unwrap();
    (0..nbcolumn)
        .map(|_| (0..nbrow).map(|_| rng.sample(unif)).collect::<Vec<f32>>())
        .collect()
}
/// Exhaustively computes the `nb_neighbours` nearest points to `data` among
/// all points indexed in `refdata`, under `distance`.
///
/// Candidates are kept in an ordered skip list so the current worst can be
/// evicted cheaply; the returned list is sorted by increasing distance.
fn brute_force_neighbours<T: Serialize + DeserializeOwned + Copy + Send + Sync>(
    nb_neighbours: usize,
    refdata: &PointIndexation<T>,
    distance: PointDistance<T>,
    data: &[T],
) -> OrderedSkipList<PointIdWithOrder> {
    let mut neighbours = OrderedSkipList::<PointIdWithOrder>::with_capacity(refdata.get_nb_point());
    // Scan every indexed point; a for loop replaces the original manual
    // `while more { if let Some(..) = iter.next() }` pattern.
    for point in refdata {
        let dist_p = distance.eval(data, point.get_v());
        neighbours.insert(PointIdWithOrder::new(point.get_point_id(), dist_p));
        // Keep only the nb_neighbours closest: once over capacity, drop the
        // farthest (the list is ordered, so it sits at the back). This folds
        // the original two branches, both of which performed the insert.
        if neighbours.len() > nb_neighbours {
            neighbours.pop_back();
        }
    }
    neighbours
} // end of brute_force_neighbours
//================================================================================================
mod tests {
use cpu_time::ProcessTime;
use std::time::Duration;
use super::*;
use dist::l2_normalize;
#[test]
fn test_serial() {
    // Builds an hnsw index over random uniform data with L1 distance, then
    // for a batch of random queries compares hnsw search results against a
    // brute-force scan, reporting mean recall and search throughput.
    let nb_elem = 1000;
    let dim = 10;
    let knbn = 10;
    let ef = 20;
    let parallel = true;
    //
    println!("\n\n test_serial nb_elem {:?}", nb_elem);
    //
    let data = gen_random_matrix_f32(dim, nb_elem);
    let data_with_id = data.iter().zip(0..data.len()).collect::<Vec<_>>();
    let ef_c = 400;
    let max_nb_connection = 32;
    let nb_layer = 16.min((nb_elem as f32).ln().trunc() as usize);
    let mut hns = Hnsw::<f32, dist::DistL1>::new(
        max_nb_connection,
        nb_elem,
        nb_layer,
        ef_c,
        dist::DistL1 {},
    );
    hns.set_extend_candidates(true);
    hns.set_keeping_pruned(true);
    let mut start = ProcessTime::now();
    if parallel {
        println!("parallel insertion");
        hns.parallel_insert(&data_with_id);
    } else {
        println!("serial insertion");
        for (i, d) in data.iter().enumerate() {
            hns.insert((d, i));
        }
    }
    let mut cpu_time: Duration = start.elapsed();
    println!(" hnsw serial data insertion {:?}", cpu_time);
    hns.dump_layer_info();
    println!(" hnsw data nb point inserted {:?}", hns.get_nb_point());
    //
    let nbtest = 300;
    let mut recalls = Vec::<usize>::with_capacity(nbtest);
    // One entry per test query — fixed: capacity was nb_elem instead of nbtest.
    let mut nb_returned = Vec::<usize>::with_capacity(nbtest);
    let mut search_times = Vec::<f32>::with_capacity(nbtest);
    // Hoisted out of the query loop: rng and distribution are loop-invariant.
    let mut rng = rand::rng();
    let unif = Uniform::<f32>::new(0., 1.).unwrap();
    for _itest in 0..nbtest {
        // Draw a fresh random query vector.
        let mut r_vec = Vec::<f32>::with_capacity(dim);
        for _ in 0..dim {
            r_vec.push(rng.sample(unif));
        }
        start = ProcessTime::now();
        let brute_neighbours = brute_force_neighbours(
            knbn,
            hns.get_point_indexation(),
            Box::new(dist::DistL1 {}),
            &r_vec,
        );
        cpu_time = start.elapsed();
        // Verbose per-query output only when the test batch is small.
        if nbtest <= 100 {
            println!("\n\n **************** test {:?}", _itest);
            println!("\n brute force neighbours :");
            println!("======================");
            println!(" brute force computing {:?} \n ", cpu_time);
            for i in 0..brute_neighbours.len() {
                let p = brute_neighbours[i].point_id;
                println!(" {:?} {:?} ", p, brute_neighbours[i].dist_to_ref);
            }
        }
        //
        hns.set_searching_mode(true);
        start = ProcessTime::now();
        let knn_neighbours = hns.search(&r_vec, knbn, ef);
        cpu_time = start.elapsed();
        search_times.push(cpu_time.as_micros() as f32);
        if nbtest <= 100 {
            println!("\n\n hnsw searching {:?} \n", cpu_time);
            println!("\n knn neighbours");
            println!("======================");
            for n in &knn_neighbours {
                println!(" {:?} {:?} {:?} ", n.d_id, n.p_id, n.distance);
            }
        }
        // recall: count returned neighbours whose distance is within the
        // brute-force knbn-th (i.e. worst exact) distance.
        let knn_neighbours_dist: Vec<f32> = knn_neighbours.iter().map(|p| p.distance).collect();
        let max_dist = brute_neighbours[knbn - 1].dist_to_ref;
        let recall = knn_neighbours_dist
            .iter()
            .filter(|d| *d <= &max_dist)
            .count();
        if nbtest <= 100 {
            println!("recall {:?}", (recall as f32) / (knbn as f32));
        }
        recalls.push(recall);
        nb_returned.push(knn_neighbours.len());
    } // end on nbtest
    //
    // aggregate recall and timing over all queries
    //
    let mean_recall = (recalls.iter().sum::<usize>() as f32) / ((knbn * recalls.len()) as f32);
    let mean_search_time = (search_times.iter().sum::<f32>()) / (search_times.len() as f32);
    println!(
        "\n mean fraction (of knbn) returned by search {:?} ",
        (nb_returned.iter().sum::<usize>() as f32) / ((nb_returned.len() * knbn) as f32)
    );
    println!(
        "\n nb element {:?} nb search : {:?} recall rate is {:?} search time inverse {:?} ",
        nb_elem,
        nbtest,
        mean_recall,
        1.0e+6_f32 / mean_search_time
    );
} // end test_serial
#[test]
fn test_parallel() {
    // Parallel-insertion counterpart of test_serial: random vectors are
    // l2-normalized and indexed under the dot-product distance, then hnsw
    // search is compared to brute force both by distance and by point id.
    let nb_elem = 1000;
    let dim = 25;
    let knbn = 10;
    let ef_c = 800;
    let max_nb_connection = 48;
    let ef = 20;
    //
    let mut data = gen_random_matrix_f32(dim, nb_elem);
    for v in &mut data {
        l2_normalize(v);
    }
    let data_with_id = data.iter().zip(0..data.len()).collect::<Vec<_>>();
    let nb_layer = 16.min((nb_elem as f32).ln().trunc() as usize);
    let mut hns = Hnsw::<f32, dist::DistDot>::new(
        max_nb_connection,
        nb_elem,
        nb_layer,
        ef_c,
        dist::DistDot {},
    );
    // hns.set_extend_candidates(true);
    let mut start = ProcessTime::now();
    let now = std::time::SystemTime::now();
    // parallel insertion
    hns.parallel_insert(&data_with_id);
    let mut cpu_time: Duration = start.elapsed();
    println!(
        "\n hnsw data parallel insertion cpu time {:?} , system time {:?}",
        cpu_time,
        now.elapsed()
    );
    // one more serial insertion to check mixing insertion modes works
    let mut v = gen_random_vector_f32(dim);
    l2_normalize(&mut v);
    hns.insert((&v, hns.get_nb_point() + 1));
    //
    hns.dump_layer_info();
    println!(" hnsw data nb point inserted {:?}", hns.get_nb_point());
    //
    println!("\n hnsw testing requests ...");
    let nbtest = 100;
    let mut recalls = Vec::<usize>::with_capacity(nbtest);
    let mut recalls_id = Vec::<usize>::with_capacity(nbtest);
    let mut search_times = Vec::<f32>::with_capacity(nbtest);
    // Hoisted out of the query loop: rng and distribution are loop-invariant.
    let mut rng = rand::rng();
    let unif = Uniform::<f32>::new(0., 1.).unwrap();
    for _itest in 0..nbtest {
        // Draw and normalize a fresh random query vector.
        let mut r_vec = Vec::<f32>::with_capacity(dim);
        for _ in 0..dim {
            r_vec.push(rng.sample(unif));
        }
        l2_normalize(&mut r_vec);
        start = ProcessTime::now();
        let brute_neighbours = brute_force_neighbours(
            knbn,
            hns.get_point_indexation(),
            Box::new(dist::DistDot),
            &r_vec,
        );
        cpu_time = start.elapsed();
        if nbtest <= 100 {
            println!("\n\n test_par nb_elem {:?}", nb_elem);
            println!("\n brute force neighbours :");
            println!("======================");
            println!(" brute force computing {:?} \n", cpu_time);
            for i in 0..brute_neighbours.len() {
                println!(
                    " {:?} {:?} ",
                    brute_neighbours[i].point_id, brute_neighbours[i].dist_to_ref
                );
            }
        }
        //
        hns.set_searching_mode(true);
        start = ProcessTime::now();
        let knn_neighbours = hns.search(&r_vec, knbn, ef);
        cpu_time = start.elapsed();
        search_times.push(cpu_time.as_micros() as f32);
        if nbtest <= 100 {
            println!("\n knn neighbours");
            println!("======================");
            println!(" hnsw searching {:?} \n", cpu_time);
            for n in &knn_neighbours {
                println!(" {:?} \t {:?} \t {:?}", n.d_id, n.p_id, n.distance);
            }
        }
        // recall measured with distance balls: count returned neighbours whose
        // distance is within the brute-force knbn-th distance.
        let knn_neighbours_dist: Vec<f32> = knn_neighbours.iter().map(|p| p.distance).collect();
        let max_dist = brute_neighbours[knbn - 1].dist_to_ref;
        let recall = knn_neighbours_dist
            .iter()
            .filter(|d| *d <= &max_dist)
            .count();
        if nbtest <= 100 {
            println!("recall {:?}", (recall as f32) / (knbn as f32));
        }
        recalls.push(recall);
        // recall measured with exact point ids: how many brute-force ids the
        // hnsw result set contains.
        let mut recall_id = 0;
        let mut knn_neighbours_id: Vec<PointId> =
            knn_neighbours.iter().map(|p| p.p_id).collect();
        knn_neighbours_id.sort_unstable();
        let snbn = knbn.min(brute_neighbours.len());
        for j in 0..snbn {
            let to_search = brute_neighbours[j].point_id;
            if knn_neighbours_id.binary_search(&to_search).is_ok() {
                recall_id += 1;
            }
        }
        recalls_id.push(recall_id);
    } // end on nbtest
    //
    // aggregate recall and timing over all queries
    //
    let mean_recall = (recalls.iter().sum::<usize>() as f32) / ((knbn * recalls.len()) as f32);
    let mean_search_time = (search_times.iter().sum::<f32>()) / (search_times.len() as f32);
    println!(
        "\n nb search {:?} recall rate is {:?} search time inverse {:?} ",
        nbtest,
        mean_recall,
        1.0e+6_f32 / mean_search_time
    );
    // Fixed: the id-based mean must be computed from recalls_id; the original
    // summed `recalls` here, so the id-based recall was never actually reported.
    let mean_recall_id =
        (recalls_id.iter().sum::<usize>() as f32) / ((knbn * recalls_id.len()) as f32);
    println!("mean recall rate with point ids {:?}", mean_recall_id);
} // end test_parallel
}