Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'
This commit is contained in:
220
vendor/ruvector/scripts/patches/hnsw_rs/examples/ann-glove25-angular.rs
vendored
Normal file
220
vendor/ruvector/scripts/patches/hnsw_rs/examples/ann-glove25-angular.rs
vendored
Normal file
@@ -0,0 +1,220 @@
|
||||
#![allow(clippy::needless_range_loop)]
|
||||
|
||||
use cpu_time::ProcessTime;
|
||||
use std::time::{Duration, SystemTime};
|
||||
|
||||
// glove 25 // 2.7 Ghz 4 cores 8Mb L3 k = 10
|
||||
// ============================================
|
||||
//
|
||||
// max_nb_conn ef_cons ef_search scale_factor extend keep pruned recall req/s last ratio
|
||||
// 24 800 64 1. 1 0 0.928 4090 1.003
|
||||
// 24 800 64 1. 1 1 0.927 4594 1.003
|
||||
// 24 400, 48 1. 1 0 0.919 6349 1.0044
|
||||
// 24 800 48 1 1 1 0.918 5785 1.005
|
||||
// 24 400 32 1. 0 0 0.898 8662
|
||||
// 24 400 64 1. 1 0 0.930 4711 1.0027
|
||||
// 24 400 64 1. 1 1 0.921 4550 1.0039
|
||||
// 24 1600 48 1 1 0 0.924 5380 1.0034
|
||||
|
||||
// 32 400 48 1 1 0 0.93 4706 1.0026
|
||||
// 32 800 64 1 1 0 0.94 3780. 1.0015
|
||||
// 32 1600 48 1 1 0 0.934 4455 1.0023
|
||||
// 48 1600 48 1 1 0 0.945 3253 1.00098
|
||||
|
||||
// 24 400 48 1 1 0 0.92 6036. 1.0038
|
||||
// 48 800 48 1 1 0 0.935 4018 1.002
|
||||
// 48 800 64 1 1 0 0.942 3091 1.0014
|
||||
// 48 800 64 1 1 1 0.9435 2640 1.00126
|
||||
|
||||
// k = 100
|
||||
|
||||
// 24 800 48 1 1 0 0.96 2432 1.004
|
||||
// 48 800 128 1 1 0 0.979 1626 1.001
|
||||
|
||||
// glove 25 // 8 cores i7 2.3 Ghz 8Mb L3 knbn = 100
|
||||
// ==================================================
|
||||
|
||||
// 48 800 48 1 1 0 0.935 13400 1.002
|
||||
// 48 800 128 1 1 0 0.979 5227 1.002
|
||||
|
||||
// 24 core Core(TM) i9-13900HX simdeez knbn = 10
|
||||
// ==================================================
|
||||
// 48 800 48 1 1 0 0.936 30748 1.002
|
||||
|
||||
// 24 core Core(TM) i9-13900HX simdeez knbn = 100
|
||||
// ==================================================
|
||||
// 48 800 128 1 1 0 0.979 12000 1.002
|
||||
|
||||
// results with scale modification 0.5
|
||||
//====================================
|
||||
|
||||
// 24 core Core(TM) i9-13900HX simdeez knbn = 10
|
||||
// ==================================================
|
||||
// 24 800 48 0.5 1 0 0.931 40700 1.002
|
||||
// 48 800 48 0.5 1 0 0.941 30001 1.001
|
||||
|
||||
// 24 core Core(TM) i9-13900HX simdeez knbn = 100
|
||||
// ==================================================
|
||||
// 24 800 128 0.5 1 0 0.974 16521 1.002
|
||||
// 48 800 128 0.5 1 0 0.985 11484 1.001
|
||||
|
||||
use anndists::dist::*;
|
||||
use hnsw_rs::prelude::*;
|
||||
use log::info;
|
||||
|
||||
mod utils;
|
||||
|
||||
use utils::*;
|
||||
|
||||
pub fn main() {
|
||||
let _ = env_logger::builder().is_test(true).try_init().unwrap();
|
||||
let parallel = true;
|
||||
//
|
||||
let fname = String::from("/home/jpboth/Data/ANN/glove-25-angular.hdf5");
|
||||
println!("\n\n test_load_hdf5 {:?}", fname);
|
||||
// now recall that data are stored in row order.
|
||||
let mut anndata = annhdf5::AnnBenchmarkData::new(fname).unwrap();
|
||||
// pre normalisation to use Dot computations instead of Cosine
|
||||
anndata.do_l2_normalization();
|
||||
// run bench
|
||||
let nb_elem = anndata.train_data.len();
|
||||
let knbn_max = anndata.test_distances.dim().1;
|
||||
info!(
|
||||
"Train size : {}, test size : {}",
|
||||
nb_elem,
|
||||
anndata.test_data.len()
|
||||
);
|
||||
info!("Nb neighbours answers for test data : {} \n\n", knbn_max);
|
||||
//
|
||||
let max_nb_connection = 24;
|
||||
let ef_c = 800;
|
||||
println!(
|
||||
" max_nb_conn : {:?}, ef_construction : {:?} ",
|
||||
max_nb_connection, ef_c
|
||||
);
|
||||
let nb_layer = 16.min((nb_elem as f32).ln().trunc() as usize);
|
||||
println!(
|
||||
" number of elements to insert {:?} , setting max nb layer to {:?} ef_construction {:?}",
|
||||
nb_elem, nb_layer, ef_c
|
||||
);
|
||||
let nb_search = anndata.test_data.len();
|
||||
println!(" number of search {:?}", nb_search);
|
||||
// Hnsw allocation
|
||||
let mut hnsw =
|
||||
Hnsw::<f32, DistDot>::new(max_nb_connection, nb_elem, nb_layer, ef_c, DistDot {});
|
||||
//
|
||||
hnsw.set_extend_candidates(true);
|
||||
hnsw.modify_level_scale(0.5);
|
||||
//
|
||||
// parallel insertion
|
||||
let start = ProcessTime::now();
|
||||
let now = SystemTime::now();
|
||||
let data_for_par_insertion = anndata
|
||||
.train_data
|
||||
.iter()
|
||||
.map(|x| (x.0.as_slice(), x.1))
|
||||
.collect();
|
||||
if parallel {
|
||||
println!(" \n parallel insertion");
|
||||
hnsw.parallel_insert_slice(&data_for_par_insertion);
|
||||
} else {
|
||||
println!(" \n serial insertion");
|
||||
for d in data_for_par_insertion {
|
||||
hnsw.insert_slice(d);
|
||||
}
|
||||
}
|
||||
let cpu_time: Duration = start.elapsed();
|
||||
//
|
||||
println!(
|
||||
"\n hnsw data insertion cpu time {:?} system time {:?} ",
|
||||
cpu_time,
|
||||
now.elapsed()
|
||||
);
|
||||
hnsw.dump_layer_info();
|
||||
println!(" hnsw data nb point inserted {:?}", hnsw.get_nb_point());
|
||||
//
|
||||
// Now the bench with 10 neighbours
|
||||
//
|
||||
let knbn = 10;
|
||||
let ef_search = 48;
|
||||
search(&mut hnsw, knbn, ef_search, &anndata);
|
||||
|
||||
let knbn = 100;
|
||||
let ef_search = 128;
|
||||
search(&mut hnsw, knbn, ef_search, &anndata);
|
||||
}
|
||||
|
||||
pub fn search<Dist>(
|
||||
hnsw: &mut Hnsw<f32, Dist>,
|
||||
knbn: usize,
|
||||
ef_search: usize,
|
||||
anndata: &annhdf5::AnnBenchmarkData,
|
||||
) where
|
||||
Dist: Distance<f32> + Send + Sync,
|
||||
{
|
||||
println!("\n\n ef_search : {:?} knbn : {:?} ", ef_search, knbn);
|
||||
let parallel = true;
|
||||
//
|
||||
let nb_elem = anndata.train_data.len();
|
||||
let nb_search = anndata.test_data.len();
|
||||
//
|
||||
let mut recalls = Vec::<usize>::with_capacity(nb_elem);
|
||||
let mut nb_returned = Vec::<usize>::with_capacity(nb_elem);
|
||||
let mut last_distances_ratio = Vec::<f32>::with_capacity(nb_elem);
|
||||
let mut knn_neighbours_for_tests = Vec::<Vec<Neighbour>>::with_capacity(nb_elem);
|
||||
hnsw.set_searching_mode(true);
|
||||
println!("searching with ef : {:?}", ef_search);
|
||||
let start = ProcessTime::now();
|
||||
let now = SystemTime::now();
|
||||
// search
|
||||
if parallel {
|
||||
println!(" \n parallel search");
|
||||
knn_neighbours_for_tests = hnsw.parallel_search(&anndata.test_data, knbn, ef_search);
|
||||
} else {
|
||||
println!(" \n serial search");
|
||||
for i in 0..anndata.test_data.len() {
|
||||
let knn_neighbours: Vec<Neighbour> =
|
||||
hnsw.search(&anndata.test_data[i], knbn, ef_search);
|
||||
knn_neighbours_for_tests.push(knn_neighbours);
|
||||
}
|
||||
}
|
||||
let cpu_time = start.elapsed();
|
||||
let search_cpu_time = cpu_time.as_micros() as f32;
|
||||
let search_sys_time = now.elapsed().unwrap().as_micros() as f32;
|
||||
println!(
|
||||
"total cpu time for search requests {:?} , system time {:?} ",
|
||||
search_cpu_time,
|
||||
now.elapsed()
|
||||
);
|
||||
// now compute recall rate
|
||||
for i in 0..anndata.test_data.len() {
|
||||
let max_dist = anndata.test_distances.row(i)[knbn - 1];
|
||||
let knn_neighbours_d: Vec<f32> = knn_neighbours_for_tests[i]
|
||||
.iter()
|
||||
.map(|p| p.distance)
|
||||
.collect();
|
||||
nb_returned.push(knn_neighbours_d.len());
|
||||
let recall = knn_neighbours_d.iter().filter(|d| *d <= &max_dist).count();
|
||||
recalls.push(recall);
|
||||
let mut ratio = 0.;
|
||||
if !knn_neighbours_d.is_empty() {
|
||||
ratio = knn_neighbours_d[knn_neighbours_d.len() - 1] / max_dist;
|
||||
}
|
||||
last_distances_ratio.push(ratio);
|
||||
}
|
||||
let mean_recall = (recalls.iter().sum::<usize>() as f32) / ((knbn * recalls.len()) as f32);
|
||||
println!(
|
||||
"\n mean fraction nb returned by search {:?} ",
|
||||
(nb_returned.iter().sum::<usize>() as f32) / ((nb_returned.len() * knbn) as f32)
|
||||
);
|
||||
println!(
|
||||
"\n last distances ratio {:?} ",
|
||||
last_distances_ratio.iter().sum::<f32>() / last_distances_ratio.len() as f32
|
||||
);
|
||||
println!(
|
||||
"\n recall rate for {:?} is {:?} , nb req /s {:?}",
|
||||
anndata.fname,
|
||||
mean_recall,
|
||||
(nb_search as f32) * 1.0e+6_f32 / search_sys_time
|
||||
);
|
||||
}
|
||||
162
vendor/ruvector/scripts/patches/hnsw_rs/examples/ann-mnist-784-euclidean.rs
vendored
Normal file
162
vendor/ruvector/scripts/patches/hnsw_rs/examples/ann-mnist-784-euclidean.rs
vendored
Normal file
@@ -0,0 +1,162 @@
|
||||
#![allow(clippy::needless_range_loop)]
|
||||
|
||||
use cpu_time::ProcessTime;
|
||||
use std::time::{Duration, SystemTime};
|
||||
|
||||
// search in serial mode i7-core @2.7Ghz for 10 first neighbours
|
||||
// max_nb_conn ef_cons ef_search scale_factor extend keep pruned recall req/s last ratio
|
||||
//
|
||||
// 12 400 12 1 0 0 0.917 6486 1.005
|
||||
// 24 400 24 1 1 0 0.9779 3456 1.001
|
||||
|
||||
// parallel mode 4 i7-core @2.7Ghz
|
||||
// max_nb_conn ef_cons ef_search scale_factor extend keep pruned recall req/s last ratio
|
||||
// 24 400 24 1 0 0 0.977 12566 1.001
|
||||
// 24 400 12 1 0 0 0.947 18425 1.003
|
||||
|
||||
// 8 hyperthreaded i7-core @ 2.3 Ghz
|
||||
// 24 400 24 1 0 0 0.977 22197 1.001
|
||||
|
||||
// 24 core Core(TM) i9-13900HX simdeez
|
||||
// 24 400 24 1 0 0 0.977 62000 1.001
|
||||
|
||||
// 24 core Core(TM) i9-13900HX simdeez with modify_level_scale at 0.5
|
||||
// 24 400 24 0.5 0 0 0.990 58722 1.000
|
||||
|
||||
use anndists::dist::*;
|
||||
use hnsw_rs::prelude::*;
|
||||
use log::info;
|
||||
|
||||
mod utils;
|
||||
use utils::*;
|
||||
|
||||
pub fn main() {
|
||||
let mut parallel = true;
|
||||
//
|
||||
let fname = String::from("/home/jpboth/Data/ANN/fashion-mnist-784-euclidean.hdf5");
|
||||
println!("\n\n test_load_hdf5 {:?}", fname);
|
||||
// now recall that data are stored in row order.
|
||||
let anndata = annhdf5::AnnBenchmarkData::new(fname).unwrap();
|
||||
let knbn_max = anndata.test_distances.dim().1;
|
||||
let nb_elem = anndata.train_data.len();
|
||||
info!(
|
||||
"Train size : {}, test size : {}",
|
||||
nb_elem,
|
||||
anndata.test_data.len()
|
||||
);
|
||||
info!("Nb neighbours answers for test data : {}", knbn_max);
|
||||
//
|
||||
let max_nb_connection = 24;
|
||||
let nb_layer = 16.min((nb_elem as f32).ln().trunc() as usize);
|
||||
let ef_c = 400;
|
||||
println!(
|
||||
" number of elements to insert {:?} , setting max nb layer to {:?} ef_construction {:?}",
|
||||
nb_elem, nb_layer, ef_c
|
||||
);
|
||||
println!(
|
||||
" ====================================================================================="
|
||||
);
|
||||
let nb_search = anndata.test_data.len();
|
||||
println!(" number of search {:?}", nb_search);
|
||||
|
||||
let mut hnsw = Hnsw::<f32, DistL2>::new(max_nb_connection, nb_elem, nb_layer, ef_c, DistL2 {});
|
||||
hnsw.set_extend_candidates(false);
|
||||
//
|
||||
hnsw.modify_level_scale(0.25);
|
||||
// parallel insertion
|
||||
let mut start = ProcessTime::now();
|
||||
let mut now = SystemTime::now();
|
||||
let data_for_par_insertion = anndata
|
||||
.train_data
|
||||
.iter()
|
||||
.map(|x| (x.0.as_slice(), x.1))
|
||||
.collect();
|
||||
if parallel {
|
||||
println!(" \n parallel insertion");
|
||||
hnsw.parallel_insert_slice(&data_for_par_insertion);
|
||||
} else {
|
||||
println!(" \n serial insertion");
|
||||
for d in data_for_par_insertion {
|
||||
hnsw.insert_slice(d);
|
||||
}
|
||||
}
|
||||
let mut cpu_time: Duration = start.elapsed();
|
||||
//
|
||||
println!(
|
||||
"\n hnsw data insertion cpu time {:?} system time {:?} ",
|
||||
cpu_time,
|
||||
now.elapsed()
|
||||
);
|
||||
hnsw.dump_layer_info();
|
||||
println!(" hnsw data nb point inserted {:?}", hnsw.get_nb_point());
|
||||
//
|
||||
// Now the bench with 10 neighbours
|
||||
//
|
||||
let mut recalls = Vec::<usize>::with_capacity(nb_elem);
|
||||
let mut nb_returned = Vec::<usize>::with_capacity(nb_elem);
|
||||
let mut last_distances_ratio = Vec::<f32>::with_capacity(nb_elem);
|
||||
let mut knn_neighbours_for_tests = Vec::<Vec<Neighbour>>::with_capacity(nb_elem);
|
||||
hnsw.set_searching_mode(true);
|
||||
let knbn = 10;
|
||||
let ef_c = max_nb_connection;
|
||||
println!("\n searching with ef : {:?}", ef_c);
|
||||
start = ProcessTime::now();
|
||||
now = SystemTime::now();
|
||||
// search
|
||||
parallel = true;
|
||||
if parallel {
|
||||
println!(" \n parallel search");
|
||||
knn_neighbours_for_tests = hnsw.parallel_search(&anndata.test_data, knbn, ef_c);
|
||||
} else {
|
||||
println!(" \n serial search");
|
||||
for i in 0..anndata.test_data.len() {
|
||||
let knn_neighbours: Vec<Neighbour> = hnsw.search(&anndata.test_data[i], knbn, ef_c);
|
||||
knn_neighbours_for_tests.push(knn_neighbours);
|
||||
}
|
||||
}
|
||||
cpu_time = start.elapsed();
|
||||
let search_sys_time = now.elapsed().unwrap().as_micros() as f32;
|
||||
let search_cpu_time = cpu_time.as_micros() as f32;
|
||||
println!(
|
||||
"total cpu time for search requests {:?} , system time {:?} ",
|
||||
search_cpu_time, search_sys_time
|
||||
);
|
||||
// now compute recall rate
|
||||
for i in 0..anndata.test_data.len() {
|
||||
let true_distances = anndata.test_distances.row(i);
|
||||
let max_dist = true_distances[knbn - 1];
|
||||
let mut _knn_neighbours_id: Vec<usize> =
|
||||
knn_neighbours_for_tests[i].iter().map(|p| p.d_id).collect();
|
||||
let knn_neighbours_dist: Vec<f32> = knn_neighbours_for_tests[i]
|
||||
.iter()
|
||||
.map(|p| p.distance)
|
||||
.collect();
|
||||
nb_returned.push(knn_neighbours_dist.len());
|
||||
// count how many distances of knn_neighbours_dist are less than
|
||||
let recall = knn_neighbours_dist
|
||||
.iter()
|
||||
.filter(|x| *x <= &max_dist)
|
||||
.count();
|
||||
recalls.push(recall);
|
||||
let mut ratio = 0.;
|
||||
if !knn_neighbours_dist.is_empty() {
|
||||
ratio = knn_neighbours_dist[knn_neighbours_dist.len() - 1] / max_dist;
|
||||
}
|
||||
last_distances_ratio.push(ratio);
|
||||
}
|
||||
let mean_recall = (recalls.iter().sum::<usize>() as f32) / ((knbn * recalls.len()) as f32);
|
||||
println!(
|
||||
"\n mean fraction nb returned by search {:?} ",
|
||||
(nb_returned.iter().sum::<usize>() as f32) / ((nb_returned.len() * knbn) as f32)
|
||||
);
|
||||
println!(
|
||||
"\n last distances ratio {:?} ",
|
||||
last_distances_ratio.iter().sum::<f32>() / last_distances_ratio.len() as f32
|
||||
);
|
||||
println!(
|
||||
"\n recall rate for {:?} is {:?} , nb req /s {:?}",
|
||||
anndata.fname,
|
||||
mean_recall,
|
||||
(nb_search as f32) * 1.0e+6_f32 / search_sys_time
|
||||
);
|
||||
}
|
||||
196
vendor/ruvector/scripts/patches/hnsw_rs/examples/ann-sift1m-128-euclidean.rs
vendored
Normal file
196
vendor/ruvector/scripts/patches/hnsw_rs/examples/ann-sift1m-128-euclidean.rs
vendored
Normal file
@@ -0,0 +1,196 @@
|
||||
#![allow(clippy::needless_range_loop)]
|
||||
|
||||
use cpu_time::ProcessTime;
|
||||
use env_logger::Builder;
|
||||
use std::time::{Duration, SystemTime};
|
||||
|
||||
use anndists::dist::*;
|
||||
use log::info;
|
||||
|
||||
// search in parallel mode 8 core i7-10875H @2.3Ghz time 100 neighbours
|
||||
|
||||
// max_nb_conn ef_cons ef_search scale_factor extend keep pruned recall req/s last ratio
|
||||
//
|
||||
// 64 800 64 1 0 0 0.976 4894 1.001
|
||||
// 64 800 128 1 0 0 0.985 3811 1.00064
|
||||
// 64 800 128 1 1 0 0.9854 3765 1.0
|
||||
|
||||
// 64 1600 64 1 0 0 0.9877 3419. 1.0005
|
||||
|
||||
// search in parallel mode 8 core i7-10875H @2.3Ghz time for 10 neighbours
|
||||
|
||||
// 64 1600 64 1 0 0 0.9907 6100 1.0004
|
||||
// 64 1600 128 1 0 0 0.9959 3077. 1.0001
|
||||
|
||||
// 24 core Core(TM) i9-13900HX simdeez
|
||||
|
||||
// 64 1600 64 1 0 0 0.9907 15258 1.0004
|
||||
// 64 1600 128 1 0 0 0.9957 8296 1.0002
|
||||
|
||||
// 24 core Core(TM) i9-13900HX simdeez with level scale modification factor 0.5
|
||||
//=============================================================================
|
||||
|
||||
// 48 1600 64 0.5 0 0 0.9938 14073 1.0002
|
||||
// 48 1600 128 0.5 0 0 0.9992 7906 1.0000
|
||||
|
||||
// with an AMD ryzen 9 7950X 16-Core simdeez with level scale modification factor 0.5
|
||||
//=============================================================================
|
||||
// 48 1600 64 0.5 0 0 0.9938 17000 1.0002
|
||||
// 48 1600 128 0.5 0 0 0.9992 9600 1.0000
|
||||
|
||||
use hnsw_rs::prelude::*;
|
||||
|
||||
mod utils;
|
||||
use utils::*;
|
||||
|
||||
pub fn main() {
|
||||
//
|
||||
Builder::from_default_env().init();
|
||||
//
|
||||
let parallel = true;
|
||||
//
|
||||
let fname = String::from("/home/jpboth/Data/ANN/sift1m-128-euclidean.hdf5");
|
||||
println!("\n\n test_load_hdf5 {:?}", fname);
|
||||
// now recall that data are stored in row order.
|
||||
let anndata = annhdf5::AnnBenchmarkData::new(fname).unwrap();
|
||||
// run bench
|
||||
let knbn_max = anndata.test_distances.dim().1;
|
||||
let nb_elem = anndata.train_data.len();
|
||||
info!(
|
||||
" train size : {}, test size : {}",
|
||||
nb_elem,
|
||||
anndata.test_data.len()
|
||||
);
|
||||
info!(" nb neighbours answers for test data : {}", knbn_max);
|
||||
//
|
||||
let max_nb_connection = 48;
|
||||
let nb_layer = 16.min((nb_elem as f32).ln().trunc() as usize);
|
||||
let ef_c = 1600;
|
||||
//
|
||||
println!(
|
||||
" number of elements to insert {:?} , setting max nb layer to {:?} ef_construction {:?}",
|
||||
nb_elem, nb_layer, ef_c
|
||||
);
|
||||
println!(
|
||||
" ====================================================================================="
|
||||
);
|
||||
//
|
||||
let mut hnsw = Hnsw::<f32, DistL2>::new(max_nb_connection, nb_elem, nb_layer, ef_c, DistL2 {});
|
||||
//
|
||||
let extend_flag = false;
|
||||
info!("extend flag = {:?} ", extend_flag);
|
||||
hnsw.set_extend_candidates(extend_flag);
|
||||
hnsw.modify_level_scale(0.5);
|
||||
//
|
||||
// parallel insertion
|
||||
let start = ProcessTime::now();
|
||||
let now = SystemTime::now();
|
||||
let data_for_par_insertion = anndata
|
||||
.train_data
|
||||
.iter()
|
||||
.map(|x| (x.0.as_slice(), x.1))
|
||||
.collect();
|
||||
if parallel {
|
||||
println!(" \n parallel insertion");
|
||||
hnsw.parallel_insert_slice(&data_for_par_insertion);
|
||||
} else {
|
||||
println!(" \n serial insertion");
|
||||
for d in data_for_par_insertion {
|
||||
hnsw.insert_slice(d);
|
||||
}
|
||||
}
|
||||
let cpu_time: Duration = start.elapsed();
|
||||
//
|
||||
println!(
|
||||
"\n hnsw data insertion cpu time {:?} system time {:?} ",
|
||||
cpu_time,
|
||||
now.elapsed()
|
||||
);
|
||||
hnsw.dump_layer_info();
|
||||
println!(" hnsw data nb point inserted {:?}", hnsw.get_nb_point());
|
||||
//
|
||||
//
|
||||
let knbn = 10.min(knbn_max);
|
||||
let ef_search = 64;
|
||||
println!("searching with ef = {}", ef_search);
|
||||
search(&mut hnsw, knbn, ef_search, &anndata);
|
||||
//
|
||||
println!("searching with ef = {}", ef_search);
|
||||
let ef_search = 128;
|
||||
search(&mut hnsw, knbn, ef_search, &anndata);
|
||||
}
|
||||
|
||||
pub fn search<Dist>(
|
||||
hnsw: &mut Hnsw<f32, Dist>,
|
||||
knbn: usize,
|
||||
ef_search: usize,
|
||||
anndata: &annhdf5::AnnBenchmarkData,
|
||||
) where
|
||||
Dist: Distance<f32> + Send + Sync,
|
||||
{
|
||||
println!("\n\n ef_search : {:?} knbn : {:?} ", ef_search, knbn);
|
||||
let parallel = true;
|
||||
//
|
||||
let nb_elem = anndata.train_data.len();
|
||||
let nb_search = anndata.test_data.len();
|
||||
//
|
||||
let mut recalls = Vec::<usize>::with_capacity(nb_elem);
|
||||
let mut nb_returned = Vec::<usize>::with_capacity(nb_elem);
|
||||
let mut last_distances_ratio = Vec::<f32>::with_capacity(nb_elem);
|
||||
let mut knn_neighbours_for_tests = Vec::<Vec<Neighbour>>::with_capacity(nb_elem);
|
||||
hnsw.set_searching_mode(true);
|
||||
println!("searching with ef : {:?}", ef_search);
|
||||
let start = ProcessTime::now();
|
||||
let now = SystemTime::now();
|
||||
// search
|
||||
if parallel {
|
||||
println!(" \n parallel search");
|
||||
knn_neighbours_for_tests = hnsw.parallel_search(&anndata.test_data, knbn, ef_search);
|
||||
} else {
|
||||
println!(" \n serial search");
|
||||
for i in 0..anndata.test_data.len() {
|
||||
let knn_neighbours: Vec<Neighbour> =
|
||||
hnsw.search(&anndata.test_data[i], knbn, ef_search);
|
||||
knn_neighbours_for_tests.push(knn_neighbours);
|
||||
}
|
||||
}
|
||||
let cpu_time = start.elapsed();
|
||||
let search_cpu_time = cpu_time.as_micros() as f32;
|
||||
let search_sys_time = now.elapsed().unwrap().as_micros() as f32;
|
||||
println!(
|
||||
"total cpu time for search requests {:?} , system time {:?} ",
|
||||
search_cpu_time,
|
||||
now.elapsed()
|
||||
);
|
||||
// now compute recall rate
|
||||
for i in 0..anndata.test_data.len() {
|
||||
let max_dist = anndata.test_distances.row(i)[knbn - 1];
|
||||
let knn_neighbours_d: Vec<f32> = knn_neighbours_for_tests[i]
|
||||
.iter()
|
||||
.map(|p| p.distance)
|
||||
.collect();
|
||||
nb_returned.push(knn_neighbours_d.len());
|
||||
let recall = knn_neighbours_d.iter().filter(|d| *d <= &max_dist).count();
|
||||
recalls.push(recall);
|
||||
let mut ratio = 0.;
|
||||
if !knn_neighbours_d.is_empty() {
|
||||
ratio = knn_neighbours_d[knn_neighbours_d.len() - 1] / max_dist;
|
||||
}
|
||||
last_distances_ratio.push(ratio);
|
||||
}
|
||||
let mean_recall = (recalls.iter().sum::<usize>() as f32) / ((knbn * recalls.len()) as f32);
|
||||
println!(
|
||||
"\n mean fraction nb returned by search {:?} ",
|
||||
(nb_returned.iter().sum::<usize>() as f32) / ((nb_returned.len() * knbn) as f32)
|
||||
);
|
||||
println!(
|
||||
"\n last distances ratio {:?} ",
|
||||
last_distances_ratio.iter().sum::<f32>() / last_distances_ratio.len() as f32
|
||||
);
|
||||
println!(
|
||||
"\n recall rate for {:?} is {:?} , nb req /s {:?}",
|
||||
anndata.fname,
|
||||
mean_recall,
|
||||
(nb_search as f32) * 1.0e+6_f32 / search_sys_time
|
||||
);
|
||||
} // end of search
|
||||
63
vendor/ruvector/scripts/patches/hnsw_rs/examples/levensthein.rs
vendored
Normal file
63
vendor/ruvector/scripts/patches/hnsw_rs/examples/levensthein.rs
vendored
Normal file
@@ -0,0 +1,63 @@
|
||||
use anndists::dist::*;
|
||||
|
||||
use hnsw_rs::prelude::*;
|
||||
use rand::Rng;
|
||||
use std::iter;
|
||||
|
||||
fn generate(len: usize) -> String {
|
||||
const CHARSET: &[u8] = b"abcdefghij";
|
||||
let mut rng = rand::rng();
|
||||
let one_char = || CHARSET[rng.random_range(0..CHARSET.len())] as char;
|
||||
iter::repeat_with(one_char).take(len).collect()
|
||||
}
|
||||
|
||||
fn main() {
|
||||
let nb_elem = 500000; // number of possible words in the dictionary
|
||||
let max_nb_connection = 15;
|
||||
let nb_layer = 16.min((nb_elem as f32).ln().trunc() as usize);
|
||||
let ef_c = 200;
|
||||
let nb_words = 1000;
|
||||
let hns = Hnsw::<u16, DistLevenshtein>::new(
|
||||
max_nb_connection,
|
||||
nb_elem,
|
||||
nb_layer,
|
||||
ef_c,
|
||||
DistLevenshtein {},
|
||||
);
|
||||
let mut words = vec![];
|
||||
for _n in 1..nb_words {
|
||||
let tw = generate(5);
|
||||
words.push(tw);
|
||||
}
|
||||
words.push(String::from("abcdj"));
|
||||
//
|
||||
for (i, w) in words.iter().enumerate() {
|
||||
let vec: Vec<u16> = w.chars().map(|c| c as u16).collect();
|
||||
hns.insert((&vec, i));
|
||||
}
|
||||
// create a filter
|
||||
let mut filter: Vec<usize> = Vec::new();
|
||||
for i in 1..100 {
|
||||
filter.push(i);
|
||||
}
|
||||
//
|
||||
let ef_search: usize = 30;
|
||||
let tosearch: Vec<u16> = "abcde".chars().map(|c| c as u16).collect();
|
||||
//
|
||||
println!("========== search with filter ");
|
||||
let res = hns.search_filter(&tosearch, 10, ef_search, Some(&filter));
|
||||
for r in res {
|
||||
println!(
|
||||
"Word: {:?} Id: {:?} Distance: {:?}",
|
||||
words[r.d_id], r.d_id, r.distance
|
||||
);
|
||||
}
|
||||
println!("========== search without filter ");
|
||||
let res3 = hns.search(&tosearch, 10, ef_search);
|
||||
for r in res3 {
|
||||
println!(
|
||||
"Word: {:?} Id: {:?} Distance: {:?}",
|
||||
words[r.d_id], r.d_id, r.distance
|
||||
);
|
||||
}
|
||||
}
|
||||
80
vendor/ruvector/scripts/patches/hnsw_rs/examples/random.rs
vendored
Normal file
80
vendor/ruvector/scripts/patches/hnsw_rs/examples/random.rs
vendored
Normal file
@@ -0,0 +1,80 @@
|
||||
#![allow(clippy::needless_range_loop)]
|
||||
#![allow(clippy::range_zip_with_len)]
|
||||
|
||||
use cpu_time::ProcessTime;
|
||||
use rand::distr::Uniform;
|
||||
use rand::prelude::*;
|
||||
use std::time::{Duration, SystemTime};
|
||||
|
||||
use anndists::dist::*;
|
||||
use hnsw_rs::prelude::*;
|
||||
|
||||
fn main() {
|
||||
env_logger::Builder::from_default_env().init();
|
||||
//
|
||||
let nb_elem = 500000;
|
||||
let dim = 25;
|
||||
// generate nb_elem colmuns vectors of dimension dim
|
||||
let mut rng = rand::rng();
|
||||
let unif = rand::distr::StandardUniform;
|
||||
let mut data = Vec::with_capacity(nb_elem);
|
||||
for _ in 0..nb_elem {
|
||||
let column = (0..dim).map(|_| rng.sample(unif)).collect::<Vec<f32>>();
|
||||
data.push(column);
|
||||
}
|
||||
// give an id to each data
|
||||
let data_with_id = data.iter().zip(0..data.len()).collect::<Vec<_>>();
|
||||
|
||||
let ef_c = 200;
|
||||
let max_nb_connection = 15;
|
||||
let nb_layer = 16.min((nb_elem as f32).ln().trunc() as usize);
|
||||
let hns = Hnsw::<f32, DistL2>::new(max_nb_connection, nb_elem, nb_layer, ef_c, DistL2 {});
|
||||
let mut start = ProcessTime::now();
|
||||
let mut begin_t = SystemTime::now();
|
||||
hns.parallel_insert(&data_with_id);
|
||||
let mut cpu_time: Duration = start.elapsed();
|
||||
println!(" hnsw data insertion cpu time {:?}", cpu_time);
|
||||
println!(
|
||||
" hnsw data insertion parallel, system time {:?} \n",
|
||||
begin_t.elapsed().unwrap()
|
||||
);
|
||||
hns.dump_layer_info();
|
||||
println!(
|
||||
" parallel hnsw data nb point inserted {:?}",
|
||||
hns.get_nb_point()
|
||||
);
|
||||
//
|
||||
// serial insertion
|
||||
//
|
||||
let hns = Hnsw::<f32, DistL2>::new(max_nb_connection, nb_elem, nb_layer, ef_c, DistL2 {});
|
||||
start = ProcessTime::now();
|
||||
begin_t = SystemTime::now();
|
||||
for _i in 0..data_with_id.len() {
|
||||
hns.insert((data_with_id[_i].0.as_slice(), data_with_id[_i].1))
|
||||
}
|
||||
cpu_time = start.elapsed();
|
||||
println!("\n\n serial hnsw data insertion {:?}", cpu_time);
|
||||
println!(
|
||||
" hnsw data insertion serial, system time {:?}",
|
||||
begin_t.elapsed().unwrap()
|
||||
);
|
||||
hns.dump_layer_info();
|
||||
println!(
|
||||
" serial hnsw data nb point inserted {:?}",
|
||||
hns.get_nb_point()
|
||||
);
|
||||
|
||||
let ef_search = max_nb_connection * 2;
|
||||
let knbn = 10;
|
||||
//
|
||||
for _iter in 0..100 {
|
||||
let mut r_vec = Vec::<f32>::with_capacity(dim);
|
||||
let mut rng = rand::rng();
|
||||
let unif = Uniform::<f32>::new(0., 1.).unwrap();
|
||||
for _ in 0..dim {
|
||||
r_vec.push(rng.sample(unif));
|
||||
}
|
||||
//
|
||||
let _neighbours = hns.search(&r_vec, knbn, ef_search);
|
||||
}
|
||||
}
|
||||
233
vendor/ruvector/scripts/patches/hnsw_rs/examples/utils/annhdf5.rs
vendored
Normal file
233
vendor/ruvector/scripts/patches/hnsw_rs/examples/utils/annhdf5.rs
vendored
Normal file
@@ -0,0 +1,233 @@
|
||||
//! This file provides hdf5 utilities to load ann-benchmarks hdf5 data files
|
||||
//! As the library does not depend on hdf5 nor on ndarray, it is nearly the same for both
|
||||
//! ann benchmarks.
|
||||
|
||||
use ndarray::Array2;
|
||||
|
||||
use ::hdf5::*;
|
||||
use log::debug;
|
||||
|
||||
// datasets
|
||||
// . distances (nbobjects, nbnearest) f32 matrix giving, for each test object, the distances to its nearest neighbors
|
||||
// . neighbors (nbobjects, nbnearest) int32 matrix giving the num of nearest neighbors in train data
|
||||
// . test (nbobjects, dim) f32 matrix test data
|
||||
// . train (nbobjects, dim) f32 matrix train data
|
||||
|
||||
/// a structure to load hdf5 data file benchmarks from https://github.com/erikbern/ann-benchmarks
|
||||
pub struct AnnBenchmarkData {
|
||||
pub fname: String,
|
||||
/// distances from each test object to its nearest neighbours.
|
||||
pub test_distances: Array2<f32>,
|
||||
// for each test data , id of its nearest neighbours
|
||||
#[allow(unused)]
|
||||
pub test_neighbours: Array2<i32>,
|
||||
/// list of vectors for which we will search ann.
|
||||
pub test_data: Vec<Vec<f32>>,
|
||||
/// list of data vectors and id
|
||||
pub train_data: Vec<(Vec<f32>, usize)>,
|
||||
/// searched results. first neighbours for each test data.
|
||||
#[allow(unused)]
|
||||
pub searched_neighbours: Vec<Vec<i32>>,
|
||||
/// distances of neighbours obtained of each test
|
||||
#[allow(unused)]
|
||||
pub searched_distances: Vec<Vec<f32>>,
|
||||
}
|
||||
|
||||
impl AnnBenchmarkData {
|
||||
pub fn new(fname: String) -> Result<AnnBenchmarkData> {
|
||||
let res = hdf5::File::open(fname.clone());
|
||||
if res.is_err() {
|
||||
println!("you must download file {:?}", fname);
|
||||
panic!(
|
||||
"download benchmark file some where and modify examples source file accordingly"
|
||||
);
|
||||
}
|
||||
let file = res.ok().unwrap();
|
||||
//
|
||||
// get test distances
|
||||
//
|
||||
let res_distances = file.dataset("distances");
|
||||
if res_distances.is_err() {
|
||||
// let reader = hdf5::Reader::<f32>::new(&test_distance);
|
||||
panic!("error getting distances dataset");
|
||||
}
|
||||
let distances = res_distances.unwrap();
|
||||
let shape = distances.shape();
|
||||
assert_eq!(shape.len(), 2);
|
||||
let dataf32 = distances.dtype().unwrap().is::<f32>();
|
||||
if !dataf32 {
|
||||
// error
|
||||
panic!("error getting type distances dataset");
|
||||
}
|
||||
// read really data
|
||||
let res = distances.read_2d::<f32>();
|
||||
if res.is_err() {
|
||||
// some error
|
||||
panic!("error reading distances dataset");
|
||||
}
|
||||
let test_distances = res.unwrap();
|
||||
// a check for row order
|
||||
debug!(
|
||||
"First 2 distances for first test {:?} {:?} ",
|
||||
test_distances.get((0, 0)).unwrap(),
|
||||
test_distances.get((0, 1)).unwrap()
|
||||
);
|
||||
//
|
||||
// read neighbours
|
||||
//
|
||||
let res_neighbours = file.dataset("neighbors");
|
||||
if res_neighbours.is_err() {
|
||||
// let reader = hdf5::Reader::<f32>::new(&test_distance);
|
||||
panic!("error getting neighbours");
|
||||
}
|
||||
let neighbours = res_neighbours.unwrap();
|
||||
let shape = neighbours.shape();
|
||||
assert_eq!(shape.len(), 2);
|
||||
println!("neighbours shape : {:?}", shape);
|
||||
let datai32 = neighbours.dtype().unwrap().is::<i32>();
|
||||
if !datai32 {
|
||||
// error
|
||||
panic!("error getting type neighbours");
|
||||
}
|
||||
// read really data
|
||||
let res = neighbours.read_2d::<i32>();
|
||||
if res.is_err() {
|
||||
// some error
|
||||
panic!("error reading neighbours dataset");
|
||||
}
|
||||
let test_neighbours = res.unwrap();
|
||||
debug!(
|
||||
"First 2 neighbours for first test {:?} {:?} ",
|
||||
test_neighbours.get((0, 0)).unwrap(),
|
||||
test_neighbours.get((0, 1)).unwrap()
|
||||
);
|
||||
println!("\n 10 first neighbours for first vector : ");
|
||||
for i in 0..10 {
|
||||
print!(" {:?} ", test_neighbours.get((0, i)).unwrap());
|
||||
}
|
||||
println!("\n 10 first neighbours for second vector : ");
|
||||
for i in 0..10 {
|
||||
print!(" {:?} ", test_neighbours.get((1, i)).unwrap());
|
||||
}
|
||||
//
|
||||
// read test data
|
||||
// ===============
|
||||
//
|
||||
let res_testdata = file.dataset("test");
|
||||
if res_testdata.is_err() {
|
||||
panic!("error getting test de notataset");
|
||||
}
|
||||
let test_data = res_testdata.unwrap();
|
||||
let shape = test_data.shape(); // nota shape returns a slice, dim returns a t-uple
|
||||
assert_eq!(shape.len(), 2);
|
||||
let dataf32 = test_data.dtype().unwrap().is::<f32>();
|
||||
if !dataf32 {
|
||||
panic!("error getting type de notistances dataset");
|
||||
}
|
||||
// read really datae not
|
||||
let res = test_data.read_2d::<f32>();
|
||||
if res.is_err() {
|
||||
// some error
|
||||
panic!("error reading distances dataset");
|
||||
}
|
||||
let test_data_2d = res.unwrap();
|
||||
let mut test_data = Vec::<Vec<f32>>::with_capacity(shape[1]);
|
||||
let (nbrow, nbcolumn) = test_data_2d.dim();
|
||||
println!(" test data, nb element {:?}, dim : {:?}", nbrow, nbcolumn);
|
||||
for i in 0..nbrow {
|
||||
let mut vec = Vec::with_capacity(nbcolumn);
|
||||
for j in 0..nbcolumn {
|
||||
vec.push(*test_data_2d.get((i, j)).unwrap());
|
||||
}
|
||||
test_data.push(vec);
|
||||
}
|
||||
//
|
||||
// loaf train data
|
||||
//
|
||||
let res_traindata = file.dataset("train");
|
||||
if res_traindata.is_err() {
|
||||
panic!("error getting distances dataset");
|
||||
}
|
||||
let train_data = res_traindata.unwrap();
|
||||
let train_shape = train_data.shape();
|
||||
assert_eq!(shape.len(), 2);
|
||||
if test_data_2d.dim().1 != train_shape[1] {
|
||||
println!("test and train have not the same dimension");
|
||||
panic!();
|
||||
}
|
||||
println!(
|
||||
"\n train data shape : {:?}, nbvector {:?} ",
|
||||
train_shape, train_shape[0]
|
||||
);
|
||||
let dataf32 = train_data.dtype().unwrap().is::<f32>();
|
||||
if !dataf32 {
|
||||
// error
|
||||
panic!("error getting type distances dataset");
|
||||
}
|
||||
// read really data
|
||||
let res = train_data.read_2d::<f32>();
|
||||
if res.is_err() {
|
||||
// some error
|
||||
panic!("error reading distances dataset");
|
||||
}
|
||||
let train_data_2d = res.unwrap();
|
||||
let mut train_data = Vec::<(Vec<f32>, usize)>::with_capacity(shape[1]);
|
||||
let (nbrow, nbcolumn) = train_data_2d.dim();
|
||||
for i in 0..nbrow {
|
||||
let mut vec = Vec::with_capacity(nbcolumn);
|
||||
for j in 0..nbcolumn {
|
||||
vec.push(*train_data_2d.get((i, j)).unwrap());
|
||||
}
|
||||
train_data.push((vec, i));
|
||||
}
|
||||
//
|
||||
// now allocate array's for result
|
||||
//
|
||||
println!(
|
||||
" allocating vector for search neighbours answer : {:?}",
|
||||
test_data.len()
|
||||
);
|
||||
let searched_neighbours = Vec::<Vec<i32>>::with_capacity(test_data.len());
|
||||
let searched_distances = Vec::<Vec<f32>>::with_capacity(test_data.len());
|
||||
// searched_distances
|
||||
Ok(AnnBenchmarkData {
|
||||
fname: fname.clone(),
|
||||
test_distances,
|
||||
test_neighbours,
|
||||
test_data,
|
||||
train_data,
|
||||
searched_neighbours,
|
||||
searched_distances,
|
||||
})
|
||||
} // end new
|
||||
|
||||
/// do l2 normalisation of test and train vector to use DistDot metrinc instead DistCosine to spare cpu
|
||||
#[allow(unused)]
|
||||
pub fn do_l2_normalization(&mut self) {
|
||||
for i in 0..self.test_data.len() {
|
||||
anndists::dist::l2_normalize(&mut self.test_data[i]);
|
||||
}
|
||||
for i in 0..self.train_data.len() {
|
||||
anndists::dist::l2_normalize(&mut self.train_data[i].0);
|
||||
}
|
||||
} // end of do_l2_normalization
|
||||
} // end of impl block
|
||||
|
||||
#[cfg(test)]
mod tests {

    use super::*;

    /// Checks that a benchmark file can be loaded. The file is an external download
    /// expected at a machine-specific path, so the test only runs on explicit request
    /// (`cargo test -- --ignored`).
    #[test]
    #[ignore = "requires the glove-25-angular.hdf5 benchmark file at a machine-specific path"]
    fn test_load_hdf5() {
        // try_init : do not panic if another test already installed a logger
        let _ = env_logger::builder().is_test(true).try_init();
        //
        let fname = String::from("/home.2/Data/ANN/glove-25-angular.hdf5");
        println!("\n\n test_load_hdf5 {:?}", fname);
        // now recall that data are stored in row order.
        let _anndata = AnnBenchmarkData::new(fname).unwrap();
        //
    } // end of test_load_hdf5
} // end of module test
|
||||
3
vendor/ruvector/scripts/patches/hnsw_rs/examples/utils/mod.rs
vendored
Normal file
3
vendor/ruvector/scripts/patches/hnsw_rs/examples/utils/mod.rs
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
//! hdf5 utilities for examples
|
||||
|
||||
pub mod annhdf5;
|
||||
Reference in New Issue
Block a user