Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'

This commit is contained in:
ruv
2026-02-28 14:39:40 -05:00
7854 changed files with 3522914 additions and 0 deletions

View File

@@ -0,0 +1,87 @@
//! Api for external language.
//! This file provides a trait to be used as an opaque pointer for C or Julia calls used in file libext.rs
use std::path::Path;
use serde::{de::DeserializeOwned, Serialize};
use crate::hnsw::*;
use crate::hnswio::*;
use anndists::dist::distances::Distance;
use log::info;
/// Trait used as an opaque interface for external-language (C / Julia) bindings, see libext.rs.
pub trait AnnT {
    /// type of the components of data vectors (f32, u32, ...)
    type Val;
    /// Inserts one data vector with its caller-supplied id.
    fn insert_data(&mut self, data: &[Self::Val], id: usize);
    /// Searches the `knbn` nearest neighbours of `data` with search parameter `ef_s`.
    fn search_neighbours(&self, data: &[Self::Val], knbn: usize, ef_s: usize) -> Vec<Neighbour>;
    /// Parallel insertion of a batch of (vector, id) pairs.
    fn parallel_insert_data(&mut self, data: &[(&Vec<Self::Val>, usize)]);
    /// Parallel search: returns one Vec<Neighbour> per request vector, in request order.
    fn parallel_search_neighbours(
        &self,
        data: &[Vec<Self::Val>],
        knbn: usize,
        ef_s: usize,
    ) -> Vec<Vec<Neighbour>>;
    ///
    /// Dumps data and graph in 2 files.
    /// Data are dumped in file filename.hnsw.data and graph in filename.hnsw.graph
    ///
    /// **We do not overwrite old files if they are currently in use by memory map**
    /// If these files already exist, they are not overwritten and a unique filename is generated by concatenating a random number to filename.
    /// The function returns the basename used for the dump
    fn file_dump(&self, path: &Path, file_basename: &str) -> anyhow::Result<String>;
}
impl<T, D> AnnT for Hnsw<'_, T, D>
where
T: Serialize + DeserializeOwned + Clone + Send + Sync,
D: Distance<T> + Send + Sync,
{
type Val = T;
//
fn insert_data(&mut self, data: &[Self::Val], id: usize) {
self.insert((data, id));
}
//
fn search_neighbours(&self, data: &[T], knbn: usize, ef_s: usize) -> Vec<Neighbour> {
self.search(data, knbn, ef_s)
}
fn parallel_insert_data(&mut self, data: &[(&Vec<Self::Val>, usize)]) {
self.parallel_insert(data);
}
fn parallel_search_neighbours(
&self,
data: &[Vec<Self::Val>],
knbn: usize,
ef_s: usize,
) -> Vec<Vec<Neighbour>> {
self.parallel_search(data, knbn, ef_s)
}
// The main entry point to do a dump.
// It will generate two files one for the graph part of the data. The other for the real data points of the structure.
// The names of file are $filename.hnsw.graph for the graph and $filename.hnsw.data.
fn file_dump(&self, path: &Path, file_basename: &str) -> anyhow::Result<String> {
info!("In Hnsw::file_dump");
//
// do not overwrite if mmap is active
let overwrite = !self.get_datamap_opt();
let mut dumpinit = DumpInit::new(path, file_basename, overwrite);
let dumpname = dumpinit.get_basename().clone();
//
let res = self.dump(DumpMode::Full, &mut dumpinit);
//
dumpinit.flush()?;
info!("\n End of dump, file basename : {}\n", &dumpname);
if res.is_ok() {
Ok(dumpname)
} else {
Err(anyhow::anyhow!("unexpected error"))
}
} // end of dump
} // end of impl block AnnT for Hnsw<T,D>

View File

@@ -0,0 +1,457 @@
//! This module provides a memory mapping of Data vectors filling the Hnsw structure.
//! It is used by the module [hnswio] and also gives access to an iterator over data without loading the graph.
//!
//! We mmap the file and provide
//! - a Hashmap from DataId to address
//! - an interface for retrieving just data vectors loaded in the hnsw structure.
use std::io::BufReader;
use std::fs::{File, OpenOptions};
use std::path::{Path, PathBuf};
use indexmap::map::IndexMap;
use log::{debug, error, info, trace};
use mmap_rs::{Mmap, MmapOptions};
use crate::hnsw::DataId;
use crate::hnswio;
use crate::hnswio::MAGICDATAP;
/// This structure uses the data part of the dump of a Hnsw structure to retrieve the data.
/// The data is accessed via a mmap of the data file, so memory is spared at the expense of page loading.
// possibly to be used in graph to spare memory?
pub struct DataMap {
    /// File containing Points data
    _datapath: PathBuf,
    /// The mmap structure
    mmap: Mmap,
    /// maps a DataId to the byte offset (inside the mmap) where the record payload starts:
    /// a serialized length (u64) followed by the raw vector of type T
    hmap: IndexMap<DataId, usize>,
    /// full type name of Data (as given by std::any::type_name at dump time)
    t_name: String,
    /// dimension of data vector
    dimension: usize,
    /// name of the distance recorded in the dump description
    distname: String,
} // end of DataMap
impl DataMap {
// TODO: specifiy mmap option
/// The fname argument corresponds to the basename of the dump.
/// To reload from file fname.hnsw.data just pass fname as argument.
/// The dir argument is the directory where the fname.hnsw.data and fname.hnsw.graph reside.
pub fn from_hnswdump<T: std::fmt::Debug>(
dir: &Path,
file_name: &str,
) -> Result<DataMap, String> {
// reload description to have data type, and check for dump version
let mut graphpath = PathBuf::from(dir);
graphpath.push(dir);
let mut filename = file_name.to_string();
filename.push_str(".hnsw.graph");
graphpath.push(filename);
let graphfileres = OpenOptions::new().read(true).open(&graphpath);
if graphfileres.is_err() {
println!("DataMap: could not open file {:?}", graphpath.as_os_str());
std::process::exit(1);
}
let graphfile = graphfileres.unwrap();
let mut graph_in = BufReader::new(graphfile);
// we need to call load_description first to get distance name
let hnsw_description = hnswio::load_description(&mut graph_in).unwrap();
if hnsw_description.format_version <= 2 {
let msg = String::from(
"from_hnsw::from_hnsw : data mapping is only possible for dumps with the version > 0.1.19 of this crate",
);
error!(
"Data mapping is only possible for dumps with the version > 0.1.19 of this crate"
);
return Err(msg);
}
let distname = hnsw_description.distname.clone();
let t_name = hnsw_description.get_typename();
// check typename coherence
info!("Got typename from reload : {:?}", t_name);
if std::any::type_name::<T>() != t_name {
error!(
"Description has typename {:?}, function type argument is : {:?}",
t_name,
std::any::type_name::<T>()
);
return Err(String::from("type error"));
}
// get dimension as declared in description
let descr_dimension = hnsw_description.get_dimension();
drop(graph_in);
//
// we know data filename is hnswdump.hnsw.data
//
let mut datapath = PathBuf::new();
datapath.push(dir);
let mut filename = file_name.to_string();
filename.push_str(".hnsw.data");
datapath.push(filename);
//
let meta = std::fs::metadata(&datapath);
if meta.is_err() {
error!("Could not open file : {:?}", &datapath);
std::process::exit(1);
}
let fsize = meta.unwrap().len().try_into().unwrap();
//
let file_res = File::open(&datapath);
if file_res.is_err() {
error!("Could not open file : {:?}", &datapath);
std::process::exit(1);
}
let file = file_res.unwrap();
let offset = 0;
//
let mmap_opt = MmapOptions::new(fsize).unwrap();
let mmap_opt = unsafe { mmap_opt.with_file(&file, offset) };
let mapping_res = mmap_opt.map();
if mapping_res.is_err() {
error!("Could not memory map : {:?}", &datapath);
std::process::exit(1);
}
let mmap = mapping_res.unwrap();
//
info!("Mmap done on file : {:?}", &datapath);
//
// where are we in decoding mmap slice? at beginning
//
let mapped_slice = mmap.as_slice();
//
// where are we in decoding mmap slice?
let mut current_mmap_addr = 0usize;
let mut usize_slice = [0u8; std::mem::size_of::<usize>()];
// check magic
let mut u32_slice = [0u8; std::mem::size_of::<u32>()];
u32_slice.copy_from_slice(
&mapped_slice[current_mmap_addr..current_mmap_addr + std::mem::size_of::<u32>()],
);
current_mmap_addr += std::mem::size_of::<u32>();
let magic = u32::from_ne_bytes(u32_slice);
assert_eq!(magic, MAGICDATAP, "magic not equal to MAGICDATAP in mmap");
// get dimension
usize_slice.copy_from_slice(
&mapped_slice[current_mmap_addr..current_mmap_addr + std::mem::size_of::<usize>()],
);
current_mmap_addr += std::mem::size_of::<usize>();
let dimension = usize::from_ne_bytes(usize_slice);
if dimension != descr_dimension {
error!(
"Description and data do not agree on dimension, data got : {:?}, description got : {:?}",
dimension, descr_dimension
);
return Err(String::from(
"description and data do not agree on dimension",
));
} else {
info!("Got dimension : {:?}", dimension);
}
//
// now we know that each record consists in
// - MAGICDATAP (u32), DataId (u64), dimension (u64) and then (length of type in bytes * dimension)
//
let record_size = std::mem::size_of::<u32>()
+ 2 * std::mem::size_of::<u64>()
+ dimension * std::mem::size_of::<T>();
let residual = mmap.size() - current_mmap_addr;
info!(
"Mmap size {}, current_mmap_addr {}, residual : {}",
mmap.size(),
current_mmap_addr,
residual
);
let nb_record = residual / record_size;
debug!("Record size : {}, nb_record : {}", record_size, nb_record);
// allocate hmap with correct capacity
let mut hmap = IndexMap::<DataId, usize>::with_capacity(nb_record);
// fill hmap to have address of each data point in file
let mut u64_slice = [0u8; std::mem::size_of::<u64>()];
//
// now we loop on records
//
for i in 0..nb_record {
debug!("Record i : {}, addr : {}", i, current_mmap_addr);
// decode Magic
u32_slice.copy_from_slice(
&mapped_slice[current_mmap_addr..current_mmap_addr + std::mem::size_of::<u32>()],
);
current_mmap_addr += std::mem::size_of::<u32>();
let magic = u32::from_ne_bytes(u32_slice);
assert_eq!(magic, MAGICDATAP, "magic not equal to MAGICDATAP in mmap");
// decode DataId
u64_slice.copy_from_slice(
&mapped_slice[current_mmap_addr..current_mmap_addr + std::mem::size_of::<u64>()],
);
current_mmap_addr += std::mem::size_of::<u64>();
let data_id = u64::from_ne_bytes(u64_slice) as usize;
debug!(
"Inserting in hmap : got dataid : {:?} current map address : {:?}",
data_id, current_mmap_addr
);
// Note we store address where we have to decode dimension*size_of::<T> and full bson encoded vector
hmap.insert(data_id, current_mmap_addr);
// now read serialized length
u64_slice.copy_from_slice(
&mapped_slice[current_mmap_addr..current_mmap_addr + std::mem::size_of::<u64>()],
);
current_mmap_addr += std::mem::size_of::<u64>();
let serialized_len = u64::from_ne_bytes(u64_slice) as usize;
if i == 0 {
debug!("serialized bytes len to reload {:?}", serialized_len);
}
let mut v_serialized = vec![0; serialized_len];
v_serialized.copy_from_slice(
&mapped_slice[current_mmap_addr..current_mmap_addr + serialized_len],
);
current_mmap_addr += serialized_len;
let slice_t =
unsafe { std::slice::from_raw_parts(v_serialized.as_ptr() as *const T, dimension) };
trace!(
"Deserialized v : {:?} address : {:?} ",
slice_t,
v_serialized.as_ptr() as *const T
);
} // end of for on record
//
debug!("End of DataMap::from_hnsw.");
//
let datamap = DataMap {
_datapath: datapath,
mmap,
hmap,
t_name,
dimension: descr_dimension,
distname,
};
//
Ok(datamap)
} // end of from_datas
//
/// returns true if type T corresponds to type as retrieved in DataMap.
/// This function can (should!) be used before calling [Self::get_data()]
pub fn check_data_type<T>(&self) -> bool
where
T: 'static + Sized,
{
// we check last part of name of type
let tname_vec = self.t_name.rsplit_terminator("::").collect::<Vec<&str>>();
if tname_vec.last().is_none() {
let errmsg = "DataMap::check_data_type() cannot determine data type name ";
error!("DataMap::check_data_type() cannot determine data type name ");
std::panic!("DataMap::check_data_type(), {}", errmsg);
}
let tname_last = tname_vec.last().unwrap();
//
let datat_name_arg = std::any::type_name::<T>().to_string();
let datat_name_vec = datat_name_arg
.rsplit_terminator("::")
.collect::<Vec<&str>>();
let datat_name_arg_last = datat_name_vec.last().unwrap();
//
if datat_name_arg_last == tname_last {
true
} else {
info!(
"Data type in DataMap : {}, type arg = {}",
tname_last, datat_name_arg_last
);
false
}
} // end of check_data_type
//
/// return the data corresponding to dataid. Access is done using mmap.
/// Function returns None if address is invalid
/// This function requires you know the type T.
/// **As mmap loading calls an unsafe function it is recommended to check the type name with [Self::check_data_type()]**
pub fn get_data<'a, T: Clone + std::fmt::Debug>(&'a self, dataid: &DataId) -> Option<&'a [T]> {
//
trace!("In DataMap::get_data, dataid : {:?}", dataid);
let address = self.hmap.get(dataid)?;
debug!("Address for id : {}, address : {:?}", dataid, address);
let mut current_mmap_addr = *address;
let mapped_slice = self.mmap.as_slice();
let mut u64_slice = [0u8; std::mem::size_of::<u64>()];
u64_slice.copy_from_slice(
&mapped_slice[current_mmap_addr..current_mmap_addr + std::mem::size_of::<u64>()],
);
let serialized_len = u64::from_ne_bytes(u64_slice) as usize;
current_mmap_addr += std::mem::size_of::<u64>();
trace!("Serialized bytes len to reload {:?}", serialized_len);
let slice_t = unsafe {
std::slice::from_raw_parts(
mapped_slice[current_mmap_addr..].as_ptr() as *const T,
self.dimension,
)
};
Some(slice_t)
}
/// returns Keys in order they are in the file, thus optimizing file/memory access.
/// Note that in case of parallel insertion this can be different from insertion odrer.
pub fn get_dataid_iter(&self) -> indexmap::map::Keys<'_, DataId, usize> {
self.hmap.keys()
}
/// returns full data type name
pub fn get_data_typename(&self) -> String {
self.t_name.clone()
}
/// returns full data type name
pub fn get_distname(&self) -> String {
self.distname.clone()
}
/// return the number of data in mmap
pub fn get_nb_data(&self) -> usize {
self.hmap.len()
}
} // end of impl DataMap
//=====================================================================================
#[cfg(test)]
mod tests {
    use super::*;
    use crate::hnswio::HnswIo;
    use anndists::dist::*;
    pub use crate::api::AnnT;
    use crate::prelude::*;
    use rand::distr::{Distribution, Uniform};

    // initialize the logger once for the whole test binary
    fn log_init_test() {
        let _ = env_logger::builder().is_test(true).try_init();
    }

    // dump a small random Hnsw, then check random access to vectors through DataMap
    #[test]
    fn test_file_mmap() {
        println!("\n\n test_file_mmap");
        log_init_test();
        // generate a random test
        let mut rng = rand::rng();
        let unif = Uniform::<f32>::new(0., 1.).unwrap();
        // 50 vectors of size 11 f32
        let nbcolumn = 50;
        let nbrow = 11;
        let mut xsi;
        let mut data = Vec::with_capacity(nbcolumn);
        for j in 0..nbcolumn {
            data.push(Vec::with_capacity(nbrow));
            for _ in 0..nbrow {
                xsi = unif.sample(&mut rng);
                data[j].push(xsi);
            }
            debug!("j : {:?}, data : {:?} ", j, &data[j]);
        }
        // define hnsw
        let ef_construct = 25;
        let nb_connection = 10;
        let hnsw = Hnsw::<f32, DistL1>::new(nb_connection, nbcolumn, 16, ef_construct, DistL1 {});
        for (i, d) in data.iter().enumerate() {
            hnsw.insert((d, i));
        }
        // some logging info
        hnsw.dump_layer_info();
        // dump in a file. Must take care of name as tests runs in // !!!
        let fname = "mmap_test";
        let directory = tempfile::tempdir().unwrap();
        let _res = hnsw.file_dump(directory.path(), fname);
        let check_reload = false;
        if check_reload {
            // We check we can reload
            debug!("HNSW reload.");
            // NOTE(review): this shadows `directory` with a fresh tempdir that does NOT
            // contain the dump written above — if check_reload is ever enabled, reload from
            // the outer `directory` instead. TODO confirm.
            let directory = tempfile::tempdir().unwrap();
            let mut reloader = HnswIo::new(directory.path(), fname);
            let hnsw_loaded: Hnsw<f32, DistL1> = reloader.load_hnsw::<f32, DistL1>().unwrap();
            check_graph_equality(&hnsw_loaded, &hnsw);
            info!("========= reload success, going to mmap reloading =========");
        }
        //
        // now we have checked that datamap seems ok, test reload of hnsw with mmap
        let datamap: DataMap = DataMap::from_hnswdump::<f32>(directory.path(), fname).unwrap();
        let nb_test = 30;
        info!("Checking random access of id , nb test : {}", nb_test);
        for _ in 0..nb_test {
            // sample an id in 0..nb_data and check get_data returns the original vector
            let unif = Uniform::<usize>::new(0, nbcolumn).unwrap();
            let id = unif.sample(&mut rng);
            let d = datamap.get_data::<f32>(&id);
            assert!(d.is_some());
            if d.is_some() {
                debug!("id = {}, v = {:?}", id, d.as_ref().unwrap());
                assert_eq!(d.as_ref().unwrap(), &data[id]);
            }
        }
        // test iterator from datamap
        let keys = datamap.get_dataid_iter();
        for k in keys {
            let _data = datamap.get_data::<f32>(k);
        }
    } // end of test_file_mmap

    // check that iterating DataMap keys matches the original inserted vectors, and type checking
    #[test]
    fn test_mmap_iter() {
        log_init_test();
        // generate a random test
        let mut rng = rand::rng();
        let unif = Uniform::<u32>::new(0, 10000).unwrap();
        // 50 vectors of size 11 u32
        let nbcolumn = 50;
        let nbrow = 11;
        let mut xsi;
        let mut data = Vec::with_capacity(nbcolumn);
        for j in 0..nbcolumn {
            data.push(Vec::with_capacity(nbrow));
            for _ in 0..nbrow {
                xsi = unif.sample(&mut rng);
                data[j].push(xsi);
            }
            debug!("j : {:?}, data : {:?} ", j, &data[j]);
        }
        // define hnsw
        let ef_construct = 25;
        let nb_connection = 10;
        let hnsw = Hnsw::<u32, DistL1>::new(nb_connection, nbcolumn, 16, ef_construct, DistL1 {});
        for (i, d) in data.iter().enumerate() {
            hnsw.insert((d, i));
        }
        // some logging info
        hnsw.dump_layer_info();
        // dump in a file. Must take care of name as tests runs in // !!!
        let fname = "mmap_order_test";
        let directory = tempfile::tempdir().unwrap();
        let _res = hnsw.file_dump(directory.path(), fname);
        // now we have checked that datamap seems ok, test reload of hnsw with mmap
        let datamap: DataMap = DataMap::from_hnswdump::<u32>(directory.path(), fname).unwrap();
        // testing type check
        assert!(datamap.check_data_type::<u32>());
        assert!(!datamap.check_data_type::<f32>());
        info!("Datamap iteration order checking");
        let keys = datamap.get_dataid_iter();
        for (i, dataid) in keys.enumerate() {
            let v = datamap.get_data::<u32>(dataid).unwrap();
            assert_eq!(v, &data[*dataid], "dataid = {}, ukey = {}", dataid, i);
        }
        // rm files generated!
        // NOTE(review): the dump went into the tempdir above, so these relative removals are
        // presumably no-ops — confirm; the tempdir cleans itself up on drop.
        let _ = std::fs::remove_file("mmap_order_test.hnsw.data");
        let _ = std::fs::remove_file("mmap_order_test.hnsw.graph");
    }
    //
} // end of mod tests

View File

@@ -0,0 +1,24 @@
//! defines a trait for filtering requests.
//! See examples in tests/filtertest.rs
use crate::prelude::DataId;
/// Only queries returning true are taken into account along the search
pub trait FilterT {
    /// Returns true if the point identified by `id` must be kept in the search results.
    fn hnsw_filter(&self, id: &DataId) -> bool;
}
/// A Vec<usize> acts as a filter keeping only the ids it contains.
/// NOTE(review): `binary_search` assumes the vector is sorted — callers must ensure this.
impl FilterT for Vec<usize> {
    fn hnsw_filter(&self, id: &DataId) -> bool {
        matches!(self.binary_search(id), Ok(_))
    }
}
/// Any closure or function from &DataId to bool is usable directly as a filter.
impl<F> FilterT for F
where
    F: Fn(&DataId) -> bool,
{
    fn hnsw_filter(&self, id: &DataId) -> bool {
        (self)(id)
    }
}

View File

@@ -0,0 +1,200 @@
//! This module provides conversion of a Point structure to a FlatPoint containing just the Id of a point
//! and those of its neighbours.
//! The whole Hnsw structure is then flattened into a Hashtable associating the data ID of a point to
//! its corresponding FlatPoint.
//! It can be used, for example, when reloading only the graph part of the data to have knowledge
//! of relative proximity of points as described just by their DataId
//!
use hashbrown::HashMap;
use std::cmp::Ordering;
use crate::hnsw;
use anndists::dist::distances::Distance;
use hnsw::*;
use log::error;
// an ordering of Neighbour of a Point
impl PartialEq for Neighbour {
    /// Two neighbours compare equal when their distances are equal.
    fn eq(&self, rhs: &Neighbour) -> bool {
        self.distance == rhs.distance
    }
}
impl Eq for Neighbour {}
// order points by distance to self.
#[allow(clippy::non_canonical_partial_ord_impl)]
impl PartialOrd for Neighbour {
    /// Neighbours are ordered by their distance to the query point.
    fn partial_cmp(&self, rhs: &Neighbour) -> Option<Ordering> {
        self.distance.partial_cmp(&rhs.distance)
    }
} // end impl PartialOrd
impl Ord for Neighbour {
    /// Total order on distance; panics if either distance is NaN.
    fn cmp(&self, rhs: &Neighbour) -> Ordering {
        if self.distance.is_nan() || rhs.distance.is_nan() {
            panic!("got a NaN in a distance");
        }
        self.distance.partial_cmp(&rhs.distance).unwrap()
    }
}
/// A reduced version of a point inserted in the Hnsw structure.
/// It contains the original id of the point as submitted to the struct Hnsw,
/// an ordered (by distance) list of neighbours of the point,
/// and its position in layers.
#[derive(Clone)]
pub struct FlatPoint {
    /// an id coming from client using hnsw, should identify point uniquely
    origin_id: DataId,
    /// a point id identifying point as stored in our structure
    p_id: PointId,
    /// neighbours of the point, gathered from all layers and sorted by increasing distance
    neighbours: Vec<Neighbour>,
}
impl FlatPoint {
    /// Returns the neighbours ordered by increasing distance.
    pub fn get_neighbours(&self) -> &Vec<Neighbour> {
        &self.neighbours
    }

    /// Returns the id the client used for this point at insertion time.
    pub fn get_id(&self) -> DataId {
        self.origin_id
    }

    /// Returns the internal point id of the point as stored in the Hnsw structure.
    pub fn get_p_id(&self) -> PointId {
        self.p_id
    }
} // end impl block for FlatPoint
// Builds a FlatPoint from a Point: gathers neighbours from all layers into one
// list and sorts it by increasing distance.
fn flatten_point<T: Clone + Send + Sync>(point: &Point<T>) -> FlatPoint {
    let mut flat_neighbours: Vec<Neighbour> = point
        .get_neighborhood_id()
        .into_iter()
        .flatten()
        .collect();
    flat_neighbours.sort_unstable();
    FlatPoint {
        origin_id: point.get_origin_id(),
        p_id: point.get_point_id(),
        neighbours: flat_neighbours,
    }
} // end of flatten_point
/// A structure providing neighbourhood information of a point stored in the Hnsw structure given its DataId.
/// The structure uses the [FlatPoint] structure.
/// This structure can be obtained by FlatNeighborhood::from<&Hnsw<T,D>>
pub struct FlatNeighborhood {
    /// maps the client's DataId of a point to its flattened neighbourhood
    hash_t: HashMap<DataId, FlatPoint>,
}
impl FlatNeighborhood {
    /// Returns the neighbours of the point with the given DataId, or None if the id is unknown.
    /// The neighbours are sorted in increasing distance from the point.
    pub fn get_neighbours(&self, p_id: DataId) -> Option<Vec<Neighbour>> {
        let flat_point = self.hash_t.get(&p_id)?;
        Some(flat_point.get_neighbours().clone())
    }
} // end impl block for FlatNeighborhood
impl<T: Clone + Send + Sync, D: Distance<T> + Send + Sync> From<&Hnsw<'_, T, D>>
    for FlatNeighborhood
{
    /// Extract from the Hnsw structure a hashtable mapping original DataId into a FlatPoint structure gathering its neighbourhood information.
    /// Useful after reloading from a dump with T=NoData and D=NoDist as points are then reloaded with neighbourhood information only.
    fn from(hnsw: &Hnsw<T, D>) -> Self {
        let mut hash_t = HashMap::new();
        for point in hnsw.get_point_indexation().into_iter() {
            // a duplicated origin id would overwrite an earlier entry: report it
            if let Some(old_point) = hash_t.insert(point.get_origin_id(), flatten_point(&point)) {
                error!("2 points with same origin id {:?}", old_point.origin_id);
            }
        }
        FlatNeighborhood { hash_t }
    }
} // end of From implementation
#[cfg(test)]
mod tests {
    use super::*;
    use anndists::dist::distances::*;
    use log::debug;
    use crate::api::AnnT;
    use crate::hnswio::*;
    use rand::distr::{Distribution, Uniform};

    // initialize the logger once for the whole test binary
    fn log_init_test() {
        let _ = env_logger::builder().is_test(true).try_init();
    }

    // dump a random Hnsw, reload it (graph only), and check the flattened neighbourhoods match
    #[test]
    fn test_dump_reload_graph_flatten() {
        println!("\n\n test_dump_reload_graph_flatten");
        log_init_test();
        // generate a random test
        let mut rng = rand::rng();
        let unif = Uniform::<f32>::new(0., 1.).unwrap();
        // 1000 vectors of size 10 f32
        let nbcolumn = 1000;
        let nbrow = 10;
        let mut xsi;
        let mut data = Vec::with_capacity(nbcolumn);
        for j in 0..nbcolumn {
            data.push(Vec::with_capacity(nbrow));
            for _ in 0..nbrow {
                xsi = unif.sample(&mut rng);
                data[j].push(xsi);
            }
        }
        // define hnsw
        let ef_construct = 25;
        let nb_connection = 10;
        let hnsw = Hnsw::<f32, DistL1>::new(nb_connection, nbcolumn, 16, ef_construct, DistL1 {});
        for (i, d) in data.iter().enumerate() {
            hnsw.insert((d, i));
        }
        // some logging info
        hnsw.dump_layer_info();
        // get flat neighbours of point 2 before the dump
        let neighborhood_before_dump = FlatNeighborhood::from(&hnsw);
        let nbg_2_before = neighborhood_before_dump.get_neighbours(2).unwrap();
        println!("voisins du point 2 {:?}", nbg_2_before);
        // dump in a file. Must take care of name as tests runs in // !!!
        let fname = "dumpreloadtestflat";
        let directory = tempfile::tempdir().unwrap();
        let _res = hnsw.file_dump(directory.path(), fname);
        // This will dump in 2 files named dumpreloadtestflat.hnsw.graph and dumpreloadtestflat.hnsw.data
        //
        // reload
        debug!("HNSW reload");
        // we will need a procedural macro to get from distance name to its instantiation.
        // from now on we test with DistL1
        let mut reloader = HnswIo::new(directory.path(), fname);
        let hnsw_loaded: Hnsw<NoData, NoDist> = reloader.load_hnsw().unwrap();
        let neighborhood_after_dump = FlatNeighborhood::from(&hnsw_loaded);
        let nbg_2_after = neighborhood_after_dump.get_neighbours(2).unwrap();
        println!("Neighbors of point 2 {:?}", nbg_2_after);
        // test equality of neighborhood before and after dump/reload
        assert_eq!(nbg_2_after.len(), nbg_2_before.len());
        for i in 0..nbg_2_before.len() {
            assert_eq!(nbg_2_before[i].p_id, nbg_2_after[i].p_id);
            assert_eq!(nbg_2_before[i].distance, nbg_2_after[i].distance);
        }
        check_graph_equality(&hnsw_loaded, &hnsw);
    } // end of test_dump_reload_graph_flatten
} // end module test

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,30 @@
#![cfg_attr(feature = "stdsimd", feature(portable_simd))]
//
// for logging (debug mostly, switched at compile time in cargo.toml)
use env_logger::Builder;
use lazy_static::lazy_static;
pub mod api;
pub mod datamap;
pub mod filter;
pub mod flatten;
pub mod hnsw;
pub mod hnswio;
pub mod libext;
pub mod prelude;
// we impose our version of anndists
pub use anndists;
lazy_static! {
    // evaluated on first access: installs the logger exactly once for the whole crate
    static ref LOG: u64 = init_log();
}
// install a logger facility, configured from the RUST_LOG environment variable
#[allow(unused)]
fn init_log() -> u64 {
    Builder::from_default_env().init();
    println!("\n ************** initializing logger *****************\n");
    // dummy value stored in the LOG static to mark initialization done
    1
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,11 @@
// gathers modules to include and re-exports all of anndists!
pub use crate::api::*;
pub use crate::hnsw::*;
#[allow(unused)]
pub use crate::filter::*;
pub use crate::hnswio::*;
pub use anndists::dist::distances::*;