Files
wifi-densepose/vendor/ruvector/patches/hnsw_rs/src/hnswio.rs

1704 lines
64 KiB
Rust

//! This module provides io dump/ reload of computed graph via the structure Hnswio.
//! This structure stores references to data points if memory map is used.
//!
//! A dump is constituted of 2 files.
//! One file stores just the graph (or topology) with id of points.
//! The other file stores the ids and vector in point and can be reloaded via a mmap scheme.
//! The graph file is suffixed by "hnsw.graph" the other is suffixed by "hnsw.data"
//!
//! Examples of dump and reload of structure Hnsw is given in the tests (see test_dump_reload, reload_with_mmap)
// datafile
// MAGICDATAP : u32
// dimension : usize!!
// Then for each point the triplet: (MAGICDATAP, origin_id , dimension , array of values bson encoded) ( u32, u64, ....)
//
// A point is dumped in graph file as given by its external id (type DataId i.e : a usize, possibly a hash value)
// and layer (u8) and rank_in_layer:i32.
// In the data file the point dump consist in the triplet: (MAGICDATAP, origin_id , array of values.)
//
use serde::{Serialize, de::DeserializeOwned};
use std::sync::atomic::{AtomicUsize, Ordering};
//
use std::time::SystemTime;
// io
use std::fs::{File, OpenOptions};
use std::io::{BufReader, BufWriter};
use std::path::{Path, PathBuf};
// synchro
use parking_lot::RwLock;
use std::sync::Arc;
use std::collections::HashMap;
use rand::Rng;
use anyhow::*;
use std::any::type_name;
use anndists::dist::distances::*;
use self::hnsw::*;
use crate::datamap::*;
use crate::hnsw;
use log::{debug, error, info, trace};
use std::io::prelude::*;
// magic before each graph point data for each point
const MAGICPOINT: u32 = 0x000a678f;
// magic at beginning of description format v2 of dump
const MAGICDESCR_2: u32 = 0x002a677f;
// magic at beginning of description format v3 of dump
// format where we can use mmap to provide access to data (not graph) via a memory mapping of file data ,
// useful when data vectors are large and data uses more space than graph.
// differs from v2 as we do not use bincode encoding for point. We dump pure binary
// This helps use mmap as we can return directly a slice.
const MAGICDESCR_3: u32 = 0x002a6771;
// magic for v4
// we dump level scale modification factor (read back by load_description only for this format version)
const MAGICDESCR_4: u32 = 0x002a6779;
// magic at beginning of a layer dump
const MAGICLAYER: u32 = 0x000a676f;
// magic head of data file and before each data vector
pub(crate) const MAGICDATAP: u32 = 0xa67f0000;
/// Mode of a dump.
///
/// The mode is written in the description header as a `u8` (1 for `Full`, 0 otherwise).
/// When dumping the graph part of a point, `Full` additionally writes the two components
/// of the point's `PointId` after its origin id.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum DumpMode {
/// only the origin id identifies a point in the graph dump
Light,
/// origin id and `PointId` components are written for each point
Full,
}
/// The main interface for dumping struct Hnsw.
pub(crate) trait HnswIoT {
/// Dumps `self` in the given `mode` using the graph and data writers held by `dumpinit`.
/// Returns an `anyhow::Result<i32>`; the meaning of the integer is set by the implementor.
fn dump(&self, mode: DumpMode, dumpinit: &mut DumpInit) -> anyhow::Result<i32>;
}
/// Options that can be set for a reload.
///
/// - datamap : a bool asking for mmap usage.
///   The data points can be reloaded via a mmap of the data file dump.
///   This can be useful when data points consist in large vectors (as in genomic sketching)
///   as in this case data needs more space than the graph.
///
/// - mmap_threshold : the number of items above which we use mmap. Default is 0, meaning always use mmap data.
///   Can be useful for search speed in hnsw if we have part of data resident in memory.
#[derive(Copy, Clone)]
pub struct ReloadOptions {
    /// true if data can be accessed through a mmap of the data file
    datamap: bool,
    /// number of data items above which we use mmap.
    mmap_threshold: usize,
}

impl Default for ReloadOptions {
    /// default is no mmap (and a threshold of 0)
    fn default() -> Self {
        Self::new(false)
    }
}

impl ReloadOptions {
    /// builds options with the given mmap flag and a threshold of 0
    pub fn new(datamap: bool) -> Self {
        Self {
            datamap,
            mmap_threshold: 0,
        }
    }

    /// set mmap usage to `val` and return a copy of the updated options
    pub fn set_mmap(&mut self, val: bool) -> Self {
        self.datamap = val;
        *self
    }

    /// set mmap threshold i.e : The maximum number of data that will be reloaded in memory by reading file dump,
    /// the other points will be mmapped.
    /// A strictly positive threshold also switches mmap usage on; a threshold of 0 leaves the options unchanged.
    /// Returns a copy of the (possibly updated) options.
    /// See test *reload_with_mmap()*
    pub fn set_mmap_threshold(&mut self, threshold: usize) -> Self {
        if threshold == 0 {
            return *self;
        }
        self.datamap = true;
        self.mmap_threshold = threshold;
        *self
    }

    /// return a 2-uple, (datamap, threshold)
    pub fn use_mmap(&self) -> (bool, usize) {
        let Self {
            datamap,
            mmap_threshold,
        } = *self;
        (datamap, mmap_threshold)
    }
} // end of ReloadOptions
//===============================================================================================
// initialize datafile and graphfile for io ops
// This structure will check existence of dumps of same name and generate a unique filename if necessary according to overwrite flag
/// Holds the two buffered writers (graph file and data file) used during a dump,
/// together with the basename actually used to name the files.
#[allow(unused)]
pub struct DumpInit {
// basename dump (possibly made unique, see DumpInit::new)
basename: String,
// to dump data (file $basename.hnsw.data)
pub(crate) data_out: BufWriter<File>,
// to dump graph (file $basename.hnsw.graph)
pub(crate) graph_out: BufWriter<File>,
} // end of DumpInit
impl DumpInit {
// This structure will check existence of dumps of same name and generate a unique filename if necessary according to overwrite flag
pub fn new(dir: &Path, basename_default: &str, overwrite: bool) -> Self {
// if we cannot overwrite data files (in case of mmap in particular)
// we will ensure we have a unique basename
let basename = match overwrite {
true => basename_default.to_string(),
false => {
// we check
let mut dataname = basename_default.to_string();
dataname.push_str(".hnsw.data");
let mut datapath = PathBuf::from(dir);
datapath.push(dataname);
let exist_res = std::fs::metadata(datapath.as_os_str());
if exist_res.is_ok() {
let unique_basename = loop {
let mut unique_basename;
let mut dataname: String;
let id: usize = rand::thread_rng().gen_range(0..10000);
let strid: String = id.to_string();
unique_basename = basename_default.to_string();
unique_basename.push('-');
unique_basename.push_str(&strid);
dataname = unique_basename.clone();
dataname.push_str(".hnsw.data");
let mut datapath = PathBuf::from(dir);
datapath.push(dataname);
let exist_res = std::fs::metadata(datapath.as_os_str());
if exist_res.is_err() {
break unique_basename;
}
};
unique_basename
} else {
basename_default.to_string()
}
}
};
//
info!("Dumping with (unique) basename : {}", basename);
//
let mut graphname = basename.clone();
graphname.push_str(".hnsw.graph");
let mut graphpath = PathBuf::from(dir);
graphpath.push(graphname);
let graphfileres = OpenOptions::new()
.create(true)
.truncate(true)
.write(true)
.open(&graphpath);
if graphfileres.is_err() {
println!(
"HnswIo::reload_hnsw : could not open file {:?}",
graphpath.as_os_str()
);
std::panic::panic_any("HnswIo::init : could not open file".to_string());
}
let graphfile = graphfileres.unwrap();
// same thing for data file
let mut dataname = basename.clone();
dataname.push_str(".hnsw.data");
let mut datapath = PathBuf::from(dir);
datapath.push(dataname);
let datafileres = OpenOptions::new()
.create(true)
.truncate(true)
.write(true)
.open(&datapath);
if datafileres.is_err() {
println!(
"HnswIo::init : could not open file {:?}",
datapath.as_os_str()
);
std::panic::panic_any("HnswIo::init : could not open file".to_string());
}
let datafile = datafileres.unwrap();
//
let graph_out = BufWriter::new(graphfile);
let data_out = BufWriter::new(datafile);
//
DumpInit {
basename,
data_out,
graph_out,
}
}
/// returns the basename used for the dump. May be it has been made unique to void overwriting a previous or mmapped dump
pub fn get_basename(&self) -> &String {
&self.basename
}
pub fn flush(&mut self) -> Result<()> {
self.data_out.flush()?;
self.graph_out.flush()?;
Ok(())
}
} // end impl for DumpInit
//====================================================
// basic block used to provide arguments to load_hnsw and load_hnsw_with_dist
// Built by HnswIo::init : the description read from the graph file and the two readers,
// the graph reader being positioned just after the description.
struct LoadInit {
// description loaded from the head of the graph file
descr: Description,
// reader on file $basename.hnsw.graph
graphfile: BufReader<File>,
// reader on file $basename.hnsw.data (nothing read yet by init)
datafile: BufReader<File>,
} // end of LoadInit
/// a structure to provide simplified methods for reloading a previous dump.
///
/// The data point can be reloaded via mmap of data file dump.
/// This can be useful when data points consist in large vectors (as in genomic sketching)
/// as in this case data needs more space than the graph.
/// Note : **As this structure potentially contains the mmap data used in hnsw after reload it must not be dropped
/// before the reloaded hnsw.**
/// Example:
///
/// See example in tests::reload_with_mmap
/// ```text
/// let directory = Path::new(".");
/// let mut reloader = HnswIo::new(directory, "mmapreloadtest");
/// let options = ReloadOptions::default().set_mmap(true);
/// reloader.set_options(options);
/// let hnsw_loaded : Hnsw<f32,DistL1>= reloader.load_hnsw::<f32, DistL1>().unwrap();
/// ```
///
/// In some cases we need a hnsw variable that can come from a reload **OR** a direct initialization.
///
/// Hnswio must be defined before Hnsw as drop is done in reverse order of definition, and the function [load_hnsw](Self::load_hnsw())
/// borrows Hnswio. (Hnswio stores the mmap address Hnsw can refer to if mmap is used)
/// It is also possible to preinitialize a Hnswio with the default() function which leaves all the fields with blank values and use
/// the function [set_values](Self::set_values()) after.
/// We get something like:
///
/// ```text
/// let need_reload : bool;
/// ....................
/// let mut hnswio = HnswIo::default();
/// let hnsw : Hnsw<>;
/// if need_reload {
/// hnswio.set_values(...);
/// hnsw = hnswio.load_hnsw(...)
/// }
/// else {
/// hnsw = Hnsw::new(...)
/// }
/// ```
#[derive(Default)]
pub struct HnswIo {
/// directory containing the dump files
dir: PathBuf,
/// basename is used to build $basename.hnsw.data and $basename.hnsw.graph
basename: String,
/// options
options: ReloadOptions,
/// the mmap of the data file, set by load_hnsw when mmap is asked for and could be initialized
datamap: Option<DataMap>,
/// counter incremented for each point loaded; atomic so that it can be updated through a shared reference
nb_point_loaded: Arc<AtomicUsize>,
/// false after default(), true after new(), new_with_options() or set_values()
initialized: bool,
} // end of struct HnswIo
impl HnswIo {
/// - directory is directory containing the dumped files,
/// - basename is used to build $basename.hnsw.data and $basename.hnsw.graph
///
/// default is to use default ReloadOptions.
pub fn new(directory: &Path, basename: &str) -> Self {
HnswIo {
dir: directory.to_path_buf(),
basename: basename.to_string(),
options: ReloadOptions::default(),
datamap: None,
nb_point_loaded: Arc::new(AtomicUsize::new(0)),
initialized: true,
}
}
/// same as preceding, avoids the call to [set_options](Self::set_options())
pub fn new_with_options(directory: &Path, basename: &str, options: ReloadOptions) -> Self {
HnswIo {
dir: directory.to_path_buf(),
basename: basename.to_string(),
options,
datamap: None,
nb_point_loaded: Arc::new(AtomicUsize::new(0)),
initialized: true,
}
}
/// returns the basename of the dump, as given at construction or in set_values
pub fn get_basename(&self) -> &str {
    self.basename.as_str()
}
/// this method enables effective initialization after default allocation.
/// It is an error to call set_values on an already defined Hswnio by any function other than [default](Self::default())
pub fn set_values(
&mut self,
directory: &Path,
basename: String,
options: ReloadOptions,
) -> Result<()> {
if self.initialized {
return Err(anyhow!("Hnswio already initialized"));
};
//
self.dir = directory.to_path_buf();
self.basename = basename;
self.options = options;
self.datamap = None;
//
self.initialized = true;
//
Ok(())
} // end of set_values
//
fn init(&self) -> Result<LoadInit> {
//
info!("reloading from basename : {}", &self.basename);
//
let mut graphname = self.basename.clone();
graphname.push_str(".hnsw.graph");
let mut graphpath = self.dir.clone();
graphpath.push(graphname);
let graphfileres = OpenOptions::new().read(true).open(&graphpath);
if graphfileres.is_err() {
println!(
"HnswIo::reload_hnsw : could not open file {:?}",
graphpath.as_os_str()
);
error!(
"HnswIo::reload_hnsw : could not open file {:?}",
graphpath.as_os_str()
);
return Err(anyhow!(
"HnswIo::reload_hnsw : could not open file {:?}",
graphpath.as_os_str()
));
}
let graphfile = graphfileres.unwrap();
// same thing for data file
let mut dataname = self.basename.clone();
dataname.push_str(".hnsw.data");
let mut datapath = self.dir.clone();
datapath.push(dataname);
let datafileres = OpenOptions::new().read(true).open(&datapath);
if datafileres.is_err() {
println!(
"HnswIo::init : could not open file {:?}",
datapath.as_os_str()
);
error!(
"HnswIo::init : could not open file {:?}",
datapath.as_os_str()
);
return Err(anyhow!(
"HnswIo::reload_hnsw : could not open file {:?}",
datapath.as_os_str()
));
}
let datafile = datafileres.unwrap();
//
let mut graph_in = BufReader::new(graphfile);
let data_in = BufReader::new(datafile);
// we need to call load_description first to get distance name
let hnsw_description = load_description(&mut graph_in).unwrap();
//
Ok(LoadInit {
descr: hnsw_description,
graphfile: graph_in,
datafile: data_in,
})
}
/// to set non default options, in particular to ask for mmap of data file.
/// The options given replace the current ones and are read by the next call to a load function.
pub fn set_options(&mut self, options: ReloadOptions) {
self.options = options;
}
/// reload a previously dumped hnsw structure
pub fn load_hnsw<'b, 'a, T, D>(&'a mut self) -> Result<Hnsw<'b, T, D>>
where
T: 'static + Serialize + DeserializeOwned + Clone + Sized + Send + Sync + std::fmt::Debug,
D: Distance<T> + Default + Send + Sync,
'a: 'b,
{
//
debug!("HnswIo::load_hnsw ");
let start_t = SystemTime::now();
//
let init = self.init();
if init.is_err() {
return Err(anyhow!("could not reload HNSW structure"));
}
let mut init = init.unwrap();
let data_in = &mut init.datafile;
let graph_in = &mut init.graphfile;
let description = init.descr;
info!("format version : {}", description.format_version);
// In datafile , we must read MAGICDATAP and dimension and check
let mut it_slice = [0u8; std::mem::size_of::<u32>()];
data_in.read_exact(&mut it_slice)?;
let magic = u32::from_ne_bytes(it_slice);
assert_eq!(
magic, MAGICDATAP,
"magic not equal to MAGICDATAP in load_point"
);
//
let mut it_slice = [0u8; std::mem::size_of::<usize>()];
data_in.read_exact(&mut it_slice)?;
let dimension = usize::from_ne_bytes(it_slice);
assert_eq!(
dimension, description.dimension,
"data dimension incoherent {:?} {:?} ",
dimension, description.dimension
);
//
let _mode = description.dumpmode;
let distname = description.distname.clone();
// We must ensure that the distance stored matches the one asked for in loading hnsw
// for that we check for short names equality stripping
debug!("distance in description = {:?}", distname);
let d_type_name = type_name::<D>().to_string();
let d_type_name_split: Vec<&str> = d_type_name.rsplit_terminator("::").collect();
for s in &d_type_name_split {
info!(" distname in generic type argument {:?}", s);
}
let distname_split: Vec<&str> = distname.rsplit_terminator("::").collect();
if (std::any::TypeId::of::<T>() != std::any::TypeId::of::<NoData>())
&& (d_type_name_split[0] != distname_split[0])
{
// for all types except NoData , distance asked in reload declaration and distance in dump must be equal!
let mut errmsg = String::from("error in distances : dumped distance is : ");
errmsg.push_str(&distname);
errmsg.push_str(" asked distance in loading is : ");
errmsg.push_str(&d_type_name);
error!(" distance in type argument : {:?}", d_type_name);
error!("error , dump is for distance = {:?}", distname);
return Err(anyhow!(errmsg));
}
let t_type = description.t_name.clone();
debug!("T type name in dump = {:?}", t_type);
// Do we use mmap at reload
if self.options.use_mmap().0 {
let datamap_res = DataMap::from_hnswdump::<T>(self.dir.as_path(), &self.basename);
if datamap_res.is_err() {
error!("load_hnsw could not initialize mmap")
} else {
info!("reload using mmap");
self.datamap = Some(datamap_res.unwrap());
}
}
// reloader can use datamap
let layer_point_indexation = self.load_point_indexation(graph_in, &description, data_in)?;
let data_dim = layer_point_indexation.get_data_dimension();
//
let hnsw: Hnsw<T, D> = Hnsw {
max_nb_connection: description.max_nb_connection as usize,
ef_construction: description.ef,
extend_candidates: true,
keep_pruned: false,
max_layer: description.nb_layer as usize,
layer_indexed_points: layer_point_indexation,
data_dimension: data_dim,
dist_f: D::default(),
searching: false,
datamap_opt: true, // set datamap_opt to true
};
//
debug!("load_hnsw completed");
let elapsed_t = start_t.elapsed().unwrap().as_secs() as f32;
info!("reload_hnsw : elapsed system time(s) {}", elapsed_t);
Ok(hnsw)
} // end of load_hnsw
/// reload a previously dumped hnsw structure
/// This function makes reload of a Hnsw dump with a given Dist.
/// It is dedicated to distance of type DistPtr (see crate [anndist](https://crates.io/crates/anndists)) that cannot implement Default.
/// **It is the user responsability to reload with the same function as used in the dump**
///
pub fn load_hnsw_with_dist<'b, 'a, T, D>(&'a self, f: D) -> anyhow::Result<Hnsw<'b, T, D>>
where
T: 'static + Serialize + DeserializeOwned + Clone + Sized + Send + Sync + std::fmt::Debug,
D: Distance<T> + Send + Sync,
'a: 'b,
{
//
debug!("HnswIo::load_hnsw_with_dist");
//
let init = self.init();
if init.is_err() {
return Err(anyhow!("Could not reload hnsw structure"));
}
let mut init = init.unwrap();
//
let data_in = &mut init.datafile;
let graph_in = &mut init.graphfile;
let description = init.descr;
// In datafile , we must read MAGICDATAP and dimension and check
let mut it_slice = [0u8; std::mem::size_of::<u32>()];
data_in.read_exact(&mut it_slice)?;
let magic = u32::from_ne_bytes(it_slice);
assert_eq!(
magic, MAGICDATAP,
"magic not equal to MAGICDATAP in load_point"
);
//
let mut it_slice = [0u8; std::mem::size_of::<usize>()];
data_in.read_exact(&mut it_slice)?;
let dimension = usize::from_ne_bytes(it_slice);
assert_eq!(
dimension, description.dimension,
"data dimension incoherent {:?} {:?} ",
dimension, description.dimension
);
//
let _mode = description.dumpmode;
let distname = description.distname.clone();
// We must ensure that the distance stored matches the one asked for in loading hnsw
// for that we check for short names equality stripping
info!("distance in description = {:?}", distname);
let d_type_name = type_name::<D>().to_string();
let v: Vec<&str> = d_type_name.rsplit_terminator("::").collect();
for s in v {
info!(" distname in generic type argument {:?}", s);
}
if (std::any::TypeId::of::<T>() != std::any::TypeId::of::<NoData>())
&& (d_type_name != distname)
{
// for all types except NoData , distance asked in reload declaration and distance in dump must be equal!
let mut errmsg = String::from("error in distances : dumped distance is : ");
errmsg.push_str(&distname);
errmsg.push_str(" asked distance in loading is : ");
errmsg.push_str(&d_type_name);
error!(" distance in type argument : {:?}", d_type_name);
error!("error , dump is for distance = {:?}", distname);
return Err(anyhow!(errmsg));
}
let t_type = description.t_name.clone();
info!("T type name in dump = {:?}", t_type);
//
//
let layer_point_indexation = self.load_point_indexation(graph_in, &description, data_in)?;
let data_dim = layer_point_indexation.get_data_dimension();
//
let hnsw: Hnsw<T, D> = Hnsw {
max_nb_connection: description.max_nb_connection as usize,
ef_construction: description.ef,
extend_candidates: true,
keep_pruned: false,
max_layer: description.nb_layer as usize,
layer_indexed_points: layer_point_indexation,
data_dimension: data_dim,
dist_f: f,
searching: false,
datamap_opt: false,
};
//
debug!("load_hnsw_with_dist completed");
// We cannot check that the pointer function was the same as the dump
//
Ok(hnsw)
} // end of load_hnsw_with_dist
fn load_point_indexation<'b, 'a, T>(
&'a self,
graph_in: &mut dyn Read,
descr: &Description,
data_in: &mut dyn Read,
) -> anyhow::Result<PointIndexation<'b, T>>
where
T: 'static + Serialize + DeserializeOwned + Clone + Sized + Send + Sync + std::fmt::Debug,
'a: 'b,
{
//
debug!(" in load_point_indexation");
//
// now we check that except for the case NoData, the typename are the sames.
if std::any::TypeId::of::<T>() != std::any::TypeId::of::<NoData>()
&& std::any::type_name::<T>() != descr.t_name
{
error!(
"typename loaded in description {:?} do not correspond to instanciation type {:?}",
descr.t_name,
std::any::type_name::<T>()
);
panic!("incohrent size of T in description");
}
//
let mut points_by_layer: Vec<Vec<Arc<Point<T>>>> =
Vec::with_capacity(NB_LAYER_MAX as usize);
let mut neighbourhood_map: HashMap<PointId, Vec<Vec<Neighbour>>> = HashMap::new();
// load max layer
let mut it_slice = [0u8; ::std::mem::size_of::<u8>()];
graph_in.read_exact(&mut it_slice)?;
let nb_layer = u8::from_ne_bytes(it_slice);
debug!("nb layer {:?}", nb_layer);
if nb_layer > NB_LAYER_MAX {
return Err(anyhow!("inconsistent number of layErrers"));
}
//
let mut nb_points_loaded: usize = 0;
let mut nb_still_to_load = descr.nb_point as i64;
let (use_mmap, max_nbpoint_in_memory) = self.options.use_mmap();
//
for l in 0..nb_layer as usize {
// read and check magic
debug!("loading layer {:?}", l);
let mut it_slice = [0u8; ::std::mem::size_of::<u32>()];
graph_in.read_exact(&mut it_slice)?;
let magic = u32::from_ne_bytes(it_slice);
if magic != MAGICLAYER {
return Err(anyhow!("bad magic at layer beginning"));
}
let mut it_slice = [0u8; ::std::mem::size_of::<usize>()];
graph_in.read_exact(&mut it_slice)?;
let nbpoints = usize::from_ne_bytes(it_slice);
debug!(" layer {:?} , nb points {:?}", l, nbpoints);
let mut vlayer: Vec<Arc<Point<T>>> = Vec::with_capacity(nbpoints);
// load graph and data part of point. Points are dumped in the same order.
for r in 0..nbpoints {
// do we use mmap? for this point. We must load into memory up to threshold points, and we also want the most
// frequently accessed points, i.e those in upper layers! to be physically loaded.
// So we do use mmap from the moment the number of points yet to be loaded is less than threshold.
let point_use_mmap = match use_mmap {
false => false,
true => {
if nb_still_to_load <= max_nbpoint_in_memory as i64 {
if log::log_enabled!(log::Level::Info)
&& nb_still_to_load == max_nbpoint_in_memory as i64
{
info!(
"Switching to points in memory. nb points stiil to load {:?}",
nb_still_to_load
);
}
false
} else {
true
}
}
};
let load_point_res = self.load_point(graph_in, descr, data_in, point_use_mmap);
if let Err(other) = load_point_res {
error!("in load_point_indexation, loading of point {} failed", r);
return Err(anyhow!(other));
}
let load_point_res = load_point_res.unwrap();
let point = load_point_res.0;
let p_id = point.get_point_id();
// some checks
assert_eq!(l, p_id.0 as usize);
if r != p_id.1 as usize {
debug!("Origin= {:?}, p_id = {:?}", point.get_origin_id(), p_id);
debug!("Storing at l {:?}, r {:?}", l, r);
}
assert_eq!(r, p_id.1 as usize);
// store neoghbour info of this point
neighbourhood_map.insert(p_id, load_point_res.1);
vlayer.push(point);
nb_points_loaded += 1;
nb_still_to_load -= 1;
assert!(nb_still_to_load >= 0);
}
points_by_layer.push(vlayer);
}
// at this step all points are loaded , but without their neighbours fileds are not yet initialized
let mut nbp: usize = 0;
for (p_id, neighbours) in &neighbourhood_map {
let point = &points_by_layer[p_id.0 as usize][p_id.1 as usize];
for (l, neighbours) in neighbours.iter().enumerate() {
for n in neighbours {
let n_point = &points_by_layer[n.p_id.0 as usize][n.p_id.1 as usize];
// now n_point is the Arc<Point> corresponding to neighbour n of point,
// construct a corresponding PointWithOrder
let n_pwo = PointWithOrder::<T>::new(n_point, n.distance);
point.neighbours.write()[l].push(Arc::new(n_pwo));
} // end of for n
// must sort
point.neighbours.write()[l].sort_unstable();
} // end of for l
nbp += 1;
if nbp % 500_000 == 0 {
debug!("reloading nb_points neighbourhood completed : {}", nbp);
}
} // end loop in neighbourhood_map
//
// get id of entry_point
// load entry point
info!(
"end of layer loading, allocating PointIndexation, nb points loaded {:?}",
nb_points_loaded
);
//
let mut it_slice = [0u8; std::mem::size_of::<DataId>()];
graph_in.read_exact(&mut it_slice)?;
let origin_id = DataId::from_ne_bytes(it_slice);
//
let mut it_slice = [0u8; ::std::mem::size_of::<u8>()];
graph_in.read_exact(&mut it_slice)?;
let layer = u8::from_ne_bytes(it_slice);
//
let mut it_slice = [0u8; std::mem::size_of::<i32>()];
graph_in.read_exact(&mut it_slice)?;
let rank_in_l = i32::from_ne_bytes(it_slice);
//
info!(
"found entry point, origin_id {:?} , layer {:?}, rank in layer {:?} ",
origin_id, layer, rank_in_l
);
let entry_point = Arc::clone(&points_by_layer[layer as usize][rank_in_l as usize]);
info!(
" loaded entry point, origin_id {:} p_id {:?}",
entry_point.get_origin_id(),
entry_point.get_point_id()
);
//
let point_indexation = PointIndexation {
max_nb_connection: descr.max_nb_connection as usize,
max_layer: NB_LAYER_MAX as usize,
points_by_layer: Arc::new(RwLock::new(points_by_layer)),
layer_g: LayerGenerator::new_with_scale(
descr.max_nb_connection as usize,
descr.level_scale,
NB_LAYER_MAX as usize,
),
nb_point: Arc::new(RwLock::new(nb_points_loaded)), // CAVEAT , we should increase , the whole thing is to be able to increment graph ?
entry_point: Arc::new(RwLock::new(Some(entry_point))),
};
//
debug!("Exiting load_pointIndexation");
Ok(point_indexation)
} // end of load_pointIndexation
//
// Reload a point from a dump.
//
// The graph part is loaded from graph_in file
// the data vector itself is loaded from data_in
//
#[allow(clippy::type_complexity)]
fn load_point<'b, 'a, T>(
&'a self,
graph_in: &mut dyn Read,
descr: &Description,
data_in: &mut dyn Read,
point_use_mmap: bool,
) -> Result<(Arc<Point<'b, T>>, Vec<Vec<Neighbour>>)>
where
T: 'static + DeserializeOwned + Clone + Sized + Send + Sync + std::fmt::Debug,
'a: 'b,
{
//
// debug!(" point load {:?} {:?} ", p_id, origin_id);
// Now for each layer , read neighbours
let load_res = load_point_graph(graph_in, descr);
if load_res.is_err() {
error!("load_point error reading graph data for point p_id");
return Err(anyhow!("error reading graph data for point"));
}
let (origin_id, p_id, neighborhood) = load_res.unwrap();
//
let point = match point_use_mmap {
false => {
let v = load_point_data::<T>(origin_id, data_in, descr);
if v.is_err() {
error!("loading point {:?}", origin_id);
std::process::exit(1);
}
Point::<T>::new(v.unwrap(), origin_id, p_id)
}
true => {
skip_point_data(origin_id, data_in, descr)?; // keep cohrence between data file and graph file!
debug!("constructing point from datamap, dataid : {:?}", origin_id);
let s: Option<&'b [T]> = self.datamap.as_ref().unwrap().get_data::<T>(&origin_id);
Point::<T>::new_from_mmap(s.unwrap(), origin_id, p_id)
}
};
self.nb_point_loaded.fetch_add(1, Ordering::Relaxed);
trace!(
"load_point origin {:?} allocated size {:?}, dim {:?}",
origin_id,
point.get_v().len(),
descr.dimension
);
//
Ok((Arc::new(point), neighborhood))
} // end of load_point
} // end of Hnswio
/// structure describing main parameters for hnsw data and written at the beginning of a dump file.
///
/// Name of distance and type of data must be encoded in the dump file for a coherent reload.
#[repr(C)]
pub struct Description {
/// to keep track of format version (2, 3 or 4, deduced from the magic at reload)
pub format_version: usize,
/// value is 1 for Full 0 for Light
pub dumpmode: u8,
/// max number of connections in layers != 0
pub max_nb_connection: u8,
/// scale used in level sampling (present in dump only for format version 4, 1.0 otherwise at reload)
pub level_scale: f64,
/// number of observed layers
pub nb_layer: u8,
/// search parameter
pub ef: usize,
/// total number of points
pub nb_point: usize,
/// data dimension
pub dimension: usize,
/// name of distance
pub distname: String,
/// T typename
pub t_name: String,
}
impl Description {
    /// The dump of Description consists in :
    /// . The value MAGICDESCR_4 as a u32 (4 u8)
    /// . The type of dump as u8 (1 for Full, 0 for Light)
    /// . max_nb_connection as u8
    /// . level_scale as f64
    /// . nb_layer as u8
    /// . ef (search parameter used in construction) as usize
    /// . nb_point (the number points dumped) as a usize
    /// . dimension of data as a usize
    /// . the name of distance used. (nb bytes as a usize then list of bytes)
    /// . the name of type T. (nb bytes as a usize then list of bytes)
    ///
    /// All values are written with native endianness.
    ///
    /// # Errors
    /// Returns an error, before anything is written, if nb_layer is not NB_LAYER_MAX.
    /// Returns an error if a write fails.
    fn dump<W: Write>(&self, argmode: DumpMode, out: &mut BufWriter<W>) -> Result<i32> {
        info!("in dump of description");
        // check first, so that we do not leave a truncated description in the file
        if self.nb_layer != NB_LAYER_MAX {
            println!("dump of Description, nb_layer != NB_MAX_LAYER");
            return Err(anyhow!("dump of Description, nb_layer != NB_MAX_LAYER"));
        }
        //
        out.write_all(&MAGICDESCR_4.to_ne_bytes())?;
        let mode: u8 = match argmode {
            DumpMode::Full => 1,
            DumpMode::Light => 0,
        };
        // CAVEAT should check mode == self.mode
        out.write_all(&mode.to_ne_bytes())?;
        // dump of max_nb_connection as u8!!
        out.write_all(&self.max_nb_connection.to_ne_bytes())?;
        // with MAGICDESCR_4 we must dump self.level_scale
        out.write_all(&self.level_scale.to_ne_bytes())?;
        //
        out.write_all(&self.nb_layer.to_ne_bytes())?;
        //
        info!("dumping ef {:?}", self.ef);
        out.write_all(&self.ef.to_ne_bytes())?;
        //
        info!("dumping nb point {:?}", self.nb_point);
        out.write_all(&self.nb_point.to_ne_bytes())?;
        //
        info!("dumping dimension of data {:?}", self.dimension);
        out.write_all(&self.dimension.to_ne_bytes())?;
        // dump of distance name : length then bytes
        info!("distance name {:?} ", self.distname);
        out.write_all(&self.distname.len().to_ne_bytes())?;
        out.write_all(self.distname.as_bytes())?;
        // dump of T value typename : length then bytes
        info!("T name {:?} ", self.t_name);
        out.write_all(&self.t_name.len().to_ne_bytes())?;
        out.write_all(self.t_name.as_bytes())?;
        //
        Ok(1)
    } // end of dump

    /// return data typename
    pub fn get_typename(&self) -> String {
        self.t_name.clone()
    }

    /// returns dimension of data
    pub fn get_dimension(&self) -> usize {
        self.dimension
    }
} // end of HnswIO impl for Descr
//
/// This method is internally used by Hnswio.
/// It is made *pub* as it can be used to retrieve the description of a dump.
/// It takes as input the graph part of the dump.
///
/// The format version is deduced from the magic found at the beginning (v2, v3 or v4);
/// the level scale is read only for v4 and is 1.0 otherwise.
///
/// # Errors
/// Returns an error if a read fails, if the magic is unknown, if the length of a name exceeds 256
/// or if a name is not valid utf8.
pub fn load_description(io_in: &mut dyn Read) -> Result<Description> {
    // reads a u8 in native endianness
    fn read_u8(io_in: &mut dyn Read) -> Result<u8> {
        let mut it_slice = [0u8; std::mem::size_of::<u8>()];
        io_in.read_exact(&mut it_slice)?;
        Ok(u8::from_ne_bytes(it_slice))
    }
    // reads a usize in native endianness
    fn read_usize(io_in: &mut dyn Read) -> Result<usize> {
        let mut it_slice = [0u8; std::mem::size_of::<usize>()];
        io_in.read_exact(&mut it_slice)?;
        Ok(usize::from_ne_bytes(it_slice))
    }
    // reads len bytes and converts them to a String, what is used in the error message
    fn read_name(io_in: &mut dyn Read, len: usize, what: &str) -> Result<String> {
        let mut bytes = vec![0u8; len];
        io_in.read_exact(bytes.as_mut_slice())?;
        // bytes come from a file: invalid utf8 is an error to report, not a reason to panic
        String::from_utf8(bytes).map_err(|e| anyhow!("{} is not valid utf8 : {}", what, e))
    }
    //
    let mut it_slice = [0u8; std::mem::size_of::<u32>()];
    io_in.read_exact(&mut it_slice)?;
    let magic = u32::from_ne_bytes(it_slice);
    debug!(" magic {:X} ", magic);
    let format_version: usize = match magic {
        MAGICDESCR_2 => 2,
        MAGICDESCR_3 => 3,
        MAGICDESCR_4 => 4,
        _ => {
            error!("bad magic");
            return Err(anyhow!("bad magic at descr beginning"));
        }
    };
    //
    let dumpmode = read_u8(io_in)?;
    info!(" dumpmode {:?} ", dumpmode);
    //
    let max_nb_connection = read_u8(io_in)?;
    info!(" max_nb_connection {:?} ", max_nb_connection);
    //
    let mut level_scale = 1.0f64;
    if format_version == 4 {
        // we read modification for level sampling
        let mut it_slice = [0u8; std::mem::size_of::<f64>()];
        io_in.read_exact(&mut it_slice)?;
        level_scale = f64::from_ne_bytes(it_slice);
        info!(" level scale : {:.2e}", level_scale);
    }
    //
    let nb_layer = read_u8(io_in)?;
    info!("nb_layer {:?} ", nb_layer);
    // ef
    let ef = read_usize(io_in)?;
    info!("ef {:?} ", ef);
    // nb_point
    let nb_point = read_usize(io_in)?;
    // read dimension
    let dimension = read_usize(io_in)?;
    info!("nb_point {:?} dimension {:?} ", nb_point, dimension);
    // distance name
    let len = read_usize(io_in)?;
    debug!("length of distance name {:?} ", len);
    if len > 256 {
        info!(" length of distance name > 256");
        println!(" length of distance name should not exceed 256");
        return Err(anyhow!("bad length for distance name"));
    }
    let distname = read_name(io_in, len, "distance name")?;
    debug!("distance name {:?} ", distname);
    // reload of type name
    let len = read_usize(io_in)?;
    debug!("length of T name {:?} ", len);
    if len > 256 {
        println!(" length of T name should not exceed 256");
        return Err(anyhow!("bad length for T name"));
    }
    let t_name = read_name(io_in, len, "T name")?;
    debug!("T type name {:?} ", t_name);
    debug!(" end of description load \n");
    //
    Ok(Description {
        format_version,
        dumpmode,
        max_nb_connection,
        level_scale,
        nb_layer,
        ef,
        nb_point,
        dimension,
        distname,
        t_name,
    })
}
//
// dump and load of Point<T>
// ==========================
//
/// Dumps one point: its graph record goes to `graphout`, its data record to `dataout`.
///
/// All values are written in native endianness.
///
/// Graph record, in write order:
/// 1. the value `MAGICPOINT`
/// 2. the origin id of the point, followed in [`DumpMode::Full`] only by its `PointId`
///    (layer, then rank in layer)
/// 3. for each layer of the neighbourhood: the number of neighbours (as a `usize`), then for each
///    neighbour its origin id, in full mode its `PointId`, and last its distance to the point.
///
/// Data record, in write order:
/// 1. the value `MAGICDATAP`
/// 2. the origin id converted to a `u64`
/// 3. the length in bytes of the vector, as a `u64`
/// 4. the in-memory bytes of the vector itself.
///
/// Returns `Ok(1)` when everything has been written; any write error is propagated.
fn dump_point<T: Serialize + Clone + Sized + Send + Sync, W: Write>(
    point: &Point<T>,
    mode: DumpMode,
    graphout: &mut BufWriter<W>,
    dataout: &mut BufWriter<W>,
) -> Result<i32> {
    // the PointId of the point and of its neighbours is only written in full mode
    let full_mode = mode == DumpMode::Full;
    //
    // graph part : magic, then identity of the point
    //
    graphout.write_all(&MAGICPOINT.to_ne_bytes())?;
    graphout.write_all(&point.get_origin_id().to_ne_bytes())?;
    let point_id = point.get_point_id();
    if full_mode {
        graphout.write_all(&point_id.0.to_ne_bytes())?;
        graphout.write_all(&point_id.1.to_ne_bytes())?;
    }
    trace!(" point dump {:?} {:?} ", point_id, point.get_origin_id());
    // every layer of the neighbourhood is written, even one without any neighbour
    let layers = point.get_neighborhood_id();
    for (layer, neighbours) in layers.iter().enumerate() {
        // Caution : the number of neighbours is written as a usize, the reload relies on it
        let nb_neighbours: usize = neighbours.len();
        trace!("\t dumping nbng : {} at l {}", nb_neighbours, layer);
        graphout.write_all(&nb_neighbours.to_ne_bytes())?;
        for neighbour in neighbours {
            // origin id, [layer, rank in layer] in full mode, then distance
            graphout.write_all(&neighbour.d_id.to_ne_bytes())?;
            if full_mode {
                graphout.write_all(&neighbour.p_id.0.to_ne_bytes())?;
                graphout.write_all(&neighbour.p_id.1.to_ne_bytes())?;
            }
            graphout.write_all(&neighbour.distance.to_ne_bytes())?;
        }
    }
    //
    // data part : magic, origin id, length in bytes, raw bytes of the vector
    //
    dataout.write_all(&MAGICDATAP.to_ne_bytes())?;
    let origin_id_64 = point.get_origin_id() as u64;
    dataout.write_all(&origin_id_64.to_ne_bytes())?;
    // SAFETY: the pointer and the byte count both come from the same live slice returned by
    // `get_v`, so the byte view covers exactly that memory and is only read while `point` is
    // borrowed. NOTE(review): this assumes T is plain data without padding bytes — confirm.
    let raw_bytes = unsafe {
        std::slice::from_raw_parts(
            point.get_v().as_ptr() as *const u8,
            std::mem::size_of_val(point.get_v()),
        )
    };
    trace!("serializing len {:?}", raw_bytes.len());
    let nb_bytes_64 = raw_bytes.len() as u64;
    dataout.write_all(&nb_bytes_64.to_ne_bytes())?;
    dataout.write_all(raw_bytes)?;
    //
    Ok(1)
} // end of dump for Point<T>
/// Reloads the data vector of one point from the stream `data_in`.
///
/// Used when the data file is not memory mapped at reload. The record read is the one written
/// by `dump_point`: `MAGICDATAP` (u32), the origin id (u64), the length in bytes of the vector
/// (u64) and then the bytes of the vector.
///
/// - `origin_id` is the id found in the graph file; it must match the id stored in the data record.
/// - `descr` gives the format version of the dump and the dimension of the vectors.
///
/// When `T` is `NoData` the record is still consumed, so that the stream stays in phase, but an
/// empty vector is returned.
///
/// # Errors
/// Returns an error (instead of panicking or exiting the process) if the stream ends too early,
/// if the magic or the origin id do not match, if the record is too short to hold
/// `descr.dimension` values of type `T`, if bincode decoding fails (format version 2) or if the
/// format version is unknown.
fn load_point_data<T>(
    origin_id: usize,
    data_in: &mut dyn Read,
    descr: &Description,
) -> Result<Vec<T>>
where
    T: 'static + DeserializeOwned + Clone + Sized + Send + Sync,
{
    //
    trace!("load_point_data , origin id : {}", origin_id);
    // corrupted content is reported to the caller as an InvalidData io error
    let invalid_data = |msg: String| std::io::Error::new(std::io::ErrorKind::InvalidData, msg);
    //
    // check magic
    //
    let mut it_slice = [0u8; std::mem::size_of::<u32>()];
    data_in.read_exact(&mut it_slice)?;
    let magic = u32::from_ne_bytes(it_slice);
    if magic != MAGICDATAP {
        return Err(invalid_data(format!(
            "magic not equal to MAGICDATAP in load_point, point_id : {:?} ",
            origin_id
        ))
        .into());
    }
    // read origin id and check coherence with the graph file
    let mut it_slice = [0u8; std::mem::size_of::<u64>()];
    data_in.read_exact(&mut it_slice)?;
    let origin_id_data = u64::from_ne_bytes(it_slice) as usize;
    if origin_id != origin_id_data {
        return Err(invalid_data(format!(
            "origin_id incoherent between graph and data, graph : {}, data : {}",
            origin_id, origin_id_data
        ))
        .into());
    }
    // now read data. The length in bytes is stored as a u64 just before the vector
    let mut it_slice = [0u8; std::mem::size_of::<u64>()];
    data_in.read_exact(&mut it_slice)?;
    let serialized_len = u64::from_ne_bytes(it_slice);
    trace!("serialized len to reload {:?}", serialized_len);
    let mut v_serialized = vec![0; serialized_len as usize];
    data_in.read_exact(&mut v_serialized)?;
    let v: Vec<T> = if std::any::TypeId::of::<T>() != std::any::TypeId::of::<NoData>() {
        match descr.format_version {
            2 => bincode::deserialize::<Vec<T>>(&v_serialized).map_err(|e| {
                invalid_data(format!(
                    "bincode deserialization failed for point {} : {}",
                    origin_id, e
                ))
            })?,
            3 | 4 => {
                // the record must hold at least `dimension` values of type T
                let nb_bytes_needed = descr
                    .dimension
                    .checked_mul(std::mem::size_of::<T>())
                    .ok_or_else(|| {
                        invalid_data(format!(
                            "dimension {} overflows the size of a data vector",
                            descr.dimension
                        ))
                    })?;
                if v_serialized.len() < nb_bytes_needed {
                    return Err(invalid_data(format!(
                        "data record of point {} is too short : got {} bytes, need {}",
                        origin_id,
                        v_serialized.len(),
                        nb_bytes_needed
                    ))
                    .into());
                }
                let mut values: Vec<T> = Vec::with_capacity(descr.dimension);
                // SAFETY: the source holds at least `nb_bytes_needed` bytes (checked above) and
                // the destination was allocated for `descr.dimension` values of T, i.e. exactly
                // `nb_bytes_needed` bytes; the two buffers are distinct allocations. Copying
                // bytes into the Vec<T> allocation avoids reading T through the byte buffer,
                // which has no alignment guarantee. The bytes are a raw dump of T values as
                // written by `dump_point`, so this assumes T is plain data.
                unsafe {
                    std::ptr::copy_nonoverlapping(
                        v_serialized.as_ptr(),
                        values.as_mut_ptr() as *mut u8,
                        nb_bytes_needed,
                    );
                    values.set_len(descr.dimension);
                }
                values
            }
            _ => {
                error!(
                    "error in load_point, unknow format_version : {:?}",
                    descr.format_version
                );
                return Err(invalid_data(format!(
                    "unknown format_version : {:?}",
                    descr.format_version
                ))
                .into());
            }
        }
    } else {
        Vec::new()
    };
    //
    Ok(v)
} // end of load_point_data
/// Consumes the data record of one point without keeping its vector.
///
/// Data and graph streams must stay in phase, so the record is read even when the data itself
/// is not needed. The magic and the origin id of the record are still checked against
/// `MAGICDATAP` and `origin_id`.
///
/// # Errors
/// Returns an error (instead of panicking) if the magic or the origin id do not match, or if
/// the stream ends before the whole record has been consumed.
fn skip_point_data(origin_id: usize, data_in: &mut dyn Read, _descr: &Description) -> Result<()> {
    // corrupted content is reported to the caller as an InvalidData io error
    let invalid_data = |msg: String| std::io::Error::new(std::io::ErrorKind::InvalidData, msg);
    //
    let mut it_slice = [0u8; std::mem::size_of::<u32>()];
    data_in.read_exact(&mut it_slice)?;
    let magic = u32::from_ne_bytes(it_slice);
    if magic != MAGICDATAP {
        return Err(invalid_data(format!(
            "magic not equal to MAGICDATAP in load_point, point_id : {:?} ",
            origin_id
        ))
        .into());
    }
    // read origin id
    let mut it_slice = [0u8; std::mem::size_of::<u64>()];
    data_in.read_exact(&mut it_slice)?;
    let origin_id_data = u64::from_ne_bytes(it_slice) as usize;
    if origin_id != origin_id_data {
        return Err(invalid_data(format!(
            "origin_id incoherent between graph and data, graph : {}, data : {}",
            origin_id, origin_id_data
        ))
        .into());
    }
    //
    // the length in bytes of the vector is stored as a u64 just before the vector
    let mut it_slice = [0u8; std::mem::size_of::<u64>()];
    data_in.read_exact(&mut it_slice)?;
    let serialized_len = u64::from_ne_bytes(it_slice);
    trace!(
        "skip_point_data : serialized len to reload {:?}",
        serialized_len
    );
    // The payload is drained through a bounded reader: no buffer is allocated from a length
    // that comes from the file and could be corrupted.
    let mut payload = Read::take(&mut *data_in, serialized_len);
    let nb_skipped = std::io::copy(&mut payload, &mut std::io::sink())?;
    if nb_skipped != serialized_len {
        return Err(std::io::Error::new(
            std::io::ErrorKind::UnexpectedEof,
            format!(
                "data record of point {} is truncated : skipped {} bytes out of {}",
                origin_id, nb_skipped, serialized_len
            ),
        )
        .into());
    }
    //
    Ok(())
} // end of skip_point_data
//==================================================================================
/// Information reloaded from the dumped graph file for one point, as returned by
/// `load_point_graph`: the origin id of the point, its `PointId`, and for each layer the list
/// of its neighbours.
type PointGraphInfo = (usize, PointId, Vec<Vec<Neighbour>>);
/// Reads the graph record of one point and returns its identity and its neighbourhood.
///
/// The stream `graph_in` must be positioned at the beginning of the record, i.e. on the
/// `MAGICPOINT` value. The record is read as: magic (u32), origin id (`DataId`), layer (u8),
/// rank in layer (i32), then for each of the `descr.nb_layer` layers the number of neighbours
/// (usize) followed by the neighbours. A neighbour is its origin id, its `PointId` (only when
/// `descr.dumpmode == 1`, i.e. full mode) and its distance (f32).
///
/// The returned neighbourhood is completed with empty layers from `descr.nb_layer` up to
/// `NB_LAYER_MAX`.
///
/// # Errors
/// Returns an error if the magic is not `MAGICPOINT` or if the stream cannot provide the
/// expected bytes (a truncated file no longer causes a panic).
fn load_point_graph(graph_in: &mut dyn Read, descr: &Description) -> Result<PointGraphInfo> {
    /// Reads exactly `N` bytes from `input`, propagating any io error.
    fn read_array<const N: usize>(input: &mut dyn Read) -> Result<[u8; N]> {
        let mut buffer = [0u8; N];
        input.read_exact(&mut buffer)?;
        Ok(buffer)
    }
    //
    trace!("in load_point_graph");
    // read and check magic
    let magic = u32::from_ne_bytes(read_array(graph_in)?);
    if magic != MAGICPOINT {
        error!("got instead of MAGICPOINT {:x}", magic);
        return Err(anyhow!("bad magic at point beginning"));
    }
    let origin_id = DataId::from_ne_bytes(read_array(graph_in)?);
    //
    // read point_id
    // NOTE(review): layer and rank are read whatever the dump mode, whereas `dump_point` writes
    // them only in full mode; a dump made in another mode would be misread here — confirm that
    // only full mode dumps are reloaded.
    let layer = u8::from_ne_bytes(read_array(graph_in)?);
    let rank_in_l = i32::from_ne_bytes(read_array(graph_in)?);
    let p_id = PointId(layer, rank_in_l);
    debug!(
        "in load_point_graph, got origin_id : {}, p_id : {:?}",
        origin_id, p_id
    );
    //
    // Now for each layer , read neighbours
    let nb_layer = descr.nb_layer;
    let mut neighborhood = Vec::<Vec<Neighbour>>::with_capacity(NB_LAYER_MAX as usize);
    for _l in 0..nb_layer {
        let mut neighbour: Neighbour = Default::default();
        // CAUTION : the number of neighbours is stored as a usize
        let nb_neighbours = usize::from_ne_bytes(read_array(graph_in)?);
        // The count comes from the file: it is only used as a bounded allocation hint, so that
        // a corrupted value ends in a read error below instead of a failed allocation.
        let capacity_hint = nb_neighbours.min(2 * descr.max_nb_connection as usize);
        let mut neighborhood_l: Vec<Neighbour> = Vec::with_capacity(capacity_hint);
        for _j in 0..nb_neighbours {
            neighbour.d_id = DataId::from_ne_bytes(read_array(graph_in)?);
            if descr.dumpmode == 1 {
                neighbour.p_id.0 = u8::from_ne_bytes(read_array(graph_in)?);
                neighbour.p_id.1 = i32::from_ne_bytes(read_array(graph_in)?);
            }
            neighbour.distance = f32::from_ne_bytes(read_array(graph_in)?);
            // the neighbour is now complete, store it for this layer
            neighborhood_l.push(neighbour);
        }
        neighborhood.push(neighborhood_l);
    }
    // layers above those dumped get an empty neighbourhood
    for _l in nb_layer..NB_LAYER_MAX {
        neighborhood.push(Vec::<Neighbour>::new());
    }
    //
    let point_grap_info = (origin_id, p_id, neighborhood);
    //
    Ok(point_grap_info)
} // end of load_point_graph
//
// dump and load of PointIndexation<T>
// ===================================
//
// Layout written in the graph file:
//   nb_layer : u8
//   then for each layer, in increasing layer index:
//     . a magic (MAGICLAYER)
//     . the number of points in the layer (usize)
//     . the dump of each point of the layer
//   and last the entry point : its origin id and, in full mode, its PointId
//
impl<T: Serialize + DeserializeOwned + Clone + Send + Sync> HnswIoT for PointIndexation<'_, T> {
    /// Dumps all the points, layer by layer, then the identity of the entry point.
    ///
    /// The graph part goes to `dumpinit.graph_out` and the data vectors to `dumpinit.data_out`.
    /// Returns `Ok(1)` on success, and an error if a write fails or if the entry point is not
    /// initialized.
    ///
    /// # Panics
    /// Panics if a point does not have the `PointId` corresponding to its position
    /// (layer, rank in layer) in the indexation.
    fn dump(&self, mode: DumpMode, dumpinit: &mut DumpInit) -> Result<i32> {
        let graph_writer = &mut dumpinit.graph_out;
        let data_writer = &mut dumpinit.data_out;
        // number of layers first
        let layers = self.points_by_layer.read();
        let nb_layer = layers.len() as u8;
        graph_writer.write_all(&nb_layer.to_ne_bytes())?;
        // then each layer, in increasing layer index
        for (layer_rank, layer) in layers.iter().enumerate() {
            let nb_point = layer.len();
            debug!("dumping layer {:?}, nb_point {:?}", layer_rank, nb_point);
            graph_writer.write_all(&MAGICLAYER.to_ne_bytes())?;
            graph_writer.write_all(&nb_point.to_ne_bytes())?;
            for (rank_in_layer, point) in layer.iter().enumerate() {
                assert_eq!(
                    point.get_point_id(),
                    PointId(layer_rank as u8, rank_in_layer as i32)
                );
                dump_point(point, mode, graph_writer, data_writer)?;
            }
        }
        // and last the identity of the entry point
        let entry_guard = self.entry_point.read();
        let entry_point = entry_guard
            .as_ref()
            .ok_or_else(|| anyhow!("entry point not initialized"))?;
        graph_writer.write_all(&entry_point.get_origin_id().to_ne_bytes())?;
        let entry_id = entry_point.get_point_id();
        if mode == DumpMode::Full {
            graph_writer.write_all(&entry_id.0.to_ne_bytes())?;
            graph_writer.write_all(&entry_id.1.to_ne_bytes())?;
        }
        info!(
            "dumped entry_point origin_d {:?}, p_id {:?} ",
            entry_point.get_origin_id(),
            entry_id
        );
        //
        Ok(1)
    } // end of dump for PointIndexation<T>
} // end of impl HnswIO
//
// dump and load of Hnsw<T>
// =========================
//
//
impl<T: Serialize + DeserializeOwned + Clone + Sized + Send + Sync, D: Distance<T> + Send + Sync>
    HnswIoT for Hnsw<'_, T, D>
{
    /// Dumps the whole Hnsw structure.
    ///
    /// In order: the description is written to `dumpinit.graph_out`, a header (`MAGICDATAP`
    /// then the data dimension) is written to `dumpinit.data_out`, and finally the points are
    /// dumped through the point indexation.
    ///
    /// Returns `Ok(1)` on success; the first error met is propagated.
    fn dump(&self, mode: DumpMode, dumpinit: &mut DumpInit) -> anyhow::Result<i32> {
        // the mode is stored in the description as 1 for Full and 0 otherwise
        let mode_flag: u8 = if mode == DumpMode::Full { 1 } else { 0 };
        let data_dimension: usize = self.layer_indexed_points.get_data_dimension();
        let level_scale = self.layer_indexed_points.get_level_scale();
        let description = Description {
            format_version: 3,
            dumpmode: mode_flag,
            max_nb_connection: self.get_max_nb_connection(),
            level_scale,
            nb_layer: self.get_max_level() as u8,
            ef: self.get_ef_construction(),
            nb_point: self.get_nb_point(),
            dimension: data_dimension,
            distname: self.get_distance_name(),
            t_name: type_name::<T>().to_string(),
        };
        debug!("dump obtained typename {:?}", type_name::<T>());
        // description first, in the graph file
        description.dump(mode, &mut dumpinit.graph_out)?;
        // the data file begins with its own header
        dumpinit.data_out.write_all(&MAGICDATAP.to_ne_bytes())?;
        dumpinit.data_out.write_all(&data_dimension.to_ne_bytes())?;
        // then all the points
        self.layer_indexed_points.dump(mode, dumpinit)?;
        Ok(1)
    }
} // end impl block for Hnsw
//===============================================================================================================
#[cfg(test)]
mod tests {
    use super::*;
    pub use crate::api::AnnT;
    use anndists::dist;
    use log::error;
    use rand::distr::{Distribution, Uniform};

    /// Initializes the logger for tests; a second initialization is silently ignored.
    fn log_init_test() {
        let _ = env_logger::builder().is_test(true).try_init();
    }

    /// L1 distance, used to test the reload with a distance given as a function pointer.
    fn my_fn(v1: &[f32], v2: &[f32]) -> f32 {
        let norm_l1: f32 = v1.iter().zip(v2.iter()).map(|t| (*t.0 - *t.1).abs()).sum();
        norm_l1
    }

    /// Generates `nb_vec` vectors of `dim` values, each value sampled from
    /// `Uniform::new(0., 1.)` with the thread random generator.
    fn random_vectors(nb_vec: usize, dim: usize) -> Vec<Vec<f32>> {
        let mut rng = rand::rng();
        let unif = Uniform::<f32>::new(0., 1.).unwrap();
        (0..nb_vec)
            .map(|_| {
                (0..dim)
                    .map(|_| unif.sample(&mut rng))
                    .collect::<Vec<f32>>()
            })
            .collect()
    }

    /// Dumps a Hnsw built with DistL1 and checks the reloaded graph is equal to the original.
    #[test]
    fn test_dump_reload_1() {
        println!("\n\n test_dump_reload_1");
        log_init_test();
        // 1000 random vectors of size 10 f32
        let nbcolumn = 1000;
        let nbrow = 10;
        let data = random_vectors(nbcolumn, nbrow);
        // define hnsw
        let ef_construct = 25;
        let nb_connection = 10;
        let hnsw = Hnsw::<f32, dist::DistL1>::new(
            nb_connection,
            nbcolumn,
            16,
            ef_construct,
            dist::DistL1 {},
        );
        for (i, d) in data.iter().enumerate() {
            hnsw.insert((d, i));
        }
        // some loggin info
        hnsw.dump_layer_info();
        // dump in a file. Must take care of name as tests runs in // !!!
        let fname = "dumpreloadtest1";
        let directory = tempfile::tempdir().unwrap();
        // a failed dump must stop the test here, not later at reload
        hnsw.file_dump(directory.path(), fname)
            .expect("hnsw.file_dump failed");
        //
        // reload
        debug!("\n\n test_dump_reload_1 hnsw reload");
        // we will need a procedural macro to get from distance name to its instanciation.
        // from now on we test with DistL1
        let mut reloader = HnswIo::new(directory.path(), fname);
        let hnsw_loaded: Hnsw<f32, DistL1> = reloader.load_hnsw::<f32, DistL1>().unwrap();
        // test equality
        check_graph_equality(&hnsw_loaded, &hnsw);
    } // end of test_dump_reload

    /// Dumps a Hnsw using a function pointer distance and reloads it by providing the distance.
    #[test]
    fn test_dump_reload_myfn() {
        println!("\n\n test_dump_reload_myfn");
        log_init_test();
        // 1000 random vectors of size 10 f32
        let nbcolumn = 1000;
        let nbrow = 10;
        let data = random_vectors(nbcolumn, nbrow);
        // define hnsw
        let ef_construct = 25;
        let nb_connection = 10;
        let mydist = dist::DistPtr::<f32, f32>::new(my_fn);
        let hnsw = Hnsw::<f32, dist::DistPtr<f32, f32>>::new(
            nb_connection,
            nbcolumn,
            16,
            ef_construct,
            mydist,
        );
        for (i, d) in data.iter().enumerate() {
            hnsw.insert((d, i));
        }
        // some loggin info
        hnsw.dump_layer_info();
        let fname = "dumpreloadtest_myfn";
        let directory = tempfile::tempdir().unwrap();
        // This dumps in 2 files, one for the graph and one for the data.
        // A failed dump must stop the test here, not later at reload.
        hnsw.file_dump(directory.path(), fname)
            .expect("hnsw.file_dump failed");
        //
        // reload
        debug!("HNSW reload");
        let reloader = HnswIo::new(directory.path(), fname);
        let mydist = dist::DistPtr::<f32, f32>::new(my_fn);
        let _hnsw_loaded: Hnsw<f32, DistPtr<f32, f32>> =
            reloader.load_hnsw_with_dist(mydist).unwrap();
    } // end of test_dump_reload_myfn

    /// Reloads only the graph (types NoData / NoDist) and checks it is equal to the original.
    #[test]
    fn test_dump_reload_graph_only() {
        println!("\n\n test_dump_reload_graph_only");
        log_init_test();
        // 1000 random vectors of size 10 f32
        let nbcolumn = 1000;
        let nbrow = 10;
        let data = random_vectors(nbcolumn, nbrow);
        // define hnsw
        let ef_construct = 25;
        let nb_connection = 10;
        let hnsw = Hnsw::<f32, dist::DistL1>::new(
            nb_connection,
            nbcolumn,
            16,
            ef_construct,
            dist::DistL1 {},
        );
        for (i, d) in data.iter().enumerate() {
            hnsw.insert((d, i));
        }
        // some loggin info
        hnsw.dump_layer_info();
        // dump in a file. Must take care of name as tests runs in // !!!
        let fname = "dumpreloadtestgraph";
        let directory = tempfile::tempdir().unwrap();
        // This dumps in 2 files, one for the graph and one for the data.
        // A failed dump must stop the test here, not later at reload.
        hnsw.file_dump(directory.path(), fname)
            .expect("hnsw.file_dump failed");
        //
        // reload
        debug!("\n\n hnsw reload");
        let mut reloader = HnswIo::new(directory.path(), fname);
        let hnsw_loaded: Hnsw<NoData, NoDist> = reloader.load_hnsw().unwrap();
        // test equality
        check_graph_equality(&hnsw_loaded, &hnsw);
    } // end of test_dump_reload

    /// Reloads a dump with memory mapping of a part of the data, inserts new data and dumps again.
    #[test]
    fn reload_with_mmap() {
        println!("\n\n hnswio tests : reload_with_mmap");
        log_init_test();
        // 100 random vectors of size 10 f32
        let nbcolumn = 100;
        let nbrow = 10;
        let data = random_vectors(nbcolumn, nbrow);
        //
        let first: Vec<f32> = data[0].clone();
        info!("data[0] = {:?}", first);
        // define hnsw
        let ef_construct = 25;
        let nb_connection = 10;
        let hnsw = Hnsw::<f32, dist::DistL1>::new(
            nb_connection,
            nbcolumn,
            16,
            ef_construct,
            dist::DistL1 {},
        );
        for (i, d) in data.iter().enumerate() {
            hnsw.insert((d, i));
        }
        // some loggin info
        hnsw.dump_layer_info();
        // dump in a file. Must take care of name as tests runs in // !!!
        let fname = "mmapreloadtest";
        let directory = tempfile::tempdir().unwrap();
        let dumpname = hnsw.file_dump(directory.path(), fname).unwrap();
        debug!("dump succeeded in file basename : {}", dumpname);
        //
        // reload reload_with_mmap
        debug!("HNSW reload");
        let mut reloader = HnswIo::new(directory.path(), &dumpname);
        // use mmap for points after half number of points
        let options = ReloadOptions::default().set_mmap_threshold(nbcolumn / 2);
        reloader.set_options(options);
        let hnsw_loaded: Hnsw<f32, DistL1> = reloader.load_hnsw::<f32, DistL1>().unwrap();
        // test equality
        check_graph_equality(&hnsw_loaded, &hnsw);
        // We add 5 new vectors
        // NOTE(review): the new points are inserted in, and searched from, the original `hnsw`
        // and not `hnsw_loaded` — confirm this is what the test intends.
        info!("adding points in hnsw reloaded");
        let nbcolumn = 5;
        let nbrow = 10;
        let data = random_vectors(nbcolumn, nbrow);
        let first_with_mmap: Vec<f32> = data[0].clone();
        info!(
            "first added after reloading with mmap : data[0] = {:?}",
            first_with_mmap
        );
        let nb_in = hnsw.get_nb_point();
        for (i, d) in data.iter().enumerate() {
            hnsw.insert((d, i + nb_in));
        }
        //
        let search_res = hnsw.search(&first, 5, ef_construct);
        info!("neighbours od first point inserted");
        for n in &search_res {
            info!("neighbour: {:?}", n);
        }
        assert_eq!(search_res[0].d_id, 0);
        assert_eq!(search_res[0].distance, 0.);
        let search_res = hnsw.search(&first_with_mmap, 5, ef_construct);
        info!("neighbours of first point inserted after reload with mmap");
        for n in &search_res {
            info!("neighbour {:?}", n);
        }
        if search_res[0].d_id != nb_in {
            // with very low probability it could happen that we find a very near point!
            // then distance should very small
            info!(
                "neighbour found for point id : {}, distance : {:.2e}, should have been id : {}, dist : {:.2e}",
                search_res[0].d_id, search_res[0].distance, nb_in, 0.
            );
        }
        assert_eq!(search_res[0].d_id, nb_in);
        assert_eq!(search_res[0].distance, 0.);
        //
        // TODO: redump and care about mmapped file, so we do not overwrite
        //
        let dump_init = DumpInit::new(directory.path(), fname, false);
        info!("will use basename : {}", dump_init.get_basename());
        let res = hnsw.file_dump(directory.path(), dump_init.get_basename());
        if res.is_err() {
            error!("hnsw.file_dump failed");
            std::panic!("hnsw.file_dump failed");
        }
    } // end of reload_with_mmap

    /// Round trip of a vector of f32 through bincode.
    #[test]
    fn test_bincode() {
        let mut rng = rand::rng();
        let unif = Uniform::<f32>::new(0., 1.).unwrap();
        let size = 10;
        let mut xsi;
        let mut data = Vec::with_capacity(size);
        for _ in 0..size {
            xsi = unif.sample(&mut rng);
            println!("xsi = {:?}", xsi);
            data.push(xsi);
        }
        println!("to serialized {:?}", data);
        let v_serialized: Vec<u8> = bincode::serialize(&data).unwrap();
        debug!("serializing len {:?}", v_serialized.len());
        let v_deserialized: Vec<f32> = bincode::deserialize(&v_serialized).unwrap();
        println!("deserialized {:?}", v_deserialized);
    }

    /// Dump and reload of a Hnsw without any point: the reload must report an error.
    #[test]
    fn read_write_empty_db() -> Result<()> {
        log_init_test();
        let ef_construct = 25;
        let nb_connection = 10;
        let hnsw =
            Hnsw::<f32, dist::DistL1>::new(nb_connection, 0, 16, ef_construct, dist::DistL1 {});
        let fname = "empty_db";
        let directory = tempfile::tempdir()?;
        // The result of the dump is ignored on purpose here: the test only requires that the
        // reload reports an error.
        let _res = hnsw.file_dump(directory.path(), fname);
        let mut reloader = HnswIo::new(directory.path(), fname);
        let hnsw_loaded_res = reloader.load_hnsw::<f32, DistL1>();
        assert!(hnsw_loaded_res.is_err());
        Ok(())
    }
} // end module tests