Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'
This commit is contained in:
87
vendor/ruvector/scripts/patches/hnsw_rs/src/api.rs
vendored
Normal file
87
vendor/ruvector/scripts/patches/hnsw_rs/src/api.rs
vendored
Normal file
@@ -0,0 +1,87 @@
|
||||
//! Api for external language.
|
||||
//! This file provides a trait to be used as an opaque pointer for C or Julia calls used in file libext.rs
|
||||
|
||||
use std::path::Path;
|
||||
|
||||
use serde::{de::DeserializeOwned, Serialize};
|
||||
|
||||
use crate::hnsw::*;
|
||||
use crate::hnswio::*;
|
||||
use anndists::dist::distances::Distance;
|
||||
use log::info;
|
||||
|
||||
pub trait AnnT {
|
||||
/// type of data vectors
|
||||
type Val;
|
||||
//
|
||||
fn insert_data(&mut self, data: &[Self::Val], id: usize);
|
||||
//
|
||||
fn search_neighbours(&self, data: &[Self::Val], knbn: usize, ef_s: usize) -> Vec<Neighbour>;
|
||||
//
|
||||
fn parallel_insert_data(&mut self, data: &[(&Vec<Self::Val>, usize)]);
|
||||
//
|
||||
fn parallel_search_neighbours(
|
||||
&self,
|
||||
data: &[Vec<Self::Val>],
|
||||
knbn: usize,
|
||||
ef_s: usize,
|
||||
) -> Vec<Vec<Neighbour>>;
|
||||
///
|
||||
/// dumps a data and graph in 2 files.
|
||||
/// Datas are dumped in file filename.hnsw.data and graph in filename.hnsw.graph
|
||||
///
|
||||
/// **We do not overwrite old files if they are currently in use by memory map**
|
||||
/// If these files already exist , they are not overwritten and a unique filename is generated by concatenating a random number to filename.
|
||||
/// The function returns the basename used for the dump
|
||||
fn file_dump(&self, path: &Path, file_basename: &str) -> anyhow::Result<String>;
|
||||
}
|
||||
|
||||
impl<T, D> AnnT for Hnsw<'_, T, D>
|
||||
where
|
||||
T: Serialize + DeserializeOwned + Clone + Send + Sync,
|
||||
D: Distance<T> + Send + Sync,
|
||||
{
|
||||
type Val = T;
|
||||
//
|
||||
fn insert_data(&mut self, data: &[Self::Val], id: usize) {
|
||||
self.insert((data, id));
|
||||
}
|
||||
//
|
||||
fn search_neighbours(&self, data: &[T], knbn: usize, ef_s: usize) -> Vec<Neighbour> {
|
||||
self.search(data, knbn, ef_s)
|
||||
}
|
||||
fn parallel_insert_data(&mut self, data: &[(&Vec<Self::Val>, usize)]) {
|
||||
self.parallel_insert(data);
|
||||
}
|
||||
|
||||
fn parallel_search_neighbours(
|
||||
&self,
|
||||
data: &[Vec<Self::Val>],
|
||||
knbn: usize,
|
||||
ef_s: usize,
|
||||
) -> Vec<Vec<Neighbour>> {
|
||||
self.parallel_search(data, knbn, ef_s)
|
||||
}
|
||||
|
||||
// The main entry point to do a dump.
|
||||
// It will generate two files one for the graph part of the data. The other for the real data points of the structure.
|
||||
// The names of file are $filename.hnsw.graph for the graph and $filename.hnsw.data.
|
||||
fn file_dump(&self, path: &Path, file_basename: &str) -> anyhow::Result<String> {
|
||||
info!("In Hnsw::file_dump");
|
||||
//
|
||||
// do not overwrite if mmap is active
|
||||
let overwrite = !self.get_datamap_opt();
|
||||
let mut dumpinit = DumpInit::new(path, file_basename, overwrite);
|
||||
let dumpname = dumpinit.get_basename().clone();
|
||||
//
|
||||
let res = self.dump(DumpMode::Full, &mut dumpinit);
|
||||
//
|
||||
dumpinit.flush()?;
|
||||
info!("\n End of dump, file basename : {}\n", &dumpname);
|
||||
if res.is_ok() {
|
||||
Ok(dumpname)
|
||||
} else {
|
||||
Err(anyhow::anyhow!("unexpected error"))
|
||||
}
|
||||
} // end of dump
|
||||
} // end of impl block AnnT for Hnsw<T,D>
|
||||
457
vendor/ruvector/scripts/patches/hnsw_rs/src/datamap.rs
vendored
Normal file
457
vendor/ruvector/scripts/patches/hnsw_rs/src/datamap.rs
vendored
Normal file
@@ -0,0 +1,457 @@
|
||||
//! This module provides a memory mapping of Data vectors filling the Hnsw structure.
|
||||
//! It is used by the module [hnswio] and also gives access to an iterator over data without loading the graph.
|
||||
//!
|
||||
//! We mmap the file and provide
|
||||
//! - a Hashmap from DataId to address
|
||||
//! - an interface for retrieving just data vectors loaded in the hnsw structure.
|
||||
|
||||
use std::io::BufReader;
|
||||
|
||||
use std::fs::{File, OpenOptions};
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use indexmap::map::IndexMap;
|
||||
use log::{debug, error, info, trace};
|
||||
use mmap_rs::{Mmap, MmapOptions};
|
||||
|
||||
use crate::hnsw::DataId;
|
||||
use crate::hnswio;
|
||||
|
||||
use crate::hnswio::MAGICDATAP;
|
||||
/// This structure uses the data part of the dump of a Hnsw structure to retrieve the data.
|
||||
/// The data is access via a mmap of the data file, so memory is spared at the expense of page loading.
|
||||
// possibly to be used in graph to spare memory?
|
||||
pub struct DataMap {
|
||||
/// File containing Points data
|
||||
_datapath: PathBuf,
|
||||
/// The mmap structure
|
||||
mmap: Mmap,
|
||||
/// map a dataId to an address where we get a bson encoded vector of type T
|
||||
hmap: IndexMap<DataId, usize>,
|
||||
/// type name of Data
|
||||
t_name: String,
|
||||
/// dimension of data vector
|
||||
dimension: usize,
|
||||
//
|
||||
distname: String,
|
||||
} // end of DataMap
|
||||
|
||||
impl DataMap {
|
||||
// TODO: specifiy mmap option
|
||||
/// The fname argument corresponds to the basename of the dump.
|
||||
/// To reload from file fname.hnsw.data just pass fname as argument.
|
||||
/// The dir argument is the directory where the fname.hnsw.data and fname.hnsw.graph reside.
|
||||
pub fn from_hnswdump<T: std::fmt::Debug>(
|
||||
dir: &Path,
|
||||
file_name: &str,
|
||||
) -> Result<DataMap, String> {
|
||||
// reload description to have data type, and check for dump version
|
||||
let mut graphpath = PathBuf::from(dir);
|
||||
graphpath.push(dir);
|
||||
let mut filename = file_name.to_string();
|
||||
filename.push_str(".hnsw.graph");
|
||||
graphpath.push(filename);
|
||||
let graphfileres = OpenOptions::new().read(true).open(&graphpath);
|
||||
if graphfileres.is_err() {
|
||||
println!("DataMap: could not open file {:?}", graphpath.as_os_str());
|
||||
std::process::exit(1);
|
||||
}
|
||||
let graphfile = graphfileres.unwrap();
|
||||
let mut graph_in = BufReader::new(graphfile);
|
||||
// we need to call load_description first to get distance name
|
||||
let hnsw_description = hnswio::load_description(&mut graph_in).unwrap();
|
||||
if hnsw_description.format_version <= 2 {
|
||||
let msg = String::from(
|
||||
"from_hnsw::from_hnsw : data mapping is only possible for dumps with the version > 0.1.19 of this crate",
|
||||
);
|
||||
error!(
|
||||
"Data mapping is only possible for dumps with the version > 0.1.19 of this crate"
|
||||
);
|
||||
return Err(msg);
|
||||
}
|
||||
let distname = hnsw_description.distname.clone();
|
||||
let t_name = hnsw_description.get_typename();
|
||||
// check typename coherence
|
||||
info!("Got typename from reload : {:?}", t_name);
|
||||
if std::any::type_name::<T>() != t_name {
|
||||
error!(
|
||||
"Description has typename {:?}, function type argument is : {:?}",
|
||||
t_name,
|
||||
std::any::type_name::<T>()
|
||||
);
|
||||
return Err(String::from("type error"));
|
||||
}
|
||||
// get dimension as declared in description
|
||||
let descr_dimension = hnsw_description.get_dimension();
|
||||
drop(graph_in);
|
||||
//
|
||||
// we know data filename is hnswdump.hnsw.data
|
||||
//
|
||||
let mut datapath = PathBuf::new();
|
||||
datapath.push(dir);
|
||||
let mut filename = file_name.to_string();
|
||||
filename.push_str(".hnsw.data");
|
||||
datapath.push(filename);
|
||||
//
|
||||
let meta = std::fs::metadata(&datapath);
|
||||
if meta.is_err() {
|
||||
error!("Could not open file : {:?}", &datapath);
|
||||
std::process::exit(1);
|
||||
}
|
||||
let fsize = meta.unwrap().len().try_into().unwrap();
|
||||
//
|
||||
let file_res = File::open(&datapath);
|
||||
if file_res.is_err() {
|
||||
error!("Could not open file : {:?}", &datapath);
|
||||
std::process::exit(1);
|
||||
}
|
||||
let file = file_res.unwrap();
|
||||
let offset = 0;
|
||||
//
|
||||
let mmap_opt = MmapOptions::new(fsize).unwrap();
|
||||
let mmap_opt = unsafe { mmap_opt.with_file(&file, offset) };
|
||||
let mapping_res = mmap_opt.map();
|
||||
if mapping_res.is_err() {
|
||||
error!("Could not memory map : {:?}", &datapath);
|
||||
std::process::exit(1);
|
||||
}
|
||||
let mmap = mapping_res.unwrap();
|
||||
//
|
||||
info!("Mmap done on file : {:?}", &datapath);
|
||||
//
|
||||
// where are we in decoding mmap slice? at beginning
|
||||
//
|
||||
let mapped_slice = mmap.as_slice();
|
||||
//
|
||||
// where are we in decoding mmap slice?
|
||||
let mut current_mmap_addr = 0usize;
|
||||
let mut usize_slice = [0u8; std::mem::size_of::<usize>()];
|
||||
// check magic
|
||||
let mut u32_slice = [0u8; std::mem::size_of::<u32>()];
|
||||
u32_slice.copy_from_slice(
|
||||
&mapped_slice[current_mmap_addr..current_mmap_addr + std::mem::size_of::<u32>()],
|
||||
);
|
||||
current_mmap_addr += std::mem::size_of::<u32>();
|
||||
let magic = u32::from_ne_bytes(u32_slice);
|
||||
assert_eq!(magic, MAGICDATAP, "magic not equal to MAGICDATAP in mmap");
|
||||
// get dimension
|
||||
usize_slice.copy_from_slice(
|
||||
&mapped_slice[current_mmap_addr..current_mmap_addr + std::mem::size_of::<usize>()],
|
||||
);
|
||||
current_mmap_addr += std::mem::size_of::<usize>();
|
||||
let dimension = usize::from_ne_bytes(usize_slice);
|
||||
if dimension != descr_dimension {
|
||||
error!(
|
||||
"Description and data do not agree on dimension, data got : {:?}, description got : {:?}",
|
||||
dimension, descr_dimension
|
||||
);
|
||||
return Err(String::from(
|
||||
"description and data do not agree on dimension",
|
||||
));
|
||||
} else {
|
||||
info!("Got dimension : {:?}", dimension);
|
||||
}
|
||||
//
|
||||
// now we know that each record consists in
|
||||
// - MAGICDATAP (u32), DataId (u64), dimension (u64) and then (length of type in bytes * dimension)
|
||||
//
|
||||
let record_size = std::mem::size_of::<u32>()
|
||||
+ 2 * std::mem::size_of::<u64>()
|
||||
+ dimension * std::mem::size_of::<T>();
|
||||
let residual = mmap.size() - current_mmap_addr;
|
||||
info!(
|
||||
"Mmap size {}, current_mmap_addr {}, residual : {}",
|
||||
mmap.size(),
|
||||
current_mmap_addr,
|
||||
residual
|
||||
);
|
||||
let nb_record = residual / record_size;
|
||||
debug!("Record size : {}, nb_record : {}", record_size, nb_record);
|
||||
// allocate hmap with correct capacity
|
||||
let mut hmap = IndexMap::<DataId, usize>::with_capacity(nb_record);
|
||||
// fill hmap to have address of each data point in file
|
||||
let mut u64_slice = [0u8; std::mem::size_of::<u64>()];
|
||||
//
|
||||
// now we loop on records
|
||||
//
|
||||
for i in 0..nb_record {
|
||||
debug!("Record i : {}, addr : {}", i, current_mmap_addr);
|
||||
// decode Magic
|
||||
u32_slice.copy_from_slice(
|
||||
&mapped_slice[current_mmap_addr..current_mmap_addr + std::mem::size_of::<u32>()],
|
||||
);
|
||||
current_mmap_addr += std::mem::size_of::<u32>();
|
||||
let magic = u32::from_ne_bytes(u32_slice);
|
||||
assert_eq!(magic, MAGICDATAP, "magic not equal to MAGICDATAP in mmap");
|
||||
// decode DataId
|
||||
u64_slice.copy_from_slice(
|
||||
&mapped_slice[current_mmap_addr..current_mmap_addr + std::mem::size_of::<u64>()],
|
||||
);
|
||||
current_mmap_addr += std::mem::size_of::<u64>();
|
||||
let data_id = u64::from_ne_bytes(u64_slice) as usize;
|
||||
debug!(
|
||||
"Inserting in hmap : got dataid : {:?} current map address : {:?}",
|
||||
data_id, current_mmap_addr
|
||||
);
|
||||
// Note we store address where we have to decode dimension*size_of::<T> and full bson encoded vector
|
||||
hmap.insert(data_id, current_mmap_addr);
|
||||
// now read serialized length
|
||||
u64_slice.copy_from_slice(
|
||||
&mapped_slice[current_mmap_addr..current_mmap_addr + std::mem::size_of::<u64>()],
|
||||
);
|
||||
current_mmap_addr += std::mem::size_of::<u64>();
|
||||
let serialized_len = u64::from_ne_bytes(u64_slice) as usize;
|
||||
if i == 0 {
|
||||
debug!("serialized bytes len to reload {:?}", serialized_len);
|
||||
}
|
||||
let mut v_serialized = vec![0; serialized_len];
|
||||
v_serialized.copy_from_slice(
|
||||
&mapped_slice[current_mmap_addr..current_mmap_addr + serialized_len],
|
||||
);
|
||||
current_mmap_addr += serialized_len;
|
||||
let slice_t =
|
||||
unsafe { std::slice::from_raw_parts(v_serialized.as_ptr() as *const T, dimension) };
|
||||
trace!(
|
||||
"Deserialized v : {:?} address : {:?} ",
|
||||
slice_t,
|
||||
v_serialized.as_ptr() as *const T
|
||||
);
|
||||
} // end of for on record
|
||||
//
|
||||
debug!("End of DataMap::from_hnsw.");
|
||||
//
|
||||
let datamap = DataMap {
|
||||
_datapath: datapath,
|
||||
mmap,
|
||||
hmap,
|
||||
t_name,
|
||||
dimension: descr_dimension,
|
||||
distname,
|
||||
};
|
||||
//
|
||||
Ok(datamap)
|
||||
} // end of from_datas
|
||||
|
||||
//
|
||||
|
||||
/// returns true if type T corresponds to type as retrieved in DataMap.
|
||||
/// This function can (should!) be used before calling [Self::get_data()]
|
||||
pub fn check_data_type<T>(&self) -> bool
|
||||
where
|
||||
T: 'static + Sized,
|
||||
{
|
||||
// we check last part of name of type
|
||||
let tname_vec = self.t_name.rsplit_terminator("::").collect::<Vec<&str>>();
|
||||
|
||||
if tname_vec.last().is_none() {
|
||||
let errmsg = "DataMap::check_data_type() cannot determine data type name ";
|
||||
error!("DataMap::check_data_type() cannot determine data type name ");
|
||||
std::panic!("DataMap::check_data_type(), {}", errmsg);
|
||||
}
|
||||
let tname_last = tname_vec.last().unwrap();
|
||||
//
|
||||
let datat_name_arg = std::any::type_name::<T>().to_string();
|
||||
let datat_name_vec = datat_name_arg
|
||||
.rsplit_terminator("::")
|
||||
.collect::<Vec<&str>>();
|
||||
|
||||
let datat_name_arg_last = datat_name_vec.last().unwrap();
|
||||
//
|
||||
if datat_name_arg_last == tname_last {
|
||||
true
|
||||
} else {
|
||||
info!(
|
||||
"Data type in DataMap : {}, type arg = {}",
|
||||
tname_last, datat_name_arg_last
|
||||
);
|
||||
false
|
||||
}
|
||||
} // end of check_data_type
|
||||
|
||||
//
|
||||
|
||||
/// return the data corresponding to dataid. Access is done using mmap.
|
||||
/// Function returns None if address is invalid
|
||||
/// This function requires you know the type T.
|
||||
/// **As mmap loading calls an unsafe function it is recommended to check the type name with [Self::check_data_type()]**
|
||||
pub fn get_data<'a, T: Clone + std::fmt::Debug>(&'a self, dataid: &DataId) -> Option<&'a [T]> {
|
||||
//
|
||||
trace!("In DataMap::get_data, dataid : {:?}", dataid);
|
||||
let address = self.hmap.get(dataid)?;
|
||||
debug!("Address for id : {}, address : {:?}", dataid, address);
|
||||
let mut current_mmap_addr = *address;
|
||||
let mapped_slice = self.mmap.as_slice();
|
||||
let mut u64_slice = [0u8; std::mem::size_of::<u64>()];
|
||||
u64_slice.copy_from_slice(
|
||||
&mapped_slice[current_mmap_addr..current_mmap_addr + std::mem::size_of::<u64>()],
|
||||
);
|
||||
let serialized_len = u64::from_ne_bytes(u64_slice) as usize;
|
||||
current_mmap_addr += std::mem::size_of::<u64>();
|
||||
trace!("Serialized bytes len to reload {:?}", serialized_len);
|
||||
let slice_t = unsafe {
|
||||
std::slice::from_raw_parts(
|
||||
mapped_slice[current_mmap_addr..].as_ptr() as *const T,
|
||||
self.dimension,
|
||||
)
|
||||
};
|
||||
Some(slice_t)
|
||||
}
|
||||
|
||||
/// returns Keys in order they are in the file, thus optimizing file/memory access.
|
||||
/// Note that in case of parallel insertion this can be different from insertion odrer.
|
||||
pub fn get_dataid_iter(&self) -> indexmap::map::Keys<'_, DataId, usize> {
|
||||
self.hmap.keys()
|
||||
}
|
||||
|
||||
/// returns full data type name
|
||||
pub fn get_data_typename(&self) -> String {
|
||||
self.t_name.clone()
|
||||
}
|
||||
|
||||
/// returns full data type name
|
||||
pub fn get_distname(&self) -> String {
|
||||
self.distname.clone()
|
||||
}
|
||||
|
||||
/// return the number of data in mmap
|
||||
pub fn get_nb_data(&self) -> usize {
|
||||
self.hmap.len()
|
||||
}
|
||||
} // end of impl DataMap
|
||||
|
||||
//=====================================================================================
|
||||
|
||||
#[cfg(test)]
mod tests {

    use super::*;

    use crate::hnswio::HnswIo;
    use anndists::dist::*;

    pub use crate::api::AnnT;
    use crate::prelude::*;

    use rand::distr::{Distribution, Uniform};

    fn log_init_test() {
        let _ = env_logger::builder().is_test(true).try_init();
    }

    #[test]
    fn test_file_mmap() {
        println!("\n\n test_file_mmap");
        log_init_test();
        // random test data: 50 vectors of 11 f32
        let mut rng = rand::rng();
        let unif = Uniform::<f32>::new(0., 1.).unwrap();
        let nbcolumn = 50;
        let nbrow = 11;
        let data: Vec<Vec<f32>> = (0..nbcolumn)
            .map(|j| {
                let v: Vec<f32> = (0..nbrow).map(|_| unif.sample(&mut rng)).collect();
                debug!("j : {:?}, data : {:?} ", j, &v);
                v
            })
            .collect();
        // define hnsw
        let ef_construct = 25;
        let nb_connection = 10;
        let hnsw = Hnsw::<f32, DistL1>::new(nb_connection, nbcolumn, 16, ef_construct, DistL1 {});
        for (i, d) in data.iter().enumerate() {
            hnsw.insert((d, i));
        }
        // some logging info
        hnsw.dump_layer_info();
        // dump in a file; names must be unique as tests run in parallel!
        let fname = "mmap_test";
        let directory = tempfile::tempdir().unwrap();
        let _res = hnsw.file_dump(directory.path(), fname);

        let check_reload = false;
        if check_reload {
            // We check we can reload
            debug!("HNSW reload.");
            let directory = tempfile::tempdir().unwrap();
            let mut reloader = HnswIo::new(directory.path(), fname);
            let hnsw_loaded: Hnsw<f32, DistL1> = reloader.load_hnsw::<f32, DistL1>().unwrap();
            check_graph_equality(&hnsw_loaded, &hnsw);
            info!("========= reload success, going to mmap reloading =========");
        }
        //
        // reload the data part of the hnsw through a mmap
        let datamap: DataMap = DataMap::from_hnswdump::<f32>(directory.path(), fname).unwrap();
        let nb_test = 30;
        info!("Checking random access of id , nb test : {}", nb_test);
        // sample ids in 0..nb_data (distribution hoisted out of the loop)
        let unif_id = Uniform::<usize>::new(0, nbcolumn).unwrap();
        for _ in 0..nb_test {
            let id = unif_id.sample(&mut rng);
            let d = datamap.get_data::<f32>(&id);
            assert!(d.is_some());
            if let Some(v) = d.as_ref() {
                debug!("id = {}, v = {:?}", id, v);
                assert_eq!(v, &data[id]);
            }
        }
        // exercise the iterator interface of the datamap
        for k in datamap.get_dataid_iter() {
            let _data = datamap.get_data::<f32>(k);
        }
    } // end of test_file_mmap

    #[test]
    fn test_mmap_iter() {
        log_init_test();
        // random test data: 50 vectors of 11 u32
        let mut rng = rand::rng();
        let unif = Uniform::<u32>::new(0, 10000).unwrap();
        let nbcolumn = 50;
        let nbrow = 11;
        let data: Vec<Vec<u32>> = (0..nbcolumn)
            .map(|j| {
                let v: Vec<u32> = (0..nbrow).map(|_| unif.sample(&mut rng)).collect();
                debug!("j : {:?}, data : {:?} ", j, &v);
                v
            })
            .collect();
        // define hnsw
        let ef_construct = 25;
        let nb_connection = 10;
        let hnsw = Hnsw::<u32, DistL1>::new(nb_connection, nbcolumn, 16, ef_construct, DistL1 {});
        for (i, d) in data.iter().enumerate() {
            hnsw.insert((d, i));
        }
        // some logging info
        hnsw.dump_layer_info();
        // dump in a file; names must be unique as tests run in parallel!
        let fname = "mmap_order_test";
        let directory = tempfile::tempdir().unwrap();
        let _res = hnsw.file_dump(directory.path(), fname);
        // reload the data part through a mmap
        let datamap: DataMap = DataMap::from_hnswdump::<u32>(directory.path(), fname).unwrap();
        // testing type check
        assert!(datamap.check_data_type::<u32>());
        assert!(!datamap.check_data_type::<f32>());
        info!("Datamap iteration order checking");
        for (i, dataid) in datamap.get_dataid_iter().enumerate() {
            let v = datamap.get_data::<u32>(dataid).unwrap();
            assert_eq!(v, &data[*dataid], "dataid = {}, ukey = {}", dataid, i);
        }
        // rm files generated!
        let _ = std::fs::remove_file("mmap_order_test.hnsw.data");
        let _ = std::fs::remove_file("mmap_order_test.hnsw.graph");
    }
    //
} // end of mod tests
|
||||
24
vendor/ruvector/scripts/patches/hnsw_rs/src/filter.rs
vendored
Normal file
24
vendor/ruvector/scripts/patches/hnsw_rs/src/filter.rs
vendored
Normal file
@@ -0,0 +1,24 @@
|
||||
//! defines a trait for filtering requests.
|
||||
//! See examples in tests/filtertest.rs
|
||||
|
||||
use crate::prelude::DataId;
|
||||
|
||||
/// Only queries returning true are taken into account along the search
|
||||
pub trait FilterT {
|
||||
fn hnsw_filter(&self, id: &DataId) -> bool;
|
||||
}
|
||||
|
||||
impl FilterT for Vec<usize> {
|
||||
fn hnsw_filter(&self, id: &DataId) -> bool {
|
||||
self.binary_search(id).is_ok()
|
||||
}
|
||||
}
|
||||
|
||||
impl<F> FilterT for F
|
||||
where
|
||||
F: Fn(&DataId) -> bool,
|
||||
{
|
||||
fn hnsw_filter(&self, id: &DataId) -> bool {
|
||||
self(id)
|
||||
}
|
||||
}
|
||||
200
vendor/ruvector/scripts/patches/hnsw_rs/src/flatten.rs
vendored
Normal file
200
vendor/ruvector/scripts/patches/hnsw_rs/src/flatten.rs
vendored
Normal file
@@ -0,0 +1,200 @@
|
||||
//! This module provides conversion of a Point structure to a FlatPoint containing just the Id of a point
|
||||
//! and those of its neighbours.
|
||||
//! The whole Hnsw structure is then flattened into a Hashtable associating the data ID of a point to
|
||||
//! its corresponding FlatPoint.
|
||||
//! It can be used, for example, when reloading only the graph part of the data to have knowledge
|
||||
//! of relative proximity of points as described just by their DataId
|
||||
//!
|
||||
|
||||
use hashbrown::HashMap;
|
||||
use std::cmp::Ordering;
|
||||
|
||||
use crate::hnsw;
|
||||
use anndists::dist::distances::Distance;
|
||||
use hnsw::*;
|
||||
use log::error;
|
||||
|
||||
// an ordering of Neighbour of a Point
|
||||
|
||||
impl PartialEq for Neighbour {
|
||||
fn eq(&self, other: &Neighbour) -> bool {
|
||||
self.distance == other.distance
|
||||
} // end eq
|
||||
}
|
||||
|
||||
impl Eq for Neighbour {}
|
||||
|
||||
// order points by distance to self.
|
||||
#[allow(clippy::non_canonical_partial_ord_impl)]
|
||||
impl PartialOrd for Neighbour {
|
||||
fn partial_cmp(&self, other: &Neighbour) -> Option<Ordering> {
|
||||
self.distance.partial_cmp(&other.distance)
|
||||
} // end cmp
|
||||
} // end impl PartialOrd
|
||||
|
||||
impl Ord for Neighbour {
|
||||
fn cmp(&self, other: &Neighbour) -> Ordering {
|
||||
if !self.distance.is_nan() && !other.distance.is_nan() {
|
||||
self.distance.partial_cmp(&other.distance).unwrap()
|
||||
} else {
|
||||
panic!("got a NaN in a distance");
|
||||
}
|
||||
} // end cmp
|
||||
}
|
||||
|
||||
/// a reduced version of point inserted in the Hnsw structure.
|
||||
/// It contains original id of point as submitted to the struct Hnsw
|
||||
/// an ordered (by distance) list of neighbours to the point
|
||||
/// and it position in layers.
|
||||
#[derive(Clone)]
|
||||
pub struct FlatPoint {
|
||||
/// an id coming from client using hnsw, should identify point uniquely
|
||||
origin_id: DataId,
|
||||
/// a point id identifying point as stored in our structure
|
||||
p_id: PointId,
|
||||
/// neighbours info
|
||||
neighbours: Vec<Neighbour>,
|
||||
}
|
||||
|
||||
impl FlatPoint {
|
||||
/// returns the neighbours orderded by distance.
|
||||
pub fn get_neighbours(&self) -> &Vec<Neighbour> {
|
||||
&self.neighbours
|
||||
}
|
||||
/// returns the origin id of the point
|
||||
pub fn get_id(&self) -> DataId {
|
||||
self.origin_id
|
||||
}
|
||||
//
|
||||
pub fn get_p_id(&self) -> PointId {
|
||||
self.p_id
|
||||
}
|
||||
} // end impl block for FlatPoint
|
||||
|
||||
fn flatten_point<T: Clone + Send + Sync>(point: &Point<T>) -> FlatPoint {
|
||||
let neighbours = point.get_neighborhood_id();
|
||||
// now we flatten neighbours
|
||||
let mut flat_neighbours = Vec::<Neighbour>::new();
|
||||
for layer in neighbours {
|
||||
for neighbour in layer {
|
||||
flat_neighbours.push(neighbour);
|
||||
}
|
||||
}
|
||||
flat_neighbours.sort_unstable();
|
||||
FlatPoint {
|
||||
origin_id: point.get_origin_id(),
|
||||
p_id: point.get_point_id(),
|
||||
neighbours: flat_neighbours,
|
||||
}
|
||||
} // end of flatten_point
|
||||
|
||||
/// A structure providing neighbourhood information of a point stored in the Hnsw structure given its DataId.
|
||||
/// The structure uses the [FlatPoint] structure.
|
||||
/// This structure can be obtained by FlatNeighborhood::from<&Hnsw<T,D>>
|
||||
pub struct FlatNeighborhood {
|
||||
hash_t: HashMap<DataId, FlatPoint>,
|
||||
}
|
||||
|
||||
impl FlatNeighborhood {
|
||||
/// get neighbour of a point given its id.
|
||||
/// The neighbours are sorted in increasing distance from data_id.
|
||||
pub fn get_neighbours(&self, p_id: DataId) -> Option<Vec<Neighbour>> {
|
||||
self.hash_t
|
||||
.get(&p_id)
|
||||
.map(|point| point.get_neighbours().clone())
|
||||
}
|
||||
} // end impl block for FlatNeighborhood
|
||||
|
||||
impl<T: Clone + Send + Sync, D: Distance<T> + Send + Sync> From<&Hnsw<'_, T, D>>
|
||||
for FlatNeighborhood
|
||||
{
|
||||
/// extract from the Hnsw strucure a hashtable mapping original DataId into a FlatPoint structure gathering its neighbourhood information.
|
||||
/// Useful after reloading from a dump with T=NoData and D = NoDist as points are then reloaded with neighbourhood information only.
|
||||
fn from(hnsw: &Hnsw<T, D>) -> Self {
|
||||
let mut hash_t = HashMap::new();
|
||||
let pt_iter = hnsw.get_point_indexation().into_iter();
|
||||
//
|
||||
for point in pt_iter {
|
||||
// println!("point : {:?}", _point.p_id);
|
||||
let res_insert = hash_t.insert(point.get_origin_id(), flatten_point(&point));
|
||||
if let Some(old_point) = res_insert {
|
||||
error!("2 points with same origin id {:?}", old_point.origin_id);
|
||||
}
|
||||
}
|
||||
FlatNeighborhood { hash_t }
|
||||
}
|
||||
} // e,d of Fom implementation
|
||||
|
||||
#[cfg(test)]
mod tests {

    use super::*;
    use anndists::dist::distances::*;
    use log::debug;

    use crate::api::AnnT;
    use crate::hnswio::*;

    use rand::distr::{Distribution, Uniform};

    fn log_init_test() {
        let _ = env_logger::builder().is_test(true).try_init();
    }

    #[test]
    fn test_dump_reload_graph_flatten() {
        println!("\n\n test_dump_reload_graph_flatten");
        log_init_test();
        // random test data: 1000 vectors of 10 f32
        let mut rng = rand::rng();
        let unif = Uniform::<f32>::new(0., 1.).unwrap();
        let nbcolumn = 1000;
        let nbrow = 10;
        let data: Vec<Vec<f32>> = (0..nbcolumn)
            .map(|_| (0..nbrow).map(|_| unif.sample(&mut rng)).collect())
            .collect();
        // define hnsw
        let ef_construct = 25;
        let nb_connection = 10;
        let hnsw = Hnsw::<f32, DistL1>::new(nb_connection, nbcolumn, 16, ef_construct, DistL1 {});
        for (i, d) in data.iter().enumerate() {
            hnsw.insert((d, i));
        }
        // some logging info
        hnsw.dump_layer_info();
        // flat neighbours of point 2 before the dump
        let neighborhood_before_dump = FlatNeighborhood::from(&hnsw);
        let nbg_2_before = neighborhood_before_dump.get_neighbours(2).unwrap();
        println!("voisins du point 2 {:?}", nbg_2_before);
        // dump in a file; names must be unique as tests run in parallel!
        // This dumps into dumpreloadtestflat.hnsw.graph / .hnsw.data.
        let fname = "dumpreloadtestflat";
        let directory = tempfile::tempdir().unwrap();
        let _res = hnsw.file_dump(directory.path(), fname);
        //
        // reload the graph only (NoData / NoDist)
        debug!("HNSW reload");
        // we will need a procedural macro to get from distance name to its
        // instantiation; from now on we test with DistL1.
        let mut reloader = HnswIo::new(directory.path(), fname);
        let hnsw_loaded: Hnsw<NoData, NoDist> = reloader.load_hnsw().unwrap();
        let neighborhood_after_dump = FlatNeighborhood::from(&hnsw_loaded);
        let nbg_2_after = neighborhood_after_dump.get_neighbours(2).unwrap();
        println!("Neighbors of point 2 {:?}", nbg_2_after);
        // the neighbourhood must be identical before and after dump+reload
        assert_eq!(nbg_2_after.len(), nbg_2_before.len());
        for (before, after) in nbg_2_before.iter().zip(nbg_2_after.iter()) {
            assert_eq!(before.p_id, after.p_id);
            assert_eq!(before.distance, after.distance);
        }
        check_graph_equality(&hnsw_loaded, &hnsw);
    } // end of test_dump_reload
} // end module test
|
||||
1872
vendor/ruvector/scripts/patches/hnsw_rs/src/hnsw.rs
vendored
Normal file
1872
vendor/ruvector/scripts/patches/hnsw_rs/src/hnsw.rs
vendored
Normal file
File diff suppressed because it is too large
Load Diff
1703
vendor/ruvector/scripts/patches/hnsw_rs/src/hnswio.rs
vendored
Normal file
1703
vendor/ruvector/scripts/patches/hnsw_rs/src/hnswio.rs
vendored
Normal file
File diff suppressed because it is too large
Load Diff
30
vendor/ruvector/scripts/patches/hnsw_rs/src/lib.rs
vendored
Normal file
30
vendor/ruvector/scripts/patches/hnsw_rs/src/lib.rs
vendored
Normal file
@@ -0,0 +1,30 @@
|
||||
#![cfg_attr(feature = "stdsimd", feature(portable_simd))]
|
||||
//
|
||||
// for logging (debug mostly, switched at compile time in cargo.toml)
|
||||
use env_logger::Builder;
|
||||
|
||||
use lazy_static::lazy_static;
|
||||
|
||||
pub mod api;
|
||||
pub mod datamap;
|
||||
pub mod filter;
|
||||
pub mod flatten;
|
||||
pub mod hnsw;
|
||||
pub mod hnswio;
|
||||
pub mod libext;
|
||||
pub mod prelude;
|
||||
|
||||
// we impose our version of anndists
|
||||
pub use anndists;
|
||||
|
||||
lazy_static! {
|
||||
static ref LOG: u64 = init_log();
|
||||
}
|
||||
|
||||
// install a logger facility
|
||||
#[allow(unused)]
|
||||
fn init_log() -> u64 {
|
||||
Builder::from_default_env().init();
|
||||
println!("\n ************** initializing logger *****************\n");
|
||||
1
|
||||
}
|
||||
1240
vendor/ruvector/scripts/patches/hnsw_rs/src/libext.rs
vendored
Normal file
1240
vendor/ruvector/scripts/patches/hnsw_rs/src/libext.rs
vendored
Normal file
File diff suppressed because it is too large
Load Diff
11
vendor/ruvector/scripts/patches/hnsw_rs/src/prelude.rs
vendored
Normal file
11
vendor/ruvector/scripts/patches/hnsw_rs/src/prelude.rs
vendored
Normal file
@@ -0,0 +1,11 @@
|
||||
// gathers modules to include and re-exorts all of anndists!
|
||||
|
||||
pub use crate::api::*;
|
||||
pub use crate::hnsw::*;
|
||||
|
||||
#[allow(unused)]
|
||||
pub use crate::filter::*;
|
||||
|
||||
pub use crate::hnswio::*;
|
||||
|
||||
pub use anndists::dist::distances::*;
|
||||
Reference in New Issue
Block a user