//! This module provides a memory mapping of Data vectors filling the Hnsw structure. //! It is used by the module [hnswio] and also gives access to an iterator over data without loading the graph. //! //! We mmap the file and provide //! - a Hashmap from DataId to address //! - an interface for retrieving just data vectors loaded in the hnsw structure. use std::io::BufReader; use std::fs::{File, OpenOptions}; use std::path::{Path, PathBuf}; use indexmap::map::IndexMap; use log::{debug, error, info, trace}; use mmap_rs::{Mmap, MmapOptions}; use crate::hnsw::DataId; use crate::hnswio; use crate::hnswio::MAGICDATAP; /// This structure uses the data part of the dump of a Hnsw structure to retrieve the data. /// The data is access via a mmap of the data file, so memory is spared at the expense of page loading. // possibly to be used in graph to spare memory? pub struct DataMap { /// File containing Points data _datapath: PathBuf, /// The mmap structure mmap: Mmap, /// map a dataId to an address where we get a bson encoded vector of type T hmap: IndexMap, /// type name of Data t_name: String, /// dimension of data vector dimension: usize, // distname: String, } // end of DataMap impl DataMap { // TODO: specifiy mmap option /// The fname argument corresponds to the basename of the dump. /// To reload from file fname.hnsw.data just pass fname as argument. /// The dir argument is the directory where the fname.hnsw.data and fname.hnsw.graph reside. 
pub fn from_hnswdump( dir: &Path, file_name: &str, ) -> Result { // reload description to have data type, and check for dump version let mut graphpath = PathBuf::from(dir); graphpath.push(dir); let mut filename = file_name.to_string(); filename.push_str(".hnsw.graph"); graphpath.push(filename); let graphfileres = OpenOptions::new().read(true).open(&graphpath); if graphfileres.is_err() { println!("DataMap: could not open file {:?}", graphpath.as_os_str()); std::process::exit(1); } let graphfile = graphfileres.unwrap(); let mut graph_in = BufReader::new(graphfile); // we need to call load_description first to get distance name let hnsw_description = hnswio::load_description(&mut graph_in).unwrap(); if hnsw_description.format_version <= 2 { let msg = String::from( "from_hnsw::from_hnsw : data mapping is only possible for dumps with the version > 0.1.19 of this crate", ); error!( "Data mapping is only possible for dumps with the version > 0.1.19 of this crate" ); return Err(msg); } let distname = hnsw_description.distname.clone(); let t_name = hnsw_description.get_typename(); // check typename coherence info!("Got typename from reload : {:?}", t_name); if std::any::type_name::() != t_name { error!( "Description has typename {:?}, function type argument is : {:?}", t_name, std::any::type_name::() ); return Err(String::from("type error")); } // get dimension as declared in description let descr_dimension = hnsw_description.get_dimension(); drop(graph_in); // // we know data filename is hnswdump.hnsw.data // let mut datapath = PathBuf::new(); datapath.push(dir); let mut filename = file_name.to_string(); filename.push_str(".hnsw.data"); datapath.push(filename); // let meta = std::fs::metadata(&datapath); if meta.is_err() { error!("Could not open file : {:?}", &datapath); std::process::exit(1); } let fsize = meta.unwrap().len().try_into().unwrap(); // let file_res = File::open(&datapath); if file_res.is_err() { error!("Could not open file : {:?}", &datapath); 
std::process::exit(1); } let file = file_res.unwrap(); let offset = 0; // let mmap_opt = MmapOptions::new(fsize).unwrap(); let mmap_opt = unsafe { mmap_opt.with_file(&file, offset) }; let mapping_res = mmap_opt.map(); if mapping_res.is_err() { error!("Could not memory map : {:?}", &datapath); std::process::exit(1); } let mmap = mapping_res.unwrap(); // info!("Mmap done on file : {:?}", &datapath); // // where are we in decoding mmap slice? at beginning // let mapped_slice = mmap.as_slice(); // // where are we in decoding mmap slice? let mut current_mmap_addr = 0usize; let mut usize_slice = [0u8; std::mem::size_of::()]; // check magic let mut u32_slice = [0u8; std::mem::size_of::()]; u32_slice.copy_from_slice( &mapped_slice[current_mmap_addr..current_mmap_addr + std::mem::size_of::()], ); current_mmap_addr += std::mem::size_of::(); let magic = u32::from_ne_bytes(u32_slice); assert_eq!(magic, MAGICDATAP, "magic not equal to MAGICDATAP in mmap"); // get dimension usize_slice.copy_from_slice( &mapped_slice[current_mmap_addr..current_mmap_addr + std::mem::size_of::()], ); current_mmap_addr += std::mem::size_of::(); let dimension = usize::from_ne_bytes(usize_slice); if dimension != descr_dimension { error!( "Description and data do not agree on dimension, data got : {:?}, description got : {:?}", dimension, descr_dimension ); return Err(String::from( "description and data do not agree on dimension", )); } else { info!("Got dimension : {:?}", dimension); } // // now we know that each record consists in // - MAGICDATAP (u32), DataId (u64), dimension (u64) and then (length of type in bytes * dimension) // let record_size = std::mem::size_of::() + 2 * std::mem::size_of::() + dimension * std::mem::size_of::(); let residual = mmap.size() - current_mmap_addr; info!( "Mmap size {}, current_mmap_addr {}, residual : {}", mmap.size(), current_mmap_addr, residual ); let nb_record = residual / record_size; debug!("Record size : {}, nb_record : {}", record_size, nb_record); // 
allocate hmap with correct capacity let mut hmap = IndexMap::::with_capacity(nb_record); // fill hmap to have address of each data point in file let mut u64_slice = [0u8; std::mem::size_of::()]; // // now we loop on records // for i in 0..nb_record { debug!("Record i : {}, addr : {}", i, current_mmap_addr); // decode Magic u32_slice.copy_from_slice( &mapped_slice[current_mmap_addr..current_mmap_addr + std::mem::size_of::()], ); current_mmap_addr += std::mem::size_of::(); let magic = u32::from_ne_bytes(u32_slice); assert_eq!(magic, MAGICDATAP, "magic not equal to MAGICDATAP in mmap"); // decode DataId u64_slice.copy_from_slice( &mapped_slice[current_mmap_addr..current_mmap_addr + std::mem::size_of::()], ); current_mmap_addr += std::mem::size_of::(); let data_id = u64::from_ne_bytes(u64_slice) as usize; debug!( "Inserting in hmap : got dataid : {:?} current map address : {:?}", data_id, current_mmap_addr ); // Note we store address where we have to decode dimension*size_of:: and full bson encoded vector hmap.insert(data_id, current_mmap_addr); // now read serialized length u64_slice.copy_from_slice( &mapped_slice[current_mmap_addr..current_mmap_addr + std::mem::size_of::()], ); current_mmap_addr += std::mem::size_of::(); let serialized_len = u64::from_ne_bytes(u64_slice) as usize; if i == 0 { debug!("serialized bytes len to reload {:?}", serialized_len); } let mut v_serialized = vec![0; serialized_len]; v_serialized.copy_from_slice( &mapped_slice[current_mmap_addr..current_mmap_addr + serialized_len], ); current_mmap_addr += serialized_len; let slice_t = unsafe { std::slice::from_raw_parts(v_serialized.as_ptr() as *const T, dimension) }; trace!( "Deserialized v : {:?} address : {:?} ", slice_t, v_serialized.as_ptr() as *const T ); } // end of for on record // debug!("End of DataMap::from_hnsw."); // let datamap = DataMap { _datapath: datapath, mmap, hmap, t_name, dimension: descr_dimension, distname, }; // Ok(datamap) } // end of from_datas // /// returns true if 
type T corresponds to type as retrieved in DataMap. /// This function can (should!) be used before calling [Self::get_data()] pub fn check_data_type(&self) -> bool where T: 'static + Sized, { // we check last part of name of type let tname_vec = self.t_name.rsplit_terminator("::").collect::>(); if tname_vec.last().is_none() { let errmsg = "DataMap::check_data_type() cannot determine data type name "; error!("DataMap::check_data_type() cannot determine data type name "); std::panic!("DataMap::check_data_type(), {}", errmsg); } let tname_last = tname_vec.last().unwrap(); // let datat_name_arg = std::any::type_name::().to_string(); let datat_name_vec = datat_name_arg .rsplit_terminator("::") .collect::>(); let datat_name_arg_last = datat_name_vec.last().unwrap(); // if datat_name_arg_last == tname_last { true } else { info!( "Data type in DataMap : {}, type arg = {}", tname_last, datat_name_arg_last ); false } } // end of check_data_type // /// return the data corresponding to dataid. Access is done using mmap. /// Function returns None if address is invalid /// This function requires you know the type T. 
/// **As mmap loading calls an unsafe function it is recommended to check the type name with [Self::check_data_type()]**
    ///
    /// None is also returned when `dataid` is unknown, when the record does not fit inside the
    /// mapped file, or when the stored byte length is not `dimension * size_of::<T>()`
    /// (which means T cannot be the type the data was dumped with).
    pub fn get_data<'a, T: Clone + std::fmt::Debug>(&'a self, dataid: &DataId) -> Option<&'a [T]> {
        const LEN_SIZE: usize = std::mem::size_of::<u64>();
        //
        trace!("In DataMap::get_data, dataid : {:?}", dataid);
        let address = *self.hmap.get(dataid)?;
        debug!("Address for id : {}, address : {:?}", dataid, address);
        let mapped_slice = self.mmap.as_slice();
        // the stored address points to the serialized length, the vector follows it
        let data_start = address.checked_add(LEN_SIZE)?;
        let len_bytes: [u8; LEN_SIZE] = mapped_slice.get(address..data_start)?.try_into().ok()?;
        let serialized_len = u64::from_ne_bytes(len_bytes) as usize;
        trace!("Serialized bytes len to reload {:?}", serialized_len);
        // refuse to reinterpret the bytes if their count does not match the requested type
        let expected_len = self.dimension.checked_mul(std::mem::size_of::<T>())?;
        if serialized_len != expected_len {
            error!(
                "DataMap::get_data : serialized length {} does not match dimension {} for type {}",
                serialized_len,
                self.dimension,
                std::any::type_name::<T>()
            );
            return None;
        }
        let data_end = data_start.checked_add(expected_len)?;
        let data_bytes = mapped_slice.get(data_start..data_end)?;
        // SAFETY: `data_bytes` lies entirely inside the mapping and holds exactly
        // `self.dimension * size_of::<T>()` bytes (both checked above); the returned slice
        // borrows `self`, so the mapping outlives it.
        // NOTE(review): alignment of the data start for T is NOT verified here and the bytes
        // are assumed to be valid values of T; callers should check the type first.
        let slice_t =
            unsafe { std::slice::from_raw_parts(data_bytes.as_ptr() as *const T, self.dimension) };
        Some(slice_t)
    }

    /// returns Keys in order they are in the file, thus optimizing file/memory access.
    /// Note that in case of parallel insertion this can be different from insertion order.
pub fn get_dataid_iter(&self) -> indexmap::map::Keys<'_, DataId, usize> { self.hmap.keys() } /// returns full data type name pub fn get_data_typename(&self) -> String { self.t_name.clone() } /// returns full data type name pub fn get_distname(&self) -> String { self.distname.clone() } /// return the number of data in mmap pub fn get_nb_data(&self) -> usize { self.hmap.len() } } // end of impl DataMap //===================================================================================== #[cfg(test)] mod tests { use super::*; use crate::hnswio::HnswIo; use anndists::dist::*; pub use crate::api::AnnT; use crate::prelude::*; use rand::distr::{Distribution, Uniform}; fn log_init_test() { let _ = env_logger::builder().is_test(true).try_init(); } #[test] fn test_file_mmap() { println!("\n\n test_file_mmap"); log_init_test(); // generate a random test let mut rng = rand::rng(); let unif = Uniform::::new(0., 1.).unwrap(); // 1000 vectors of size 10 f32 let nbcolumn = 50; let nbrow = 11; let mut xsi; let mut data = Vec::with_capacity(nbcolumn); for j in 0..nbcolumn { data.push(Vec::with_capacity(nbrow)); for _ in 0..nbrow { xsi = unif.sample(&mut rng); data[j].push(xsi); } debug!("j : {:?}, data : {:?} ", j, &data[j]); } // define hnsw let ef_construct = 25; let nb_connection = 10; let hnsw = Hnsw::::new(nb_connection, nbcolumn, 16, ef_construct, DistL1 {}); for (i, d) in data.iter().enumerate() { hnsw.insert((d, i)); } // some loggin info hnsw.dump_layer_info(); // dump in a file. Must take care of name as tests runs in // !!! 
let fname = "mmap_test"; let directory = tempfile::tempdir().unwrap(); let _res = hnsw.file_dump(directory.path(), fname); let check_reload = false; if check_reload { // We check we can reload debug!("HNSW reload."); let directory = tempfile::tempdir().unwrap(); let mut reloader = HnswIo::new(directory.path(), fname); let hnsw_loaded: Hnsw = reloader.load_hnsw::().unwrap(); check_graph_equality(&hnsw_loaded, &hnsw); info!("========= reload success, going to mmap reloading ========="); } // // now we have check that datamap seems ok, test reload of hnsw with mmap let datamap: DataMap = DataMap::from_hnswdump::(directory.path(), fname).unwrap(); let nb_test = 30; info!("Checking random access of id , nb test : {}", nb_test); for _ in 0..nb_test { // sample an id in 0..nb_data let unif = Uniform::::new(0, nbcolumn).unwrap(); let id = unif.sample(&mut rng); let d = datamap.get_data::(&id); assert!(d.is_some()); if d.is_some() { debug!("id = {}, v = {:?}", id, d.as_ref().unwrap()); assert_eq!(d.as_ref().unwrap(), &data[id]); } } // test iterator from datamap let keys = datamap.get_dataid_iter(); for k in keys { let _data = datamap.get_data::(k); } } // end of test_file_mmap #[test] fn test_mmap_iter() { log_init_test(); // generate a random test let mut rng = rand::rng(); let unif = Uniform::::new(0, 10000).unwrap(); // 1000 vectors of size 10 f32 let nbcolumn = 50; let nbrow = 11; let mut xsi; let mut data = Vec::with_capacity(nbcolumn); for j in 0..nbcolumn { data.push(Vec::with_capacity(nbrow)); for _ in 0..nbrow { xsi = unif.sample(&mut rng); data[j].push(xsi); } debug!("j : {:?}, data : {:?} ", j, &data[j]); } // define hnsw let ef_construct = 25; let nb_connection = 10; let hnsw = Hnsw::::new(nb_connection, nbcolumn, 16, ef_construct, DistL1 {}); for (i, d) in data.iter().enumerate() { hnsw.insert((d, i)); } // some loggin info hnsw.dump_layer_info(); // dump in a file. Must take care of name as tests runs in // !!! 
let fname = "mmap_order_test"; let directory = tempfile::tempdir().unwrap(); let _res = hnsw.file_dump(directory.path(), fname); // now we have check that datamap seems ok, test reload of hnsw with mmap let datamap: DataMap = DataMap::from_hnswdump::(directory.path(), fname).unwrap(); // testing type check assert!(datamap.check_data_type::()); assert!(!datamap.check_data_type::()); info!("Datamap iteration order checking"); let keys = datamap.get_dataid_iter(); for (i, dataid) in keys.enumerate() { let v = datamap.get_data::(dataid).unwrap(); assert_eq!(v, &data[*dataid], "dataid = {}, ukey = {}", dataid, i); } // rm files generated! let _ = std::fs::remove_file("mmap_order_test.hnsw.data"); let _ = std::fs::remove_file("mmap_order_test.hnsw.graph"); } // } // end of mod tests