// NOTE(review): the following lines are artifacts of the file viewer this
// source was copied from (file listing, size and language tag); they are not
// Rust code and are kept here only as a comment:
//   Files
//   wifi-densepose/vendor/ruvector/patches/hnsw_rs/src/hnsw.rs
//   1873 lines, 75 KiB, Rust
//! A rust implementation of Approximate NN search from:
//! Efficient and robust approximate nearest neighbour search using Hierarchical Navigable
//! small World graphs.
//! Yu. A. Malkov, D.A Yashunin 2016, 2018
use serde::{Deserialize, Serialize};
use cpu_time::ProcessTime;
use std::time::SystemTime;
use std::cmp::Ordering;
use parking_lot::{Mutex, RwLock, RwLockReadGuard};
use rayon::prelude::*;
use std::sync::Arc;
use std::sync::mpsc::channel;
use std::any::type_name;
use hashbrown::HashMap;
#[allow(unused)]
use std::collections::HashSet;
use std::collections::binary_heap::BinaryHeap;
use log::trace;
use log::{debug, info};
pub use crate::filter::FilterT;
use anndists::dist::distances::Distance;
// TODO
// Profiling.
/// This unit structure provides the type to instantiate Hnsw with,
/// to get reload of the graph structure only (without point data).
/// It must be associated to the unit structure dist::NoDist for the distance type to provide.
#[derive(Default, Clone, Copy, Serialize, Deserialize, Debug)]
pub struct NoData;
/// maximum number of layers
pub(crate) const NB_LAYER_MAX: u8 = 16; // so max layer is 15!!
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
/// The 2-uple represents the layer as u8 and the rank in that layer as an i32,
/// as stored in our structure. A negative rank marks a point not yet slotted
/// into its layer (see PointIndexation::generate_new_point).
pub struct PointId(pub u8, pub i32);
/// this type is an identifier for each data vector, given by the client.
/// Can be the rank of data in an array, a hash value or anything that permits
/// retrieving the data.
pub type DataId = usize;
/// A boxed, dynamically dispatched distance functor over T.
pub type PointDistance<T> = Box<dyn Distance<T>>;
/// A structure containing an internal PointId with the distance to this PointId.
/// The order is given by ordering the distance to the point it refers to.
/// So ordering of these items has a meaning only when they refer to the same point.
#[derive(Debug, Clone, Copy)]
pub struct PointIdWithOrder {
    /// the identifier of the point for which we store a distance
    pub point_id: PointId,
    /// The distance to a reference point (not represented in the structure)
    pub dist_to_ref: f32,
}
// Equality is on the stored distance only, not on the point id.
impl PartialEq for PointIdWithOrder {
    fn eq(&self, other: &PointIdWithOrder) -> bool {
        self.dist_to_ref == other.dist_to_ref
    } // end eq
}
// order points by distance to the (implicit) reference point.
// The comparison is partial because the distance is an f32 (NaN has no order).
impl PartialOrd for PointIdWithOrder {
    fn partial_cmp(&self, other: &PointIdWithOrder) -> Option<Ordering> {
        self.dist_to_ref.partial_cmp(&other.dist_to_ref)
    } // end cmp
} // end impl PartialOrd
// Conversion from the reference-carrying variant : keep only point id and distance.
impl<T: Send + Sync + Clone + Copy> From<&PointWithOrder<'_, T>> for PointIdWithOrder {
    fn from(point: &PointWithOrder<T>) -> PointIdWithOrder {
        PointIdWithOrder::new(point.point_ref.p_id, point.dist_to_ref)
    }
}
impl PointIdWithOrder {
    /// Builds from a point id and its distance to the reference point.
    pub fn new(point_id: PointId, dist_to_ref: f32) -> Self {
        PointIdWithOrder {
            point_id,
            dist_to_ref,
        }
    }
} // end of impl block
//=======================================================================================
/// The struct giving an answer point to a search request.
/// This structure is exported to other language APIs (hence the #[repr(C)]).
/// First field is origin id of the request point, second field is distance to request point
#[repr(C)]
#[derive(Debug, Copy, Clone, Default)]
pub struct Neighbour {
    /// identification of data vector as given in initializing hnsw
    pub d_id: DataId,
    /// distance from the neighbour to the request point
    pub distance: f32,
    /// point identification inside layers
    pub p_id: PointId,
}
impl Neighbour {
    /// Builds a neighbour from the client data id, the distance to the request
    /// point and the internal point id.
    pub fn new(d_id: DataId, distance: f32, p_id: PointId) -> Neighbour {
        Neighbour { d_id, distance, p_id }
    }
    /// retrieves original id of neighbour as given in hnsw initialization
    pub fn get_origin_id(&self) -> DataId {
        self.d_id
    }
    /// return the distance to the request point
    pub fn get_distance(&self) -> f32 {
        self.distance
    }
}
//=======================================================================================
/// Backing storage for the coordinates of a point.
#[derive(Debug, Clone)]
enum PointData<'b, T: Clone + Send + Sync + 'b> {
    // full data, owned by the structure
    V(Vec<T>),
    // a reference to a mmapped slice
    S(&'b [T]),
} // end of enum PointData
impl<'b, T: Clone + Send + Sync + 'b> PointData<'b, T> {
// allocate a point stored in structure
fn new_v(v: Vec<T>) -> Self {
PointData::V(v)
}
// allocate a point representation a memory mapped slice
fn new_s(s: &'b [T]) -> Self {
PointData::S(s)
}
fn get_v(&self) -> &[T] {
match self {
PointData::V(v) => v.as_slice(),
PointData::S(s) => s,
}
} // end of get_v
} // end of impl block for PointData
/// The base structure representing a data point.
/// It contains the data as coming from the client, its client id,
/// and its position in the layer representation together with its neighbours.
///
// neighbours table : one vector by layer so neighbours is allocated to NB_LAYER_MAX
//
#[derive(Debug, Clone)]
#[allow(clippy::type_complexity)]
pub struct Point<'b, T: Clone + Send + Sync> {
    /// The data of this point, coming from hnsw client and associated to origin_id,
    data: PointData<'b, T>,
    /// an id coming from client using hnsw, should identify point uniquely
    origin_id: DataId,
    /// a point id identifying point as stored in our structure
    p_id: PointId,
    /// neighbours info : one vector per layer; behind Arc<RwLock> so the lists can
    /// be shared and mutated after the point has been stored in the layers
    pub(crate) neighbours: Arc<RwLock<Vec<Vec<Arc<PointWithOrder<'b, T>>>>>>,
}
impl<'b, T: Clone + Send + Sync> Point<'b, T> {
    /// Builds a point owning its data vector.
    pub fn new(v: Vec<T>, origin_id: usize, p_id: PointId) -> Self {
        // one (initially empty) neighbour list per possible layer
        // CAVEAT, perhaps pass nb layer as arg ?
        let neighbours: Vec<Vec<Arc<PointWithOrder<T>>>> =
            (0..NB_LAYER_MAX).map(|_| Vec::new()).collect();
        Point {
            data: PointData::new_v(v),
            origin_id,
            p_id,
            neighbours: Arc::new(RwLock::new(neighbours)),
        }
    }
    /// Builds a point whose data is a slice borrowed from a memory-mapped file.
    pub fn new_from_mmap(s: &'b [T], origin_id: usize, p_id: PointId) -> Self {
        // one (initially empty) neighbour list per possible layer
        // CAVEAT, perhaps pass nb layer as arg ?
        let neighbours: Vec<Vec<Arc<PointWithOrder<T>>>> =
            (0..NB_LAYER_MAX).map(|_| Vec::new()).collect();
        Point {
            data: PointData::new_s(s),
            origin_id,
            p_id,
            neighbours: Arc::new(RwLock::new(neighbours)),
        }
    }
    /// get a reference to vector data
    pub fn get_v(&self) -> &[T] {
        self.data.get_v()
    }
    /// return coordinates in indexation
    pub fn get_point_id(&self) -> PointId {
        self.p_id
    }
    /// returns external (or client) id of point
    pub fn get_origin_id(&self) -> usize {
        self.origin_id
    }
    /// returns, for each layer, a vector of the Neighbour's of this point.
    /// Useful for external crates only as it reallocates vectors.
    pub fn get_neighborhood_id(&self) -> Vec<Vec<Neighbour>> {
        let guard = self.neighbours.read();
        guard
            .iter()
            .map(|layer| {
                layer
                    .iter()
                    .map(|n| {
                        Neighbour::new(
                            n.point_ref.get_origin_id(),
                            n.dist_to_ref,
                            n.point_ref.get_point_id(),
                        )
                    })
                    .collect()
            })
            .collect()
    }
    /// prints minimal information on neighbours of point.
    pub fn debug_dump(&self) {
        println!(" \n dump of point id : {:?}", self.p_id);
        println!("\n origin id : {:?} ", self.origin_id);
        println!(" neighbours : ...");
        let guard = self.neighbours.read();
        for (layer, neighbours) in guard.iter().enumerate() {
            if !neighbours.is_empty() {
                println!("neighbours at layer {:?}", layer);
                for n in neighbours {
                    println!(" {:?}", n.point_ref.p_id);
                }
            }
        }
        println!(" neighbours dump : end");
    }
} // end of block
//===========================================================================================
/// A structure to store the neighbours of a point.
#[derive(Debug, Clone)]
pub(crate) struct PointWithOrder<'b, T: Clone + Send + Sync> {
    /// the point for which we store a distance to the point for which
    /// we made a request.
    point_ref: Arc<Point<'b, T>>,
    /// The distance from point_ref to the request point (not represented in the structure)
    dist_to_ref: f32,
}
// Equality is on the stored distance only, not on the referenced point.
impl<T: Clone + Send + Sync> PartialEq for PointWithOrder<'_, T> {
    fn eq(&self, other: &PointWithOrder<T>) -> bool {
        self.dist_to_ref == other.dist_to_ref
    } // end eq
}
// Eq is asserted so the type can be used in BinaryHeap (see search_layer);
// Ord::cmp below panics if a NaN distance is ever compared.
impl<T: Clone + Send + Sync> Eq for PointWithOrder<'_, T> {}
// order points by distance to self.
#[allow(clippy::non_canonical_partial_ord_impl)]
impl<T: Clone + Send + Sync> PartialOrd for PointWithOrder<'_, T> {
    fn partial_cmp(&self, other: &PointWithOrder<T>) -> Option<Ordering> {
        self.dist_to_ref.partial_cmp(&other.dist_to_ref)
    } // end cmp
} // end impl PartialOrd
// Total order as required by BinaryHeap : a NaN distance is a hard error.
impl<T: Clone + Send + Sync> Ord for PointWithOrder<'_, T> {
    fn cmp(&self, other: &PointWithOrder<T>) -> Ordering {
        if !self.dist_to_ref.is_nan() && !other.dist_to_ref.is_nan() {
            self.dist_to_ref.partial_cmp(&other.dist_to_ref).unwrap()
        } else {
            panic!("got a NaN in a distance");
        }
    } // end cmp
}
impl<'b, T: Clone + Send + Sync> PointWithOrder<'b, T> {
    /// Builds from a shared reference to a point and its distance to the request point.
    pub fn new(point_ref: &Arc<Point<'b, T>>, dist_to_ref: f32) -> Self {
        PointWithOrder {
            point_ref: Arc::clone(point_ref),
            dist_to_ref,
        }
    }
} // end of impl block
//============================================================================================
// LayerGenerator
use rand::distributions::Uniform;
use rand::prelude::*;
/// a struct to randomly generate a level for an item according to an exponential law
/// of parameter given by scale.
/// The distribution is constrained to be in [0..maxlevel[
pub struct LayerGenerator {
    // rng behind a Mutex so generate() can be called through &self from several threads
    rng: Arc<Mutex<rand::rngs::StdRng>>,
    // uniform [0,1) sampler used for inverse-transform sampling
    unif: Uniform<f64>,
    // drives number of levels generated ~ S
    scale: f64,
    // exclusive upper bound for generated levels
    maxlevel: usize,
}
impl LayerGenerator {
    /// Builds a generator with the default scale S = 1/ln(max_nb_connection).
    pub fn new(max_nb_connection: usize, maxlevel: usize) -> Self {
        LayerGenerator {
            rng: Arc::new(Mutex::new(StdRng::from_entropy())),
            unif: Uniform::new(0., 1.),
            scale: 1. / (max_nb_connection as f64).ln(),
            maxlevel,
        }
    }
    // new when we know the scale used. Should replace the one without scale.
    pub(crate) fn new_with_scale(
        max_nb_connection: usize,
        scale_factor: f64,
        maxlevel: usize,
    ) -> Self {
        let base_scale = 1. / (max_nb_connection as f64).ln();
        LayerGenerator {
            rng: Arc::new(Mutex::new(StdRng::from_entropy())),
            unif: Uniform::new(0., 1.),
            scale: base_scale * scale_factor,
            maxlevel,
        }
    }
    //
    // l=0 most densely packed layer
    // if S is scale we sample so that P(l=n) = exp(-n/S) - exp(- (n+1)/S)
    // with S = 1./ln(max_nb_connection) P(l >= maxlevel) = exp(-maxlevel * ln(max_nb_connection))
    // for nb_conn = 10, even with maxlevel = 10, we get P(l >= maxlevel) = 1.E-13
    // In Malkov(2016) S = 1./log(max_nb_connection)
    //
    /// generate a layer with given maxlevel. upper layers (higher index) are of decreasing probabilities.
    /// thread safe method.
    fn generate(&self) -> usize {
        let mut rng = self.rng.lock();
        // inverse-transform sampling of an exponential law of parameter 1/scale
        let u: f64 = rng.sample(self.unif);
        let sampled = (-u.ln() * self.scale).floor() as usize;
        if sampled < self.maxlevel {
            sampled
        } else {
            // we redispatch a sampled level >= maxlevel to the required range;
            // this occurs with very low probability. Cf commentary above.
            rng.sample(Uniform::new(0, self.maxlevel))
        }
    }
    /// just to try some variations on exponential level sampling. Unused.
    fn set_scale_modification(&mut self, scale_modification: f64) {
        self.scale *= scale_modification;
        log::info!("using scale for sampling levels : {:.2e}", self.scale);
    }
    //
    fn get_level_scale(&self) -> f64 {
        self.scale
    }
} // end impl for LayerGenerator
// ====================================================================
/// A short-hand for points in a layer
type Layer<'b, T> = Vec<Arc<Point<'b, T>>>;
/// a structure for indexation of points in layers
#[allow(unused)]
pub struct PointIndexation<'b, T: Clone + Send + Sync> {
    /// max number of connection for a point at a layer
    pub(crate) max_nb_connection: usize,
    /// number of layers; valid layer indices are 0..max_layer
    pub(crate) max_layer: usize,
    /// needs at least one representation of points. points_by_layers\[i\] gives the points in layer i
    pub(crate) points_by_layer: Arc<RwLock<Vec<Layer<'b, T>>>>,
    /// utility to generate a level
    pub(crate) layer_g: LayerGenerator,
    /// number of points in indexed structure
    pub(crate) nb_point: Arc<RwLock<usize>>,
    /// current enter_point: an Arc RwLock on a possible Arc Point.
    /// Its layer defines the maximum level observed (see check_entry_point).
    pub(crate) entry_point: Arc<RwLock<Option<Arc<Point<'b, T>>>>>,
}
// A point indexation may contain circular references (points reference each other
// through their neighbour lists). To deallocate these after a point indexation goes
// out of scope, implement the Drop trait and break the cycles by hand.
impl<T: Clone + Send + Sync> Drop for PointIndexation<'_, T> {
    fn drop(&mut self) {
        let cpu_start = ProcessTime::now();
        let sys_now = SystemTime::now();
        info!("entering PointIndexation drop");
        // clear_neighborhood. There are no points in neighborhoods that are not referenced directly in layers,
        // so we cannot lose reference to a point by cleaning neighborhoods.
        // Dropping the Arcs stored in the neighbour lists is what breaks the cycles.
        fn clear_neighborhoods<T: Clone + Send + Sync>(init: &Point<T>) {
            let mut neighbours = init.neighbours.write();
            let nb_layer = neighbours.len();
            for l in 0..nb_layer {
                neighbours[l].clear();
            }
            neighbours.clear();
        }
        if let Some(i) = self.entry_point.write().as_ref() {
            clear_neighborhoods(i.as_ref());
        }
        //
        let nb_level = self.get_max_level_observed();
        for l in 0..=nb_level {
            trace!("clearing layer {}", l);
            // NOTE(review): the write lock on points_by_layer is re-acquired at
            // each loop iteration.
            let layer = &mut self.points_by_layer.write()[l as usize];
            layer.into_par_iter().for_each(|p| clear_neighborhoods(p));
            layer.clear();
        }
        //
        debug!("clearing self.points_by_layer...");
        // NOTE(review): this only acquires and immediately drops a write guard;
        // the vector itself is freed by the normal field drop that follows.
        drop(self.points_by_layer.write());
        debug!("exiting PointIndexation drop");
        info!(
            " drop sys time(s) {:?} cpu time {:?}",
            sys_now.elapsed().unwrap().as_secs(),
            cpu_start.elapsed().as_secs()
        );
    } // end my drop
} // end implementation Drop
impl<'b, T: Clone + Send + Sync> PointIndexation<'b, T> {
    /// Allocates the layered structure.
    /// `max_elements` is only a hint, used to pre-reserve each layer according to the
    /// expected fraction of points falling into it under the exponential level law.
    pub fn new(max_nb_connection: usize, max_layer: usize, max_elements: usize) -> Self {
        let mut points_by_layer = Vec::with_capacity(max_layer);
        // S is the scale of the exponential level law (cf LayerGenerator)
        let s = 1. / (max_nb_connection as f64).ln();
        for i in 0..max_layer {
            // recall that ranges exclude their right extremity.
            // fraction of points expected in layer i : P(l = i) = exp(-i/S) - exp(-(i+1)/S)
            // (fixed : the original code was missing .exp() on the second term, which
            // made frac exceed 1 for layer 0 and grossly over-reserved memory)
            let frac = (-(i as f64) / s).exp() - (-((i + 1) as f64) / s).exp();
            let expected_size = ((frac * max_elements as f64).round()) as usize;
            points_by_layer.push(Vec::with_capacity(expected_size));
        }
        let layer_g = LayerGenerator::new(max_nb_connection, max_layer);
        PointIndexation {
            max_nb_connection,
            max_layer,
            points_by_layer: Arc::new(RwLock::new(points_by_layer)),
            layer_g,
            nb_point: Arc::new(RwLock::new(0)),
            entry_point: Arc::new(RwLock::new(None)),
        }
    } // end of new
    /// returns the maximum level of layer observed, i.e. the level of the entry point
    /// (0 when the structure is still empty)
    pub fn get_max_level_observed(&self) -> u8 {
        let opt = self.entry_point.read();
        match opt.as_ref() {
            Some(arc_point) => arc_point.p_id.0,
            None => 0,
        }
    }
    /// returns the scale used when sampling levels
    pub fn get_level_scale(&self) -> f64 {
        self.layer_g.get_level_scale()
    }
    /// prints the number of points stored in each occupied layer
    fn debug_dump(&self) {
        println!(" debug dump of PointIndexation");
        let max_level_observed = self.get_max_level_observed();
        // CAVEAT a lock once
        for l in 0..=max_level_observed as usize {
            println!(
                " layer {} : length : {} ",
                l,
                self.points_by_layer.read()[l].len()
            );
        }
        println!(" debug dump of PointIndexation end");
    }
    /// real insertion of point in point indexation.
    // generate a new Point/ArcPoint (with neighbourhood info empty) and store it in the
    // global table. The function is called by the Hnsw insert method.
    // Returns the new point and the total number of points after this insertion.
    fn generate_new_point(&self, data: &[T], origin_id: usize) -> (Arc<Point<'b, T>>, usize) {
        // sample the level for the new point
        let level = self.layer_g.generate();
        let new_point;
        {
            // open a write lock on points_by_layer so the rank in the layer and the
            // push into the layer stay consistent
            let mut points_by_layer_ref = self.points_by_layer.write();
            let mut p_id = PointId(level as u8, -1);
            p_id.1 = points_by_layer_ref[p_id.0 as usize].len() as i32;
            // make a Point and then an Arc<Point>
            let point = Point::new(data.to_vec(), origin_id, p_id);
            new_point = Arc::new(point);
            trace!("definitive pushing of point {:?}", p_id);
            points_by_layer_ref[p_id.0 as usize].push(Arc::clone(&new_point));
        } // close write lock on points_by_layer
        //
        let nb_point;
        {
            let mut lock_nb_point = self.nb_point.write();
            *lock_nb_point += 1;
            nb_point = *lock_nb_point;
            if nb_point % 50000 == 0 {
                println!(" setting number of points {:?} ", nb_point);
            }
        }
        // (fixed : the original debug-formatted the RwLock itself instead of the count)
        trace!(" setting number of points {:?} ", nb_point);
        // Now possibly this is a point on a new layer that will have no neighbours in its layer
        (Arc::clone(&new_point), nb_point)
    } // end of generate_new_point
    /// check if entry_point must be modified, i.e. the new point reached a higher layer
    fn check_entry_point(&self, new_point: &Arc<Point<'b, T>>) {
        //
        // take directly a write lock so that we are sure nobody can change anything
        // between read and write of entry_point
        trace!("trying to get a lock on entry point");
        let mut entry_point_ref = self.entry_point.write();
        match entry_point_ref.as_ref() {
            Some(arc_point) => {
                if new_point.p_id.0 > arc_point.p_id.0 {
                    debug!("Hnsw , inserting entry point {:?} ", new_point.p_id);
                    debug!(
                        "PointIndexation insert setting max level from {:?} to {:?}",
                        arc_point.p_id.0, new_point.p_id.0
                    );
                    *entry_point_ref = Some(Arc::clone(new_point));
                }
            }
            None => {
                trace!("initializing entry point");
                debug!("Hnsw , inserting entry point {:?} ", new_point.p_id);
                *entry_point_ref = Some(Arc::clone(new_point));
            }
        }
    } // end of check_entry_point
    /// returns the number of points in layered structure
    pub fn get_nb_point(&self) -> usize {
        *self.nb_point.read()
    }
    /// returns the number of points in a given layer, 0 on a bad layer num
    pub fn get_layer_nb_point(&self, layer: usize) -> usize {
        // take the read lock once (the original acquired it twice : once for the
        // length check and once for the access)
        let layers = self.points_by_layer.read();
        layers.get(layer).map_or(0, |l| l.len())
    } // end of get_layer_nb_point
    /// returns the size of data vector in graph if any, else return 0
    pub fn get_data_dimension(&self) -> usize {
        let ep = self.entry_point.read();
        match ep.as_ref() {
            Some(point) => point.get_v().len(),
            None => 0,
        }
    }
    /// returns (**by cloning**) the data inside a point given its PointId, or None if PointId is not coherent.
    /// Can be useful after reloading from a dump.
    /// NOTE : This function should not be called during or before insertion in the structure is terminated as it
    /// uses read locks to access the inside of Hnsw structure.
    pub fn get_point_data(&self, p_id: &PointId) -> Option<Vec<T>> {
        if p_id.1 < 0 {
            // a negative rank marks a point not yet slotted in its layer
            return None;
        }
        let p: usize = std::convert::TryFrom::try_from(p_id.1).unwrap();
        let l = p_id.0 as usize;
        if p_id.0 <= self.get_max_level_observed() && p < self.get_layer_nb_point(l) {
            Some(self.points_by_layer.read()[l][p].get_v().to_vec())
        } else {
            None
        }
    } // end of get_point_data
    /// returns (**by Arc::clone**) the point given its PointId, or None if PointId is not coherent.
    /// Can be useful after reloading from a dump.
    /// NOTE : This function should not be called during or before insertion in the structure is terminated as it
    /// uses read locks to access the inside of Hnsw structure.
    #[allow(unused)]
    pub(crate) fn get_point(&self, p_id: &PointId) -> Option<Arc<Point<'b, T>>> {
        if p_id.1 < 0 {
            return None;
        }
        let p: usize = std::convert::TryFrom::try_from(p_id.1).unwrap();
        let l = p_id.0 as usize;
        if p_id.0 <= self.get_max_level_observed() && p < self.get_layer_nb_point(l) {
            Some(self.points_by_layer.read()[l][p].clone())
        } else {
            None
        }
    } // end of get_point
    /// get an iterator on the points stored in a given layer
    pub fn get_layer_iterator<'a>(&'a self, layer: usize) -> IterPointLayer<'a, 'b, T> {
        IterPointLayer::new(self, layer)
    } // end of get_layer_iterator
} // end of impl PointIndexation
//============================================================================================
/// an iterator on points stored.
/// The iteration begins at level 0 (most populated level) and goes upward in levels.
/// The iterator takes a ReadGuard on the PointIndexation structure, so insertions
/// (which need the write lock) are blocked while it is alive.
pub struct IterPoint<'a, 'b, T: Clone + Send + Sync + 'b> {
    point_indexation: &'a PointIndexation<'b, T>,
    // read guard on the layer table, acquired at construction
    pi_guard: RwLockReadGuard<'a, Vec<Layer<'b, T>>>,
    // current layer; -1 before the first call to next()
    layer: i64,
    // current slot in the current layer; -1 before the first call to next()
    slot_in_layer: i64,
}
impl<'a, 'b, T: Clone + Send + Sync> IterPoint<'a, 'b, T> {
    /// Creates the iterator; acquires and keeps a read guard on the layer table.
    pub fn new(point_indexation: &'a PointIndexation<'b, T>) -> Self {
        IterPoint {
            point_indexation,
            pi_guard: point_indexation.points_by_layer.read(),
            layer: -1,
            slot_in_layer: -1,
        }
    }
} // end of block impl IterPoint
/// iterator for layer 0 to upper layer.
impl<'b, T: Clone + Send + Sync> Iterator for IterPoint<'_, 'b, T> {
    type Item = Arc<Point<'b, T>>;
    //
    fn next(&mut self) -> Option<Self::Item> {
        if self.layer == -1 {
            self.layer = 0;
            self.slot_in_layer = 0;
        }
        if (self.slot_in_layer as usize) < self.pi_guard[self.layer as usize].len() {
            let slot = self.slot_in_layer as usize;
            self.slot_in_layer += 1;
            Some(self.pi_guard[self.layer as usize][slot].clone())
        } else {
            self.slot_in_layer = 0;
            self.layer += 1;
            // an empty structure has no entry point : nothing to iterate.
            // (fixed : the original unwrap()ed here and panicked when iterating an
            // empty structure)
            let entry_point_ref = self.point_indexation.entry_point.read();
            let entry_point_level = match entry_point_ref.as_ref() {
                Some(ep) => ep.p_id.0,
                None => return None,
            };
            // must reach a non empty layer if possible; reuse the guard taken in
            // new() instead of re-locking points_by_layer.
            // (fixed : the original re-acquired the read lock although self.pi_guard
            // already holds it — a recursive parking_lot read() can deadlock when a
            // writer is queued)
            while (self.layer as u8) <= entry_point_level
                && self.pi_guard[self.layer as usize].is_empty()
            {
                self.layer += 1;
            }
            // now here either (self.layer as u8) > entry_point_level
            // or self.pi_guard[self.layer as usize] is non empty
            if (self.layer as u8) <= entry_point_level {
                let slot = self.slot_in_layer as usize;
                self.slot_in_layer += 1;
                Some(self.pi_guard[self.layer as usize][slot].clone())
            } else {
                None
            }
        }
    } // end of next
} // end of impl Iterator
// Allows `for point in &point_indexation { ... }` over all stored points.
impl<'a, 'b, T: Clone + Send + Sync> IntoIterator for &'a PointIndexation<'b, T> {
    type Item = Arc<Point<'b, T>>;
    type IntoIter = IterPoint<'a, 'b, T>;
    //
    fn into_iter(self) -> Self::IntoIter {
        IterPoint::new(self)
    }
} // end of IntoIterator for &'a PointIndexation<T>
/// An iterator on points stored in a given layer.
/// The iterator stores a ReadGuard on the structure PointIndexation.
pub struct IterPointLayer<'a, 'b, T: Clone + Send + Sync> {
    // kept only to tie lifetimes together; never read
    _point_indexation: &'a PointIndexation<'b, T>,
    // read guard on the layer table, acquired at construction
    pi_guard: RwLockReadGuard<'a, Vec<Layer<'b, T>>>,
    // the (fixed) layer iterated over
    layer: usize,
    // next slot to yield in the layer
    slot_in_layer: usize,
}
impl<'a, 'b, T: Clone + Send + Sync> IterPointLayer<'a, 'b, T> {
    /// Creates an iterator over the points of `layer`;
    /// acquires and keeps a read guard on the layer table.
    pub fn new(point_indexation: &'a PointIndexation<'b, T>, layer: usize) -> Self {
        IterPointLayer {
            _point_indexation: point_indexation,
            pi_guard: point_indexation.points_by_layer.read(),
            layer,
            slot_in_layer: 0,
        }
    }
} // end of block impl IterPointLayer
/// iterator over the points of the single layer chosen at construction.
impl<'b, T: Clone + Send + Sync + 'b> Iterator for IterPointLayer<'_, 'b, T> {
    type Item = Arc<Point<'b, T>>;
    //
    fn next(&mut self) -> Option<Self::Item> {
        let item = self.pi_guard[self.layer].get(self.slot_in_layer).cloned();
        if item.is_some() {
            self.slot_in_layer += 1;
        }
        item
    } // end of next
} // end of impl Iterator
// ============================================================================================
// The fields are made pub(crate) to be able to initialize struct from hnswio
/// The Base structure for hnsw implementation.
/// The main useful functions are : new, insert, insert_parallel, search, parallel_search and file_dump
/// as described in trait AnnT.
///
/// Other functions are mainly for other crates to get access to some fields.
pub struct Hnsw<'b, T: Clone + Send + Sync + 'b, D: Distance<T>> {
    /// asked number of candidates in search
    pub(crate) ef_construction: usize,
    /// maximum number of connection by layer for a point
    pub(crate) max_nb_connection: usize,
    /// flag to enforce that we have ef candidates as pruning strategy can discard some points
    /// Can be set to true with method :set_extend_candidates
    /// When set to true used only in base layer.
    pub(crate) extend_candidates: bool,
    /// defaults to false
    pub(crate) keep_pruned: bool,
    /// max layer , recall rust is in 0..maxlevel right bound excluded
    pub(crate) max_layer: usize,
    /// The global table containing points
    pub(crate) layer_indexed_points: PointIndexation<'b, T>,
    /// dimension of the data stored in points
    #[allow(unused)]
    pub(crate) data_dimension: usize,
    /// distance between points. initialized at first insertion
    pub(crate) dist_f: D,
    /// insertion mode or searching mode. This flag prevents an internal thread to do a write when searching with other threads.
    pub(crate) searching: bool,
    /// set to true if some data come from a mmap
    pub(crate) datamap_opt: bool,
} // end of Hnsw
impl<'b, T: Clone + Send + Sync, D: Distance<T> + Send + Sync> Hnsw<'b, T, D> {
/// allocation function
/// . max_nb_connection : number of neighbours stored, by layer, in tables. Must be less equal than 256.
/// . max_elements : hint to speed up allocation tables. number of elements expected.
/// . max_layer : maximum number of layers, capped to NB_LAYER_MAX.
/// . ef_construction : controls numbers of neighbours explored during construction. See README or paper.
/// . f : the distance function
pub fn new(
    max_nb_connection: usize,
    max_elements: usize,
    max_layer: usize,
    ef_construction: usize,
    f: D,
) -> Self {
    // validate arguments before allocating anything (the original built the whole
    // point indexation before running this check)
    if max_nb_connection > 256 {
        println!("error max_nb_connection must be less equal than 256");
        std::process::exit(1);
    }
    // NOTE(review): get_max_nb_connection() returns u8, so a value of exactly 256
    // would be truncated to 0 there — confirm whether the intended bound is 255.
    let adjusted_max_layer = (NB_LAYER_MAX as usize).min(max_layer);
    let layer_indexed_points =
        PointIndexation::<T>::new(max_nb_connection, adjusted_max_layer, max_elements);
    let extend_candidates = false;
    let keep_pruned = false;
    //
    info!("Hnsw max_nb_connection {:?}", max_nb_connection);
    info!("Hnsw nb elements {:?}", max_elements);
    info!("Hnsw ef_construction {:?}", ef_construction);
    info!("Hnsw distance {:?}", type_name::<D>());
    info!("Hnsw extend candidates {:?}", extend_candidates);
    //
    Hnsw {
        max_nb_connection,
        ef_construction,
        extend_candidates,
        keep_pruned,
        max_layer: adjusted_max_layer,
        layer_indexed_points,
        data_dimension: 0,
        dist_f: f,
        searching: false,
        datamap_opt: false,
    }
} // end of new
/// get ef_construction used in graph creation
pub fn get_ef_construction(&self) -> usize {
    self.ef_construction
}
/// returns the maximum layer authorized in construction
pub fn get_max_level(&self) -> usize {
    self.max_layer
}
/// return the maximum level reached in the layers.
pub fn get_max_level_observed(&self) -> u8 {
    self.layer_indexed_points.get_max_level_observed()
}
/// returns the maximum of links between a point and others points in each layer.
/// NOTE(review): the `as u8` cast truncates, so a max_nb_connection of exactly 256
/// (accepted by `new`) would be returned as 0 — confirm the intended bound is 255.
pub fn get_max_nb_connection(&self) -> u8 {
    self.max_nb_connection as u8
}
/// returns number of points stored in hnsw structure
pub fn get_nb_point(&self) -> usize {
    self.layer_indexed_points.get_nb_point()
}
/// set searching mode.
/// It is not possible to do parallel insertion and parallel searching simultaneously in different threads
/// so to enable searching after parallel insertion the flag must be set to true.
/// To resume parallel insertion reset the flag to false and so on.
pub fn set_searching_mode(&mut self, flag: bool) {
    // must use an atomic!
    self.searching = flag;
}
/// get the name of the distance type used in construction
pub fn get_distance_name(&self) -> String {
    type_name::<D>().to_string()
}
/// set the flag asking to keep pruned vectors by Navarro's heuristic (see Paper).
/// It can be useful for small datasets where the pruning can make it difficult
/// to get the exact number of neighbours asked for.
pub fn set_keeping_pruned(&mut self, flag: bool) {
    self.keep_pruned = flag;
}
/// retrieves the distance used in Hnsw construction
pub fn get_distance(&self) -> &D {
    &self.dist_f
}
/// set extend_candidates to given flag. By default it is false.
/// Only used in the level 0 layer during insertion (see the paper)
/// flag to enforce that we have ef candidates neighbours examined as pruning strategy
/// can discard some points
pub fn set_extend_candidates(&mut self, flag: bool) {
    self.extend_candidates = flag;
}
// When dumping we need to know if some file is mmapped
pub(crate) fn get_datamap_opt(&self) -> bool {
    self.datamap_opt
}
/// By default the levels are sampled using an exponential law of parameter **ln(max_nb_conn)**
/// so the probability of having more than l levels decrease as **exp(-l * ln(max_nb_conn))**.
/// Reducing the scale change the parameter of the exponential to **ln(max_nb_conn)/scale**.
/// This reduce the number of levels generated and can provide better precision, reduce memory with marginally more cpu used.
/// The factor must between 0.2 and 1.
pub fn modify_level_scale(&mut self, scale_modification: f64) {
    // warn if the structure is already populated : modifying the scale then cannot
    // be coherent with the levels already sampled
    if self.get_nb_point() > 0 {
        println!(
            "using modify_level_scale is possible at creation of a Hnsw structure to ensure coherence between runs"
        )
    }
    let min_factor = 0.2;
    println!(
        "\n Current scale value : {:.2e}, Scale modification factor asked : {:.2e},(modification factor must be between {:.2e} and 1.)",
        self.layer_indexed_points.layer_g.scale, scale_modification, min_factor
    );
    // diagnostics for out-of-range requests; the factor actually applied is the
    // requested one clamped into [min_factor, 1.] below
    if scale_modification > 1. {
        println!(
            "\n Scale modification not applied, modification arg {:.2e} not valid , factor must be less than 1.)",
            scale_modification
        );
    } else if scale_modification < min_factor {
        println!(
            "\n Scale modification arg {:.2e} not valid , factor must be greater than {:.2e}, using {:.2e})",
            scale_modification, min_factor, min_factor
        );
    }
    let applied_factor = scale_modification.max(min_factor).min(1.);
    self.layer_indexed_points
        .layer_g
        .set_scale_modification(applied_factor);
} // end of modify_level_scale
// here we could pass a point_id_with_order instead of entry_point_id: PointId
// The efficacity depends on greedy part depends on how near entry point is from point.
// ef is the number of points to return
// The method returns a BinaryHeap with positive distances. The caller must transforms it according its need
//** NOTE: the entry point is pushed into returned point at the beginning of the function, but in fact entry_point is in a layer
//** with higher (one more) index than the argument layer. If the greedy search matches a sufficiently large number of points
//** nearer to point searched (arg point) than entry_point it will finally get popped up from the heap of returned points
//** but otherwise it will stay in the binary heap and so we can have a point in neighbours that is in fact in a layer
//** above the one we search in.
//** The guarantee is that the binary heap will return points in layer
//** NOTE: returned points can have been inserted at a layer above `layer` (they then carry
//** a larger layer index), although we can expect that most often (at least in densely populated
//** layers) the returned points will be found in the searched layer itself.
///
/// Greedy algorithm n° 2 in Malkov paper.
/// search in a layer (layer) for the ef points nearest a point to be inserted in hnsw.
///
/// Sign convention of the two heaps (essential to follow the code):
/// * `return_points` is a max-heap on POSITIVE distances, so `peek()` is the farthest
///   of the best results kept so far (capped at `ef` entries);
/// * `candidate_points` is a max-heap on NEGATIVE distances, so `pop()` yields the
///   nearest unexplored candidate.
///
/// When `filter` is `Some`, candidates rejected by the filter are never pushed into
/// `return_points`, and the early-exit path only returns once `ef` filtered results
/// have been accumulated.
fn search_layer(
    &self,
    point: &[T],
    entry_point: Arc<Point<'b, T>>,
    ef: usize,
    layer: u8,
    filter: Option<&dyn FilterT>,
) -> BinaryHeap<Arc<PointWithOrder<'b, T>>> {
    //
    trace!(
        "entering search_layer with entry_point_id {:?} layer : {:?} ef {:?} ",
        entry_point.p_id, layer, ef
    );
    //
    // here we allocate a binary_heap on values not on reference because we want to return it.
    // log2(skiplist_size) must be greater than 1.
    let skiplist_size = ef.max(2);
    // we will store positive distances in this one
    let mut return_points = BinaryHeap::<Arc<PointWithOrder<T>>>::with_capacity(skiplist_size);
    //
    if self.layer_indexed_points.points_by_layer.read()[layer as usize].is_empty() {
        // at the beginning we can have nothing in layer
        trace!("search layer {:?}, empty layer", layer);
        return return_points;
    }
    if entry_point.p_id.1 < 0 {
        // a negative in-layer rank marks an invalid entry point: nothing searchable
        trace!("search layer negative point id : {:?}", entry_point.p_id);
        return return_points;
    }
    // initialize visited points
    let dist_to_entry_point = self.dist_f.eval(point, entry_point.data.get_v());
    trace!(" distance to entry point: {:?} ", dist_to_entry_point);
    // keep a list of id visited
    let mut visited_point_id = HashMap::<PointId, Arc<Point<T>>>::new();
    visited_point_id.insert(entry_point.p_id, Arc::clone(&entry_point));
    //
    // candidates carry NEGATIVE distances so the max-heap pops the nearest one first
    let mut candidate_points =
        BinaryHeap::<Arc<PointWithOrder<T>>>::with_capacity(skiplist_size);
    candidate_points.push(Arc::new(PointWithOrder::new(
        &entry_point,
        -dist_to_entry_point,
    )));
    return_points.push(Arc::new(PointWithOrder::new(
        &entry_point,
        dist_to_entry_point,
    )));
    // at the beginning candidate_points contains point passed as arg in layer entry_point_id.0
    while !candidate_points.is_empty() {
        // get nearest point in candidate_points
        let c = candidate_points.pop().unwrap();
        // f farthest point retained so far
        let f = return_points.peek().unwrap();
        assert!(f.dist_to_ref >= 0.);
        assert!(c.dist_to_ref <= 0.);
        trace!(
            "Comparaing c : {:?} f : {:?}",
            -(c.dist_to_ref),
            f.dist_to_ref
        );
        // termination test: the nearest remaining candidate is farther than the worst
        // retained result, so no improvement can come from exploring further
        if -(c.dist_to_ref) > f.dist_to_ref {
            // this comparison requires that we are sure that distances compared are distances to the same point :
            // This is the case we compare distance to point passed as arg.
            trace!(
                "Fast return from search_layer, nb points : {:?} \n \t c {:?} \n \t f {:?} dists: {:?} {:?}",
                return_points.len(),
                c.point_ref.p_id,
                f.point_ref.p_id,
                -(c.dist_to_ref),
                f.dist_to_ref
            );
            if filter.is_none() {
                return return_points;
            } else if return_points.len() >= ef {
                // with a filter we prune the non-matching results but keep searching
                // until ef filtered results have been gathered
                return_points.retain(|p| {
                    filter
                        .as_ref()
                        .unwrap()
                        .hnsw_filter(&p.point_ref.get_origin_id())
                });
            }
        }
        // now we scan neighborhood of c in layer and increment visited_point, candidate_points
        // and optimize candidate_points so that it contains points with lowest distances to point arg
        //
        let neighbours_c_l = &c.point_ref.neighbours.read()[layer as usize];
        let c_pid = c.point_ref.p_id;
        trace!(
            " search_layer, {:?} has nb neighbours : {:?} ",
            c_pid,
            neighbours_c_l.len()
        );
        for e in neighbours_c_l {
            // HERE WE SEE THAT neighbours should be stored as PointIdWithOrder !!
            // CAVEAT what if several point_id with same distance to ref point?
            if !visited_point_id.contains_key(&e.point_ref.p_id) {
                visited_point_id.insert(e.point_ref.p_id, Arc::clone(&e.point_ref));
                trace!(" visited insertion {:?}", e.point_ref.p_id);
                let f_opt = return_points.peek();
                if f_opt.is_none() {
                    // do some debug info, dumped distance is from e to c! as e is in c neighbours
                    debug!("return points empty when inserting {:?}", e.point_ref.p_id);
                    return return_points;
                }
                let f = f_opt.unwrap();
                let e_dist_to_p = self.dist_f.eval(point, e.point_ref.data.get_v());
                let f_dist_to_p = f.dist_to_ref;
                // keep e if it improves on the worst retained point, or if the result set is not full yet
                if e_dist_to_p < f_dist_to_p || return_points.len() < ef {
                    let e_prime = Arc::new(PointWithOrder::new(&e.point_ref, e_dist_to_p));
                    // a neighbour of neighbour is better, we insert it into candidate with the distance to point
                    trace!(
                        " inserting new candidate {:?}",
                        e_prime.point_ref.p_id
                    );
                    candidate_points
                        .push(Arc::new(PointWithOrder::new(&e.point_ref, -e_dist_to_p)));
                    if filter.is_none() {
                        return_points.push(Arc::clone(&e_prime));
                    } else {
                        let id: &usize = &e_prime.point_ref.get_origin_id();
                        if filter.as_ref().unwrap().hnsw_filter(id) {
                            // the initial entry point was inserted unfiltered: evict it
                            // if it does not satisfy the filter
                            if return_points.len() == 1 {
                                let only_id = return_points.peek().unwrap().point_ref.origin_id;
                                if !filter.as_ref().unwrap().hnsw_filter(&only_id) {
                                    return_points.clear()
                                }
                            }
                            return_points.push(Arc::clone(&e_prime))
                        }
                    }
                    // cap the result heap at ef entries by dropping the farthest one
                    if return_points.len() > ef {
                        return_points.pop();
                    }
                } // end if e.dist_to_ref < f.dist_to_ref
            }
        } // end of for on neighbours_c
    } // end of while in candidates
    //
    trace!(
        "return from search_layer, nb points : {:?}",
        return_points.len()
    );
    return_points
} // end of search_layer
/// insert a tuple (&Vec, usize) with its external id as given by the client.
/// The insertion method gives the point an internal id.
#[inline]
pub fn insert(&self, datav_with_id: (&[T], usize)) {
    // destructure for readability, then delegate to the slice insertion
    let (data, external_id) = datav_with_id;
    self.insert_slice((data, external_id))
}
// Hnsw insert.
/// Insert a data slice with its external id as given by the client.
/// The insertion method gives the point an internal id.
/// The slice insertion makes integration with ndarray crate easier than the vector insertion
///
/// Implements algorithm n° 1 of the Malkov paper:
/// 1. allocate the point (its random maximum layer `level` is drawn by
///    `generate_new_point`);
/// 2. greedy descent (ef = 1) from the current entry point down to `level + 1`
///    to find a close entry for the layers the point actually belongs to;
/// 3. for each layer from `level` down to 0, run `search_layer` with
///    `ef_construction`, select neighbours and store the forward links;
/// 4. finally add the reverse links and possibly promote the new point to
///    entry point.
pub fn insert_slice(&self, data_with_id: (&[T], usize)) {
    //
    let (data, origin_id) = data_with_id;
    let keep_pruned = self.keep_pruned;
    // insert in indexation and get point_id and generate a new entry_point if necessary
    let (new_point, point_rank) = self
        .layer_indexed_points
        .generate_new_point(data, origin_id);
    trace!("Hnsw insert generated new point {:?} ", new_point.p_id);
    // now real work begins
    // allocate a binary heap
    let level = new_point.p_id.0;
    let mut enter_point_copy = None;
    let mut max_level_observed = 0;
    // entry point has been set in
    {
        // I open a read lock on an option
        if let Some(arc_point) = self.layer_indexed_points.entry_point.read().as_ref() {
            enter_point_copy = Some(Arc::clone(arc_point));
            if point_rank == 1 {
                // very first point stored: nothing to link, direct return
                debug!(
                    "Hnsw stored first point , direct return {:?} ",
                    new_point.p_id
                );
                return;
            }
            max_level_observed = enter_point_copy.as_ref().unwrap().p_id.0;
        }
    }
    if enter_point_copy.is_none() {
        // no entry point yet: the new point becomes a candidate entry point
        self.layer_indexed_points.check_entry_point(&new_point);
        return;
    }
    let mut dist_to_entry = self
        .dist_f
        .eval(data, enter_point_copy.as_ref().unwrap().data.get_v());
    // we go from self.max_level_observed to level+1 included
    for l in ((level + 1)..(max_level_observed + 1)).rev() {
        // CAVEAT could bypass when layer empty, avoid allocation..
        let mut sorted_points = self.search_layer(
            data,
            Arc::clone(enter_point_copy.as_ref().unwrap()),
            1,
            l,
            None,
        );
        trace!(
            "in insert :search_layer layer {:?}, returned {:?} points ",
            l,
            sorted_points.len()
        );
        // with ef == 1 search_layer must return at most one point
        if sorted_points.len() > 1 {
            panic!(
                "in insert : search_layer layer {:?}, returned {:?} points ",
                l,
                sorted_points.len()
            );
        }
        // the heap conversion is useless because of the preceding test.
        // sorted_points = from_positive_binaryheap_to_negative_binary_heap(&mut sorted_points);
        //
        if let Some(ep) = sorted_points.pop() {
            // useful for projecting lower layer to upper layer. keep track of points encountered.
            if new_point.neighbours.read()[l as usize].len()
                < self.get_max_nb_connection() as usize
            {
                new_point.neighbours.write()[l as usize].push(Arc::clone(&ep));
            }
            // get the lowest distance point
            let tmp_dist = self.dist_f.eval(data, ep.point_ref.data.get_v());
            if tmp_dist < dist_to_entry {
                enter_point_copy = Some(Arc::clone(&ep.point_ref));
                dist_to_entry = tmp_dist;
            }
        } else {
            // this layer is not yet filled
            trace!("layer still empty {} : got null list", l);
        }
    }
    // now enter_point_id_copy contains id of nearest
    // now loop down to 0
    for l in (0..level + 1).rev() {
        let ef = self.ef_construction;
        // when l == level, we cannot get new_point in sorted_points as it is seen only from declared neighbours
        let mut sorted_points = self.search_layer(
            data,
            Arc::clone(enter_point_copy.as_ref().unwrap()),
            ef,
            l,
            None,
        );
        trace!(
            "in insert :search_layer layer {:?}, returned {:?} points ",
            l,
            sorted_points.len()
        );
        // select_neighbours expects a heap keyed on NEGATIVE distances
        sorted_points = from_positive_binaryheap_to_negative_binary_heap(&mut sorted_points);
        if !sorted_points.is_empty() {
            let nb_conn;
            let extend_c;
            if l == 0 {
                // layer 0 is denser: double the connectivity, candidate extension as configured
                nb_conn = 2 * self.max_nb_connection;
                extend_c = self.extend_candidates;
            } else {
                nb_conn = self.max_nb_connection;
                extend_c = false;
            }
            let mut neighbours = Vec::<Arc<PointWithOrder<T>>>::with_capacity(nb_conn);
            self.select_neighbours(
                data,
                &mut sorted_points,
                nb_conn,
                extend_c,
                l,
                keep_pruned,
                &mut neighbours,
            );
            // sort neighbours
            neighbours.sort_unstable();
            // we must add bidirectional links from data i.e new_point_id to neighbours
            new_point.neighbours.write()[l as usize].clone_from(&neighbours);
            // this reverse neighbour update could be done here but we put it at end to gather all code
            // requiring a mutex guard for multi threading.
            // update ep for loop iteration. As we sorted neighbours the nearest is at index 0
            if !neighbours.is_empty() {
                enter_point_copy = Some(Arc::clone(&neighbours[0].point_ref));
            }
        }
    } // for l
    //
    // new_point has been inserted at the beginning in table
    // so that we can call reverse_update_neighborhood consistently
    // now reverse update of neighbours.
    self.reverse_update_neighborhood_simple(Arc::clone(&new_point));
    //
    self.layer_indexed_points.check_entry_point(&new_point);
    //
    trace!("Hnsw exiting insert new point {:?} ", new_point.p_id);
} // end of insert
/// Insert in parallel a slice of Vec\<T\> each associated to its id.
/// Rayon is used for threading, so the number of insertions asked for must be
/// large enough to be efficient — typically 1000 * the number of threads.
/// Many consecutive parallel_insert calls can be done, so the size of the vector
/// inserted in one call can be optimized.
pub fn parallel_insert(&self, datas: &[(&Vec<T>, usize)]) {
    debug!("entering parallel_insert");
    datas.par_iter().for_each(|&(vec_ref, external_id)| {
        self.insert((vec_ref.as_slice(), external_id));
    });
    debug!("exiting parallel_insert");
} // end of parallel_insert
/// Insert in parallel slices of \[T\] each associated to its id.
/// Rayon is used for threading, so the number of insertions asked for must be
/// large enough to be efficient — typically 1000 * the number of threads.
/// Facilitates use with the ndarray crate, as slices (for data in contiguous
/// order) can be extracted from an Array.
pub fn parallel_insert_slice(&self, datas: &Vec<(&[T], usize)>) {
    datas
        .par_iter()
        .for_each(|&(slice, external_id)| self.insert_slice((slice, external_id)));
} // end of parallel_insert_slice
/// insert new_point in neighbourhood info of point
///
/// For every forward neighbour `q` of `new_point` (at every layer up to the point's
/// level), pushes the reverse link `q -> new_point` into `q`'s neighbour list at the
/// layer of `new_point`, then re-sorts that list and drops the farthest entry if it
/// exceeds the allowed connectivity (`max_nb_connection`, doubled at layer 0).
fn reverse_update_neighborhood_simple(&self, new_point: Arc<Point<T>>) {
    // println!("reverse update neighbourhood for new point {:?} ", new_point.p_id);
    trace!(
        "reverse update neighbourhood for new point {:?} ",
        new_point.p_id
    );
    let level = new_point.p_id.0;
    for l in (0..level + 1).rev() {
        for q in &new_point.neighbours.read()[l as usize] {
            if new_point.p_id != q.point_ref.p_id {
                // as new point is in global table, do not loop and deadlock!!
                let q_point = &q.point_ref;
                // write lock on q's neighbour lists: held for the rest of this iteration
                let mut q_point_neighbours = q_point.neighbours.write();
                let n_to_add = PointWithOrder::<T>::new(&Arc::clone(&new_point), q.dist_to_ref);
                // must be sure that we add a point at the correct level. See the comment to search_layer!
                // this ensures that reverse updating do not add problems.
                let l_n = n_to_add.point_ref.p_id.0 as usize;
                // skip if the reverse link is already present
                let already = q_point_neighbours[l_n]
                    .iter()
                    .position(|old| old.point_ref.p_id == new_point.p_id);
                if already.is_some() {
                    // debug!(" new_point.p_id {:?} already in neighbourhood of q_point {:?} at index {:?}", new_point.p_id, q_point.p_id, already.unwrap());
                    // q_point.debug_dump(); cannot be called as its neighbours are locked write by this method.
                    // new_point.debug_dump();
                    // panic!();
                    continue;
                }
                q_point_neighbours[l_n].push(Arc::new(n_to_add));
                let nbn_at_l = q_point_neighbours[l_n].len();
                //
                // if l < level, update upward chaining, insert does a sort! t_q has a neighbour not yet in global table of points!
                // layer 0 tolerates twice the connectivity of the upper layers
                let threshold_shrinking = if l_n > 0 {
                    self.max_nb_connection
                } else {
                    2 * self.max_nb_connection
                };
                let shrink = nbn_at_l > threshold_shrinking;
                {
                    // sort and shrink if necessary (sorted ascending, so pop drops the farthest)
                    q_point_neighbours[l_n].sort_unstable();
                    if shrink {
                        q_point_neighbours[l_n].pop();
                    }
                }
            } // end protection against point identity
        }
    }
    // println!(" exitingreverse update neighbourhood for new point {:?} ", new_point.p_id);
} // end of reverse_update_neighborhood_simple
/// Returns a reference to the internal [`PointIndexation`] structure,
/// giving access to the points stored layer by layer (iterators, counts, entry point).
pub fn get_point_indexation(&self) -> &PointIndexation<'b, T> {
    &self.layer_indexed_points
}
// This is best explained in : Navarro. Searching in metric spaces by spatial approximation.
/// simplest searh neighbours
// The binary heaps here are keyed on negative distances.
//
// Heuristic (algorithm n° 4 in the Malkov paper): a candidate is kept only if it is
// closer to the reference `data` than to every neighbour already selected, which
// spreads the selected neighbours in different directions around the reference.
// When `keep_pruned` is set, pruned candidates are recycled at the end to fill the quota.
#[allow(clippy::too_many_arguments)]
fn select_neighbours(
    &self,
    data: &[T],
    candidates: &mut BinaryHeap<Arc<PointWithOrder<'b, T>>>,
    nb_neighbours_asked: usize,
    extend_candidates_asked: bool,
    layer: u8,
    keep_pruned: bool,
    neighbours_vec: &mut Vec<Arc<PointWithOrder<'b, T>>>,
) {
    //
    trace!(
        "entering select_neighbours : nb candidates: {}",
        candidates.len()
    );
    //
    neighbours_vec.clear();
    // we will extend if we do not have enough candidates and it is explicitly asked in arg
    let mut extend_candidates = false;
    if candidates.len() <= nb_neighbours_asked {
        if !extend_candidates_asked {
            // just transfer taking care of signs (candidates hold negative distances)
            while !candidates.is_empty() {
                let p = candidates.pop().unwrap();
                assert!(-p.dist_to_ref >= 0.);
                neighbours_vec
                    .push(Arc::new(PointWithOrder::new(&p.point_ref, -p.dist_to_ref)));
            }
            return;
        } else {
            extend_candidates = true;
        }
    }
    //
    //
    //extend_candidates = true;
    //
    if extend_candidates {
        // enlarge the candidate pool with the (layer-local) neighbours of current candidates
        let mut candidates_set = HashMap::<PointId, Arc<Point<T>>>::new();
        for c in candidates.iter() {
            candidates_set.insert(c.point_ref.p_id, Arc::clone(&c.point_ref));
        }
        let mut new_candidates_set = HashMap::<PointId, Arc<Point<T>>>::new();
        // get a list of all neighbours of candidates
        for (_p_id, p_point) in candidates_set.iter() {
            let n_p_layer = &p_point.neighbours.read()[layer as usize];
            for q in n_p_layer {
                if !candidates_set.contains_key(&q.point_ref.p_id)
                    && !new_candidates_set.contains_key(&q.point_ref.p_id)
                {
                    new_candidates_set.insert(q.point_ref.p_id, Arc::clone(&q.point_ref));
                }
            }
        } // end of for p
        trace!(
            "select neighbours extend candidates from : {:?} adding : {:?}",
            candidates.len(),
            new_candidates_set.len()
        );
        for (_p_id, p_point) in new_candidates_set.iter() {
            let dist_topoint = self.dist_f.eval(data, p_point.data.get_v());
            candidates.push(Arc::new(PointWithOrder::new(p_point, -dist_topoint)));
        }
    } // end if extend_candidates
    //
    let mut discarded_points = BinaryHeap::<Arc<PointWithOrder<T>>>::new();
    while !candidates.is_empty() && neighbours_vec.len() < nb_neighbours_asked {
        // compare distances of e to data. we do not need to recompute dists!
        if let Some(e_p) = candidates.pop() {
            let mut e_to_insert = true;
            let e_point_v = e_p.point_ref.data.get_v();
            assert!(e_p.dist_to_ref <= 0.);
            // is e_p nearer to the reference data than to any previously selected neighbour?
            if !neighbours_vec.is_empty() {
                e_to_insert = !neighbours_vec.iter().any(|d| {
                    self.dist_f.eval(e_point_v, d.point_ref.data.get_v()) <= -e_p.dist_to_ref
                });
            }
            if e_to_insert {
                trace!("inserting neighbours : {:?} ", e_p.point_ref.p_id);
                neighbours_vec.push(Arc::new(PointWithOrder::new(
                    &e_p.point_ref,
                    -e_p.dist_to_ref,
                )));
            } else {
                trace!("discarded neighbours : {:?} ", e_p.point_ref.p_id);
                // ep is taken from a binary heap, so it has a negative sign, we keep its sign
                // to store it in another binary heap will possibly need to retain the best ones from the discarded binaryHeap
                if keep_pruned {
                    discarded_points.push(Arc::new(PointWithOrder::new(
                        &e_p.point_ref,
                        e_p.dist_to_ref,
                    )));
                }
            }
        }
    }
    // now this part of neighbours is the most interesting and is distance sorted.
    // not pruned are at the end of neighbours_vec which is not re-sorted , but discarded are sorted.
    if keep_pruned {
        // backfill with the nearest pruned candidates until the quota is reached
        while !discarded_points.is_empty() && neighbours_vec.len() < nb_neighbours_asked {
            let best_point = discarded_points.pop().unwrap();
            // do not forget to reverse sign
            assert!(best_point.dist_to_ref <= 0.);
            neighbours_vec.push(Arc::new(PointWithOrder::new(
                &best_point.point_ref,
                -best_point.dist_to_ref,
            )));
        }
    };
    //
    if log::log_enabled!(log::Level::Trace) {
        trace!(
            "exiting select_neighbours : nb candidates: {}",
            neighbours_vec.len()
        );
        for n in neighbours_vec {
            trace!(" neighbours {:?} ", n.point_ref.p_id);
        }
    }
    //
} // end of select_neighbours
/// A utility to get printed info on how many points there are in each layer.
/// Output goes through the debug dump of the internal `PointIndexation`.
pub fn dump_layer_info(&self) {
    self.layer_indexed_points.debug_dump();
}
// search the first knbn nearest neigbours of a data, but can modify ef for layer > 1
// This function returns a Vec<Neighbour>.
// The parameter ef controls the width of the search in the lowest level, it must be greater
// than number of neighbours asked. A rule of thumb could be between knbn and max_nb_connection.
//
// NOTE: unlike `search_filter`, this version always searches layer 0 last; if layer 0 is
// still empty (sparse structure) it can return nothing — see test_sparse_search.
#[allow(unused)]
fn search_general(&self, data: &[T], knbn: usize, ef_arg: usize) -> Vec<Neighbour> {
    //
    let mut entry_point;
    {
        // a lock on an option an a Arc<Point>
        let entry_point_opt_ref = self.layer_indexed_points.entry_point.read();
        if entry_point_opt_ref.is_none() {
            // empty structure, nothing to search
            return Vec::<Neighbour>::new();
        } else {
            entry_point = Arc::clone((*entry_point_opt_ref).as_ref().unwrap());
        }
    }
    //
    // greedy descent with ef = 1 through the upper layers
    let mut dist_to_entry = self.dist_f.eval(data, entry_point.as_ref().data.get_v());
    for layer in (1..=entry_point.p_id.0).rev() {
        let mut neighbours = self.search_layer(data, Arc::clone(&entry_point), 1, layer, None);
        neighbours = from_positive_binaryheap_to_negative_binary_heap(&mut neighbours);
        if let Some(entry_point_tmp) = neighbours.pop() {
            // get the lowest distance point.
            let tmp_dist = self
                .dist_f
                .eval(data, entry_point_tmp.point_ref.data.get_v());
            if tmp_dist < dist_to_entry {
                entry_point = Arc::clone(&entry_point_tmp.point_ref);
                dist_to_entry = tmp_dist;
            }
        }
    }
    // ef must be greater than knbn. Possibly it should be between knbn and self.max_nb_connection
    let ef = ef_arg.max(knbn);
    // now search with asked ef in layer 0
    let neighbours_heap = self.search_layer(data, entry_point, ef, 0, None);
    // the heap stores positive distances: into_sorted_vec yields increasing distance order.
    let neighbours = neighbours_heap.into_sorted_vec();
    // get the min of K and ef points into a vector.
    //
    let last = knbn.min(ef).min(neighbours.len());
    let knn_neighbours: Vec<Neighbour> = neighbours[0..last]
        .iter()
        .map(|p| {
            Neighbour::new(
                p.as_ref().point_ref.origin_id,
                p.as_ref().dist_to_ref,
                p.as_ref().point_ref.p_id,
            )
        })
        .collect();
    knn_neighbours
} // end of knn_search
/// a filtered version of [`Self::search`].
/// A filter can be added to the search to get nodes with a particular property or id constraint.
/// See examples in tests/filtertest.rs
///
/// The search proceeds in two phases:
/// 1. a greedy descent through the upper layers following stored neighbour lists,
///    keeping as pivot the point closest to `data`;
/// 2. a full `search_layer` of width `ef` on the lowest non-empty layer, with the
///    filter applied to the returned candidates.
pub fn search_filter(
    &self,
    data: &[T],
    knbn: usize,
    ef_arg: usize,
    filter: Option<&dyn FilterT>,
) -> Vec<Neighbour> {
    //
    let entry_point;
    {
        // a lock on an option an a Arc<Point>
        let entry_point_opt_ref = self.layer_indexed_points.entry_point.read();
        if entry_point_opt_ref.is_none() {
            // empty structure, nothing to search
            return Vec::<Neighbour>::new();
        } else {
            entry_point = Arc::clone((*entry_point_opt_ref).as_ref().unwrap());
        }
    }
    //
    let mut dist_to_entry = self.dist_f.eval(data, entry_point.as_ref().data.get_v());
    let mut pivot = Arc::clone(&entry_point);
    let mut new_pivot = None;
    //
    for layer in (1..=entry_point.p_id.0).rev() {
        let mut has_changed = false;
        // search in stored neighbours
        {
            let neighbours = &pivot.neighbours.read()[layer as usize];
            for n in neighbours {
                // get the lowest distance point.
                let tmp_dist = self.dist_f.eval(data, n.point_ref.data.get_v());
                if tmp_dist < dist_to_entry {
                    new_pivot = Some(Arc::clone(&n.point_ref));
                    has_changed = true;
                    dist_to_entry = tmp_dist;
                }
            } // end of for on neighbours
        }
        if has_changed {
            pivot = Arc::clone(new_pivot.as_ref().unwrap());
        }
    } // end on for on layers
    // ef must be greater than knbn. Possibly it should be between knbn and self.max_nb_connection
    let ef = ef_arg.max(knbn);
    log::debug!("pivot changed , current pivot {:?}", pivot.get_point_id());
    // search lowest non empty layer (in case of search with incomplete lower layer at beginning of hnsw filling)
    // loop terminates: the entry point guarantees at least one non-empty layer
    let mut l = 0u8;
    let layer_to_search = loop {
        if self.get_point_indexation().get_layer_nb_point(l as usize) > 0 {
            break l;
        }
        l += 1;
    };
    // now search with asked ef in lower layer
    let neighbours_heap = self.search_layer(data, pivot, ef, layer_to_search, filter);
    // the heap stores positive distances: into_sorted_vec yields increasing distance order.
    let neighbours = neighbours_heap.into_sorted_vec();
    // get the min of K and ef points into a vector.
    //
    let last = knbn.min(ef).min(neighbours.len());
    //
    if let Some(filter_t) = filter {
        // keep only neighbours accepted by the filter.
        // (filter_map replaces the previous map + filter(is_some) + map(unwrap) chain)
        let knn_neighbours: Vec<Neighbour> = neighbours[0..last]
            .iter()
            .filter_map(|p| {
                if filter_t.hnsw_filter(&p.as_ref().point_ref.origin_id) {
                    Some(Neighbour::new(
                        p.as_ref().point_ref.origin_id,
                        p.as_ref().dist_to_ref,
                        p.as_ref().point_ref.p_id,
                    ))
                } else {
                    None
                }
            })
            .collect();
        //
        knn_neighbours
    } else {
        let knn_neighbours: Vec<Neighbour> = neighbours[0..last]
            .iter()
            .map(|p| {
                Neighbour::new(
                    p.as_ref().point_ref.origin_id,
                    p.as_ref().dist_to_ref,
                    p.as_ref().point_ref.p_id,
                )
            })
            .collect();
        knn_neighbours
    }
} // end of search_filter
/// Convenience entry point taking an optional filter: delegates to
/// [`Self::search_filter`]. Pass `None` to search without any filtering.
#[inline]
pub fn search_possible_filter(
    &self,
    data: &[T],
    knbn: usize,
    ef_arg: usize,
    filter: Option<&dyn FilterT>,
) -> Vec<Neighbour> {
    self.search_filter(data, knbn, ef_arg, filter)
}
/// search the first knbn nearest neigbours of a data and returns a Vector of Neighbour.
/// The parameter ef controls the width of the search in the lowest level, it must be greater
/// than number of neighbours asked.
/// A rule of thumb could be between knbn and max_nb_connection.
/// Equivalent to [`Self::search_filter`] with no filter.
pub fn search(&self, data: &[T], knbn: usize, ef_arg: usize) -> Vec<Neighbour> {
    self.search_possible_filter(data, knbn, ef_arg, None)
}
/// Runs a search for one `(request_id, data)` pair and returns the results
/// tagged with the same request id (used by `parallel_search` to restore order).
fn search_with_id(
    &self,
    request: (usize, &Vec<T>),
    knbn: usize,
    ef: usize,
) -> (usize, Vec<Neighbour>) {
    let (req_id, req_data) = request;
    (req_id, self.search(req_data, knbn, ef))
}
/// knbn is the number of nearest neigbours asked for. Returns for each data vector
/// a Vector of Neighbour
pub fn parallel_search(&self, datas: &[Vec<T>], knbn: usize, ef: usize) -> Vec<Vec<Neighbour>> {
let (sender, receiver) = channel();
// make up requests
let nb_request = datas.len();
let requests: Vec<(usize, &Vec<T>)> = (0..nb_request).zip(datas.iter()).collect();
//
requests.par_iter().for_each_with(sender, |s, item| {
s.send(self.search_with_id(*item, knbn, ef)).unwrap()
});
let req_res: Vec<(usize, Vec<Neighbour>)> = receiver.iter().collect();
// now sort to respect the key order of input
let mut answers = Vec::<Vec<Neighbour>>::with_capacity(datas.len());
// get a map from request id to rank
let mut req_hash = HashMap::<usize, usize>::new();
for (i, elt) in req_res.iter().enumerate() {
// the response of request req_res[i].0 is at rank i
req_hash.insert(elt.0, i);
}
for i in 0..datas.len() {
let answer_i = req_hash.get_key_value(&i).unwrap().1;
answers.push((req_res[*answer_i].1).clone());
}
answers
} // end of insert_parallel
} // end of Hnsw
// This function takes a binary heap with points declared with a negative distance
// and returns a vector of points with their correct positive distance to some reference distance
// The vector is sorted in increasing distance order.
// NOTE: `BinaryHeap::iter` visits elements in arbitrary order, so an explicit sort is
// required — the previous version returned the vector unsorted despite its contract.
#[allow(unused)]
fn from_negative_binaryheap_to_sorted_vector<'b, T: Send + Sync + Copy>(
    heap_points: &mut BinaryHeap<Arc<PointWithOrder<'b, T>>>,
) -> Vec<Arc<PointWithOrder<'b, T>>> {
    let nb_points = heap_points.len();
    let mut vec_points = Vec::<Arc<PointWithOrder<T>>>::with_capacity(nb_points);
    // flip each stored (negative) distance back to its positive value
    for p in heap_points.iter() {
        assert!(p.dist_to_ref <= 0.);
        let reverse_p = Arc::new(PointWithOrder::new(&p.point_ref, -p.dist_to_ref));
        vec_points.push(reverse_p);
    }
    // heap iteration order is arbitrary: sort to honour the sortedness contract
    vec_points.sort_unstable();
    trace!(
        "from_negative_binaryheap_to_sorted_vector nb points in out {:?} {:?} ",
        nb_points,
        vec_points.len()
    );
    vec_points
}
// This function takes a binary heap with points declared with a positive distance
// and returns a binary_heap of points with their correct negative distance to some reference distance
// (the sign flip turns the max-heap on distance into a "nearest first" heap).
fn from_positive_binaryheap_to_negative_binary_heap<'b, T: Send + Sync + Clone>(
    positive_heap: &mut BinaryHeap<Arc<PointWithOrder<'b, T>>>,
) -> BinaryHeap<Arc<PointWithOrder<'b, T>>> {
    let nb_points = positive_heap.len();
    // negate every stored distance and collect directly into a new heap
    let negative_heap: BinaryHeap<Arc<PointWithOrder<T>>> = positive_heap
        .iter()
        .map(|p| {
            assert!(p.dist_to_ref >= 0.);
            Arc::new(PointWithOrder::new(&p.point_ref, -p.dist_to_ref))
        })
        .collect();
    trace!(
        "from_positive_binaryheap_to_negative_binary_heap nb points in out {:?} {:?} ",
        nb_points,
        negative_heap.len()
    );
    negative_heap
}
// essentially to check dump/reload consistency
// in fact checks only equality of graph
#[allow(unused)]
pub(crate) fn check_graph_equality<T1, D1, T2, D2>(hnsw1: &Hnsw<T1, D1>, hnsw2: &Hnsw<T2, D2>)
where
    T1: Copy + Clone + Send + Sync,
    D1: Distance<T1> + Default + Send + Sync,
    T2: Copy + Clone + Send + Sync,
    D2: Distance<T2> + Default + Send + Sync,
{
    //
    debug!("In check_graph_equality");
    //
    assert_eq!(hnsw1.get_nb_point(), hnsw2.get_nb_point());
    // check for entry point: BOTH must be present, they are unwrapped just below.
    // (the previous test checked hnsw1 twice with `||`, so a missing entry point in
    // either structure could slip through the assert and panic at the unwrap instead)
    assert!(
        hnsw1.layer_indexed_points.entry_point.read().is_some()
            && hnsw2.layer_indexed_points.entry_point.read().is_some(),
        "one entry point is None"
    );
    let ep1_read = hnsw1.layer_indexed_points.entry_point.read();
    let ep2_read = hnsw2.layer_indexed_points.entry_point.read();
    let ep1 = ep1_read.as_ref().unwrap();
    let ep2 = ep2_read.as_ref().unwrap();
    assert_eq!(
        ep1.origin_id, ep2.origin_id,
        "different entry points {:?} {:?}",
        ep1.origin_id, ep2.origin_id
    );
    assert_eq!(ep1.p_id, ep2.p_id, "origin id {:?} ", ep1.origin_id);
    // check layers : same number of points, same ids, same neighbourhoods
    let layers_1 = hnsw1.layer_indexed_points.points_by_layer.read();
    let layers_2 = hnsw2.layer_indexed_points.points_by_layer.read();
    let mut nb_point_checked = 0;
    let mut nb_neighbours_checked = 0;
    for i in 0..NB_LAYER_MAX as usize {
        debug!("Checking layer {:?}", i);
        assert_eq!(layers_1[i].len(), layers_2[i].len());
        for j in 0..layers_1[i].len() {
            let p1 = &layers_1[i][j];
            let p2 = &layers_2[i][j];
            assert_eq!(p1.origin_id, p2.origin_id);
            assert_eq!(
                p1.p_id, p2.p_id,
                "Checking origin_id point {:?} ",
                p1.origin_id
            );
            nb_point_checked += 1;
            // check neighborhood
            let nbgh1 = p1.neighbours.read();
            let nbgh2 = p2.neighbours.read();
            assert_eq!(nbgh1.len(), nbgh2.len());
            for k in 0..nbgh1.len() {
                assert_eq!(nbgh1[k].len(), nbgh2[k].len());
                for l in 0..nbgh1[k].len() {
                    assert_eq!(
                        nbgh1[k][l].point_ref.origin_id,
                        nbgh2[k][l].point_ref.origin_id
                    );
                    assert_eq!(nbgh1[k][l].point_ref.p_id, nbgh2[k][l].point_ref.p_id);
                    // CAVEAT for precision with f32
                    assert_eq!(nbgh1[k][l].dist_to_ref, nbgh2[k][l].dist_to_ref);
                    nb_neighbours_checked += 1;
                }
            }
        } // end of for j
    } // end of for i
    assert_eq!(nb_point_checked, hnsw1.get_nb_point());
    debug!("nb neighbours checked {:?}", nb_neighbours_checked);
    debug!("exiting check_equality");
} // end of check_reload
#[cfg(test)]
mod tests {
    use super::*;
    use anndists::dist;
    // initialize env_logger once per test binary (failures on re-init are ignored)
    fn log_init_test() {
        let _ = env_logger::builder().is_test(true).try_init();
    }
    // checks that iterating over the whole PointIndexation visits every inserted point
    #[test]
    fn test_iter_point() {
        //
        println!("\n\n test_iter_point");
        //
        // generate nbcolumn random vectors of dimension nbrow in [0, 1)
        let mut rng = rand::rng();
        let unif = Uniform::<f32>::new(0., 1.).unwrap();
        let nbcolumn = 5000;
        let nbrow = 10;
        let mut xsi;
        let mut data = Vec::with_capacity(nbcolumn);
        for j in 0..nbcolumn {
            data.push(Vec::with_capacity(nbrow));
            for _ in 0..nbrow {
                xsi = rng.sample(unif);
                data[j].push(xsi);
            }
        }
        //
        // check insertion
        let ef_construct = 25;
        let nb_connection = 10;
        let start = ProcessTime::now();
        let hns = Hnsw::<f32, dist::DistL1>::new(
            nb_connection,
            nbcolumn,
            16,
            ef_construct,
            dist::DistL1 {},
        );
        for (i, d) in data.iter().enumerate() {
            hns.insert((d, i));
        }
        let cpu_time = start.elapsed();
        println!(" test_insert_iter_point time inserting {:?}", cpu_time);
        hns.dump_layer_info();
        // now check iteration: every inserted point must be visited exactly once
        let ptiter = hns.get_point_indexation().into_iter();
        let mut nb_dumped = 0;
        for _point in ptiter {
            // println!("point : {:?}", _point.p_id);
            nb_dumped += 1;
        }
        //
        assert_eq!(nb_dumped, nbcolumn);
    } // end of test_iter_point
    // checks that the per-layer iterator agrees with the per-layer point count
    #[test]
    fn test_iter_layerpoint() {
        //
        println!("\n\n test_iter_point");
        //
        // generate nbcolumn random vectors of dimension nbrow in [0, 1)
        let mut rng = rand::rng();
        let unif = Uniform::<f32>::new(0., 1.).unwrap();
        let nbcolumn = 5000;
        let nbrow = 10;
        let mut xsi;
        let mut data = Vec::with_capacity(nbcolumn);
        for j in 0..nbcolumn {
            data.push(Vec::with_capacity(nbrow));
            for _ in 0..nbrow {
                xsi = rng.sample(unif);
                data[j].push(xsi);
            }
        }
        //
        // check insertion
        let ef_construct = 25;
        let nb_connection = 10;
        let start = ProcessTime::now();
        let hns = Hnsw::<f32, dist::DistL1>::new(
            nb_connection,
            nbcolumn,
            16,
            ef_construct,
            dist::DistL1 {},
        );
        for (i, d) in data.iter().enumerate() {
            hns.insert((d, i));
        }
        let cpu_time = start.elapsed();
        println!(" test_insert_iter_point time inserting {:?}", cpu_time);
        hns.dump_layer_info();
        // now check iteration over layer 0 only
        let layer_num = 0;
        let nbpl = hns.get_point_indexation().get_layer_nb_point(layer_num);
        let layer_iter = hns.get_point_indexation().get_layer_iterator(layer_num);
        //
        let mut nb_dumped = 0;
        for _point in layer_iter {
            // println!("point : {:?}", _point.p_id);
            nb_dumped += 1;
        }
        println!(
            "test_iter_layerpoint : nb point in layer {} , nb found {}",
            nbpl, nb_dumped
        );
        //
        assert_eq!(nb_dumped, nbpl);
    } // end of test_iter_layerpoint
    // we should find point even if it is in layer >= 1
    // (repeated many times because the point's layer is drawn at random)
    #[test]
    fn test_sparse_search() {
        log_init_test();
        //
        for _ in 0..800 {
            let hnsw: Hnsw<f32, dist::DistL1> =
                Hnsw::new(15, 100_000, 20, 500_000, dist::DistL1 {});
            hnsw.insert((&[1.0, 0.0, 0.0, 0.0], 0));
            let result = hnsw.search(&[1.0, 0.0, 0.0, 0.0], 2, 10);
            assert_eq!(result, vec![Neighbour::new(0, 0.0, PointId(0, 0))]);
        }
    }
} // end of module test