Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'
This commit is contained in:
9
vendor/ruvector/scripts/patches/hnsw_rs/.gitignore
vendored
Normal file
9
vendor/ruvector/scripts/patches/hnsw_rs/.gitignore
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
target/**
|
||||
Runs
|
||||
Cargo.lock
|
||||
rls*
|
||||
dumpreloadtest*
|
||||
*.pdf
|
||||
*.html
|
||||
.idea/
|
||||
.vscode/
|
||||
111
vendor/ruvector/scripts/patches/hnsw_rs/Cargo.toml
vendored
Normal file
111
vendor/ruvector/scripts/patches/hnsw_rs/Cargo.toml
vendored
Normal file
@@ -0,0 +1,111 @@
|
||||
[package]
|
||||
name = "hnsw_rs"
|
||||
version = "0.3.3"
|
||||
authors = ["jeanpierre.both@gmail.com"]
|
||||
description = "Ann based on Hierarchical Navigable Small World Graphs from Yu.A. Malkov and D.A Yashunin"
|
||||
license = "MIT/Apache-2.0"
|
||||
readme = "README.md"
|
||||
keywords = ["algorithms", "ann", "hnsw"]
|
||||
repository = "https://github.com/jean-pierreBoth/hnswlib-rs"
|
||||
documentation = "https://docs.rs/hnsw_rs"
|
||||
edition = "2024"
|
||||
|
||||
|
||||
# declare a feature with no dependancy to get some modulated debug print
|
||||
# to be run with cargo build --features verbose_1
|
||||
#verbose_1 = [ ]
|
||||
|
||||
[profile.release]
|
||||
lto = true
|
||||
opt-level = 3
|
||||
|
||||
[lib]
|
||||
# cargo rustc --lib -- --crate-type cdylib [or staticlib] or rlib (default)
|
||||
# if we want to avoid specifying in advance crate-type
|
||||
path = "src/lib.rs"
|
||||
#crate-type = ["cdylib"]
|
||||
|
||||
|
||||
[[example]]
|
||||
name = "random"
|
||||
path = "examples/random.rs"
|
||||
|
||||
|
||||
[[example]]
|
||||
name = "ann-glove"
|
||||
path = "examples/ann-glove25-angular.rs"
|
||||
|
||||
|
||||
[[example]]
|
||||
name = "ann-mnist"
|
||||
path = "examples/ann-mnist-784-euclidean.rs"
|
||||
|
||||
[[example]]
|
||||
name = "ann-sift1m"
|
||||
path = "examples/ann-sift1m-128-euclidean.rs"
|
||||
|
||||
[[example]]
|
||||
name = "levenshtein"
|
||||
path = "examples/levensthein.rs"
|
||||
|
||||
|
||||
[dependencies]
|
||||
# default is version spec is ^ meaning can update up to max non null version number
|
||||
# cargo doc --no-deps avoid dependencies doc generation
|
||||
#
|
||||
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
bincode = { version = "1.3" }
|
||||
|
||||
cfg-if = { version = "1.0" }
|
||||
|
||||
# for //
|
||||
parking_lot = "0.12"
|
||||
rayon = { version = "1.11" }
|
||||
num_cpus = { version = "1.16" }
|
||||
|
||||
cpu-time = { version = "1.0" }
|
||||
num-traits = { version = "0.2" }
|
||||
|
||||
|
||||
# for hashing . hashbrown still needed beccause of get_key_value(&key)
|
||||
hashbrown = { version = "0.15" }
|
||||
indexmap = { version = ">= 2.11, < 2.13" }
|
||||
|
||||
rand = { version = "0.8" }
|
||||
lazy_static = { version = "1.4" }
|
||||
|
||||
#
|
||||
mmap-rs = { version = "0.6" }
|
||||
#
|
||||
# decreasing order of log for debug build : (max_level_)trace debug info warn error off
|
||||
# decreasing order of log for release build (release_max_level_) .. idem
|
||||
#log = { version = "0.4", features = ["max_level_debug", "release_max_level_info"] }
|
||||
log = { version = "0.4" }
|
||||
env_logger = { version = "0.11" }
|
||||
|
||||
anyhow = { version = "1.0" }
|
||||
|
||||
# anndists = { path = "../anndists" }
|
||||
anndists = { version = "0.1" }
|
||||
# anndists = { git = "https://github.com/jean-pierreBoth/anndists" }
|
||||
|
||||
# for benchmark reading, so the lbrary do not depend on hdf5 nor ndarray
|
||||
[dev-dependencies]
|
||||
# hdf5 = { version = "0.8" }
|
||||
# metno is needed as hdf5 is blocked to hdfsys 1.12
|
||||
hdf5 = {package = "hdf5-metno", version = "0.10.0" }
|
||||
|
||||
ndarray = { version = ">=0.16.0, <0.18" }
|
||||
skiplist = { version = "0.6" }
|
||||
tempfile = { version = "3" }
|
||||
itertools = {version = "0.14"}
|
||||
|
||||
[features]
|
||||
|
||||
default = []
|
||||
|
||||
# feature for std simd on nightly
|
||||
stdsimd = ["anndists/stdsimd"]
|
||||
# feature for simd on stable for x86*
|
||||
simdeez_f = ["anndists/simdeez_f"]
|
||||
56
vendor/ruvector/scripts/patches/hnsw_rs/Changes.md
vendored
Normal file
56
vendor/ruvector/scripts/patches/hnsw_rs/Changes.md
vendored
Normal file
@@ -0,0 +1,56 @@
|
||||
- version 0.3.3
|
||||
small fix on filter (thanks to VillSnow). include ndarray 0.17 as possible dep. fixed compiler warning on elided lifetimes
|
||||
|
||||
- version 0.3.2
|
||||
update dependencies to ndarray 0.16 , rand 0.9 indexmap 2.9, hdf5. edition=2024
|
||||
|
||||
- version 0.3.1
|
||||
|
||||
Possibility to reduce the number of levels used Hnsw structure with the function hnsw::modify_level_scale.
|
||||
This often increases significantly recall while incurring a moderate cpu cost. It is also possible
|
||||
to have same recall with smaller *max_nb_conn* parameters so reducing memory usage.
|
||||
See README.md at [bigann](https://github.com/jean-pierreBoth/bigann).
|
||||
Modification inspired by the article by [Munyampirwa](https://arxiv.org/abs/2412.01940)
|
||||
|
||||
Clippy cleaning and minor arguments change (PathBuf to Path String to &str) in dump/reload
|
||||
with the help of bwsw (https://github.com/bwsw)
|
||||
|
||||
|
||||
- **version 0.3.0**:
|
||||
|
||||
The distances implementation is now in a separate crate [anndsits](https://crates.io/crates/anndists). Using hnsw_rs::prelude:::* should make the change transparent.
|
||||
|
||||
The mmap implementation makes it possible to use the [coreset](https://github.com/jean-pierreBoth/coreset) crate to compute coreset and clusters of data stored in hnsw dumps.
|
||||
|
||||
- version 0.2.1:
|
||||
|
||||
when using mmap, the points less frequently used (points in lower layers) are preferentially mmap-ed while upper layers are preferentially
|
||||
explcitly read from file.
|
||||
|
||||
Hnswio is now Sync.
|
||||
|
||||
feature stdsimd, based on std::simd, runs with nightly on Hamming with u32,u64 and DisL1,DistL2, DistDot with f32
|
||||
|
||||
- The **version 0.2** introduces
|
||||
1. possibility to use mmap on the data file storing the vectors represented in the hnsw structure. This is mostly usefule for
|
||||
large vectors, where data needs more space than the graph part.
|
||||
As a consequence the format of this file changed. Old format can be read but new dumps will be in the new format.
|
||||
In case of mmap usage, a dump after inserting new elements must ensure that the old file is not overwritten, so a unique file name is
|
||||
generated if necessary. See documentation of module Hnswio
|
||||
|
||||
1. the filtering trait
|
||||
|
||||
|
||||
- Upgrade of many dependencies. Change from simple_logger to env_logger. The logger is initialized one for all in file src/lib.rs and cannot be intialized twice. The level of log can be modulated by the RUST_LOG env variable on a module basis or switched off. See the *env_logger* crate doc.
|
||||
|
||||
- A rust crate *edlib_rs* provides an interface to the *excellent* edlib C++ library [(Cf edlib)](https://github.com/Martinsos/edlib) can be found at [edlib_rs](https://github.com/jean-pierreBoth/edlib-rs) or on crate.io. It can be used to define a user adhoc distance on &[u8] with normal, prefix or infix mode (which is useful in genomics alignment).
|
||||
|
||||
- The library do not depend anymore on hdf5 and ndarray. They are dev-dependancies needed for examples, this simplify compatibility issues.
|
||||
- Added insertion methods for slices for easier use with the ndarray crate.
|
||||
|
||||
- simd/avx2 requires now the feature "simdeez_f". So by default the crate can compile on M1 chip and transitions to std::simd.
|
||||
|
||||
- Added DistPtr and possiblity to dump/reload with this distance type. (See *load_hnsw_with_dist* function)
|
||||
|
||||
- Implementation of Hamming for f64 exclusively in the context SuperMinHash in crate [probminhash](https://crates.io/crates/probminhash)
|
||||
|
||||
13
vendor/ruvector/scripts/patches/hnsw_rs/LICENSE-APACHE
vendored
Normal file
13
vendor/ruvector/scripts/patches/hnsw_rs/LICENSE-APACHE
vendored
Normal file
@@ -0,0 +1,13 @@
|
||||
Copyright 2020 jean-pierre.both
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
25
vendor/ruvector/scripts/patches/hnsw_rs/LICENSE-MIT
vendored
Normal file
25
vendor/ruvector/scripts/patches/hnsw_rs/LICENSE-MIT
vendored
Normal file
@@ -0,0 +1,25 @@
|
||||
Copyright (c) 2020 jean-pierre.both
|
||||
|
||||
Permission is hereby granted, free of charge, to any
|
||||
person obtaining a copy of this software and associated
|
||||
documentation files (the "Software"), to deal in the
|
||||
Software without restriction, including without
|
||||
limitation the rights to use, copy, modify, merge,
|
||||
publish, distribute, sublicense, and/or sell copies of
|
||||
the Software, and to permit persons to whom the Software
|
||||
is furnished to do so, subject to the following
|
||||
conditions:
|
||||
|
||||
The above copyright notice and this permission notice
|
||||
shall be included in all copies or substantial portions
|
||||
of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
|
||||
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
|
||||
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
|
||||
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
|
||||
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
||||
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
|
||||
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
DEALINGS IN THE SOFTWARE.
|
||||
168
vendor/ruvector/scripts/patches/hnsw_rs/README.md
vendored
Normal file
168
vendor/ruvector/scripts/patches/hnsw_rs/README.md
vendored
Normal file
@@ -0,0 +1,168 @@
|
||||
# hnsw-rs
|
||||
|
||||
This crate provides a Rust implementation of the paper by Yu.A. Malkov and D.A Yashunin:
|
||||
|
||||
"Efficient and Robust approximate nearest neighbours using Hierarchical Navigable Small World Graphs" (2016,2018)
|
||||
[arxiv](https://arxiv.org/abs/1603.09320)
|
||||
|
||||
|
||||
|
||||
## Functionalities
|
||||
|
||||
The crate is built on top of the [anndists](https://crates.io/crates/anndists) and can use the following distances:
|
||||
|
||||
* usual distances as L1, L2, Cosine, Jaccard, Hamming for vectors of standard numeric types, Levenshtein distance on u16.
|
||||
|
||||
* Hellinger distance and Jeffreys divergence between probability distributions (f32 and f64). It must be noted that the Jeffreys divergence
|
||||
(a symetrized Kullback-Leibler divergence) do not satisfy the triangle inequality. (Neither Cosine distance !).
|
||||
|
||||
* Jensen-Shannon distance between probability distributions (f32 and f64). It is defined as the **square root** of the Jensen-Shannon divergence and is a bounded metric. See [Nielsen F. in Entropy 2019, 21(5), 485](https://doi.org/10.3390/e21050485).
|
||||
|
||||
* A Trait to enable the user to implement its own distances.
|
||||
It takes as data slices of types T satisfying T:Serialize+Clone+Send+Sync. It is also possible to use C extern functions or closures.
|
||||
|
||||
* An interface towards C and more specifically to the [Julia](https://julialang.org/) language.
|
||||
See the companion Julia package [HnswAnn.jl](https://gitlab.com/jpboth/HnswAnn.jl) and the building paragraph for some help for Julia users.
|
||||
|
||||
The hnsw implementation provides:
|
||||
|
||||
* Multithreaded insertion and search requests.
|
||||
|
||||
* Dump and reload functions (*See module hnswio*) to store the data and the graph once it is built. These facilities rely partly on Serde so T needs to implement Serialize and Deserialized as derived by Serde.
|
||||
It is also possible to reload only the graph and not the data themselves. A specific type (struct NoData, associated to the NoDist distance is dedicated to this functionality.
|
||||
|
||||
* A flattening conversion of the Hnsw structure to keep only neighborhood relationships between points (without their internal data) internal to the Hnsw structure (*see module flatten.rs, FlatPoint and FlatNeighborhood*). It is thus possible to keep some topology information with low memory usage.
|
||||
|
||||
* Filtering: It is possible to add filters so only results which satisfies the filter is in the result set. The filtering is done during the search, so it is not a post filter. There is currently two ways of using the filter, one can add allowed ids in a sorted vector and send as a parameter, or one can define a function which will be called before an id is added to the result set.
|
||||
Examples on both these strategies are in the examples or tests directory. One can also implement the trait Filterable for new types, if one would like the filter to be kept in a bitvector, for example.
|
||||
|
||||
* Possibilty to use mmap on dumped data (not on graph part) which is useful for large data vectors. This enables coreset and clusters computation in streaming, see [coreset](https://github.com/jean-pierreBoth/coreset) and soon on [crates.io](https://crates.io/crates).
|
||||
|
||||
## Implementation
|
||||
|
||||
The graph construction and searches are multithreaded with the **parking_lot** crate (See **parallel_insert_data** and **parallel_search_neighbours** functions and also examples files).
|
||||
Distances are provided by the crate [anndists](https://github.com/jean-pierreBoth/anndists), see *Building*.
|
||||
|
||||
## Building
|
||||
|
||||
### Simd
|
||||
|
||||
Two features activate simd in the crate **anndists** :
|
||||
|
||||
* The feature "simdeez_f" provide simd for x86_64 processors.
|
||||
Compile with **cargo build --release --features "simdeez_f"** or change the default features in Cargo.toml.
|
||||
To compile this crate on a M1 chip just do not activate this feature.
|
||||
|
||||
* The feature "stdsimd" provides portable simd through std::simd but **requires rust nightly**.
|
||||
Setting this feature in features default (or by cargo command) activates the portable_simd feature on rust nightly.
|
||||
Not all couples (Distance, type) are provided yet. (See the crate anndists)
|
||||
|
||||
### Julia interface
|
||||
|
||||
By default the crate is a standalone project and builds a static libray and executable.
|
||||
To be used with the companion Julia package it is necessary to build a dynamic library.
|
||||
This can be done by just uncommenting (i.e get rid of the #) in file Cargo.toml the line:
|
||||
|
||||
*#crate-type = ["cdylib"]*
|
||||
|
||||
and rerun the command: cargo build --release.
|
||||
|
||||
This will generate a .so file in the target/release directory.
|
||||
|
||||
## Algorithm and Input Parameters
|
||||
|
||||
The algorithm stores points in layers (at most 16), and a graph is constructed to enable a search from less densely populated levels to most densely populated levels by constructing links from less dense layers to the most dense layer (level 0).
|
||||
|
||||
Roughly the algorithm goes along runs as follows:
|
||||
|
||||
Upon insertion, the level ***l*** of a new point is sampled with an exponential law, limiting the number of levels to 16,
|
||||
so that level 0 is the most densely populated layer, upper layers being exponentially less populated as level increases.
|
||||
The nearest neighbour of the point is searched in lookup tables from the upper level to the level just above its layer (***l***), so we should arrive near the new point at its level at a relatively low cost. Then the ***max_nb_connection*** nearest neighbours are searched in neighbours of neighbours table (with a reverse updating of tables) recursively from its layer ***l*** down to the most populated level 0.
|
||||
|
||||
The parameter of the exponential law to sample point levels is set to `ln(max_nb_connection)/scale`.
|
||||
By default *scale* is set to 1. It is possible to reduce the *scale* parameter and thus reduce the number of levels used (See Hnsw::modify_level_scale) without increasing max_nb_connection.
|
||||
This often provide better recalls without increasing *max_nb_connection* and thus spare memory usage. (See examples)
|
||||
|
||||
|
||||
The main parameters occuring in constructing the graph or in searching are:
|
||||
|
||||
* max_nb_connection (in hnsw initialization)
|
||||
The maximum number of links from one point to others. Values ranging from 16 to 64 are standard initialising values, the higher the more time consuming.
|
||||
|
||||
* ef_construction (in hnsw initialization)
|
||||
This parameter controls the width of the search for neighbours during insertion. Values from 200 to 800 are standard initialising values, the higher the more time consuming.
|
||||
|
||||
* max_layer (in hnsw initialization)
|
||||
The maximum number of layers in graph. Must be less or equal than 16.
|
||||
|
||||
* ef_arg (in search methods)
|
||||
This parameter controls the width of the search in the lowest level, it must be greater than number of neighbours asked but can be less than ***ef_construction***.
|
||||
As a rule of thumb could be between the number of neighbours we will ask for (knbn arg in search method) and max_nb_connection.
|
||||
|
||||
* keep_pruned and extend_candidates.
|
||||
These parameters are described in the paper by Malkov and Yashunin can be used to
|
||||
modify the search strategy. The interested user should check the paper to see the impact. By default
|
||||
the values are as recommended in the paper.
|
||||
|
||||
## Benchmarks and Examples [(examples)](./examples)
|
||||
|
||||
Some examples are taken from the [ann-benchmarks site](https://github.com/erikbern/ann-benchmarks)
|
||||
and recall rates and request/s are given in comments in the examples files for some input parameters.
|
||||
The annhdf5 module implements reading the standardized data files
|
||||
of the [ann-benchmarks site](https://github.com/erikbern/ann-benchmarks),
|
||||
just download the necessary benchmark data files and modify path in sources accordingly.
|
||||
Then run: cargo build --release --features="simdeez_f" --examples .
|
||||
It is possible in these examples to change from parallel searches to serial searches to check for speeds
|
||||
or modify parameters to see the impact on performance.
|
||||
|
||||
With a i9-13900HX 24 cores laptop we get the following results:
|
||||
1. fashion-mnist-784-euclidean : search requests run at 62000 req/s with a recall rate of 0.977
|
||||
2. ann-glove-25-angular : search for the first 100 neighbours run with recall 0.979 at 12000 req/s
|
||||
3. sift1m benchmark: (1 million points in 128 dimension) search requests for the 10 first neighbours runs at 15000 req/s with a recall rate of 0.9907 or at 8300 req/s with a recall rate of 0.9959, depending on the parameters.
|
||||
|
||||
Moreover a tiny crate [bigann](https://github.com/jean-pierreBoth/bigann)
|
||||
gives results on the first 10 Million points of the [BIGANN](https://big-ann-benchmarks.com/neurips21.html) benchmark. The benchmark is also described at [IRISA](http://corpus-texmex.irisa.fr/). This crate can used to play with parameters on this data. Results give a recall between 0.92 and 0.99 depending on number of requests and parameters.
|
||||
|
||||
Some lines extracted from this Mnist benchmark show how it works for f32 and L2 norm
|
||||
|
||||
```rust
|
||||
// reading data
|
||||
let anndata = AnnBenchmarkData::new(fname).unwrap();
|
||||
let nb_elem = anndata.train_data.len();
|
||||
let max_nb_connection = 24;
|
||||
let nb_layer = 16.min((nb_elem as f32).ln().trunc() as usize);
|
||||
let ef_c = 400;
|
||||
// allocating network
|
||||
let mut hnsw = Hnsw::<f32, DistL2>::new(max_nb_connection, nb_elem, nb_layer, ef_c, DistL2{});
|
||||
hnsw.set_extend_candidates(false);
|
||||
// parallel insertion of train data
|
||||
let data_for_par_insertion = anndata.train_data.iter().map( |x| (&x.0, x.1)).collect();
|
||||
hnsw.parallel_insert(&data_for_par_insertion);
|
||||
//
|
||||
hnsw.dump_layer_info();
|
||||
// Now the bench with 10 neighbours
|
||||
let mut knn_neighbours_for_tests = Vec::<Vec<Neighbour>>::with_capacity(nb_elem);
|
||||
hnsw.set_searching_mode(true);
|
||||
let knbn = 10;
|
||||
let ef_c = max_nb_connection;
|
||||
// search 10 nearest neighbours for test data
|
||||
knn_neighbours_for_tests = hnsw.parallel_search(&anndata.test_data, knbn, ef_c);
|
||||
....
|
||||
```
|
||||
|
||||
## Contributions
|
||||
|
||||
[Sannsyn](https://sannsyn.com/en/) contributed to Drop implementation and FilterT trait.
|
||||
Petter Egesund added the DistLevenshtein distance.
|
||||
|
||||
## Evolutions are described [here](./Changes.md)
|
||||
|
||||
## License
|
||||
|
||||
Licensed under either of
|
||||
|
||||
* Apache License, Version 2.0, [LICENSE-APACHE](LICENSE-APACHE) or <http://www.apache.org/licenses/LICENSE-2.0>
|
||||
* MIT license [LICENSE-MIT](LICENSE-MIT) or <http://opensource.org/licenses/MIT>
|
||||
|
||||
at your option.
|
||||
|
||||
220
vendor/ruvector/scripts/patches/hnsw_rs/examples/ann-glove25-angular.rs
vendored
Normal file
220
vendor/ruvector/scripts/patches/hnsw_rs/examples/ann-glove25-angular.rs
vendored
Normal file
@@ -0,0 +1,220 @@
|
||||
#![allow(clippy::needless_range_loop)]
|
||||
|
||||
use cpu_time::ProcessTime;
|
||||
use std::time::{Duration, SystemTime};
|
||||
|
||||
// glove 25 // 2.7 Ghz 4 cores 8Mb L3 k = 10
|
||||
// ============================================
|
||||
//
|
||||
// max_nb_conn ef_cons ef_search scale_factor extend keep pruned recall req/s last ratio
|
||||
// 24 800 64 1. 1 0 0.928 4090 1.003
|
||||
// 24 800 64 1. 1 1 0.927 4594 1.003
|
||||
// 24 400, 48 1. 1 0 0.919 6349 1.0044
|
||||
// 24 800 48 1 1 1 0.918 5785 1.005
|
||||
// 24 400 32 1. 0 0 0.898 8662
|
||||
// 24 400 64 1. 1 0 0.930 4711 1.0027
|
||||
// 24 400 64 1. 1 1 0.921 4550 1.0039
|
||||
// 24 1600 48 1 1 0 0.924 5380 1.0034
|
||||
|
||||
// 32 400 48 1 1 0 0.93 4706 1.0026
|
||||
// 32 800 64 1 1 0 0.94 3780. 1.0015
|
||||
// 32 1600 48 1 1 0 0.934 4455 1.0023
|
||||
// 48 1600 48 1 1 0 0.945 3253 1.00098
|
||||
|
||||
// 24 400 48 1 1 0 0.92 6036. 1.0038
|
||||
// 48 800 48 1 1 0 0.935 4018 1.002
|
||||
// 48 800 64 1 1 0 0.942 3091 1.0014
|
||||
// 48 800 64 1 1 1 0.9435 2640 1.00126
|
||||
|
||||
// k = 100
|
||||
|
||||
// 24 800 48 1 1 0 0.96 2432 1.004
|
||||
// 48 800 128 1 1 0 0.979 1626 1.001
|
||||
|
||||
// glove 25 // 8 cores i7 2.3 Ghz 8Mb L3 knbn = 100
|
||||
// ==================================================
|
||||
|
||||
// 48 800 48 1 1 0 0.935 13400 1.002
|
||||
// 48 800 128 1 1 0 0.979 5227 1.002
|
||||
|
||||
// 24 core Core(TM) i9-13900HX simdeez knbn = 10
|
||||
// ==================================================
|
||||
// 48 800 48 1 1 0 0.936 30748 1.002
|
||||
|
||||
// 24 core Core(TM) i9-13900HX simdeez knbn = 100
|
||||
// ==================================================
|
||||
// 48 800 128 1 1 0 0.979 12000 1.002
|
||||
|
||||
// results with scale modification 0.5
|
||||
//====================================
|
||||
|
||||
// 24 core Core(TM) i9-13900HX simdeez knbn = 10
|
||||
// ==================================================
|
||||
// 24 800 48 0.5 1 0 0.931 40700 1.002
|
||||
// 48 800 48 0.5 1 0 0.941 30001 1.001
|
||||
|
||||
// 24 core Core(TM) i9-13900HX simdeez knbn = 100
|
||||
// ==================================================
|
||||
// 24 800 128 0.5 1 0 0.974 16521 1.002
|
||||
// 48 800 128 0.5 1 0 0.985 11484 1.001
|
||||
|
||||
use anndists::dist::*;
|
||||
use hnsw_rs::prelude::*;
|
||||
use log::info;
|
||||
|
||||
mod utils;
|
||||
|
||||
use utils::*;
|
||||
|
||||
pub fn main() {
|
||||
let _ = env_logger::builder().is_test(true).try_init().unwrap();
|
||||
let parallel = true;
|
||||
//
|
||||
let fname = String::from("/home/jpboth/Data/ANN/glove-25-angular.hdf5");
|
||||
println!("\n\n test_load_hdf5 {:?}", fname);
|
||||
// now recall that data are stored in row order.
|
||||
let mut anndata = annhdf5::AnnBenchmarkData::new(fname).unwrap();
|
||||
// pre normalisation to use Dot computations instead of Cosine
|
||||
anndata.do_l2_normalization();
|
||||
// run bench
|
||||
let nb_elem = anndata.train_data.len();
|
||||
let knbn_max = anndata.test_distances.dim().1;
|
||||
info!(
|
||||
"Train size : {}, test size : {}",
|
||||
nb_elem,
|
||||
anndata.test_data.len()
|
||||
);
|
||||
info!("Nb neighbours answers for test data : {} \n\n", knbn_max);
|
||||
//
|
||||
let max_nb_connection = 24;
|
||||
let ef_c = 800;
|
||||
println!(
|
||||
" max_nb_conn : {:?}, ef_construction : {:?} ",
|
||||
max_nb_connection, ef_c
|
||||
);
|
||||
let nb_layer = 16.min((nb_elem as f32).ln().trunc() as usize);
|
||||
println!(
|
||||
" number of elements to insert {:?} , setting max nb layer to {:?} ef_construction {:?}",
|
||||
nb_elem, nb_layer, ef_c
|
||||
);
|
||||
let nb_search = anndata.test_data.len();
|
||||
println!(" number of search {:?}", nb_search);
|
||||
// Hnsw allocation
|
||||
let mut hnsw =
|
||||
Hnsw::<f32, DistDot>::new(max_nb_connection, nb_elem, nb_layer, ef_c, DistDot {});
|
||||
//
|
||||
hnsw.set_extend_candidates(true);
|
||||
hnsw.modify_level_scale(0.5);
|
||||
//
|
||||
// parallel insertion
|
||||
let start = ProcessTime::now();
|
||||
let now = SystemTime::now();
|
||||
let data_for_par_insertion = anndata
|
||||
.train_data
|
||||
.iter()
|
||||
.map(|x| (x.0.as_slice(), x.1))
|
||||
.collect();
|
||||
if parallel {
|
||||
println!(" \n parallel insertion");
|
||||
hnsw.parallel_insert_slice(&data_for_par_insertion);
|
||||
} else {
|
||||
println!(" \n serial insertion");
|
||||
for d in data_for_par_insertion {
|
||||
hnsw.insert_slice(d);
|
||||
}
|
||||
}
|
||||
let cpu_time: Duration = start.elapsed();
|
||||
//
|
||||
println!(
|
||||
"\n hnsw data insertion cpu time {:?} system time {:?} ",
|
||||
cpu_time,
|
||||
now.elapsed()
|
||||
);
|
||||
hnsw.dump_layer_info();
|
||||
println!(" hnsw data nb point inserted {:?}", hnsw.get_nb_point());
|
||||
//
|
||||
// Now the bench with 10 neighbours
|
||||
//
|
||||
let knbn = 10;
|
||||
let ef_search = 48;
|
||||
search(&mut hnsw, knbn, ef_search, &anndata);
|
||||
|
||||
let knbn = 100;
|
||||
let ef_search = 128;
|
||||
search(&mut hnsw, knbn, ef_search, &anndata);
|
||||
}
|
||||
|
||||
pub fn search<Dist>(
|
||||
hnsw: &mut Hnsw<f32, Dist>,
|
||||
knbn: usize,
|
||||
ef_search: usize,
|
||||
anndata: &annhdf5::AnnBenchmarkData,
|
||||
) where
|
||||
Dist: Distance<f32> + Send + Sync,
|
||||
{
|
||||
println!("\n\n ef_search : {:?} knbn : {:?} ", ef_search, knbn);
|
||||
let parallel = true;
|
||||
//
|
||||
let nb_elem = anndata.train_data.len();
|
||||
let nb_search = anndata.test_data.len();
|
||||
//
|
||||
let mut recalls = Vec::<usize>::with_capacity(nb_elem);
|
||||
let mut nb_returned = Vec::<usize>::with_capacity(nb_elem);
|
||||
let mut last_distances_ratio = Vec::<f32>::with_capacity(nb_elem);
|
||||
let mut knn_neighbours_for_tests = Vec::<Vec<Neighbour>>::with_capacity(nb_elem);
|
||||
hnsw.set_searching_mode(true);
|
||||
println!("searching with ef : {:?}", ef_search);
|
||||
let start = ProcessTime::now();
|
||||
let now = SystemTime::now();
|
||||
// search
|
||||
if parallel {
|
||||
println!(" \n parallel search");
|
||||
knn_neighbours_for_tests = hnsw.parallel_search(&anndata.test_data, knbn, ef_search);
|
||||
} else {
|
||||
println!(" \n serial search");
|
||||
for i in 0..anndata.test_data.len() {
|
||||
let knn_neighbours: Vec<Neighbour> =
|
||||
hnsw.search(&anndata.test_data[i], knbn, ef_search);
|
||||
knn_neighbours_for_tests.push(knn_neighbours);
|
||||
}
|
||||
}
|
||||
let cpu_time = start.elapsed();
|
||||
let search_cpu_time = cpu_time.as_micros() as f32;
|
||||
let search_sys_time = now.elapsed().unwrap().as_micros() as f32;
|
||||
println!(
|
||||
"total cpu time for search requests {:?} , system time {:?} ",
|
||||
search_cpu_time,
|
||||
now.elapsed()
|
||||
);
|
||||
// now compute recall rate
|
||||
for i in 0..anndata.test_data.len() {
|
||||
let max_dist = anndata.test_distances.row(i)[knbn - 1];
|
||||
let knn_neighbours_d: Vec<f32> = knn_neighbours_for_tests[i]
|
||||
.iter()
|
||||
.map(|p| p.distance)
|
||||
.collect();
|
||||
nb_returned.push(knn_neighbours_d.len());
|
||||
let recall = knn_neighbours_d.iter().filter(|d| *d <= &max_dist).count();
|
||||
recalls.push(recall);
|
||||
let mut ratio = 0.;
|
||||
if !knn_neighbours_d.is_empty() {
|
||||
ratio = knn_neighbours_d[knn_neighbours_d.len() - 1] / max_dist;
|
||||
}
|
||||
last_distances_ratio.push(ratio);
|
||||
}
|
||||
let mean_recall = (recalls.iter().sum::<usize>() as f32) / ((knbn * recalls.len()) as f32);
|
||||
println!(
|
||||
"\n mean fraction nb returned by search {:?} ",
|
||||
(nb_returned.iter().sum::<usize>() as f32) / ((nb_returned.len() * knbn) as f32)
|
||||
);
|
||||
println!(
|
||||
"\n last distances ratio {:?} ",
|
||||
last_distances_ratio.iter().sum::<f32>() / last_distances_ratio.len() as f32
|
||||
);
|
||||
println!(
|
||||
"\n recall rate for {:?} is {:?} , nb req /s {:?}",
|
||||
anndata.fname,
|
||||
mean_recall,
|
||||
(nb_search as f32) * 1.0e+6_f32 / search_sys_time
|
||||
);
|
||||
}
|
||||
162
vendor/ruvector/scripts/patches/hnsw_rs/examples/ann-mnist-784-euclidean.rs
vendored
Normal file
162
vendor/ruvector/scripts/patches/hnsw_rs/examples/ann-mnist-784-euclidean.rs
vendored
Normal file
@@ -0,0 +1,162 @@
|
||||
#![allow(clippy::needless_range_loop)]
|
||||
|
||||
use cpu_time::ProcessTime;
|
||||
use std::time::{Duration, SystemTime};
|
||||
|
||||
// search in serial mode i7-core @2.7Ghz for 10 fist neighbours
|
||||
// max_nb_conn ef_cons ef_search scale_factor extend keep pruned recall req/s last ratio
|
||||
//
|
||||
// 12 400 12 1 0 0 0.917 6486 1.005
|
||||
// 24 400 24 1 1 0 0.9779 3456 1.001
|
||||
|
||||
// parallel mode 4 i7-core @2.7Ghz
|
||||
// max_nb_conn ef_cons ef_search scale_factor extend keep pruned recall req/s last ratio
|
||||
// 24 400 24 1 0 0 0.977 12566 1.001
|
||||
// 24 400 12 1 0 0 0.947 18425 1.003
|
||||
|
||||
// 8 hyperthreaded i7-core @ 2.3 Ghz
|
||||
// 24 400 24 1 0 0 0.977 22197 1.001
|
||||
|
||||
// 24 core Core(TM) i9-13900HX simdeez
|
||||
// 24 400 24 1 0 0 0.977 62000 1.001
|
||||
|
||||
// 24 core Core(TM) i9-13900HX simdeez with modify_level_scale at 0.5
|
||||
// 24 400 24 0.5 0 0 0.990 58722 1.000
|
||||
|
||||
use anndists::dist::*;
|
||||
use hnsw_rs::prelude::*;
|
||||
use log::info;
|
||||
|
||||
mod utils;
|
||||
use utils::*;
|
||||
|
||||
pub fn main() {
|
||||
let mut parallel = true;
|
||||
//
|
||||
let fname = String::from("/home/jpboth/Data/ANN/fashion-mnist-784-euclidean.hdf5");
|
||||
println!("\n\n test_load_hdf5 {:?}", fname);
|
||||
// now recall that data are stored in row order.
|
||||
let anndata = annhdf5::AnnBenchmarkData::new(fname).unwrap();
|
||||
let knbn_max = anndata.test_distances.dim().1;
|
||||
let nb_elem = anndata.train_data.len();
|
||||
info!(
|
||||
"Train size : {}, test size : {}",
|
||||
nb_elem,
|
||||
anndata.test_data.len()
|
||||
);
|
||||
info!("Nb neighbours answers for test data : {}", knbn_max);
|
||||
//
|
||||
let max_nb_connection = 24;
|
||||
let nb_layer = 16.min((nb_elem as f32).ln().trunc() as usize);
|
||||
let ef_c = 400;
|
||||
println!(
|
||||
" number of elements to insert {:?} , setting max nb layer to {:?} ef_construction {:?}",
|
||||
nb_elem, nb_layer, ef_c
|
||||
);
|
||||
println!(
|
||||
" ====================================================================================="
|
||||
);
|
||||
let nb_search = anndata.test_data.len();
|
||||
println!(" number of search {:?}", nb_search);
|
||||
|
||||
let mut hnsw = Hnsw::<f32, DistL2>::new(max_nb_connection, nb_elem, nb_layer, ef_c, DistL2 {});
|
||||
hnsw.set_extend_candidates(false);
|
||||
//
|
||||
hnsw.modify_level_scale(0.25);
|
||||
// parallel insertion
|
||||
let mut start = ProcessTime::now();
|
||||
let mut now = SystemTime::now();
|
||||
let data_for_par_insertion = anndata
|
||||
.train_data
|
||||
.iter()
|
||||
.map(|x| (x.0.as_slice(), x.1))
|
||||
.collect();
|
||||
if parallel {
|
||||
println!(" \n parallel insertion");
|
||||
hnsw.parallel_insert_slice(&data_for_par_insertion);
|
||||
} else {
|
||||
println!(" \n serial insertion");
|
||||
for d in data_for_par_insertion {
|
||||
hnsw.insert_slice(d);
|
||||
}
|
||||
}
|
||||
let mut cpu_time: Duration = start.elapsed();
|
||||
//
|
||||
println!(
|
||||
"\n hnsw data insertion cpu time {:?} system time {:?} ",
|
||||
cpu_time,
|
||||
now.elapsed()
|
||||
);
|
||||
hnsw.dump_layer_info();
|
||||
println!(" hnsw data nb point inserted {:?}", hnsw.get_nb_point());
|
||||
//
|
||||
// Now the bench with 10 neighbours
|
||||
//
|
||||
let mut recalls = Vec::<usize>::with_capacity(nb_elem);
|
||||
let mut nb_returned = Vec::<usize>::with_capacity(nb_elem);
|
||||
let mut last_distances_ratio = Vec::<f32>::with_capacity(nb_elem);
|
||||
let mut knn_neighbours_for_tests = Vec::<Vec<Neighbour>>::with_capacity(nb_elem);
|
||||
hnsw.set_searching_mode(true);
|
||||
let knbn = 10;
|
||||
let ef_c = max_nb_connection;
|
||||
println!("\n searching with ef : {:?}", ef_c);
|
||||
start = ProcessTime::now();
|
||||
now = SystemTime::now();
|
||||
// search
|
||||
parallel = true;
|
||||
if parallel {
|
||||
println!(" \n parallel search");
|
||||
knn_neighbours_for_tests = hnsw.parallel_search(&anndata.test_data, knbn, ef_c);
|
||||
} else {
|
||||
println!(" \n serial search");
|
||||
for i in 0..anndata.test_data.len() {
|
||||
let knn_neighbours: Vec<Neighbour> = hnsw.search(&anndata.test_data[i], knbn, ef_c);
|
||||
knn_neighbours_for_tests.push(knn_neighbours);
|
||||
}
|
||||
}
|
||||
cpu_time = start.elapsed();
|
||||
let search_sys_time = now.elapsed().unwrap().as_micros() as f32;
|
||||
let search_cpu_time = cpu_time.as_micros() as f32;
|
||||
println!(
|
||||
"total cpu time for search requests {:?} , system time {:?} ",
|
||||
search_cpu_time, search_sys_time
|
||||
);
|
||||
// now compute recall rate
|
||||
for i in 0..anndata.test_data.len() {
|
||||
let true_distances = anndata.test_distances.row(i);
|
||||
let max_dist = true_distances[knbn - 1];
|
||||
let mut _knn_neighbours_id: Vec<usize> =
|
||||
knn_neighbours_for_tests[i].iter().map(|p| p.d_id).collect();
|
||||
let knn_neighbours_dist: Vec<f32> = knn_neighbours_for_tests[i]
|
||||
.iter()
|
||||
.map(|p| p.distance)
|
||||
.collect();
|
||||
nb_returned.push(knn_neighbours_dist.len());
|
||||
// count how many distances of knn_neighbours_dist are less than
|
||||
let recall = knn_neighbours_dist
|
||||
.iter()
|
||||
.filter(|x| *x <= &max_dist)
|
||||
.count();
|
||||
recalls.push(recall);
|
||||
let mut ratio = 0.;
|
||||
if !knn_neighbours_dist.is_empty() {
|
||||
ratio = knn_neighbours_dist[knn_neighbours_dist.len() - 1] / max_dist;
|
||||
}
|
||||
last_distances_ratio.push(ratio);
|
||||
}
|
||||
let mean_recall = (recalls.iter().sum::<usize>() as f32) / ((knbn * recalls.len()) as f32);
|
||||
println!(
|
||||
"\n mean fraction nb returned by search {:?} ",
|
||||
(nb_returned.iter().sum::<usize>() as f32) / ((nb_returned.len() * knbn) as f32)
|
||||
);
|
||||
println!(
|
||||
"\n last distances ratio {:?} ",
|
||||
last_distances_ratio.iter().sum::<f32>() / last_distances_ratio.len() as f32
|
||||
);
|
||||
println!(
|
||||
"\n recall rate for {:?} is {:?} , nb req /s {:?}",
|
||||
anndata.fname,
|
||||
mean_recall,
|
||||
(nb_search as f32) * 1.0e+6_f32 / search_sys_time
|
||||
);
|
||||
}
|
||||
196
vendor/ruvector/scripts/patches/hnsw_rs/examples/ann-sift1m-128-euclidean.rs
vendored
Normal file
196
vendor/ruvector/scripts/patches/hnsw_rs/examples/ann-sift1m-128-euclidean.rs
vendored
Normal file
@@ -0,0 +1,196 @@
|
||||
#![allow(clippy::needless_range_loop)]
|
||||
|
||||
use cpu_time::ProcessTime;
|
||||
use env_logger::Builder;
|
||||
use std::time::{Duration, SystemTime};
|
||||
|
||||
use anndists::dist::*;
|
||||
use log::info;
|
||||
|
||||
// search in paralle mode 8 core i7-10875H @2.3Ghz time 100 neighbours
|
||||
|
||||
// max_nb_conn ef_cons ef_search scale_factor extend keep pruned recall req/s last ratio
|
||||
//
|
||||
// 64 800 64 1 0 0 0.976 4894 1.001
|
||||
// 64 800 128 1 0 0 0.985 3811 1.00064
|
||||
// 64 800 128 1 1 0 0.9854 3765 1.0
|
||||
|
||||
// 64 1600 64 1 0 0 0.9877 3419. 1.0005
|
||||
|
||||
// search in parallel mode 8 core i7-10875H @2.3Ghz time for 10 neighbours
|
||||
|
||||
// 64 1600 64 1 0 0 0.9907 6100 1.0004
|
||||
// 64 1600 128 1 0 0 0.9959 3077. 1.0001
|
||||
|
||||
// 24 core Core(TM) i9-13900HX simdeez
|
||||
|
||||
// 64 1600 64 1 0 0 0.9907 15258 1.0004
|
||||
// 64 1600 128 1 0 0 0.9957 8296 1.0002
|
||||
|
||||
// 24 core Core(TM) i9-13900HX simdeez with level scale modification factor 0.5
|
||||
//=============================================================================
|
||||
|
||||
// 48 1600 64 0.5 0 0 0.9938 14073 1.0002
|
||||
// 48 1600 128 0.5 0 0 0.9992 7906 1.0000
|
||||
|
||||
// with an AMD ryzen 9 7950X 16-Core simdeez with level scale modification factor 0.5
|
||||
//=============================================================================
|
||||
// 48 1600 64 0.5 0 0 0.9938 17000 1.0002
|
||||
// 48 1600 128 0.5 0 0 0.9992 9600 1.0000
|
||||
|
||||
use hnsw_rs::prelude::*;
|
||||
|
||||
mod utils;
|
||||
use utils::*;
|
||||
|
||||
pub fn main() {
|
||||
//
|
||||
Builder::from_default_env().init();
|
||||
//
|
||||
let parallel = true;
|
||||
//
|
||||
let fname = String::from("/home/jpboth/Data/ANN/sift1m-128-euclidean.hdf5");
|
||||
println!("\n\n test_load_hdf5 {:?}", fname);
|
||||
// now recall that data are stored in row order.
|
||||
let anndata = annhdf5::AnnBenchmarkData::new(fname).unwrap();
|
||||
// run bench
|
||||
let knbn_max = anndata.test_distances.dim().1;
|
||||
let nb_elem = anndata.train_data.len();
|
||||
info!(
|
||||
" train size : {}, test size : {}",
|
||||
nb_elem,
|
||||
anndata.test_data.len()
|
||||
);
|
||||
info!(" nb neighbours answers for test data : {}", knbn_max);
|
||||
//
|
||||
let max_nb_connection = 48;
|
||||
let nb_layer = 16.min((nb_elem as f32).ln().trunc() as usize);
|
||||
let ef_c = 1600;
|
||||
//
|
||||
println!(
|
||||
" number of elements to insert {:?} , setting max nb layer to {:?} ef_construction {:?}",
|
||||
nb_elem, nb_layer, ef_c
|
||||
);
|
||||
println!(
|
||||
" ====================================================================================="
|
||||
);
|
||||
//
|
||||
let mut hnsw = Hnsw::<f32, DistL2>::new(max_nb_connection, nb_elem, nb_layer, ef_c, DistL2 {});
|
||||
//
|
||||
let extend_flag = false;
|
||||
info!("extend flag = {:?} ", extend_flag);
|
||||
hnsw.set_extend_candidates(extend_flag);
|
||||
hnsw.modify_level_scale(0.5);
|
||||
//
|
||||
// parallel insertion
|
||||
let start = ProcessTime::now();
|
||||
let now = SystemTime::now();
|
||||
let data_for_par_insertion = anndata
|
||||
.train_data
|
||||
.iter()
|
||||
.map(|x| (x.0.as_slice(), x.1))
|
||||
.collect();
|
||||
if parallel {
|
||||
println!(" \n parallel insertion");
|
||||
hnsw.parallel_insert_slice(&data_for_par_insertion);
|
||||
} else {
|
||||
println!(" \n serial insertion");
|
||||
for d in data_for_par_insertion {
|
||||
hnsw.insert_slice(d);
|
||||
}
|
||||
}
|
||||
let cpu_time: Duration = start.elapsed();
|
||||
//
|
||||
println!(
|
||||
"\n hnsw data insertion cpu time {:?} system time {:?} ",
|
||||
cpu_time,
|
||||
now.elapsed()
|
||||
);
|
||||
hnsw.dump_layer_info();
|
||||
println!(" hnsw data nb point inserted {:?}", hnsw.get_nb_point());
|
||||
//
|
||||
//
|
||||
let knbn = 10.min(knbn_max);
|
||||
let ef_search = 64;
|
||||
println!("searching with ef = {}", ef_search);
|
||||
search(&mut hnsw, knbn, ef_search, &anndata);
|
||||
//
|
||||
println!("searching with ef = {}", ef_search);
|
||||
let ef_search = 128;
|
||||
search(&mut hnsw, knbn, ef_search, &anndata);
|
||||
}
|
||||
|
||||
pub fn search<Dist>(
|
||||
hnsw: &mut Hnsw<f32, Dist>,
|
||||
knbn: usize,
|
||||
ef_search: usize,
|
||||
anndata: &annhdf5::AnnBenchmarkData,
|
||||
) where
|
||||
Dist: Distance<f32> + Send + Sync,
|
||||
{
|
||||
println!("\n\n ef_search : {:?} knbn : {:?} ", ef_search, knbn);
|
||||
let parallel = true;
|
||||
//
|
||||
let nb_elem = anndata.train_data.len();
|
||||
let nb_search = anndata.test_data.len();
|
||||
//
|
||||
let mut recalls = Vec::<usize>::with_capacity(nb_elem);
|
||||
let mut nb_returned = Vec::<usize>::with_capacity(nb_elem);
|
||||
let mut last_distances_ratio = Vec::<f32>::with_capacity(nb_elem);
|
||||
let mut knn_neighbours_for_tests = Vec::<Vec<Neighbour>>::with_capacity(nb_elem);
|
||||
hnsw.set_searching_mode(true);
|
||||
println!("searching with ef : {:?}", ef_search);
|
||||
let start = ProcessTime::now();
|
||||
let now = SystemTime::now();
|
||||
// search
|
||||
if parallel {
|
||||
println!(" \n parallel search");
|
||||
knn_neighbours_for_tests = hnsw.parallel_search(&anndata.test_data, knbn, ef_search);
|
||||
} else {
|
||||
println!(" \n serial search");
|
||||
for i in 0..anndata.test_data.len() {
|
||||
let knn_neighbours: Vec<Neighbour> =
|
||||
hnsw.search(&anndata.test_data[i], knbn, ef_search);
|
||||
knn_neighbours_for_tests.push(knn_neighbours);
|
||||
}
|
||||
}
|
||||
let cpu_time = start.elapsed();
|
||||
let search_cpu_time = cpu_time.as_micros() as f32;
|
||||
let search_sys_time = now.elapsed().unwrap().as_micros() as f32;
|
||||
println!(
|
||||
"total cpu time for search requests {:?} , system time {:?} ",
|
||||
search_cpu_time,
|
||||
now.elapsed()
|
||||
);
|
||||
// now compute recall rate
|
||||
for i in 0..anndata.test_data.len() {
|
||||
let max_dist = anndata.test_distances.row(i)[knbn - 1];
|
||||
let knn_neighbours_d: Vec<f32> = knn_neighbours_for_tests[i]
|
||||
.iter()
|
||||
.map(|p| p.distance)
|
||||
.collect();
|
||||
nb_returned.push(knn_neighbours_d.len());
|
||||
let recall = knn_neighbours_d.iter().filter(|d| *d <= &max_dist).count();
|
||||
recalls.push(recall);
|
||||
let mut ratio = 0.;
|
||||
if !knn_neighbours_d.is_empty() {
|
||||
ratio = knn_neighbours_d[knn_neighbours_d.len() - 1] / max_dist;
|
||||
}
|
||||
last_distances_ratio.push(ratio);
|
||||
}
|
||||
let mean_recall = (recalls.iter().sum::<usize>() as f32) / ((knbn * recalls.len()) as f32);
|
||||
println!(
|
||||
"\n mean fraction nb returned by search {:?} ",
|
||||
(nb_returned.iter().sum::<usize>() as f32) / ((nb_returned.len() * knbn) as f32)
|
||||
);
|
||||
println!(
|
||||
"\n last distances ratio {:?} ",
|
||||
last_distances_ratio.iter().sum::<f32>() / last_distances_ratio.len() as f32
|
||||
);
|
||||
println!(
|
||||
"\n recall rate for {:?} is {:?} , nb req /s {:?}",
|
||||
anndata.fname,
|
||||
mean_recall,
|
||||
(nb_search as f32) * 1.0e+6_f32 / search_sys_time
|
||||
);
|
||||
} // end of search
|
||||
63
vendor/ruvector/scripts/patches/hnsw_rs/examples/levensthein.rs
vendored
Normal file
63
vendor/ruvector/scripts/patches/hnsw_rs/examples/levensthein.rs
vendored
Normal file
@@ -0,0 +1,63 @@
|
||||
use anndists::dist::*;
|
||||
|
||||
use hnsw_rs::prelude::*;
|
||||
use rand::Rng;
|
||||
use std::iter;
|
||||
|
||||
fn generate(len: usize) -> String {
|
||||
const CHARSET: &[u8] = b"abcdefghij";
|
||||
let mut rng = rand::rng();
|
||||
let one_char = || CHARSET[rng.random_range(0..CHARSET.len())] as char;
|
||||
iter::repeat_with(one_char).take(len).collect()
|
||||
}
|
||||
|
||||
fn main() {
|
||||
let nb_elem = 500000; // number of possible words in the dictionary
|
||||
let max_nb_connection = 15;
|
||||
let nb_layer = 16.min((nb_elem as f32).ln().trunc() as usize);
|
||||
let ef_c = 200;
|
||||
let nb_words = 1000;
|
||||
let hns = Hnsw::<u16, DistLevenshtein>::new(
|
||||
max_nb_connection,
|
||||
nb_elem,
|
||||
nb_layer,
|
||||
ef_c,
|
||||
DistLevenshtein {},
|
||||
);
|
||||
let mut words = vec![];
|
||||
for _n in 1..nb_words {
|
||||
let tw = generate(5);
|
||||
words.push(tw);
|
||||
}
|
||||
words.push(String::from("abcdj"));
|
||||
//
|
||||
for (i, w) in words.iter().enumerate() {
|
||||
let vec: Vec<u16> = w.chars().map(|c| c as u16).collect();
|
||||
hns.insert((&vec, i));
|
||||
}
|
||||
// create a filter
|
||||
let mut filter: Vec<usize> = Vec::new();
|
||||
for i in 1..100 {
|
||||
filter.push(i);
|
||||
}
|
||||
//
|
||||
let ef_search: usize = 30;
|
||||
let tosearch: Vec<u16> = "abcde".chars().map(|c| c as u16).collect();
|
||||
//
|
||||
println!("========== search with filter ");
|
||||
let res = hns.search_filter(&tosearch, 10, ef_search, Some(&filter));
|
||||
for r in res {
|
||||
println!(
|
||||
"Word: {:?} Id: {:?} Distance: {:?}",
|
||||
words[r.d_id], r.d_id, r.distance
|
||||
);
|
||||
}
|
||||
println!("========== search without filter ");
|
||||
let res3 = hns.search(&tosearch, 10, ef_search);
|
||||
for r in res3 {
|
||||
println!(
|
||||
"Word: {:?} Id: {:?} Distance: {:?}",
|
||||
words[r.d_id], r.d_id, r.distance
|
||||
);
|
||||
}
|
||||
}
|
||||
80
vendor/ruvector/scripts/patches/hnsw_rs/examples/random.rs
vendored
Normal file
80
vendor/ruvector/scripts/patches/hnsw_rs/examples/random.rs
vendored
Normal file
@@ -0,0 +1,80 @@
|
||||
#![allow(clippy::needless_range_loop)]
|
||||
#![allow(clippy::range_zip_with_len)]
|
||||
|
||||
use cpu_time::ProcessTime;
|
||||
use rand::distr::Uniform;
|
||||
use rand::prelude::*;
|
||||
use std::time::{Duration, SystemTime};
|
||||
|
||||
use anndists::dist::*;
|
||||
use hnsw_rs::prelude::*;
|
||||
|
||||
fn main() {
|
||||
env_logger::Builder::from_default_env().init();
|
||||
//
|
||||
let nb_elem = 500000;
|
||||
let dim = 25;
|
||||
// generate nb_elem colmuns vectors of dimension dim
|
||||
let mut rng = rand::rng();
|
||||
let unif = rand::distr::StandardUniform;
|
||||
let mut data = Vec::with_capacity(nb_elem);
|
||||
for _ in 0..nb_elem {
|
||||
let column = (0..dim).map(|_| rng.sample(unif)).collect::<Vec<f32>>();
|
||||
data.push(column);
|
||||
}
|
||||
// give an id to each data
|
||||
let data_with_id = data.iter().zip(0..data.len()).collect::<Vec<_>>();
|
||||
|
||||
let ef_c = 200;
|
||||
let max_nb_connection = 15;
|
||||
let nb_layer = 16.min((nb_elem as f32).ln().trunc() as usize);
|
||||
let hns = Hnsw::<f32, DistL2>::new(max_nb_connection, nb_elem, nb_layer, ef_c, DistL2 {});
|
||||
let mut start = ProcessTime::now();
|
||||
let mut begin_t = SystemTime::now();
|
||||
hns.parallel_insert(&data_with_id);
|
||||
let mut cpu_time: Duration = start.elapsed();
|
||||
println!(" hnsw data insertion cpu time {:?}", cpu_time);
|
||||
println!(
|
||||
" hnsw data insertion parallel, system time {:?} \n",
|
||||
begin_t.elapsed().unwrap()
|
||||
);
|
||||
hns.dump_layer_info();
|
||||
println!(
|
||||
" parallel hnsw data nb point inserted {:?}",
|
||||
hns.get_nb_point()
|
||||
);
|
||||
//
|
||||
// serial insertion
|
||||
//
|
||||
let hns = Hnsw::<f32, DistL2>::new(max_nb_connection, nb_elem, nb_layer, ef_c, DistL2 {});
|
||||
start = ProcessTime::now();
|
||||
begin_t = SystemTime::now();
|
||||
for _i in 0..data_with_id.len() {
|
||||
hns.insert((data_with_id[_i].0.as_slice(), data_with_id[_i].1))
|
||||
}
|
||||
cpu_time = start.elapsed();
|
||||
println!("\n\n serial hnsw data insertion {:?}", cpu_time);
|
||||
println!(
|
||||
" hnsw data insertion serial, system time {:?}",
|
||||
begin_t.elapsed().unwrap()
|
||||
);
|
||||
hns.dump_layer_info();
|
||||
println!(
|
||||
" serial hnsw data nb point inserted {:?}",
|
||||
hns.get_nb_point()
|
||||
);
|
||||
|
||||
let ef_search = max_nb_connection * 2;
|
||||
let knbn = 10;
|
||||
//
|
||||
for _iter in 0..100 {
|
||||
let mut r_vec = Vec::<f32>::with_capacity(dim);
|
||||
let mut rng = rand::rng();
|
||||
let unif = Uniform::<f32>::new(0., 1.).unwrap();
|
||||
for _ in 0..dim {
|
||||
r_vec.push(rng.sample(unif));
|
||||
}
|
||||
//
|
||||
let _neighbours = hns.search(&r_vec, knbn, ef_search);
|
||||
}
|
||||
}
|
||||
233
vendor/ruvector/scripts/patches/hnsw_rs/examples/utils/annhdf5.rs
vendored
Normal file
233
vendor/ruvector/scripts/patches/hnsw_rs/examples/utils/annhdf5.rs
vendored
Normal file
@@ -0,0 +1,233 @@
|
||||
//! This file provides hdf5 utilities to load ann-benchmarks hdf5 data files
|
||||
//! As the libray does not depend on hdf5 nor on ndarray, it is nearly the same for both
|
||||
//! ann benchmarks.
|
||||
|
||||
use ndarray::Array2;
|
||||
|
||||
use ::hdf5::*;
|
||||
use log::debug;
|
||||
|
||||
// datasets
|
||||
// . distances (nbojects, dim) f32 matrix for tests objects
|
||||
// . neighbors (nbobjects, nbnearest) int32 matrix giving the num of nearest neighbors in train data
|
||||
// . test (nbobjects, dim) f32 matrix test data
|
||||
// . train (nbobjects, dim) f32 matrix train data
|
||||
|
||||
/// A structure to load hdf5 benchmark files from <https://github.com/erikbern/ann-benchmarks>.
///
/// The files provide four datasets (see module comments above): "train" (vectors to
/// index), "test" (query vectors), "neighbors" (true neighbour ids for each query)
/// and "distances" (true neighbour distances for each query).
pub struct AnnBenchmarkData {
    // path of the hdf5 file the data was loaded from
    pub fname: String,
    /// distances from each test object to its nearest neighbours.
    pub test_distances: Array2<f32>,
    // for each test data, id of its nearest neighbours
    #[allow(unused)]
    pub test_neighbours: Array2<i32>,
    /// list of vectors for which we will search ann.
    pub test_data: Vec<Vec<f32>>,
    /// list of data vectors and id
    pub train_data: Vec<(Vec<f32>, usize)>,
    /// searched results. first neighbours for each test data.
    #[allow(unused)]
    pub searched_neighbours: Vec<Vec<i32>>,
    /// distances of neighbours obtained of each test
    #[allow(unused)]
    pub searched_distances: Vec<Vec<f32>>,
}
|
||||
|
||||
impl AnnBenchmarkData {
|
||||
pub fn new(fname: String) -> Result<AnnBenchmarkData> {
|
||||
let res = hdf5::File::open(fname.clone());
|
||||
if res.is_err() {
|
||||
println!("you must download file {:?}", fname);
|
||||
panic!(
|
||||
"download benchmark file some where and modify examples source file accordingly"
|
||||
);
|
||||
}
|
||||
let file = res.ok().unwrap();
|
||||
//
|
||||
// get test distances
|
||||
//
|
||||
let res_distances = file.dataset("distances");
|
||||
if res_distances.is_err() {
|
||||
// let reader = hdf5::Reader::<f32>::new(&test_distance);
|
||||
panic!("error getting distances dataset");
|
||||
}
|
||||
let distances = res_distances.unwrap();
|
||||
let shape = distances.shape();
|
||||
assert_eq!(shape.len(), 2);
|
||||
let dataf32 = distances.dtype().unwrap().is::<f32>();
|
||||
if !dataf32 {
|
||||
// error
|
||||
panic!("error getting type distances dataset");
|
||||
}
|
||||
// read really data
|
||||
let res = distances.read_2d::<f32>();
|
||||
if res.is_err() {
|
||||
// some error
|
||||
panic!("error reading distances dataset");
|
||||
}
|
||||
let test_distances = res.unwrap();
|
||||
// a check for row order
|
||||
debug!(
|
||||
"First 2 distances for first test {:?} {:?} ",
|
||||
test_distances.get((0, 0)).unwrap(),
|
||||
test_distances.get((0, 1)).unwrap()
|
||||
);
|
||||
//
|
||||
// read neighbours
|
||||
//
|
||||
let res_neighbours = file.dataset("neighbors");
|
||||
if res_neighbours.is_err() {
|
||||
// let reader = hdf5::Reader::<f32>::new(&test_distance);
|
||||
panic!("error getting neighbours");
|
||||
}
|
||||
let neighbours = res_neighbours.unwrap();
|
||||
let shape = neighbours.shape();
|
||||
assert_eq!(shape.len(), 2);
|
||||
println!("neighbours shape : {:?}", shape);
|
||||
let datai32 = neighbours.dtype().unwrap().is::<i32>();
|
||||
if !datai32 {
|
||||
// error
|
||||
panic!("error getting type neighbours");
|
||||
}
|
||||
// read really data
|
||||
let res = neighbours.read_2d::<i32>();
|
||||
if res.is_err() {
|
||||
// some error
|
||||
panic!("error reading neighbours dataset");
|
||||
}
|
||||
let test_neighbours = res.unwrap();
|
||||
debug!(
|
||||
"First 2 neighbours for first test {:?} {:?} ",
|
||||
test_neighbours.get((0, 0)).unwrap(),
|
||||
test_neighbours.get((0, 1)).unwrap()
|
||||
);
|
||||
println!("\n 10 first neighbours for first vector : ");
|
||||
for i in 0..10 {
|
||||
print!(" {:?} ", test_neighbours.get((0, i)).unwrap());
|
||||
}
|
||||
println!("\n 10 first neighbours for second vector : ");
|
||||
for i in 0..10 {
|
||||
print!(" {:?} ", test_neighbours.get((1, i)).unwrap());
|
||||
}
|
||||
//
|
||||
// read test data
|
||||
// ===============
|
||||
//
|
||||
let res_testdata = file.dataset("test");
|
||||
if res_testdata.is_err() {
|
||||
panic!("error getting test de notataset");
|
||||
}
|
||||
let test_data = res_testdata.unwrap();
|
||||
let shape = test_data.shape(); // nota shape returns a slice, dim returns a t-uple
|
||||
assert_eq!(shape.len(), 2);
|
||||
let dataf32 = test_data.dtype().unwrap().is::<f32>();
|
||||
if !dataf32 {
|
||||
panic!("error getting type de notistances dataset");
|
||||
}
|
||||
// read really datae not
|
||||
let res = test_data.read_2d::<f32>();
|
||||
if res.is_err() {
|
||||
// some error
|
||||
panic!("error reading distances dataset");
|
||||
}
|
||||
let test_data_2d = res.unwrap();
|
||||
let mut test_data = Vec::<Vec<f32>>::with_capacity(shape[1]);
|
||||
let (nbrow, nbcolumn) = test_data_2d.dim();
|
||||
println!(" test data, nb element {:?}, dim : {:?}", nbrow, nbcolumn);
|
||||
for i in 0..nbrow {
|
||||
let mut vec = Vec::with_capacity(nbcolumn);
|
||||
for j in 0..nbcolumn {
|
||||
vec.push(*test_data_2d.get((i, j)).unwrap());
|
||||
}
|
||||
test_data.push(vec);
|
||||
}
|
||||
//
|
||||
// loaf train data
|
||||
//
|
||||
let res_traindata = file.dataset("train");
|
||||
if res_traindata.is_err() {
|
||||
panic!("error getting distances dataset");
|
||||
}
|
||||
let train_data = res_traindata.unwrap();
|
||||
let train_shape = train_data.shape();
|
||||
assert_eq!(shape.len(), 2);
|
||||
if test_data_2d.dim().1 != train_shape[1] {
|
||||
println!("test and train have not the same dimension");
|
||||
panic!();
|
||||
}
|
||||
println!(
|
||||
"\n train data shape : {:?}, nbvector {:?} ",
|
||||
train_shape, train_shape[0]
|
||||
);
|
||||
let dataf32 = train_data.dtype().unwrap().is::<f32>();
|
||||
if !dataf32 {
|
||||
// error
|
||||
panic!("error getting type distances dataset");
|
||||
}
|
||||
// read really data
|
||||
let res = train_data.read_2d::<f32>();
|
||||
if res.is_err() {
|
||||
// some error
|
||||
panic!("error reading distances dataset");
|
||||
}
|
||||
let train_data_2d = res.unwrap();
|
||||
let mut train_data = Vec::<(Vec<f32>, usize)>::with_capacity(shape[1]);
|
||||
let (nbrow, nbcolumn) = train_data_2d.dim();
|
||||
for i in 0..nbrow {
|
||||
let mut vec = Vec::with_capacity(nbcolumn);
|
||||
for j in 0..nbcolumn {
|
||||
vec.push(*train_data_2d.get((i, j)).unwrap());
|
||||
}
|
||||
train_data.push((vec, i));
|
||||
}
|
||||
//
|
||||
// now allocate array's for result
|
||||
//
|
||||
println!(
|
||||
" allocating vector for search neighbours answer : {:?}",
|
||||
test_data.len()
|
||||
);
|
||||
let searched_neighbours = Vec::<Vec<i32>>::with_capacity(test_data.len());
|
||||
let searched_distances = Vec::<Vec<f32>>::with_capacity(test_data.len());
|
||||
// searched_distances
|
||||
Ok(AnnBenchmarkData {
|
||||
fname: fname.clone(),
|
||||
test_distances,
|
||||
test_neighbours,
|
||||
test_data,
|
||||
train_data,
|
||||
searched_neighbours,
|
||||
searched_distances,
|
||||
})
|
||||
} // end new
|
||||
|
||||
/// do l2 normalisation of test and train vector to use DistDot metrinc instead DistCosine to spare cpu
|
||||
#[allow(unused)]
|
||||
pub fn do_l2_normalization(&mut self) {
|
||||
for i in 0..self.test_data.len() {
|
||||
anndists::dist::l2_normalize(&mut self.test_data[i]);
|
||||
}
|
||||
for i in 0..self.train_data.len() {
|
||||
anndists::dist::l2_normalize(&mut self.train_data[i].0);
|
||||
}
|
||||
} // end of do_l2_normalization
|
||||
} // end of impl block
|
||||
|
||||
#[cfg(test)]
mod tests {

    use super::*;

    // Smoke test: loads a real benchmark file end to end.
    // NOTE(review): depends on a hard-coded local path; the file must exist
    // on the machine running the test — adjust the path before running.
    #[test]
    fn test_load_hdf5() {
        env_logger::Builder::from_default_env().init();
        //
        let fname = String::from("/home.2/Data/ANN/glove-25-angular.hdf5");
        println!("\n\n test_load_hdf5 {:?}", fname);
        // now recall that data are stored in row order.
        let _anndata = AnnBenchmarkData::new(fname).unwrap();
        //
    } // end of test_load_hdf5
} // end of module test
|
||||
3
vendor/ruvector/scripts/patches/hnsw_rs/examples/utils/mod.rs
vendored
Normal file
3
vendor/ruvector/scripts/patches/hnsw_rs/examples/utils/mod.rs
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
//! hdf5 utilities for examples
|
||||
|
||||
pub mod annhdf5;
|
||||
87
vendor/ruvector/scripts/patches/hnsw_rs/src/api.rs
vendored
Normal file
87
vendor/ruvector/scripts/patches/hnsw_rs/src/api.rs
vendored
Normal file
@@ -0,0 +1,87 @@
|
||||
//! Api for external language.
|
||||
//! This file provides a trait to be used as an opaque pointer for C or Julia calls used in file libext.rs
|
||||
|
||||
use std::path::Path;
|
||||
|
||||
use serde::{de::DeserializeOwned, Serialize};
|
||||
|
||||
use crate::hnsw::*;
|
||||
use crate::hnswio::*;
|
||||
use anndists::dist::distances::Distance;
|
||||
use log::info;
|
||||
|
||||
/// Generic ANN interface, used as an opaque handle for the C / Julia bindings in libext.rs.
pub trait AnnT {
    /// type of data vectors
    type Val;
    /// Inserts one data vector with its id into the index.
    fn insert_data(&mut self, data: &[Self::Val], id: usize);
    /// Searches the `knbn` nearest neighbours of `data` with search width `ef_s`.
    fn search_neighbours(&self, data: &[Self::Val], knbn: usize, ef_s: usize) -> Vec<Neighbour>;
    /// Batch counterpart of [`Self::insert_data`] for (vector, id) pairs.
    fn parallel_insert_data(&mut self, data: &[(&Vec<Self::Val>, usize)]);
    /// Batch counterpart of [`Self::search_neighbours`]; returns one answer list per query.
    fn parallel_search_neighbours(
        &self,
        data: &[Vec<Self::Val>],
        knbn: usize,
        ef_s: usize,
    ) -> Vec<Vec<Neighbour>>;
    ///
    /// dumps a data and graph in 2 files.
    /// Datas are dumped in file filename.hnsw.data and graph in filename.hnsw.graph
    ///
    /// **We do not overwrite old files if they are currently in use by memory map**
    /// If these files already exist , they are not overwritten and a unique filename is generated by concatenating a random number to filename.
    /// The function returns the basename used for the dump
    fn file_dump(&self, path: &Path, file_basename: &str) -> anyhow::Result<String>;
}
|
||||
|
||||
impl<T, D> AnnT for Hnsw<'_, T, D>
|
||||
where
|
||||
T: Serialize + DeserializeOwned + Clone + Send + Sync,
|
||||
D: Distance<T> + Send + Sync,
|
||||
{
|
||||
type Val = T;
|
||||
//
|
||||
fn insert_data(&mut self, data: &[Self::Val], id: usize) {
|
||||
self.insert((data, id));
|
||||
}
|
||||
//
|
||||
fn search_neighbours(&self, data: &[T], knbn: usize, ef_s: usize) -> Vec<Neighbour> {
|
||||
self.search(data, knbn, ef_s)
|
||||
}
|
||||
fn parallel_insert_data(&mut self, data: &[(&Vec<Self::Val>, usize)]) {
|
||||
self.parallel_insert(data);
|
||||
}
|
||||
|
||||
fn parallel_search_neighbours(
|
||||
&self,
|
||||
data: &[Vec<Self::Val>],
|
||||
knbn: usize,
|
||||
ef_s: usize,
|
||||
) -> Vec<Vec<Neighbour>> {
|
||||
self.parallel_search(data, knbn, ef_s)
|
||||
}
|
||||
|
||||
// The main entry point to do a dump.
|
||||
// It will generate two files one for the graph part of the data. The other for the real data points of the structure.
|
||||
// The names of file are $filename.hnsw.graph for the graph and $filename.hnsw.data.
|
||||
fn file_dump(&self, path: &Path, file_basename: &str) -> anyhow::Result<String> {
|
||||
info!("In Hnsw::file_dump");
|
||||
//
|
||||
// do not overwrite if mmap is active
|
||||
let overwrite = !self.get_datamap_opt();
|
||||
let mut dumpinit = DumpInit::new(path, file_basename, overwrite);
|
||||
let dumpname = dumpinit.get_basename().clone();
|
||||
//
|
||||
let res = self.dump(DumpMode::Full, &mut dumpinit);
|
||||
//
|
||||
dumpinit.flush()?;
|
||||
info!("\n End of dump, file basename : {}\n", &dumpname);
|
||||
if res.is_ok() {
|
||||
Ok(dumpname)
|
||||
} else {
|
||||
Err(anyhow::anyhow!("unexpected error"))
|
||||
}
|
||||
} // end of dump
|
||||
} // end of impl block AnnT for Hnsw<T,D>
|
||||
457
vendor/ruvector/scripts/patches/hnsw_rs/src/datamap.rs
vendored
Normal file
457
vendor/ruvector/scripts/patches/hnsw_rs/src/datamap.rs
vendored
Normal file
@@ -0,0 +1,457 @@
|
||||
//! This module provides a memory mapping of Data vectors filling the Hnsw structure.
|
||||
//! It is used by the module [hnswio] and also gives access to an iterator over data without loading the graph.
|
||||
//!
|
||||
//! We mmap the file and provide
|
||||
//! - a Hashmap from DataId to address
|
||||
//! - an interface for retrieving just data vectors loaded in the hnsw structure.
|
||||
|
||||
use std::io::BufReader;
|
||||
|
||||
use std::fs::{File, OpenOptions};
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use indexmap::map::IndexMap;
|
||||
use log::{debug, error, info, trace};
|
||||
use mmap_rs::{Mmap, MmapOptions};
|
||||
|
||||
use crate::hnsw::DataId;
|
||||
use crate::hnswio;
|
||||
|
||||
use crate::hnswio::MAGICDATAP;
|
||||
/// This structure uses the data part of the dump of a Hnsw structure to retrieve the data.
/// The data is accessed via a mmap of the data file, so memory is spared at the expense of page loading.
// possibly to be used in graph to spare memory?
pub struct DataMap {
    /// File containing Points data
    _datapath: PathBuf,
    /// The mmap structure
    mmap: Mmap,
    /// map a dataId to an address where we get a bson encoded vector of type T
    // IndexMap (unlike HashMap) preserves insertion order
    hmap: IndexMap<DataId, usize>,
    /// type name of Data
    t_name: String,
    /// dimension of data vector
    dimension: usize,
    /// distance name recorded in the dump description
    distname: String,
} // end of DataMap
|
||||
|
||||
impl DataMap {
|
||||
// TODO: specifiy mmap option
|
||||
/// The fname argument corresponds to the basename of the dump.
|
||||
/// To reload from file fname.hnsw.data just pass fname as argument.
|
||||
/// The dir argument is the directory where the fname.hnsw.data and fname.hnsw.graph reside.
|
||||
pub fn from_hnswdump<T: std::fmt::Debug>(
|
||||
dir: &Path,
|
||||
file_name: &str,
|
||||
) -> Result<DataMap, String> {
|
||||
// reload description to have data type, and check for dump version
|
||||
let mut graphpath = PathBuf::from(dir);
|
||||
graphpath.push(dir);
|
||||
let mut filename = file_name.to_string();
|
||||
filename.push_str(".hnsw.graph");
|
||||
graphpath.push(filename);
|
||||
let graphfileres = OpenOptions::new().read(true).open(&graphpath);
|
||||
if graphfileres.is_err() {
|
||||
println!("DataMap: could not open file {:?}", graphpath.as_os_str());
|
||||
std::process::exit(1);
|
||||
}
|
||||
let graphfile = graphfileres.unwrap();
|
||||
let mut graph_in = BufReader::new(graphfile);
|
||||
// we need to call load_description first to get distance name
|
||||
let hnsw_description = hnswio::load_description(&mut graph_in).unwrap();
|
||||
if hnsw_description.format_version <= 2 {
|
||||
let msg = String::from(
|
||||
"from_hnsw::from_hnsw : data mapping is only possible for dumps with the version > 0.1.19 of this crate",
|
||||
);
|
||||
error!(
|
||||
"Data mapping is only possible for dumps with the version > 0.1.19 of this crate"
|
||||
);
|
||||
return Err(msg);
|
||||
}
|
||||
let distname = hnsw_description.distname.clone();
|
||||
let t_name = hnsw_description.get_typename();
|
||||
// check typename coherence
|
||||
info!("Got typename from reload : {:?}", t_name);
|
||||
if std::any::type_name::<T>() != t_name {
|
||||
error!(
|
||||
"Description has typename {:?}, function type argument is : {:?}",
|
||||
t_name,
|
||||
std::any::type_name::<T>()
|
||||
);
|
||||
return Err(String::from("type error"));
|
||||
}
|
||||
// get dimension as declared in description
|
||||
let descr_dimension = hnsw_description.get_dimension();
|
||||
drop(graph_in);
|
||||
//
|
||||
// we know data filename is hnswdump.hnsw.data
|
||||
//
|
||||
let mut datapath = PathBuf::new();
|
||||
datapath.push(dir);
|
||||
let mut filename = file_name.to_string();
|
||||
filename.push_str(".hnsw.data");
|
||||
datapath.push(filename);
|
||||
//
|
||||
let meta = std::fs::metadata(&datapath);
|
||||
if meta.is_err() {
|
||||
error!("Could not open file : {:?}", &datapath);
|
||||
std::process::exit(1);
|
||||
}
|
||||
let fsize = meta.unwrap().len().try_into().unwrap();
|
||||
//
|
||||
let file_res = File::open(&datapath);
|
||||
if file_res.is_err() {
|
||||
error!("Could not open file : {:?}", &datapath);
|
||||
std::process::exit(1);
|
||||
}
|
||||
let file = file_res.unwrap();
|
||||
let offset = 0;
|
||||
//
|
||||
let mmap_opt = MmapOptions::new(fsize).unwrap();
|
||||
let mmap_opt = unsafe { mmap_opt.with_file(&file, offset) };
|
||||
let mapping_res = mmap_opt.map();
|
||||
if mapping_res.is_err() {
|
||||
error!("Could not memory map : {:?}", &datapath);
|
||||
std::process::exit(1);
|
||||
}
|
||||
let mmap = mapping_res.unwrap();
|
||||
//
|
||||
info!("Mmap done on file : {:?}", &datapath);
|
||||
//
|
||||
// where are we in decoding mmap slice? at beginning
|
||||
//
|
||||
let mapped_slice = mmap.as_slice();
|
||||
//
|
||||
// where are we in decoding mmap slice?
|
||||
let mut current_mmap_addr = 0usize;
|
||||
let mut usize_slice = [0u8; std::mem::size_of::<usize>()];
|
||||
// check magic
|
||||
let mut u32_slice = [0u8; std::mem::size_of::<u32>()];
|
||||
u32_slice.copy_from_slice(
|
||||
&mapped_slice[current_mmap_addr..current_mmap_addr + std::mem::size_of::<u32>()],
|
||||
);
|
||||
current_mmap_addr += std::mem::size_of::<u32>();
|
||||
let magic = u32::from_ne_bytes(u32_slice);
|
||||
assert_eq!(magic, MAGICDATAP, "magic not equal to MAGICDATAP in mmap");
|
||||
// get dimension
|
||||
usize_slice.copy_from_slice(
|
||||
&mapped_slice[current_mmap_addr..current_mmap_addr + std::mem::size_of::<usize>()],
|
||||
);
|
||||
current_mmap_addr += std::mem::size_of::<usize>();
|
||||
let dimension = usize::from_ne_bytes(usize_slice);
|
||||
if dimension != descr_dimension {
|
||||
error!(
|
||||
"Description and data do not agree on dimension, data got : {:?}, description got : {:?}",
|
||||
dimension, descr_dimension
|
||||
);
|
||||
return Err(String::from(
|
||||
"description and data do not agree on dimension",
|
||||
));
|
||||
} else {
|
||||
info!("Got dimension : {:?}", dimension);
|
||||
}
|
||||
//
|
||||
// now we know that each record consists in
|
||||
// - MAGICDATAP (u32), DataId (u64), dimension (u64) and then (length of type in bytes * dimension)
|
||||
//
|
||||
let record_size = std::mem::size_of::<u32>()
|
||||
+ 2 * std::mem::size_of::<u64>()
|
||||
+ dimension * std::mem::size_of::<T>();
|
||||
let residual = mmap.size() - current_mmap_addr;
|
||||
info!(
|
||||
"Mmap size {}, current_mmap_addr {}, residual : {}",
|
||||
mmap.size(),
|
||||
current_mmap_addr,
|
||||
residual
|
||||
);
|
||||
let nb_record = residual / record_size;
|
||||
debug!("Record size : {}, nb_record : {}", record_size, nb_record);
|
||||
// allocate hmap with correct capacity
|
||||
let mut hmap = IndexMap::<DataId, usize>::with_capacity(nb_record);
|
||||
// fill hmap to have address of each data point in file
|
||||
let mut u64_slice = [0u8; std::mem::size_of::<u64>()];
|
||||
//
|
||||
// now we loop on records
|
||||
//
|
||||
for i in 0..nb_record {
|
||||
debug!("Record i : {}, addr : {}", i, current_mmap_addr);
|
||||
// decode Magic
|
||||
u32_slice.copy_from_slice(
|
||||
&mapped_slice[current_mmap_addr..current_mmap_addr + std::mem::size_of::<u32>()],
|
||||
);
|
||||
current_mmap_addr += std::mem::size_of::<u32>();
|
||||
let magic = u32::from_ne_bytes(u32_slice);
|
||||
assert_eq!(magic, MAGICDATAP, "magic not equal to MAGICDATAP in mmap");
|
||||
// decode DataId
|
||||
u64_slice.copy_from_slice(
|
||||
&mapped_slice[current_mmap_addr..current_mmap_addr + std::mem::size_of::<u64>()],
|
||||
);
|
||||
current_mmap_addr += std::mem::size_of::<u64>();
|
||||
let data_id = u64::from_ne_bytes(u64_slice) as usize;
|
||||
debug!(
|
||||
"Inserting in hmap : got dataid : {:?} current map address : {:?}",
|
||||
data_id, current_mmap_addr
|
||||
);
|
||||
// Note we store address where we have to decode dimension*size_of::<T> and full bson encoded vector
|
||||
hmap.insert(data_id, current_mmap_addr);
|
||||
// now read serialized length
|
||||
u64_slice.copy_from_slice(
|
||||
&mapped_slice[current_mmap_addr..current_mmap_addr + std::mem::size_of::<u64>()],
|
||||
);
|
||||
current_mmap_addr += std::mem::size_of::<u64>();
|
||||
let serialized_len = u64::from_ne_bytes(u64_slice) as usize;
|
||||
if i == 0 {
|
||||
debug!("serialized bytes len to reload {:?}", serialized_len);
|
||||
}
|
||||
let mut v_serialized = vec![0; serialized_len];
|
||||
v_serialized.copy_from_slice(
|
||||
&mapped_slice[current_mmap_addr..current_mmap_addr + serialized_len],
|
||||
);
|
||||
current_mmap_addr += serialized_len;
|
||||
let slice_t =
|
||||
unsafe { std::slice::from_raw_parts(v_serialized.as_ptr() as *const T, dimension) };
|
||||
trace!(
|
||||
"Deserialized v : {:?} address : {:?} ",
|
||||
slice_t,
|
||||
v_serialized.as_ptr() as *const T
|
||||
);
|
||||
} // end of for on record
|
||||
//
|
||||
debug!("End of DataMap::from_hnsw.");
|
||||
//
|
||||
let datamap = DataMap {
|
||||
_datapath: datapath,
|
||||
mmap,
|
||||
hmap,
|
||||
t_name,
|
||||
dimension: descr_dimension,
|
||||
distname,
|
||||
};
|
||||
//
|
||||
Ok(datamap)
|
||||
} // end of from_datas
|
||||
|
||||
//
|
||||
|
||||
/// returns true if type T corresponds to type as retrieved in DataMap.
|
||||
/// This function can (should!) be used before calling [Self::get_data()]
|
||||
pub fn check_data_type<T>(&self) -> bool
|
||||
where
|
||||
T: 'static + Sized,
|
||||
{
|
||||
// we check last part of name of type
|
||||
let tname_vec = self.t_name.rsplit_terminator("::").collect::<Vec<&str>>();
|
||||
|
||||
if tname_vec.last().is_none() {
|
||||
let errmsg = "DataMap::check_data_type() cannot determine data type name ";
|
||||
error!("DataMap::check_data_type() cannot determine data type name ");
|
||||
std::panic!("DataMap::check_data_type(), {}", errmsg);
|
||||
}
|
||||
let tname_last = tname_vec.last().unwrap();
|
||||
//
|
||||
let datat_name_arg = std::any::type_name::<T>().to_string();
|
||||
let datat_name_vec = datat_name_arg
|
||||
.rsplit_terminator("::")
|
||||
.collect::<Vec<&str>>();
|
||||
|
||||
let datat_name_arg_last = datat_name_vec.last().unwrap();
|
||||
//
|
||||
if datat_name_arg_last == tname_last {
|
||||
true
|
||||
} else {
|
||||
info!(
|
||||
"Data type in DataMap : {}, type arg = {}",
|
||||
tname_last, datat_name_arg_last
|
||||
);
|
||||
false
|
||||
}
|
||||
} // end of check_data_type
|
||||
|
||||
//
|
||||
|
||||
/// return the data corresponding to dataid. Access is done using mmap.
/// Function returns None if address is invalid
/// This function requires you know the type T.
/// **As mmap loading calls an unsafe function it is recommended to check the type name with [Self::check_data_type()]**
///
/// Record layout at `hmap[dataid]` (as written by the dump and indexed in
/// `from_hnswdump`): a u64 serialized byte length followed by the raw vector bytes.
pub fn get_data<'a, T: Clone + std::fmt::Debug>(&'a self, dataid: &DataId) -> Option<&'a [T]> {
    //
    trace!("In DataMap::get_data, dataid : {:?}", dataid);
    // unknown ids return None via `?`
    let address = self.hmap.get(dataid)?;
    debug!("Address for id : {}, address : {:?}", dataid, address);
    let mut current_mmap_addr = *address;
    let mapped_slice = self.mmap.as_slice();
    // read the u64 length prefix of the serialized vector
    let mut u64_slice = [0u8; std::mem::size_of::<u64>()];
    u64_slice.copy_from_slice(
        &mapped_slice[current_mmap_addr..current_mmap_addr + std::mem::size_of::<u64>()],
    );
    let serialized_len = u64::from_ne_bytes(u64_slice) as usize;
    current_mmap_addr += std::mem::size_of::<u64>();
    trace!("Serialized bytes len to reload {:?}", serialized_len);
    // SAFETY-NOTE(review): reinterprets mmapped bytes as &[T; dimension].
    // This presumes the file holds native-endian T values and that the byte
    // offset is suitably aligned for T — neither is verified here; calling
    // check_data_type() first guards at least the type name.
    let slice_t = unsafe {
        std::slice::from_raw_parts(
            mapped_slice[current_mmap_addr..].as_ptr() as *const T,
            self.dimension,
        )
    };
    Some(slice_t)
}
|
||||
|
||||
/// returns Keys in order they are in the file, thus optimizing file/memory access.
/// Note that in case of parallel insertion this can be different from insertion order.
pub fn get_dataid_iter(&self) -> indexmap::map::Keys<'_, DataId, usize> {
    // IndexMap preserves insertion order, which here is file order
    self.hmap.keys()
}
|
||||
|
||||
/// returns full data type name
|
||||
pub fn get_data_typename(&self) -> String {
|
||||
self.t_name.clone()
|
||||
}
|
||||
|
||||
/// returns the full distance type name recorded in the dump description
/// (the previous doc comment wrongly said "data type name").
pub fn get_distname(&self) -> String {
    self.distname.clone()
}
|
||||
|
||||
/// return the number of data in mmap
pub fn get_nb_data(&self) -> usize {
    // one hmap entry was inserted per record found in the data file
    self.hmap.len()
}
|
||||
} // end of impl DataMap
|
||||
|
||||
//=====================================================================================
|
||||
|
||||
#[cfg(test)]
mod tests {

    use super::*;

    use crate::hnswio::HnswIo;
    use anndists::dist::*;

    pub use crate::api::AnnT;
    use crate::prelude::*;

    use rand::distr::{Distribution, Uniform};

    /// install the test logger once; try_init() tolerates repeated calls
    fn log_init_test() {
        let _ = env_logger::builder().is_test(true).try_init();
    }

    /// dumps an hnsw index to a tempdir, then checks random access to the
    /// data through the mmapped DataMap.
    #[test]
    fn test_file_mmap() {
        println!("\n\n test_file_mmap");
        log_init_test();
        // generate a random test : nbcolumn vectors of nbrow f32 each
        let mut rng = rand::rng();
        let unif = Uniform::<f32>::new(0., 1.).unwrap();
        // 50 vectors of size 11 f32
        let nbcolumn = 50;
        let nbrow = 11;
        let mut xsi;
        let mut data = Vec::with_capacity(nbcolumn);
        for j in 0..nbcolumn {
            data.push(Vec::with_capacity(nbrow));
            for _ in 0..nbrow {
                xsi = unif.sample(&mut rng);
                data[j].push(xsi);
            }
            debug!("j : {:?}, data : {:?} ", j, &data[j]);
        }
        // define hnsw
        let ef_construct = 25;
        let nb_connection = 10;
        let hnsw = Hnsw::<f32, DistL1>::new(nb_connection, nbcolumn, 16, ef_construct, DistL1 {});
        for (i, d) in data.iter().enumerate() {
            hnsw.insert((d, i));
        }
        // some logging info
        hnsw.dump_layer_info();
        // dump in a file. Must take care of name as tests run in parallel !!!
        let fname = "mmap_test";
        let directory = tempfile::tempdir().unwrap();
        let _res = hnsw.file_dump(directory.path(), fname);

        let check_reload = false;
        if check_reload {
            // We check we can reload from the SAME directory we dumped into.
            // (The previous code created a fresh, empty tempdir here, so the
            // reload could never have succeeded had the flag been enabled.)
            debug!("HNSW reload.");
            let mut reloader = HnswIo::new(directory.path(), fname);
            let hnsw_loaded: Hnsw<f32, DistL1> = reloader.load_hnsw::<f32, DistL1>().unwrap();
            check_graph_equality(&hnsw_loaded, &hnsw);
            info!("========= reload success, going to mmap reloading =========");
        }
        //
        // now test reload of the data through a mmapped DataMap
        let datamap: DataMap = DataMap::from_hnswdump::<f32>(directory.path(), fname).unwrap();
        let nb_test = 30;
        info!("Checking random access of id , nb test : {}", nb_test);
        for _ in 0..nb_test {
            // sample an id in 0..nb_data
            let unif = Uniform::<usize>::new(0, nbcolumn).unwrap();
            let id = unif.sample(&mut rng);
            let d = datamap.get_data::<f32>(&id);
            assert!(d.is_some());
            if d.is_some() {
                debug!("id = {}, v = {:?}", id, d.as_ref().unwrap());
                assert_eq!(d.as_ref().unwrap(), &data[id]);
            }
        }
        // test iterator from datamap
        let keys = datamap.get_dataid_iter();
        for k in keys {
            let _data = datamap.get_data::<f32>(k);
        }
    } // end of test_file_mmap

    /// checks that DataMap key iteration follows file order and that the
    /// stored vectors round-trip exactly; also exercises check_data_type.
    #[test]
    fn test_mmap_iter() {
        log_init_test();
        // generate a random test
        let mut rng = rand::rng();
        let unif = Uniform::<u32>::new(0, 10000).unwrap();
        // 50 vectors of size 11 u32
        let nbcolumn = 50;
        let nbrow = 11;
        let mut xsi;
        let mut data = Vec::with_capacity(nbcolumn);
        for j in 0..nbcolumn {
            data.push(Vec::with_capacity(nbrow));
            for _ in 0..nbrow {
                xsi = unif.sample(&mut rng);
                data[j].push(xsi);
            }
            debug!("j : {:?}, data : {:?} ", j, &data[j]);
        }
        // define hnsw
        let ef_construct = 25;
        let nb_connection = 10;
        let hnsw = Hnsw::<u32, DistL1>::new(nb_connection, nbcolumn, 16, ef_construct, DistL1 {});
        for (i, d) in data.iter().enumerate() {
            hnsw.insert((d, i));
        }
        // some logging info
        hnsw.dump_layer_info();
        // dump in a file. Must take care of name as tests run in parallel !!!
        let fname = "mmap_order_test";
        let directory = tempfile::tempdir().unwrap();
        let _res = hnsw.file_dump(directory.path(), fname);
        // test reload of hnsw data with mmap
        let datamap: DataMap = DataMap::from_hnswdump::<u32>(directory.path(), fname).unwrap();
        // testing type check
        assert!(datamap.check_data_type::<u32>());
        assert!(!datamap.check_data_type::<f32>());
        info!("Datamap iteration order checking");
        let keys = datamap.get_dataid_iter();
        for (i, dataid) in keys.enumerate() {
            let v = datamap.get_data::<u32>(dataid).unwrap();
            assert_eq!(v, &data[*dataid], "dataid = {}, ukey = {}", dataid, i);
        }
        // The dump lives in `directory` (a TempDir), which removes its files
        // when dropped — the old remove_file calls on cwd paths were dead code.
    }
    //
} // end of mod tests
|
||||
24
vendor/ruvector/scripts/patches/hnsw_rs/src/filter.rs
vendored
Normal file
24
vendor/ruvector/scripts/patches/hnsw_rs/src/filter.rs
vendored
Normal file
@@ -0,0 +1,24 @@
|
||||
//! defines a trait for filtering requests.
|
||||
//! See examples in tests/filtertest.rs
|
||||
|
||||
use crate::prelude::DataId;
|
||||
|
||||
/// Only queries returning true are taken into account along the search
pub trait FilterT {
    /// returns true if `id` must be kept among the search candidates
    fn hnsw_filter(&self, id: &DataId) -> bool;
}
|
||||
|
||||
// A sorted vector of authorized ids can serve as a filter.
// NOTE(review): binary_search requires the vector to be sorted in ascending
// order — an unsorted vector yields arbitrary results; confirm at call sites.
impl FilterT for Vec<usize> {
    fn hnsw_filter(&self, id: &DataId) -> bool {
        self.binary_search(id).is_ok()
    }
}
|
||||
|
||||
// Any closure `Fn(&DataId) -> bool` can be used directly as a filter.
impl<F> FilterT for F
where
    F: Fn(&DataId) -> bool,
{
    fn hnsw_filter(&self, id: &DataId) -> bool {
        self(id)
    }
}
|
||||
200
vendor/ruvector/scripts/patches/hnsw_rs/src/flatten.rs
vendored
Normal file
200
vendor/ruvector/scripts/patches/hnsw_rs/src/flatten.rs
vendored
Normal file
@@ -0,0 +1,200 @@
|
||||
//! This module provides conversion of a Point structure to a FlatPoint containing just the Id of a point
|
||||
//! and those of its neighbours.
|
||||
//! The whole Hnsw structure is then flattened into a Hashtable associating the data ID of a point to
|
||||
//! its corresponding FlatPoint.
|
||||
//! It can be used, for example, when reloading only the graph part of the data to have knowledge
|
||||
//! of relative proximity of points as described just by their DataId
|
||||
//!
|
||||
|
||||
use hashbrown::HashMap;
|
||||
use std::cmp::Ordering;
|
||||
|
||||
use crate::hnsw;
|
||||
use anndists::dist::distances::Distance;
|
||||
use hnsw::*;
|
||||
use log::error;
|
||||
|
||||
// an ordering of Neighbour of a Point

// Equality is defined on the distance field ONLY: two neighbours at the same
// distance compare equal even if they are different points.
impl PartialEq for Neighbour {
    fn eq(&self, other: &Neighbour) -> bool {
        self.distance == other.distance
    } // end eq
}

// NOTE(review): Eq over a float comparison is only sound if distances are
// never NaN; the Ord impl below enforces this by panicking on NaN.
impl Eq for Neighbour {}
|
||||
|
||||
// order points by distance to self.
#[allow(clippy::non_canonical_partial_ord_impl)]
impl PartialOrd for Neighbour {
    fn partial_cmp(&self, other: &Neighbour) -> Option<Ordering> {
        // yields None when either distance is NaN
        self.distance.partial_cmp(&other.distance)
    } // end cmp
} // end impl PartialOrd
|
||||
|
||||
// Total order on the distance field.
//
// # Panics
// Panics if either distance is NaN (partial_cmp would return None).
impl Ord for Neighbour {
    fn cmp(&self, other: &Neighbour) -> Ordering {
        if !self.distance.is_nan() && !other.distance.is_nan() {
            self.distance.partial_cmp(&other.distance).unwrap()
        } else {
            panic!("got a NaN in a distance");
        }
    } // end cmp
}
|
||||
|
||||
/// a reduced version of point inserted in the Hnsw structure.
/// It contains original id of point as submitted to the struct Hnsw
/// an ordered (by distance) list of neighbours to the point
/// and its position in layers.
#[derive(Clone)]
pub struct FlatPoint {
    /// an id coming from client using hnsw, should identify point uniquely
    origin_id: DataId,
    /// a point id identifying point as stored in our structure
    p_id: PointId,
    /// neighbours info, sorted by increasing distance (see flatten_point)
    neighbours: Vec<Neighbour>,
}
|
||||
|
||||
impl FlatPoint {
    /// returns the neighbours ordered by increasing distance.
    pub fn get_neighbours(&self) -> &Vec<Neighbour> {
        &self.neighbours
    }
    /// returns the origin id of the point
    pub fn get_id(&self) -> DataId {
        self.origin_id
    }
    /// returns the internal point id (layer position) of the point
    pub fn get_p_id(&self) -> PointId {
        self.p_id
    }
} // end impl block for FlatPoint
|
||||
|
||||
fn flatten_point<T: Clone + Send + Sync>(point: &Point<T>) -> FlatPoint {
|
||||
let neighbours = point.get_neighborhood_id();
|
||||
// now we flatten neighbours
|
||||
let mut flat_neighbours = Vec::<Neighbour>::new();
|
||||
for layer in neighbours {
|
||||
for neighbour in layer {
|
||||
flat_neighbours.push(neighbour);
|
||||
}
|
||||
}
|
||||
flat_neighbours.sort_unstable();
|
||||
FlatPoint {
|
||||
origin_id: point.get_origin_id(),
|
||||
p_id: point.get_point_id(),
|
||||
neighbours: flat_neighbours,
|
||||
}
|
||||
} // end of flatten_point
|
||||
|
||||
/// A structure providing neighbourhood information of a point stored in the Hnsw structure given its DataId.
/// The structure uses the [FlatPoint] structure.
/// This structure can be obtained by FlatNeighborhood::from<&Hnsw<T,D>>
pub struct FlatNeighborhood {
    /// maps a client DataId to the flattened view of its point
    hash_t: HashMap<DataId, FlatPoint>,
}
|
||||
|
||||
impl FlatNeighborhood {
    /// get neighbour of a point given its id.
    /// The neighbours are sorted in increasing distance from data_id.
    /// Returns None if the id is unknown.
    pub fn get_neighbours(&self, p_id: DataId) -> Option<Vec<Neighbour>> {
        self.hash_t
            .get(&p_id)
            .map(|point| point.get_neighbours().clone())
    }
} // end impl block for FlatNeighborhood
|
||||
|
||||
impl<T: Clone + Send + Sync, D: Distance<T> + Send + Sync> From<&Hnsw<'_, T, D>>
    for FlatNeighborhood
{
    /// extract from the Hnsw structure a hashtable mapping original DataId into a FlatPoint structure gathering its neighbourhood information.
    /// Useful after reloading from a dump with T=NoData and D = NoDist as points are then reloaded with neighbourhood information only.
    fn from(hnsw: &Hnsw<T, D>) -> Self {
        let mut hash_t = HashMap::new();
        let pt_iter = hnsw.get_point_indexation().into_iter();
        //
        for point in pt_iter {
            // origin ids are expected to be unique; a duplicate overwrites
            // the previous entry and is reported below
            let res_insert = hash_t.insert(point.get_origin_id(), flatten_point(&point));
            if let Some(old_point) = res_insert {
                error!("2 points with same origin id {:?}", old_point.origin_id);
            }
        }
        FlatNeighborhood { hash_t }
    }
} // end of From implementation
|
||||
|
||||
#[cfg(test)]
mod tests {

    use super::*;
    use anndists::dist::distances::*;
    use log::debug;

    use crate::api::AnnT;
    use crate::hnswio::*;

    use rand::distr::{Distribution, Uniform};

    // install the test logger once; try_init() tolerates repeated calls
    fn log_init_test() {
        let _ = env_logger::builder().is_test(true).try_init();
    }

    // Builds an index, flattens it, dumps/reloads the graph only (NoData),
    // and checks the flattened neighbourhoods match before and after.
    #[test]
    fn test_dump_reload_graph_flatten() {
        println!("\n\n test_dump_reload_graph_flatten");
        log_init_test();
        // generate a random test
        let mut rng = rand::rng();
        let unif = Uniform::<f32>::new(0., 1.).unwrap();
        // 1000 vectors of size 10 f32
        let nbcolumn = 1000;
        let nbrow = 10;
        let mut xsi;
        let mut data = Vec::with_capacity(nbcolumn);
        for j in 0..nbcolumn {
            data.push(Vec::with_capacity(nbrow));
            for _ in 0..nbrow {
                xsi = unif.sample(&mut rng);
                data[j].push(xsi);
            }
        }
        // define hnsw
        let ef_construct = 25;
        let nb_connection = 10;
        let hnsw = Hnsw::<f32, DistL1>::new(nb_connection, nbcolumn, 16, ef_construct, DistL1 {});
        for (i, d) in data.iter().enumerate() {
            hnsw.insert((d, i));
        }
        // some logging info
        hnsw.dump_layer_info();
        // get flat neighbours of point 2
        let neighborhood_before_dump = FlatNeighborhood::from(&hnsw);
        let nbg_2_before = neighborhood_before_dump.get_neighbours(2).unwrap();
        println!("voisins du point 2 {:?}", nbg_2_before);
        // dump in a file. Must take care of name as tests run in parallel !!!
        let fname = "dumpreloadtestflat";
        let directory = tempfile::tempdir().unwrap();
        let _res = hnsw.file_dump(directory.path(), fname);
        // This will dump in 2 files named dumpreloadtestflat.hnsw.graph and dumpreloadtestflat.hnsw.data
        //
        // reload
        debug!("HNSW reload");
        // we will need a procedural macro to get from distance name to its instantiation.
        // from now on we test with DistL1
        let mut reloader = HnswIo::new(directory.path(), fname);
        let hnsw_loaded: Hnsw<NoData, NoDist> = reloader.load_hnsw().unwrap();
        let neighborhood_after_dump = FlatNeighborhood::from(&hnsw_loaded);
        let nbg_2_after = neighborhood_after_dump.get_neighbours(2).unwrap();
        println!("Neighbors of point 2 {:?}", nbg_2_after);
        // test equality of neighborhood
        assert_eq!(nbg_2_after.len(), nbg_2_before.len());
        for i in 0..nbg_2_before.len() {
            assert_eq!(nbg_2_before[i].p_id, nbg_2_after[i].p_id);
            assert_eq!(nbg_2_before[i].distance, nbg_2_after[i].distance);
        }
        check_graph_equality(&hnsw_loaded, &hnsw);
    } // end of test_dump_reload
} // end module test
|
||||
1872
vendor/ruvector/scripts/patches/hnsw_rs/src/hnsw.rs
vendored
Normal file
1872
vendor/ruvector/scripts/patches/hnsw_rs/src/hnsw.rs
vendored
Normal file
File diff suppressed because it is too large
Load Diff
1703
vendor/ruvector/scripts/patches/hnsw_rs/src/hnswio.rs
vendored
Normal file
1703
vendor/ruvector/scripts/patches/hnsw_rs/src/hnswio.rs
vendored
Normal file
File diff suppressed because it is too large
Load Diff
30
vendor/ruvector/scripts/patches/hnsw_rs/src/lib.rs
vendored
Normal file
30
vendor/ruvector/scripts/patches/hnsw_rs/src/lib.rs
vendored
Normal file
@@ -0,0 +1,30 @@
|
||||
#![cfg_attr(feature = "stdsimd", feature(portable_simd))]
|
||||
//
|
||||
// for logging (debug mostly, switched at compile time in cargo.toml)
|
||||
use env_logger::Builder;
|
||||
|
||||
use lazy_static::lazy_static;
|
||||
|
||||
pub mod api;
|
||||
pub mod datamap;
|
||||
pub mod filter;
|
||||
pub mod flatten;
|
||||
pub mod hnsw;
|
||||
pub mod hnswio;
|
||||
pub mod libext;
|
||||
pub mod prelude;
|
||||
|
||||
// we impose our version of anndists
|
||||
pub use anndists;
|
||||
|
||||
// Lazily-initialized global: the first dereference of LOG installs the
// logger through init_log(); the stored value itself is unused.
lazy_static! {
    static ref LOG: u64 = init_log();
}
|
||||
|
||||
// install a logger facility
// Reads the RUST_LOG environment variable; the returned value is only a
// marker stored in the LOG lazy_static.
#[allow(unused)]
fn init_log() -> u64 {
    Builder::from_default_env().init();
    println!("\n ************** initializing logger *****************\n");
    1
}
|
||||
1240
vendor/ruvector/scripts/patches/hnsw_rs/src/libext.rs
vendored
Normal file
1240
vendor/ruvector/scripts/patches/hnsw_rs/src/libext.rs
vendored
Normal file
File diff suppressed because it is too large
Load Diff
11
vendor/ruvector/scripts/patches/hnsw_rs/src/prelude.rs
vendored
Normal file
11
vendor/ruvector/scripts/patches/hnsw_rs/src/prelude.rs
vendored
Normal file
@@ -0,0 +1,11 @@
|
||||
// gathers modules to include and re-exports all of anndists!
|
||||
|
||||
pub use crate::api::*;
|
||||
pub use crate::hnsw::*;
|
||||
|
||||
#[allow(unused)]
|
||||
pub use crate::filter::*;
|
||||
|
||||
pub use crate::hnswio::*;
|
||||
|
||||
pub use anndists::dist::distances::*;
|
||||
34
vendor/ruvector/scripts/patches/hnsw_rs/tests/deallocation_test.rs
vendored
Normal file
34
vendor/ruvector/scripts/patches/hnsw_rs/tests/deallocation_test.rs
vendored
Normal file
@@ -0,0 +1,34 @@
|
||||
use env_logger::Builder;
|
||||
|
||||
use anndists::dist::DistL1;
|
||||
use hnsw_rs::hnsw::Hnsw;
|
||||
|
||||
// A test program to see if memory from insertions gets deallocated.
|
||||
// This program sets up a process that iteratively builds a new model and lets it go out of scope.
|
||||
// Since the models go out of scope, the desired behavior is that memory consumption is constant while this program is running.
|
||||
fn main() {
|
||||
//
|
||||
Builder::from_default_env().init();
|
||||
//
|
||||
let mut counter: usize = 0;
|
||||
loop {
|
||||
let hnsw: Hnsw<f32, DistL1> = Hnsw::new(15, 100_000, 20, 500_000, DistL1 {});
|
||||
let s1 = [1.0, 0.0, 0.0, 0.0];
|
||||
hnsw.insert_slice((&s1, 0));
|
||||
let s2 = [0.0, 1.0, 1.0];
|
||||
hnsw.insert_slice((&s2, 1));
|
||||
let s3 = [0.0, 0.0, 1.0];
|
||||
hnsw.insert_slice((&s3, 2));
|
||||
let s4 = [1.0, 0.0, 0.0, 1.0];
|
||||
hnsw.insert_slice((&s4, 3));
|
||||
let s5 = [1.0, 1.0, 1.0];
|
||||
hnsw.insert_slice((&s5, 4));
|
||||
let s6 = [1.0, -1.0, 1.0];
|
||||
hnsw.insert_slice((&s6, 5));
|
||||
|
||||
if counter % 1_000_000 == 0 {
|
||||
println!("counter : {}", counter)
|
||||
}
|
||||
counter += 1;
|
||||
}
|
||||
}
|
||||
266
vendor/ruvector/scripts/patches/hnsw_rs/tests/filtertest.rs
vendored
Normal file
266
vendor/ruvector/scripts/patches/hnsw_rs/tests/filtertest.rs
vendored
Normal file
@@ -0,0 +1,266 @@
|
||||
#![allow(clippy::needless_range_loop)]
|
||||
#![allow(clippy::range_zip_with_len)]
|
||||
|
||||
use anndists::dist::*;
|
||||
use hnsw_rs::prelude::*;
|
||||
use rand::{Rng, distr::Uniform};
|
||||
use std::iter;
|
||||
|
||||
// install the test logger once; try_init() tolerates repeated calls
#[allow(unused)]
fn log_init_test() {
    let _ = env_logger::builder().is_test(true).try_init();
}
|
||||
|
||||
// Shows two ways to do filtering, by a sorted vector or with a closure
|
||||
// We define a hnsw-index with 500 entries
|
||||
// Only ids within 300-400 should be in the result-set
|
||||
|
||||
// Used to create a random string
|
||||
fn generate_random_string(len: usize) -> String {
|
||||
const CHARSET: &[u8] = b"abcdefghij";
|
||||
let mut rng = rand::rng();
|
||||
let one_char = || CHARSET[rng.random_range(0..CHARSET.len())] as char;
|
||||
iter::repeat_with(one_char).take(len).collect()
|
||||
}
|
||||
|
||||
// Searches `hns` for `word`, restricting results with a closure built over a
// sorted id vector. NOTE(review): filter_vector must be sorted ascending for
// binary_search to be meaningful — confirm at call sites.
fn search_closure_filter(
    word: &str,
    hns: &Hnsw<u16, DistLevenshtein>,
    words: &[String],
    filter_vector: &[usize],
) {
    // transform string to u16 values
    let vec: Vec<u16> = word.chars().map(|c| c as u16).collect();
    // now create a closure using this filter_vector
    // here we could of course implement more advanced filter logic
    let filter = |id: &usize| -> bool { filter_vector.binary_search(id).is_ok() };

    // Now let us do the search using the defined closure, which in turn uses our vector;
    // ids not in the vector will not be included in the search results
    println!("========== Search with closure filter");
    let ef_search = 30;
    let res = hns.search_possible_filter(&vec, 10, ef_search, Some(&filter));
    for r in res {
        println!(
            "Word: {:?} Id: {:?} Distance: {:?}",
            words[r.d_id], r.d_id, r.distance
        );
    }
}
|
||||
|
||||
// Compares a filtered search in the full index with an unfiltered search in a
// restricted index containing only the ids 300..400, and estimates recall.
#[test]
fn filter_levenstein() {
    let nb_elem = 500000; // number of possible words in the dictionary
    let max_nb_connection = 15;
    let nb_layer = 16.min((nb_elem as f32).ln().trunc() as usize);
    let ef_c = 200;
    let hns = Hnsw::<u16, DistLevenshtein>::new(
        max_nb_connection,
        nb_elem,
        nb_layer,
        ef_c,
        DistLevenshtein {},
    );
    // 999 random 8-letter words
    let mut words = vec![];
    for _n in 1..1000 {
        let tw = generate_random_string(8);
        words.push(tw);
    }

    for (i, w) in words.iter().enumerate() {
        let vec: Vec<u16> = w.chars().map(|c| c as u16).collect();
        hns.insert((&vec, i));
        if i % 1000 == 0 {
            println!("Inserting: {:?}", i);
        }
    }
    // Create a sorted vector of ids;
    // the ids in the vector will be used as a filter
    let filtered_hns = Hnsw::<u16, DistLevenshtein>::new(
        max_nb_connection,
        nb_elem,
        nb_layer,
        ef_c,
        DistLevenshtein {},
    );
    // filter_vector is built in increasing order, so it is sorted as required
    let mut filter_vector: Vec<usize> = Vec::new();
    for i in 300..400 {
        filter_vector.push(i);
        let v: Vec<u16> = words[i].chars().map(|c| c as u16).collect();
        filtered_hns.insert((&v, i));
    }
    //
    let ef_search = 30;
    let tosearch = "abcdefg";
    let knbn = 10;
    let vec_tosearch: Vec<u16> = tosearch.chars().map(|c| c as u16).collect();
    //
    println!("========== Search in full hns with filter");
    let vec_res = hns.search_filter(&vec_tosearch, knbn, ef_search, Some(&filter_vector));
    for r in &vec_res {
        println!(
            "Word: {:?} Id: {:?} Distance: {:?}",
            words[r.d_id], r.d_id, r.distance
        );
    }
    //
    println!("========== Search in restricted_hns but without filter");
    //
    let vec: Vec<u16> = tosearch.chars().map(|c| c as u16).collect();
    let res: Vec<Neighbour> = filtered_hns.search(&vec, knbn, ef_search);
    for r in &res {
        println!(
            "Word: {:?} Id: {:?} Distance: {:?}",
            words[r.d_id], r.d_id, r.distance
        );
    }
    //
    // search with filter
    // first with closure
    println!("========== Search in full hns with closure filter");
    search_closure_filter(tosearch, &hns, &words, &filter_vector);
    //
    // now with vector filter and estimate recall
    //
    println!("========== Search in full hns with vector filter");
    let filter_vec_res = hns.search_filter(&vec_tosearch, knbn, ef_search, Some(&filter_vector));
    for r in &filter_vec_res {
        println!(
            "Word: {:?} Id: {:?} Distance: {:?}",
            words[r.d_id], r.d_id, r.distance
        );
    }
    // how many neighbours in res are in filter_vec_res
    let mut nb_found: usize = 0;
    for n in &res {
        let found = filter_vec_res.iter().find(|&&m| m.d_id == n.d_id);
        if found.is_some() {
            nb_found += 1;
            assert_eq!(n.distance, found.unwrap().distance);
        }
    }
    println!(" recall : {}", nb_found as f32 / res.len() as f32);
    println!(
        " last distances ratio : {} ",
        res.last().unwrap().distance / filter_vec_res.last().unwrap().distance
    );
}
|
||||
|
||||
// A test with random uniform data vectors and L2 distance
|
||||
// We compare a search of a random vector in hnsw structure with a filter to a filtered_hnsw
|
||||
// containing only the data fitting the filter
|
||||
#[test]
fn filter_l2() {
    // Index a cloud of random points, then compare a filtered search over the
    // full index against an unfiltered search over a second index restricted to
    // the filter ids: ids found in both must have (almost) identical distances.
    let nb_elem = 5000;
    let dim = 25;
    // draw nb_elem random column vectors of dimension dim, uniform in [0,1)
    let mut rng = rand::rng();
    let unif = Uniform::<f32>::new(0., 1.).unwrap();
    let mut data = Vec::with_capacity(nb_elem);
    while data.len() < nb_elem {
        data.push((0..dim).map(|_| rng.sample(unif)).collect::<Vec<f32>>());
    }
    // pair every vector with its index, used as its data id
    let data_with_id: Vec<_> = data.iter().zip(0..data.len()).collect();

    let ef_c = 200;
    let max_nb_connection = 15;
    let nb_layer = 16.min((nb_elem as f32).ln().trunc() as usize);
    let hnsw = Hnsw::<f32, DistL2>::new(max_nb_connection, nb_elem, nb_layer, ef_c, DistL2 {});
    hnsw.parallel_insert(&data_with_id);

    let ef_search = 30;
    let knbn = 10;
    let vec_tosearch: Vec<f32> = (0..dim).map(|_| rng.sample(unif)).collect();

    // Ids 300..400 play a double role : they form the (sorted) filter vector,
    // and they are the only points inserted in the restricted index.
    let filtered_hns =
        Hnsw::<f32, DistL2>::new(max_nb_connection, nb_elem, nb_layer, ef_c, DistL2 {});
    let mut filter_vector: Vec<usize> = Vec::new();
    for id in 300..400 {
        filter_vector.push(id);
        filtered_hns.insert((&data[id], id));
    }

    println!("========== Search in full hnsw with filter");
    let filter_vec_res = hnsw.search_filter(&vec_tosearch, knbn, ef_search, Some(&filter_vector));
    filter_vec_res
        .iter()
        .for_each(|r| println!("Id: {:?} Distance: {:?}", r.d_id, r.distance));

    println!("========== Search in restricted_hns but without filter");
    let res: Vec<Neighbour> = filtered_hns.search(&vec_tosearch, knbn, ef_search);
    res.iter()
        .for_each(|r| println!("Id: {:?} Distance: {:?}", r.d_id, r.distance));

    // Count how many neighbours of the restricted search also show up in the
    // filtered search, checking that their distances agree up to rounding.
    let mut nb_found: usize = 0;
    for neighbour in &res {
        if let Some(twin) = filter_vec_res.iter().find(|&&m| m.d_id == neighbour.d_id) {
            nb_found += 1;
            assert!((1. - neighbour.distance / twin.distance).abs() < 1.0e-5);
        }
    }
    println!(" recall : {}", nb_found as f32 / res.len() as f32);
    println!(
        " last distances ratio : {} ",
        res.last().unwrap().distance / filter_vec_res.last().unwrap().distance
    );
} // end of filter_l2
|
||||
|
||||
//
|
||||
|
||||
use std::collections::HashMap;
|
||||
#[test]
fn filter_villsnow() {
    // Exercises search_filter on a regular 2-d grid of points and checks that
    // the filter is actually honoured (presumably named after the reporter of
    // the original issue — TODO confirm).
    println!("\n\n in test villsnow");
    // test logger setup — helper presumably defined elsewhere in this file
    log_init_test();
    //
    let grid_size = 100;
    let mut hnsw = Hnsw::<f64, DistL2>::new(4, grid_size * grid_size, 16, 100, DistL2::default());
    // side map id -> coordinates so the filter closures can look points up
    let mut points = HashMap::new();

    {
        // insert the centers of the grid_size x grid_size cells of the unit square
        for (id, (i, j)) in itertools::iproduct!(0..grid_size, 0..grid_size,).enumerate() {
            let data = [
                (i as f64 + 0.5) / grid_size as f64,
                (j as f64 + 0.5) / grid_size as f64,
            ];
            hnsw.insert((&data, id));
            points.insert(id, data);
        }

        hnsw.set_searching_mode(true);
    }
    {
        println!("first case");
        // first case : filter keeps only points very close to the corner (1,1);
        // exactly one grid point satisfies it
        let filter = |id: &usize| DistL2::default().eval(&points[id], &[1.0, 1.0]) < 1e-2;
        dbg!(points.keys().filter(|x| filter(x)).count()); // -> 1

        // search from the opposite corner : any hit returned must pass the filter
        let hit = hnsw.search_filter(&[0.0, 0.0], 10, 4, Some(&filter));
        if !hit.is_empty() {
            log::info!("got point : {:?}", points.get(&hit[0].d_id));
            log::info!("got {:?}, must be true", filter(&hit[0].d_id)); // -> sometimes false
        } else {
            log::info!("found no point");
        }
        assert!(hit.len() <= 1);
    }
    {
        println!("second case");
        // second case : a filter rejecting everything must return no neighbour at all
        let filter = |_id: &usize| false;
        dbg!(points.keys().filter(|x| filter(x)).count()); // -> 0, obviously

        let hit = hnsw.search_filter(&[0.0, 0.0], 10, 64, Some(&filter));
        println!("villsnow , {:?}", hit.len());
        log::info!("got {:?}, must be 0", hit.len()); // -> 1
        assert_eq!(hit.len(), 0);
    }
}
|
||||
328
vendor/ruvector/scripts/patches/hnsw_rs/tests/serpar.rs
vendored
Normal file
328
vendor/ruvector/scripts/patches/hnsw_rs/tests/serpar.rs
vendored
Normal file
@@ -0,0 +1,328 @@
|
||||
#![allow(clippy::range_zip_with_len)]
|
||||
|
||||
//! some testing utilities.
|
||||
//! to get output statistics, run : cargo test --release -- --nocapture --test test_parallel.
|
||||
//! serial test corresponds to random-10nn-euclidean(k=10)
|
||||
//! parallel test corresponds to random data in 25 dimensions k = 10, dist Cosine
|
||||
|
||||
use rand::distr::Uniform;
|
||||
use rand::prelude::*;
|
||||
|
||||
use skiplist::OrderedSkipList;
|
||||
|
||||
use anndists::dist;
|
||||
use hnsw_rs::prelude::*;
|
||||
use serde::{de::DeserializeOwned, Serialize};
|
||||
|
||||
pub fn gen_random_vector_f32(nbrow: usize) -> Vec<f32> {
|
||||
let mut rng = rand::rng();
|
||||
let unif = Uniform::<f32>::new(0., 1.).unwrap();
|
||||
(0..nbrow).map(|_| rng.sample(unif)).collect::<Vec<f32>>()
|
||||
}
|
||||
|
||||
/// return nbcolumn vectors of dimension nbrow
|
||||
pub fn gen_random_matrix_f32(nbrow: usize, nbcolumn: usize) -> Vec<Vec<f32>> {
|
||||
let mut rng = rand::rng();
|
||||
let unif = Uniform::<f32>::new(0., 1.).unwrap();
|
||||
let mut data = Vec::with_capacity(nbcolumn);
|
||||
for _ in 0..nbcolumn {
|
||||
let column = (0..nbrow).map(|_| rng.sample(unif)).collect::<Vec<f32>>();
|
||||
data.push(column);
|
||||
}
|
||||
data
|
||||
}
|
||||
|
||||
fn brute_force_neighbours<T: Serialize + DeserializeOwned + Copy + Send + Sync>(
|
||||
nb_neighbours: usize,
|
||||
refdata: &PointIndexation<T>,
|
||||
distance: PointDistance<T>,
|
||||
data: &[T],
|
||||
) -> OrderedSkipList<PointIdWithOrder> {
|
||||
let mut neighbours = OrderedSkipList::<PointIdWithOrder>::with_capacity(refdata.get_nb_point());
|
||||
|
||||
let mut ptiter = refdata.into_iter();
|
||||
let mut more = true;
|
||||
while more {
|
||||
if let Some(point) = ptiter.next() {
|
||||
let dist_p = distance.eval(data, point.get_v());
|
||||
let ordered_point = PointIdWithOrder::new(point.get_point_id(), dist_p);
|
||||
// log::debug!(" brute force inserting {:?}", ordered_point);
|
||||
if neighbours.len() < nb_neighbours {
|
||||
neighbours.insert(ordered_point);
|
||||
} else {
|
||||
neighbours.insert(ordered_point);
|
||||
neighbours.pop_back();
|
||||
}
|
||||
} else {
|
||||
more = false;
|
||||
}
|
||||
} // end while
|
||||
neighbours
|
||||
} // end of brute_force_2
|
||||
|
||||
//================================================================================================
|
||||
|
||||
mod tests {
|
||||
use cpu_time::ProcessTime;
|
||||
use std::time::Duration;
|
||||
|
||||
use super::*;
|
||||
use dist::l2_normalize;
|
||||
|
||||
#[test]
fn test_serial() {
    // Inserts random data in an L1 hnsw (parallel or serial insertion, chosen
    // by the `parallel` flag) then measures recall of hnsw search against
    // brute force over nbtest random queries.
    let nb_elem = 1000;
    let dim = 10;
    let knbn = 10;
    let ef = 20;
    let parallel = true;
    //
    println!("\n\n test_serial nb_elem {:?}", nb_elem);
    //
    let data = gen_random_matrix_f32(dim, nb_elem);
    let data_with_id = data.iter().zip(0..data.len()).collect::<Vec<_>>();

    let ef_c = 400;
    let max_nb_connection = 32;
    let nb_layer = 16.min((nb_elem as f32).ln().trunc() as usize);
    let mut hns = Hnsw::<f32, dist::DistL1>::new(
        max_nb_connection,
        nb_elem,
        nb_layer,
        ef_c,
        dist::DistL1 {},
    );
    hns.set_extend_candidates(true);
    hns.set_keeping_pruned(true);
    let mut start = ProcessTime::now();
    if parallel {
        println!("parallel insertion");
        hns.parallel_insert(&data_with_id);
    } else {
        println!("serial insertion");
        for (i, d) in data.iter().enumerate() {
            hns.insert((d, i));
        }
    }
    let mut cpu_time: Duration = start.elapsed();
    println!(" hnsw serial data insertion {:?}", cpu_time);
    hns.dump_layer_info();
    println!(" hnsw data nb point inserted {:?}", hns.get_nb_point());
    //
    let nbtest = 300;
    let mut recalls = Vec::<usize>::with_capacity(nbtest);
    // fixed : capacity was nb_elem although only nbtest values are pushed
    let mut nb_returned = Vec::<usize>::with_capacity(nbtest);
    let mut search_times = Vec::<f32>::with_capacity(nbtest);
    // rng and distribution are loop invariant : create them once instead of
    // once per query
    let mut rng = rand::rng();
    let unif = Uniform::<f32>::new(0., 1.).unwrap();
    for _itest in 0..nbtest {
        // draw a random query vector
        let mut r_vec = Vec::<f32>::with_capacity(dim);
        for _ in 0..dim {
            r_vec.push(rng.sample(unif));
        }
        // ground truth by exhaustive scan
        start = ProcessTime::now();
        let brute_neighbours = brute_force_neighbours(
            knbn,
            hns.get_point_indexation(),
            Box::new(dist::DistL1 {}),
            &r_vec,
        );
        cpu_time = start.elapsed();
        if nbtest <= 100 {
            println!("\n\n **************** test {:?}", _itest);
            println!("\n brute force neighbours :");
            println!("======================");
            println!(" brute force computing {:?} \n ", cpu_time);
            for i in 0..brute_neighbours.len() {
                let p = brute_neighbours[i].point_id;
                println!(" {:?} {:?} ", p, brute_neighbours[i].dist_to_ref);
            }
        }
        //
        hns.set_searching_mode(true);
        start = ProcessTime::now();
        let knn_neighbours = hns.search(&r_vec, knbn, ef);
        cpu_time = start.elapsed();
        search_times.push(cpu_time.as_micros() as f32);
        if nbtest <= 100 {
            println!("\n\n hnsw searching {:?} \n", cpu_time);
            println!("\n knn neighbours");
            println!("======================");
            for n in &knn_neighbours {
                println!(" {:?} {:?} {:?} ", n.d_id, n.p_id, n.distance);
            }
        }
        // recall : count returned distances not exceeding the knbn-th brute
        // force distance
        let knn_neighbours_dist: Vec<f32> = knn_neighbours.iter().map(|p| p.distance).collect();
        let max_dist = brute_neighbours[knbn - 1].dist_to_ref;
        let recall = knn_neighbours_dist
            .iter()
            .filter(|d| *d <= &max_dist)
            .count();
        if nbtest <= 100 {
            println!("recall {:?}", (recall as f32) / (knbn as f32));
        }
        recalls.push(recall);
        nb_returned.push(knn_neighbours.len());
    } // end on nbtest
    //
    // global statistics over the nbtest queries
    //
    let mean_recall = (recalls.iter().sum::<usize>() as f32) / ((knbn * recalls.len()) as f32);
    let mean_search_time = (search_times.iter().sum::<f32>()) / (search_times.len() as f32);
    println!(
        "\n mean fraction (of knbn) returned by search {:?} ",
        (nb_returned.iter().sum::<usize>() as f32) / ((nb_returned.len() * knbn) as f32)
    );
    println!(
        "\n nb element {:?} nb search : {:?} recall rate is {:?} search time inverse {:?} ",
        nb_elem,
        nbtest,
        mean_recall,
        1.0e+6_f32 / mean_search_time
    );
} // end test1
|
||||
|
||||
#[test]
fn test_parallel() {
    // Inserts l2-normalized random data with parallel_insert (DistDot on
    // normalized vectors ~ cosine) and measures recall of hnsw search against
    // brute force, both by distance balls and by point ids.
    let nb_elem = 1000;
    let dim = 25;
    let knbn = 10;
    let ef_c = 800;
    let max_nb_connection = 48;
    let ef = 20;
    //
    let mut data = gen_random_matrix_f32(dim, nb_elem);
    // normalize so that DistDot behaves as a cosine distance
    for v in &mut data {
        l2_normalize(v);
    }
    let data_with_id = data.iter().zip(0..data.len()).collect::<Vec<_>>();
    let nb_layer = 16.min((nb_elem as f32).ln().trunc() as usize);
    let mut hns = Hnsw::<f32, dist::DistDot>::new(
        max_nb_connection,
        nb_elem,
        nb_layer,
        ef_c,
        dist::DistDot {},
    );
    // !
    // hns.set_extend_candidates(true);
    let mut start = ProcessTime::now();
    let now = std::time::SystemTime::now();
    // parallel insertion
    hns.parallel_insert(&data_with_id);
    let mut cpu_time: Duration = start.elapsed();
    println!(
        "\n hnsw data parallel insertion cpu time {:?} , system time {:?}",
        cpu_time,
        now.elapsed()
    );
    // one serial insertion more, to check mixing serial and parallel insertion
    let mut v = gen_random_vector_f32(dim);
    l2_normalize(&mut v);
    hns.insert((&v, hns.get_nb_point() + 1));
    //
    hns.dump_layer_info();
    println!(" hnsw data nb point inserted {:?}", hns.get_nb_point());
    //
    println!("\n hnsw testing requests ...");
    let nbtest = 100;
    let mut recalls = Vec::<usize>::with_capacity(nbtest);
    let mut recalls_id = Vec::<usize>::with_capacity(nbtest);
    let mut search_times = Vec::<f32>::with_capacity(nbtest);
    // rng and distribution are loop invariant : create them once instead of
    // once per query
    let mut rng = rand::rng();
    let unif = Uniform::<f32>::new(0., 1.).unwrap();
    for _itest in 0..nbtest {
        // draw and normalize a random query
        let mut r_vec = Vec::<f32>::with_capacity(dim);
        for _ in 0..dim {
            r_vec.push(rng.sample(unif));
        }
        l2_normalize(&mut r_vec);

        start = ProcessTime::now();
        let brute_neighbours = brute_force_neighbours(
            knbn,
            hns.get_point_indexation(),
            Box::new(dist::DistDot),
            &r_vec,
        );
        cpu_time = start.elapsed();
        if nbtest <= 100 {
            println!("\n\n test_par nb_elem {:?}", nb_elem);
            println!("\n brute force neighbours :");
            println!("======================");
            println!(" brute force computing {:?} \n", cpu_time);
            for i in 0..brute_neighbours.len() {
                println!(
                    " {:?} {:?} ",
                    brute_neighbours[i].point_id, brute_neighbours[i].dist_to_ref
                );
            }
        }
        // (a redundant shadowing `let knbn = 10;` rebinding the same value was
        // removed here)
        hns.set_searching_mode(true);
        start = ProcessTime::now();
        let knn_neighbours = hns.search(&r_vec, knbn, ef);
        cpu_time = start.elapsed();
        search_times.push(cpu_time.as_micros() as f32);
        if nbtest <= 100 {
            println!("\n knn neighbours");
            println!("======================");
            println!(" hnsw searching {:?} \n", cpu_time);
            for n in &knn_neighbours {
                println!(" {:?} \t {:?} \t {:?}", n.d_id, n.p_id, n.distance);
            }
        }
        // recall with distance balls : returned distances not exceeding the
        // knbn-th brute force distance
        let knn_neighbours_dist: Vec<f32> = knn_neighbours.iter().map(|p| p.distance).collect();
        let max_dist = brute_neighbours[knbn - 1].dist_to_ref;
        let recall = knn_neighbours_dist
            .iter()
            .filter(|d| *d <= &max_dist)
            .count();
        if nbtest <= 100 {
            println!("recall {:?}", (recall as f32) / (knbn as f32));
        }
        recalls.push(recall);
        // recall with point ids : how many of the brute force ids the hnsw
        // search actually returned
        let mut recall_id = 0;
        let mut knn_neighbours_id: Vec<PointId> = knn_neighbours.iter().map(|p| p.p_id).collect();
        knn_neighbours_id.sort_unstable();
        let snbn = knbn.min(brute_neighbours.len());
        for j in 0..snbn {
            let to_search = brute_neighbours[j].point_id;
            if knn_neighbours_id.binary_search(&to_search).is_ok() {
                recall_id += 1;
            }
        }
        recalls_id.push(recall_id);
    } // end on nbtest
    //
    // global statistics over the nbtest queries
    //
    let mean_recall = (recalls.iter().sum::<usize>() as f32) / ((knbn * recalls.len()) as f32);
    let mean_search_time = (search_times.iter().sum::<f32>()) / (search_times.len() as f32);
    println!(
        "\n nb search {:?} recall rate is {:?} search time inverse {:?} ",
        nbtest,
        mean_recall,
        1.0e+6_f32 / mean_search_time
    );
    // fixed : the id based recall was erroneously averaged over `recalls`
    // (distance based) instead of `recalls_id`, leaving recalls_id unused
    let mean_recall_id =
        (recalls_id.iter().sum::<usize>() as f32) / ((knbn * recalls_id.len()) as f32);
    println!("mean recall rate with point ids {:?}", mean_recall_id);
    //
    // assert!(1==0);
} // end test_par
|
||||
}
|
||||
Reference in New Issue
Block a user