Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'

This commit is contained in:
ruv
2026-02-28 14:39:40 -05:00
7854 changed files with 3522914 additions and 0 deletions

View File

@@ -0,0 +1,371 @@
//! Binary Quantization
//!
//! Compresses vectors to 1 bit per dimension, achieving 32x memory reduction.
//! Uses Hamming distance for fast comparison.
/// Quantize an f32 vector to a packed binary representation (1 bit per dimension).
///
/// A dimension maps to bit 1 when its value is strictly positive and to
/// bit 0 otherwise; dimension `i` lives at bit `i % 8` of byte `i / 8`.
pub fn quantize(vector: &[f32]) -> Vec<u8> {
    let mut packed = vec![0u8; (vector.len() + 7) / 8];
    for (dim, &value) in vector.iter().enumerate() {
        if value > 0.0 {
            packed[dim / 8] |= 1 << (dim % 8);
        }
    }
    packed
}
/// Quantize an f32 vector to packed bits using a custom threshold.
///
/// A dimension maps to bit 1 when its value is strictly greater than
/// `threshold`; the bit layout matches `quantize` (bit `i % 8` of byte `i / 8`).
pub fn quantize_with_threshold(vector: &[f32], threshold: f32) -> Vec<u8> {
    let byte_count = (vector.len() + 7) / 8;
    vector
        .iter()
        .enumerate()
        .filter(|&(_, &v)| v > threshold)
        .fold(vec![0u8; byte_count], |mut bits, (i, _)| {
            bits[i / 8] |= 1 << (i % 8);
            bits
        })
}
/// Calculate the Hamming distance (number of differing bits) between two
/// equal-length binary vectors. Portable scalar reference implementation.
pub fn hamming_distance(a: &[u8], b: &[u8]) -> u32 {
    debug_assert_eq!(a.len(), b.len());
    a.iter()
        .zip(b.iter())
        .map(|(&x, &y)| (x ^ y).count_ones())
        .sum()
}
/// POPCNT-optimized Hamming distance, processing 8 bytes per iteration.
///
/// # Safety
///
/// The caller must guarantee that the `popcnt` CPU feature is available
/// and that `a.len() == b.len()` (the loop reads `a.len()` bytes of both).
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "popcnt")]
unsafe fn hamming_distance_popcnt(a: &[u8], b: &[u8]) -> u32 {
    use std::arch::x86_64::*;
    let n = a.len();
    let mut count = 0u32;
    // Process 8 bytes at a time.
    let chunks = n / 8;
    for i in 0..chunks {
        let offset = i * 8;
        // SAFETY: offset + 8 <= n for both slices. `read_unaligned` is used
        // because slice data carries no u64 alignment guarantee; the previous
        // `*(ptr as *const u64)` dereference was undefined behavior whenever
        // the pointer was not 8-byte aligned.
        let va = std::ptr::read_unaligned(a.as_ptr().add(offset) as *const u64);
        let vb = std::ptr::read_unaligned(b.as_ptr().add(offset) as *const u64);
        count += _popcnt64((va ^ vb) as i64) as u32;
    }
    // Handle the tail bytes that do not fill a whole u64.
    for i in (chunks * 8)..n {
        count += (a[i] ^ b[i]).count_ones();
    }
    count
}
/// AVX2-optimized Hamming distance using vpshufb popcount
///
/// Uses a nibble lookup table (`_mm256_shuffle_epi8`) to popcount 32 bytes
/// per iteration, accumulating through `_mm256_sad_epu8` into 64-bit lanes.
/// Roughly 4x faster than scalar POPCNT for large vectors (1024+ dims).
///
/// # Safety
///
/// The caller must guarantee that the `avx2` CPU feature is available
/// and that `a.len() == b.len()` (the loop reads `a.len()` bytes of both).
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn hamming_distance_avx2(a: &[u8], b: &[u8]) -> u32 {
    use std::arch::x86_64::*;
    let n = a.len();
    // Lookup table for popcount of 4-bit values (repeated for both 128-bit lanes)
    let lookup = _mm256_setr_epi8(
        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3,
        3, 4,
    );
    let low_mask = _mm256_set1_epi8(0x0F);
    let mut total = _mm256_setzero_si256();
    // Process 32 bytes at a time
    let chunks = n / 32;
    for i in 0..chunks {
        let offset = i * 32;
        // loadu imposes no alignment requirement on the source pointers.
        let va = _mm256_loadu_si256(a.as_ptr().add(offset) as *const __m256i);
        let vb = _mm256_loadu_si256(b.as_ptr().add(offset) as *const __m256i);
        // XOR the vectors
        let xor = _mm256_xor_si256(va, vb);
        // Split into low and high nibbles
        let lo = _mm256_and_si256(xor, low_mask);
        let hi = _mm256_and_si256(_mm256_srli_epi16(xor, 4), low_mask);
        // Lookup popcount for each nibble
        let popcnt_lo = _mm256_shuffle_epi8(lookup, lo);
        let popcnt_hi = _mm256_shuffle_epi8(lookup, hi);
        // Sum nibble popcounts (max 8 per byte, no overflow)
        let popcnt = _mm256_add_epi8(popcnt_lo, popcnt_hi);
        // Accumulate using sad (sum of absolute differences from zero)
        let sad = _mm256_sad_epu8(popcnt, _mm256_setzero_si256());
        total = _mm256_add_epi64(total, sad);
    }
    // Horizontal sum of the 4 64-bit lanes
    let sum128_lo = _mm256_castsi256_si128(total);
    let sum128_hi = _mm256_extracti128_si256(total, 1);
    let sum128 = _mm_add_epi64(sum128_lo, sum128_hi);
    let sum64 = _mm_add_epi64(sum128, _mm_srli_si128(sum128, 8));
    let mut count = _mm_cvtsi128_si64(sum64) as u32;
    // Handle remainder with scalar popcount
    for i in (chunks * 32)..n {
        count += (a[i] ^ b[i]).count_ones();
    }
    count
}
/// Calculate Hamming distance with SIMD optimization
///
/// Automatically selects the best implementation:
/// - AVX2 vpshufb for large vectors (>= 128 bytes / 1024 bits)
/// - POPCNT for medium vectors
/// - Scalar fallback otherwise
///
/// # Panics
///
/// Panics when `a` and `b` differ in length: the unsafe SIMD kernels read
/// `a.len()` bytes from BOTH slices, so a mismatch must fail loudly here
/// instead of becoming an out-of-bounds read.
pub fn hamming_distance_simd(a: &[u8], b: &[u8]) -> u32 {
    assert_eq!(a.len(), b.len(), "binary vectors must have equal length");
    #[cfg(target_arch = "x86_64")]
    {
        let n = a.len();
        // For large vectors, the AVX2 nibble-popcount kernel is fastest.
        if n >= 128 && is_x86_feature_detected!("avx2") {
            // SAFETY: avx2 support was just verified; lengths are equal.
            return unsafe { hamming_distance_avx2(a, b) };
        }
        // For medium vectors, use POPCNT over u64 words.
        if is_x86_feature_detected!("popcnt") {
            // SAFETY: popcnt support was just verified; lengths are equal.
            return unsafe { hamming_distance_popcnt(a, b) };
        }
    }
    hamming_distance(a, b)
}
/// Normalize Hamming distance to the [0, 1] range by dividing by the
/// number of dimensions.
///
/// Returns 0.0 when `dimensions` is 0 instead of producing NaN from a
/// zero division.
pub fn normalized_hamming_distance(a: &[u8], b: &[u8], dimensions: usize) -> f32 {
    if dimensions == 0 {
        return 0.0;
    }
    let dist = hamming_distance_simd(a, b);
    dist as f32 / dimensions as f32
}
/// Binary quantized vector: 1 bit per original f32 dimension, with
/// dimension `i` packed at bit `i % 8` of byte `i / 8`.
#[derive(Debug, Clone)]
pub struct BinaryQuantizedVector {
    /// Packed bits, `ceil(dimensions / 8)` bytes long.
    pub data: Vec<u8>,
    /// Number of logical dimensions in the original f32 vector.
    pub dimensions: usize,
}
impl BinaryQuantizedVector {
    /// Build a binary vector from an f32 vector (positive values become 1-bits).
    pub fn from_f32(vector: &[f32]) -> Self {
        let data = quantize(vector);
        let dimensions = vector.len();
        Self { data, dimensions }
    }
    /// Build a binary vector, setting bits for values strictly above `threshold`.
    pub fn from_f32_threshold(vector: &[f32], threshold: f32) -> Self {
        let data = quantize_with_threshold(vector, threshold);
        let dimensions = vector.len();
        Self { data, dimensions }
    }
    /// Number of differing bits between `self` and `other`.
    pub fn hamming_distance(&self, other: &Self) -> u32 {
        debug_assert_eq!(self.dimensions, other.dimensions);
        hamming_distance_simd(&self.data, &other.data)
    }
    /// Hamming distance divided by the dimension count, in [0, 1].
    pub fn normalized_distance(&self, other: &Self) -> f32 {
        let raw = self.hamming_distance(other);
        raw as f32 / self.dimensions as f32
    }
    /// Approximate memory footprint: inline struct size plus heap-stored bits.
    pub fn memory_size(&self) -> usize {
        self.data.len() + std::mem::size_of::<Self>()
    }
    /// Per-dimension compression ratio versus f32 storage (32 bits -> 1 bit).
    pub fn compression_ratio(&self) -> f32 {
        32.0
    }
    /// Read back the bit stored for dimension `pos`.
    pub fn get_bit(&self, pos: usize) -> bool {
        debug_assert!(pos < self.dimensions);
        self.data[pos / 8] & (1 << (pos % 8)) != 0
    }
    /// Total number of 1-bits across the packed data.
    pub fn popcount(&self) -> u32 {
        self.data.iter().map(|byte| byte.count_ones()).sum()
    }
}
/// Two-stage search with binary quantization
///
/// 1. Fast Hamming distance filtering using binary vectors
/// 2. Rerank top candidates with full precision (Euclidean) distance
pub struct BinarySearcher {
    /// Binary quantized vectors, one per original vector, same order
    binary_vectors: Vec<BinaryQuantizedVector>,
    /// Original vectors kept for the exact-distance reranking stage
    original_vectors: Vec<Vec<f32>>,
    /// Rerank factor: stage 2 reranks the top `k * rerank_factor` candidates
    rerank_factor: usize,
}
impl BinarySearcher {
    /// Create a new binary searcher over `vectors`.
    ///
    /// `rerank_factor` controls recall: stage 1 keeps `k * rerank_factor`
    /// Hamming-distance candidates for exact reranking in stage 2.
    pub fn new(vectors: Vec<Vec<f32>>, rerank_factor: usize) -> Self {
        let binary_vectors: Vec<_> = vectors
            .iter()
            .map(|v| BinaryQuantizedVector::from_f32(v))
            .collect();
        Self {
            binary_vectors,
            original_vectors: vectors,
            rerank_factor,
        }
    }
    /// Search for the `k` nearest neighbors of `query`.
    ///
    /// Returns `(index, euclidean_distance)` pairs sorted by ascending
    /// distance; at most `k` results are returned.
    pub fn search(&self, query: &[f32], k: usize) -> Vec<(usize, f32)> {
        let query_binary = BinaryQuantizedVector::from_f32(query);
        // Stage 1: cheap Hamming-distance scan over all binary vectors.
        let mut candidates: Vec<(usize, u32)> = self
            .binary_vectors
            .iter()
            .enumerate()
            .map(|(i, bv)| (i, query_binary.hamming_distance(bv)))
            .collect();
        candidates.sort_unstable_by_key(|&(_, d)| d);
        // Keep the top k * rerank_factor candidates; saturating_mul avoids
        // overflow for pathological k / factor combinations.
        let n_candidates = k.saturating_mul(self.rerank_factor).min(candidates.len());
        // Stage 2: rerank the survivors with full-precision Euclidean distance.
        let mut reranked: Vec<(usize, f32)> = candidates[..n_candidates]
            .iter()
            .map(|&(i, _)| {
                let dist = query
                    .iter()
                    .zip(self.original_vectors[i].iter())
                    .map(|(a, b)| (a - b).powi(2))
                    .sum::<f32>()
                    .sqrt();
                (i, dist)
            })
            .collect();
        // total_cmp is NaN-safe: the previous partial_cmp().unwrap() would
        // panic if any stored vector or the query contained NaN.
        reranked.sort_unstable_by(|a, b| a.1.total_cmp(&b.1));
        reranked.truncate(k);
        reranked
    }
}
// ============================================================================
// Tests
// ============================================================================
#[cfg(test)]
mod tests {
    //! Unit tests for binary quantization, Hamming distance, and two-stage search.
    use super::*;
    /// Positive values set bits, non-positive clear them; bit i%8 of byte i/8.
    #[test]
    fn test_quantize() {
        let v = vec![0.5, -0.3, 0.1, -0.8, 0.2, -0.1, 0.9, -0.5];
        let q = quantize(&v);
        assert_eq!(q.len(), 1);
        // Bits: 1, 0, 1, 0, 1, 0, 1, 0 = 0b01010101 = 85
        assert_eq!(q[0], 0b01010101);
    }
    /// Hamming distance counts differing bits via XOR + popcount.
    #[test]
    fn test_hamming_distance() {
        let a = vec![0b11110000];
        let b = vec![0b10101010];
        // XOR: 0b01011010, popcount = 4
        assert_eq!(hamming_distance(&a, &b), 4);
    }
    /// Binary quantization stores 1 bit per dimension (32x smaller than f32).
    #[test]
    fn test_compression_ratio() {
        let v = BinaryQuantizedVector::from_f32(&vec![0.0; 1024]);
        assert_eq!(v.compression_ratio(), 32.0);
        assert_eq!(v.data.len(), 128); // 1024 bits = 128 bytes
    }
    /// The SIMD dispatcher must agree with the scalar reference implementation.
    #[test]
    fn test_simd_matches_scalar() {
        let a: Vec<u8> = (0..128).collect();
        let b: Vec<u8> = (0..128).map(|i| 255 - i).collect();
        let scalar = hamming_distance(&a, &b);
        let simd = hamming_distance_simd(&a, &b);
        assert_eq!(scalar, simd);
    }
    /// Two-stage search returns k results ordered by ascending distance.
    #[test]
    fn test_binary_searcher() {
        let vectors: Vec<Vec<f32>> = (0..100)
            .map(|i| vec![i as f32 * 0.1, (100 - i) as f32 * 0.1, 0.5])
            .collect();
        let searcher = BinarySearcher::new(vectors.clone(), 4);
        let query = vec![5.0, 5.0, 0.5];
        let results = searcher.search(&query, 5);
        assert_eq!(results.len(), 5);
        // Results should be ordered by distance
        for i in 1..results.len() {
            assert!(results[i].1 >= results[i - 1].1);
        }
    }
    /// get_bit reads back individual quantized bits.
    #[test]
    fn test_get_bit() {
        let v = vec![1.0, -1.0, 1.0, -1.0];
        let bv = BinaryQuantizedVector::from_f32(&v);
        assert!(bv.get_bit(0));
        assert!(!bv.get_bit(1));
        assert!(bv.get_bit(2));
        assert!(!bv.get_bit(3));
    }
}

View File

@@ -0,0 +1,63 @@
//! Vector quantization for memory reduction
//!
//! Provides various quantization methods:
//! - Scalar (SQ8): 4x compression
//! - Product (PQ): 8-32x compression
//! - Binary: 32x compression
pub mod binary;
pub mod product;
pub mod scalar;
use std::sync::atomic::{AtomicUsize, Ordering};
/// Global quantization table memory tracking (bytes), shared process-wide.
static TABLE_MEMORY_BYTES: AtomicUsize = AtomicUsize::new(0);
/// Get quantization table memory in MB
pub fn get_table_memory_mb() -> f64 {
    let bytes = TABLE_MEMORY_BYTES.load(Ordering::Relaxed);
    bytes as f64 / (1024.0 * 1024.0)
}
/// Record that `bytes` of quantization-table memory were allocated.
pub fn track_table_allocation(bytes: usize) {
    TABLE_MEMORY_BYTES.fetch_add(bytes, Ordering::Relaxed);
}
/// Quantization type
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum QuantizationType {
    /// No quantization (full precision)
    None,
    /// Scalar quantization (f32 -> i8)
    Scalar,
    /// Product quantization (subspace division)
    Product,
    /// Binary quantization (f32 -> 1 bit)
    Binary,
}
impl std::fmt::Display for QuantizationType {
    /// Renders the canonical short name: "none", "sq8", "pq", or "binary".
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let name = match self {
            QuantizationType::None => "none",
            QuantizationType::Scalar => "sq8",
            QuantizationType::Product => "pq",
            QuantizationType::Binary => "binary",
        };
        f.write_str(name)
    }
}
impl std::str::FromStr for QuantizationType {
    type Err = String;
    /// Parses a case-insensitive quantization name, accepting several aliases
    /// per type; the empty string maps to `None`.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let lowered = s.to_lowercase();
        match lowered.as_str() {
            "none" | "" => Ok(QuantizationType::None),
            "scalar" | "sq8" | "sq" => Ok(QuantizationType::Scalar),
            "product" | "pq" => Ok(QuantizationType::Product),
            "binary" | "bq" => Ok(QuantizationType::Binary),
            _ => Err(format!("Unknown quantization type: {}", s)),
        }
    }
}

View File

@@ -0,0 +1,380 @@
//! Product Quantization (PQ)
//!
//! Compresses vectors by dividing into subspaces and quantizing each
//! independently. Achieves 8-32x compression with precomputed distance tables.
use rand::prelude::SliceRandom;
use rand::Rng;
/// Product Quantization configuration
#[derive(Debug, Clone)]
pub struct PQConfig {
    /// Number of subspaces (subvectors)
    pub m: usize,
    /// Number of centroids per subspace (typically 256 for 8-bit codes)
    pub k: usize,
    /// Random seed
    pub seed: u64,
}
impl Default for PQConfig {
    /// Defaults: 8 subspaces, 256 centroids (one byte per code), fixed seed 42.
    fn default() -> Self {
        PQConfig {
            m: 8,
            k: 256,
            seed: 42,
        }
    }
}
/// Product Quantization index
///
/// Holds per-subspace k-means codebooks; `train` must be called before
/// `encode`/`decode`. Each code is one `u8` centroid index per subspace.
pub struct ProductQuantizer {
    /// Configuration (subspace count `m`, centroids per subspace `k`, RNG seed)
    config: PQConfig,
    /// Dimensions per subspace (`dimensions / m`)
    dims_per_subspace: usize,
    /// Total dimensions of the full vectors
    dimensions: usize,
    /// Centroids for each subspace: [m][k][dims_per_subspace]
    centroids: Vec<Vec<Vec<f32>>>,
    /// Whether `train` has been called (encode/decode require it)
    trained: bool,
}
impl ProductQuantizer {
    /// Create a new product quantizer for `dimensions`-dimensional vectors.
    ///
    /// # Panics
    ///
    /// Panics when `config.m` is 0, when `config.k` is 0 or greater than 256
    /// (codes are stored as `u8`, so more than 256 centroids per subspace
    /// would previously be silently truncated by `encode`), or when
    /// `dimensions` is not divisible by `config.m`.
    pub fn new(dimensions: usize, config: PQConfig) -> Self {
        assert!(config.m > 0, "Number of subspaces must be positive");
        assert!(
            config.k >= 1 && config.k <= 256,
            "Centroids per subspace must be in 1..=256 to fit u8 codes"
        );
        assert!(
            dimensions % config.m == 0,
            "Dimensions must be divisible by number of subspaces"
        );
        let dims_per_subspace = dimensions / config.m;
        Self {
            config,
            dims_per_subspace,
            dimensions,
            centroids: Vec::new(),
            trained: false,
        }
    }
    /// Train the quantizer on sample vectors.
    ///
    /// Runs k-means independently in each subspace; the seeded RNG makes
    /// training deterministic for a given config.
    pub fn train(&mut self, vectors: &[Vec<f32>]) {
        use rand::prelude::*;
        use rand_chacha::ChaCha8Rng;
        let mut rng = ChaCha8Rng::seed_from_u64(self.config.seed);
        self.centroids = Vec::with_capacity(self.config.m);
        for subspace in 0..self.config.m {
            let start = subspace * self.dims_per_subspace;
            let end = start + self.dims_per_subspace;
            // Extract this subspace's slice of every training vector.
            let subvectors: Vec<Vec<f32>> =
                vectors.iter().map(|v| v[start..end].to_vec()).collect();
            // Run k-means on this subspace.
            let centroids = self.kmeans(&subvectors, self.config.k, 10, &mut rng);
            self.centroids.push(centroids);
        }
        self.trained = true;
    }
    /// K-means clustering (Lloyd's algorithm, fixed iteration count).
    ///
    /// Returns at most `min(k, vectors.len())` centroids; empty clusters
    /// keep their previous centroid.
    fn kmeans<R: Rng>(
        &self,
        vectors: &[Vec<f32>],
        k: usize,
        iterations: usize,
        rng: &mut R,
    ) -> Vec<Vec<f32>> {
        if vectors.is_empty() || k == 0 {
            return Vec::new();
        }
        let dims = vectors[0].len();
        let k = k.min(vectors.len());
        // Initialize centroids from a random sample of the input vectors.
        let mut indices: Vec<usize> = (0..vectors.len()).collect();
        indices.shuffle(rng);
        let mut centroids: Vec<Vec<f32>> = indices
            .iter()
            .take(k)
            .map(|&i| vectors[i].clone())
            .collect();
        for _ in 0..iterations {
            // Assignment step: bucket each vector under its nearest centroid.
            let mut assignments: Vec<Vec<usize>> = vec![Vec::new(); k];
            for (i, v) in vectors.iter().enumerate() {
                let nearest = self.find_nearest(v, &centroids);
                assignments[nearest].push(i);
            }
            // Update step: move each non-empty centroid to its cluster mean.
            for (c, assigned) in assignments.iter().enumerate() {
                if assigned.is_empty() {
                    continue;
                }
                let mut new_centroid = vec![0.0f32; dims];
                for &i in assigned {
                    for (j, &val) in vectors[i].iter().enumerate() {
                        new_centroid[j] += val;
                    }
                }
                let count = assigned.len() as f32;
                for val in &mut new_centroid {
                    *val /= count;
                }
                centroids[c] = new_centroid;
            }
        }
        centroids
    }
    /// Index of the centroid with minimal squared Euclidean distance to `vector`.
    fn find_nearest(&self, vector: &[f32], centroids: &[Vec<f32>]) -> usize {
        let mut best = 0;
        let mut best_dist = f32::MAX;
        for (i, c) in centroids.iter().enumerate() {
            let dist: f32 = vector
                .iter()
                .zip(c.iter())
                .map(|(a, b)| (a - b).powi(2))
                .sum();
            if dist < best_dist {
                best_dist = dist;
                best = i;
            }
        }
        best
    }
    /// Encode a vector to PQ codes (one centroid index per subspace).
    ///
    /// # Panics
    ///
    /// Panics when the quantizer is untrained or `vector` has the wrong length.
    pub fn encode(&self, vector: &[f32]) -> Vec<u8> {
        assert!(self.trained, "Quantizer must be trained");
        assert_eq!(vector.len(), self.dimensions);
        let mut codes = Vec::with_capacity(self.config.m);
        for subspace in 0..self.config.m {
            let start = subspace * self.dims_per_subspace;
            let end = start + self.dims_per_subspace;
            let subvector = &vector[start..end];
            let nearest = self.find_nearest(subvector, &self.centroids[subspace]);
            // Cast is lossless: `new` enforces k <= 256 and kmeans never
            // returns more centroids than requested, so nearest <= 255.
            codes.push(nearest as u8);
        }
        codes
    }
    /// Decode PQ codes back to an approximate vector (centroid concatenation).
    pub fn decode(&self, codes: &[u8]) -> Vec<f32> {
        assert!(self.trained, "Quantizer must be trained");
        assert_eq!(codes.len(), self.config.m);
        let mut vector = Vec::with_capacity(self.dimensions);
        for (subspace, &code) in codes.iter().enumerate() {
            let centroid = &self.centroids[subspace][code as usize];
            vector.extend_from_slice(centroid);
        }
        vector
    }
    /// Compute asymmetric distance (full-precision query to encoded vector).
    /// More accurate than symmetric distance but slower.
    pub fn asymmetric_distance(&self, query: &[f32], codes: &[u8]) -> f32 {
        assert_eq!(query.len(), self.dimensions);
        assert_eq!(codes.len(), self.config.m);
        let mut distance_sq = 0.0f32;
        for (subspace, &code) in codes.iter().enumerate() {
            let start = subspace * self.dims_per_subspace;
            let end = start + self.dims_per_subspace;
            let query_sub = &query[start..end];
            let centroid = &self.centroids[subspace][code as usize];
            for (q, c) in query_sub.iter().zip(centroid.iter()) {
                distance_sq += (q - c).powi(2);
            }
        }
        distance_sq.sqrt()
    }
    /// Precompute the distance table for a query.
    /// Returns `[m][k]` squared distances from each query subvector to each centroid.
    pub fn precompute_distance_table(&self, query: &[f32]) -> Vec<Vec<f32>> {
        assert_eq!(query.len(), self.dimensions);
        let mut table = Vec::with_capacity(self.config.m);
        for subspace in 0..self.config.m {
            let start = subspace * self.dims_per_subspace;
            let end = start + self.dims_per_subspace;
            let query_sub = &query[start..end];
            let distances: Vec<f32> = self.centroids[subspace]
                .iter()
                .map(|c| {
                    query_sub
                        .iter()
                        .zip(c.iter())
                        .map(|(q, v)| (q - v).powi(2))
                        .sum::<f32>()
                })
                .collect();
            table.push(distances);
        }
        table
    }
    /// Fast distance lookup using a precomputed table (O(m) per vector).
    pub fn table_distance(&self, table: &[Vec<f32>], codes: &[u8]) -> f32 {
        let mut distance_sq = 0.0f32;
        for (subspace, &code) in codes.iter().enumerate() {
            distance_sq += table[subspace][code as usize];
        }
        distance_sq.sqrt()
    }
    /// Memory per encoded vector in bytes (one byte per subspace).
    pub fn bytes_per_vector(&self) -> usize {
        self.config.m
    }
    /// Compression ratio versus f32 storage: `dimensions * 4 / m`.
    pub fn compression_ratio(&self) -> f32 {
        (self.dimensions * 4) as f32 / self.config.m as f32
    }
}
/// Encoded vector with its codes
#[derive(Debug, Clone)]
pub struct PQVector {
    /// One u8 centroid index per subspace.
    pub codes: Vec<u8>,
}
impl PQVector {
    /// Approximate memory footprint: inline struct size plus heap-stored codes.
    pub fn memory_size(&self) -> usize {
        self.codes.len() + std::mem::size_of::<Self>()
    }
}
// ============================================================================
// Tests
// ============================================================================
#[cfg(test)]
mod tests {
    //! Unit tests for product quantization: training, encoding, distance tables.
    use super::*;
    use rand::prelude::*;
    use rand_chacha::ChaCha8Rng;
    /// Deterministic random test vectors in [-1, 1) from a seeded RNG.
    fn random_vectors(n: usize, dims: usize, seed: u64) -> Vec<Vec<f32>> {
        let mut rng = ChaCha8Rng::seed_from_u64(seed);
        (0..n)
            .map(|_| (0..dims).map(|_| rng.gen_range(-1.0..1.0)).collect())
            .collect()
    }
    /// Encode produces one code per subspace and decode reconstructs
    /// the vector with bounded error.
    #[test]
    fn test_train_and_encode() {
        let dims = 128;
        let config = PQConfig {
            m: 8,
            k: 64,
            seed: 42,
        };
        let mut pq = ProductQuantizer::new(dims, config);
        let training = random_vectors(1000, dims, 42);
        pq.train(&training);
        // Encode a vector
        let vector = random_vectors(1, dims, 123)[0].clone();
        let codes = pq.encode(&vector);
        assert_eq!(codes.len(), 8);
        // Decode and check distance
        let decoded = pq.decode(&codes);
        let error: f32 = vector
            .iter()
            .zip(decoded.iter())
            .map(|(a, b)| (a - b).powi(2))
            .sum::<f32>()
            .sqrt();
        // Error should be reasonable
        assert!(error < 2.0, "Reconstruction error too high: {}", error);
    }
    /// Table-based distance must match the direct asymmetric computation.
    #[test]
    fn test_distance_table() {
        let dims = 64;
        let config = PQConfig {
            m: 4,
            k: 16,
            seed: 42,
        };
        let mut pq = ProductQuantizer::new(dims, config);
        let training = random_vectors(500, dims, 42);
        pq.train(&training);
        let query = random_vectors(1, dims, 123)[0].clone();
        let target = random_vectors(1, dims, 456)[0].clone();
        let codes = pq.encode(&target);
        // Compare asymmetric and table distances
        let asym_dist = pq.asymmetric_distance(&query, &codes);
        let table = pq.precompute_distance_table(&query);
        let table_dist = pq.table_distance(&table, &codes);
        assert!((asym_dist - table_dist).abs() < 0.001);
    }
    /// Compression ratio is (dims * 4 bytes) / (m code bytes).
    #[test]
    fn test_compression_ratio() {
        let dims = 1536;
        let config = PQConfig {
            m: 48,
            k: 256,
            seed: 42,
        };
        let pq = ProductQuantizer::new(dims, config);
        // Original: 1536 * 4 = 6144 bytes
        // Compressed: 48 bytes
        // Ratio: 128x
        assert_eq!(pq.bytes_per_vector(), 48);
        assert!((pq.compression_ratio() - 128.0).abs() < 0.1);
    }
}

View File

@@ -0,0 +1,227 @@
//! Scalar Quantization (SQ8)
//!
//! Compresses f32 vectors to i8, achieving 4x memory reduction
//! with minimal accuracy loss.
/// Quantize an f32 vector to i8 (SQ8).
///
/// Maps the value range [min, max] linearly onto the 255 levels
/// [-127, 127] and returns `(quantized_data, scale, offset)`, where a
/// level `q` decodes back to approximately `(q + 127) * scale + offset`.
///
/// An empty input yields `(vec![], 1.0, 0.0)`; a constant input uses
/// `scale = 1.0` (avoiding a zero division) so every element maps to -127.
pub fn quantize(vector: &[f32]) -> (Vec<i8>, f32, f32) {
    if vector.is_empty() {
        return (Vec::new(), 1.0, 0.0);
    }
    // Find min and max of the data.
    let mut min = f32::MAX;
    let mut max = f32::MIN;
    for &v in vector {
        if v < min {
            min = v;
        }
        if v > max {
            max = v;
        }
    }
    let range = max - min;
    // 254 steps span levels -127..=127.
    let scale = if range > 0.0 { range / 254.0 } else { 1.0 };
    let offset = min;
    // Quantize to i8 (-127 to 127). Rounding to the nearest level — instead
    // of the previous truncation toward zero — halves the worst-case
    // quantization error.
    let quantized: Vec<i8> = vector
        .iter()
        .map(|&v| {
            let normalized = (v - offset) / scale;
            (normalized.clamp(0.0, 254.0).round() - 127.0) as i8
        })
        .collect();
    (quantized, scale, offset)
}
/// Dequantize i8 levels back to approximate f32 values.
///
/// Inverse of `quantize`: level `q` maps to `(q + 127) * scale + offset`.
pub fn dequantize(quantized: &[i8], scale: f32, offset: f32) -> Vec<f32> {
    let mut restored = Vec::with_capacity(quantized.len());
    for &q in quantized {
        restored.push((q as f32 + 127.0) * scale + offset);
    }
    restored
}
/// Squared Euclidean distance between quantized vectors, in level units.
pub fn distance_sq(a: &[i8], b: &[i8]) -> i32 {
    debug_assert_eq!(a.len(), b.len());
    let mut total = 0i32;
    for (&x, &y) in a.iter().zip(b.iter()) {
        let diff = i32::from(x) - i32::from(y);
        total += diff * diff;
    }
    total
}
/// Euclidean distance between quantized vectors, rescaled to f32 units
/// by `scale` (the size of one quantization step).
pub fn distance(a: &[i8], b: &[i8], scale: f32) -> f32 {
    let level_dist = (distance_sq(a, b) as f32).sqrt();
    level_dist * scale
}
/// Quantized vector with the metadata needed to decode it
#[derive(Debug, Clone)]
pub struct ScalarQuantizedVector {
    /// Quantized levels, one i8 in [-127, 127] per original dimension.
    pub data: Vec<i8>,
    /// Size of one quantization step in original f32 units.
    pub scale: f32,
    /// Minimum of the original values; level -127 decodes to this.
    pub offset: f32,
}
impl ScalarQuantizedVector {
    /// Quantize an f32 vector, keeping the scale/offset needed to decode it.
    pub fn from_f32(vector: &[f32]) -> Self {
        let (data, scale, offset) = quantize(vector);
        Self { data, scale, offset }
    }
    /// Decode back to approximate f32 values.
    pub fn to_f32(&self) -> Vec<f32> {
        dequantize(&self.data, self.scale, self.offset)
    }
    /// Approximate Euclidean distance to another quantized vector.
    ///
    /// NOTE(review): this rescales the level-space distance by the larger of
    /// the two scales and ignores the offsets, so it is only an approximation
    /// when the two vectors were quantized with different value ranges —
    /// confirm callers tolerate that.
    pub fn distance(&self, other: &Self) -> f32 {
        let shared_scale = self.scale.max(other.scale);
        distance(&self.data, &other.data, shared_scale)
    }
    /// Approximate memory footprint: inline struct size plus heap data.
    pub fn memory_size(&self) -> usize {
        self.data.len() + std::mem::size_of::<Self>()
    }
    /// Compression ratio versus f32 storage (4 bytes -> 1 byte).
    pub fn compression_ratio(&self) -> f32 {
        4.0
    }
}
// ============================================================================
// SIMD-optimized distance (for larger vectors)
// ============================================================================
/// AVX2 kernel for squared Euclidean distance between i8 vectors.
///
/// Widens each 16-byte half to i16 lanes, then `madd` squares the
/// differences and pairwise-adds the products into i32 lanes; the tail
/// that does not fill a 32-byte chunk is handled scalar.
///
/// # Safety
///
/// The caller mustguarantee the `avx2` CPU feature is available and that
/// `a.len() == b.len()` (the loop reads `a.len()` bytes from both slices).
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn distance_sq_avx2(a: &[i8], b: &[i8]) -> i32 {
    use std::arch::x86_64::*;
    let n = a.len();
    let mut sum = _mm256_setzero_si256();
    let chunks = n / 32;
    for i in 0..chunks {
        let offset = i * 32;
        // loadu: slice data carries no 32-byte alignment guarantee.
        let va = _mm256_loadu_si256(a.as_ptr().add(offset) as *const __m256i);
        let vb = _mm256_loadu_si256(b.as_ptr().add(offset) as *const __m256i);
        // Subtract (with sign extension trick for i8)
        let diff_lo = _mm256_sub_epi16(
            _mm256_cvtepi8_epi16(_mm256_castsi256_si128(va)),
            _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vb)),
        );
        let diff_hi = _mm256_sub_epi16(
            _mm256_cvtepi8_epi16(_mm256_extracti128_si256(va, 1)),
            _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vb, 1)),
        );
        // Square and accumulate: madd multiplies i16 pairs and sums
        // adjacent products into i32 lanes.
        let sq_lo = _mm256_madd_epi16(diff_lo, diff_lo);
        let sq_hi = _mm256_madd_epi16(diff_hi, diff_hi);
        sum = _mm256_add_epi32(sum, sq_lo);
        sum = _mm256_add_epi32(sum, sq_hi);
    }
    // Horizontal sum of the eight i32 lanes
    let sum128_lo = _mm256_castsi256_si128(sum);
    let sum128_hi = _mm256_extracti128_si256(sum, 1);
    let sum128 = _mm_add_epi32(sum128_lo, sum128_hi);
    let sum64 = _mm_add_epi32(sum128, _mm_srli_si128(sum128, 8));
    let sum32 = _mm_add_epi32(sum64, _mm_srli_si128(sum64, 4));
    let mut result = _mm_cvtsi128_si32(sum32);
    // Handle remainder
    for i in (chunks * 32)..n {
        let diff = a[i] as i32 - b[i] as i32;
        result += diff * diff;
    }
    result
}
/// SIMD-accelerated Euclidean distance between quantized vectors.
///
/// Dispatches to the AVX2 kernel when available and falls back to the
/// scalar implementation otherwise.
///
/// # Panics
///
/// Panics when `a` and `b` differ in length: the unsafe AVX2 kernel reads
/// `a.len()` bytes from BOTH slices, so a mismatch must fail loudly here
/// instead of becoming an out-of-bounds read.
pub fn distance_simd(a: &[i8], b: &[i8], scale: f32) -> f32 {
    assert_eq!(a.len(), b.len(), "quantized vectors must have equal length");
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx2") {
            // SAFETY: avx2 support was just verified; lengths are equal.
            return (unsafe { distance_sq_avx2(a, b) } as f32).sqrt() * scale;
        }
    }
    distance(a, b, scale)
}
// ============================================================================
// Tests
// ============================================================================
#[cfg(test)]
mod tests {
    //! Unit tests for scalar (SQ8) quantization and its distance functions.
    use super::*;
    /// Round-trip through quantize/dequantize stays within one step (< 0.02
    /// for a [-0.9, 0.8] range).
    #[test]
    fn test_quantize_dequantize() {
        let original = vec![0.1, 0.5, -0.3, 0.8, -0.9];
        let (quantized, scale, offset) = quantize(&original);
        let restored = dequantize(&quantized, scale, offset);
        for (o, r) in original.iter().zip(restored.iter()) {
            assert!((o - r).abs() < 0.02, "orig={}, restored={}", o, r);
        }
    }
    /// Quantized distance approximates the true Euclidean distance.
    #[test]
    fn test_distance() {
        let a = vec![1.0, 0.0, 0.0];
        let b = vec![0.0, 1.0, 0.0];
        let qa = ScalarQuantizedVector::from_f32(&a);
        let qb = ScalarQuantizedVector::from_f32(&b);
        let dist = qa.distance(&qb);
        // Euclidean distance should be sqrt(2) ≈ 1.414
        assert!((dist - 1.414).abs() < 0.2, "dist={}", dist);
    }
    /// SQ8 stores one byte per dimension (4x smaller than f32).
    #[test]
    fn test_compression_ratio() {
        let v = ScalarQuantizedVector::from_f32(&vec![0.0; 1000]);
        assert_eq!(v.compression_ratio(), 4.0);
        assert_eq!(v.data.len(), 1000); // 1000 i8 = 1000 bytes
    }
    /// The SIMD path must agree with the scalar reference (within rounding).
    #[test]
    fn test_simd_matches_scalar() {
        let a: Vec<i8> = (0..128).map(|i| i as i8).collect();
        let b: Vec<i8> = (0..128).map(|i| -(i as i8)).collect();
        let scalar_result = distance_sq(&a, &b);
        let simd_result = (distance_simd(&a, &b, 1.0).powi(2)) as i32;
        assert!((scalar_result - simd_result).abs() < 10);
    }
}