//! Quantization techniques for memory compression
//!
//! This module provides tiered quantization strategies as specified in ADR-001:
//!
//! | Quantization | Compression | Use Case |
//! |--------------|-------------|----------|
//! | Scalar (u8)  | 4x          | Warm data (40-80% access) |
//! | Int4         | 8x          | Cool data (10-40% access) |
//! | Product      | 8-16x       | Cold data (1-10% access) |
//! | Binary       | 32x         | Archive (<1% access) |
//!
//! ## Performance Optimizations v2
//!
//! - SIMD-accelerated distance calculations for scalar (int8) quantization
//! - SIMD popcnt for binary hamming distance
//! - 4x loop unrolling for better instruction-level parallelism
//! - Separate accumulator strategy to reduce data dependencies
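//!
//! ## Example
//!
//! A minimal round-trip sketch (illustrative only; the crate/module path is
//! assumed here, and the block is not compiled as a doctest):
//!
//! ```rust,ignore
//! use ruvector::quantization::{QuantizedVector, ScalarQuantized};
//!
//! let v = vec![0.0_f32, 0.5, 1.0];
//! let q = ScalarQuantized::quantize(&v);
//! let approx = q.reconstruct(); // each value within ~(max - min) / 255 of the original
//! ```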

use crate::error::Result;
use serde::{Deserialize, Serialize};

/// Trait for quantized vector representations
pub trait QuantizedVector: Send + Sync {
    /// Quantize a full-precision vector
    fn quantize(vector: &[f32]) -> Self;

    /// Calculate distance to another quantized vector
    fn distance(&self, other: &Self) -> f32;

    /// Reconstruct approximate full-precision vector
    fn reconstruct(&self) -> Vec<f32>;
}

/// Scalar quantization to u8 (4x compression)
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScalarQuantized {
    /// Quantized values (u8)
    pub data: Vec<u8>,
    /// Minimum value for dequantization
    pub min: f32,
    /// Scale factor for dequantization
    pub scale: f32,
}

impl QuantizedVector for ScalarQuantized {
    fn quantize(vector: &[f32]) -> Self {
        let min = vector.iter().copied().fold(f32::INFINITY, f32::min);
        let max = vector.iter().copied().fold(f32::NEG_INFINITY, f32::max);

        // Handle edge case where all values are the same (scale = 0)
        let scale = if (max - min).abs() < f32::EPSILON {
            1.0 // Arbitrary non-zero scale when all values are identical
        } else {
            (max - min) / 255.0
        };

        let data = vector
            .iter()
            .map(|&v| ((v - min) / scale).round().clamp(0.0, 255.0) as u8)
            .collect();

        Self { data, min, scale }
    }

    fn distance(&self, other: &Self) -> f32 {
        // Fast u8 distance calculation with SIMD optimization.
        // Use i32 to avoid overflow: the max diff is 255, and 255 * 255 = 65025 fits in i32.

        // Scale handling: we use the average of both scales for balanced comparison.
        // Using max(scale) would bias toward the vector with the larger range,
        // while the average provides a more symmetric distance metric.
        // This ensures distance(a, b) ≈ distance(b, a) in the reconstructed space.
        let avg_scale = (self.scale + other.scale) / 2.0;

        // Use SIMD-optimized version for larger vectors
        #[cfg(target_arch = "aarch64")]
        {
            if self.data.len() >= 16 {
                return unsafe { scalar_distance_neon(&self.data, &other.data) }.sqrt() * avg_scale;
            }
        }

        #[cfg(target_arch = "x86_64")]
        {
            if self.data.len() >= 32 && is_x86_feature_detected!("avx2") {
                return unsafe { scalar_distance_avx2(&self.data, &other.data) }.sqrt() * avg_scale;
            }
        }

        // Scalar fallback with 4x loop unrolling for better ILP
        scalar_distance_scalar(&self.data, &other.data).sqrt() * avg_scale
    }

    fn reconstruct(&self) -> Vec<f32> {
        self.data
            .iter()
            .map(|&v| self.min + (v as f32) * self.scale)
            .collect()
    }
}
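
// Worked example of the averaged-scale distance (illustrative numbers): if q1
// was quantized with scale s1 = 4.0/255 and q2 with s2 = 40.0/255, then
// avg_scale = 22.0/255, and a raw squared sum of 65025 (one maximal u8
// difference of 255) yields sqrt(65025) * 22.0/255 = 22.0 in reconstructed
// units, regardless of which operand the call is made on.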

/// Product quantization (8-16x compression)
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProductQuantized {
    /// Quantized codes (one per subspace)
    pub codes: Vec<u8>,
    /// Codebooks for each subspace
    pub codebooks: Vec<Vec<Vec<f32>>>,
}

impl ProductQuantized {
    /// Train product quantization on a set of vectors
    pub fn train(
        vectors: &[Vec<f32>],
        num_subspaces: usize,
        codebook_size: usize,
        iterations: usize,
    ) -> Result<Self> {
        if vectors.is_empty() {
            return Err(crate::error::RuvectorError::InvalidInput(
                "Cannot train on empty vector set".into(),
            ));
        }
        if vectors[0].is_empty() {
            return Err(crate::error::RuvectorError::InvalidInput(
                "Cannot train on vectors with zero dimensions".into(),
            ));
        }
        if codebook_size > 256 {
            return Err(crate::error::RuvectorError::InvalidParameter(format!(
                "Codebook size {} exceeds u8 maximum of 256",
                codebook_size
            )));
        }
        let dimensions = vectors[0].len();
        let subspace_dim = dimensions / num_subspaces;

        let mut codebooks = Vec::with_capacity(num_subspaces);

        // Train codebook for each subspace using k-means
        for subspace_idx in 0..num_subspaces {
            let start = subspace_idx * subspace_dim;
            let end = start + subspace_dim;

            // Extract subspace vectors
            let subspace_vectors: Vec<Vec<f32>> =
                vectors.iter().map(|v| v[start..end].to_vec()).collect();

            // Run k-means
            let codebook = kmeans_clustering(&subspace_vectors, codebook_size, iterations);
            codebooks.push(codebook);
        }

        Ok(Self {
            codes: vec![],
            codebooks,
        })
    }

    /// Quantize a vector using trained codebooks
    pub fn encode(&self, vector: &[f32]) -> Vec<u8> {
        let num_subspaces = self.codebooks.len();
        let subspace_dim = vector.len() / num_subspaces;

        let mut codes = Vec::with_capacity(num_subspaces);

        for (subspace_idx, codebook) in self.codebooks.iter().enumerate() {
            let start = subspace_idx * subspace_dim;
            let end = start + subspace_dim;
            let subvector = &vector[start..end];

            // Find nearest centroid
            let code = codebook
                .iter()
                .enumerate()
                .min_by(|(_, a), (_, b)| {
                    let dist_a = euclidean_squared(subvector, a);
                    let dist_b = euclidean_squared(subvector, b);
                    dist_a.partial_cmp(&dist_b).unwrap()
                })
                .map(|(idx, _)| idx as u8)
                .unwrap_or(0);

            codes.push(code);
        }

        codes
    }
}

/// Int4 quantization (8x compression)
///
/// Quantizes f32 to 4-bit integers (0-15), packing 2 values per byte.
/// Provides 8x compression with better precision than binary.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Int4Quantized {
    /// Packed 4-bit values (2 per byte)
    pub data: Vec<u8>,
    /// Minimum value for dequantization
    pub min: f32,
    /// Scale factor for dequantization
    pub scale: f32,
    /// Number of dimensions
    pub dimensions: usize,
}

impl Int4Quantized {
    /// Quantize a vector to 4-bit representation
    pub fn quantize(vector: &[f32]) -> Self {
        let min = vector.iter().copied().fold(f32::INFINITY, f32::min);
        let max = vector.iter().copied().fold(f32::NEG_INFINITY, f32::max);

        // Handle edge case where all values are the same
        let scale = if (max - min).abs() < f32::EPSILON {
            1.0
        } else {
            (max - min) / 15.0 // 4-bit gives 0-15 range
        };

        let dimensions = vector.len();
        let num_bytes = dimensions.div_ceil(2);
        let mut data = vec![0u8; num_bytes];

        for (i, &v) in vector.iter().enumerate() {
            let quantized = ((v - min) / scale).round().clamp(0.0, 15.0) as u8;
            let byte_idx = i / 2;
            if i % 2 == 0 {
                // Low nibble
                data[byte_idx] |= quantized;
            } else {
                // High nibble
                data[byte_idx] |= quantized << 4;
            }
        }

        Self {
            data,
            min,
            scale,
            dimensions,
        }
    }

    /// Calculate distance to another Int4 quantized vector
    pub fn distance(&self, other: &Self) -> f32 {
        assert_eq!(self.dimensions, other.dimensions);

        // Use average scale for balanced comparison
        let avg_scale = (self.scale + other.scale) / 2.0;
        let _avg_min = (self.min + other.min) / 2.0;

        let mut sum_sq = 0i32;

        for i in 0..self.dimensions {
            let byte_idx = i / 2;
            let shift = if i % 2 == 0 { 0 } else { 4 };

            let a = ((self.data[byte_idx] >> shift) & 0x0F) as i32;
            let b = ((other.data[byte_idx] >> shift) & 0x0F) as i32;
            let diff = a - b;
            sum_sq += diff * diff;
        }

        (sum_sq as f32).sqrt() * avg_scale
    }

    /// Reconstruct approximate full-precision vector
    pub fn reconstruct(&self) -> Vec<f32> {
        let mut result = Vec::with_capacity(self.dimensions);

        for i in 0..self.dimensions {
            let byte_idx = i / 2;
            let shift = if i % 2 == 0 { 0 } else { 4 };
            let quantized = (self.data[byte_idx] >> shift) & 0x0F;
            result.push(self.min + (quantized as f32) * self.scale);
        }

        result
    }

    /// Get compression ratio (8x for Int4)
    pub fn compression_ratio() -> f32 {
        8.0 // f32 (4 bytes) -> 4 bits (0.5 bytes)
    }
}
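
// Nibble-packing worked example (illustrative): quantized values [3, 12] pack
// into a single byte as 3 | (12 << 4) = 0b1100_0011 = 0xC3; the even index
// occupies the low nibble and the odd index the high nibble.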

/// Binary quantization (32x compression)
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BinaryQuantized {
    /// Binary representation (1 bit per dimension, packed into bytes)
    pub bits: Vec<u8>,
    /// Number of dimensions
    pub dimensions: usize,
}

impl QuantizedVector for BinaryQuantized {
    fn quantize(vector: &[f32]) -> Self {
        let dimensions = vector.len();
        let num_bytes = dimensions.div_ceil(8);
        let mut bits = vec![0u8; num_bytes];

        for (i, &v) in vector.iter().enumerate() {
            if v > 0.0 {
                let byte_idx = i / 8;
                let bit_idx = i % 8;
                bits[byte_idx] |= 1 << bit_idx;
            }
        }

        Self { bits, dimensions }
    }

    fn distance(&self, other: &Self) -> f32 {
        // Hamming distance using SIMD-friendly operations
        Self::hamming_distance_fast(&self.bits, &other.bits) as f32
    }

    fn reconstruct(&self) -> Vec<f32> {
        let mut result = Vec::with_capacity(self.dimensions);

        for i in 0..self.dimensions {
            let byte_idx = i / 8;
            let bit_idx = i % 8;
            let bit = (self.bits[byte_idx] >> bit_idx) & 1;
            result.push(if bit == 1 { 1.0 } else { -1.0 });
        }

        result
    }
}

impl BinaryQuantized {
    /// Fast hamming distance using SIMD-optimized operations
    ///
    /// Uses hardware POPCNT on x86_64 or NEON vcnt on ARM64 for optimal performance.
    /// Processes 16 bytes at a time on ARM64, 8 bytes at a time on x86_64.
    /// Falls back to 64-bit operations for remainders.
    pub fn hamming_distance_fast(a: &[u8], b: &[u8]) -> u32 {
        // Use SIMD-optimized version based on architecture
        #[cfg(target_arch = "aarch64")]
        {
            if a.len() >= 16 {
                return unsafe { hamming_distance_neon(a, b) };
            }
        }

        #[cfg(target_arch = "x86_64")]
        {
            if a.len() >= 8 && is_x86_feature_detected!("popcnt") {
                return unsafe { hamming_distance_simd_x86(a, b) };
            }
        }

        // Scalar fallback using 64-bit operations
        let mut distance = 0u32;

        // Process 8 bytes at a time using u64
        let chunks_a = a.chunks_exact(8);
        let chunks_b = b.chunks_exact(8);
        let remainder_a = chunks_a.remainder();
        let remainder_b = chunks_b.remainder();

        for (chunk_a, chunk_b) in chunks_a.zip(chunks_b) {
            let a_u64 = u64::from_le_bytes(chunk_a.try_into().unwrap());
            let b_u64 = u64::from_le_bytes(chunk_b.try_into().unwrap());
            distance += (a_u64 ^ b_u64).count_ones();
        }

        // Handle remainder bytes
        for (&a_byte, &b_byte) in remainder_a.iter().zip(remainder_b) {
            distance += (a_byte ^ b_byte).count_ones();
        }

        distance
    }

    /// Compute normalized hamming similarity (0.0 to 1.0)
    pub fn similarity(&self, other: &Self) -> f32 {
        let distance = self.distance(other);
        1.0 - (distance / self.dimensions as f32)
    }

    /// Get compression ratio (32x for binary)
    pub fn compression_ratio() -> f32 {
        32.0 // f32 (4 bytes = 32 bits) -> 1 bit
    }

    /// Convert to bytes for storage
    pub fn to_bytes(&self) -> &[u8] {
        &self.bits
    }

    /// Create from bytes
    pub fn from_bytes(bits: Vec<u8>, dimensions: usize) -> Self {
        Self { bits, dimensions }
    }
}
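
// Hamming worked example (illustrative): 0xAA ^ 0x55 = 0xFF, whose popcount is
// 8, so two bytes that disagree in every bit position contribute 8 to the
// distance.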

// ============================================================================
// Helper functions for scalar quantization distance
// ============================================================================

/// Scalar fallback for scalar quantization distance (sum of squared differences)
fn scalar_distance_scalar(a: &[u8], b: &[u8]) -> f32 {
    let mut sum_sq = 0i32;

    // 4x loop unrolling for better ILP
    let chunks = a.len() / 4;
    for i in 0..chunks {
        let idx = i * 4;
        let d0 = (a[idx] as i32) - (b[idx] as i32);
        let d1 = (a[idx + 1] as i32) - (b[idx + 1] as i32);
        let d2 = (a[idx + 2] as i32) - (b[idx + 2] as i32);
        let d3 = (a[idx + 3] as i32) - (b[idx + 3] as i32);
        sum_sq += d0 * d0 + d1 * d1 + d2 * d2 + d3 * d3;
    }

    // Handle remainder
    for i in (chunks * 4)..a.len() {
        let diff = (a[i] as i32) - (b[i] as i32);
        sum_sq += diff * diff;
    }

    sum_sq as f32
}

/// NEON SIMD distance for scalar quantization
///
/// # Safety
/// Caller must ensure a.len() == b.len()
#[cfg(target_arch = "aarch64")]
#[inline(always)]
unsafe fn scalar_distance_neon(a: &[u8], b: &[u8]) -> f32 {
    use std::arch::aarch64::*;

    let len = a.len();
    let a_ptr = a.as_ptr();
    let b_ptr = b.as_ptr();

    let mut sum = vdupq_n_s32(0);

    // Process 8 bytes at a time
    let chunks = len / 8;
    let mut idx = 0usize;

    for _ in 0..chunks {
        // Load 8 u8 values
        let va = vld1_u8(a_ptr.add(idx));
        let vb = vld1_u8(b_ptr.add(idx));

        // Zero-extend u8 to u16
        let va_u16 = vmovl_u8(va);
        let vb_u16 = vmovl_u8(vb);

        // Convert to signed for subtraction
        let va_s16 = vreinterpretq_s16_u16(va_u16);
        let vb_s16 = vreinterpretq_s16_u16(vb_u16);

        // Compute difference
        let diff = vsubq_s16(va_s16, vb_s16);

        // Square and accumulate
        let prod_lo = vmull_s16(vget_low_s16(diff), vget_low_s16(diff));
        let prod_hi = vmull_s16(vget_high_s16(diff), vget_high_s16(diff));

        sum = vaddq_s32(sum, prod_lo);
        sum = vaddq_s32(sum, prod_hi);

        idx += 8;
    }

    let mut total = vaddvq_s32(sum);

    // Handle remainder with bounds-check elimination
    for i in (chunks * 8)..len {
        let diff = (*a.get_unchecked(i) as i32) - (*b.get_unchecked(i) as i32);
        total += diff * diff;
    }

    total as f32
}

/// AVX2 SIMD distance for scalar quantization
///
/// # Safety
/// Caller must ensure a.len() == b.len() and that AVX2 is available.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn scalar_distance_avx2(a: &[u8], b: &[u8]) -> f32 {
    use std::arch::x86_64::*;

    let len = a.len();
    let mut sum = _mm256_setzero_si256();

    // Process 16 bytes at a time
    let chunks = len / 16;
    for i in 0..chunks {
        let idx = i * 16;

        // Load 16 u8 values
        let va = _mm_loadu_si128(a.as_ptr().add(idx) as *const __m128i);
        let vb = _mm_loadu_si128(b.as_ptr().add(idx) as *const __m128i);

        // Zero-extend the 16 u8 values to i16
        let va_lo = _mm256_cvtepu8_epi16(va);
        let vb_lo = _mm256_cvtepu8_epi16(vb);

        // Compute difference
        let diff = _mm256_sub_epi16(va_lo, vb_lo);

        // Square (multiply i16 * i16 -> i32, summing adjacent pairs)
        let prod = _mm256_madd_epi16(diff, diff);

        // Accumulate
        sum = _mm256_add_epi32(sum, prod);
    }

    // Horizontal sum
    let sum_lo = _mm256_castsi256_si128(sum);
    let sum_hi = _mm256_extracti128_si256(sum, 1);
    let sum_128 = _mm_add_epi32(sum_lo, sum_hi);

    let shuffle = _mm_shuffle_epi32(sum_128, 0b10_11_00_01);
    let sum_64 = _mm_add_epi32(sum_128, shuffle);

    let shuffle2 = _mm_shuffle_epi32(sum_64, 0b00_00_10_10);
    let final_sum = _mm_add_epi32(sum_64, shuffle2);

    let mut total = _mm_cvtsi128_si32(final_sum);

    // Handle remainder
    for i in (chunks * 16)..len {
        let diff = (a[i] as i32) - (b[i] as i32);
        total += diff * diff;
    }

    total as f32
}
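
// Overflow headroom (worked out): _mm256_madd_epi16 sums adjacent products, so
// each i32 lane accumulates at most 2 * 255^2 ≈ 1.3e5 per 16-byte chunk; i32
// overflow would therefore require on the order of 260k dimensions, far
// beyond typical embedding sizes.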

// Helper functions

fn euclidean_squared(a: &[f32], b: &[f32]) -> f32 {
    a.iter()
        .zip(b)
        .map(|(&x, &y)| {
            let diff = x - y;
            diff * diff
        })
        .sum()
}

fn kmeans_clustering(vectors: &[Vec<f32>], k: usize, iterations: usize) -> Vec<Vec<f32>> {
    use rand::seq::SliceRandom;
    use rand::thread_rng;

    let mut rng = thread_rng();

    // Initialize centroids randomly
    let mut centroids: Vec<Vec<f32>> = vectors.choose_multiple(&mut rng, k).cloned().collect();

    for _ in 0..iterations {
        // Assign vectors to nearest centroid
        let mut assignments = vec![Vec::new(); k];

        for vector in vectors {
            let nearest = centroids
                .iter()
                .enumerate()
                .min_by(|(_, a), (_, b)| {
                    let dist_a = euclidean_squared(vector, a);
                    let dist_b = euclidean_squared(vector, b);
                    dist_a.partial_cmp(&dist_b).unwrap()
                })
                .map(|(idx, _)| idx)
                .unwrap_or(0);

            assignments[nearest].push(vector.clone());
        }

        // Update centroids; empty clusters keep their previous centroid
        for (centroid, assigned) in centroids.iter_mut().zip(&assignments) {
            if !assigned.is_empty() {
                let dim = centroid.len();
                *centroid = vec![0.0; dim];

                for vector in assigned {
                    for (i, &v) in vector.iter().enumerate() {
                        centroid[i] += v;
                    }
                }

                let count = assigned.len() as f32;
                for v in centroid.iter_mut() {
                    *v /= count;
                }
            }
        }
    }

    centroids
}

// =============================================================================
// SIMD-Optimized Distance Calculations for Quantized Vectors
// =============================================================================

// NOTE: scalar_distance_scalar, scalar_distance_neon, and scalar_distance_avx2
// are already defined above; this section reuses those implementations for
// consistency.

/// SIMD-optimized hamming distance using popcnt
///
/// # Safety
/// Caller must ensure a.len() == b.len() and that POPCNT is available.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "popcnt")]
#[inline]
unsafe fn hamming_distance_simd_x86(a: &[u8], b: &[u8]) -> u32 {
    use std::arch::x86_64::*;

    let mut distance = 0u64;

    // Process 8 bytes at a time using u64 with hardware popcnt
    let chunks_a = a.chunks_exact(8);
    let chunks_b = b.chunks_exact(8);
    let remainder_a = chunks_a.remainder();
    let remainder_b = chunks_b.remainder();

    for (chunk_a, chunk_b) in chunks_a.zip(chunks_b) {
        let a_u64 = u64::from_le_bytes(chunk_a.try_into().unwrap());
        let b_u64 = u64::from_le_bytes(chunk_b.try_into().unwrap());
        distance += _popcnt64((a_u64 ^ b_u64) as i64) as u64;
    }

    // Handle remainder
    for (&a_byte, &b_byte) in remainder_a.iter().zip(remainder_b) {
        distance += (a_byte ^ b_byte).count_ones() as u64;
    }

    distance as u32
}

/// NEON-optimized hamming distance for ARM64
///
/// # Safety
/// Caller must ensure a.len() == b.len()
#[cfg(target_arch = "aarch64")]
#[inline(always)]
unsafe fn hamming_distance_neon(a: &[u8], b: &[u8]) -> u32 {
    use std::arch::aarch64::*;

    let len = a.len();
    let a_ptr = a.as_ptr();
    let b_ptr = b.as_ptr();

    let chunks = len / 16;
    let mut idx = 0usize;

    // Accumulate into u16 lanes: a plain u8 accumulator would silently wrap
    // after ~31 chunks, since each byte can contribute up to 8 set bits per
    // iteration.
    let mut sum_u16 = vdupq_n_u16(0);

    for _ in 0..chunks {
        // Load 16 bytes
        let a_vec = vld1q_u8(a_ptr.add(idx));
        let b_vec = vld1q_u8(b_ptr.add(idx));

        // XOR and count bits using vcntq_u8 (population count)
        let xor_result = veorq_u8(a_vec, b_vec);
        let bits = vcntq_u8(xor_result);

        // Widening pairwise add-accumulate (u8 pairs -> u16 lanes)
        sum_u16 = vpadalq_u8(sum_u16, bits);

        idx += 16;
    }

    // Horizontal sum; widen to u32 first so the reduction itself cannot wrap
    let sum_val = vaddvq_u32(vpaddlq_u16(sum_u16));

    // Handle remainder with bounds-check elimination
    let mut remainder_sum = 0u32;
    let start = chunks * 16;
    for i in start..len {
        remainder_sum += (*a.get_unchecked(i) ^ *b.get_unchecked(i)).count_ones();
    }

    sum_val + remainder_sum
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_scalar_quantization() {
        let vector = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let quantized = ScalarQuantized::quantize(&vector);
        let reconstructed = quantized.reconstruct();

        // Check approximate reconstruction
        for (orig, recon) in vector.iter().zip(&reconstructed) {
            assert!((orig - recon).abs() < 0.1);
        }
    }

    #[test]
    fn test_binary_quantization() {
        let vector = vec![1.0, -1.0, 2.0, -2.0, 0.5];
        let quantized = BinaryQuantized::quantize(&vector);

        assert_eq!(quantized.dimensions, 5);
        assert_eq!(quantized.bits.len(), 1); // 5 bits fit in 1 byte
    }

    #[test]
    fn test_binary_distance() {
        let v1 = vec![1.0, 1.0, 1.0, 1.0];
        let v2 = vec![1.0, 1.0, -1.0, -1.0];

        let q1 = BinaryQuantized::quantize(&v1);
        let q2 = BinaryQuantized::quantize(&v2);

        let dist = q1.distance(&q2);
        assert_eq!(dist, 2.0); // 2 bits differ
    }

    #[test]
    fn test_scalar_quantization_roundtrip() {
        // Test that quantize -> reconstruct produces values close to original
        let test_vectors = vec![
            vec![1.0, 2.0, 3.0, 4.0, 5.0],
            vec![-10.0, -5.0, 0.0, 5.0, 10.0],
            vec![0.1, 0.2, 0.3, 0.4, 0.5],
            vec![100.0, 200.0, 300.0, 400.0, 500.0],
        ];

        for vector in test_vectors {
            let quantized = ScalarQuantized::quantize(&vector);
            let reconstructed = quantized.reconstruct();

            assert_eq!(vector.len(), reconstructed.len());

            for (orig, recon) in vector.iter().zip(reconstructed.iter()) {
                // With 8-bit quantization, max error is roughly (max-min)/255
                let max = vector.iter().copied().fold(f32::NEG_INFINITY, f32::max);
                let min = vector.iter().copied().fold(f32::INFINITY, f32::min);
                let max_error = (max - min) / 255.0 * 2.0; // Allow 2x for rounding

                assert!(
                    (orig - recon).abs() < max_error,
                    "Roundtrip error too large: orig={}, recon={}, error={}",
                    orig,
                    recon,
                    (orig - recon).abs()
                );
            }
        }
    }

    #[test]
    fn test_scalar_distance_symmetry() {
        // Test that distance(a, b) == distance(b, a)
        let v1 = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let v2 = vec![2.0, 3.0, 4.0, 5.0, 6.0];

        let q1 = ScalarQuantized::quantize(&v1);
        let q2 = ScalarQuantized::quantize(&v2);

        let dist_ab = q1.distance(&q2);
        let dist_ba = q2.distance(&q1);

        // Distance should be symmetric (within floating point precision)
        assert!(
            (dist_ab - dist_ba).abs() < 0.01,
            "Distance is not symmetric: d(a,b)={}, d(b,a)={}",
            dist_ab,
            dist_ba
        );
    }

    #[test]
    fn test_scalar_distance_different_scales() {
        // Test distance calculation with vectors that have different scales
        let v1 = vec![1.0, 2.0, 3.0, 4.0, 5.0]; // range: 4.0
        let v2 = vec![10.0, 20.0, 30.0, 40.0, 50.0]; // range: 40.0

        let q1 = ScalarQuantized::quantize(&v1);
        let q2 = ScalarQuantized::quantize(&v2);

        let dist_ab = q1.distance(&q2);
        let dist_ba = q2.distance(&q1);

        // With average scaling, symmetry should be maintained
        assert!(
            (dist_ab - dist_ba).abs() < 0.01,
            "Distance with different scales not symmetric: d(a,b)={}, d(b,a)={}",
            dist_ab,
            dist_ba
        );
    }

    #[test]
    fn test_scalar_quantization_edge_cases() {
        // Test with all same values
        let same_values = vec![5.0, 5.0, 5.0, 5.0];
        let quantized = ScalarQuantized::quantize(&same_values);
        let reconstructed = quantized.reconstruct();

        for (orig, recon) in same_values.iter().zip(reconstructed.iter()) {
            assert!((orig - recon).abs() < 0.01);
        }

        // Test with extreme ranges
        let extreme = vec![f32::MIN / 1e10, 0.0, f32::MAX / 1e10];
        let quantized = ScalarQuantized::quantize(&extreme);
        let reconstructed = quantized.reconstruct();

        assert_eq!(extreme.len(), reconstructed.len());
    }

    #[test]
    fn test_binary_distance_symmetry() {
        // Test that binary distance is symmetric
        let v1 = vec![1.0, -1.0, 1.0, -1.0];
        let v2 = vec![1.0, 1.0, -1.0, -1.0];

        let q1 = BinaryQuantized::quantize(&v1);
        let q2 = BinaryQuantized::quantize(&v2);

        let dist_ab = q1.distance(&q2);
        let dist_ba = q2.distance(&q1);

        assert_eq!(
            dist_ab, dist_ba,
            "Binary distance not symmetric: d(a,b)={}, d(b,a)={}",
            dist_ab, dist_ba
        );
    }

    #[test]
    fn test_int4_quantization() {
        let vector = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let quantized = Int4Quantized::quantize(&vector);
        let reconstructed = quantized.reconstruct();

        assert_eq!(quantized.dimensions, 5);
        // 5 dimensions = 3 bytes (2 per byte, last byte has 1)
        assert_eq!(quantized.data.len(), 3);

        // Check approximate reconstruction
        for (orig, recon) in vector.iter().zip(&reconstructed) {
            // With 4-bit quantization, max error is roughly (max-min)/15
            let max_error = (5.0 - 1.0) / 15.0 * 2.0;
            assert!(
                (orig - recon).abs() < max_error,
                "Int4 roundtrip error too large: orig={}, recon={}",
                orig,
                recon
            );
        }
    }

    #[test]
    fn test_int4_distance() {
        // Use vectors with different quantized patterns: both span [0.0, 15.0]
        // (so scale = 1.0), but v2 has different middle values than v1.
        let v1 = vec![0.0, 5.0, 10.0, 15.0];
        let v2 = vec![0.0, 3.0, 12.0, 15.0];

        let q1 = Int4Quantized::quantize(&v1);
        let q2 = Int4Quantized::quantize(&v2);

        let dist = q1.distance(&q2);
        // The quantized values differ in the middle, so distance should be positive
        assert!(
            dist > 0.0,
            "Distance should be positive, got {}. q1.data={:?}, q2.data={:?}",
            dist,
            q1.data,
            q2.data
        );
    }

    #[test]
    fn test_int4_distance_symmetry() {
        let v1 = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let v2 = vec![2.0, 3.0, 4.0, 5.0, 6.0];

        let q1 = Int4Quantized::quantize(&v1);
        let q2 = Int4Quantized::quantize(&v2);

        let dist_ab = q1.distance(&q2);
        let dist_ba = q2.distance(&q1);

        assert!(
            (dist_ab - dist_ba).abs() < 0.01,
            "Int4 distance not symmetric: d(a,b)={}, d(b,a)={}",
            dist_ab,
            dist_ba
        );
    }

    #[test]
    fn test_int4_compression_ratio() {
        assert_eq!(Int4Quantized::compression_ratio(), 8.0);
    }
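
    // Added test (a minimal sketch, not part of the original suite): exercises
    // ProductQuantized::train + encode end-to-end on a tiny training set. The
    // parameter choices are arbitrary; it only checks structural invariants.
    #[test]
    fn test_product_quantization_train_encode() {
        let vectors = vec![
            vec![0.0, 0.0, 1.0, 1.0],
            vec![0.1, 0.0, 0.9, 1.1],
            vec![5.0, 5.0, -1.0, -1.0],
            vec![5.1, 4.9, -0.9, -1.1],
        ];

        // 2 subspaces of 2 dims each, 2 centroids per codebook, 5 k-means iterations
        let pq = ProductQuantized::train(&vectors, 2, 2, 5).expect("training should succeed");
        assert_eq!(pq.codebooks.len(), 2); // one codebook per subspace

        let codes = pq.encode(&vectors[0]);
        assert_eq!(codes.len(), 2); // one code per subspace
        for &code in &codes {
            assert!((code as usize) < 2, "code must index into the codebook");
        }
    }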

    #[test]
    fn test_binary_fast_hamming() {
        // Test fast hamming distance with various sizes
        let a = vec![0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xAA];
        let b = vec![0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x55];

        let distance = BinaryQuantized::hamming_distance_fast(&a, &b);
        // All bits differ: 9 bytes * 8 bits = 72 bits
        assert_eq!(distance, 72);
    }

    #[test]
    fn test_binary_similarity() {
        let v1 = vec![1.0; 8]; // All positive
        let v2 = vec![1.0; 8]; // Same

        let q1 = BinaryQuantized::quantize(&v1);
        let q2 = BinaryQuantized::quantize(&v2);

        let sim = q1.similarity(&q2);
        assert!(
            (sim - 1.0).abs() < 0.001,
            "Same vectors should have similarity 1.0"
        );
    }

    #[test]
    fn test_binary_compression_ratio() {
        assert_eq!(BinaryQuantized::compression_ratio(), 32.0);
    }
}