Squashed 'vendor/ruvector/' content from commit b64c2172

git-subtree-dir: vendor/ruvector git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
2026-02-28 14:39:40 -05:00
commit d803bfe2b1
7854 changed files with 3522914 additions and 0 deletions
--- a/crates/ruvllm/src/bitnet/quantizer.rs
+++ b/crates/ruvllm/src/bitnet/quantizer.rs
@@ -0,0 +1,365 @@
+//! PT-BitNet Post-Training Quantization
+//!
+//! Core absmean ternary quantization algorithm for converting FP32 weights
+//! to BitNet b1.58 ternary format.
+
+use super::ternary_tensor::{pack_ternary, TernaryTensor};
+use crate::error::{Result, RuvLLMError};
+
+/// Configuration for PT-BitNet post-training quantization.
+///
+/// Controls block size, calibration, and layer selection. Within this file only
+/// `block_size` is read (by `quantize_tensor`); the rest is presumably used elsewhere.
+///
+/// # Example
+///
+/// ```rust,ignore
+/// use ruvllm::bitnet::PtBitnetConfig;
+///
+/// let config = PtBitnetConfig {
+///     calibration_samples: 1000,
+///     block_size: 256,
+///     optimize_scales: true,
+///     layers_to_quantize: LayerMask::ExpertsOnly,
+///     export_format: TernaryFormat::BitnetT158,
+///     ..Default::default()
+/// };
+/// ```
+#[derive(Debug, Clone)]
+pub struct PtBitnetConfig {
+    /// Number of calibration samples for scale optimization (default: 1000)
+    pub calibration_samples: usize,
+    /// Elements per quantization block (default: 256); `quantize_tensor` rejects 0
+    pub block_size: usize,
+    /// Enable scale factor optimization via calibration (default: true)
+    pub optimize_scales: bool,
+    /// Which layers to quantize (default: `LayerMask::ExpertsOnly`)
+    pub layers_to_quantize: LayerMask,
+    /// Export format for GGUF serialization (default: `TernaryFormat::BitnetT158`)
+    pub export_format: TernaryFormat,
+    /// Precision for router and shared layers (default: `Precision::FP16`)
+    pub router_precision: Precision,
+    /// Use memory-mapped I/O for weight loading (default: true)
+    pub use_mmap: bool,
+    /// Use Metal GPU for calibration (default: on only for macOS with `metal-compute`)
+    pub use_metal_calibration: bool,
+    /// Maximum memory budget in GB (default: 64)
+    pub max_memory_gb: usize,
+}
+
+impl Default for PtBitnetConfig {
+    fn default() -> Self {
+        PtBitnetConfig {
+            block_size: 256, // elements per quantization block
+            calibration_samples: 1000,
+            optimize_scales: true,
+            layers_to_quantize: LayerMask::ExpertsOnly,
+            router_precision: Precision::FP16,
+            export_format: TernaryFormat::BitnetT158,
+            max_memory_gb: 64, // gigabytes
+            use_mmap: true,
+            use_metal_calibration: cfg!(all(target_os = "macos", feature = "metal-compute")),
+        }
+    }
+}
+
+/// Layer selection mask for quantization.
+///
+/// Determines which model layers to convert to ternary. Per ADR-017 (AD-2),
+/// the MoE router, embeddings, and LM head must remain in higher precision.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum LayerMask {
+    /// Only MoE expert FFN layers (recommended for Phase 1); the `PtBitnetConfig` default
+    ExpertsOnly,
+    /// All linear layers except router/embeddings/head
+    All,
+    /// Custom selection by layer-name patterns; NOTE(review): pattern syntax not defined here
+    Custom(Vec<String>),
+}
+
+/// Ternary tensor export format (selects the GGUF quantization type for serialization).
+///
+/// The type ids quoted below are documentation only; no discriminants are assigned here.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum TernaryFormat {
+    /// BitNet b1.58 native format (type 30); the `PtBitnetConfig` default
+    BitnetT158,
+    /// IQ1_S compatible format (type 19)
+    IQ1S,
+}
+
+/// Precision for non-quantized layers; the type of `PtBitnetConfig::router_precision`.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum Precision {
+    /// 16-bit floating point; the `PtBitnetConfig` default for `router_precision`
+    FP16,
+    /// Brain floating point 16 (bfloat16)
+    BF16,
+    /// 32-bit floating point (same width as the `f32` weights quantized in this file)
+    FP32,
+}
+
+/// Quantize one block of FP32 weights to ternary values using absmean scaling.
+///
+/// Implements the BitNet b1.58 quantization formula (epsilon = 1e-8):
+/// ```text
+/// gamma = mean(|block|) + epsilon
+/// normalized = block / gamma
+/// ternary = round(clamp(normalized, -1, 1))
+/// ```
+///
+/// # Arguments
+///
+/// * `block` - FP32 weight block (typically 256 elements)
+///
+/// # Returns
+///
+/// Tuple of (ternary values, scale factor):
+/// - `Vec<i8>`: one ternary weight in {-1, 0, +1} per input element
+/// - `f32`: absmean scale factor (gamma); `1e-8` when `block` is empty
+///
+/// # Example
+///
+/// ```rust,ignore
+/// use ruvllm::bitnet::absmean_ternary;
+///
+/// let weights = vec![0.5, -0.3, 0.8, -0.1, 0.0, 0.4];
+/// let (ternary, scale) = absmean_ternary(&weights);
+///
+/// println!("Scale: {}", scale);        // approximately 0.35
+/// println!("Ternary: {:?}", ternary);  // [1, -1, 1, 0, 0, 1]
+/// ```
+pub fn absmean_ternary(block: &[f32]) -> (Vec<i8>, f32) {
+    // Added to the mean so gamma stays non-zero even when every weight is zero.
+    const EPSILON: f32 = 1e-8;
+
+    // Nothing to quantize: hand back an empty vector with the epsilon scale.
+    if block.is_empty() {
+        return (Vec::new(), EPSILON);
+    }
+
+    // Absmean scale: gamma = mean(|w|) + epsilon.
+    let mean_abs = block.iter().map(|w| w.abs()).sum::<f32>() / block.len() as f32;
+    let gamma = mean_abs + EPSILON;
+
+    let mut ternary = Vec::with_capacity(block.len());
+    for &weight in block {
+        // Clamping to [-1, 1] before rounding restricts the result to -1, 0 or +1.
+        let level = (weight / gamma).clamp(-1.0, 1.0).round();
+        ternary.push(level as i8);
+    }
+
+    (ternary, gamma)
+}
+
+/// Quantize a flattened FP32 tensor into a block-wise ternary representation.
+///
+/// Splits the weights into consecutive blocks of `config.block_size` elements (the last
+/// may be shorter) and quantizes each block independently with `absmean_ternary`.
+///
+/// # Arguments
+///
+/// * `weights` - FP32 weight tensor (flattened)
+/// * `shape` - Tensor shape (rows, cols); `rows * cols` must equal `weights.len()`
+/// * `config` - Quantization configuration (only `block_size` is read here)
+///
+/// # Returns
+///
+/// `TernaryTensor` holding the `pack_ternary` output and one scale per block
+///
+/// # Errors
+///
+/// Returns `RuvLLMError::Model` if a dimension or `block_size` is zero, if `rows * cols` or
+/// the block count overflows, or if `weights.len()` does not equal `rows * cols`.
+///
+/// # Example
+///
+/// ```rust,ignore
+/// use ruvllm::bitnet::{quantize_tensor, PtBitnetConfig};
+///
+/// let weights = vec![0.5; 512];  // 512 FP32 weights
+/// let shape = (2, 256);
+/// let config = PtBitnetConfig::default();
+///
+/// let ternary = quantize_tensor(&weights, shape, &config)?;
+/// println!("Compressed to {} bytes", ternary.memory_bytes());
+/// ```
+pub fn quantize_tensor(
+    weights: &[f32],
+    shape: (usize, usize),
+    config: &PtBitnetConfig,
+) -> Result<TernaryTensor> {
+    let (rows, cols) = shape;
+    let block_size = config.block_size;
+
+    if rows == 0 || cols == 0 {
+        return Err(RuvLLMError::Model(format!(
+            "Invalid tensor shape: dimensions must be non-zero, got {:?}",
+            shape
+        )));
+    }
+    if block_size == 0 {
+        return Err(RuvLLMError::Model(
+            "block_size must be non-zero".to_string(),
+        ));
+    }
+
+    let total_elements = match rows.checked_mul(cols) {
+        Some(count) => count,
+        None => {
+            return Err(RuvLLMError::Model(format!(
+                "Integer overflow computing total elements for shape {:?}",
+                shape
+            )));
+        }
+    };
+    if weights.len() != total_elements {
+        return Err(RuvLLMError::Model(format!(
+            "Weight size mismatch: expected {} elements for shape {:?}, got {}",
+            total_elements,
+            shape,
+            weights.len()
+        )));
+    }
+
+    // Ceiling division; the checked add guards the `+ block_size - 1` step.
+    let num_blocks = match total_elements.checked_add(block_size - 1) {
+        Some(padded) => padded / block_size,
+        None => {
+            return Err(RuvLLMError::Model(
+                "Integer overflow in block count calculation".to_string(),
+            ));
+        }
+    };
+
+    let mut all_ternary: Vec<i8> = Vec::with_capacity(total_elements);
+    let mut scales = Vec::with_capacity(num_blocks);
+
+    // `chunks` yields consecutive `block_size` slices, with a shorter final one if needed.
+    for block in weights.chunks(block_size) {
+        let (ternary, scale) = absmean_ternary(block);
+        all_ternary.extend(ternary);
+        scales.push(scale);
+    }
+
+    Ok(TernaryTensor {
+        packed_data: pack_ternary(&all_ternary),
+        scales,
+        shape,
+        block_size,
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_absmean_ternary_simple() {
+        // Simple block with known values
+        let block = vec![0.5, -0.5, 0.0, 1.0, -1.0, 0.25];
+        let (ternary, scale) = absmean_ternary(&block);
+
+        // All values should be in {-1, 0, +1}
+        assert!(ternary.iter().all(|v| (-1i8..=1).contains(v)));
+
+        // Scale is the absmean (the 1e-8 epsilon is far below the tolerance)
+        assert!((scale - 3.25 / 6.0).abs() < 1e-6);
+
+        // Check every element:
+        // gamma ≈ (0.5 + 0.5 + 0.0 + 1.0 + 1.0 + 0.25) / 6 ≈ 0.542
+        // 0.5 / 0.542 ≈ 0.92 → round(0.92) = 1
+        // -0.5 / 0.542 ≈ -0.92 → round(-0.92) = -1
+        // 0.0 / 0.542 = 0 → round(0) = 0
+        // ±1.0 / 0.542 ≈ ±1.85 → clamp → ±1
+        // 0.25 / 0.542 ≈ 0.46 → round(0.46) = 0
+        assert_eq!(ternary, vec![1, -1, 0, 1, -1, 0]);
+    }
+
+    #[test]
+    fn test_absmean_ternary_all_zeros() {
+        let block = vec![0.0; 256];
+        let (ternary, scale) = absmean_ternary(&block);
+
+        // All should quantize to 0, one output per input
+        assert_eq!(ternary, vec![0i8; 256]);
+
+        // Scale should be exactly epsilon (0.0 + 1e-8)
+        assert_eq!(scale, 1e-8);
+    }
+
+    #[test]
+    fn test_absmean_ternary_large_values() {
+        let block = vec![10.0, -10.0, 5.0, -5.0];
+        let (ternary, scale) = absmean_ternary(&block);
+
+        // gamma = 7.5, so every |w| / gamma >= 0.5 rounds to ±1 with the sign of w
+        assert_eq!(ternary, vec![1, -1, 1, -1]);
+        assert!((scale - 7.5).abs() < 1e-6);
+    }
+
+    #[test]
+    fn test_quantize_tensor_simple() {
+        let weights = vec![0.5; 512]; // 512 identical weights
+        let shape = (2, 256);
+        let config = PtBitnetConfig::default();
+
+        let ternary = quantize_tensor(&weights, shape, &config).unwrap();
+
+        assert_eq!(ternary.shape, shape);
+        assert_eq!(ternary.block_size, 256);
+        assert_eq!(ternary.num_blocks(), 2); // 512 / 256 = 2 blocks
+        assert_eq!(ternary.scales.len(), 2);
+        // Each block holds only 0.5s, so its absmean scale is 0.5
+        assert!(ternary.scales.iter().all(|&s| (s - 0.5).abs() < 1e-6));
+        assert_eq!(ternary.packed_data.len(), 128); // 512 elements * 2 bits = 128 bytes
+    }
+
+    #[test]
+    fn test_quantize_tensor_size_mismatch() {
+        let weights = vec![0.5; 100]; // Wrong size
+        let shape = (2, 256); // Expects 512
+        let config = PtBitnetConfig::default();
+
+        let result = quantize_tensor(&weights, shape, &config);
+        assert!(result.is_err());
+    }
+
+    #[test]
+    fn test_quantize_tensor_memory_savings() {
+        // Quantize a 1MB FP32 tensor (256K elements)
+        let weights = vec![0.5; 256 * 1024];
+        let shape = (512, 512);
+        let config = PtBitnetConfig::default();
+
+        let ternary = quantize_tensor(&weights, shape, &config).unwrap();
+
+        let original_bytes = weights.len() * 4; // FP32
+        let compressed_bytes = ternary.memory_bytes();
+
+        // Should be ~16x compression (32 bits → 2 bits + scale overhead)
+        let compression_ratio = original_bytes as f32 / compressed_bytes as f32;
+        assert!(compression_ratio > 10.0); // At least 10x compression
+        assert!(compression_ratio < 20.0); // Less than 20x (due to scales)
+    }
+
+    #[test]
+    fn test_config_default() {
+        let config = PtBitnetConfig::default();
+        assert_eq!(config.block_size, 256);
+        assert_eq!(config.calibration_samples, 1000);
+        assert!(config.optimize_scales);
+        assert_eq!(config.layers_to_quantize, LayerMask::ExpertsOnly);
+    }
+
+    #[test]
+    fn test_layer_mask_variants() {
+        let experts = LayerMask::ExpertsOnly;
+        let all = LayerMask::All;
+        let custom = LayerMask::Custom(vec!["layer.0".to_string()]);
+
+        assert_ne!(experts, all);
+        assert_ne!(all, custom);
+        assert_ne!(experts, custom);
+    }
+}