Files
wifi-densepose/crates/ruvllm/src/bitnet/quantizer.rs
ruv d803bfe2b1 Squashed 'vendor/ruvector/' content from commit b64c2172
git-subtree-dir: vendor/ruvector
git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
2026-02-28 14:39:40 -05:00

366 lines
11 KiB
Rust

//! PT-BitNet Post-Training Quantization
//!
//! Core absmean ternary quantization algorithm for converting FP32 weights
//! to BitNet b1.58 ternary format.
use super::ternary_tensor::{pack_ternary, TernaryTensor};
use crate::error::{Result, RuvLLMError};
/// Settings that drive PT-BitNet post-training quantization.
///
/// Bundles the block size, calibration options, layer selection, export
/// format and resource limits for a quantization run.
///
/// # Example
///
/// ```rust,ignore
/// use ruvllm::bitnet::PtBitnetConfig;
///
/// let config = PtBitnetConfig {
///     calibration_samples: 1000,
///     block_size: 256,
///     optimize_scales: true,
///     layers_to_quantize: LayerMask::ExpertsOnly,
///     export_format: TernaryFormat::BitnetT158,
///     ..Default::default()
/// };
/// ```
#[derive(Clone, Debug)]
pub struct PtBitnetConfig {
    /// How many calibration samples to use when optimizing scales.
    pub calibration_samples: usize,

    /// Number of elements in each quantization block.
    ///
    /// `quantize_tensor` rejects a value of zero.
    pub block_size: usize,

    /// Whether scale factors are optimized via calibration.
    pub optimize_scales: bool,

    /// Selection of layers that get quantized.
    pub layers_to_quantize: LayerMask,

    /// Ternary format used when exporting for GGUF serialization.
    pub export_format: TernaryFormat,

    /// Precision kept for router and shared layers.
    pub router_precision: Precision,

    /// Load weights through memory-mapped I/O.
    pub use_mmap: bool,

    /// Run calibration on the Metal GPU (Mac Studio only).
    pub use_metal_calibration: bool,

    /// Upper bound on memory usage, in GB.
    pub max_memory_gb: usize,
}
impl Default for PtBitnetConfig {
fn default() -> Self {
Self {
calibration_samples: 1000,
block_size: 256,
optimize_scales: true,
layers_to_quantize: LayerMask::ExpertsOnly,
export_format: TernaryFormat::BitnetT158,
router_precision: Precision::FP16,
use_mmap: true,
use_metal_calibration: cfg!(all(target_os = "macos", feature = "metal-compute")),
max_memory_gb: 64,
}
}
}
/// Selects which model layers are converted to ternary.
///
/// Per ADR-017 (AD-2), the MoE router, embeddings, and LM head must remain
/// in higher precision.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum LayerMask {
    /// Quantize only the MoE expert FFN layers (recommended for Phase 1).
    ExpertsOnly,

    /// Quantize every linear layer except router/embeddings/head.
    All,

    /// Quantize layers chosen by name pattern.
    ///
    /// The strings are carried as-is; how they are matched against layer
    /// names is decided by the code that consumes this mask.
    Custom(Vec<String>),
}
/// On-disk format for exported ternary tensors.
///
/// Selects the GGUF quantization type used for serialization.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum TernaryFormat {
    /// Native BitNet b1.58 format (type 30).
    BitnetT158,

    /// Format compatible with IQ1_S (type 19).
    IQ1S,
}
/// Numeric precision used for layers that are not quantized.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Precision {
    /// IEEE 16-bit floating point.
    FP16,

    /// Brain floating point, 16 bits.
    BF16,

    /// 32-bit floating point.
    FP32,
}
/// Core absmean ternary quantization algorithm.
///
/// Implements the BitNet b1.58 quantization formula:
/// ```text
/// gamma      = mean(|block|) + epsilon
/// normalized = block / gamma
/// ternary    = round(clamp(normalized, -1, 1))
/// ```
///
/// The absolute values are accumulated in `f64` so that a block of large but
/// finite weights cannot overflow the sum to infinity (which would produce an
/// infinite scale and an all-zero ternary block).
///
/// Non-finite inputs are not sanitized: a `NaN` or infinite weight makes the
/// mean, and therefore the returned scale, non-finite.
///
/// # Arguments
///
/// * `block` - FP32 weight block (typically 256 elements)
///
/// # Returns
///
/// Tuple of (ternary values, scale factor):
/// - `Vec<i8>`: Ternary weights in {-1, 0, +1}, one per input element
/// - `f32`: Absmean scale factor (gamma); `1e-8` for an empty block
///
/// # Example
///
/// ```rust,ignore
/// use ruvllm::bitnet::absmean_ternary;
///
/// let weights = vec![0.5, -0.3, 0.8, -0.1, 0.0, 0.4];
/// let (ternary, scale) = absmean_ternary(&weights);
///
/// println!("Scale: {}", scale);
/// println!("Ternary: {:?}", ternary); // e.g., [1, -1, 1, 0, 0, 1]
/// ```
pub fn absmean_ternary(block: &[f32]) -> (Vec<i8>, f32) {
    // Keeps the divisor strictly positive even when every weight is zero.
    const EPSILON: f32 = 1e-8;

    // Guard: an empty block has no mean; return an empty result with the
    // epsilon scale.
    if block.is_empty() {
        return (Vec::new(), EPSILON);
    }

    // gamma = mean(|W|) + epsilon. Summing in f64 avoids both overflow and
    // most rounding drift; the mean of finite f32 values always fits in f32.
    let sum_abs: f64 = block.iter().map(|&w| f64::from(w.abs())).sum();
    let gamma = (sum_abs / block.len() as f64) as f32 + EPSILON;

    // Normalize, clamp to [-1, 1], then round to the nearest of {-1, 0, +1}.
    // `round` rounds half away from zero, so |w| >= gamma / 2 maps to +/-1.
    let ternary = block
        .iter()
        .map(|&w| (w / gamma).clamp(-1.0, 1.0).round() as i8)
        .collect();

    (ternary, gamma)
}
/// Convert a flattened FP32 tensor into its ternary representation.
///
/// The flat weight buffer is cut into consecutive runs of
/// `config.block_size` elements (the last run may be shorter), and each run
/// is quantized independently with [`absmean_ternary`]. Blocks follow the
/// flat layout, so a block may span a row boundary.
///
/// # Arguments
///
/// * `weights` - FP32 weight tensor (flattened)
/// * `shape` - Tensor shape (rows, cols)
/// * `config` - Quantization configuration; only `block_size` is read here
///
/// # Returns
///
/// `TernaryTensor` with packed 2-bit data and one scale per block
///
/// # Errors
///
/// Returns `RuvLLMError::Model` when:
/// - either dimension of `shape` is zero,
/// - `config.block_size` is zero,
/// - `rows * cols` overflows `usize`,
/// - `weights.len()` differs from `rows * cols`,
/// - the block-count computation overflows `usize`.
///
/// # Example
///
/// ```rust,ignore
/// use ruvllm::bitnet::{quantize_tensor, PtBitnetConfig};
///
/// let weights = vec![0.5; 512]; // 512 FP32 weights
/// let shape = (2, 256);
/// let config = PtBitnetConfig::default();
///
/// let ternary = quantize_tensor(&weights, shape, &config)?;
/// println!("Compressed to {} bytes", ternary.memory_bytes());
/// ```
pub fn quantize_tensor(
    weights: &[f32],
    shape: (usize, usize),
    config: &PtBitnetConfig,
) -> Result<TernaryTensor> {
    let (rows, cols) = shape;
    if rows == 0 || cols == 0 {
        return Err(RuvLLMError::Model(format!(
            "Invalid tensor shape: dimensions must be non-zero, got {:?}",
            shape
        )));
    }

    let block_size = config.block_size;
    if block_size == 0 {
        return Err(RuvLLMError::Model(
            "block_size must be non-zero".to_string(),
        ));
    }

    let total_elements = match rows.checked_mul(cols) {
        Some(count) => count,
        None => {
            return Err(RuvLLMError::Model(format!(
                "Integer overflow computing total elements for shape {:?}",
                shape
            )));
        }
    };

    if weights.len() != total_elements {
        return Err(RuvLLMError::Model(format!(
            "Weight size mismatch: expected {} elements for shape {:?}, got {}",
            total_elements,
            shape,
            weights.len()
        )));
    }

    // Ceiling division, with the rounding term checked so it cannot wrap.
    let num_blocks = match total_elements.checked_add(block_size - 1) {
        Some(padded) => padded / block_size,
        None => {
            return Err(RuvLLMError::Model(
                "Integer overflow in block count calculation".to_string(),
            ));
        }
    };

    let mut scales = Vec::with_capacity(num_blocks);
    let mut ternary_values: Vec<i8> = Vec::with_capacity(total_elements);

    // `chunks` yields full blocks followed by one shorter tail block when the
    // element count is not a multiple of `block_size`.
    for chunk in weights.chunks(block_size) {
        let (values, gamma) = absmean_ternary(chunk);
        ternary_values.extend(values);
        scales.push(gamma);
    }

    // Pack the {-1, 0, +1} values into the 2-bit representation.
    Ok(TernaryTensor {
        packed_data: pack_ternary(&ternary_values),
        scales,
        shape,
        block_size,
    })
}
#[cfg(test)]
mod tests {
    //! Unit tests for the absmean quantizer and its configuration types.

    use super::*;

    #[test]
    fn test_absmean_ternary_simple() {
        // Simple block with known values
        let block = vec![0.5, -0.5, 0.0, 1.0, -1.0, 0.25];
        let (ternary, scale) = absmean_ternary(&block);

        // All values should be in {-1, 0, +1}
        assert!(ternary.iter().all(|v| (-1..=1).contains(v)));

        // gamma = (0.5 + 0.5 + 0.0 + 1.0 + 1.0 + 0.25) / 6 ≈ 0.542
        assert!(scale > 0.0);
        assert!((scale - 3.25 / 6.0).abs() < 1e-6);

        //  0.5  / 0.542 ≈  0.92 →  1
        // -0.5  / 0.542 ≈ -0.92 → -1
        //  0.0  / 0.542 =  0    →  0
        //  1.0  / 0.542 ≈  1.85 → clamp →  1
        // -1.0  / 0.542 ≈ -1.85 → clamp → -1
        //  0.25 / 0.542 ≈  0.46 →  0
        assert_eq!(ternary, vec![1, -1, 0, 1, -1, 0]);
    }

    #[test]
    fn test_absmean_ternary_empty() {
        let (ternary, scale) = absmean_ternary(&[]);

        // Empty input yields no ternary values and the epsilon scale
        assert!(ternary.is_empty());
        assert!(scale < 1e-7 && scale > 0.0);
    }

    #[test]
    fn test_absmean_ternary_all_zeros() {
        let block = vec![0.0; 256];
        let (ternary, scale) = absmean_ternary(&block);

        // All should quantize to 0
        assert_eq!(ternary.len(), 256);
        assert!(ternary.iter().all(|&v| v == 0));

        // Scale should be epsilon (1e-8)
        assert!(scale < 1e-7 && scale > 0.0);
    }

    #[test]
    fn test_absmean_ternary_large_values() {
        let block = vec![10.0, -10.0, 5.0, -5.0];
        let (ternary, _scale) = absmean_ternary(&block);

        // gamma = 7.5, so ±10 clamp to ±1 and ±5 (≈ ±0.67) round to ±1;
        // the sign of every input must be preserved.
        assert_eq!(ternary, vec![1, -1, 1, -1]);
    }

    #[test]
    fn test_quantize_tensor_simple() {
        let weights = vec![0.5; 512]; // 512 identical weights
        let shape = (2, 256);
        let config = PtBitnetConfig::default();

        let ternary = quantize_tensor(&weights, shape, &config).unwrap();

        assert_eq!(ternary.shape, shape);
        assert_eq!(ternary.block_size, 256);
        assert_eq!(ternary.num_blocks(), 2); // 512 / 256 = 2 blocks
        assert_eq!(ternary.scales.len(), 2);

        // 512 elements packed in 2 bits each = 128 bytes
        assert_eq!(ternary.packed_data.len(), 128);
    }

    #[test]
    fn test_quantize_tensor_partial_final_block() {
        // 300 elements with 256-element blocks: one full block plus a
        // 44-element tail, each with its own scale.
        let weights = vec![0.5; 300];
        let shape = (3, 100);
        let config = PtBitnetConfig::default();

        let ternary = quantize_tensor(&weights, shape, &config).unwrap();

        assert_eq!(ternary.shape, shape);
        assert_eq!(ternary.scales.len(), 2);
        assert!(ternary.scales.iter().all(|&s| (s - 0.5).abs() < 1e-6));
    }

    #[test]
    fn test_quantize_tensor_size_mismatch() {
        let weights = vec![0.5; 100]; // Wrong size
        let shape = (2, 256); // Expects 512
        let config = PtBitnetConfig::default();

        let result = quantize_tensor(&weights, shape, &config);
        assert!(result.is_err());
    }

    #[test]
    fn test_quantize_tensor_zero_dimension() {
        let weights: Vec<f32> = Vec::new();
        let config = PtBitnetConfig::default();

        assert!(quantize_tensor(&weights, (0, 256), &config).is_err());
        assert!(quantize_tensor(&weights, (256, 0), &config).is_err());
    }

    #[test]
    fn test_quantize_tensor_zero_block_size() {
        let weights = vec![0.5; 512];
        let config = PtBitnetConfig {
            block_size: 0,
            ..Default::default()
        };

        assert!(quantize_tensor(&weights, (2, 256), &config).is_err());
    }

    #[test]
    fn test_quantize_tensor_memory_savings() {
        // Quantize a 1MB FP32 tensor (256K elements)
        let weights = vec![0.5; 256 * 1024];
        let shape = (512, 512);
        let config = PtBitnetConfig::default();

        let ternary = quantize_tensor(&weights, shape, &config).unwrap();

        let original_bytes = weights.len() * 4; // FP32
        let compressed_bytes = ternary.memory_bytes();

        // Should be ~16x compression (32 bits → 2 bits + scale overhead)
        let compression_ratio = original_bytes as f32 / compressed_bytes as f32;
        assert!(compression_ratio > 10.0); // At least 10x compression
        assert!(compression_ratio < 20.0); // Less than 20x (due to scales)
    }

    #[test]
    fn test_config_default() {
        let config = PtBitnetConfig::default();

        assert_eq!(config.block_size, 256);
        assert_eq!(config.calibration_samples, 1000);
        assert!(config.optimize_scales);
        assert_eq!(config.layers_to_quantize, LayerMask::ExpertsOnly);
    }

    #[test]
    fn test_layer_mask_variants() {
        let experts = LayerMask::ExpertsOnly;
        let all = LayerMask::All;
        let custom = LayerMask::Custom(vec!["layer.0".to_string()]);

        assert_ne!(experts, all);
        assert_ne!(all, custom);
        assert_ne!(experts, custom);
    }
}