Squashed 'vendor/ruvector/' content from commit b64c2172
git-subtree-dir: vendor/ruvector git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
This commit is contained in:
365
crates/ruvllm/src/bitnet/quantizer.rs
Normal file
365
crates/ruvllm/src/bitnet/quantizer.rs
Normal file
@@ -0,0 +1,365 @@
|
||||
//! PT-BitNet Post-Training Quantization
|
||||
//!
|
||||
//! Core absmean ternary quantization algorithm for converting FP32 weights
|
||||
//! to BitNet b1.58 ternary format.
|
||||
|
||||
use super::ternary_tensor::{pack_ternary, TernaryTensor};
|
||||
use crate::error::{Result, RuvLLMError};
|
||||
|
||||
/// Configuration for PT-BitNet post-training quantization.
|
||||
///
|
||||
/// Controls the quantization process behavior, including block size,
|
||||
/// calibration, and layer selection.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```rust,ignore
|
||||
/// use ruvllm::bitnet::PtBitnetConfig;
|
||||
///
|
||||
/// let config = PtBitnetConfig {
|
||||
/// calibration_samples: 1000,
|
||||
/// block_size: 256,
|
||||
/// optimize_scales: true,
|
||||
/// layers_to_quantize: LayerMask::ExpertsOnly,
|
||||
/// export_format: TernaryFormat::BitnetT158,
|
||||
/// ..Default::default()
|
||||
/// };
|
||||
/// ```
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct PtBitnetConfig {
|
||||
/// Number of calibration samples for scale optimization
|
||||
pub calibration_samples: usize,
|
||||
/// Elements per quantization block
|
||||
pub block_size: usize,
|
||||
/// Enable scale factor optimization via calibration
|
||||
pub optimize_scales: bool,
|
||||
/// Which layers to quantize
|
||||
pub layers_to_quantize: LayerMask,
|
||||
/// Export format for GGUF serialization
|
||||
pub export_format: TernaryFormat,
|
||||
/// Precision for router and shared layers
|
||||
pub router_precision: Precision,
|
||||
/// Use memory-mapped I/O for weight loading
|
||||
pub use_mmap: bool,
|
||||
/// Use Metal GPU for calibration (Mac Studio only)
|
||||
pub use_metal_calibration: bool,
|
||||
/// Maximum memory budget in GB
|
||||
pub max_memory_gb: usize,
|
||||
}
|
||||
|
||||
impl Default for PtBitnetConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
calibration_samples: 1000,
|
||||
block_size: 256,
|
||||
optimize_scales: true,
|
||||
layers_to_quantize: LayerMask::ExpertsOnly,
|
||||
export_format: TernaryFormat::BitnetT158,
|
||||
router_precision: Precision::FP16,
|
||||
use_mmap: true,
|
||||
use_metal_calibration: cfg!(all(target_os = "macos", feature = "metal-compute")),
|
||||
max_memory_gb: 64,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Layer selection mask for quantization.
///
/// Determines which model layers to convert to ternary. Per ADR-017 (AD-2),
/// the MoE router, embeddings, and LM head must remain in higher precision.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LayerMask {
    /// Only MoE expert FFN layers (recommended for Phase 1).
    ExpertsOnly,
    /// All linear layers except router/embeddings/head.
    All,
    /// Custom layer selection by name pattern.
    ///
    /// NOTE(review): how the patterns are matched against layer names is
    /// decided by the consumer of this mask, not by this type.
    Custom(Vec<String>),
}
|
||||
|
||||
/// Ternary tensor export format.
///
/// Determines the GGUF quantization type used for serialization.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TernaryFormat {
    /// BitNet b1.58 native format (type 30).
    BitnetT158,
    /// IQ1_S compatible format (type 19).
    IQ1S,
}
|
||||
|
||||
/// Precision for non-quantized layers.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Precision {
    /// 16-bit floating point.
    FP16,
    /// Brain floating point 16.
    BF16,
    /// 32-bit floating point.
    FP32,
}
|
||||
|
||||
/// Core absmean ternary quantization algorithm.
///
/// Implements the BitNet b1.58 quantization formula:
/// ```text
/// gamma      = mean(|block|) + epsilon
/// normalized = block / gamma
/// ternary    = round(clamp(normalized, -1, 1))
/// ```
///
/// # Arguments
///
/// * `block` - FP32 weight block (typically 256 elements)
///
/// # Returns
///
/// Tuple of (ternary values, scale factor):
/// - `Vec<i8>`: Ternary weights in {-1, 0, +1}, one per input element
/// - `f32`: Absmean scale factor (gamma), including the `1e-8` epsilon
///
/// An empty `block` yields an empty vector and a scale equal to the epsilon.
/// Non-finite inputs are not filtered; they propagate into the returned scale.
///
/// # Example
///
/// ```rust,ignore
/// use ruvllm::bitnet::absmean_ternary;
///
/// let weights = vec![0.5, -0.3, 0.8, -0.1, 0.0, 0.4];
/// let (ternary, scale) = absmean_ternary(&weights);
///
/// println!("Scale: {}", scale);
/// println!("Ternary: {:?}", ternary); // e.g., [1, -1, 1, 0, 0, 1]
/// ```
pub fn absmean_ternary(block: &[f32]) -> (Vec<i8>, f32) {
    // Keeps the scale strictly positive so an all-zero block never divides by zero.
    const EPSILON: f32 = 1e-8;

    // Guard: an empty block has no mean; report the bare epsilon as its scale.
    if block.is_empty() {
        return (Vec::new(), EPSILON);
    }

    // Absmean scale: gamma = mean(|W|) + epsilon.
    let sum_abs: f32 = block.iter().map(|w| w.abs()).sum();
    let gamma = sum_abs / block.len() as f32 + EPSILON;

    // Normalize by gamma, clip to [-1, 1], then round to the nearest of {-1, 0, +1}.
    let ternary = block
        .iter()
        .map(|&w| (w / gamma).clamp(-1.0, 1.0).round() as i8)
        .collect();

    (ternary, gamma)
}
|
||||
|
||||
/// Quantize a full FP32 tensor to ternary representation.
|
||||
///
|
||||
/// Processes the input tensor in blocks of `config.block_size`, applying
|
||||
/// absmean quantization to each block independently.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `weights` - FP32 weight tensor (flattened)
|
||||
/// * `shape` - Tensor shape (rows, cols)
|
||||
/// * `config` - Quantization configuration
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// `TernaryTensor` with packed 2-bit data and per-block scales
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns an error if the weight dimensions are invalid.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```rust,ignore
|
||||
/// use ruvllm::bitnet::{quantize_tensor, PtBitnetConfig};
|
||||
///
|
||||
/// let weights = vec![0.5; 512]; // 512 FP32 weights
|
||||
/// let shape = (2, 256);
|
||||
/// let config = PtBitnetConfig::default();
|
||||
///
|
||||
/// let ternary = quantize_tensor(&weights, shape, &config)?;
|
||||
/// println!("Compressed to {} bytes", ternary.memory_bytes());
|
||||
/// ```
|
||||
pub fn quantize_tensor(
|
||||
weights: &[f32],
|
||||
shape: (usize, usize),
|
||||
config: &PtBitnetConfig,
|
||||
) -> Result<TernaryTensor> {
|
||||
let (rows, cols) = shape;
|
||||
|
||||
if rows == 0 || cols == 0 {
|
||||
return Err(RuvLLMError::Model(format!(
|
||||
"Invalid tensor shape: dimensions must be non-zero, got {:?}",
|
||||
shape
|
||||
)));
|
||||
}
|
||||
|
||||
let block_size = config.block_size;
|
||||
if block_size == 0 {
|
||||
return Err(RuvLLMError::Model(
|
||||
"block_size must be non-zero".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
let total_elements = rows.checked_mul(cols).ok_or_else(|| {
|
||||
RuvLLMError::Model(format!(
|
||||
"Integer overflow computing total elements for shape {:?}",
|
||||
shape
|
||||
))
|
||||
})?;
|
||||
|
||||
if weights.len() != total_elements {
|
||||
return Err(RuvLLMError::Model(format!(
|
||||
"Weight size mismatch: expected {} elements for shape {:?}, got {}",
|
||||
total_elements,
|
||||
shape,
|
||||
weights.len()
|
||||
)));
|
||||
}
|
||||
|
||||
// Use checked arithmetic to prevent overflow in block count
|
||||
let num_blocks = total_elements.checked_add(block_size - 1).ok_or_else(|| {
|
||||
RuvLLMError::Model("Integer overflow in block count calculation".to_string())
|
||||
})? / block_size;
|
||||
|
||||
let mut all_ternary = Vec::with_capacity(total_elements);
|
||||
let mut scales = Vec::with_capacity(num_blocks);
|
||||
|
||||
// Process each block
|
||||
for block_idx in 0..num_blocks {
|
||||
let start = block_idx * block_size;
|
||||
let end = (start + block_size).min(total_elements);
|
||||
let block = &weights[start..end];
|
||||
|
||||
let (ternary, scale) = absmean_ternary(block);
|
||||
all_ternary.extend_from_slice(&ternary);
|
||||
scales.push(scale);
|
||||
}
|
||||
|
||||
// Pack ternary values into 2-bit representation
|
||||
let packed_data = pack_ternary(&all_ternary);
|
||||
|
||||
Ok(TernaryTensor {
|
||||
packed_data,
|
||||
scales,
|
||||
shape,
|
||||
block_size,
|
||||
})
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Known six-element block: checks the value range, the scale sign and
    /// the exact ternary output.
    #[test]
    fn test_absmean_ternary_simple() {
        let block = vec![0.5, -0.5, 0.0, 1.0, -1.0, 0.25];
        let (ternary, scale) = absmean_ternary(&block);

        // All values should be in {-1, 0, +1}
        assert!(ternary.iter().all(|v| (-1..=1).contains(v)));

        // Scale should be positive
        assert!(scale > 0.0);

        // gamma ≈ (0.5 + 0.5 + 0.0 + 1.0 + 1.0 + 0.25) / 6 ≈ 0.542
        //  0.5  / 0.542 ≈  0.92 → 1
        // -0.5  / 0.542 ≈ -0.92 → -1
        //  0.0  / 0.542 =  0    → 0
        // ±1.0  / 0.542 clamps to ±1
        //  0.25 / 0.542 ≈  0.46 → 0
        assert_eq!(ternary, vec![1, -1, 0, 1, -1, 0]);
    }

    /// An all-zero block quantizes to zeros and reports the bare epsilon as scale.
    #[test]
    fn test_absmean_ternary_all_zeros() {
        let block = vec![0.0; 256];
        let (ternary, scale) = absmean_ternary(&block);

        // All should quantize to 0
        assert!(ternary.iter().all(|&v| v == 0));

        // Scale should be epsilon (1e-8)
        assert!(scale < 1e-7 && scale > 0.0);
    }

    /// Large magnitudes still land on ±1 after normalization.
    #[test]
    fn test_absmean_ternary_large_values() {
        let block = vec![10.0, -10.0, 5.0, -5.0];
        let (ternary, _scale) = absmean_ternary(&block);

        // gamma = 7.5: ±10 clamps to ±1, and ±5 / 7.5 ≈ ±0.67 rounds to ±1
        assert_eq!(ternary, vec![1, -1, 1, -1]);
    }

    /// Two full blocks: checks shape, block bookkeeping and packed size.
    #[test]
    fn test_quantize_tensor_simple() {
        let weights = vec![0.5; 512]; // 512 identical weights
        let shape = (2, 256);
        let config = PtBitnetConfig::default();

        let ternary = quantize_tensor(&weights, shape, &config).unwrap();

        assert_eq!(ternary.shape, shape);
        assert_eq!(ternary.block_size, 256);
        assert_eq!(ternary.num_blocks(), 2); // 512 / 256 = 2 blocks
        assert_eq!(ternary.scales.len(), 2);

        // 512 elements packed in 2 bits each = 128 bytes
        assert_eq!(ternary.packed_data.len(), 128);
    }

    /// A weight slice whose length disagrees with the shape is rejected.
    #[test]
    fn test_quantize_tensor_size_mismatch() {
        let weights = vec![0.5; 100]; // Wrong size
        let shape = (2, 256); // Expects 512
        let config = PtBitnetConfig::default();

        let result = quantize_tensor(&weights, shape, &config);
        assert!(result.is_err());
    }

    /// Compression ratio for a 1 MB FP32 tensor stays in the expected band.
    #[test]
    fn test_quantize_tensor_memory_savings() {
        // Quantize a 1MB FP32 tensor (256K elements)
        let weights = vec![0.5; 256 * 1024];
        let shape = (512, 512);
        let config = PtBitnetConfig::default();

        let ternary = quantize_tensor(&weights, shape, &config).unwrap();

        let original_bytes = weights.len() * 4; // FP32
        let compressed_bytes = ternary.memory_bytes();

        // Should be ~16x compression (32 bits → 2 bits + scale overhead)
        let compression_ratio = original_bytes as f32 / compressed_bytes as f32;
        assert!(compression_ratio > 10.0); // At least 10x compression
        assert!(compression_ratio < 20.0); // Less than 20x (due to scales)
    }

    /// Spot-checks the documented defaults.
    #[test]
    fn test_config_default() {
        let config = PtBitnetConfig::default();
        assert_eq!(config.block_size, 256);
        assert_eq!(config.calibration_samples, 1000);
        assert!(config.optimize_scales);
        assert_eq!(config.layers_to_quantize, LayerMask::ExpertsOnly);
    }

    /// The three mask variants compare as distinct values.
    #[test]
    fn test_layer_mask_variants() {
        let experts = LayerMask::ExpertsOnly;
        let all = LayerMask::All;
        let custom = LayerMask::Custom(vec!["layer.0".to_string()]);

        assert_ne!(experts, all);
        assert_ne!(all, custom);
        assert_ne!(experts, custom);
    }
}
|
||||
Reference in New Issue
Block a user