Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'

This commit is contained in:
ruv
2026-02-28 14:39:40 -05:00
7854 changed files with 3522914 additions and 0 deletions

View File

@@ -0,0 +1,175 @@
//! FlashAttention demonstration
//!
//! Shows how to use FlashAttention-style tiled attention for CPU inference.
use ruvector_mincut_gated_transformer::flash_attention::{
flash_attention_forward, flash_attention_forward_i8, flash_mha, FlashAttentionConfig,
};
/// Entry point of the FlashAttention demo.
///
/// Prints the configuration built for a 64-dim head, then runs four examples
/// in order (single-head, multi-head, INT8-quantized, long-sequence) and
/// writes a short summary of each to stdout.
fn main() {
    // Deterministic stand-in for random data: values climb in 0.01 steps and
    // wrap around every 100 elements.
    let ramp = |len: usize| -> Vec<f32> {
        (0..len).map(|idx| ((idx % 100) as f32) * 0.01).collect()
    };

    println!("=== FlashAttention CPU Demo ===\n");

    // Configuration shared by examples 1-3.
    let config = FlashAttentionConfig::for_head_dim(64);
    println!("Configuration:");
    println!(" Block size (Q): {}", config.block_size_q);
    println!(" Block size (KV): {}", config.block_size_kv);
    println!(" Head dimension: {}", config.head_dim);
    println!(" Causal masking: {}", config.causal);
    println!(" Softmax scale: {:.4}\n", config.softmax_scale);

    // Example 1: one head, Q/K/V all filled with the same ramp.
    {
        println!("Example 1: Single-head attention (128 tokens, 64 dims)");
        let seq_len = 128;
        let head_dim = 64;
        let elems = seq_len * head_dim;

        let q = ramp(elems);
        let k = ramp(elems);
        let v = ramp(elems);
        let mut output = vec![0.0f32; elems];

        // The same length is passed for both sequence-length arguments.
        flash_attention_forward(&config, &q, &k, &v, seq_len, seq_len, &mut output);

        println!(" ✓ Computed attention output: {} elements", output.len());
        println!(" ✓ First 5 output values: {:?}\n", &output[..5]);
    }

    // Example 2: eight heads stored back to back in one flat buffer.
    {
        println!("Example 2: Multi-head attention (8 heads, 64 tokens, 64 dims)");
        let num_heads = 8;
        let seq_len = 64;
        let head_dim = 64;
        let total_size = num_heads * seq_len * head_dim;

        let q = ramp(total_size);
        let k = ramp(total_size);
        let v = ramp(total_size);
        let mut output = vec![0.0f32; total_size];

        flash_mha(&config, &q, &k, &v, num_heads, seq_len, seq_len, &mut output);

        println!(
            " ✓ Computed multi-head attention: {} elements",
            output.len()
        );
        println!(" ✓ Output per head: {} elements", seq_len * head_dim);
        println!(" ✓ First 5 output values: {:?}\n", &output[..5]);
    }

    // Example 3: quantize the FP32 ramp to INT8 and run the i8 kernel.
    {
        println!("Example 3: INT8 quantized attention (64 tokens, 64 dims)");
        let seq_len = 64;
        let head_dim = 64;
        let elems = seq_len * head_dim;

        // Divide by the scale, round, and saturate into the i8 range.
        let quantize = |values: &[f32], scale: f32| -> Vec<i8> {
            values
                .iter()
                .map(|&x| (x / scale).round().clamp(-128.0, 127.0) as i8)
                .collect()
        };

        // One scale per tensor; all three use the same value here.
        let q_scale = 0.01f32;
        let k_scale = 0.01f32;
        let v_scale = 0.01f32;

        let q_i8 = quantize(&ramp(elems), q_scale);
        let k_i8 = quantize(&ramp(elems), k_scale);
        let v_i8 = quantize(&ramp(elems), v_scale);
        let mut output = vec![0.0f32; elems];

        flash_attention_forward_i8(
            &config, &q_i8, &k_i8, &v_i8, q_scale, k_scale, v_scale, seq_len, seq_len,
            &mut output,
        );

        println!(" ✓ Computed INT8 quantized attention");
        println!(" ✓ Memory savings: 4× (INT8 vs FP32)");
        println!(" ✓ First 5 output values: {:?}\n", &output[..5]);
    }

    // Example 4: the long-sequence preset applied to 512 tokens.
    {
        println!("Example 4: Optimized config for long sequences (512 tokens)");
        let long_config = FlashAttentionConfig::for_long_sequence(64);
        println!(
            " Block size (Q): {} (smaller for cache reuse)",
            long_config.block_size_q
        );
        println!(
            " Block size (KV): {} (larger for efficiency)",
            long_config.block_size_kv
        );

        let seq_len = 512;
        let head_dim = 64;
        let elems = seq_len * head_dim;

        let q = ramp(elems);
        let k = ramp(elems);
        let v = ramp(elems);
        let mut output = vec![0.0f32; elems];

        flash_attention_forward(&long_config, &q, &k, &v, seq_len, seq_len, &mut output);

        println!(" ✓ Computed attention for {} tokens", seq_len);
        println!(" ✓ Memory efficient: O(n) instead of O(n²)");
        println!(" ✓ Cache efficient: Tiled for L1/L2 cache\n");
    }

    println!("=== All examples completed successfully! ===");
}

View File

@@ -0,0 +1,116 @@
//! Example demonstrating Mamba State Space Model usage.
//!
//! This example shows:
//! 1. Creating and configuring a Mamba layer
//! 2. Single-step (recurrent) inference
//! 3. Sequence processing
//! 4. State persistence across timesteps
use ruvector_mincut_gated_transformer::mamba::{MambaConfig, MambaLayer, MambaState, MambaWeights};
/// Entry point of the Mamba example.
///
/// Builds a configuration, a layer and empty weights, then walks through
/// single-step inference, step-by-step sequential inference, whole-sequence
/// inference, and a state-reset check, printing the results of each.
fn main() {
    println!("=== Mamba State Space Model Example ===\n");

    // Hyper-parameters used by every example below.
    let config = MambaConfig {
        d_model: 128,
        d_state: 16,
        d_conv: 4,
        expand: 2,
        dt_rank: 16,
        dt_min: 0.001,
        dt_max: 0.1,
    };

    println!("Configuration:");
    println!(" Model dimension: {}", config.d_model);
    println!(" State dimension: {}", config.d_state);
    println!(" Inner dimension: {}", config.d_inner());
    println!(" Convolution width: {}", config.d_conv);
    println!();

    // The layer takes its own copy of the config; the weights are the
    // library's "empty" set.
    let layer = MambaLayer::new(config.clone());
    let weights = MambaWeights::empty(&config);

    // Parameter count, itemised per weight tensor.
    let d_inner = config.d_inner();
    let in_proj = config.d_model * d_inner * 2;
    let conv1d = d_inner * config.d_conv;
    let x_proj = d_inner * (config.dt_rank + config.d_state * 2);
    let dt_proj = config.dt_rank * d_inner;
    let a_log = d_inner * config.d_state;
    let d_skip = d_inner;
    let out_proj = d_inner * config.d_model;
    let param_count = in_proj + conv1d + x_proj + dt_proj + a_log + d_skip + out_proj;
    println!("Layer created with {} parameters", param_count);
    println!();

    // Example 1: feed a single token through a fresh state.
    println!("Example 1: Single-step inference");
    let mut state = MambaState::new(&config);
    let token = vec![0.1; config.d_model];
    println!("Processing single token...");
    let result = layer.forward_step(&weights, &token, &mut state);
    let state_touched = state.h.iter().any(|&x| x != 0.0);
    println!(" Input shape: [{}]", token.len());
    println!(" Output shape: [{}]", result.len());
    println!(" State updated: {}", state_touched);
    println!();

    // Example 2: several steps that all share one state object.
    println!("Example 2: Sequential processing");
    let mut state = MambaState::new(&config);
    let sequence_length = 5;
    for step in 0..sequence_length {
        // The input magnitude grows with the step index: 0.1, 0.2, ...
        let token = vec![0.1 * (step as f32 + 1.0); config.d_model];
        let result = layer.forward_step(&weights, &token, &mut state);
        println!(" Step {}: output[0] = {:.6}", step, result[0]);
    }
    println!();

    // Example 3: hand the whole sequence to the layer in one call.
    println!("Example 3: Sequence mode (parallel)");
    let seq_len = 4;
    let input_seq = vec![0.2; seq_len * config.d_model];
    println!("Processing sequence of length {}...", seq_len);
    let output_seq = layer.forward_sequence(&weights, &input_seq, seq_len);
    println!(" Input shape: [{}, {}]", seq_len, config.d_model);
    println!(" Output shape: [{}, {}]", seq_len, config.d_model);
    println!(" First output: {:.6}", output_seq[0]);
    println!();

    // Example 4: after a reset, replaying the first input is compared
    // against the very first output.
    println!("Example 4: State persistence and reset");
    let mut state = MambaState::new(&config);
    let first_input = vec![0.5; config.d_model];
    let second_input = vec![0.3; config.d_model];

    let first_out = layer.forward_step(&weights, &first_input, &mut state);
    println!(" First forward: output[0] = {:.6}", first_out[0]);

    let second_out = layer.forward_step(&weights, &second_input, &mut state);
    println!(" Second forward: output[0] = {:.6}", second_out[0]);

    state.reset();
    let replay_out = layer.forward_step(&weights, &first_input, &mut state);
    println!(" After reset: output[0] = {:.6}", replay_out[0]);

    let drift = (first_out[0] - replay_out[0]).abs();
    println!(" Matches first: {}", drift < 1e-5);
    println!();

    // Closing summary; the state size is read from the last state used above.
    println!("Performance Characteristics:");
    println!(" Complexity per step: O(N) vs O(N²) for attention");
    println!(" Memory per step: O(1) vs O(N) for attention");
    let state_floats = state.h.len() + state.conv_state.len();
    println!(" State size: {} floats", state_floats);
    println!();

    println!("=== Example Complete ===");
}

View File

@@ -0,0 +1,274 @@
//! Example: Scoring mode with gate packets and spike packets.
//!
//! Demonstrates the primary use case: classification, routing, tool selection,
//! and anomaly scoring under mincut-gated coherence control.
use ruvector_mincut_gated_transformer::{
GateDecision, GatePacket, GatePolicy, InferInput, InferOutput, MincutGatedTransformer,
QuantizedWeights, SpikePacket, TransformerConfig,
};
/// Entry point of the scorer example.
///
/// Builds a transformer from the micro configuration with the default gate
/// policy and empty weights, prints that configuration, then pushes nine
/// gate/spike scenarios through `run_inference` one after another.
fn main() {
    println!("=== Mincut Gated Transformer Scorer Example ===\n");

    // Micro config (suitable for edge deployment), default policy, empty weights.
    let config = TransformerConfig::micro();
    let mut transformer = MincutGatedTransformer::new(
        config.clone(),
        GatePolicy::default(),
        QuantizedWeights::empty(&config),
    )
    .expect("Failed to create transformer");

    println!("Transformer Configuration:");
    println!(" Sequence length: {}", config.seq_len_max);
    println!(" Hidden dimension: {}", config.hidden);
    println!(" Heads: {}", config.heads);
    println!(" Layers: {}", config.layers);
    println!(" Window: {}", config.window_normal);
    println!(" Buffer size: {} bytes\n", config.total_buffer_bytes());

    // Scenarios 1-5 are written as deltas from this packet, which is also
    // the "normal operation" input itself.
    let baseline = GatePacket {
        lambda: 100,
        lambda_prev: 95,
        boundary_edges: 5,
        boundary_concentration_q15: 8192, // ~25%
        partition_count: 3,
        flags: 0,
    };

    // Scenario 1: normal operation (high coherence).
    println!("--- Scenario 1: Normal Operation (High Coherence) ---");
    run_inference(&mut transformer, &config, baseline, None, "normal");

    // Scenario 2: boundary spike (reduced scope).
    println!("\n--- Scenario 2: Boundary Spike (Reduced Scope) ---");
    let boundary_gate = GatePacket {
        boundary_edges: 30, // Above threshold - triggers ReduceScope
        boundary_concentration_q15: 16000,
        partition_count: 5,
        ..baseline
    };
    run_inference(
        &mut transformer,
        &config,
        boundary_gate,
        None,
        "boundary_spike",
    );

    // Scenario 3: lambda drop (flush KV).
    println!("\n--- Scenario 3: Lambda Drop (Flush KV) ---");
    let drop_gate = GatePacket {
        lambda: 40,
        lambda_prev: 100, // 60% drop
        ..baseline
    };
    run_inference(&mut transformer, &config, drop_gate, None, "lambda_drop");

    // Scenario 4: low coherence (quarantine).
    println!("\n--- Scenario 4: Low Coherence (Quarantine) ---");
    let low_gate = GatePacket {
        lambda: 10, // Below minimum
        lambda_prev: 50,
        ..baseline
    };
    run_inference(&mut transformer, &config, low_gate, None, "low_coherence");

    // Scenario 5: baseline values with the force-safe flag raised.
    println!("\n--- Scenario 5: Force Safe Mode ---");
    let safe_gate = GatePacket {
        flags: GatePacket::FLAG_FORCE_SAFE,
        ..baseline
    };
    run_inference(&mut transformer, &config, safe_gate, None, "force_safe");

    // Scenario 6: skip flag; every field not listed comes from Default.
    println!("\n--- Scenario 6: Skip Mode ---");
    let skip_gate = GatePacket {
        lambda: 100,
        flags: GatePacket::FLAG_SKIP,
        ..Default::default()
    };
    run_inference(&mut transformer, &config, skip_gate, None, "skip");

    // Scenarios 7-9 share one gate packet and vary only the spike packet.
    let spike_gate = GatePacket {
        lambda: 100,
        lambda_prev: 95,
        boundary_edges: 5,
        ..Default::default()
    };

    // Places four leading entries into an otherwise zeroed 16-slot array.
    let head_padded = |head: [u16; 4]| -> [u16; 16] {
        let mut slots = [0u16; 16];
        slots[..4].copy_from_slice(&head);
        slots
    };

    // Scenario 7: fired spike carrying four indices with their weights.
    println!("\n--- Scenario 7: Active Spike Packet ---");
    let active_spike = SpikePacket {
        fired: 1,
        rate_q15: 10000,
        novelty_q15: 15000,
        top_len: 4,
        top_idx: head_padded([2, 5, 10, 15]),
        top_w_q15: head_padded([16384, 8192, 4096, 2048]),
        flags: SpikePacket::FLAG_SPARSE_MASK,
    };
    run_inference(
        &mut transformer,
        &config,
        spike_gate,
        Some(active_spike),
        "spike_active",
    );

    // Scenario 8: spike packet present but not fired.
    println!("\n--- Scenario 8: Inactive Spike Packet (Skip) ---");
    let inactive_spike = SpikePacket {
        fired: 0, // Not fired
        rate_q15: 500,
        novelty_q15: 1000,
        ..Default::default()
    };
    run_inference(
        &mut transformer,
        &config,
        spike_gate,
        Some(inactive_spike),
        "spike_inactive",
    );

    // Scenario 9: spike storm.
    println!("\n--- Scenario 9: Spike Storm (Freeze) ---");
    let storm_spike = SpikePacket {
        fired: 1,
        rate_q15: 30000, // Very high rate
        novelty_q15: 5000,
        ..Default::default()
    };
    run_inference(
        &mut transformer,
        &config,
        spike_gate,
        Some(storm_spike),
        "spike_storm",
    );

    println!("\n=== Example Complete ===");
}
/// Runs one inference pass for a scenario and prints what the gate decided.
///
/// The transformer is reset first, then fed a fixed 16-token input
/// (`0..16`) together with `gate` and, when present, `spike`. On success the
/// witness, the stats and a suggested orchestrator action are printed; on
/// failure only the error is printed.
///
/// * `transformer` - model to reset and run.
/// * `config` - supplies `logits`, the length of the output buffer.
/// * `gate` - gate packet attached to the input.
/// * `spike` - optional spike packet attached to the input.
/// * `scenario` - label echoed in the printed report.
fn run_inference(
    transformer: &mut MincutGatedTransformer,
    config: &TransformerConfig,
    gate: GatePacket,
    spike: Option<SpikePacket>,
    scenario: &str,
) {
    // Reset first so this run starts from a cleared transformer.
    transformer.reset();

    // Fixed token ids 0..16, with the spike packet attached only if given.
    let tokens: Vec<u32> = (0..16).collect();
    let gated_input = InferInput::from_tokens(&tokens, gate);
    let input = match spike {
        Some(sp) => gated_input.with_spikes(sp),
        None => gated_input,
    };

    // Output buffer sized by the configured logit count.
    let mut logits = vec![0i32; config.logits as usize];
    let mut output = InferOutput::new(&mut logits);

    // Report the error and stop; everything below is the success path.
    if let Err(e) = transformer.infer(&input, &mut output) {
        println!(" Error: {:?}", e);
        return;
    }

    let witness = &output.witness;
    let stats = &output.stats;
    let label = |on: bool| if on { "enabled" } else { "disabled" };

    println!(" Scenario: {}", scenario);
    println!(" Decision: {:?}", witness.decision);
    println!(" Reason: {:?}", witness.reason);
    println!(
        " Lambda: {} -> {} (delta: {})",
        witness.lambda_prev, witness.lambda, witness.lambda_delta
    );
    println!(
        " Effective seq_len: {}, window: {}",
        witness.effective_seq_len, witness.effective_window
    );
    // The write flags are treated as enabled only when they equal 1.
    println!(
        " KV writes: {}, External writes: {}",
        label(witness.kv_writes_enabled == 1),
        label(witness.external_writes_enabled == 1)
    );
    println!(
        " Stats: tier={}, layers={}, skipped={}",
        stats.tier, stats.layers_executed, stats.skipped
    );

    // Map the gate decision to the follow-up an orchestrator would take.
    let action = match witness.decision {
        GateDecision::Allow => "Proceed with tool execution and memory persistence",
        GateDecision::ReduceScope => "Proceed with reduced confidence, skip risky tools",
        GateDecision::FlushKv => "Clear context, rebuild state from fresh inputs",
        GateDecision::FreezeWrites => "Read-only mode, defer all state changes",
        GateDecision::QuarantineUpdates => "Discard results, request human review",
    };
    println!(" Orchestrator action: {}", action);
}