Squashed 'vendor/ruvector/' content from commit b64c2172
git-subtree-dir: vendor/ruvector git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
This commit is contained in:
@@ -0,0 +1,175 @@
|
||||
//! FlashAttention demonstration
|
||||
//!
|
||||
//! Shows how to use FlashAttention-style tiled attention for CPU inference.
|
||||
|
||||
use ruvector_mincut_gated_transformer::flash_attention::{
|
||||
flash_attention_forward, flash_attention_forward_i8, flash_mha, FlashAttentionConfig,
|
||||
};
|
||||
|
||||
fn main() {
|
||||
println!("=== FlashAttention CPU Demo ===\n");
|
||||
|
||||
// Configuration for 64-dim attention head
|
||||
let config = FlashAttentionConfig::for_head_dim(64);
|
||||
println!("Configuration:");
|
||||
println!(" Block size (Q): {}", config.block_size_q);
|
||||
println!(" Block size (KV): {}", config.block_size_kv);
|
||||
println!(" Head dimension: {}", config.head_dim);
|
||||
println!(" Causal masking: {}", config.causal);
|
||||
println!(" Softmax scale: {:.4}\n", config.softmax_scale);
|
||||
|
||||
// Example 1: Single-head attention
|
||||
{
|
||||
println!("Example 1: Single-head attention (128 tokens, 64 dims)");
|
||||
|
||||
let seq_len = 128;
|
||||
let head_dim = 64;
|
||||
|
||||
// Create random-like input (deterministic for demo)
|
||||
let q: Vec<f32> = (0..seq_len * head_dim)
|
||||
.map(|i| ((i % 100) as f32) * 0.01)
|
||||
.collect();
|
||||
let k: Vec<f32> = (0..seq_len * head_dim)
|
||||
.map(|i| ((i % 100) as f32) * 0.01)
|
||||
.collect();
|
||||
let v: Vec<f32> = (0..seq_len * head_dim)
|
||||
.map(|i| ((i % 100) as f32) * 0.01)
|
||||
.collect();
|
||||
|
||||
let mut output = vec![0.0f32; seq_len * head_dim];
|
||||
|
||||
flash_attention_forward(&config, &q, &k, &v, seq_len, seq_len, &mut output);
|
||||
|
||||
println!(" ✓ Computed attention output: {} elements", output.len());
|
||||
println!(" ✓ First 5 output values: {:?}\n", &output[0..5]);
|
||||
}
|
||||
|
||||
// Example 2: Multi-head attention
|
||||
{
|
||||
println!("Example 2: Multi-head attention (8 heads, 64 tokens, 64 dims)");
|
||||
|
||||
let num_heads = 8;
|
||||
let seq_len = 64;
|
||||
let head_dim = 64;
|
||||
|
||||
let total_size = num_heads * seq_len * head_dim;
|
||||
let q: Vec<f32> = (0..total_size).map(|i| ((i % 100) as f32) * 0.01).collect();
|
||||
let k: Vec<f32> = (0..total_size).map(|i| ((i % 100) as f32) * 0.01).collect();
|
||||
let v: Vec<f32> = (0..total_size).map(|i| ((i % 100) as f32) * 0.01).collect();
|
||||
|
||||
let mut output = vec![0.0f32; total_size];
|
||||
|
||||
flash_mha(
|
||||
&config,
|
||||
&q,
|
||||
&k,
|
||||
&v,
|
||||
num_heads,
|
||||
seq_len,
|
||||
seq_len,
|
||||
&mut output,
|
||||
);
|
||||
|
||||
println!(
|
||||
" ✓ Computed multi-head attention: {} elements",
|
||||
output.len()
|
||||
);
|
||||
println!(" ✓ Output per head: {} elements", seq_len * head_dim);
|
||||
println!(" ✓ First 5 output values: {:?}\n", &output[0..5]);
|
||||
}
|
||||
|
||||
// Example 3: INT8 quantized attention
|
||||
{
|
||||
println!("Example 3: INT8 quantized attention (64 tokens, 64 dims)");
|
||||
|
||||
let seq_len = 64;
|
||||
let head_dim = 64;
|
||||
|
||||
// Create FP32 data and quantize to INT8
|
||||
let q_f32: Vec<f32> = (0..seq_len * head_dim)
|
||||
.map(|i| ((i % 100) as f32) * 0.01)
|
||||
.collect();
|
||||
let k_f32: Vec<f32> = (0..seq_len * head_dim)
|
||||
.map(|i| ((i % 100) as f32) * 0.01)
|
||||
.collect();
|
||||
let v_f32: Vec<f32> = (0..seq_len * head_dim)
|
||||
.map(|i| ((i % 100) as f32) * 0.01)
|
||||
.collect();
|
||||
|
||||
// Quantization scales
|
||||
let q_scale = 0.01f32;
|
||||
let k_scale = 0.01f32;
|
||||
let v_scale = 0.01f32;
|
||||
|
||||
// Quantize to INT8
|
||||
let q_i8: Vec<i8> = q_f32
|
||||
.iter()
|
||||
.map(|&x| (x / q_scale).round().clamp(-128.0, 127.0) as i8)
|
||||
.collect();
|
||||
let k_i8: Vec<i8> = k_f32
|
||||
.iter()
|
||||
.map(|&x| (x / k_scale).round().clamp(-128.0, 127.0) as i8)
|
||||
.collect();
|
||||
let v_i8: Vec<i8> = v_f32
|
||||
.iter()
|
||||
.map(|&x| (x / v_scale).round().clamp(-128.0, 127.0) as i8)
|
||||
.collect();
|
||||
|
||||
let mut output = vec![0.0f32; seq_len * head_dim];
|
||||
|
||||
flash_attention_forward_i8(
|
||||
&config,
|
||||
&q_i8,
|
||||
&k_i8,
|
||||
&v_i8,
|
||||
q_scale,
|
||||
k_scale,
|
||||
v_scale,
|
||||
seq_len,
|
||||
seq_len,
|
||||
&mut output,
|
||||
);
|
||||
|
||||
println!(" ✓ Computed INT8 quantized attention");
|
||||
println!(" ✓ Memory savings: 4× (INT8 vs FP32)");
|
||||
println!(" ✓ First 5 output values: {:?}\n", &output[0..5]);
|
||||
}
|
||||
|
||||
// Example 4: Configuration for long sequences
|
||||
{
|
||||
println!("Example 4: Optimized config for long sequences (512 tokens)");
|
||||
|
||||
let long_config = FlashAttentionConfig::for_long_sequence(64);
|
||||
println!(
|
||||
" Block size (Q): {} (smaller for cache reuse)",
|
||||
long_config.block_size_q
|
||||
);
|
||||
println!(
|
||||
" Block size (KV): {} (larger for efficiency)",
|
||||
long_config.block_size_kv
|
||||
);
|
||||
|
||||
let seq_len = 512;
|
||||
let head_dim = 64;
|
||||
|
||||
let q: Vec<f32> = (0..seq_len * head_dim)
|
||||
.map(|i| ((i % 100) as f32) * 0.01)
|
||||
.collect();
|
||||
let k: Vec<f32> = (0..seq_len * head_dim)
|
||||
.map(|i| ((i % 100) as f32) * 0.01)
|
||||
.collect();
|
||||
let v: Vec<f32> = (0..seq_len * head_dim)
|
||||
.map(|i| ((i % 100) as f32) * 0.01)
|
||||
.collect();
|
||||
|
||||
let mut output = vec![0.0f32; seq_len * head_dim];
|
||||
|
||||
flash_attention_forward(&long_config, &q, &k, &v, seq_len, seq_len, &mut output);
|
||||
|
||||
println!(" ✓ Computed attention for {} tokens", seq_len);
|
||||
println!(" ✓ Memory efficient: O(n) instead of O(n²)");
|
||||
println!(" ✓ Cache efficient: Tiled for L1/L2 cache\n");
|
||||
}
|
||||
|
||||
println!("=== All examples completed successfully! ===");
|
||||
}
|
||||
@@ -0,0 +1,116 @@
|
||||
//! Example demonstrating Mamba State Space Model usage.
|
||||
//!
|
||||
//! This example shows:
|
||||
//! 1. Creating and configuring a Mamba layer
|
||||
//! 2. Single-step (recurrent) inference
|
||||
//! 3. Sequence processing
|
||||
//! 4. State persistence across timesteps
|
||||
|
||||
use ruvector_mincut_gated_transformer::mamba::{MambaConfig, MambaLayer, MambaState, MambaWeights};
|
||||
|
||||
fn main() {
|
||||
println!("=== Mamba State Space Model Example ===\n");
|
||||
|
||||
// Create configuration
|
||||
let config = MambaConfig {
|
||||
d_model: 128,
|
||||
d_state: 16,
|
||||
d_conv: 4,
|
||||
expand: 2,
|
||||
dt_rank: 16,
|
||||
dt_min: 0.001,
|
||||
dt_max: 0.1,
|
||||
};
|
||||
|
||||
println!("Configuration:");
|
||||
println!(" Model dimension: {}", config.d_model);
|
||||
println!(" State dimension: {}", config.d_state);
|
||||
println!(" Inner dimension: {}", config.d_inner());
|
||||
println!(" Convolution width: {}", config.d_conv);
|
||||
println!();
|
||||
|
||||
// Create layer and initialize weights
|
||||
let layer = MambaLayer::new(config.clone());
|
||||
let weights = MambaWeights::empty(&config);
|
||||
|
||||
println!("Layer created with {} parameters", {
|
||||
let d_inner = config.d_inner();
|
||||
config.d_model * d_inner * 2 // in_proj
|
||||
+ d_inner * config.d_conv // conv1d
|
||||
+ d_inner * (config.dt_rank + config.d_state * 2) // x_proj
|
||||
+ config.dt_rank * d_inner // dt_proj
|
||||
+ d_inner * config.d_state // a_log
|
||||
+ d_inner // d
|
||||
+ d_inner * config.d_model // out_proj
|
||||
});
|
||||
println!();
|
||||
|
||||
// Example 1: Single-step inference
|
||||
println!("Example 1: Single-step inference");
|
||||
let mut state = MambaState::new(&config);
|
||||
let input = vec![0.1; config.d_model];
|
||||
|
||||
println!("Processing single token...");
|
||||
let output = layer.forward_step(&weights, &input, &mut state);
|
||||
println!(" Input shape: [{}]", input.len());
|
||||
println!(" Output shape: [{}]", output.len());
|
||||
println!(" State updated: {}", state.h.iter().any(|&x| x != 0.0));
|
||||
println!();
|
||||
|
||||
// Example 2: Sequential processing with state
|
||||
println!("Example 2: Sequential processing");
|
||||
let mut state = MambaState::new(&config);
|
||||
let sequence_length = 5;
|
||||
|
||||
for t in 0..sequence_length {
|
||||
let input = vec![0.1 * (t as f32 + 1.0); config.d_model];
|
||||
let output = layer.forward_step(&weights, &input, &mut state);
|
||||
println!(" Step {}: output[0] = {:.6}", t, output[0]);
|
||||
}
|
||||
println!();
|
||||
|
||||
// Example 3: Sequence mode
|
||||
println!("Example 3: Sequence mode (parallel)");
|
||||
let seq_len = 4;
|
||||
let input_seq = vec![0.2; seq_len * config.d_model];
|
||||
|
||||
println!("Processing sequence of length {}...", seq_len);
|
||||
let output_seq = layer.forward_sequence(&weights, &input_seq, seq_len);
|
||||
println!(" Input shape: [{}, {}]", seq_len, config.d_model);
|
||||
println!(" Output shape: [{}, {}]", seq_len, config.d_model);
|
||||
println!(" First output: {:.6}", output_seq[0]);
|
||||
println!();
|
||||
|
||||
// Example 4: State reset
|
||||
println!("Example 4: State persistence and reset");
|
||||
let mut state = MambaState::new(&config);
|
||||
let input1 = vec![0.5; config.d_model];
|
||||
let input2 = vec![0.3; config.d_model];
|
||||
|
||||
let out1 = layer.forward_step(&weights, &input1, &mut state);
|
||||
println!(" First forward: output[0] = {:.6}", out1[0]);
|
||||
|
||||
let out2 = layer.forward_step(&weights, &input2, &mut state);
|
||||
println!(" Second forward: output[0] = {:.6}", out2[0]);
|
||||
|
||||
state.reset();
|
||||
let out1_reset = layer.forward_step(&weights, &input1, &mut state);
|
||||
println!(" After reset: output[0] = {:.6}", out1_reset[0]);
|
||||
println!(
|
||||
" Matches first: {}",
|
||||
(out1[0] - out1_reset[0]).abs() < 1e-5
|
||||
);
|
||||
println!();
|
||||
|
||||
// Performance characteristics
|
||||
println!("Performance Characteristics:");
|
||||
println!(" Complexity per step: O(N) vs O(N²) for attention");
|
||||
println!(" Memory per step: O(1) vs O(N) for attention");
|
||||
println!(
|
||||
" State size: {} floats",
|
||||
state.h.len() + state.conv_state.len()
|
||||
);
|
||||
println!();
|
||||
|
||||
println!("=== Example Complete ===");
|
||||
}
|
||||
274
crates/ruvector-mincut-gated-transformer/examples/scorer.rs
Normal file
274
crates/ruvector-mincut-gated-transformer/examples/scorer.rs
Normal file
@@ -0,0 +1,274 @@
|
||||
//! Example: Scoring mode with gate packets and spike packets.
|
||||
//!
|
||||
//! Demonstrates the primary use case: classification, routing, tool selection,
|
||||
//! and anomaly scoring under mincut-gated coherence control.
|
||||
|
||||
use ruvector_mincut_gated_transformer::{
|
||||
GateDecision, GatePacket, GatePolicy, InferInput, InferOutput, MincutGatedTransformer,
|
||||
QuantizedWeights, SpikePacket, TransformerConfig,
|
||||
};
|
||||
|
||||
fn main() {
|
||||
println!("=== Mincut Gated Transformer Scorer Example ===\n");
|
||||
|
||||
// Create transformer with micro config (suitable for edge deployment)
|
||||
let config = TransformerConfig::micro();
|
||||
let policy = GatePolicy::default();
|
||||
let weights = QuantizedWeights::empty(&config);
|
||||
|
||||
let mut transformer = MincutGatedTransformer::new(config.clone(), policy, weights)
|
||||
.expect("Failed to create transformer");
|
||||
|
||||
println!("Transformer Configuration:");
|
||||
println!(" Sequence length: {}", config.seq_len_max);
|
||||
println!(" Hidden dimension: {}", config.hidden);
|
||||
println!(" Heads: {}", config.heads);
|
||||
println!(" Layers: {}", config.layers);
|
||||
println!(" Window: {}", config.window_normal);
|
||||
println!(" Buffer size: {} bytes\n", config.total_buffer_bytes());
|
||||
|
||||
// Simulate different scenarios
|
||||
|
||||
// Scenario 1: Normal operation (high coherence)
|
||||
println!("--- Scenario 1: Normal Operation (High Coherence) ---");
|
||||
let gate_normal = GatePacket {
|
||||
lambda: 100,
|
||||
lambda_prev: 95,
|
||||
boundary_edges: 5,
|
||||
boundary_concentration_q15: 8192, // ~25%
|
||||
partition_count: 3,
|
||||
flags: 0,
|
||||
};
|
||||
|
||||
run_inference(&mut transformer, &config, gate_normal, None, "normal");
|
||||
|
||||
// Scenario 2: Boundary spike (reduced scope)
|
||||
println!("\n--- Scenario 2: Boundary Spike (Reduced Scope) ---");
|
||||
let gate_boundary = GatePacket {
|
||||
lambda: 100,
|
||||
lambda_prev: 95,
|
||||
boundary_edges: 30, // Above threshold - triggers ReduceScope
|
||||
boundary_concentration_q15: 16000,
|
||||
partition_count: 5,
|
||||
flags: 0,
|
||||
};
|
||||
|
||||
run_inference(
|
||||
&mut transformer,
|
||||
&config,
|
||||
gate_boundary,
|
||||
None,
|
||||
"boundary_spike",
|
||||
);
|
||||
|
||||
// Scenario 3: Lambda drop (flush KV)
|
||||
println!("\n--- Scenario 3: Lambda Drop (Flush KV) ---");
|
||||
let gate_drop = GatePacket {
|
||||
lambda: 40,
|
||||
lambda_prev: 100, // 60% drop
|
||||
boundary_edges: 5,
|
||||
boundary_concentration_q15: 8192,
|
||||
partition_count: 3,
|
||||
flags: 0,
|
||||
};
|
||||
|
||||
run_inference(&mut transformer, &config, gate_drop, None, "lambda_drop");
|
||||
|
||||
// Scenario 4: Low coherence (quarantine)
|
||||
println!("\n--- Scenario 4: Low Coherence (Quarantine) ---");
|
||||
let gate_low = GatePacket {
|
||||
lambda: 10, // Below minimum
|
||||
lambda_prev: 50,
|
||||
boundary_edges: 5,
|
||||
boundary_concentration_q15: 8192,
|
||||
partition_count: 3,
|
||||
flags: 0,
|
||||
};
|
||||
|
||||
run_inference(&mut transformer, &config, gate_low, None, "low_coherence");
|
||||
|
||||
// Scenario 5: Force safe mode
|
||||
println!("\n--- Scenario 5: Force Safe Mode ---");
|
||||
let gate_safe = GatePacket {
|
||||
lambda: 100,
|
||||
lambda_prev: 95,
|
||||
boundary_edges: 5,
|
||||
boundary_concentration_q15: 8192,
|
||||
partition_count: 3,
|
||||
flags: GatePacket::FLAG_FORCE_SAFE,
|
||||
};
|
||||
|
||||
run_inference(&mut transformer, &config, gate_safe, None, "force_safe");
|
||||
|
||||
// Scenario 6: Skip mode
|
||||
println!("\n--- Scenario 6: Skip Mode ---");
|
||||
let gate_skip = GatePacket {
|
||||
lambda: 100,
|
||||
flags: GatePacket::FLAG_SKIP,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
run_inference(&mut transformer, &config, gate_skip, None, "skip");
|
||||
|
||||
// Scenario 7: With spike packet (active)
|
||||
println!("\n--- Scenario 7: Active Spike Packet ---");
|
||||
let gate_spike = GatePacket {
|
||||
lambda: 100,
|
||||
lambda_prev: 95,
|
||||
boundary_edges: 5,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let spike_active = SpikePacket {
|
||||
fired: 1,
|
||||
rate_q15: 10000,
|
||||
novelty_q15: 15000,
|
||||
top_len: 4,
|
||||
top_idx: {
|
||||
let mut arr = [0u16; 16];
|
||||
arr[0] = 2;
|
||||
arr[1] = 5;
|
||||
arr[2] = 10;
|
||||
arr[3] = 15;
|
||||
arr
|
||||
},
|
||||
top_w_q15: {
|
||||
let mut arr = [0u16; 16];
|
||||
arr[0] = 16384;
|
||||
arr[1] = 8192;
|
||||
arr[2] = 4096;
|
||||
arr[3] = 2048;
|
||||
arr
|
||||
},
|
||||
flags: SpikePacket::FLAG_SPARSE_MASK,
|
||||
};
|
||||
|
||||
run_inference(
|
||||
&mut transformer,
|
||||
&config,
|
||||
gate_spike,
|
||||
Some(spike_active),
|
||||
"spike_active",
|
||||
);
|
||||
|
||||
// Scenario 8: With spike packet (inactive - skip)
|
||||
println!("\n--- Scenario 8: Inactive Spike Packet (Skip) ---");
|
||||
let spike_inactive = SpikePacket {
|
||||
fired: 0, // Not fired
|
||||
rate_q15: 500,
|
||||
novelty_q15: 1000,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
run_inference(
|
||||
&mut transformer,
|
||||
&config,
|
||||
gate_spike,
|
||||
Some(spike_inactive),
|
||||
"spike_inactive",
|
||||
);
|
||||
|
||||
// Scenario 9: Spike storm
|
||||
println!("\n--- Scenario 9: Spike Storm (Freeze) ---");
|
||||
let spike_storm = SpikePacket {
|
||||
fired: 1,
|
||||
rate_q15: 30000, // Very high rate
|
||||
novelty_q15: 5000,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
run_inference(
|
||||
&mut transformer,
|
||||
&config,
|
||||
gate_spike,
|
||||
Some(spike_storm),
|
||||
"spike_storm",
|
||||
);
|
||||
|
||||
println!("\n=== Example Complete ===");
|
||||
}
|
||||
|
||||
fn run_inference(
|
||||
transformer: &mut MincutGatedTransformer,
|
||||
config: &TransformerConfig,
|
||||
gate: GatePacket,
|
||||
spike: Option<SpikePacket>,
|
||||
scenario: &str,
|
||||
) {
|
||||
// Reset transformer state
|
||||
transformer.reset();
|
||||
|
||||
// Create input tokens
|
||||
let tokens: Vec<u32> = (0..16).collect();
|
||||
let mut input = InferInput::from_tokens(&tokens, gate);
|
||||
|
||||
if let Some(sp) = spike {
|
||||
input = input.with_spikes(sp);
|
||||
}
|
||||
|
||||
// Allocate output buffer
|
||||
let mut logits = vec![0i32; config.logits as usize];
|
||||
let mut output = InferOutput::new(&mut logits);
|
||||
|
||||
// Run inference
|
||||
let result = transformer.infer(&input, &mut output);
|
||||
|
||||
match result {
|
||||
Ok(()) => {
|
||||
let witness = &output.witness;
|
||||
let stats = &output.stats;
|
||||
|
||||
println!(" Scenario: {}", scenario);
|
||||
println!(" Decision: {:?}", witness.decision);
|
||||
println!(" Reason: {:?}", witness.reason);
|
||||
println!(
|
||||
" Lambda: {} -> {} (delta: {})",
|
||||
witness.lambda_prev, witness.lambda, witness.lambda_delta
|
||||
);
|
||||
println!(
|
||||
" Effective seq_len: {}, window: {}",
|
||||
witness.effective_seq_len, witness.effective_window
|
||||
);
|
||||
println!(
|
||||
" KV writes: {}, External writes: {}",
|
||||
if witness.kv_writes_enabled == 1 {
|
||||
"enabled"
|
||||
} else {
|
||||
"disabled"
|
||||
},
|
||||
if witness.external_writes_enabled == 1 {
|
||||
"enabled"
|
||||
} else {
|
||||
"disabled"
|
||||
}
|
||||
);
|
||||
println!(
|
||||
" Stats: tier={}, layers={}, skipped={}",
|
||||
stats.tier, stats.layers_executed, stats.skipped
|
||||
);
|
||||
|
||||
// Demonstrate orchestrator decision logic
|
||||
print!(" Orchestrator action: ");
|
||||
match witness.decision {
|
||||
GateDecision::Allow => {
|
||||
println!("Proceed with tool execution and memory persistence");
|
||||
}
|
||||
GateDecision::ReduceScope => {
|
||||
println!("Proceed with reduced confidence, skip risky tools");
|
||||
}
|
||||
GateDecision::FlushKv => {
|
||||
println!("Clear context, rebuild state from fresh inputs");
|
||||
}
|
||||
GateDecision::FreezeWrites => {
|
||||
println!("Read-only mode, defer all state changes");
|
||||
}
|
||||
GateDecision::QuarantineUpdates => {
|
||||
println!("Discard results, request human review");
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
println!(" Error: {:?}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user