Squashed 'vendor/ruvector/' content from commit b64c2172

git-subtree-dir: vendor/ruvector git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
2026-02-28 14:39:40 -05:00
commit d803bfe2b1
7854 changed files with 3522914 additions and 0 deletions
--- a/crates/ruvector-mincut-gated-transformer/examples/flash_attention_demo.rs
+++ b/crates/ruvector-mincut-gated-transformer/examples/flash_attention_demo.rs
@@ -0,0 +1,175 @@
+//! FlashAttention demonstration
+//!
+//! Shows how to use FlashAttention-style tiled attention for CPU inference.
+
+use ruvector_mincut_gated_transformer::flash_attention::{
+    flash_attention_forward, flash_attention_forward_i8, flash_mha, FlashAttentionConfig,
+};
+
+fn main() { // Runs four attention demos in sequence and prints a short summary for each.
+    println!("=== FlashAttention CPU Demo ===\n");
+
+    // Shared config built by `for_head_dim(64)`; Examples 1-3 below all reuse it.
+    let config = FlashAttentionConfig::for_head_dim(64);
+    println!("Configuration:");
+    println!("  Block size (Q): {}", config.block_size_q);
+    println!("  Block size (KV): {}", config.block_size_kv);
+    println!("  Head dimension: {}", config.head_dim);
+    println!("  Causal masking: {}", config.causal);
+    println!("  Softmax scale: {:.4}\n", config.softmax_scale);
+
+    // Example 1: Single-head attention; both length arguments to the call are seq_len.
+    {
+        println!("Example 1: Single-head attention (128 tokens, 64 dims)");
+
+        let seq_len = 128;
+        let head_dim = 64; // same value given to `for_head_dim` above
+
+        // Deterministic input, not random: (i % 100) * 0.01 per element; q, k, v are identical.
+        let q: Vec<f32> = (0..seq_len * head_dim)
+            .map(|i| ((i % 100) as f32) * 0.01)
+            .collect();
+        let k: Vec<f32> = (0..seq_len * head_dim)
+            .map(|i| ((i % 100) as f32) * 0.01)
+            .collect();
+        let v: Vec<f32> = (0..seq_len * head_dim)
+            .map(|i| ((i % 100) as f32) * 0.01)
+            .collect();
+
+        let mut output = vec![0.0f32; seq_len * head_dim]; // same element count as q
+
+        flash_attention_forward(&config, &q, &k, &v, seq_len, seq_len, &mut output);
+
+        println!("  ✓ Computed attention output: {} elements", output.len());
+        println!("  ✓ First 5 output values: {:?}\n", &output[0..5]);
+    }
+
+    // Example 2: Multi-head attention, reusing the 64-dim config from the top of main.
+    {
+        println!("Example 2: Multi-head attention (8 heads, 64 tokens, 64 dims)");
+
+        let num_heads = 8;
+        let seq_len = 64;
+        let head_dim = 64;
+
+        let total_size = num_heads * seq_len * head_dim; // one flat buffer covering all heads
+        let q: Vec<f32> = (0..total_size).map(|i| ((i % 100) as f32) * 0.01).collect();
+        let k: Vec<f32> = (0..total_size).map(|i| ((i % 100) as f32) * 0.01).collect();
+        let v: Vec<f32> = (0..total_size).map(|i| ((i % 100) as f32) * 0.01).collect();
+
+        let mut output = vec![0.0f32; total_size];
+
+        flash_mha(
+            &config,
+            &q,
+            &k,
+            &v,
+            num_heads,
+            seq_len,
+            seq_len,
+            &mut output,
+        );
+
+        println!(
+            "  ✓ Computed multi-head attention: {} elements",
+            output.len()
+        );
+        println!("  ✓ Output per head: {} elements", seq_len * head_dim);
+        println!("  ✓ First 5 output values: {:?}\n", &output[0..5]);
+    }
+
+    // Example 3: INT8 quantized attention (the output buffer is still f32)
+    {
+        println!("Example 3: INT8 quantized attention (64 tokens, 64 dims)");
+
+        let seq_len = 64;
+        let head_dim = 64;
+
+        // Create FP32 data (values roughly 0.00..=0.99) and quantize to INT8
+        let q_f32: Vec<f32> = (0..seq_len * head_dim)
+            .map(|i| ((i % 100) as f32) * 0.01)
+            .collect();
+        let k_f32: Vec<f32> = (0..seq_len * head_dim)
+            .map(|i| ((i % 100) as f32) * 0.01)
+            .collect();
+        let v_f32: Vec<f32> = (0..seq_len * head_dim)
+            .map(|i| ((i % 100) as f32) * 0.01)
+            .collect();
+
+        // Quantization scales: x / 0.01 yields roughly 0..=99 here, so the clamp below never bites.
+        let q_scale = 0.01f32;
+        let k_scale = 0.01f32;
+        let v_scale = 0.01f32;
+
+        // Quantize to INT8: divide by the scale, round, clamp to the i8 range, then cast.
+        let q_i8: Vec<i8> = q_f32
+            .iter()
+            .map(|&x| (x / q_scale).round().clamp(-128.0, 127.0) as i8)
+            .collect();
+        let k_i8: Vec<i8> = k_f32
+            .iter()
+            .map(|&x| (x / k_scale).round().clamp(-128.0, 127.0) as i8)
+            .collect();
+        let v_i8: Vec<i8> = v_f32
+            .iter()
+            .map(|&x| (x / v_scale).round().clamp(-128.0, 127.0) as i8)
+            .collect();
+
+        let mut output = vec![0.0f32; seq_len * head_dim];
+
+        flash_attention_forward_i8(
+            &config,
+            &q_i8,
+            &k_i8,
+            &v_i8,
+            q_scale,
+            k_scale,
+            v_scale,
+            seq_len,
+            seq_len,
+            &mut output,
+        );
+
+        println!("  ✓ Computed INT8 quantized attention");
+        println!("  ✓ Memory savings: 4× (INT8 vs FP32)");
+        println!("  ✓ First 5 output values: {:?}\n", &output[0..5]);
+    }
+
+    // Example 4: Configuration for long sequences (the 64 is presumably head_dim — TODO confirm)
+    {
+        println!("Example 4: Optimized config for long sequences (512 tokens)");
+
+        let long_config = FlashAttentionConfig::for_long_sequence(64);
+        println!( // the parenthetical notes in these two messages are fixed text, not derived
+            "  Block size (Q): {} (smaller for cache reuse)",
+            long_config.block_size_q
+        );
+        println!(
+            "  Block size (KV): {} (larger for efficiency)",
+            long_config.block_size_kv
+        );
+
+        let seq_len = 512;
+        let head_dim = 64;
+
+        let q: Vec<f32> = (0..seq_len * head_dim)
+            .map(|i| ((i % 100) as f32) * 0.01)
+            .collect();
+        let k: Vec<f32> = (0..seq_len * head_dim)
+            .map(|i| ((i % 100) as f32) * 0.01)
+            .collect();
+        let v: Vec<f32> = (0..seq_len * head_dim)
+            .map(|i| ((i % 100) as f32) * 0.01)
+            .collect();
+
+        let mut output = vec![0.0f32; seq_len * head_dim];
+
+        flash_attention_forward(&long_config, &q, &k, &v, seq_len, seq_len, &mut output);
+
+        println!("  ✓ Computed attention for {} tokens", seq_len);
+        println!("  ✓ Memory efficient: O(n) instead of O(n²)"); // static text, not measured here
+        println!("  ✓ Cache efficient: Tiled for L1/L2 cache\n"); // static text, not measured here
+    }
+
+    println!("=== All examples completed successfully! ===");
+}
--- a/crates/ruvector-mincut-gated-transformer/examples/mamba_example.rs
+++ b/crates/ruvector-mincut-gated-transformer/examples/mamba_example.rs
@@ -0,0 +1,116 @@
+//! Example demonstrating Mamba State Space Model usage.
+//!
+//! This example shows:
+//! 1. Creating and configuring a Mamba layer
+//! 2. Single-step (recurrent) inference
+//! 3. Sequence processing
+//! 4. State persistence across timesteps
+
+use ruvector_mincut_gated_transformer::mamba::{MambaConfig, MambaLayer, MambaState, MambaWeights};
+
+fn main() { // Demonstrates step-wise, sequence-mode and reset usage of a Mamba layer.
+    println!("=== Mamba State Space Model Example ===\n");
+
+    // Create configuration (every field is set explicitly; no `..` defaults)
+    let config = MambaConfig {
+        d_model: 128,
+        d_state: 16,
+        d_conv: 4,
+        expand: 2,
+        dt_rank: 16,
+        dt_min: 0.001,
+        dt_max: 0.1,
+    };
+
+    println!("Configuration:");
+    println!("  Model dimension: {}", config.d_model);
+    println!("  State dimension: {}", config.d_state);
+    println!("  Inner dimension: {}", config.d_inner()); // presumably d_model * expand — TODO confirm
+    println!("  Convolution width: {}", config.d_conv);
+    println!();
+
+    // Create layer and weights; `empty` presumably yields zeroed weights — TODO confirm
+    let layer = MambaLayer::new(config.clone()); // clone: config is read again below
+    let weights = MambaWeights::empty(&config);
+
+    println!("Layer created with {} parameters", { // computed from config, not read from the layer
+        let d_inner = config.d_inner();
+        config.d_model * d_inner * 2 // in_proj
+            + d_inner * config.d_conv // conv1d
+            + d_inner * (config.dt_rank + config.d_state * 2) // x_proj
+            + config.dt_rank * d_inner // dt_proj
+            + d_inner * config.d_state // a_log
+            + d_inner // d
+            + d_inner * config.d_model // out_proj
+    });
+    println!();
+
+    // Example 1: Single-step inference ("State updated" = any element of state.h is nonzero)
+    println!("Example 1: Single-step inference");
+    let mut state = MambaState::new(&config);
+    let input = vec![0.1; config.d_model];
+
+    println!("Processing single token...");
+    let output = layer.forward_step(&weights, &input, &mut state);
+    println!("  Input shape: [{}]", input.len());
+    println!("  Output shape: [{}]", output.len());
+    println!("  State updated: {}", state.h.iter().any(|&x| x != 0.0));
+    println!();
+
+    // Example 2: Sequential processing with state (fresh state; input value is 0.1 * (t + 1))
+    println!("Example 2: Sequential processing");
+    let mut state = MambaState::new(&config); // shadows the state used in Example 1
+    let sequence_length = 5;
+
+    for t in 0..sequence_length {
+        let input = vec![0.1 * (t as f32 + 1.0); config.d_model];
+        let output = layer.forward_step(&weights, &input, &mut state);
+        println!("  Step {}: output[0] = {:.6}", t, output[0]);
+    }
+    println!();
+
+    // Example 3: Sequence mode (one forward_sequence call; no MambaState is passed in)
+    println!("Example 3: Sequence mode (parallel)");
+    let seq_len = 4;
+    let input_seq = vec![0.2; seq_len * config.d_model];
+
+    println!("Processing sequence of length {}...", seq_len);
+    let output_seq = layer.forward_sequence(&weights, &input_seq, seq_len);
+    println!("  Input shape: [{}, {}]", seq_len, config.d_model);
+    println!("  Output shape: [{}, {}]", seq_len, config.d_model); // not output_seq.len()
+    println!("  First output: {:.6}", output_seq[0]);
+    println!();
+
+    // Example 4: State reset (re-run input1 after reset() and compare within 1e-5)
+    println!("Example 4: State persistence and reset");
+    let mut state = MambaState::new(&config);
+    let input1 = vec![0.5; config.d_model];
+    let input2 = vec![0.3; config.d_model];
+
+    let out1 = layer.forward_step(&weights, &input1, &mut state);
+    println!("  First forward: output[0] = {:.6}", out1[0]);
+
+    let out2 = layer.forward_step(&weights, &input2, &mut state);
+    println!("  Second forward: output[0] = {:.6}", out2[0]);
+
+    state.reset();
+    let out1_reset = layer.forward_step(&weights, &input1, &mut state);
+    println!("  After reset: output[0] = {:.6}", out1_reset[0]);
+    println!(
+        "  Matches first: {}",
+        (out1[0] - out1_reset[0]).abs() < 1e-5 // only element 0 is compared
+    );
+    println!();
+
+    // Performance characteristics (fixed text; only the state size below is computed)
+    println!("Performance Characteristics:");
+    println!("  Complexity per step: O(N) vs O(N²) for attention");
+    println!("  Memory per step: O(1) vs O(N) for attention");
+    println!(
+        "  State size: {} floats",
+        state.h.len() + state.conv_state.len() // `state` is the one from Example 4
+    );
+    println!();
+
+    println!("=== Example Complete ===");
+}
--- a/crates/ruvector-mincut-gated-transformer/examples/scorer.rs
+++ b/crates/ruvector-mincut-gated-transformer/examples/scorer.rs
@@ -0,0 +1,274 @@
+//! Example: Scoring mode with gate packets and spike packets.
+//!
+//! Demonstrates the primary use case: classification, routing, tool selection,
+//! and anomaly scoring under mincut-gated coherence control.
+
+use ruvector_mincut_gated_transformer::{
+    GateDecision, GatePacket, GatePolicy, InferInput, InferOutput, MincutGatedTransformer,
+    QuantizedWeights, SpikePacket, TransformerConfig,
+};
+
+fn main() { // Builds one transformer and feeds it nine hand-written gate/spike scenarios.
+    println!("=== Mincut Gated Transformer Scorer Example ===\n");
+
+    // Create transformer: micro config (intended for edge deployment), default policy, empty weights
+    let config = TransformerConfig::micro();
+    let policy = GatePolicy::default();
+    let weights = QuantizedWeights::empty(&config);
+
+    let mut transformer = MincutGatedTransformer::new(config.clone(), policy, weights)
+        .expect("Failed to create transformer"); // config is cloned because it is read again below
+
+    println!("Transformer Configuration:");
+    println!("  Sequence length: {}", config.seq_len_max);
+    println!("  Hidden dimension: {}", config.hidden);
+    println!("  Heads: {}", config.heads);
+    println!("  Layers: {}", config.layers);
+    println!("  Window: {}", config.window_normal);
+    println!("  Buffer size: {} bytes\n", config.total_buffer_bytes());
+
+    // Simulate different scenarios; each goes through run_inference, which resets state first
+
+    // Scenario 1: Normal operation (high coherence)
+    println!("--- Scenario 1: Normal Operation (High Coherence) ---");
+    let gate_normal = GatePacket {
+        lambda: 100,
+        lambda_prev: 95,
+        boundary_edges: 5,
+        boundary_concentration_q15: 8192, // ~25% of Q15 full scale (8192 / 32768)
+        partition_count: 3,
+        flags: 0,
+    };
+
+    run_inference(&mut transformer, &config, gate_normal, None, "normal");
+
+    // Scenario 2: Boundary spike (reduced scope)
+    println!("\n--- Scenario 2: Boundary Spike (Reduced Scope) ---");
+    let gate_boundary = GatePacket {
+        lambda: 100,
+        lambda_prev: 95,
+        boundary_edges: 30, // Meant to exceed the policy threshold and trigger ReduceScope
+        boundary_concentration_q15: 16000,
+        partition_count: 5,
+        flags: 0,
+    };
+
+    run_inference(
+        &mut transformer,
+        &config,
+        gate_boundary,
+        None,
+        "boundary_spike",
+    );
+
+    // Scenario 3: Lambda drop (flush KV)
+    println!("\n--- Scenario 3: Lambda Drop (Flush KV) ---");
+    let gate_drop = GatePacket {
+        lambda: 40,
+        lambda_prev: 100, // 100 -> 40: a 60% drop
+        boundary_edges: 5,
+        boundary_concentration_q15: 8192,
+        partition_count: 3,
+        flags: 0,
+    };
+
+    run_inference(&mut transformer, &config, gate_drop, None, "lambda_drop");
+
+    // Scenario 4: Low coherence (quarantine)
+    println!("\n--- Scenario 4: Low Coherence (Quarantine) ---");
+    let gate_low = GatePacket {
+        lambda: 10, // Meant to fall below the policy minimum
+        lambda_prev: 50,
+        boundary_edges: 5,
+        boundary_concentration_q15: 8192,
+        partition_count: 3,
+        flags: 0,
+    };
+
+    run_inference(&mut transformer, &config, gate_low, None, "low_coherence");
+
+    // Scenario 5: Force safe mode (same values as Scenario 1 except for the flag)
+    println!("\n--- Scenario 5: Force Safe Mode ---");
+    let gate_safe = GatePacket {
+        lambda: 100,
+        lambda_prev: 95,
+        boundary_edges: 5,
+        boundary_concentration_q15: 8192,
+        partition_count: 3,
+        flags: GatePacket::FLAG_FORCE_SAFE,
+    };
+
+    run_inference(&mut transformer, &config, gate_safe, None, "force_safe");
+
+    // Scenario 6: Skip mode
+    println!("\n--- Scenario 6: Skip Mode ---");
+    let gate_skip = GatePacket {
+        lambda: 100,
+        flags: GatePacket::FLAG_SKIP,
+        ..Default::default() // other fields keep their Default values
+    };
+
+    run_inference(&mut transformer, &config, gate_skip, None, "skip");
+
+    // Scenario 7: With spike packet (active); gate_spike is reused by scenarios 8 and 9
+    println!("\n--- Scenario 7: Active Spike Packet ---");
+    let gate_spike = GatePacket { // passed by value three times below, so GatePacket must be Copy
+        lambda: 100,
+        lambda_prev: 95,
+        boundary_edges: 5,
+        ..Default::default()
+    };
+
+    let spike_active = SpikePacket {
+        fired: 1,
+        rate_q15: 10000,
+        novelty_q15: 15000,
+        top_len: 4, // four slots of top_idx / top_w_q15 are filled below
+        top_idx: {
+            let mut arr = [0u16; 16];
+            arr[0] = 2;
+            arr[1] = 5;
+            arr[2] = 10;
+            arr[3] = 15;
+            arr
+        },
+        top_w_q15: {
+            let mut arr = [0u16; 16];
+            arr[0] = 16384;
+            arr[1] = 8192;
+            arr[2] = 4096;
+            arr[3] = 2048;
+            arr
+        },
+        flags: SpikePacket::FLAG_SPARSE_MASK,
+    };
+
+    run_inference(
+        &mut transformer,
+        &config,
+        gate_spike,
+        Some(spike_active),
+        "spike_active",
+    );
+
+    // Scenario 8: With spike packet (inactive - skip)
+    println!("\n--- Scenario 8: Inactive Spike Packet (Skip) ---");
+    let spike_inactive = SpikePacket {
+        fired: 0, // Not fired
+        rate_q15: 500,
+        novelty_q15: 1000,
+        ..Default::default()
+    };
+
+    run_inference(
+        &mut transformer,
+        &config,
+        gate_spike,
+        Some(spike_inactive),
+        "spike_inactive",
+    );
+
+    // Scenario 9: Spike storm
+    println!("\n--- Scenario 9: Spike Storm (Freeze) ---");
+    let spike_storm = SpikePacket {
+        fired: 1,
+        rate_q15: 30000, // Very high rate relative to the other scenarios (10000, 500)
+        novelty_q15: 5000,
+        ..Default::default()
+    };
+
+    run_inference(
+        &mut transformer,
+        &config,
+        gate_spike,
+        Some(spike_storm),
+        "spike_storm",
+    );
+
+    println!("\n=== Example Complete ===");
+}
+
+fn run_inference( // Runs one scenario end to end and prints the resulting witness and stats.
+    transformer: &mut MincutGatedTransformer, // reset, then used for a single infer call
+    config: &TransformerConfig, // only `logits` is read, to size the output buffer
+    gate: GatePacket, // handed to InferInput::from_tokens
+    spike: Option<SpikePacket>, // attached via with_spikes when Some
+    scenario: &str, // label echoed in the printed report
+) {
+    // Reset transformer state (done on every call, before the input is built)
+    transformer.reset();
+
+    // Create input tokens: the fixed ids 0..=15, identical for every scenario
+    let tokens: Vec<u32> = (0..16).collect();
+    let mut input = InferInput::from_tokens(&tokens, gate);
+
+    if let Some(sp) = spike {
+        input = input.with_spikes(sp);
+    }
+
+    // Allocate output buffer: one i32 per logit, mutably borrowed by InferOutput
+    let mut logits = vec![0i32; config.logits as usize];
+    let mut output = InferOutput::new(&mut logits);
+
+    // Run inference; the Result is handled by the match below rather than unwrapped
+    let result = transformer.infer(&input, &mut output);
+
+    match result {
+        Ok(()) => {
+            let witness = &output.witness;
+            let stats = &output.stats;
+
+            println!("  Scenario: {}", scenario);
+            println!("  Decision: {:?}", witness.decision);
+            println!("  Reason: {:?}", witness.reason);
+            println!(
+                "  Lambda: {} -> {} (delta: {})",
+                witness.lambda_prev, witness.lambda, witness.lambda_delta
+            );
+            println!(
+                "  Effective seq_len: {}, window: {}",
+                witness.effective_seq_len, witness.effective_window
+            );
+            println!(
+                "  KV writes: {}, External writes: {}",
+                if witness.kv_writes_enabled == 1 { // integer flag; only 1 is reported as enabled
+                    "enabled"
+                } else {
+                    "disabled"
+                },
+                if witness.external_writes_enabled == 1 {
+                    "enabled"
+                } else {
+                    "disabled"
+                }
+            );
+            println!(
+                "  Stats: tier={}, layers={}, skipped={}",
+                stats.tier, stats.layers_executed, stats.skipped
+            );
+
+            // Demonstrate orchestrator decision logic (no wildcard arm: each variant is explicit)
+            print!("  Orchestrator action: ");
+            match witness.decision {
+                GateDecision::Allow => {
+                    println!("Proceed with tool execution and memory persistence");
+                }
+                GateDecision::ReduceScope => {
+                    println!("Proceed with reduced confidence, skip risky tools");
+                }
+                GateDecision::FlushKv => {
+                    println!("Clear context, rebuild state from fresh inputs");
+                }
+                GateDecision::FreezeWrites => {
+                    println!("Read-only mode, defer all state changes");
+                }
+                GateDecision::QuarantineUpdates => {
+                    println!("Discard results, request human review");
+                }
+            }
+        }
+        Err(e) => { // errors are printed, not propagated or panicked on
+            println!("  Error: {:?}", e);
+        }
+    }
+}