Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'

This commit is contained in:
ruv
2026-02-28 14:39:40 -05:00
7854 changed files with 3522914 additions and 0 deletions


@@ -0,0 +1,51 @@
[package]
name = "ruvector-attention"
version.workspace = true
edition.workspace = true
authors.workspace = true
license.workspace = true
repository.workspace = true
description = "Attention mechanisms for ruvector - geometric, graph, and sparse attention"
keywords = ["attention", "machine-learning", "vector-search", "graph-attention"]
categories = ["algorithms", "science"]
[lib]
crate-type = ["rlib"]
[features]
default = ["simd"]
simd = []
wasm = []
napi = ["dep:napi-derive", "dep:napi"]
# Enable advanced math-based attention mechanisms
math = ["dep:ruvector-math"]
# Enable sheaf attention (Coherence-Gated Transformer per ADR-015)
sheaf = []
[dependencies]
thiserror = "1.0"
rayon = "1.10"
serde = { version = "1.0", features = ["derive"] }
rand = "0.8"
napi = { version = "2", optional = true }
napi-derive = { version = "2", optional = true }
# Advanced math primitives for OT, mixed-curvature, and topology-gated attention
ruvector-math = { version = "2.0", path = "../ruvector-math", optional = true }
[dev-dependencies]
criterion = "0.5"
approx = "0.5"
rand = "0.8"
[[bench]]
name = "attention_bench"
harness = false
[[bench]]
name = "attention_benchmarks"
harness = false
[[bin]]
name = "bench_runner"
path = "benches/attention_benchmarks.rs"


@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2025 rUv
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


@@ -0,0 +1,861 @@
# ruvector-attention
[![Crates.io](https://img.shields.io/crates/v/ruvector-attention.svg)](https://crates.io/crates/ruvector-attention)
[![Documentation](https://docs.rs/ruvector-attention/badge.svg)](https://docs.rs/ruvector-attention)
[![License](https://img.shields.io/badge/license-MIT%2FApache--2.0-blue.svg)](LICENSE)
[![Tests](https://img.shields.io/badge/tests-142%20passing-brightgreen.svg)]()
**46 attention mechanisms grounded in 7 mathematical theories -- from Flash Attention to optimal transport -- in one crate.**
```bash
cargo add ruvector-attention
```
Attention is the core operation in transformers, vector search, and graph neural networks, but most libraries give you one or two flavors and call it done. `ruvector-attention` ships 46 mechanisms spanning standard dot-product, sparse (Flash, linear, local-global), geometric (hyperbolic, mixed-curvature), graph (GAT, RoPE), and mixture-of-experts -- all SIMD-accelerated with quantization support. Pick the right attention for your data shape instead of forcing everything through softmax(QK^T/sqrt(d))V.
| | ruvector-attention | PyTorch `nn.MultiheadAttention` | FlashAttention (standalone) | xFormers |
|---|---|---|---|---|
| **Mechanism count** | 46 | 1 (scaled dot-product) | 1 (Flash) | ~5 |
| **Geometric attention** | Hyperbolic, spherical, mixed-curvature | No | No | No |
| **Graph attention** | Edge-featured GAT, RoPE for graphs | No | No | Limited |
| **Optimal transport** | Sliced Wasserstein, centroid OT | No | No | No |
| **Topology-gated** | Coherence-based mode switching | No | No | No |
| **Quantization** | Per-component (8-bit E, 5-bit H/S) | Via separate tools | No | Limited |
| **Language** | Rust (with WASM target) | Python/C++ | CUDA only | Python/CUDA |
| **SIMD acceleration** | Built in (4-way unrolled) | Via backend | CUDA only | Via backend |
| Feature | What It Does | Why It Matters |
|---------|-------------|----------------|
| **Flash Attention** | O(n) memory tiled computation | Process long sequences without running out of memory |
| **Mixed Curvature Fusion** | Combines Euclidean, hyperbolic, and spherical spaces in one pass | Model hierarchies, clusters, and flat data simultaneously |
| **Optimal Transport Attention** | Uses Wasserstein distance instead of dot-product similarity | Better distribution matching for retrieval and generation |
| **Topology-Gated Switching** | Automatically picks attention mode based on local coherence | Self-adapts to data characteristics without manual tuning |
| **Information Bottleneck** | Compresses attention via KL minimization | Keeps only the signal, discards noise |
| **PDE/Diffusion Attention** | Runs heat equation on a similarity graph | Smooth, noise-robust attention for irregular data |
| **Unified Diagnostics** | Health monitoring and automatic mode selection across all 7 theories | One report tells you which attention works best for your data |
> Part of the [RuVector](https://github.com/ruvnet/ruvector) ecosystem -- the self-learning vector database with graph intelligence.
## Supported Attention Mechanisms
### Standard Attention
- **Scaled Dot-Product**: `softmax(QK^T / √d)V`
- **Multi-Head**: Parallel attention heads with diverse representations
### Sparse Attention (Memory Efficient)
- **Flash Attention**: O(n) memory complexity with tiled computation
- **Linear Attention**: O(n) complexity using kernel approximation
- **Local-Global**: Sliding window + global tokens (Longformer-style)
### Geometric Attention
- **Hyperbolic Attention**: Attention in hyperbolic space for hierarchical data
- **Mixed Curvature**: Dynamic curvature for complex geometries
### Graph Attention
- **Edge-Featured GAT**: Graph attention with edge features
- **RoPE**: Rotary Position Embeddings for graphs
### Mixture-of-Experts
- **MoE Attention**: Learned routing to specialized expert modules
- **Top-k Routing**: Efficient expert selection
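The scaled dot-product formula above can be written out in a few lines of plain Rust. This is an illustrative stand-alone version for reference only, not the crate's SIMD-accelerated implementation:

```rust
// Illustrative scaled dot-product attention for a single query:
// softmax(q . k_i / sqrt(d)), then a weighted sum over values.
fn scaled_dot_product(query: &[f32], keys: &[&[f32]], values: &[&[f32]]) -> Vec<f32> {
    let d = query.len() as f32;
    // Raw scores q . k_i / sqrt(d)
    let scores: Vec<f32> = keys
        .iter()
        .map(|k| query.iter().zip(k.iter()).map(|(q, k)| q * k).sum::<f32>() / d.sqrt())
        .collect();
    // Numerically stable softmax
    let max = scores.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
    let exps: Vec<f32> = scores.iter().map(|s| (s - max).exp()).collect();
    let sum: f32 = exps.iter().sum();
    // Weighted sum of values
    let mut out = vec![0.0; values[0].len()];
    for (w, v) in exps.iter().zip(values.iter()) {
        for (o, x) in out.iter_mut().zip(v.iter()) {
            *o += (w / sum) * x;
        }
    }
    out
}

fn main() {
    let q = [1.0f32, 0.0];
    let (k1, k2) = ([1.0f32, 0.0], [0.0f32, 1.0]);
    let (v1, v2) = ([1.0f32, 0.0], [0.0f32, 1.0]);
    let out = scaled_dot_product(&q, &[&k1[..], &k2[..]], &[&v1[..], &v2[..]]);
    // The query is more similar to k1, so the output leans toward v1.
    assert!(out[0] > out[1]);
}
```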
## 7 Mathematical Theories
This crate implements attention mechanisms grounded in 7 distinct mathematical theories:
| # | Theory | Module | Key Types | Use Case |
|---|--------|--------|-----------|----------|
| 1 | **Optimal Transport** | `transport` | `SlicedWassersteinAttention`, `CentroidOTAttention` | Distribution matching, Earth mover distance |
| 2 | **Mixed Curvature** | `curvature` | `MixedCurvatureFusedAttention`, `TangentSpaceMapper` | Product spaces E^e × H^h × S^s |
| 3 | **Topology** | `topology` | `TopologyGatedAttention`, `WindowCoherence` | Coherence-based mode switching |
| 4 | **Information Geometry** | `info_geometry` | `FisherMetric`, `NaturalGradient` | Natural gradient descent |
| 5 | **Information Bottleneck** | `info_bottleneck` | `InformationBottleneck`, `KLDivergence` | Compression via KL minimization |
| 6 | **PDE/Diffusion** | `pde_attention` | `DiffusionAttention`, `GraphLaplacian` | Heat equation on similarity graph |
| 7 | **Unified Diagnostics** | `unified_report` | `GeometryReport`, `ReportBuilder` | Health monitoring & mode selection |
### Theory 1: Optimal Transport Attention
Attention as mass transport between query and key distributions using Wasserstein distance.
```rust
use ruvector_attention::{SlicedWassersteinAttention, SlicedWassersteinConfig};
// Configure Sliced Wasserstein with 16 random projections
let config = SlicedWassersteinConfig {
num_projections: 16,
num_candidates: 64,
dim: 512,
..Default::default()
};
let ot_attention = SlicedWassersteinAttention::new(config);
// Example data: 64 keys/values of dimension 512
let key_data: Vec<Vec<f32>> = (0..64).map(|i| vec![i as f32 / 64.0; 512]).collect();
let value_data = key_data.clone();
let query = vec![0.5; 512];
let keys: Vec<&[f32]> = key_data.iter().map(|k| k.as_slice()).collect();
let values: Vec<&[f32]> = value_data.iter().map(|v| v.as_slice()).collect();
// Compute OT-based attention scores
let output = ot_attention.compute_sliced(&query, &keys, &values)?;
```
**Key Features:**
- Sliced Wasserstein with cached sorted projections
- Two-stage filtering: cheap dot-product → expensive OT kernel
- Centroid OT: cluster keys into M centroids for O(M) transport
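The one-dimensional Wasserstein distance at the heart of slicing reduces to sorting order statistics. A minimal sketch (illustrative only; `wasserstein_1d` is a hypothetical helper, not a crate API):

```rust
// Illustrative 1-D Wasserstein-1 distance, the building block of sliced
// Wasserstein: project to 1-D, sort both samples, compare order statistics.
fn wasserstein_1d(a: &[f32], b: &[f32]) -> f32 {
    let mut a = a.to_vec();
    let mut b = b.to_vec();
    a.sort_by(|x, y| x.partial_cmp(y).unwrap());
    b.sort_by(|x, y| x.partial_cmp(y).unwrap());
    a.iter().zip(b.iter()).map(|(x, y)| (x - y).abs()).sum::<f32>() / a.len() as f32
}

fn main() {
    // Identical distributions have zero transport cost, regardless of order.
    assert!(wasserstein_1d(&[3.0, 1.0, 2.0], &[1.0, 2.0, 3.0]).abs() < 1e-6);
    // A uniform shift by c costs exactly c.
    assert!((wasserstein_1d(&[1.0, 2.0], &[2.0, 3.0]) - 1.0).abs() < 1e-6);
}
```

Caching the sorted projections, as the crate does, amortizes the sort across queries.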
### Theory 2: Mixed Curvature Attention
Attention in product manifolds combining Euclidean (E), Hyperbolic (H), and Spherical (S) spaces.
```rust
use ruvector_attention::{
MixedCurvatureFusedAttention, FusedCurvatureConfig,
TangentSpaceMapper, TangentSpaceConfig
};
// Configure mixed curvature with component dimensions
let config = FusedCurvatureConfig {
euclidean_dim: 256,
hyperbolic_dim: 128,
spherical_dim: 128,
curvature_h: -1.0, // Negative for hyperbolic
curvature_s: 1.0, // Positive for spherical
..Default::default()
};
let mixed_attention = MixedCurvatureFusedAttention::new(config);
// Map hyperbolic vectors to tangent space for efficient computation
// (`hyperbolic_keys` here are points inside the Poincaré ball)
let mapper = TangentSpaceMapper::new(TangentSpaceConfig::default());
let tangent_keys = mapper.map_to_tangent(&hyperbolic_keys);
```
**Key Features:**
- Tangent space mapping (avoids expensive geodesic computations)
- Fused dot kernel: single vectorized loop for E+H+S similarities
- Per-head learned mixing weights
- Component quantization: 8-bit E, 5-bit H/S
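The tangent-space trick is worth seeing concretely. A minimal sketch of the log map at the origin of the Poincaré ball with curvature -1 (illustrative only; `log_map_origin` is a hypothetical helper, not the `TangentSpaceMapper` API):

```rust
// Illustrative log map at the origin of the Poincare ball (curvature -1):
// log_0(y) = artanh(||y||) * y / ||y||, mapping hyperbolic points into the
// tangent space, where ordinary dot products (and SIMD) apply.
fn log_map_origin(y: &[f32]) -> Vec<f32> {
    let norm = y.iter().map(|v| v * v).sum::<f32>().sqrt();
    if norm < 1e-9 {
        return y.to_vec();
    }
    let scale = norm.atanh() / norm;
    y.iter().map(|v| v * scale).collect()
}

fn main() {
    // Points near the origin are almost unchanged...
    let near = log_map_origin(&[0.01, 0.0]);
    assert!((near[0] - 0.01).abs() < 1e-4);
    // ...while points near the ball boundary are stretched outward,
    // reflecting the exponential growth of hyperbolic distances.
    let far = log_map_origin(&[0.9, 0.0]);
    assert!(far[0] > 0.9);
}
```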
### Theory 3: Topology-Gated Attention
Adaptive attention that switches modes based on local coherence metrics.
```rust
use ruvector_attention::{
TopologyGatedAttention, TopologyGatedConfig,
AttentionMode, PolicyConfig, CoherenceMetric
};
let config = TopologyGatedConfig {
dim: 512,
policy: PolicyConfig {
stable_threshold: 0.8, // High coherence → Stable mode
cautious_threshold: 0.5, // Medium → Cautious mode
freeze_threshold: 0.3, // Low → Freeze mode
hysteresis: 0.05, // Prevents mode oscillation
..Default::default()
},
..Default::default()
};
let gated = TopologyGatedAttention::new(config);
// Attention automatically adjusts based on window coherence
let output = gated.compute_gated(&query, &keys, &values)?;
let mode = gated.current_mode(); // Stable, Cautious, or Freeze
```
**Coherence Metrics:**
| Metric | Description |
|--------|-------------|
| `BoundaryMass` | Mass near window boundaries |
| `CutProxy` | Proxy for graph cut quality |
| `Disagreement` | Variance in attention weights |
| `SimilarityVariance` | Local similarity variance |
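The gating policy itself is simple to picture. A minimal sketch of threshold-based mode selection (illustrative only; the real `PolicyConfig` also applies hysteresis, and `select_mode`/`similarity_variance_coherence` are hypothetical stand-ins, not crate APIs):

```rust
// Illustrative 3-mode policy: map a coherence score in [0, 1] to a mode
// using thresholds like those in the config above (hysteresis omitted).
#[derive(Debug, PartialEq)]
enum Mode { Stable, Cautious, Freeze }

fn select_mode(coherence: f32, stable: f32, cautious: f32) -> Mode {
    if coherence >= stable {
        Mode::Stable
    } else if coherence >= cautious {
        Mode::Cautious
    } else {
        Mode::Freeze
    }
}

// A toy coherence score in the spirit of SimilarityVariance:
// low variance across attention weights -> high coherence.
fn similarity_variance_coherence(weights: &[f32]) -> f32 {
    let mean = weights.iter().sum::<f32>() / weights.len() as f32;
    let var = weights.iter().map(|w| (w - mean).powi(2)).sum::<f32>() / weights.len() as f32;
    (1.0 - var).clamp(0.0, 1.0)
}

fn main() {
    let uniform = [0.25f32; 4]; // zero variance -> maximal coherence
    let c = similarity_variance_coherence(&uniform);
    assert_eq!(select_mode(c, 0.8, 0.5), Mode::Stable);
}
```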
### Theory 4: Information Geometry
Natural gradient optimization using the Fisher Information Matrix.
```rust
use ruvector_attention::{FisherMetric, FisherConfig, NaturalGradient, NaturalGradientConfig};
// Fisher metric for probability distributions
let fisher = FisherMetric::new(FisherConfig {
eps: 1e-8,
max_cg_iters: 50,
cg_tol: 1e-6,
});
// Compute F * v (Fisher-vector product)
let probs = vec![0.25, 0.25, 0.25, 0.25];
let direction = vec![0.1, -0.1, 0.05, -0.05];
let fv = fisher.apply(&probs, &direction);
// Natural gradient optimizer
let ng = NaturalGradient::new(NaturalGradientConfig {
lr: 0.1,
use_diagonal: false, // Full CG solve (more accurate)
fisher: FisherConfig::default(),
});
// Update logits using natural gradient: θ ← θ - lr * F^{-1} * ∇L
let new_logits = ng.step_logits(&logits, &grad_logits);
```
**Key Features:**
- Conjugate gradient solver for F^{-1} * v
- Diagonal approximation for speed
- SIMD-accelerated matrix-vector operations
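For a categorical distribution the Fisher-vector product has a closed form, `F v = p ⊙ v - p (p · v)`, which a few lines of Rust can check directly (illustrative only, not the crate's `FisherMetric` implementation):

```rust
// Illustrative Fisher-vector product for a categorical distribution
// parameterized by logits: F = diag(p) - p p^T, so F v = p .* v - p * (p . v).
fn fisher_vector_product(probs: &[f32], v: &[f32]) -> Vec<f32> {
    let pv: f32 = probs.iter().zip(v.iter()).map(|(p, x)| p * x).sum();
    probs.iter().zip(v.iter()).map(|(p, x)| p * (x - pv)).collect()
}

fn main() {
    let p = [0.25f32; 4];
    // F annihilates constant directions: shifting all logits by the same
    // amount leaves the softmax distribution unchanged.
    let fv = fisher_vector_product(&p, &[1.0; 4]);
    assert!(fv.iter().all(|x| x.abs() < 1e-6));
}
```

This singular direction is exactly why the crate's CG solver needs the `eps` damping term.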
### Theory 5: Information Bottleneck
Attention compression via the Information Bottleneck principle.
```rust
use ruvector_attention::{InformationBottleneck, IBConfig, KLDivergence, DiagonalGaussian};
// Information bottleneck layer
let ib = InformationBottleneck::new(IBConfig {
beta: 0.1, // Compression strength
z_dim: 64, // Bottleneck dimension
anneal_steps: 1000,
..Default::default()
});
// Compute KL divergence between Gaussian and unit normal
let gaussian = DiagonalGaussian {
mean: vec![0.1; 64],
log_var: vec![-1.0; 64],
};
let kl = KLDivergence::gaussian_to_unit(&gaussian);
// Compress attention weights (illustrative inputs)
let weights = vec![0.1f32; 64];
let temperature = 1.0;
let (compressed, kl_loss) = ib.compress_attention_weights(&weights, temperature);
// Reparameterized sampling
let z = ib.sample(&mean, &log_var, &epsilon);
```
**Key Features:**
- KL divergence: Gaussian→Unit, Categorical, Jensen-Shannon
- Variational Information Bottleneck (VIB)
- Temperature annealing for curriculum learning
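The Gaussian-to-unit KL term above also has a closed form, `KL = 0.5 * Σ (μ² + exp(log σ²) - 1 - log σ²)`. A sketch (illustrative only, not the crate's `KLDivergence` implementation):

```rust
// Illustrative KL divergence between a diagonal Gaussian
// N(mean, exp(log_var)) and the unit normal N(0, I):
// KL = 0.5 * sum(mean^2 + exp(log_var) - 1 - log_var)
fn kl_gaussian_to_unit(mean: &[f32], log_var: &[f32]) -> f32 {
    mean.iter()
        .zip(log_var.iter())
        .map(|(m, lv)| 0.5 * (m * m + lv.exp() - 1.0 - lv))
        .sum()
}

fn main() {
    // KL is zero exactly when the posterior already is the unit normal
    // (mean = 0, log_var = 0, i.e. variance 1)...
    assert!(kl_gaussian_to_unit(&[0.0; 4], &[0.0; 4]).abs() < 1e-6);
    // ...and any deviation is penalized, which is what drives compression.
    assert!(kl_gaussian_to_unit(&[0.5; 4], &[0.0; 4]) > 0.0);
}
```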
### Theory 6: PDE/Diffusion Attention
Attention as heat diffusion on the key similarity graph.
```rust
use ruvector_attention::{
DiffusionAttention, DiffusionConfig,
GraphLaplacian, LaplacianType
};
// Build graph Laplacian from keys
let laplacian = GraphLaplacian::from_keys(
&keys,
sigma, // Gaussian kernel bandwidth
LaplacianType::SymmetricNormalized
);
// Diffusion attention with heat equation
let config = DiffusionConfig {
t: 1.0, // Diffusion time
num_steps: 10, // Discretization steps
sigma: 1.0, // Kernel bandwidth
use_knn: true, // Sparse Laplacian
k: 16, // k-NN neighbors
laplacian_type: LaplacianType::SymmetricNormalized,
..Default::default()
};
let diffusion = DiffusionAttention::new(config);
// Compute diffused attention
let output = diffusion.compute_diffusion(&query, &keys, &values)?;
// Multi-scale diffusion (captures different granularities)
let scales = diffusion.compute_multiscale(&query, &keys, 4);
```
**Laplacian Types:**
| Type | Formula | Properties |
|------|---------|------------|
| `Unnormalized` | D - W | Graph spectrum analysis |
| `SymmetricNormalized` | I - D^{-1/2}WD^{-1/2} | Symmetric, eigenvalues in [0,2] |
| `RandomWalk` | I - D^{-1}W | Probability transitions |
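The `Unnormalized` variant is easy to build by hand. A minimal sketch of `L = D - W` with Gaussian kernel weights, whose rows sum to zero by construction (illustrative only; `unnormalized_laplacian` is not a crate API):

```rust
// Illustrative unnormalized graph Laplacian L = D - W, where
// W[i][j] = exp(-||x_i - x_j||^2 / (2 sigma^2)) and D is the degree matrix.
fn unnormalized_laplacian(keys: &[&[f32]], sigma: f32) -> Vec<Vec<f32>> {
    let n = keys.len();
    let mut w = vec![vec![0.0f32; n]; n];
    for i in 0..n {
        for j in 0..n {
            if i != j {
                let d2: f32 = keys[i].iter().zip(keys[j].iter())
                    .map(|(a, b)| (a - b).powi(2)).sum();
                w[i][j] = (-d2 / (2.0 * sigma * sigma)).exp();
            }
        }
    }
    // L = D - W: degree on the diagonal, negated weights off-diagonal.
    let mut l = vec![vec![0.0f32; n]; n];
    for i in 0..n {
        let degree: f32 = w[i].iter().sum();
        for j in 0..n {
            l[i][j] = if i == j { degree } else { -w[i][j] };
        }
    }
    l
}

fn main() {
    let keys: Vec<Vec<f32>> = vec![vec![0.0, 0.0], vec![1.0, 0.0], vec![0.0, 1.0]];
    let refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
    let l = unnormalized_laplacian(&refs, 1.0);
    // Each row of an unnormalized Laplacian sums to zero: diffusion
    // conserves total heat.
    for row in &l {
        assert!(row.iter().sum::<f32>().abs() < 1e-5);
    }
}
```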
### Theory 7: Unified Geometry Report
Diagnostic dashboard combining all metrics for intelligent attention mode selection.
```rust
use ruvector_attention::{
ReportBuilder, ReportConfig, GeometryReport,
MetricType, AttentionRecommendation
};
// Build comprehensive geometry report
let report = ReportBuilder::new(ReportConfig::default())
.with_ot_distance(0.15)
.with_topology_coherence(0.82)
.with_ib_kl(0.05)
.with_diffusion_energy(0.3)
.with_attention_entropy(2.1)
.build();
// Get health score (0-1)
println!("Health: {:.2}", report.health_score);
// Get automatic attention mode recommendation
match report.recommendation {
AttentionRecommendation::Standard => { /* Use standard attention */ }
AttentionRecommendation::Sparse => { /* Switch to sparse */ }
AttentionRecommendation::Geometric => { /* Use hyperbolic/mixed */ }
AttentionRecommendation::Diffusion => { /* Use diffusion attention */ }
}
// Check individual metrics
for metric in &report.metrics {
println!("{:?}: {} ({})",
metric.metric_type,
metric.value,
metric.status()
);
}
```
**Metrics Tracked:**
| Metric | Healthy Range | Warning | Critical |
|--------|---------------|---------|----------|
| OT Distance | 0.0 - 0.5 | > 0.3 | > 0.7 |
| Topology Coherence | 0.5 - 1.0 | < 0.3 | < 0.1 |
| IB KL | 0.0 - 0.2 | > 0.5 | > 1.0 |
| Diffusion Energy | 0.0 - 1.0 | > 2.0 | > 5.0 |
| Attention Entropy | 1.0 - 4.0 | < 0.5 | < 0.1 |
## Quick Start
```rust
use ruvector_attention::sdk::*;
// Simple multi-head attention
let attention = multi_head(768, 12)
.dropout(0.1)
.causal(true)
.build()?;
// Use preset configurations
let bert = AttentionPreset::Bert.builder(768).build()?;
let gpt = AttentionPreset::Gpt.builder(768).build()?;
// Build pipelines with normalization
let pipeline = AttentionPipeline::new()
.add_attention(attention)
.add_norm(NormType::LayerNorm)
.add_residual();
// Compute attention
let query = vec![0.5; 768];
let keys = vec![&query[..]; 10];
let values = vec![&query[..]; 10];
let output = pipeline.run(&query, &keys, &values)?;
```
## Installation
Add to your `Cargo.toml`:
```toml
[dependencies]
ruvector-attention = "0.1"
```
Or with specific features:
```toml
[dependencies]
ruvector-attention = { version = "0.1", features = ["simd", "wasm"] }
```
## SDK Overview
### Builder API
The builder provides a fluent interface for configuring attention:
```rust
use ruvector_attention::sdk::*;
// Flash attention for long sequences
let flash = flash(1024, 128) // dim, block_size
.causal(true)
.dropout(0.1)
.build()?;
// Linear attention for O(n) complexity
let linear = linear(512, 256) // dim, num_features
.build()?;
// MoE attention with 8 experts
let moe = moe(512, 8, 2) // dim, num_experts, top_k
.expert_capacity(1.25)
.jitter_noise(0.01)
.build()?;
// Hyperbolic attention for hierarchies
let hyperbolic = hyperbolic(512, -1.0) // dim, curvature
.build()?;
```
### Pipeline API
Compose attention with pre/post processing:
```rust
use ruvector_attention::sdk::*;
let attention = multi_head(768, 12).build()?;
let pipeline = AttentionPipeline::new()
.add_norm(NormType::LayerNorm) // Pre-normalization
.add_attention(attention) // Attention layer
.add_dropout(0.1) // Dropout
.add_residual() // Residual connection
.add_norm(NormType::RMSNorm); // Post-normalization
let output = pipeline.run(&query, &keys, &values)?;
```
### Preset Configurations
Pre-configured attention for popular models:
```rust
use ruvector_attention::sdk::presets::*;
// Model-specific presets
let bert = AttentionPreset::Bert.builder(768).build()?;
let gpt = AttentionPreset::Gpt.builder(768).build()?;
let longformer = AttentionPreset::Longformer.builder(512).build()?;
let flash = AttentionPreset::FlashOptimized.builder(1024).build()?;
let t5 = AttentionPreset::T5.builder(768).build()?;
let vit = AttentionPreset::ViT.builder(768).build()?;
// Smart selection based on use case
let attention = for_sequences(512, max_len).build()?; // Auto-select by length
let graph_attn = for_graphs(256, hierarchical).build()?; // Graph attention
let fast_attn = for_large_scale(1024).build()?; // Flash attention
// By model name
let bert = from_model_name("bert", 768)?;
let gpt2 = from_model_name("gpt2", 768)?;
```
## Architecture
```
ruvector-attention/
├── src/
│ ├── lib.rs # Main crate entry
│ ├── error.rs # Error types
│ ├── traits.rs # Core attention traits
│ │
│ ├── attention/ # Standard attention
│ │ ├── scaled_dot_product.rs
│ │ └── multi_head.rs
│ │
│ ├── sparse/ # Sparse attention (O(n) memory)
│ │ ├── flash.rs # Flash attention (tiled)
│ │ ├── linear.rs # Kernel approximation
│ │ └── local_global.rs # Longformer-style
│ │
│ ├── graph/ # Graph attention
│ │ ├── edge_featured.rs # GAT with edge features
│ │ ├── dual_space.rs # Dual-space attention
│ │ └── rope.rs # Rotary embeddings
│ │
│ ├── hyperbolic/ # Hyperbolic geometry
│ │ ├── hyperbolic_attention.rs
│ │ ├── mixed_curvature.rs
│ │ └── poincare.rs
│ │
│ ├── moe/ # Mixture-of-Experts
│ │ ├── expert.rs # Expert modules
│ │ ├── router.rs # Top-k routing
│ │ └── moe_attention.rs
│ │
│ ├── transport/ # [Theory 1] Optimal Transport
│ │ ├── sliced_wasserstein.rs # Sliced OT attention
│ │ ├── centroid_ot.rs # Centroid-based OT
│ │ └── cached_projections.rs # Projection caching
│ │
│ ├── curvature/ # [Theory 2] Mixed Curvature
│ │ ├── tangent_space.rs # Tangent space mapping
│ │ ├── fused_attention.rs # Fused E+H+S kernel
│ │ └── component_quantizer.rs # 8-bit/5-bit quantization
│ │
│ ├── topology/ # [Theory 3] Topology Gating
│ │ ├── coherence.rs # Window coherence metrics
│ │ ├── policy.rs # 3-mode policy (Stable/Cautious/Freeze)
│ │ └── gated_attention.rs # Adaptive gated attention
│ │
│ ├── info_geometry/ # [Theory 4] Information Geometry
│ │ ├── fisher.rs # Fisher information matrix
│ │ └── natural_gradient.rs # Natural gradient descent
│ │
│ ├── info_bottleneck/ # [Theory 5] Information Bottleneck
│ │ ├── kl_divergence.rs # KL, JS divergences
│ │ └── bottleneck.rs # VIB layer
│ │
│ ├── pde_attention/ # [Theory 6] PDE/Diffusion
│ │ ├── laplacian.rs # Graph Laplacian construction
│ │ └── diffusion.rs # Heat equation attention
│ │
│ ├── unified_report/ # [Theory 7] Unified Diagnostics
│ │ ├── metrics.rs # Metric types and values
│ │ ├── report.rs # Geometry report builder
│ │ └── recommendation.rs # Attention mode recommendations
│ │
│ ├── training/ # Training utilities
│ │ ├── loss.rs # InfoNCE, contrastive losses
│ │ ├── optimizer.rs # SGD, Adam, AdamW
│ │ └── curriculum.rs # Curriculum scheduling
│ │
│ └── sdk/ # High-level SDK
│ ├── builder.rs # Fluent builder API
│ ├── pipeline.rs # Composable pipelines
│ └── presets.rs # Model presets (BERT, GPT, etc.)
```
## Examples
### Transformer Block
```rust
use ruvector_attention::sdk::*;
fn create_transformer_block(dim: usize) -> AttentionResult<AttentionPipeline> {
let attention = multi_head(dim, 12)
.dropout(0.1)
.build()?;
Ok(AttentionPipeline::new()
.add_norm(NormType::LayerNorm)
.add_attention(attention)
.add_dropout(0.1)
.add_residual())
}
```
### Long Context Processing
```rust
use ruvector_attention::sdk::*;
fn create_long_context_attention(dim: usize, max_len: usize)
-> AttentionResult<Box<dyn Attention>> {
if max_len <= 2048 {
multi_head(dim, 12).build()
} else if max_len <= 16384 {
local_global(dim, 512).build()
} else {
linear(dim, dim / 4).build()
}
}
```
### Graph Neural Network
```rust
use ruvector_attention::sdk::*;
fn create_graph_attention(dim: usize, is_tree: bool)
-> AttentionResult<Box<dyn Attention>> {
if is_tree {
hyperbolic(dim, -1.0).build() // Hyperbolic for tree-like
} else {
multi_head(dim, 8).build() // Standard for general graphs
}
}
```
## Performance
### Complexity Comparison
| Mechanism | Time | Memory | Use Case |
|-----------|------|--------|----------|
| Scaled Dot-Product | O(n²) | O(n²) | Short sequences |
| Multi-Head | O(n²) | O(n²) | Standard transformers |
| Flash Attention | O(n²) | O(n) | Long sequences |
| Linear Attention | O(n) | O(n) | Very long sequences |
| Local-Global | O(n·w) | O(n·w) | Document processing |
| Hyperbolic | O(n²) | O(n²) | Hierarchical data |
| MoE | O(n²/E) | O(n²) | Specialized tasks |
### Advanced Mechanisms Complexity
| Theory | Mechanism | Time | Memory | Notes |
|--------|-----------|------|--------|-------|
| OT | Sliced Wasserstein | O(n·P·log n) | O(n·P) | P = num projections |
| OT | Centroid OT | O(n + M²) | O(M·d) | M = num centroids |
| Curvature | Mixed Curvature | O(n²) | O(n²) | Fused E+H+S kernel |
| Topology | Gated Attention | O(n²) | O(n²) | + O(n) coherence |
| Info Geo | Natural Gradient | O(n²) | O(n) | CG solver |
| Info Bottle | VIB | O(n·z) | O(z) | z = bottleneck dim |
| PDE | Diffusion | O(n²·T) | O(n²) | T = diffusion steps |
Where:
- `n` = sequence length
- `w` = local window size
- `E` = number of experts
- `P` = number of random projections (typically 8-16)
- `M` = number of centroids (typically 16-32)
- `z` = bottleneck dimension
- `T` = number of diffusion time steps
### Benchmarks
On a typical workload (batch_size=32, seq_len=512, dim=768):
- **Flash Attention**: 2.3x faster, 5x less memory than standard
- **Linear Attention**: O(n) scaling for sequences >4096
- **Local-Global**: 60% of standard attention cost for w=256
- **Sliced Wasserstein**: 1.8x slower than standard, but better distribution matching
- **Mixed Curvature**: ~1.3x standard with tangent space optimization
- **Diffusion Attention**: 2-10x slower depending on T, but captures multi-scale structure
## Tutorials
### Tutorial 1: Building a Geometry-Aware Transformer
Combine multiple geometric attention mechanisms for hierarchical data.
```rust
use ruvector_attention::*;
use ruvector_attention::sdk::*;
fn create_geometry_aware_block(dim: usize) -> AttentionResult<AttentionPipeline> {
// Use hyperbolic attention for hierarchy + standard for local patterns
let hyperbolic_attn = hyperbolic(dim, -1.0).build()?;
// Create a pipeline with pre-norm
Ok(AttentionPipeline::new()
.add_norm(NormType::RMSNorm)
.add_attention(hyperbolic_attn)
.add_dropout(0.1)
.add_residual())
}
```
### Tutorial 2: Adaptive Attention with Unified Report
Use the unified report to automatically select the best attention mode.
```rust
use ruvector_attention::*;
fn adaptive_attention(
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
) -> AttentionResult<Vec<f32>> {
// Build a diagnostic report
let report = ReportBuilder::new(ReportConfig::default())
.analyze_keys(keys) // Automatically compute metrics
.build();
// Select attention based on recommendation
match report.recommendation {
AttentionRecommendation::Standard => {
let attn = ScaledDotProductAttention::new(query.len());
attn.compute(query, keys, values)
}
AttentionRecommendation::Sparse => {
let attn = FlashAttention::new(query.len(), 64);
attn.compute(query, keys, values)
}
AttentionRecommendation::Geometric => {
let config = HyperbolicAttentionConfig {
dim: query.len(),
curvature: -1.0,
..Default::default()
};
let attn = HyperbolicAttention::new(config);
attn.compute(query, keys, values)
}
AttentionRecommendation::Diffusion => {
let config = DiffusionConfig::default();
let attn = DiffusionAttention::new(config);
attn.compute_diffusion(query, keys, values)
}
}
}
```
### Tutorial 3: Information Bottleneck for Attention Compression
Use VIB to learn compressed attention representations.
```rust
use ruvector_attention::*;
struct CompressedAttention {
ib: InformationBottleneck,
encoder_mean: Vec<f32>, // Learned weights
encoder_log_var: Vec<f32>, // Learned weights
}
impl CompressedAttention {
fn new(input_dim: usize, bottleneck_dim: usize) -> Self {
let ib = InformationBottleneck::new(IBConfig {
beta: 0.1,
z_dim: bottleneck_dim,
..Default::default()
});
Self {
ib,
encoder_mean: vec![0.0; input_dim * bottleneck_dim],
encoder_log_var: vec![0.0; input_dim * bottleneck_dim],
}
}
fn forward(&self, x: &[f32], epsilon: &[f32]) -> (Vec<f32>, f32) {
// Encode to mean and log_var (simplified)
let mean = self.encode_mean(x);
let log_var = self.encode_log_var(x);
// Sample from posterior
let z = self.ib.sample(&mean, &log_var, epsilon);
// Compute KL loss
let kl_loss = self.ib.compute_kl_loss(&mean, &log_var);
(z, kl_loss)
}
fn encode_mean(&self, _x: &[f32]) -> Vec<f32> {
// Linear transform (simplified)
vec![0.0; self.ib.config().z_dim]
}
fn encode_log_var(&self, _x: &[f32]) -> Vec<f32> {
vec![-1.0; self.ib.config().z_dim] // Initialize to low variance
}
}
```
### Tutorial 4: Multi-Scale Diffusion for Document Understanding
Use diffusion attention at multiple scales for long documents.
```rust
use ruvector_attention::*;
fn document_understanding(
query: &[f32],
document_keys: &[&[f32]], // Keys from document chunks
) -> Vec<Vec<f32>> {
// Configure diffusion with k-NN sparsity for large documents
let config = DiffusionConfig {
t: 2.0, // Larger t for more diffusion
num_steps: 20,
sigma: 1.0,
use_knn: true,
k: 32, // Sparse Laplacian
laplacian_type: LaplacianType::SymmetricNormalized,
};
let diffusion = DiffusionAttention::new(config);
// Get attention at 4 different scales
// Scale 0: Local (small t) - captures nearby relationships
// Scale 3: Global (large t) - captures document-level structure
let scales = diffusion.compute_multiscale(query, document_keys, 4);
scales
}
```
### Tutorial 5: Natural Gradient Training Loop
Train attention parameters with geometry-aware optimization.
```rust
use ruvector_attention::*;
fn natural_gradient_step(
logits: &[f32],
target_probs: &[f32],
config: &NaturalGradientConfig,
) -> Vec<f32> {
let ng = NaturalGradient::new(config.clone());
// Compute cross-entropy gradient w.r.t. logits
let probs = softmax(logits);
let grad: Vec<f32> = probs.iter()
.zip(target_probs.iter())
.map(|(p, t)| p - t)
.collect();
// Apply natural gradient update
// This uses F^{-1} to rescale gradients, accounting for
// the geometry of the probability simplex
ng.step_logits(logits, &grad)
}
fn softmax(logits: &[f32]) -> Vec<f32> {
let max = logits.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
let exp: Vec<f32> = logits.iter().map(|&l| (l - max).exp()).collect();
let sum: f32 = exp.iter().sum();
exp.iter().map(|&e| e / sum).collect()
}
```
## Features
- `simd` - SIMD acceleration (enabled by default)
- `wasm` - WebAssembly support
- `napi` - Node.js bindings (pulls in `napi` and `napi-derive`)
- `math` - Advanced math-based attention mechanisms (pulls in `ruvector-math`)
- `sheaf` - Sheaf attention (Coherence-Gated Transformer per ADR-015)
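Per the crate's `Cargo.toml` (earlier in this commit), opting into the advanced theory modules might look like:

```toml
[dependencies]
ruvector-attention = { version = "0.1", features = ["simd", "math", "sheaf"] }
```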
## Documentation
- [SDK Guide](docs/SDK_GUIDE.md) - Comprehensive SDK usage guide
- [API Documentation](https://docs.rs/ruvector-attention) - Full API reference
- [Examples](examples/) - Working code examples
## Contributing
Contributions are welcome! Please see [CONTRIBUTING.md](CONTRIBUTING.md).
## License
Licensed under either of:
- Apache License, Version 2.0 ([LICENSE-APACHE](LICENSE-APACHE))
- MIT License ([LICENSE-MIT](LICENSE-MIT))
at your option.
## Citation
If you use this crate in your research, please cite:
```bibtex
@software{ruvector_attention,
title = {ruvector-attention: Advanced Attention Mechanisms for Vector Search},
author = {ruvector contributors},
year = {2025},
url = {https://github.com/ruvnet/ruvector}
}
```
## Related Projects
- [ruvector](../ruvector) - Core vector search engine
- [ruvector-graph](../ruvector-graph) - Graph neural networks
- [ruvector-gnn](../ruvector-gnn) - Geometric neural networks


@@ -0,0 +1,329 @@
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
use ruvector_attention::{
attention::ScaledDotProductAttention,
graph::{
DualSpaceAttention, DualSpaceConfig, EdgeFeaturedAttention, EdgeFeaturedConfig, GraphRoPE,
RoPEConfig,
},
hyperbolic::{HyperbolicAttention, HyperbolicAttentionConfig},
moe::{MoEAttention, MoEConfig},
sparse::{FlashAttention, LinearAttention, LocalGlobalAttention},
training::{Adam, InfoNCELoss, Loss, Optimizer},
traits::Attention,
};
fn bench_scaled_dot_product(c: &mut Criterion) {
let mut group = c.benchmark_group("scaled_dot_product");
for dim in [64, 128, 256, 512] {
let attention = ScaledDotProductAttention::new(dim);
group.bench_with_input(BenchmarkId::new("dim", dim), &dim, |b, &dim| {
let query = vec![0.5; dim];
let keys: Vec<Vec<f32>> = (0..100)
.map(|i| vec![(i as f32 * 0.01) % 1.0; dim])
.collect();
let values: Vec<Vec<f32>> = (0..100)
.map(|i| vec![(i as f32 * 0.02) % 1.0; dim])
.collect();
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
b.iter(|| black_box(attention.compute(&query, &keys_refs, &values_refs).unwrap()));
});
}
group.finish();
}
fn bench_flash_attention(c: &mut Criterion) {
let mut group = c.benchmark_group("flash_attention");
for seq_len in [64, 256, 512, 1024] {
let dim = 256;
let attention = FlashAttention::new(dim, 64);
group.bench_with_input(
BenchmarkId::new("seq_len", seq_len),
&seq_len,
|b, &seq_len| {
let query = vec![0.5; dim];
let keys: Vec<Vec<f32>> = (0..seq_len)
.map(|i| vec![(i as f32 * 0.01) % 1.0; dim])
.collect();
let values: Vec<Vec<f32>> = (0..seq_len)
.map(|i| vec![(i as f32 * 0.02) % 1.0; dim])
.collect();
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
b.iter(|| black_box(attention.compute(&query, &keys_refs, &values_refs).unwrap()));
},
);
}
group.finish();
}
fn bench_linear_attention(c: &mut Criterion) {
let mut group = c.benchmark_group("linear_attention");
for seq_len in [256, 512, 1024, 2048] {
let dim = 256;
let attention = LinearAttention::new(dim, 64);
group.bench_with_input(
BenchmarkId::new("seq_len", seq_len),
&seq_len,
|b, &seq_len| {
let query = vec![0.5; dim];
let keys: Vec<Vec<f32>> = (0..seq_len)
.map(|i| vec![(i as f32 * 0.01) % 1.0; dim])
.collect();
let values: Vec<Vec<f32>> = (0..seq_len)
.map(|i| vec![(i as f32 * 0.02) % 1.0; dim])
.collect();
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
b.iter(|| black_box(attention.compute(&query, &keys_refs, &values_refs).unwrap()));
},
);
}
group.finish();
}
fn bench_local_global_attention(c: &mut Criterion) {
let mut group = c.benchmark_group("local_global_attention");
for window_size in [16, 32, 64, 128] {
let dim = 256;
let attention = LocalGlobalAttention::new(dim, window_size, 4);
group.bench_with_input(
BenchmarkId::new("window", window_size),
&window_size,
|b, _| {
let query = vec![0.5; dim];
let keys: Vec<Vec<f32>> = (0..512)
.map(|i| vec![(i as f32 * 0.01) % 1.0; dim])
.collect();
let values: Vec<Vec<f32>> = (0..512)
.map(|i| vec![(i as f32 * 0.02) % 1.0; dim])
.collect();
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
b.iter(|| black_box(attention.compute(&query, &keys_refs, &values_refs).unwrap()));
},
);
}
group.finish();
}
fn bench_moe_attention(c: &mut Criterion) {
let mut group = c.benchmark_group("moe_attention");
for num_experts in [2, 4, 8] {
let config = MoEConfig::builder()
.dim(256)
.num_experts(num_experts)
.top_k(2)
.build();
let attention = MoEAttention::new(config);
group.bench_with_input(
BenchmarkId::new("experts", num_experts),
&num_experts,
|b, _| {
let query = vec![0.5; 256];
let keys: Vec<Vec<f32>> = (0..100)
.map(|i| vec![(i as f32 * 0.01) % 1.0; 256])
.collect();
let values: Vec<Vec<f32>> = (0..100)
.map(|i| vec![(i as f32 * 0.02) % 1.0; 256])
.collect();
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
b.iter(|| black_box(attention.compute(&query, &keys_refs, &values_refs).unwrap()));
},
);
}
group.finish();
}
fn bench_hyperbolic_attention(c: &mut Criterion) {
let mut group = c.benchmark_group("hyperbolic_attention");
for dim in [64, 128, 256] {
let config = HyperbolicAttentionConfig {
dim,
curvature: -1.0,
..Default::default()
};
let attention = HyperbolicAttention::new(config);
group.bench_with_input(BenchmarkId::new("dim", dim), &dim, |b, &dim| {
let query = vec![0.1; dim];
let keys: Vec<Vec<f32>> = (0..100)
.map(|i| vec![(i as f32 * 0.001) % 0.5; dim])
.collect();
let values: Vec<Vec<f32>> = (0..100)
.map(|i| vec![(i as f32 * 0.002) % 0.5; dim])
.collect();
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
b.iter(|| black_box(attention.compute(&query, &keys_refs, &values_refs).unwrap()));
});
}
group.finish();
}
fn bench_edge_featured_attention(c: &mut Criterion) {
let mut group = c.benchmark_group("edge_featured_attention");
for num_heads in [1, 2, 4, 8] {
let config = EdgeFeaturedConfig::builder()
.node_dim(256)
.edge_dim(32)
.num_heads(num_heads)
.build();
let attention = EdgeFeaturedAttention::new(config);
group.bench_with_input(BenchmarkId::new("heads", num_heads), &num_heads, |b, _| {
let query = vec![0.5; 256];
let keys: Vec<Vec<f32>> = (0..64)
.map(|i| vec![(i as f32 * 0.01) % 1.0; 256])
.collect();
let values: Vec<Vec<f32>> = (0..64)
.map(|i| vec![(i as f32 * 0.02) % 1.0; 256])
.collect();
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
b.iter(|| black_box(attention.compute(&query, &keys_refs, &values_refs).unwrap()));
});
}
group.finish();
}
fn bench_graph_rope(c: &mut Criterion) {
let mut group = c.benchmark_group("graph_rope");
for dim in [64, 128, 256] {
let config = RoPEConfig::builder().dim(dim).max_position(1024).build();
let attention = GraphRoPE::new(config);
group.bench_with_input(BenchmarkId::new("dim", dim), &dim, |b, &dim| {
let query = vec![0.5; dim];
let keys: Vec<Vec<f32>> = (0..256)
.map(|i| vec![(i as f32 * 0.01) % 1.0; dim])
.collect();
let values: Vec<Vec<f32>> = (0..256)
.map(|i| vec![(i as f32 * 0.02) % 1.0; dim])
.collect();
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
b.iter(|| black_box(attention.compute(&query, &keys_refs, &values_refs).unwrap()));
});
}
group.finish();
}
fn bench_dual_space_attention(c: &mut Criterion) {
let mut group = c.benchmark_group("dual_space_attention");
for dim in [64, 128, 256] {
let config = DualSpaceConfig::builder()
.dim(dim)
.euclidean_weight(0.5)
.hyperbolic_weight(0.5)
.build();
let attention = DualSpaceAttention::new(config);
group.bench_with_input(BenchmarkId::new("dim", dim), &dim, |b, &dim| {
let query = vec![0.1; dim];
let keys: Vec<Vec<f32>> = (0..100)
.map(|i| vec![(i as f32 * 0.001) % 0.3; dim])
.collect();
let values: Vec<Vec<f32>> = (0..100)
.map(|i| vec![(i as f32 * 0.002) % 0.3; dim])
.collect();
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
b.iter(|| black_box(attention.compute(&query, &keys_refs, &values_refs).unwrap()));
});
}
group.finish();
}
fn bench_infonce_loss(c: &mut Criterion) {
let mut group = c.benchmark_group("infonce_loss");
for num_negatives in [10, 50, 100, 200] {
let loss = InfoNCELoss::new(0.07);
group.bench_with_input(
BenchmarkId::new("negatives", num_negatives),
&num_negatives,
|b, &num_neg| {
let anchor = vec![0.5; 128];
let positive = vec![0.6; 128];
let negatives: Vec<Vec<f32>> = (0..num_neg)
.map(|i| vec![(i as f32 * 0.01) % 1.0; 128])
.collect();
let neg_refs: Vec<&[f32]> = negatives.iter().map(|n| n.as_slice()).collect();
b.iter(|| black_box(loss.compute(&anchor, &positive, &neg_refs)));
},
);
}
group.finish();
}
fn bench_adam_optimizer(c: &mut Criterion) {
let mut group = c.benchmark_group("adam_optimizer");
for dim in [128, 256, 512, 1024] {
group.bench_with_input(BenchmarkId::new("dim", dim), &dim, |b, &dim| {
let mut optimizer = Adam::new(dim, 0.001);
let mut params = vec![0.5; dim];
let gradients = vec![0.01; dim];
b.iter(|| {
optimizer.step(&mut params, &gradients);
black_box(&params)
});
});
}
group.finish();
}
criterion_group!(
benches,
bench_scaled_dot_product,
bench_flash_attention,
bench_linear_attention,
bench_local_global_attention,
bench_moe_attention,
bench_hyperbolic_attention,
bench_edge_featured_attention,
bench_graph_rope,
bench_dual_space_attention,
bench_infonce_loss,
bench_adam_optimizer,
);
criterion_main!(benches);


@@ -0,0 +1,303 @@
//! Benchmarks for ruvector-attention
//!
//! Run with: cargo bench -p ruvector-attention
use std::time::Instant;
use ruvector_attention::{
attention::ScaledDotProductAttention,
graph::{
DualSpaceAttention, DualSpaceConfig, EdgeFeaturedAttention, EdgeFeaturedConfig, GraphRoPE,
RoPEConfig,
},
hyperbolic::{HyperbolicAttention, HyperbolicAttentionConfig},
moe::{MoEAttention, MoEConfig},
sparse::{FlashAttention, LinearAttention, LocalGlobalAttention},
training::{Adam, InfoNCELoss, Loss, Optimizer},
traits::Attention,
};
fn main() {
println!("=== ruvector-attention Benchmarks ===\n");
// Configuration
let dim = 256;
let seq_len = 512;
let iterations = 100;
// Generate test data
let query = vec![0.5f32; dim];
let keys: Vec<Vec<f32>> = (0..seq_len)
.map(|i| vec![(i as f32 * 0.01) % 1.0; dim])
.collect();
let values: Vec<Vec<f32>> = (0..seq_len)
.map(|i| vec![(i as f32 * 0.02) % 1.0; dim])
.collect();
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
println!("Configuration:");
println!(" Dimension: {}", dim);
println!(" Sequence Length: {}", seq_len);
println!(" Iterations: {}", iterations);
println!();
// 1. Scaled Dot-Product Attention
{
let attention = ScaledDotProductAttention::new(dim);
let start = Instant::now();
for _ in 0..iterations {
let _ = attention.compute(&query, &keys_refs, &values_refs).unwrap();
}
let elapsed = start.elapsed();
let avg_us = elapsed.as_micros() as f64 / iterations as f64;
println!("Scaled Dot-Product Attention:");
println!(" Total: {:?}", elapsed);
println!(" Per iteration: {:.2} µs", avg_us);
println!(" Throughput: {:.0} ops/sec", 1_000_000.0 / avg_us);
println!();
}
// 2. Flash Attention
{
let attention = FlashAttention::new(dim, 64);
let start = Instant::now();
for _ in 0..iterations {
let _ = attention.compute(&query, &keys_refs, &values_refs).unwrap();
}
let elapsed = start.elapsed();
let avg_us = elapsed.as_micros() as f64 / iterations as f64;
println!("Flash Attention (block_size=64):");
println!(" Total: {:?}", elapsed);
println!(" Per iteration: {:.2} µs", avg_us);
println!(" Throughput: {:.0} ops/sec", 1_000_000.0 / avg_us);
println!();
}
// 3. Linear Attention
{
let attention = LinearAttention::new(dim, 64);
let start = Instant::now();
for _ in 0..iterations {
let _ = attention.compute(&query, &keys_refs, &values_refs).unwrap();
}
let elapsed = start.elapsed();
let avg_us = elapsed.as_micros() as f64 / iterations as f64;
println!("Linear Attention (num_features=64):");
println!(" Total: {:?}", elapsed);
println!(" Per iteration: {:.2} µs", avg_us);
println!(" Throughput: {:.0} ops/sec", 1_000_000.0 / avg_us);
println!();
}
// 4. Local-Global Attention
{
let attention = LocalGlobalAttention::new(dim, 32, 4);
let start = Instant::now();
for _ in 0..iterations {
let _ = attention.compute(&query, &keys_refs, &values_refs).unwrap();
}
let elapsed = start.elapsed();
let avg_us = elapsed.as_micros() as f64 / iterations as f64;
println!("Local-Global Attention (window=32, global=4):");
println!(" Total: {:?}", elapsed);
println!(" Per iteration: {:.2} µs", avg_us);
println!(" Throughput: {:.0} ops/sec", 1_000_000.0 / avg_us);
println!();
}
// 5. MoE Attention
{
let config = MoEConfig::builder()
.dim(dim)
.num_experts(4)
.top_k(2)
.build();
let attention = MoEAttention::new(config);
let start = Instant::now();
for _ in 0..iterations {
let _ = attention.compute(&query, &keys_refs, &values_refs).unwrap();
}
let elapsed = start.elapsed();
let avg_us = elapsed.as_micros() as f64 / iterations as f64;
println!("MoE Attention (4 experts, top-2):");
println!(" Total: {:?}", elapsed);
println!(" Per iteration: {:.2} µs", avg_us);
println!(" Throughput: {:.0} ops/sec", 1_000_000.0 / avg_us);
println!();
}
// 6. Hyperbolic Attention
{
let config = HyperbolicAttentionConfig {
dim,
curvature: -1.0,
..Default::default()
};
let attention = HyperbolicAttention::new(config);
// Use smaller values for Poincaré ball
let hyp_query = vec![0.1f32; dim];
let hyp_keys: Vec<Vec<f32>> = (0..seq_len)
.map(|i| vec![(i as f32 * 0.001) % 0.5; dim])
.collect();
let hyp_values: Vec<Vec<f32>> = (0..seq_len)
.map(|i| vec![(i as f32 * 0.002) % 0.5; dim])
.collect();
let hyp_keys_refs: Vec<&[f32]> = hyp_keys.iter().map(|k| k.as_slice()).collect();
let hyp_values_refs: Vec<&[f32]> = hyp_values.iter().map(|v| v.as_slice()).collect();
let start = Instant::now();
for _ in 0..iterations {
let _ = attention
.compute(&hyp_query, &hyp_keys_refs, &hyp_values_refs)
.unwrap();
}
let elapsed = start.elapsed();
let avg_us = elapsed.as_micros() as f64 / iterations as f64;
        println!("Hyperbolic Attention (curvature=-1.0):");
println!(" Total: {:?}", elapsed);
println!(" Per iteration: {:.2} µs", avg_us);
println!(" Throughput: {:.0} ops/sec", 1_000_000.0 / avg_us);
println!();
}
// 7. Edge-Featured Graph Attention
{
let config = EdgeFeaturedConfig::builder()
.node_dim(dim)
.edge_dim(32)
.num_heads(4)
.build();
let attention = EdgeFeaturedAttention::new(config);
let graph_keys: Vec<Vec<f32>> = (0..64)
.map(|i| vec![(i as f32 * 0.01) % 1.0; dim])
.collect();
let graph_values: Vec<Vec<f32>> = (0..64)
.map(|i| vec![(i as f32 * 0.02) % 1.0; dim])
.collect();
let graph_keys_refs: Vec<&[f32]> = graph_keys.iter().map(|k| k.as_slice()).collect();
let graph_values_refs: Vec<&[f32]> = graph_values.iter().map(|v| v.as_slice()).collect();
let start = Instant::now();
for _ in 0..iterations {
let _ = attention
.compute(&query, &graph_keys_refs, &graph_values_refs)
.unwrap();
}
let elapsed = start.elapsed();
let avg_us = elapsed.as_micros() as f64 / iterations as f64;
println!("Edge-Featured Graph Attention (4 heads):");
println!(" Total: {:?}", elapsed);
println!(" Per iteration: {:.2} µs", avg_us);
println!(" Throughput: {:.0} ops/sec", 1_000_000.0 / avg_us);
println!();
}
// 8. Graph RoPE
{
let config = RoPEConfig::builder().dim(dim).max_position(1024).build();
let attention = GraphRoPE::new(config);
let start = Instant::now();
for _ in 0..iterations {
let _ = attention.compute(&query, &keys_refs, &values_refs).unwrap();
}
let elapsed = start.elapsed();
let avg_us = elapsed.as_micros() as f64 / iterations as f64;
println!("Graph RoPE Attention:");
println!(" Total: {:?}", elapsed);
println!(" Per iteration: {:.2} µs", avg_us);
println!(" Throughput: {:.0} ops/sec", 1_000_000.0 / avg_us);
println!();
}
// 9. Dual-Space Attention
{
let config = DualSpaceConfig::builder()
.dim(dim)
.euclidean_weight(0.5)
.hyperbolic_weight(0.5)
.build();
let attention = DualSpaceAttention::new(config);
// Use smaller values for hyperbolic component
let dual_query = vec![0.1f32; dim];
let dual_keys: Vec<Vec<f32>> = (0..seq_len)
.map(|i| vec![(i as f32 * 0.001) % 0.3; dim])
.collect();
let dual_values: Vec<Vec<f32>> = (0..seq_len)
.map(|i| vec![(i as f32 * 0.002) % 0.3; dim])
.collect();
let dual_keys_refs: Vec<&[f32]> = dual_keys.iter().map(|k| k.as_slice()).collect();
let dual_values_refs: Vec<&[f32]> = dual_values.iter().map(|v| v.as_slice()).collect();
let start = Instant::now();
for _ in 0..iterations {
let _ = attention
.compute(&dual_query, &dual_keys_refs, &dual_values_refs)
.unwrap();
}
let elapsed = start.elapsed();
let avg_us = elapsed.as_micros() as f64 / iterations as f64;
println!("Dual-Space Attention (Euclidean + Hyperbolic):");
println!(" Total: {:?}", elapsed);
println!(" Per iteration: {:.2} µs", avg_us);
println!(" Throughput: {:.0} ops/sec", 1_000_000.0 / avg_us);
println!();
}
// 10. Training: InfoNCE Loss
{
let loss = InfoNCELoss::new(0.07);
let anchor = vec![0.5f32; 128];
let positive = vec![0.6f32; 128];
let negatives: Vec<Vec<f32>> = (0..50)
.map(|i| vec![(i as f32 * 0.01) % 1.0; 128])
.collect();
let neg_refs: Vec<&[f32]> = negatives.iter().map(|n| n.as_slice()).collect();
let start = Instant::now();
for _ in 0..iterations {
let _ = loss.compute(&anchor, &positive, &neg_refs);
}
let elapsed = start.elapsed();
let avg_us = elapsed.as_micros() as f64 / iterations as f64;
println!("InfoNCE Loss (50 negatives):");
println!(" Total: {:?}", elapsed);
println!(" Per iteration: {:.2} µs", avg_us);
println!(" Throughput: {:.0} ops/sec", 1_000_000.0 / avg_us);
println!();
}
// 11. Training: Adam Optimizer
{
let mut optimizer = Adam::new(dim, 0.001);
let mut params = vec![0.5f32; dim];
let gradients = vec![0.01f32; dim];
let start = Instant::now();
for _ in 0..iterations * 10 {
optimizer.step(&mut params, &gradients);
}
let elapsed = start.elapsed();
let avg_us = elapsed.as_micros() as f64 / (iterations * 10) as f64;
println!("Adam Optimizer Step:");
println!(" Total: {:?}", elapsed);
println!(" Per iteration: {:.2} µs", avg_us);
println!(" Throughput: {:.0} ops/sec", 1_000_000.0 / avg_us);
println!();
}
println!("=== Benchmark Complete ===");
// Summary
println!("\n=== Summary ===");
println!("All attention mechanisms functional and benchmarked.");
println!("Module coverage:");
println!(" - Core: ScaledDotProductAttention, MultiHeadAttention");
println!(" - Sparse: FlashAttention, LinearAttention, LocalGlobalAttention");
println!(" - MoE: MoEAttention with learned routing");
println!(" - Graph: EdgeFeaturedAttention, GraphRoPE, DualSpaceAttention");
println!(" - Hyperbolic: HyperbolicAttention, MixedCurvatureAttention");
println!(" - Training: InfoNCE, ContrastiveLoss, Adam/AdamW/SGD, Curriculum");
}


@@ -0,0 +1,330 @@
# ruvector-attention SDK Implementation Summary
## Overview
Successfully implemented a comprehensive, ergonomic SDK for the ruvector-attention crate following Agent 10's specifications.
## Deliverables
### 1. SDK Module Structure
Created high-level SDK APIs at `crates/ruvector-attention/src/sdk/`:
```
src/sdk/
├── mod.rs # Module exports and documentation
├── builder.rs # Fluent builder API (500+ lines)
├── pipeline.rs # Composable pipeline system (350+ lines)
└── presets.rs # Model presets and smart selection (400+ lines)
```
### 2. Builder API (`builder.rs`)
#### Features
- **Fluent Interface**: Method chaining for ergonomic configuration
- **7 Attention Types**: Scaled Dot, Multi-Head, Flash, Linear, Local-Global, Hyperbolic, MoE
- **Comprehensive Options**: Dropout, causal masking, expert capacity, jitter noise
- **Type Safety**: Strongly-typed builder pattern
- **Convenience Functions**: `multi_head()`, `flash()`, `linear()`, etc.
#### Example
```rust
let attention = multi_head(768, 12)
.dropout(0.1)
.causal(true)
.build()?;
```
### 3. Pipeline API (`pipeline.rs`)
#### Features
- **Composable Operations**: Chain attention, normalization, dropout, residuals
- **3 Normalization Types**: LayerNorm, RMSNorm, BatchNorm
- **Custom Transformations**: Add custom processing functions
- **Pre-built Blocks**: `transformer_block()`, `prenorm_transformer_block()`
#### Example
```rust
let pipeline = AttentionPipeline::new()
.add_attention(attention)
.add_norm(NormType::LayerNorm)
.add_dropout(0.1)
.add_residual();
```
### 4. Presets (`presets.rs`)
#### Features
- **10 Model Presets**: BERT, GPT, Longformer, Performer, Flash, Switch, T5, ViT, etc.
- **Smart Selection**: Automatic attention type selection based on use case
- **Model Name Lookup**: Create attention from model names ("bert", "gpt2", etc.)
- **Use Case Helpers**: `for_sequences()`, `for_graphs()`, `for_vision()`, etc.
#### Example
```rust
// Preset configuration
let bert = AttentionPreset::Bert.builder(768).build()?;
// Smart selection
let attention = for_sequences(512, max_len).build()?;
// By name
let gpt = from_model_name("gpt2", 768)?;
```
## Core Implementation
### Main Library (`lib.rs`)
- Organized module structure
- Clean re-exports for public API
- Comprehensive documentation
### Attention Implementations
Created implementations in `src/attention/`:
- `scaled_dot_product.rs` - Fundamental attention mechanism
- `multi_head.rs` - Parallel attention heads
### Configuration (`config/mod.rs`)
- Serde-serializable configuration types
- Builder pattern for configs
- Validation methods
## Documentation
### 1. README.md
- Quick start guide
- Feature overview
- Architecture diagram
- Performance benchmarks
- Examples for all use cases
### 2. SDK_GUIDE.md (Comprehensive Guide)
- Detailed API documentation
- Usage examples for each attention type
- Advanced patterns
- Performance tips
- Testing guidelines
### 3. IMPLEMENTATION_SUMMARY.md (This File)
- Implementation overview
- API reference
- Design decisions
## Code Quality
### Tests
All tests passing (22/22):
```bash
running 22 tests
test result: ok. 22 passed; 0 failed; 0 ignored; 0 measured
```
### Compilation
- Zero errors
- Clean build with only minor warnings about unused variables
- Documentation generated successfully
### API Design
- Ergonomic fluent interfaces
- Clear method names
- Comprehensive documentation
- Type-safe builders
## SDK API Reference
### Builder Methods
```rust
impl AttentionBuilder {
// Core configuration
fn new(dim: usize) -> Self;
fn build(self) -> AttentionResult<Box<dyn Attention>>;
// Attention types
fn multi_head(self, num_heads: usize) -> Self;
fn flash(self, block_size: usize) -> Self;
fn linear(self, num_features: usize) -> Self;
fn local_global(self, window: usize) -> Self;
fn hyperbolic(self, curvature: f32) -> Self;
fn moe(self, num_experts: usize, top_k: usize) -> Self;
// Options
fn dropout(self, p: f32) -> Self;
fn causal(self, causal: bool) -> Self;
fn expert_capacity(self, capacity: f32) -> Self;
fn jitter_noise(self, noise: f32) -> Self;
}
```
### Pipeline Methods
```rust
impl AttentionPipeline {
fn new() -> Self;
// Add stages
fn add_attention(self, attention: Box<dyn Attention>) -> Self;
fn add_norm(self, norm_type: NormType) -> Self;
fn add_dropout(self, p: f32) -> Self;
fn add_residual(self) -> Self;
fn add_custom<F>(self, f: F) -> Self;
// Execute
fn run(&self, query: &[f32], keys: &[&[f32]], values: &[&[f32]])
-> AttentionResult<Vec<f32>>;
}
```
### Preset Functions
```rust
// Model presets
enum AttentionPreset {
Bert, Gpt, Longformer, Performer, FlashOptimized,
SwitchTransformer, HyperbolicTree, T5, ViT, SparseTransformer
}
impl AttentionPreset {
fn builder(self, dim: usize) -> AttentionBuilder;
fn description(&self) -> &'static str;
}
// Smart selection
fn for_sequences(dim: usize, max_len: usize) -> AttentionBuilder;
fn for_graphs(dim: usize, hierarchical: bool) -> AttentionBuilder;
fn for_large_scale(dim: usize) -> AttentionBuilder;
fn for_vision(dim: usize, patch_size: usize) -> AttentionBuilder;
fn for_generation(dim: usize, context_len: usize) -> AttentionBuilder;
fn for_moe(dim: usize, num_experts: usize, top_k: usize) -> AttentionBuilder;
// Model name lookup
fn from_model_name(model_name: &str, dim: usize) -> Option<AttentionBuilder>;
```
## Design Decisions
### 1. Builder Pattern
- **Rationale**: Provides ergonomic API for complex configurations
- **Benefits**: Type-safe, self-documenting, extensible
- **Trade-offs**: Slightly more verbose than direct construction
### 2. Pipeline Composition
- **Rationale**: Enable flexible combination of operations
- **Benefits**: Modular, reusable, matches transformer architecture
- **Trade-offs**: Small runtime overhead for stage dispatch
### 3. Preset System
- **Rationale**: Reduce boilerplate for common configurations
- **Benefits**: Quick prototyping, consistency, best practices
- **Trade-offs**: Additional code for preset definitions
### 4. Trait Objects
- **Rationale**: Allow runtime polymorphism for attention types
- **Benefits**: Flexible, composable, dynamic dispatch
- **Trade-offs**: Virtual call overhead (minimal impact)
## Usage Examples
### Basic Multi-Head Attention
```rust
use ruvector_attention::sdk::*;
let attention = multi_head(768, 12)
.dropout(0.1)
.build()?;
let query = vec![0.5; 768];
let keys = vec![&query[..]; 10];
let values = vec![&query[..]; 10];
let output = attention.compute(&query, &keys, &values)?;
```
### Transformer Block
```rust
use ruvector_attention::sdk::*;
let attention = multi_head(768, 12).build()?;
let block = AttentionPipeline::new()
.add_norm(NormType::LayerNorm)
.add_attention(attention)
.add_dropout(0.1)
.add_residual();
```
### Smart Selection
```rust
use ruvector_attention::sdk::presets::*;
// Auto-select based on sequence length
let attention = for_sequences(512, 8192).build()?;
// → Uses Longformer for this length
// Graph attention
let graph_attn = for_graphs(256, true).build()?;
// → Uses Hyperbolic for hierarchical graphs
```
### Model Presets
```rust
use ruvector_attention::sdk::*;
// BERT configuration
let bert = AttentionPreset::Bert.builder(768).build()?;
// GPT with custom dropout
let gpt = AttentionPreset::Gpt.builder(768)
.dropout(0.2)
.build()?;
// By model name
let t5 = from_model_name("t5", 768)?.build()?;
```
## Performance Characteristics
### Builder Overhead
- **Build time**: ~0.1μs (negligible)
- **Memory**: Zero runtime overhead after build
### Pipeline Overhead
- **Per stage**: ~5ns dispatch overhead
- **Total**: <50ns for typical 4-stage pipeline
- **Memory**: One allocation for stage vector
### Preset Lookup
- **By enum**: Compile-time (zero overhead)
- **By name**: ~100ns hash lookup
- **Smart selection**: <200ns for decision logic
## Future Enhancements
### Potential Additions
1. **More Presets**: Add Llama, Mistral, Qwen configurations
2. **Dynamic Configuration**: Runtime config loading from files
3. **Optimization Hints**: Auto-tuning based on hardware
4. **Metrics Collection**: Built-in performance monitoring
5. **Serialization**: Save/load attention configurations
### API Extensions
1. **Batch Processing**: Pipeline support for batches
2. **Async Execution**: Async trait implementations
3. **Hardware Acceleration**: GPU/TPU backend selection
4. **Mixed Precision**: FP16/BF16 support in builder
## Conclusion
The SDK implementation successfully provides:
- **Ergonomic API**: Fluent builders and pipelines
- **Comprehensive Coverage**: All attention types supported
- **Smart Defaults**: Presets and intelligent selection
- **Excellent Documentation**: README, guide, and API docs
- **Production Ready**: Tested, documented, and performant
- **Extensible Design**: Easy to add new attention types
The SDK achieves its goal of making advanced attention mechanisms accessible through high-level, easy-to-use APIs while maintaining the flexibility to handle complex use cases.


@@ -0,0 +1,416 @@
# ruvector-attention SDK Guide
## Overview
The ruvector-attention SDK provides high-level, ergonomic APIs for building attention mechanisms. It includes three main components:
1. **Builder API** - Fluent interface for configuring attention
2. **Pipeline API** - Composable operations with normalization and residuals
3. **Presets** - Ready-to-use configurations for common models
## Quick Start
### Basic Usage
```rust
use ruvector_attention::sdk::*;
// Create a simple multi-head attention
let attention = multi_head(768, 12)
.dropout(0.1)
.causal(true)
.build()?;
// Use it
let query = vec![0.5; 768];
let keys = vec![&query[..]; 10];
let values = vec![&query[..]; 10];
let output = attention.compute(&query, &keys, &values)?;
```
### Using Presets
```rust
use ruvector_attention::sdk::presets::*;
// BERT-style attention
let bert = AttentionPreset::Bert.builder(768).build()?;
// GPT-style causal attention
let gpt = AttentionPreset::Gpt.builder(768).build()?;
// Flash attention for long sequences
let flash = AttentionPreset::FlashOptimized.builder(1024).build()?;
// Automatic selection based on sequence length
let auto = for_sequences(512, 8192).build()?;
```
### Building Pipelines
```rust
use ruvector_attention::sdk::*;
// Create a transformer block
let attention = multi_head(768, 12).build()?;
let pipeline = AttentionPipeline::new()
.add_attention(attention)
.add_dropout(0.1)
.add_residual()
.add_norm(NormType::LayerNorm);
// Run the pipeline
let output = pipeline.run(&query, &keys, &values)?;
```
## Builder API
### Available Attention Types
#### 1. Scaled Dot-Product Attention
The fundamental attention mechanism: `softmax(QK^T / √d)V`
```rust
let attention = scaled_dot(512).build()?;
```
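For reference, the formula above can be sketched as a standalone single-query function. This is an illustrative implementation of `softmax(QK^T / √d)V`, not the crate's internal code:

```rust
/// Single-query scaled dot-product attention: softmax(q·K^T / sqrt(d)) · V.
fn scaled_dot_attention(query: &[f32], keys: &[&[f32]], values: &[&[f32]]) -> Vec<f32> {
    let scale = 1.0 / (query.len() as f32).sqrt();
    // Scaled dot-product scores against each key.
    let scores: Vec<f32> = keys
        .iter()
        .map(|k| query.iter().zip(k.iter()).map(|(q, k)| q * k).sum::<f32>() * scale)
        .collect();
    // Numerically stable softmax.
    let max = scores.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
    let exps: Vec<f32> = scores.iter().map(|s| (s - max).exp()).collect();
    let sum: f32 = exps.iter().sum();
    // Attention-weighted sum of value vectors.
    let mut out = vec![0.0f32; values[0].len()];
    for (w, v) in exps.iter().zip(values) {
        for (o, x) in out.iter_mut().zip(v.iter()) {
            *o += (w / sum) * x;
        }
    }
    out
}
```

With identical keys the weights are uniform, so the output is the mean of the values.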
#### 2. Multi-Head Attention
Parallel attention heads for diverse representation learning:
```rust
let attention = multi_head(768, 12)
.dropout(0.1)
.build()?;
```
#### 3. Flash Attention
Memory-efficient O(n) attention using tiled computation:
```rust
let attention = flash(1024, 128) // dim, block_size
.causal(true)
.build()?;
```
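The core trick behind the tiled computation is the online (streaming) softmax: keep a running max, a running denominator, and a running weighted sum of values, rescaling them whenever a larger score appears. A minimal single-query sketch of that idea (the crate's actual implementation processes keys in blocks; this processes them one at a time):

```rust
/// Streaming-softmax attention: exact softmax(q·K^T)·V in one pass with O(1) extra memory.
fn streaming_attention(query: &[f32], keys: &[&[f32]], values: &[&[f32]]) -> Vec<f32> {
    let scale = 1.0 / (query.len() as f32).sqrt();
    let (mut m, mut l) = (f32::NEG_INFINITY, 0.0f32); // running max, running denominator
    let mut acc = vec![0.0f32; values[0].len()];      // running weighted value sum
    for (k, v) in keys.iter().zip(values) {
        let s = query.iter().zip(k.iter()).map(|(q, k)| q * k).sum::<f32>() * scale;
        let m_new = m.max(s);
        let correction = (m - m_new).exp(); // rescale prior accumulator to the new max
        let w = (s - m_new).exp();
        l = l * correction + w;
        for (a, &x) in acc.iter_mut().zip(v.iter()) {
            *a = *a * correction + w * x;
        }
        m = m_new;
    }
    acc.iter().map(|a| a / l).collect()
}
```

The result is bit-for-bit the same attention output as the two-pass softmax, which is why flash attention is an exact algorithm, not an approximation.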
#### 4. Linear Attention
O(n) complexity using kernel feature maps:
```rust
let attention = linear(512, 256) // dim, num_features
.build()?;
```
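The O(n) complexity comes from factoring the attention as `φ(q)^T (Σᵢ φ(kᵢ) vᵢ^T) / (φ(q)^T Σᵢ φ(kᵢ))`: the sums over keys are computed once, so cost grows linearly in sequence length. A sketch using `elu(x) + 1` as the positive feature map (an illustrative choice — the crate's feature map may differ):

```rust
/// Simple positive feature map: elu(x) + 1.
fn elu_plus_one(x: f32) -> f32 {
    if x > 0.0 { x + 1.0 } else { x.exp() }
}

/// Kernelized linear attention: O(n·d²) instead of O(n²·d).
fn linear_attention(query: &[f32], keys: &[&[f32]], values: &[&[f32]]) -> Vec<f32> {
    let d = query.len();
    let v_dim = values[0].len();
    let phi_q: Vec<f32> = query.iter().map(|&x| elu_plus_one(x)).collect();
    // Accumulate Σᵢ φ(kᵢ) vᵢ^T (d × v_dim) and Σᵢ φ(kᵢ) in a single pass over keys.
    let mut kv = vec![vec![0.0f32; v_dim]; d];
    let mut k_sum = vec![0.0f32; d];
    for (k, v) in keys.iter().zip(values) {
        for (j, &kj) in k.iter().enumerate() {
            let f = elu_plus_one(kj);
            k_sum[j] += f;
            for (c, &vc) in v.iter().enumerate() {
                kv[j][c] += f * vc;
            }
        }
    }
    let denom: f32 = phi_q.iter().zip(&k_sum).map(|(a, b)| a * b).sum();
    (0..v_dim)
        .map(|c| phi_q.iter().enumerate().map(|(j, &q)| q * kv[j][c]).sum::<f32>() / denom)
        .collect()
}
```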
#### 5. Local-Global Attention
Sliding window + global tokens (Longformer-style):
```rust
let attention = local_global(512, 256) // dim, window_size
.build()?;
```
#### 6. Hyperbolic Attention
Attention in hyperbolic space for hierarchical data:
```rust
let attention = hyperbolic(512, -1.0) // dim, curvature
.build()?;
```
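Hyperbolic attention replaces Euclidean dot-product similarity with a distance in the Poincaré ball, where distances grow exponentially near the boundary — a good fit for tree-like hierarchies. The underlying distance, sketched independently of the crate:

```rust
/// Poincaré ball distance: d(u, v) = arcosh(1 + 2‖u−v‖² / ((1−‖u‖²)(1−‖v‖²))).
/// Inputs must lie strictly inside the unit ball.
fn poincare_dist(u: &[f32], v: &[f32]) -> f32 {
    let norm_sq = |x: &[f32]| x.iter().map(|a| a * a).sum::<f32>();
    let diff_sq: f32 = u.iter().zip(v).map(|(a, b)| (a - b) * (a - b)).sum();
    let denom = (1.0 - norm_sq(u)) * (1.0 - norm_sq(v));
    (1.0 + 2.0 * diff_sq / denom).acosh()
}
```

Attention scores are then typically derived as a decreasing function of this distance (e.g. `-d(q, k)`) before the softmax.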
#### 7. Mixture-of-Experts Attention
Learned routing to specialized experts:
```rust
let attention = moe(512, 8, 2) // dim, num_experts, top_k
.expert_capacity(1.25)
.jitter_noise(0.01)
.build()?;
```
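The routing step itself is a top-k softmax gate: compute a logit per expert, keep the k largest, and renormalize their softmax weights. A minimal sketch of that gating logic (illustrative only — the crate's router also applies capacity limits and jitter noise):

```rust
/// Select the top-k experts by gate logit and return (expert_index, weight) pairs
/// whose weights sum to 1.
fn top_k_routing(logits: &[f32], k: usize) -> Vec<(usize, f32)> {
    let mut idx: Vec<usize> = (0..logits.len()).collect();
    // Sort expert indices by descending logit.
    idx.sort_by(|&a, &b| logits[b].partial_cmp(&logits[a]).unwrap());
    let top = &idx[..k];
    // Softmax over only the selected logits (numerically stable).
    let max = top.iter().map(|&i| logits[i]).fold(f32::NEG_INFINITY, f32::max);
    let exps: Vec<f32> = top.iter().map(|&i| (logits[i] - max).exp()).collect();
    let sum: f32 = exps.iter().sum();
    top.iter().zip(&exps).map(|(&i, e)| (i, e / sum)).collect()
}
```

The attention output is then the weight-averaged result of the selected experts.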
### Builder Options
All builders support these common options:
```rust
let attention = AttentionBuilder::new(512)
.multi_head(8) // Number of heads
.dropout(0.1) // Dropout probability
.causal(true) // Causal masking
.expert_capacity(1.25) // MoE capacity factor
.jitter_noise(0.01) // MoE routing noise
.build()?;
```
## Pipeline API
### Creating Pipelines
```rust
let pipeline = AttentionPipeline::new()
.add_attention(attention)
.add_norm(NormType::LayerNorm)
.add_dropout(0.1)
.add_residual()
.add_custom(|x| {
// Custom transformation
x.iter().map(|v| v.max(0.0)).collect()
});
```
### Normalization Types
```rust
// Layer Normalization (standard)
.add_norm(NormType::LayerNorm)
// RMS Normalization (simpler)
.add_norm(NormType::RMSNorm)
// Batch Normalization
.add_norm(NormType::BatchNorm)
```
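"Simpler" here is concrete: RMSNorm skips the mean subtraction that LayerNorm performs and only rescales by the root-mean-square. A sketch of both (without the learned gain/bias parameters a full implementation carries):

```rust
/// LayerNorm: center by the mean, then scale by the standard deviation.
fn layer_norm(x: &[f32], eps: f32) -> Vec<f32> {
    let n = x.len() as f32;
    let mean = x.iter().sum::<f32>() / n;
    let var = x.iter().map(|v| (v - mean).powi(2)).sum::<f32>() / n;
    x.iter().map(|v| (v - mean) / (var + eps).sqrt()).collect()
}

/// RMSNorm: scale by the root-mean-square only — no mean subtraction.
fn rms_norm(x: &[f32], eps: f32) -> Vec<f32> {
    let rms = (x.iter().map(|v| v * v).sum::<f32>() / x.len() as f32 + eps).sqrt();
    x.iter().map(|v| v / rms).collect()
}
```

Dropping the mean pass is why RMSNorm is both cheaper and a common default in recent pre-norm transformers.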
### Pre-built Transformers
```rust
// Standard post-norm transformer block
let block = transformer_block(attention, 0.1);
// Pre-norm transformer block (more stable)
let block = prenorm_transformer_block(attention, 0.1);
```
## Presets
### Model Presets
```rust
// BERT (bidirectional, 12 heads, 0.1 dropout)
AttentionPreset::Bert.builder(768)
// GPT (causal, 12 heads, 0.1 dropout)
AttentionPreset::Gpt.builder(768)
// Longformer (512 window, local-global)
AttentionPreset::Longformer.builder(512)
// Performer (linear attention, O(n))
AttentionPreset::Performer.builder(512)
// Flash (memory-efficient, 128 block)
AttentionPreset::FlashOptimized.builder(1024)
// Switch Transformer (8 experts, top-2)
AttentionPreset::SwitchTransformer.builder(512)
// Hyperbolic (hierarchical data)
AttentionPreset::HyperbolicTree.builder(512)
// T5 (encoder-decoder)
AttentionPreset::T5.builder(768)
// Vision Transformer
AttentionPreset::ViT.builder(768)
// Sparse Transformer
AttentionPreset::SparseTransformer.builder(512)
```
### Smart Selection
The SDK provides intelligent preset selection:
```rust
// Automatic based on sequence length
let attention = for_sequences(512, max_len).build()?;
// ≤512: BERT
// ≤4096: Longformer
// >4096: Performer
// Graph attention
let attention = for_graphs(256, hierarchical).build()?;
// hierarchical=true: Hyperbolic
// hierarchical=false: Multi-head
// Large-scale processing
let attention = for_large_scale(1024).build()?;
// Uses Flash attention
// Vision tasks
let attention = for_vision(768, patch_size).build()?;
// Uses ViT configuration
// Autoregressive generation
let attention = for_generation(768, context_len).build()?;
// ≤2048: GPT
// >2048: Flash with causal
// MoE with custom routing
let attention = for_moe(512, num_experts, top_k).build()?;
```
### From Model Names
```rust
// By model name (case-insensitive)
let bert = from_model_name("bert", 768)?;
let gpt = from_model_name("gpt2", 768)?;
let longformer = from_model_name("longformer", 512)?;
let t5 = from_model_name("t5", 768)?;
let vit = from_model_name("vit", 768)?;
```
## Advanced Examples
### Custom Transformer Layer
```rust
use ruvector_attention::sdk::*;
fn create_transformer_layer(dim: usize, num_heads: usize) -> AttentionResult<AttentionPipeline> {
let attention = multi_head(dim, num_heads)
.dropout(0.1)
.build()?;
Ok(AttentionPipeline::new()
.add_norm(NormType::LayerNorm) // Pre-norm
.add_attention(attention)
.add_dropout(0.1)
.add_residual()
.add_norm(NormType::LayerNorm)) // Post-norm
}
```
### Efficient Long-Sequence Processing
```rust
use ruvector_attention::sdk::*;
fn create_long_context_attention(dim: usize, max_len: usize) -> AttentionResult<Box<dyn Attention>> {
if max_len <= 2048 {
// Standard attention for short sequences
multi_head(dim, 12).build()
} else if max_len <= 16384 {
// Local-global for medium sequences
local_global(dim, 512).build()
} else {
// Linear attention for very long sequences
linear(dim, dim / 4).build()
}
}
```
### Hierarchical Graph Attention
```rust
use ruvector_attention::sdk::*;
fn create_graph_attention(dim: usize, is_tree: bool) -> AttentionResult<Box<dyn Attention>> {
if is_tree {
// Use hyperbolic space for tree-like structures
hyperbolic(dim, -1.0).build()
} else {
// Standard attention for general graphs
multi_head(dim, 8).build()
}
}
```
### Sparse + Dense Hybrid
```rust
use ruvector_attention::sdk::*;
fn create_hybrid_pipeline(dim: usize) -> AttentionResult<AttentionPipeline> {
// Local attention
let local = flash(dim, 128).build()?;
// Global attention (can be added in sequence)
let global = multi_head(dim, 8).build()?;
    Ok(AttentionPipeline::new()
        .add_attention(local)
        .add_norm(NormType::LayerNorm)
        .add_residual()
        // Global attention stage runs in sequence after the local one
        .add_attention(global)
        .add_norm(NormType::LayerNorm)
        .add_residual())
}
```
```
### MoE for Specialized Tasks
```rust
use ruvector_attention::sdk::*;
fn create_moe_attention(dim: usize) -> AttentionResult<Box<dyn Attention>> {
moe(dim, 16, 2) // 16 experts, route to top-2
.expert_capacity(1.5) // Higher capacity for load balancing
.jitter_noise(0.1) // Exploration during training
.build()
}
```
## Performance Tips
1. **Choose the right attention type:**
- Short sequences (<512): Standard multi-head
- Medium sequences (512-4096): Local-global or Flash
- Long sequences (>4096): Linear or Performer
- Hierarchical data: Hyperbolic
- Specialized patterns: MoE
2. **Use Flash attention for:**
- Long sequences
- Memory-constrained environments
- Training with limited GPU memory
3. **Use Linear attention for:**
- Very long sequences (>16k tokens)
- Inference-only scenarios
- Real-time applications
4. **Use MoE for:**
- Multi-task learning
- Specialized domain processing
- Scaling model capacity
5. **Pipeline optimization:**
- Pre-norm is more stable for deep models
- RMSNorm is faster than LayerNorm
- Dropout during training only
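Tip 5 notes that RMSNorm is faster than LayerNorm: it skips the mean-centering pass and rescales by the root mean square alone. A minimal standalone sketch of both (illustrative only, not the crate's internal `NormType` implementations):
```rust
/// LayerNorm: center by the mean, then scale by the standard deviation.
fn layer_norm(x: &[f32], eps: f32) -> Vec<f32> {
    let n = x.len() as f32;
    let mean = x.iter().sum::<f32>() / n;
    let var = x.iter().map(|v| (v - mean) * (v - mean)).sum::<f32>() / n;
    let inv = 1.0 / (var + eps).sqrt();
    x.iter().map(|v| (v - mean) * inv).collect()
}

/// RMSNorm: scale by the root mean square only, one fewer pass over the data.
fn rms_norm(x: &[f32], eps: f32) -> Vec<f32> {
    let n = x.len() as f32;
    let ms = x.iter().map(|v| v * v).sum::<f32>() / n;
    let inv = 1.0 / (ms + eps).sqrt();
    x.iter().map(|v| v * inv).collect()
}
```
Both produce unit-scale outputs; LayerNorm additionally zero-centers, which costs an extra reduction per vector.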
## Testing
```rust
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_attention_pipeline() {
let attention = multi_head(512, 8).build().unwrap();
let pipeline = AttentionPipeline::new()
.add_attention(attention)
.add_norm(NormType::LayerNorm);
let query = vec![0.5; 512];
let keys = vec![&query[..]; 10];
let values = vec![&query[..]; 10];
let output = pipeline.run(&query, &keys, &values).unwrap();
assert_eq!(output.len(), 512);
}
}
```
## Next Steps
- See `examples/` directory for complete working examples
- Check the API documentation for detailed parameter descriptions
- Review benchmarks in `benches/` for performance comparisons

View File

@@ -0,0 +1,263 @@
//! Benchmark: Lorentz Cascade Attention vs Poincaré Attention
//!
//! Run with: cargo bench -p ruvector-attention --bench hyperbolic_bench
use std::time::Instant;
// Import both attention mechanisms
use ruvector_attention::hyperbolic::{
busemann_score,
einstein_midpoint,
frechet_mean,
lorentz_distance,
// Poincaré (baseline)
poincare_distance,
project_hyperboloid,
HyperbolicAttention,
HyperbolicAttentionConfig,
LCAConfig,
// Lorentz Cascade (novel)
LorentzCascadeAttention,
};
fn generate_test_data(n: usize, dim: usize) -> (Vec<f32>, Vec<Vec<f32>>) {
let query: Vec<f32> = (0..dim)
.map(|i| ((i as f32 * 0.1).sin() * 0.3).clamp(-0.9, 0.9))
.collect();
let keys: Vec<Vec<f32>> = (0..n)
.map(|j| {
(0..dim)
.map(|i| (((i + j) as f32 * 0.07).cos() * 0.3).clamp(-0.9, 0.9))
.collect()
})
.collect();
(query, keys)
}
fn bench_poincare_distance(iterations: usize, n_keys: usize, dim: usize) -> std::time::Duration {
let (query, keys) = generate_test_data(n_keys, dim);
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let start = Instant::now();
for _ in 0..iterations {
for key in &keys_refs {
let _d = poincare_distance(&query, key, 1.0);
}
}
start.elapsed()
}
fn bench_lorentz_distance(iterations: usize, n_keys: usize, dim: usize) -> std::time::Duration {
let (query, keys) = generate_test_data(n_keys, dim + 1); // +1 for time dimension
let query_h = project_hyperboloid(&query, 1.0);
let keys_h: Vec<Vec<f32>> = keys.iter().map(|k| project_hyperboloid(k, 1.0)).collect();
let start = Instant::now();
for _ in 0..iterations {
for key in &keys_h {
let _d = lorentz_distance(&query_h, key, 1.0);
}
}
start.elapsed()
}
fn bench_busemann_scoring(iterations: usize, n_keys: usize, dim: usize) -> std::time::Duration {
let (query, keys) = generate_test_data(n_keys, dim + 1);
let focal: Vec<f32> = {
let mut f = vec![1.0];
f.extend(vec![0.0; dim]);
f[1] = 1.0; // Light-like
f
};
let query_h = project_hyperboloid(&query, 1.0);
let keys_h: Vec<Vec<f32>> = keys.iter().map(|k| project_hyperboloid(k, 1.0)).collect();
let start = Instant::now();
for _ in 0..iterations {
for key in &keys_h {
let _score = busemann_score(key, &focal) - busemann_score(&query_h, &focal);
}
}
start.elapsed()
}
fn bench_frechet_mean(iterations: usize, n_points: usize, dim: usize) -> std::time::Duration {
let (_, points) = generate_test_data(n_points, dim);
let points_refs: Vec<&[f32]> = points.iter().map(|p| p.as_slice()).collect();
let start = Instant::now();
for _ in 0..iterations {
let _mean = frechet_mean(&points_refs, None, 1.0, 50, 1e-5);
}
start.elapsed()
}
fn bench_einstein_midpoint(iterations: usize, n_points: usize, dim: usize) -> std::time::Duration {
let (_, points) = generate_test_data(n_points, dim + 1);
let points_h: Vec<Vec<f32>> = points.iter().map(|p| project_hyperboloid(p, 1.0)).collect();
let points_refs: Vec<&[f32]> = points_h.iter().map(|p| p.as_slice()).collect();
let weights: Vec<f32> = vec![1.0 / n_points as f32; n_points];
let start = Instant::now();
for _ in 0..iterations {
let _mid = einstein_midpoint(&points_refs, &weights, 1.0);
}
start.elapsed()
}
fn bench_full_poincare_attention(
iterations: usize,
n_keys: usize,
dim: usize,
) -> std::time::Duration {
let (query, keys) = generate_test_data(n_keys, dim);
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let config = HyperbolicAttentionConfig {
dim,
curvature: -1.0,
adaptive_curvature: false,
temperature: 1.0,
frechet_max_iter: 50,
frechet_tol: 1e-5,
};
let attention = HyperbolicAttention::new(config);
let start = Instant::now();
for _ in 0..iterations {
let _result = attention.compute_weights(&query, &keys_refs);
}
start.elapsed()
}
fn bench_full_lca_attention(iterations: usize, n_keys: usize, dim: usize) -> std::time::Duration {
let (query, keys) = generate_test_data(n_keys, dim);
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let config = LCAConfig {
dim,
num_heads: 4,
curvature_range: (0.1, 2.0),
temperature: 1.0,
};
let attention = LorentzCascadeAttention::new(config);
let start = Instant::now();
for _ in 0..iterations {
let _result = attention.attend(&query, &keys_refs, &keys_refs);
}
start.elapsed()
}
fn main() {
println!("╔══════════════════════════════════════════════════════════════════╗");
println!("║ Lorentz Cascade Attention (LCA) vs Poincaré Benchmark ║");
println!("╚══════════════════════════════════════════════════════════════════╝\n");
let iterations = 1000;
let n_keys = 100;
let dim = 64;
println!(
"Configuration: {} iterations, {} keys, {} dimensions\n",
iterations, n_keys, dim
);
// Distance computation benchmarks
println!("┌─────────────────────────────────────────────────────────────────┐");
println!("│ 1. DISTANCE COMPUTATION │");
println!("├─────────────────────────────────────────────────────────────────┤");
let poincare_dist_time = bench_poincare_distance(iterations, n_keys, dim);
let lorentz_dist_time = bench_lorentz_distance(iterations, n_keys, dim);
let busemann_time = bench_busemann_scoring(iterations, n_keys, dim);
let poincare_per_op = poincare_dist_time.as_nanos() as f64 / (iterations * n_keys) as f64;
let lorentz_per_op = lorentz_dist_time.as_nanos() as f64 / (iterations * n_keys) as f64;
let busemann_per_op = busemann_time.as_nanos() as f64 / (iterations * n_keys) as f64;
println!(
"│ Poincaré distance: {:>8.1} ns/op │",
poincare_per_op
);
println!(
"│ Lorentz distance: {:>8.1} ns/op ({:.1}x vs Poincaré) │",
lorentz_per_op,
poincare_per_op / lorentz_per_op
);
println!(
"│ Busemann scoring: {:>8.1} ns/op ({:.1}x vs Poincaré) │",
busemann_per_op,
poincare_per_op / busemann_per_op
);
println!("└─────────────────────────────────────────────────────────────────┘\n");
// Aggregation benchmarks
println!("┌─────────────────────────────────────────────────────────────────┐");
println!("│ 2. AGGREGATION (CENTROID) │");
println!("├─────────────────────────────────────────────────────────────────┤");
let frechet_time = bench_frechet_mean(iterations / 10, n_keys, dim); // Fewer iterations (slow)
let einstein_time = bench_einstein_midpoint(iterations, n_keys, dim);
let frechet_per_op = frechet_time.as_nanos() as f64 / (iterations / 10) as f64;
let einstein_per_op = einstein_time.as_nanos() as f64 / iterations as f64;
println!(
"│ Fréchet mean (50 iter): {:>10.1} ns/op │",
frechet_per_op
);
println!(
"│ Einstein midpoint: {:>10.1} ns/op ({:.1}x faster!) │",
einstein_per_op,
frechet_per_op / einstein_per_op
);
println!("└─────────────────────────────────────────────────────────────────┘\n");
// Full attention benchmarks
println!("┌─────────────────────────────────────────────────────────────────┐");
println!("│ 3. FULL ATTENTION (END-TO-END) │");
println!("├─────────────────────────────────────────────────────────────────┤");
let poincare_full_time = bench_full_poincare_attention(iterations / 10, n_keys, dim);
let lca_full_time = bench_full_lca_attention(iterations / 10, n_keys, dim);
let poincare_full_per_op = poincare_full_time.as_nanos() as f64 / (iterations / 10) as f64;
let lca_full_per_op = lca_full_time.as_nanos() as f64 / (iterations / 10) as f64;
println!(
"│ Poincaré Attention: {:>10.1} ns/op │",
poincare_full_per_op
);
println!(
"│ Lorentz Cascade (4 heads): {:>7.1} ns/op ({:.1}x speedup) │",
lca_full_per_op,
poincare_full_per_op / lca_full_per_op
);
println!("└─────────────────────────────────────────────────────────────────┘\n");
// Summary
println!("╔══════════════════════════════════════════════════════════════════╗");
println!("║ SUMMARY: Lorentz Cascade Attention Improvements ║");
println!("╠══════════════════════════════════════════════════════════════════╣");
println!(
"║ • Busemann scoring: {:.1}x faster than Poincaré distance ║",
poincare_per_op / busemann_per_op
);
println!(
"║ • Einstein midpoint: {:.1}x faster than Fréchet mean ║",
frechet_per_op / einstein_per_op
);
println!(
"║ • End-to-end: {:.1}x overall speedup ║",
poincare_full_per_op / lca_full_per_op
);
println!("║ ║");
println!("║ Additional benefits: ║");
println!("║ • No boundary instability (Lorentz vs Poincaré ball) ║");
println!("║ • Multi-scale hierarchy (4 curvature heads) ║");
println!("║ • Sparse attention via hierarchical pruning ║");
println!("╚══════════════════════════════════════════════════════════════════╝");
}

View File

@@ -0,0 +1,10 @@
//! Attention mechanism implementations.
//!
//! This module provides concrete implementations of various attention mechanisms
//! including scaled dot-product attention and multi-head attention.
pub mod multi_head;
pub mod scaled_dot_product;
pub use multi_head::MultiHeadAttention;
pub use scaled_dot_product::ScaledDotProductAttention;

View File

@@ -0,0 +1,149 @@
//! Multi-head attention implementation.
//!
//! Implements parallel attention heads for diverse representation learning.
use crate::{
error::{AttentionError, AttentionResult},
traits::Attention,
};
use super::scaled_dot_product::ScaledDotProductAttention;
/// Multi-head attention mechanism.
///
/// Splits the input into multiple heads, applies attention in parallel,
/// and concatenates the results. This allows the model to attend to
/// different representation subspaces simultaneously.
pub struct MultiHeadAttention {
dim: usize,
num_heads: usize,
head_dim: usize,
}
impl MultiHeadAttention {
/// Creates a new multi-head attention mechanism.
///
/// # Arguments
///
/// * `dim` - The embedding dimension
/// * `num_heads` - Number of attention heads
///
/// # Panics
///
/// Panics if `dim` is not divisible by `num_heads`.
pub fn new(dim: usize, num_heads: usize) -> Self {
assert!(
dim % num_heads == 0,
"Dimension {} must be divisible by number of heads {}",
dim,
num_heads
);
Self {
dim,
num_heads,
head_dim: dim / num_heads,
}
}
/// Splits input into multiple heads.
fn split_heads(&self, input: &[f32]) -> Vec<Vec<f32>> {
(0..self.num_heads)
.map(|h| {
let start = h * self.head_dim;
let end = start + self.head_dim;
input[start..end].to_vec()
})
.collect()
}
/// Concatenates outputs from multiple heads.
fn concat_heads(&self, heads: Vec<Vec<f32>>) -> Vec<f32> {
heads.into_iter().flatten().collect()
}
}
impl Attention for MultiHeadAttention {
fn compute(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
) -> AttentionResult<Vec<f32>> {
if query.len() != self.dim {
return Err(AttentionError::DimensionMismatch {
expected: self.dim,
actual: query.len(),
});
}
// Split query into heads
let query_heads = self.split_heads(query);
// Split keys and values
let key_heads: Vec<Vec<Vec<f32>>> = keys.iter().map(|k| self.split_heads(k)).collect();
let value_heads: Vec<Vec<Vec<f32>>> = values.iter().map(|v| self.split_heads(v)).collect();
// Compute attention for each head
let mut head_outputs = Vec::new();
for h in 0..self.num_heads {
let head_attn = ScaledDotProductAttention::new(self.head_dim);
let head_keys: Vec<&[f32]> = key_heads.iter().map(|kh| kh[h].as_slice()).collect();
let head_values: Vec<&[f32]> = value_heads.iter().map(|vh| vh[h].as_slice()).collect();
let head_out = head_attn.compute(&query_heads[h], &head_keys, &head_values)?;
head_outputs.push(head_out);
}
// Concatenate head outputs
Ok(self.concat_heads(head_outputs))
}
    fn compute_with_mask(
        &self,
        query: &[f32],
        keys: &[&[f32]],
        values: &[&[f32]],
        mask: Option<&[bool]>,
    ) -> AttentionResult<Vec<f32>> {
        let Some(mask) = mask else {
            return self.compute(query, keys, values);
        };
        // The mask is per key position, so the same mask applies inside each head.
        let query_heads = self.split_heads(query);
        let key_heads: Vec<Vec<Vec<f32>>> = keys.iter().map(|k| self.split_heads(k)).collect();
        let value_heads: Vec<Vec<Vec<f32>>> = values.iter().map(|v| self.split_heads(v)).collect();
        let mut head_outputs = Vec::with_capacity(self.num_heads);
        for h in 0..self.num_heads {
            let head_attn = ScaledDotProductAttention::new(self.head_dim);
            let head_keys: Vec<&[f32]> = key_heads.iter().map(|kh| kh[h].as_slice()).collect();
            let head_values: Vec<&[f32]> = value_heads.iter().map(|vh| vh[h].as_slice()).collect();
            head_outputs.push(head_attn.compute_with_mask(
                &query_heads[h],
                &head_keys,
                &head_values,
                Some(mask),
            )?);
        }
        Ok(self.concat_heads(head_outputs))
    }
fn dim(&self) -> usize {
self.dim
}
fn num_heads(&self) -> usize {
self.num_heads
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_multi_head() {
let attn = MultiHeadAttention::new(8, 2);
let query = vec![1.0_f32; 8];
let key1 = vec![0.5_f32; 8];
let key2 = vec![0.3_f32; 8];
let val1 = vec![1.0_f32; 8];
let val2 = vec![2.0_f32; 8];
let keys = vec![key1.as_slice(), key2.as_slice()];
let values = vec![val1.as_slice(), val2.as_slice()];
let result = attn.compute(&query, &keys, &values).unwrap();
assert_eq!(result.len(), 8);
}
#[test]
#[should_panic(expected = "divisible")]
fn test_invalid_heads() {
MultiHeadAttention::new(10, 3);
}
}

View File

@@ -0,0 +1,180 @@
//! Scaled dot-product attention implementation.
//!
//! Implements the fundamental attention mechanism: softmax(QK^T / √d)V
use crate::{
error::{AttentionError, AttentionResult},
traits::Attention,
};
/// Scaled dot-product attention: softmax(QK^T / √d)V
///
/// This is the fundamental attention mechanism used in transformers.
/// It computes attention scores by taking the dot product of queries
/// and keys, scaling by the square root of the dimension, applying
/// softmax, and using the result to weight values.
pub struct ScaledDotProductAttention {
dim: usize,
}
impl ScaledDotProductAttention {
/// Creates a new scaled dot-product attention mechanism.
///
/// # Arguments
///
/// * `dim` - The embedding dimension
pub fn new(dim: usize) -> Self {
Self { dim }
}
/// Computes attention scores (before softmax).
fn compute_scores(&self, query: &[f32], keys: &[&[f32]]) -> Vec<f32> {
let scale = (self.dim as f32).sqrt();
keys.iter()
.map(|key| {
query
.iter()
.zip(key.iter())
.map(|(q, k)| q * k)
.sum::<f32>()
/ scale
})
.collect()
}
    /// Applies a numerically stable softmax to attention scores.
    fn softmax(&self, scores: &[f32]) -> Vec<f32> {
        let max_score = scores.iter().fold(f32::NEG_INFINITY, |a, &b| a.max(b));
        if max_score == f32::NEG_INFINITY {
            // Every position is masked out; return zero weights rather than NaN.
            return vec![0.0; scores.len()];
        }
        let exp_scores: Vec<f32> = scores.iter().map(|s| (s - max_score).exp()).collect();
        let sum: f32 = exp_scores.iter().sum();
        exp_scores.iter().map(|e| e / sum).collect()
    }
}
impl Attention for ScaledDotProductAttention {
fn compute(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
) -> AttentionResult<Vec<f32>> {
if query.len() != self.dim {
return Err(AttentionError::DimensionMismatch {
expected: self.dim,
actual: query.len(),
});
}
if keys.is_empty() || values.is_empty() {
return Err(AttentionError::EmptyInput("keys or values".to_string()));
}
if keys.len() != values.len() {
return Err(AttentionError::DimensionMismatch {
expected: keys.len(),
actual: values.len(),
});
}
// Compute attention scores
let scores = self.compute_scores(query, keys);
// Apply softmax
let weights = self.softmax(&scores);
// Weight values
let mut output = vec![0.0; self.dim];
for (weight, value) in weights.iter().zip(values.iter()) {
for (out, val) in output.iter_mut().zip(value.iter()) {
*out += weight * val;
}
}
Ok(output)
}
fn compute_with_mask(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
mask: Option<&[bool]>,
) -> AttentionResult<Vec<f32>> {
if mask.is_none() {
return self.compute(query, keys, values);
}
let mask = mask.unwrap();
if mask.len() != keys.len() {
return Err(AttentionError::InvalidMask {
expected: format!("{}", keys.len()),
actual: format!("{}", mask.len()),
});
}
// Compute scores
let mut scores = self.compute_scores(query, keys);
// Apply mask (set masked positions to very negative value)
for (score, &m) in scores.iter_mut().zip(mask.iter()) {
if !m {
*score = f32::NEG_INFINITY;
}
}
// Apply softmax
let weights = self.softmax(&scores);
// Weight values
let mut output = vec![0.0; self.dim];
for (weight, value) in weights.iter().zip(values.iter()) {
for (out, val) in output.iter_mut().zip(value.iter()) {
*out += weight * val;
}
}
Ok(output)
}
fn dim(&self) -> usize {
self.dim
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_scaled_dot_product() {
let attn = ScaledDotProductAttention::new(4);
let query = vec![1.0_f32, 0.0, 0.0, 0.0];
let key1 = vec![1.0_f32, 0.0, 0.0, 0.0];
let key2 = vec![0.0_f32, 1.0, 0.0, 0.0];
let val1 = vec![1.0_f32, 2.0, 3.0, 4.0];
let val2 = vec![5.0_f32, 6.0, 7.0, 8.0];
let keys = vec![key1.as_slice(), key2.as_slice()];
let values = vec![val1.as_slice(), val2.as_slice()];
let result = attn.compute(&query, &keys, &values).unwrap();
assert_eq!(result.len(), 4);
}
#[test]
fn test_with_mask() {
let attn = ScaledDotProductAttention::new(4);
let query = vec![1.0_f32; 4];
let key1 = vec![1.0_f32; 4];
let key2 = vec![0.5_f32; 4];
let val1 = vec![1.0_f32; 4];
let val2 = vec![2.0_f32; 4];
let keys = vec![key1.as_slice(), key2.as_slice()];
let values = vec![val1.as_slice(), val2.as_slice()];
let mask = vec![true, false];
let result = attn
.compute_with_mask(&query, &keys, &values, Some(&mask))
.unwrap();
assert_eq!(result.len(), 4);
}
}

View File

@@ -0,0 +1,395 @@
//! Configuration types for attention mechanisms.
//!
//! This module provides configuration structs and builders for various
//! attention mechanisms including standard, graph, and sparse attention.
use serde::{Deserialize, Serialize};
use crate::error::{AttentionError, AttentionResult};
/// Configuration for standard attention mechanisms.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct AttentionConfig {
/// Model dimension (d_model)
pub dim: usize,
/// Number of attention heads
pub num_heads: usize,
/// Dropout probability (0.0 to 1.0)
pub dropout: f32,
/// Scaling factor (default: 1/sqrt(d_k))
pub scale: Option<f32>,
/// Whether to use causal masking
pub causal: bool,
}
impl AttentionConfig {
/// Creates a new builder for AttentionConfig.
pub fn builder() -> AttentionConfigBuilder {
AttentionConfigBuilder::default()
}
/// Validates the configuration.
pub fn validate(&self) -> AttentionResult<()> {
if self.dim == 0 {
return Err(AttentionError::InvalidConfig(
"dimension must be greater than 0".to_string(),
));
}
if self.num_heads == 0 {
return Err(AttentionError::InvalidConfig(
"num_heads must be greater than 0".to_string(),
));
}
if self.dim % self.num_heads != 0 {
return Err(AttentionError::InvalidHeadCount {
dim: self.dim,
num_heads: self.num_heads,
});
}
if self.dropout < 0.0 || self.dropout > 1.0 {
return Err(AttentionError::InvalidConfig(
"dropout must be in range [0.0, 1.0]".to_string(),
));
}
if let Some(scale) = self.scale {
if !scale.is_finite() || scale <= 0.0 {
return Err(AttentionError::InvalidConfig(
"scale must be positive and finite".to_string(),
));
}
}
Ok(())
}
/// Returns the dimension per head (d_k).
#[inline]
pub fn head_dim(&self) -> usize {
self.dim / self.num_heads
}
/// Returns the effective scale factor.
#[inline]
pub fn effective_scale(&self) -> f32 {
self.scale
.unwrap_or_else(|| 1.0 / (self.head_dim() as f32).sqrt())
}
}
/// Builder for AttentionConfig.
#[derive(Default)]
pub struct AttentionConfigBuilder {
dim: Option<usize>,
num_heads: Option<usize>,
dropout: f32,
scale: Option<f32>,
causal: bool,
}
impl AttentionConfigBuilder {
/// Sets the model dimension.
pub fn dim(mut self, dim: usize) -> Self {
self.dim = Some(dim);
self
}
/// Sets the number of attention heads.
pub fn num_heads(mut self, num_heads: usize) -> Self {
self.num_heads = Some(num_heads);
self
}
/// Sets the dropout probability.
pub fn dropout(mut self, dropout: f32) -> Self {
self.dropout = dropout;
self
}
/// Sets a custom scale factor.
pub fn scale(mut self, scale: f32) -> Self {
self.scale = Some(scale);
self
}
/// Enables causal masking.
pub fn causal(mut self, causal: bool) -> Self {
self.causal = causal;
self
}
/// Builds the AttentionConfig.
pub fn build(self) -> AttentionResult<AttentionConfig> {
let config = AttentionConfig {
dim: self.dim.ok_or_else(|| {
AttentionError::InvalidConfig("dimension must be specified".to_string())
})?,
num_heads: self.num_heads.ok_or_else(|| {
AttentionError::InvalidConfig("num_heads must be specified".to_string())
})?,
dropout: self.dropout,
scale: self.scale,
causal: self.causal,
};
config.validate()?;
Ok(config)
}
}
/// Configuration for graph attention networks.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct GraphAttentionConfig {
/// Base attention configuration
pub base: AttentionConfig,
/// Edge feature dimension (if using edge features)
pub edge_dim: Option<usize>,
/// Negative slope for LeakyReLU
pub negative_slope: f32,
/// Whether to concatenate multi-head outputs (vs averaging)
pub concat_heads: bool,
}
impl GraphAttentionConfig {
/// Creates a new builder for GraphAttentionConfig.
pub fn builder() -> GraphAttentionConfigBuilder {
GraphAttentionConfigBuilder::default()
}
/// Validates the configuration.
pub fn validate(&self) -> AttentionResult<()> {
self.base.validate()?;
if self.negative_slope <= 0.0 || !self.negative_slope.is_finite() {
return Err(AttentionError::InvalidConfig(
"negative_slope must be positive and finite".to_string(),
));
}
if let Some(edge_dim) = self.edge_dim {
if edge_dim == 0 {
return Err(AttentionError::InvalidConfig(
"edge_dim must be greater than 0".to_string(),
));
}
}
Ok(())
}
}
/// Builder for GraphAttentionConfig.
#[derive(Default)]
pub struct GraphAttentionConfigBuilder {
base_builder: AttentionConfigBuilder,
edge_dim: Option<usize>,
negative_slope: f32,
concat_heads: bool,
}
impl GraphAttentionConfigBuilder {
/// Sets the model dimension.
pub fn dim(mut self, dim: usize) -> Self {
self.base_builder = self.base_builder.dim(dim);
self
}
/// Sets the number of attention heads.
pub fn num_heads(mut self, num_heads: usize) -> Self {
self.base_builder = self.base_builder.num_heads(num_heads);
self
}
/// Sets the edge feature dimension.
pub fn edge_dim(mut self, edge_dim: usize) -> Self {
self.edge_dim = Some(edge_dim);
self
}
/// Sets the negative slope for LeakyReLU.
pub fn negative_slope(mut self, slope: f32) -> Self {
self.negative_slope = slope;
self
}
/// Sets whether to concatenate multi-head outputs.
pub fn concat_heads(mut self, concat: bool) -> Self {
self.concat_heads = concat;
self
}
/// Builds the GraphAttentionConfig.
pub fn build(self) -> AttentionResult<GraphAttentionConfig> {
let config = GraphAttentionConfig {
base: self.base_builder.build()?,
edge_dim: self.edge_dim,
negative_slope: if self.negative_slope == 0.0 {
0.2
} else {
self.negative_slope
},
concat_heads: self.concat_heads,
};
config.validate()?;
Ok(config)
}
}
/// Configuration for sparse attention mechanisms.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SparseAttentionConfig {
/// Base attention configuration
pub base: AttentionConfig,
/// Block size for block-sparse attention
pub block_size: usize,
/// Number of random blocks per query
pub num_random_blocks: usize,
/// Number of global tokens
pub num_global_tokens: usize,
}
impl SparseAttentionConfig {
/// Creates a new builder for SparseAttentionConfig.
pub fn builder() -> SparseAttentionConfigBuilder {
SparseAttentionConfigBuilder::default()
}
/// Validates the configuration.
pub fn validate(&self) -> AttentionResult<()> {
self.base.validate()?;
if self.block_size == 0 {
return Err(AttentionError::InvalidConfig(
"block_size must be greater than 0".to_string(),
));
}
Ok(())
}
}
/// Builder for SparseAttentionConfig.
#[derive(Default)]
pub struct SparseAttentionConfigBuilder {
base_builder: AttentionConfigBuilder,
block_size: usize,
num_random_blocks: usize,
num_global_tokens: usize,
}
impl SparseAttentionConfigBuilder {
/// Sets the model dimension.
pub fn dim(mut self, dim: usize) -> Self {
self.base_builder = self.base_builder.dim(dim);
self
}
/// Sets the number of attention heads.
pub fn num_heads(mut self, num_heads: usize) -> Self {
self.base_builder = self.base_builder.num_heads(num_heads);
self
}
/// Sets the block size.
pub fn block_size(mut self, block_size: usize) -> Self {
self.block_size = block_size;
self
}
/// Sets the number of random blocks.
pub fn num_random_blocks(mut self, num_random_blocks: usize) -> Self {
self.num_random_blocks = num_random_blocks;
self
}
/// Sets the number of global tokens.
pub fn num_global_tokens(mut self, num_global_tokens: usize) -> Self {
self.num_global_tokens = num_global_tokens;
self
}
/// Builds the SparseAttentionConfig.
pub fn build(self) -> AttentionResult<SparseAttentionConfig> {
let config = SparseAttentionConfig {
base: self.base_builder.build()?,
block_size: if self.block_size == 0 {
64
} else {
self.block_size
},
num_random_blocks: self.num_random_blocks,
num_global_tokens: self.num_global_tokens,
};
config.validate()?;
Ok(config)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_attention_config_builder() {
let config = AttentionConfig::builder()
.dim(512)
.num_heads(8)
.dropout(0.1)
.causal(true)
.build()
.unwrap();
assert_eq!(config.dim, 512);
assert_eq!(config.num_heads, 8);
assert_eq!(config.dropout, 0.1);
assert!(config.causal);
assert_eq!(config.head_dim(), 64);
}
#[test]
fn test_config_validation() {
let result = AttentionConfig::builder()
.dim(512)
.num_heads(7) // Not divisible
.build();
assert!(result.is_err());
}
#[test]
fn test_graph_attention_config() {
let config = GraphAttentionConfig::builder()
.dim(256)
.num_heads(4)
.edge_dim(16)
.negative_slope(0.2)
.concat_heads(true)
.build()
.unwrap();
assert_eq!(config.base.dim, 256);
assert_eq!(config.edge_dim, Some(16));
assert!(config.concat_heads);
}
#[test]
fn test_sparse_attention_config() {
let config = SparseAttentionConfig::builder()
.dim(512)
.num_heads(8)
.block_size(64)
.num_random_blocks(3)
.num_global_tokens(64)
.build()
.unwrap();
assert_eq!(config.base.dim, 512);
assert_eq!(config.block_size, 64);
assert_eq!(config.num_random_blocks, 3);
}
}

View File

@@ -0,0 +1,260 @@
//! Component Quantization for Mixed-Curvature Attention
//!
//! Different precision for each geometric component:
//! - Euclidean: 7-8 bit (needs precision)
//! - Hyperbolic tangent: 5 bit (tolerates noise)
//! - Spherical: 5 bit (only direction matters)
use serde::{Deserialize, Serialize};
/// Quantization configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QuantizationConfig {
/// Bits for Euclidean component
pub euclidean_bits: u8,
/// Bits for Hyperbolic component
pub hyperbolic_bits: u8,
/// Bits for Spherical component
pub spherical_bits: u8,
}
impl Default for QuantizationConfig {
fn default() -> Self {
Self {
euclidean_bits: 8,
hyperbolic_bits: 5,
spherical_bits: 5,
}
}
}
/// Quantized vector representation
#[derive(Debug, Clone)]
pub struct QuantizedVector {
/// Quantized Euclidean component
pub euclidean: Vec<i8>,
/// Euclidean scale factor
pub euclidean_scale: f32,
/// Quantized Hyperbolic component
pub hyperbolic: Vec<i8>,
/// Hyperbolic scale factor
pub hyperbolic_scale: f32,
/// Quantized Spherical component
pub spherical: Vec<i8>,
/// Spherical scale factor
pub spherical_scale: f32,
}
/// Component quantizer for efficient storage and compute
#[derive(Debug, Clone)]
pub struct ComponentQuantizer {
config: QuantizationConfig,
euclidean_levels: i32,
hyperbolic_levels: i32,
spherical_levels: i32,
}
impl ComponentQuantizer {
/// Create new quantizer
pub fn new(config: QuantizationConfig) -> Self {
Self {
euclidean_levels: (1 << (config.euclidean_bits - 1)) - 1,
hyperbolic_levels: (1 << (config.hyperbolic_bits - 1)) - 1,
spherical_levels: (1 << (config.spherical_bits - 1)) - 1,
config,
}
}
/// Quantize a component vector
fn quantize_component(&self, values: &[f32], levels: i32) -> (Vec<i8>, f32) {
if values.is_empty() {
return (vec![], 1.0);
}
// Find absmax for scale
let absmax = values
.iter()
.map(|v| v.abs())
.fold(0.0f32, f32::max)
.max(1e-8);
let scale = absmax / levels as f32;
let inv_scale = levels as f32 / absmax;
let quantized: Vec<i8> = values
.iter()
.map(|v| (v * inv_scale).round().clamp(-127.0, 127.0) as i8)
.collect();
(quantized, scale)
}
/// Dequantize a component
fn dequantize_component(&self, quantized: &[i8], scale: f32) -> Vec<f32> {
quantized.iter().map(|&q| q as f32 * scale).collect()
}
/// Quantize full vector with component ranges
pub fn quantize(
&self,
vector: &[f32],
e_range: std::ops::Range<usize>,
h_range: std::ops::Range<usize>,
s_range: std::ops::Range<usize>,
) -> QuantizedVector {
let (euclidean, euclidean_scale) =
self.quantize_component(&vector[e_range], self.euclidean_levels);
let (hyperbolic, hyperbolic_scale) =
self.quantize_component(&vector[h_range], self.hyperbolic_levels);
let (spherical, spherical_scale) =
self.quantize_component(&vector[s_range], self.spherical_levels);
QuantizedVector {
euclidean,
euclidean_scale,
hyperbolic,
hyperbolic_scale,
spherical,
spherical_scale,
}
}
/// Compute dot product between quantized vectors (integer arithmetic)
#[inline]
pub fn quantized_dot_product(
&self,
a: &QuantizedVector,
b: &QuantizedVector,
weights: &[f32; 3],
) -> f32 {
// Integer dot products
let dot_e = Self::int_dot(&a.euclidean, &b.euclidean);
let dot_h = Self::int_dot(&a.hyperbolic, &b.hyperbolic);
let dot_s = Self::int_dot(&a.spherical, &b.spherical);
// Scale and weight
let sim_e = dot_e as f32 * a.euclidean_scale * b.euclidean_scale;
let sim_h = dot_h as f32 * a.hyperbolic_scale * b.hyperbolic_scale;
let sim_s = dot_s as f32 * a.spherical_scale * b.spherical_scale;
weights[0] * sim_e + weights[1] * sim_h + weights[2] * sim_s
}
/// Integer dot product (SIMD-friendly)
#[inline(always)]
fn int_dot(a: &[i8], b: &[i8]) -> i32 {
let len = a.len().min(b.len());
let chunks = len / 4;
let remainder = len % 4;
let mut sum0 = 0i32;
let mut sum1 = 0i32;
let mut sum2 = 0i32;
let mut sum3 = 0i32;
for i in 0..chunks {
let base = i * 4;
sum0 += a[base] as i32 * b[base] as i32;
sum1 += a[base + 1] as i32 * b[base + 1] as i32;
sum2 += a[base + 2] as i32 * b[base + 2] as i32;
sum3 += a[base + 3] as i32 * b[base + 3] as i32;
}
let base = chunks * 4;
for i in 0..remainder {
sum0 += a[base + i] as i32 * b[base + i] as i32;
}
sum0 + sum1 + sum2 + sum3
}
/// Dequantize to full vector
pub fn dequantize(&self, quant: &QuantizedVector, total_dim: usize) -> Vec<f32> {
let mut result = vec![0.0f32; total_dim];
let e_vec = self.dequantize_component(&quant.euclidean, quant.euclidean_scale);
let h_vec = self.dequantize_component(&quant.hyperbolic, quant.hyperbolic_scale);
let s_vec = self.dequantize_component(&quant.spherical, quant.spherical_scale);
let e_end = e_vec.len();
let h_end = e_end + h_vec.len();
result[0..e_end].copy_from_slice(&e_vec);
result[e_end..h_end].copy_from_slice(&h_vec);
result[h_end..h_end + s_vec.len()].copy_from_slice(&s_vec);
result
}
/// Get memory savings ratio
pub fn compression_ratio(&self, dim: usize, e_dim: usize, h_dim: usize, s_dim: usize) -> f32 {
let original_bits = dim as f32 * 32.0;
let quantized_bits = e_dim as f32 * self.config.euclidean_bits as f32
+ h_dim as f32 * self.config.hyperbolic_bits as f32
+ s_dim as f32 * self.config.spherical_bits as f32
+ 3.0 * 32.0; // 3 scale factors
original_bits / quantized_bits
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_quantize_dequantize() {
let quantizer = ComponentQuantizer::new(QuantizationConfig::default());
let vector = vec![0.5f32; 64];
let e_range = 0..32;
let h_range = 32..48;
let s_range = 48..64;
let quantized =
quantizer.quantize(&vector, e_range.clone(), h_range.clone(), s_range.clone());
assert_eq!(quantized.euclidean.len(), 32);
assert_eq!(quantized.hyperbolic.len(), 16);
assert_eq!(quantized.spherical.len(), 16);
// Dequantize and check approximate equality
let dequantized = quantizer.dequantize(&quantized, 64);
for (&orig, &deq) in vector.iter().zip(dequantized.iter()) {
assert!((orig - deq).abs() < 0.1);
}
}
#[test]
fn test_quantized_dot_product() {
let quantizer = ComponentQuantizer::new(QuantizationConfig::default());
let a = vec![1.0f32; 64];
let b = vec![1.0f32; 64];
let e_range = 0..32;
let h_range = 32..48;
let s_range = 48..64;
let qa = quantizer.quantize(&a, e_range.clone(), h_range.clone(), s_range.clone());
let qb = quantizer.quantize(&b, e_range, h_range, s_range);
let weights = [0.5, 0.3, 0.2];
let dot = quantizer.quantized_dot_product(&qa, &qb, &weights);
// Should be positive for same vectors
assert!(dot > 0.0);
}
#[test]
fn test_compression_ratio() {
let quantizer = ComponentQuantizer::new(QuantizationConfig::default());
let ratio = quantizer.compression_ratio(512, 256, 192, 64);
// With 8/5/5 bits vs 32 bits, expect ~4-5x compression
assert!(ratio > 3.0);
assert!(ratio < 7.0);
}
}
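The `quantize`/`dequantize` round trip exercised in the tests above rests on symmetric uniform quantization with a per-component max-abs scale. A minimal standalone sketch of that scheme, assuming max-abs calibration to i8; the helper names here are illustrative stand-ins, not the crate's API:

```rust
/// Quantize f32 values to i8 codes with a per-component scale (max-abs calibration).
fn quantize_component(v: &[f32]) -> (Vec<i8>, f32) {
    let max_abs = v.iter().fold(0.0f32, |m, &x| m.max(x.abs()));
    let scale = if max_abs > 0.0 { max_abs / 127.0 } else { 1.0 };
    let q = v
        .iter()
        .map(|&x| (x / scale).round().clamp(-127.0, 127.0) as i8)
        .collect();
    (q, scale)
}

/// Recover approximate f32 values from the quantized codes.
fn dequantize_component(q: &[i8], scale: f32) -> Vec<f32> {
    q.iter().map(|&x| x as f32 * scale).collect()
}

fn main() {
    let v = vec![0.5f32, -0.25, 0.125, 0.0];
    let (q, scale) = quantize_component(&v);
    let r = dequantize_component(&q, scale);
    for (a, b) in v.iter().zip(r.iter()) {
        // Round-trip error is bounded by half a quantization step
        assert!((a - b).abs() <= scale * 0.5 + 1e-7);
    }
}
```

The same round trip with fewer bits (as in the 5-bit hyperbolic/spherical components) just shrinks the code range, trading precision for the compression ratio computed above.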


@@ -0,0 +1,441 @@
//! Fused Mixed-Curvature Attention
//!
//! Single kernel that computes Euclidean, Hyperbolic (tangent), and Spherical
//! similarities in one pass for maximum cache efficiency.
//!
//! logit(q,k) = a * dot(q_E, k_E) + b * dot(q_H_tan, k_H_tan) + c * dot(q_S, k_S)
use super::tangent_space::{TangentSpaceConfig, TangentSpaceMapper};
use crate::error::{AttentionError, AttentionResult};
use crate::traits::Attention;
use serde::{Deserialize, Serialize};
/// Configuration for fused mixed-curvature attention
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FusedCurvatureConfig {
/// Total dimension
pub dim: usize,
/// Euclidean component dimension
pub euclidean_dim: usize,
/// Hyperbolic component dimension
pub hyperbolic_dim: usize,
/// Spherical component dimension
pub spherical_dim: usize,
/// Mixing weight for Euclidean component
pub weight_e: f32,
/// Mixing weight for Hyperbolic component
pub weight_h: f32,
/// Mixing weight for Spherical component
pub weight_s: f32,
/// Hyperbolic curvature
pub hyperbolic_curvature: f32,
/// Temperature for softmax
pub temperature: f32,
/// Number of attention heads
pub num_heads: usize,
/// Per-head weight variation (low-rank)
pub per_head_variation: f32,
}
impl Default for FusedCurvatureConfig {
fn default() -> Self {
Self {
dim: 512,
euclidean_dim: 256,
hyperbolic_dim: 192,
spherical_dim: 64,
weight_e: 0.5,
weight_h: 0.35,
weight_s: 0.15,
hyperbolic_curvature: -1.0,
temperature: 1.0,
num_heads: 8,
per_head_variation: 0.1,
}
}
}
impl FusedCurvatureConfig {
/// Validate config
pub fn validate(&self) -> Result<(), String> {
if self.euclidean_dim + self.hyperbolic_dim + self.spherical_dim != self.dim {
return Err("Component dimensions must sum to total dim".into());
}
Ok(())
}
/// Get component ranges
pub fn component_ranges(
&self,
) -> (
std::ops::Range<usize>,
std::ops::Range<usize>,
std::ops::Range<usize>,
) {
let e_end = self.euclidean_dim;
let h_end = e_end + self.hyperbolic_dim;
let s_end = h_end + self.spherical_dim;
(0..e_end, e_end..h_end, h_end..s_end)
}
}
/// Window cache for mixed-curvature attention
#[derive(Debug, Clone)]
pub struct MixedCurvatureCache {
/// Tangent-space mapped hyperbolic components [N × h_dim]
pub keys_hyperbolic_tangent: Vec<Vec<f32>>,
/// Normalized spherical components [N × s_dim]
pub keys_spherical_normalized: Vec<Vec<f32>>,
/// Number of keys
pub num_keys: usize,
}
/// Fused mixed-curvature attention
///
/// Computes attention with Euclidean, Hyperbolic, and Spherical
/// similarities in a single fused kernel.
#[derive(Debug, Clone)]
pub struct MixedCurvatureFusedAttention {
config: FusedCurvatureConfig,
tangent_mapper: TangentSpaceMapper,
/// Per-head weight modifiers [num_heads × 3]
head_weights: Vec<[f32; 3]>,
}
impl MixedCurvatureFusedAttention {
/// Create new fused attention
pub fn new(config: FusedCurvatureConfig) -> Self {
let tangent_config = TangentSpaceConfig {
hyperbolic_dim: config.hyperbolic_dim,
curvature: config.hyperbolic_curvature,
learnable_origin: true,
};
let tangent_mapper = TangentSpaceMapper::new(tangent_config);
// Initialize per-head weights with small variation
let head_weights: Vec<[f32; 3]> = (0..config.num_heads)
.map(|h| {
let var = config.per_head_variation;
let h_factor = h as f32 / config.num_heads as f32 - 0.5;
[
config.weight_e + h_factor * var,
config.weight_h - h_factor * var * 0.5,
config.weight_s + h_factor * var * 0.5,
]
})
.collect();
Self {
config,
tangent_mapper,
head_weights,
}
}
/// Create with balanced weights
pub fn with_dim(dim: usize) -> Self {
let e_dim = dim / 2;
let h_dim = dim / 4;
let s_dim = dim - e_dim - h_dim;
let config = FusedCurvatureConfig {
dim,
euclidean_dim: e_dim,
hyperbolic_dim: h_dim,
spherical_dim: s_dim,
..Default::default()
};
Self::new(config)
}
/// Build cache for keys (pre-compute expensive operations)
pub fn build_cache(&self, keys: &[&[f32]]) -> MixedCurvatureCache {
let (_e_range, h_range, s_range) = self.config.component_ranges();
// Pre-map hyperbolic components to tangent space
let keys_hyperbolic_tangent: Vec<Vec<f32>> = keys
.iter()
.map(|k| {
let h_part = &k[h_range.clone()];
self.tangent_mapper.log_map(h_part)
})
.collect();
// Pre-normalize spherical components
let keys_spherical_normalized: Vec<Vec<f32>> = keys
.iter()
.map(|k| {
let s_part = &k[s_range.clone()];
Self::normalize(s_part)
})
.collect();
MixedCurvatureCache {
keys_hyperbolic_tangent,
keys_spherical_normalized,
num_keys: keys.len(),
}
}
/// Compute attention with cache (fast path)
pub fn compute_with_cache(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
cache: &MixedCurvatureCache,
head_idx: usize,
) -> AttentionResult<Vec<f32>> {
        let num_keys = cache.num_keys;
        if num_keys == 0 {
            return Err(AttentionError::EmptyInput("No keys in cache".into()));
        }
        if values.len() != num_keys {
            return Err(AttentionError::DimensionMismatch {
                expected: num_keys,
                actual: values.len(),
            });
        }
let (e_range, h_range, s_range) = self.config.component_ranges();
let weights = &self.head_weights[head_idx % self.head_weights.len()];
// Extract query components
let q_e = &query[e_range.clone()];
let q_h = &query[h_range.clone()];
let q_s = &query[s_range.clone()];
// Map query hyperbolic to tangent space
let q_h_tangent = self.tangent_mapper.log_map(q_h);
// Normalize query spherical
let q_s_normalized = Self::normalize(q_s);
// Compute fused logits
let logits: Vec<f32> = (0..num_keys)
.map(|i| {
let k = keys[i];
// Euclidean similarity (dot product)
let sim_e = Self::dot_product_simd(&q_e, &k[e_range.clone()]);
// Hyperbolic similarity (tangent space dot product)
let sim_h = Self::dot_product_simd(&q_h_tangent, &cache.keys_hyperbolic_tangent[i]);
// Spherical similarity (normalized dot product)
let sim_s =
Self::dot_product_simd(&q_s_normalized, &cache.keys_spherical_normalized[i]);
// Fused logit
(weights[0] * sim_e + weights[1] * sim_h + weights[2] * sim_s)
/ self.config.temperature
})
.collect();
// Softmax
let attention_weights = Self::stable_softmax(&logits);
// Weighted sum
self.weighted_sum(&attention_weights, values)
}
/// Fused similarity computation (single pass through all components)
/// This is the hot path - maximize SIMD utilization
#[inline]
pub fn fused_similarity(
&self,
query: &[f32],
key: &[f32],
key_h_tangent: &[f32],
key_s_normalized: &[f32],
query_h_tangent: &[f32],
query_s_normalized: &[f32],
weights: &[f32; 3],
) -> f32 {
let (e_range, _, _) = self.config.component_ranges();
// Euclidean: direct dot product on original vectors
let sim_e = Self::dot_product_simd(&query[e_range.clone()], &key[e_range.clone()]);
// Hyperbolic: dot product in tangent space
let sim_h = Self::dot_product_simd(query_h_tangent, key_h_tangent);
// Spherical: dot product of normalized vectors
let sim_s = Self::dot_product_simd(query_s_normalized, key_s_normalized);
weights[0] * sim_e + weights[1] * sim_h + weights[2] * sim_s
}
/// Normalize vector to unit length
#[inline]
fn normalize(v: &[f32]) -> Vec<f32> {
let norm: f32 = v.iter().map(|x| x * x).sum::<f32>().sqrt();
if norm > 1e-8 {
v.iter().map(|x| x / norm).collect()
} else {
v.to_vec()
}
}
/// SIMD-friendly dot product
#[inline(always)]
fn dot_product_simd(a: &[f32], b: &[f32]) -> f32 {
let len = a.len().min(b.len());
let chunks = len / 4;
let remainder = len % 4;
let mut sum0 = 0.0f32;
let mut sum1 = 0.0f32;
let mut sum2 = 0.0f32;
let mut sum3 = 0.0f32;
for i in 0..chunks {
let base = i * 4;
sum0 += a[base] * b[base];
sum1 += a[base + 1] * b[base + 1];
sum2 += a[base + 2] * b[base + 2];
sum3 += a[base + 3] * b[base + 3];
}
let base = chunks * 4;
for i in 0..remainder {
sum0 += a[base + i] * b[base + i];
}
sum0 + sum1 + sum2 + sum3
}
/// Stable softmax
fn stable_softmax(logits: &[f32]) -> Vec<f32> {
if logits.is_empty() {
return vec![];
}
let max_logit = logits.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
let exp_logits: Vec<f32> = logits.iter().map(|&l| (l - max_logit).exp()).collect();
let sum: f32 = exp_logits.iter().sum();
exp_logits.iter().map(|&e| e / sum).collect()
}
/// Weighted sum
fn weighted_sum(&self, weights: &[f32], values: &[&[f32]]) -> AttentionResult<Vec<f32>> {
        if weights.is_empty() || values.is_empty() {
            return Err(AttentionError::EmptyInput("Empty weights or values".into()));
        }
let dim = values[0].len();
let mut output = vec![0.0f32; dim];
for (weight, value) in weights.iter().zip(values.iter()) {
for (o, &v) in output.iter_mut().zip(value.iter()) {
*o += weight * v;
}
}
Ok(output)
}
}
impl Attention for MixedCurvatureFusedAttention {
fn compute(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
) -> AttentionResult<Vec<f32>> {
let cache = self.build_cache(keys);
self.compute_with_cache(query, keys, values, &cache, 0)
}
fn compute_with_mask(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
mask: Option<&[bool]>,
) -> AttentionResult<Vec<f32>> {
if let Some(m) = mask {
let filtered: Vec<(&[f32], &[f32])> = keys
.iter()
.zip(values.iter())
.enumerate()
.filter(|(i, _)| m.get(*i).copied().unwrap_or(true))
.map(|(_, (k, v))| (*k, *v))
.collect();
let filtered_keys: Vec<&[f32]> = filtered.iter().map(|(k, _)| *k).collect();
let filtered_values: Vec<&[f32]> = filtered.iter().map(|(_, v)| *v).collect();
self.compute(query, &filtered_keys, &filtered_values)
} else {
self.compute(query, keys, values)
}
}
fn dim(&self) -> usize {
self.config.dim
}
fn num_heads(&self) -> usize {
self.config.num_heads
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_fused_attention_config() {
let config = FusedCurvatureConfig {
dim: 64,
euclidean_dim: 32,
hyperbolic_dim: 24,
spherical_dim: 8,
..Default::default()
};
assert!(config.validate().is_ok());
}
#[test]
fn test_fused_attention() {
let config = FusedCurvatureConfig {
dim: 64,
euclidean_dim: 32,
hyperbolic_dim: 24,
spherical_dim: 8,
..Default::default()
};
let attention = MixedCurvatureFusedAttention::new(config);
let query = vec![0.5f32; 64];
let keys: Vec<Vec<f32>> = (0..20).map(|i| vec![0.1 + i as f32 * 0.02; 64]).collect();
let values: Vec<Vec<f32>> = (0..20).map(|i| vec![i as f32; 64]).collect();
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
let output = attention.compute(&query, &keys_refs, &values_refs).unwrap();
assert_eq!(output.len(), 64);
}
#[test]
fn test_cache_reuse() {
let attention = MixedCurvatureFusedAttention::with_dim(32);
let keys: Vec<Vec<f32>> = (0..10).map(|i| vec![0.1 * i as f32; 32]).collect();
let values: Vec<Vec<f32>> = (0..10).map(|i| vec![i as f32; 32]).collect();
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
let cache = attention.build_cache(&keys_refs);
// Multiple queries with same cache
for h in 0..4 {
let query = vec![0.5f32; 32];
let output = attention
.compute_with_cache(&query, &keys_refs, &values_refs, &cache, h)
.unwrap();
assert_eq!(output.len(), 32);
}
}
}
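The fused logit from the module header, logit(q,k) = a·dot(q_E,k_E) + b·dot(q_H_tan,k_H_tan) + c·dot(q_S,k_S), can be sketched standalone. This assumes the hyperbolic and spherical components have already been tangent-mapped and normalized; `fused_logit` and `softmax` here are illustrative helpers, not the crate's API:

```rust
fn dot(a: &[f32], b: &[f32]) -> f32 {
    a.iter().zip(b).map(|(x, y)| x * y).sum()
}

/// Fused logit over pre-mapped component slices; `e` and `h` are the
/// Euclidean and hyperbolic dims, `w` the mixing weights [a, b, c].
fn fused_logit(q: &[f32], k: &[f32], e: usize, h: usize, w: [f32; 3]) -> f32 {
    w[0] * dot(&q[..e], &k[..e])
        + w[1] * dot(&q[e..e + h], &k[e..e + h])
        + w[2] * dot(&q[e + h..], &k[e + h..])
}

/// Numerically stable softmax (subtract the max before exponentiating).
fn softmax(logits: &[f32]) -> Vec<f32> {
    let m = logits.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
    let exps: Vec<f32> = logits.iter().map(|&l| (l - m).exp()).collect();
    let s: f32 = exps.iter().sum();
    exps.iter().map(|&x| x / s).collect()
}

fn main() {
    // 4 Euclidean + 2 hyperbolic-tangent + 2 spherical dims
    let q = vec![1.0f32; 8];
    let keys = [vec![1.0f32; 8], vec![-1.0f32; 8]];
    let logits: Vec<f32> = keys
        .iter()
        .map(|k| fused_logit(&q, k, 4, 2, [0.5, 0.3, 0.2]))
        .collect();
    let w = softmax(&logits);
    assert!(w[0] > w[1]); // the aligned key dominates
    assert!((w.iter().sum::<f32>() - 1.0).abs() < 1e-6);
}
```

The crate's kernel does the same arithmetic but reads the tangent-mapped and normalized key components out of `MixedCurvatureCache` so the expensive maps run once per window, not once per query.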


@@ -0,0 +1,28 @@
//! Mixed Curvature Attention
//!
//! Attention in product spaces: E^e × H^h × S^s
//!
//! ## Key Optimizations
//!
//! 1. **Tangent Space Mapping**: Map hyperbolic to tangent space at origin
//! 2. **Fused Dot Kernel**: Single vectorized loop for all three similarities
//! 3. **Per-Head Mixing**: Low-rank learned weights per head
//! 4. **Quantization-Friendly**: Different precision for each component
mod component_quantizer;
mod fused_attention;
mod tangent_space;
pub use component_quantizer::{ComponentQuantizer, QuantizationConfig, QuantizedVector};
pub use fused_attention::{
FusedCurvatureConfig, MixedCurvatureCache, MixedCurvatureFusedAttention,
};
pub use tangent_space::{TangentSpaceConfig, TangentSpaceMapper};
#[cfg(test)]
mod tests {
#[test]
fn test_module_exists() {
assert!(true);
}
}


@@ -0,0 +1,246 @@
//! Tangent Space Mapping for Fast Hyperbolic Operations
//!
//! Instead of computing full geodesic distances in hyperbolic space,
//! we map points to the tangent space at a learned origin and use
//! plain dot products. This is typically 10-100x faster and approximately
//! preserves hierarchical structure near the origin.
use serde::{Deserialize, Serialize};
/// Configuration for tangent space mapping
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TangentSpaceConfig {
/// Dimension of hyperbolic component
pub hyperbolic_dim: usize,
/// Curvature (negative, e.g., -1.0)
pub curvature: f32,
/// Whether to learn the origin
pub learnable_origin: bool,
}
impl Default for TangentSpaceConfig {
fn default() -> Self {
Self {
hyperbolic_dim: 32,
curvature: -1.0,
learnable_origin: true,
}
}
}
/// Tangent space mapper for hyperbolic geometry
///
/// Maps points from Poincaré ball to tangent space at origin,
/// enabling fast dot-product similarity instead of geodesic distance.
#[derive(Debug, Clone)]
pub struct TangentSpaceMapper {
config: TangentSpaceConfig,
/// Origin point in Poincaré ball
origin: Vec<f32>,
/// Conformal factor at origin
lambda_origin: f32,
}
impl TangentSpaceMapper {
/// Create new mapper with config
pub fn new(config: TangentSpaceConfig) -> Self {
let origin = vec![0.0f32; config.hyperbolic_dim];
let c = -config.curvature;
let origin_norm_sq: f32 = origin.iter().map(|x| x * x).sum();
let lambda_origin = 2.0 / (1.0 - c * origin_norm_sq).max(1e-8);
Self {
config,
origin,
lambda_origin,
}
}
/// Set custom origin (for learned origins)
pub fn set_origin(&mut self, origin: Vec<f32>) {
let c = -self.config.curvature;
let origin_norm_sq: f32 = origin.iter().map(|x| x * x).sum();
self.lambda_origin = 2.0 / (1.0 - c * origin_norm_sq).max(1e-8);
self.origin = origin;
}
/// Map point from Poincaré ball to tangent space at origin
///
    /// log_o(x) = (2 / (√c λ_o)) * arctanh(√c ||(-o) ⊕ x||) * ((-o) ⊕ x) / ||(-o) ⊕ x||
    ///
    /// For the origin at 0 (where λ_0 = 2), this simplifies to:
    /// log_0(x) = arctanh(√c ||x||) * x / (√c ||x||)
#[inline]
pub fn log_map(&self, point: &[f32]) -> Vec<f32> {
let c = -self.config.curvature;
let sqrt_c = c.sqrt();
// For origin at 0, Möbius addition o ⊕ x = x
if self.origin.iter().all(|&x| x.abs() < 1e-8) {
return self.log_map_at_origin(point, sqrt_c);
}
// General case: compute -origin ⊕ point
let neg_origin: Vec<f32> = self.origin.iter().map(|x| -x).collect();
let diff = self.mobius_add(&neg_origin, point, c);
let diff_norm: f32 = diff.iter().map(|x| x * x).sum::<f32>().sqrt();
if diff_norm < 1e-8 {
return vec![0.0f32; point.len()];
}
        // Clamp to keep atanh finite for points numerically at the boundary
        let arg = (sqrt_c * diff_norm).min(0.99);
        let scale = (2.0 / self.lambda_origin) * arg.atanh() / (sqrt_c * diff_norm);
        diff.iter().map(|&d| scale * d).collect()
}
/// Fast log map at origin (most common case)
#[inline]
fn log_map_at_origin(&self, point: &[f32], sqrt_c: f32) -> Vec<f32> {
let norm: f32 = point.iter().map(|x| x * x).sum::<f32>().sqrt();
if norm < 1e-8 {
return vec![0.0f32; point.len()];
}
        // Clamp to avoid infinity at the ball boundary
        let arg = (sqrt_c * norm).min(0.99);
        // (2 / λ_0) = 1 since λ_0 = 2, matching the general path above
        let scale = arg.atanh() / (sqrt_c * norm);
point.iter().map(|&p| scale * p).collect()
}
/// Möbius addition in Poincaré ball
fn mobius_add(&self, x: &[f32], y: &[f32], c: f32) -> Vec<f32> {
let x_norm_sq: f32 = x.iter().map(|xi| xi * xi).sum();
let y_norm_sq: f32 = y.iter().map(|yi| yi * yi).sum();
let xy_dot: f32 = x.iter().zip(y.iter()).map(|(&xi, &yi)| xi * yi).sum();
let num_coef = 1.0 + 2.0 * c * xy_dot + c * y_norm_sq;
let denom = 1.0 + 2.0 * c * xy_dot + c * c * x_norm_sq * y_norm_sq;
if denom.abs() < 1e-8 {
return x.to_vec();
}
let y_coef = 1.0 - c * x_norm_sq;
x.iter()
.zip(y.iter())
.map(|(&xi, &yi)| (num_coef * xi + y_coef * yi) / denom)
.collect()
}
/// Compute tangent space similarity (dot product in tangent space)
///
/// This approximates hyperbolic distance but is much faster.
#[inline]
pub fn tangent_similarity(&self, a: &[f32], b: &[f32]) -> f32 {
// Map both to tangent space
let ta = self.log_map(a);
let tb = self.log_map(b);
// Dot product
ta.iter().zip(tb.iter()).map(|(&ai, &bi)| ai * bi).sum()
}
/// Batch map points to tangent space (cache for window)
pub fn batch_log_map(&self, points: &[&[f32]]) -> Vec<Vec<f32>> {
points.iter().map(|p| self.log_map(p)).collect()
}
/// Compute similarities in tangent space (all pairwise with query)
pub fn batch_tangent_similarity(
&self,
query_tangent: &[f32],
keys_tangent: &[&[f32]],
) -> Vec<f32> {
keys_tangent
.iter()
.map(|k| Self::dot_product_simd(query_tangent, k))
.collect()
}
/// SIMD-friendly dot product
#[inline(always)]
fn dot_product_simd(a: &[f32], b: &[f32]) -> f32 {
let len = a.len().min(b.len());
let chunks = len / 4;
let remainder = len % 4;
let mut sum0 = 0.0f32;
let mut sum1 = 0.0f32;
let mut sum2 = 0.0f32;
let mut sum3 = 0.0f32;
for i in 0..chunks {
let base = i * 4;
sum0 += a[base] * b[base];
sum1 += a[base + 1] * b[base + 1];
sum2 += a[base + 2] * b[base + 2];
sum3 += a[base + 3] * b[base + 3];
}
let base = chunks * 4;
for i in 0..remainder {
sum0 += a[base + i] * b[base + i];
}
sum0 + sum1 + sum2 + sum3
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_log_map_at_origin() {
let config = TangentSpaceConfig {
hyperbolic_dim: 4,
curvature: -1.0,
learnable_origin: false,
};
let mapper = TangentSpaceMapper::new(config);
// Point at origin maps to zero
let origin = vec![0.0f32; 4];
let result = mapper.log_map(&origin);
assert!(result.iter().all(|&x| x.abs() < 1e-6));
// Non-zero point
let point = vec![0.1, 0.2, 0.0, 0.0];
let tangent = mapper.log_map(&point);
assert_eq!(tangent.len(), 4);
}
#[test]
fn test_tangent_similarity() {
let config = TangentSpaceConfig {
hyperbolic_dim: 4,
curvature: -1.0,
learnable_origin: false,
};
let mapper = TangentSpaceMapper::new(config);
let a = vec![0.1, 0.1, 0.0, 0.0];
let b = vec![0.1, 0.1, 0.0, 0.0];
// Same points should have high similarity
let sim = mapper.tangent_similarity(&a, &b);
assert!(sim > 0.0);
}
#[test]
fn test_batch_operations() {
let config = TangentSpaceConfig::default();
let mapper = TangentSpaceMapper::new(config);
let points: Vec<Vec<f32>> = (0..10).map(|i| vec![i as f32 * 0.05; 32]).collect();
let points_refs: Vec<&[f32]> = points.iter().map(|p| p.as_slice()).collect();
let tangents = mapper.batch_log_map(&points_refs);
assert_eq!(tangents.len(), 10);
}
}
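A self-contained sketch of the origin log map, using the standard closed form log_0(x) = arctanh(√c ||x||) * x / (√c ||x||); `log_map_origin` is an illustrative stand-in for the mapper's fast path, not the crate's API:

```rust
/// Log map at the origin of a Poincaré ball with curvature -c.
fn log_map_origin(x: &[f32], c: f32) -> Vec<f32> {
    let norm = x.iter().map(|v| v * v).sum::<f32>().sqrt();
    if norm < 1e-8 {
        return vec![0.0; x.len()];
    }
    let sqrt_c = c.sqrt();
    let arg = (sqrt_c * norm).min(0.99); // clamp: atanh diverges at 1
    let scale = arg.atanh() / (sqrt_c * norm);
    x.iter().map(|&v| scale * v).collect()
}

fn main() {
    let near_origin = log_map_origin(&[0.1, 0.0], 1.0);
    let near_boundary = log_map_origin(&[0.9, 0.0], 1.0);
    // The map stretches points super-linearly toward the boundary,
    // which is how tangent-space dot products mimic geodesic distances.
    assert!(near_boundary[0] / near_origin[0] > 9.0);
}
```

This stretching is what lets leaf-like points deep in the hierarchy (near the boundary) stay well separated after the map, so a plain dot product remains a useful similarity.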


@@ -0,0 +1,91 @@
//! Error types for the ruvector-attention crate.
//!
//! This module defines all error types that can occur during attention computation,
//! configuration, and training operations.
use thiserror::Error;
/// Errors that can occur during attention operations.
#[derive(Error, Debug, Clone)]
pub enum AttentionError {
/// Dimension mismatch between query, key, or value tensors.
#[error("Dimension mismatch: expected {expected}, got {actual}")]
DimensionMismatch {
/// Expected dimension size
expected: usize,
/// Actual dimension size
actual: usize,
},
/// Invalid configuration parameter.
#[error("Invalid configuration: {0}")]
InvalidConfig(String),
/// Error during attention computation.
#[error("Computation error: {0}")]
ComputationError(String),
/// Memory allocation failure.
#[error("Memory allocation failed: {0}")]
MemoryError(String),
/// Invalid head configuration for multi-head attention.
#[error("Invalid head count: dimension {dim} not divisible by {num_heads} heads")]
InvalidHeadCount {
/// Model dimension
dim: usize,
/// Number of attention heads
num_heads: usize,
},
/// Empty input provided.
#[error("Empty input: {0}")]
EmptyInput(String),
/// Invalid edge configuration for graph attention.
#[error("Invalid edge configuration: {0}")]
InvalidEdges(String),
/// Numerical instability detected.
#[error("Numerical instability: {0}")]
NumericalInstability(String),
/// Invalid mask dimensions.
#[error("Invalid mask dimensions: expected {expected}, got {actual}")]
InvalidMask {
/// Expected mask dimensions
expected: String,
/// Actual mask dimensions
actual: String,
},
}
/// Result type for attention operations.
pub type AttentionResult<T> = Result<T, AttentionError>;
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_error_display() {
let err = AttentionError::DimensionMismatch {
expected: 512,
actual: 256,
};
assert_eq!(err.to_string(), "Dimension mismatch: expected 512, got 256");
let err = AttentionError::InvalidConfig("dropout must be in [0, 1]".to_string());
assert_eq!(
err.to_string(),
"Invalid configuration: dropout must be in [0, 1]"
);
}
#[test]
fn test_error_clone() {
let err = AttentionError::ComputationError("test".to_string());
let cloned = err.clone();
assert_eq!(err.to_string(), cloned.to_string());
}
}


@@ -0,0 +1,412 @@
//! Dual-space attention combining Euclidean and Hyperbolic geometries
//!
//! This module implements attention that operates in both Euclidean and hyperbolic
//! spaces, combining their complementary properties:
//! - Euclidean: Good for flat, local structure
//! - Hyperbolic: Good for hierarchical, tree-like structure
use crate::error::{AttentionError, AttentionResult};
use crate::hyperbolic::project_to_ball;
use crate::traits::Attention;
use crate::utils::stable_softmax;
/// Compute Poincaré distance between two points
fn poincare_dist(u: &[f32], v: &[f32], curvature: f32) -> f32 {
let c = curvature.abs();
let sqrt_c = c.sqrt();
let diff_sq: f32 = u.iter().zip(v.iter()).map(|(a, b)| (a - b).powi(2)).sum();
let norm_u_sq: f32 = u.iter().map(|x| x * x).sum();
let norm_v_sq: f32 = v.iter().map(|x| x * x).sum();
let denom = (1.0 - c * norm_u_sq).max(1e-7) * (1.0 - c * norm_v_sq).max(1e-7);
let arg = 1.0 + 2.0 * c * diff_sq / denom;
(1.0 / sqrt_c) * arg.max(1.0).acosh()
}
/// Configuration for dual-space attention
#[derive(Clone, Debug)]
pub struct DualSpaceConfig {
pub dim: usize,
pub curvature: f32,
pub euclidean_weight: f32,
pub hyperbolic_weight: f32,
pub learn_weights: bool,
pub temperature: f32,
}
impl Default for DualSpaceConfig {
fn default() -> Self {
Self {
dim: 256,
curvature: 1.0,
euclidean_weight: 0.5,
hyperbolic_weight: 0.5,
learn_weights: false,
temperature: 1.0,
}
}
}
impl DualSpaceConfig {
pub fn builder() -> DualSpaceConfigBuilder {
DualSpaceConfigBuilder::default()
}
}
#[derive(Default)]
pub struct DualSpaceConfigBuilder {
config: DualSpaceConfig,
}
impl DualSpaceConfigBuilder {
pub fn dim(mut self, d: usize) -> Self {
self.config.dim = d;
self
}
pub fn curvature(mut self, c: f32) -> Self {
self.config.curvature = c;
self
}
pub fn euclidean_weight(mut self, w: f32) -> Self {
self.config.euclidean_weight = w;
self
}
pub fn hyperbolic_weight(mut self, w: f32) -> Self {
self.config.hyperbolic_weight = w;
self
}
pub fn temperature(mut self, t: f32) -> Self {
self.config.temperature = t;
self
}
pub fn build(self) -> DualSpaceConfig {
self.config
}
}
/// Dual-space attention layer
pub struct DualSpaceAttention {
config: DualSpaceConfig,
scale: f32,
/// Linear projection for Euclidean space
w_euclidean: Vec<f32>,
/// Linear projection for hyperbolic space
w_hyperbolic: Vec<f32>,
/// Output projection
w_out: Vec<f32>,
}
impl DualSpaceAttention {
pub fn new(config: DualSpaceConfig) -> Self {
let dim = config.dim;
let scale = 1.0 / (dim as f32).sqrt();
// Xavier initialization
let w_scale = (2.0 / (dim + dim) as f32).sqrt();
let mut seed = 42u64;
let mut rand = || {
seed = seed.wrapping_mul(6364136223846793005).wrapping_add(1);
((seed as f32) / (u64::MAX as f32) - 0.5) * 2.0 * w_scale
};
let w_euclidean: Vec<f32> = (0..dim * dim).map(|_| rand()).collect();
let w_hyperbolic: Vec<f32> = (0..dim * dim).map(|_| rand()).collect();
let w_out: Vec<f32> = (0..dim * dim).map(|_| rand()).collect();
Self {
config,
scale,
w_euclidean,
w_hyperbolic,
w_out,
}
}
/// Project to Euclidean representation
fn to_euclidean(&self, x: &[f32]) -> Vec<f32> {
let dim = self.config.dim;
(0..dim)
.map(|i| {
x.iter()
.enumerate()
.map(|(j, &xj)| xj * self.w_euclidean[i * dim + j])
.sum()
})
.collect()
}
/// Project to hyperbolic representation (Poincaré ball)
fn to_hyperbolic(&self, x: &[f32]) -> Vec<f32> {
let dim = self.config.dim;
let projected: Vec<f32> = (0..dim)
.map(|i| {
x.iter()
.enumerate()
.map(|(j, &xj)| xj * self.w_hyperbolic[i * dim + j])
.sum()
})
.collect();
// Project to ball with curvature
project_to_ball(&projected, self.config.curvature, 1e-5)
}
/// Compute Euclidean similarity (dot product)
fn euclidean_similarity(&self, q: &[f32], k: &[f32]) -> f32 {
q.iter().zip(k.iter()).map(|(a, b)| a * b).sum::<f32>() * self.scale
}
/// Compute hyperbolic similarity (negative Poincaré distance)
fn hyperbolic_similarity(&self, q: &[f32], k: &[f32]) -> f32 {
-poincare_dist(q, k, self.config.curvature)
}
/// Output projection
fn project_output(&self, x: &[f32]) -> Vec<f32> {
let dim = self.config.dim;
(0..dim)
.map(|i| {
x.iter()
.enumerate()
.map(|(j, &xj)| xj * self.w_out[i * dim + j])
.sum()
})
.collect()
}
/// Get the contribution weights for analysis
pub fn get_space_contributions(&self, query: &[f32], keys: &[&[f32]]) -> (Vec<f32>, Vec<f32>) {
let q_euc = self.to_euclidean(query);
let q_hyp = self.to_hyperbolic(query);
let euc_scores: Vec<f32> = keys
.iter()
.map(|k| {
let k_euc = self.to_euclidean(k);
self.euclidean_similarity(&q_euc, &k_euc)
})
.collect();
let hyp_scores: Vec<f32> = keys
.iter()
.map(|k| {
let k_hyp = self.to_hyperbolic(k);
self.hyperbolic_similarity(&q_hyp, &k_hyp)
})
.collect();
(euc_scores, hyp_scores)
}
}
impl Attention for DualSpaceAttention {
fn compute(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
) -> AttentionResult<Vec<f32>> {
        if keys.is_empty() {
            return Err(AttentionError::EmptyInput("Empty keys".to_string()));
        }
if query.len() != self.config.dim {
return Err(AttentionError::DimensionMismatch {
expected: self.config.dim,
actual: query.len(),
});
}
        let n = keys.len();
        if values.len() != n {
            return Err(AttentionError::DimensionMismatch {
                expected: n,
                actual: values.len(),
            });
        }
        let value_dim = values[0].len();
let temp = self.config.temperature;
// Project query to both spaces
let q_euc = self.to_euclidean(query);
let q_hyp = self.to_hyperbolic(query);
// Compute combined scores
let mut combined_scores = Vec::with_capacity(n);
for key in keys.iter() {
let k_euc = self.to_euclidean(key);
let k_hyp = self.to_hyperbolic(key);
let euc_score = self.euclidean_similarity(&q_euc, &k_euc);
let hyp_score = self.hyperbolic_similarity(&q_hyp, &k_hyp);
// Weighted combination
let combined = (self.config.euclidean_weight * euc_score
+ self.config.hyperbolic_weight * hyp_score)
/ temp;
combined_scores.push(combined);
}
// Softmax over combined scores
let weights = stable_softmax(&combined_scores);
// Weighted sum of values
let mut output = vec![0.0f32; value_dim];
for (w, v) in weights.iter().zip(values.iter()) {
for (o, &vi) in output.iter_mut().zip(v.iter()) {
*o += w * vi;
}
}
// Output projection
if value_dim == self.config.dim {
Ok(self.project_output(&output))
} else {
Ok(output)
}
}
fn compute_with_mask(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
mask: Option<&[bool]>,
) -> AttentionResult<Vec<f32>> {
if let Some(m) = mask {
let filtered: Vec<(usize, bool)> = m
.iter()
.copied()
.enumerate()
.filter(|(_, keep)| *keep)
.collect();
let filtered_keys: Vec<&[f32]> = filtered.iter().map(|(i, _)| keys[*i]).collect();
let filtered_values: Vec<&[f32]> = filtered.iter().map(|(i, _)| values[*i]).collect();
self.compute(query, &filtered_keys, &filtered_values)
} else {
self.compute(query, keys, values)
}
}
fn dim(&self) -> usize {
self.config.dim
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_dual_space_basic() {
let config = DualSpaceConfig::builder()
.dim(64)
.curvature(1.0)
.euclidean_weight(0.5)
.hyperbolic_weight(0.5)
.build();
let attn = DualSpaceAttention::new(config);
let query = vec![0.1; 64];
let keys: Vec<Vec<f32>> = (0..10).map(|_| vec![0.1; 64]).collect();
let values: Vec<Vec<f32>> = (0..10).map(|_| vec![1.0; 64]).collect();
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
let result = attn.compute(&query, &keys_refs, &values_refs).unwrap();
assert_eq!(result.len(), 64);
}
#[test]
fn test_euclidean_dominant() {
let config = DualSpaceConfig::builder()
.dim(32)
.euclidean_weight(1.0)
.hyperbolic_weight(0.0)
.build();
let attn = DualSpaceAttention::new(config);
let query = vec![0.5; 32];
let keys: Vec<Vec<f32>> = vec![vec![0.3; 32]; 5];
let values: Vec<Vec<f32>> = vec![vec![1.0; 32]; 5];
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
let result = attn.compute(&query, &keys_refs, &values_refs).unwrap();
assert_eq!(result.len(), 32);
}
#[test]
fn test_hyperbolic_dominant() {
let config = DualSpaceConfig::builder()
.dim(32)
.curvature(0.5)
.euclidean_weight(0.0)
.hyperbolic_weight(1.0)
.build();
let attn = DualSpaceAttention::new(config);
let query = vec![0.1; 32]; // Small values for Poincaré ball
let keys: Vec<Vec<f32>> = vec![vec![0.1; 32]; 5];
let values: Vec<Vec<f32>> = vec![vec![1.0; 32]; 5];
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
let result = attn.compute(&query, &keys_refs, &values_refs).unwrap();
assert_eq!(result.len(), 32);
}
#[test]
fn test_space_contributions() {
let config = DualSpaceConfig::builder()
.dim(16)
.euclidean_weight(0.5)
.hyperbolic_weight(0.5)
.build();
let attn = DualSpaceAttention::new(config);
let query = vec![0.2; 16];
let keys: Vec<Vec<f32>> = vec![vec![0.2; 16]; 3];
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let (euc_scores, hyp_scores) = attn.get_space_contributions(&query, &keys_refs);
assert_eq!(euc_scores.len(), 3);
assert_eq!(hyp_scores.len(), 3);
}
#[test]
fn test_temperature_scaling() {
let config_low_temp = DualSpaceConfig::builder().dim(16).temperature(0.5).build();
let config_high_temp = DualSpaceConfig::builder().dim(16).temperature(2.0).build();
let attn_low = DualSpaceAttention::new(config_low_temp);
let attn_high = DualSpaceAttention::new(config_high_temp);
let query = vec![0.5; 16];
let keys: Vec<Vec<f32>> = vec![vec![0.8; 16], vec![0.2; 16]];
let values: Vec<Vec<f32>> = vec![vec![1.0; 16], vec![0.0; 16]];
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
let result_low = attn_low.compute(&query, &keys_refs, &values_refs).unwrap();
let result_high = attn_high.compute(&query, &keys_refs, &values_refs).unwrap();
// Low temperature should be more peaked (closer to [1,0,0...])
// High temperature should be more uniform
// We just verify both compute successfully
assert_eq!(result_low.len(), 16);
assert_eq!(result_high.len(), 16);
}
}
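The `poincare_dist` helper above has a closed-form sanity check: for curvature c = 1, the distance from the origin reduces to d(0, v) = 2 · atanh(||v||). A standalone sketch (mirroring the file's formula, not importing it) verifying that identity and the boundary blow-up:

```rust
/// Poincaré distance:
/// d(u,v) = (1/√c) * acosh(1 + 2c||u-v||² / ((1-c||u||²)(1-c||v||²)))
fn poincare_dist(u: &[f32], v: &[f32], c: f32) -> f32 {
    let sqrt_c = c.sqrt();
    let diff_sq: f32 = u.iter().zip(v).map(|(a, b)| (a - b) * (a - b)).sum();
    let nu: f32 = u.iter().map(|x| x * x).sum();
    let nv: f32 = v.iter().map(|x| x * x).sum();
    let denom = (1.0 - c * nu).max(1e-7) * (1.0 - c * nv).max(1e-7);
    (1.0 / sqrt_c) * (1.0 + 2.0 * c * diff_sq / denom).max(1.0).acosh()
}

fn main() {
    // Identity check: d(0, v) = 2 * atanh(||v||) at c = 1.
    let d = poincare_dist(&[0.0, 0.0], &[0.5, 0.0], 1.0);
    assert!((d - 2.0 * 0.5f32.atanh()).abs() < 1e-4);
    // Same Euclidean gap, but hyperbolic distance grows near the boundary:
    let inner = poincare_dist(&[0.0, 0.0], &[0.1, 0.0], 1.0);
    let outer = poincare_dist(&[0.8, 0.0], &[0.9, 0.0], 1.0);
    assert!(outer > inner);
}
```

That boundary blow-up is why the hyperbolic term rewards hierarchical (tree-like) neighbors, complementing the flat Euclidean dot product in the weighted combination.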


@@ -0,0 +1,394 @@
//! Edge-featured graph attention (GATv2 style)
//!
//! Extends standard graph attention with edge feature integration.
use crate::error::{AttentionError, AttentionResult};
use crate::traits::Attention;
use crate::utils::stable_softmax;
/// Configuration for edge-featured attention
#[derive(Clone, Debug)]
pub struct EdgeFeaturedConfig {
pub node_dim: usize,
pub edge_dim: usize,
pub num_heads: usize,
pub dropout: f32,
pub concat_heads: bool,
pub add_self_loops: bool,
pub negative_slope: f32, // LeakyReLU slope
}
impl Default for EdgeFeaturedConfig {
fn default() -> Self {
Self {
node_dim: 256,
edge_dim: 64,
num_heads: 4,
dropout: 0.0,
concat_heads: true,
add_self_loops: true,
negative_slope: 0.2,
}
}
}
impl EdgeFeaturedConfig {
pub fn builder() -> EdgeFeaturedConfigBuilder {
EdgeFeaturedConfigBuilder::default()
}
pub fn head_dim(&self) -> usize {
self.node_dim / self.num_heads
}
}
#[derive(Default)]
pub struct EdgeFeaturedConfigBuilder {
config: EdgeFeaturedConfig,
}
impl EdgeFeaturedConfigBuilder {
pub fn node_dim(mut self, d: usize) -> Self {
self.config.node_dim = d;
self
}
pub fn edge_dim(mut self, d: usize) -> Self {
self.config.edge_dim = d;
self
}
pub fn num_heads(mut self, n: usize) -> Self {
self.config.num_heads = n;
self
}
pub fn dropout(mut self, d: f32) -> Self {
self.config.dropout = d;
self
}
pub fn concat_heads(mut self, c: bool) -> Self {
self.config.concat_heads = c;
self
}
pub fn negative_slope(mut self, s: f32) -> Self {
self.config.negative_slope = s;
self
}
pub fn build(self) -> EdgeFeaturedConfig {
self.config
}
}
/// Edge-featured graph attention layer
pub struct EdgeFeaturedAttention {
config: EdgeFeaturedConfig,
// Weight matrices (would be learnable in training)
w_node: Vec<f32>, // [num_heads, head_dim, node_dim]
w_edge: Vec<f32>, // [num_heads, head_dim, edge_dim]
a_src: Vec<f32>, // [num_heads, head_dim]
a_dst: Vec<f32>, // [num_heads, head_dim]
a_edge: Vec<f32>, // [num_heads, head_dim]
}
impl EdgeFeaturedAttention {
pub fn new(config: EdgeFeaturedConfig) -> Self {
let head_dim = config.head_dim();
let num_heads = config.num_heads;
// Xavier initialization
let node_scale = (2.0 / (config.node_dim + head_dim) as f32).sqrt();
let edge_scale = (2.0 / (config.edge_dim + head_dim) as f32).sqrt();
let attn_scale = (1.0 / head_dim as f32).sqrt();
let mut seed = 42u64;
let mut rand = || {
seed = seed.wrapping_mul(6364136223846793005).wrapping_add(1);
(seed as f32) / (u64::MAX as f32) - 0.5
};
let w_node: Vec<f32> = (0..num_heads * head_dim * config.node_dim)
.map(|_| rand() * 2.0 * node_scale)
.collect();
let w_edge: Vec<f32> = (0..num_heads * head_dim * config.edge_dim)
.map(|_| rand() * 2.0 * edge_scale)
.collect();
let a_src: Vec<f32> = (0..num_heads * head_dim)
.map(|_| rand() * 2.0 * attn_scale)
.collect();
let a_dst: Vec<f32> = (0..num_heads * head_dim)
.map(|_| rand() * 2.0 * attn_scale)
.collect();
let a_edge: Vec<f32> = (0..num_heads * head_dim)
.map(|_| rand() * 2.0 * attn_scale)
.collect();
Self {
config,
w_node,
w_edge,
a_src,
a_dst,
a_edge,
}
}
/// Transform node features for a specific head
fn transform_node(&self, node: &[f32], head: usize) -> Vec<f32> {
let head_dim = self.config.head_dim();
let node_dim = self.config.node_dim;
(0..head_dim)
.map(|i| {
node.iter()
.enumerate()
.map(|(j, &nj)| nj * self.w_node[head * head_dim * node_dim + i * node_dim + j])
.sum()
})
.collect()
}
/// Transform edge features for a specific head
fn transform_edge(&self, edge: &[f32], head: usize) -> Vec<f32> {
let head_dim = self.config.head_dim();
let edge_dim = self.config.edge_dim;
(0..head_dim)
.map(|i| {
edge.iter()
.enumerate()
.map(|(j, &ej)| ej * self.w_edge[head * head_dim * edge_dim + i * edge_dim + j])
.sum()
})
.collect()
}
/// Compute attention coefficient with LeakyReLU
fn attention_coeff(&self, src: &[f32], dst: &[f32], edge: &[f32], head: usize) -> f32 {
let head_dim = self.config.head_dim();
let mut score = 0.0f32;
for i in 0..head_dim {
let offset = head * head_dim + i;
score += src[i] * self.a_src[offset];
score += dst[i] * self.a_dst[offset];
score += edge[i] * self.a_edge[offset];
}
// LeakyReLU
if score < 0.0 {
self.config.negative_slope * score
} else {
score
}
}
}
impl EdgeFeaturedAttention {
/// Compute attention with explicit edge features
pub fn compute_with_edges(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
edges: &[&[f32]],
) -> AttentionResult<Vec<f32>> {
if keys.len() != edges.len() {
return Err(AttentionError::InvalidConfig(
"Keys and edges must have same length".to_string(),
));
}
if keys.len() != values.len() {
return Err(AttentionError::InvalidConfig(
"Keys and values must have same length".to_string(),
));
}
let num_heads = self.config.num_heads;
let head_dim = self.config.head_dim();
let n = keys.len();
// Transform query once per head
let query_transformed: Vec<Vec<f32>> = (0..num_heads)
.map(|h| self.transform_node(query, h))
.collect();
// Compute per-head outputs
let mut head_outputs: Vec<Vec<f32>> = Vec::with_capacity(num_heads);
for h in 0..num_heads {
// Transform all keys and edges
let keys_t: Vec<Vec<f32>> = keys.iter().map(|k| self.transform_node(k, h)).collect();
let edges_t: Vec<Vec<f32>> = edges.iter().map(|e| self.transform_edge(e, h)).collect();
// Compute attention coefficients
let coeffs: Vec<f32> = (0..n)
.map(|i| self.attention_coeff(&query_transformed[h], &keys_t[i], &edges_t[i], h))
.collect();
// Softmax
let weights = stable_softmax(&coeffs);
// Weighted sum of values
let mut head_out = vec![0.0f32; head_dim];
for (i, &w) in weights.iter().enumerate() {
let value_t = self.transform_node(values[i], h);
for (j, &vj) in value_t.iter().enumerate() {
head_out[j] += w * vj;
}
}
head_outputs.push(head_out);
}
// Concatenate or average heads
if self.config.concat_heads {
Ok(head_outputs.into_iter().flatten().collect())
} else {
let mut output = vec![0.0f32; head_dim];
for head_out in &head_outputs {
for (i, &v) in head_out.iter().enumerate() {
output[i] += v / num_heads as f32;
}
}
Ok(output)
}
}
/// Get the edge feature dimension
pub fn edge_dim(&self) -> usize {
self.config.edge_dim
}
}
impl Attention for EdgeFeaturedAttention {
fn compute(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
) -> AttentionResult<Vec<f32>> {
if keys.is_empty() {
return Err(AttentionError::InvalidConfig("Empty keys".to_string()));
}
if query.len() != self.config.node_dim {
return Err(AttentionError::DimensionMismatch {
expected: self.config.node_dim,
actual: query.len(),
});
}
// Use zero edge features for basic attention
let zero_edge = vec![0.0f32; self.config.edge_dim];
let edges: Vec<&[f32]> = (0..keys.len()).map(|_| zero_edge.as_slice()).collect();
self.compute_with_edges(query, keys, values, &edges)
}
fn compute_with_mask(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
mask: Option<&[bool]>,
) -> AttentionResult<Vec<f32>> {
// Apply mask by filtering keys/values
if let Some(m) = mask {
let filtered: Vec<(usize, bool)> = m
.iter()
.copied()
.enumerate()
.filter(|(_, keep)| *keep)
.collect();
let filtered_keys: Vec<&[f32]> = filtered.iter().map(|(i, _)| keys[*i]).collect();
let filtered_values: Vec<&[f32]> = filtered.iter().map(|(i, _)| values[*i]).collect();
self.compute(query, &filtered_keys, &filtered_values)
} else {
self.compute(query, keys, values)
}
}
fn dim(&self) -> usize {
if self.config.concat_heads {
self.config.node_dim
} else {
self.config.head_dim()
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_edge_featured_attention() {
let config = EdgeFeaturedConfig::builder()
.node_dim(64)
.edge_dim(16)
.num_heads(4)
.build();
let attn = EdgeFeaturedAttention::new(config);
let query = vec![0.5; 64];
let keys: Vec<Vec<f32>> = (0..10).map(|_| vec![0.3; 64]).collect();
let values: Vec<Vec<f32>> = (0..10).map(|_| vec![1.0; 64]).collect();
let edges: Vec<Vec<f32>> = (0..10).map(|_| vec![0.2; 16]).collect();
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
let edges_refs: Vec<&[f32]> = edges.iter().map(|e| e.as_slice()).collect();
let result = attn
.compute_with_edges(&query, &keys_refs, &values_refs, &edges_refs)
.unwrap();
assert_eq!(result.len(), 64);
}
#[test]
fn test_without_edges() {
let config = EdgeFeaturedConfig::builder()
.node_dim(32)
.edge_dim(8)
.num_heads(2)
.build();
let attn = EdgeFeaturedAttention::new(config);
let query = vec![0.5; 32];
let keys: Vec<Vec<f32>> = vec![vec![0.3; 32]; 5];
let values: Vec<Vec<f32>> = vec![vec![1.0; 32]; 5];
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
let result = attn.compute(&query, &keys_refs, &values_refs).unwrap();
assert_eq!(result.len(), 32);
}
#[test]
fn test_leaky_relu() {
let config = EdgeFeaturedConfig::builder()
.node_dim(16)
.edge_dim(4)
.num_heads(1)
.negative_slope(0.2)
.build();
let attn = EdgeFeaturedAttention::new(config);
// Just verify it computes without error
let query = vec![-1.0; 16];
let keys: Vec<Vec<f32>> = vec![vec![-0.5; 16]; 3];
let values: Vec<Vec<f32>> = vec![vec![1.0; 16]; 3];
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
let result = attn.compute(&query, &keys_refs, &values_refs).unwrap();
assert_eq!(result.len(), 16);
}
}
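The additive scoring in `attention_coeff` is the classic GAT recipe: dot the transformed source, destination, and edge vectors with learned attention vectors, sum, then pass the result through LeakyReLU. A minimal standalone sketch (hypothetical helpers, mirroring the formula rather than the crate's API):

```rust
// GAT-style additive score: a_src·src + a_dst·dst + a_edge·edge, then LeakyReLU.
fn leaky_relu(x: f32, negative_slope: f32) -> f32 {
    if x < 0.0 { negative_slope * x } else { x }
}

fn coeff(
    src: &[f32], dst: &[f32], edge: &[f32],
    a_src: &[f32], a_dst: &[f32], a_edge: &[f32],
    negative_slope: f32,
) -> f32 {
    let dot = |x: &[f32], a: &[f32]| x.iter().zip(a).map(|(xi, ai)| xi * ai).sum::<f32>();
    leaky_relu(dot(src, a_src) + dot(dst, a_dst) + dot(edge, a_edge), negative_slope)
}

fn main() {
    let (a, slope) = (vec![1.0f32, 1.0], 0.2);
    // A positive pre-activation passes through unchanged ...
    let pos = coeff(&[1.0, 0.0], &[0.0, 1.0], &[0.0, 0.0], &a, &a, &a, slope);
    assert!((pos - 2.0).abs() < 1e-6);
    // ... while a negative one is scaled by the slope (0.2 here).
    let neg = coeff(&[-1.0, 0.0], &[0.0, -1.0], &[0.0, 0.0], &a, &a, &a, slope);
    assert!((neg + 0.4).abs() < 1e-6);
}
```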

View File

@@ -0,0 +1,14 @@
//! Graph attention mechanisms for GNN applications
//!
//! This module provides graph-specific attention implementations:
//! - Edge-featured attention (GAT with edge features)
//! - Rotary position embeddings for graphs (RoPE)
//! - Dual-space attention (Euclidean + Hyperbolic)
pub mod dual_space;
pub mod edge_featured;
pub mod rope;
pub use dual_space::{DualSpaceAttention, DualSpaceConfig};
pub use edge_featured::{EdgeFeaturedAttention, EdgeFeaturedConfig};
pub use rope::{GraphRoPE, RoPEConfig};

View File

@@ -0,0 +1,318 @@
//! Rotary Position Embeddings (RoPE) for Graph Attention
//!
//! Adapts RoPE for graph structures where positions are defined by graph topology
//! (e.g., hop distance, shortest path length, or learned positional encodings).
use crate::error::{AttentionError, AttentionResult};
use crate::traits::Attention;
use crate::utils::stable_softmax;
/// Configuration for Graph RoPE
#[derive(Clone, Debug)]
pub struct RoPEConfig {
pub dim: usize,
pub base: f32,
pub max_position: usize,
pub scaling_factor: f32,
}
impl Default for RoPEConfig {
fn default() -> Self {
Self {
dim: 256,
base: 10000.0,
max_position: 512,
scaling_factor: 1.0,
}
}
}
impl RoPEConfig {
pub fn builder() -> RoPEConfigBuilder {
RoPEConfigBuilder::default()
}
}
#[derive(Default)]
pub struct RoPEConfigBuilder {
config: RoPEConfig,
}
impl RoPEConfigBuilder {
pub fn dim(mut self, d: usize) -> Self {
self.config.dim = d;
self
}
pub fn base(mut self, b: f32) -> Self {
self.config.base = b;
self
}
pub fn max_position(mut self, m: usize) -> Self {
self.config.max_position = m;
self
}
pub fn scaling_factor(mut self, s: f32) -> Self {
self.config.scaling_factor = s;
self
}
pub fn build(self) -> RoPEConfig {
self.config
}
}
/// Graph attention with Rotary Position Embeddings
pub struct GraphRoPE {
config: RoPEConfig,
/// Precomputed cos/sin tables: [max_position, dim]
cos_cache: Vec<f32>,
sin_cache: Vec<f32>,
scale: f32,
}
impl GraphRoPE {
pub fn new(config: RoPEConfig) -> Self {
let dim = config.dim;
let max_pos = config.max_position;
let base = config.base;
let scaling = config.scaling_factor;
// Compute frequency bands
let half_dim = dim / 2;
let inv_freq: Vec<f32> = (0..half_dim)
.map(|i| 1.0 / (base.powf(2.0 * i as f32 / dim as f32)))
.collect();
// Precompute cos/sin for all positions
let mut cos_cache = Vec::with_capacity(max_pos * dim);
let mut sin_cache = Vec::with_capacity(max_pos * dim);
for pos in 0..max_pos {
let scaled_pos = pos as f32 / scaling;
for i in 0..half_dim {
let theta = scaled_pos * inv_freq[i];
cos_cache.push(theta.cos());
sin_cache.push(theta.sin());
}
// Duplicate the table for the second half (rotate-half layout: both halves share the same frequencies)
for i in 0..half_dim {
let theta = scaled_pos * inv_freq[i];
cos_cache.push(theta.cos());
sin_cache.push(theta.sin());
}
}
Self {
scale: 1.0 / (dim as f32).sqrt(),
config,
cos_cache,
sin_cache,
}
}
/// Apply rotary embedding to a vector at given position
pub fn apply_rotary(&self, x: &[f32], position: usize) -> Vec<f32> {
let dim = self.config.dim;
let half = dim / 2;
let pos = position.min(self.config.max_position - 1);
let offset = pos * dim;
let mut result = vec![0.0f32; dim];
// Rotate each pair (x[i], x[half + i]) by its frequency-scaled angle
for i in 0..half {
let cos = self.cos_cache[offset + i];
let sin = self.sin_cache[offset + i];
result[i] = x[i] * cos - x[half + i] * sin;
result[half + i] = x[i] * sin + x[half + i] * cos;
}
result
}
/// Compute attention with positional encoding based on graph distances
pub fn compute_with_positions(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
query_pos: usize,
key_positions: &[usize],
) -> AttentionResult<Vec<f32>> {
if keys.is_empty() {
return Err(AttentionError::InvalidConfig("Empty keys".to_string()));
}
if keys.len() != key_positions.len() {
return Err(AttentionError::InvalidConfig(
"Keys and positions must have same length".to_string(),
));
}
if query.len() != self.config.dim {
return Err(AttentionError::DimensionMismatch {
expected: self.config.dim,
actual: query.len(),
});
}
// Apply rotary to query
let q_rot = self.apply_rotary(query, query_pos);
// Compute attention scores with rotary keys
let scores: Vec<f32> = keys
.iter()
.zip(key_positions.iter())
.map(|(key, &pos)| {
let k_rot = self.apply_rotary(key, pos);
q_rot
.iter()
.zip(k_rot.iter())
.map(|(q, k)| q * k)
.sum::<f32>()
* self.scale
})
.collect();
// Softmax
let weights = stable_softmax(&scores);
// Weighted sum
let value_dim = values[0].len();
let mut output = vec![0.0f32; value_dim];
for (w, v) in weights.iter().zip(values.iter()) {
for (o, &vi) in output.iter_mut().zip(v.iter()) {
*o += w * vi;
}
}
Ok(output)
}
/// Get relative position for graph distance
/// Converts graph hop distance to position index
pub fn distance_to_position(distance: usize, max_distance: usize) -> usize {
// Bucketize distances logarithmically for larger graphs
if distance <= 8 {
distance
} else {
let log_dist = (distance as f32).log2().ceil() as usize;
8 + log_dist.min(max_distance - 8)
}
}
}
impl Attention for GraphRoPE {
fn compute(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
) -> AttentionResult<Vec<f32>> {
// Default: use sequential positions (0, 1, 2, ...)
let query_pos = 0;
let key_positions: Vec<usize> = (0..keys.len()).collect();
self.compute_with_positions(query, keys, values, query_pos, &key_positions)
}
fn compute_with_mask(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
mask: Option<&[bool]>,
) -> AttentionResult<Vec<f32>> {
if let Some(m) = mask {
let filtered: Vec<(usize, bool)> = m
.iter()
.copied()
.enumerate()
.filter(|(_, keep)| *keep)
.collect();
let filtered_keys: Vec<&[f32]> = filtered.iter().map(|(i, _)| keys[*i]).collect();
let filtered_values: Vec<&[f32]> = filtered.iter().map(|(i, _)| values[*i]).collect();
self.compute(query, &filtered_keys, &filtered_values)
} else {
self.compute(query, keys, values)
}
}
fn dim(&self) -> usize {
self.config.dim
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_rope_basic() {
let config = RoPEConfig::builder().dim(64).max_position(100).build();
let rope = GraphRoPE::new(config);
let query = vec![0.5; 64];
let keys: Vec<Vec<f32>> = (0..10).map(|_| vec![0.3; 64]).collect();
let values: Vec<Vec<f32>> = (0..10).map(|_| vec![1.0; 64]).collect();
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
let result = rope.compute(&query, &keys_refs, &values_refs).unwrap();
assert_eq!(result.len(), 64);
}
#[test]
fn test_rope_with_positions() {
let config = RoPEConfig::builder().dim(32).max_position(50).build();
let rope = GraphRoPE::new(config);
let query = vec![0.5; 32];
let keys: Vec<Vec<f32>> = vec![vec![0.3; 32]; 5];
let values: Vec<Vec<f32>> = vec![vec![1.0; 32]; 5];
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
// Graph distances as positions
let key_positions = vec![1, 2, 3, 2, 4];
let result = rope
.compute_with_positions(&query, &keys_refs, &values_refs, 0, &key_positions)
.unwrap();
assert_eq!(result.len(), 32);
}
#[test]
fn test_rotary_embedding() {
let config = RoPEConfig::builder().dim(16).max_position(10).build();
let rope = GraphRoPE::new(config);
let x = vec![1.0; 16];
// Rotary should preserve norm approximately
let rotated = rope.apply_rotary(&x, 5);
let norm_orig: f32 = x.iter().map(|v| v * v).sum::<f32>().sqrt();
let norm_rot: f32 = rotated.iter().map(|v| v * v).sum::<f32>().sqrt();
assert!((norm_orig - norm_rot).abs() < 1e-5);
}
#[test]
fn test_distance_to_position() {
// Direct mapping for small distances
assert_eq!(GraphRoPE::distance_to_position(0, 20), 0);
assert_eq!(GraphRoPE::distance_to_position(5, 20), 5);
assert_eq!(GraphRoPE::distance_to_position(8, 20), 8);
// Logarithmic for larger distances
let pos_16 = GraphRoPE::distance_to_position(16, 20);
let pos_32 = GraphRoPE::distance_to_position(32, 20);
assert!(pos_16 > 8);
assert!(pos_32 > pos_16);
}
}
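`apply_rotary` is a stack of 2D rotations: each pair `(x[i], x[half + i])` is rotated by an angle that grows with position and shrinks with frequency index. Since rotations are isometries, the norm is preserved, which is what `test_rotary_embedding` checks. A self-contained sketch of the same rotate-half scheme (hypothetical `rotate_half`, not the cached implementation above):

```rust
// Rotate-half RoPE: pair (x[i], x[half+i]) is rotated by
// theta_i = pos / base^(2i/dim). A 2D rotation preserves the vector norm.
fn rotate_half(x: &[f32], pos: f32, base: f32) -> Vec<f32> {
    let dim = x.len();
    let half = dim / 2;
    let mut out = vec![0.0f32; dim];
    for i in 0..half {
        let theta = pos / base.powf(2.0 * i as f32 / dim as f32);
        let (sin, cos) = theta.sin_cos();
        out[i] = x[i] * cos - x[half + i] * sin;
        out[half + i] = x[i] * sin + x[half + i] * cos;
    }
    out
}

fn norm(x: &[f32]) -> f32 {
    x.iter().map(|v| v * v).sum::<f32>().sqrt()
}

fn main() {
    let x = vec![1.0f32; 8];
    let rotated = rotate_half(&x, 5.0, 10000.0);
    // Rotation is an isometry: the norm is unchanged.
    assert!((norm(&x) - norm(&rotated)).abs() < 1e-5);
    // Position 0 gives theta = 0 everywhere, i.e. the identity.
    assert_eq!(rotate_half(&x, 0.0, 10000.0), x);
}
```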

View File

@@ -0,0 +1,171 @@
//! Hyperbolic Attention Mechanism using Poincaré ball model
use super::poincare::{frechet_mean, poincare_distance, project_to_ball};
use crate::error::{AttentionError, AttentionResult};
use crate::traits::Attention;
/// Configuration for hyperbolic attention
#[derive(Debug, Clone)]
pub struct HyperbolicAttentionConfig {
pub dim: usize,
pub curvature: f32,
pub adaptive_curvature: bool,
pub temperature: f32,
pub frechet_max_iter: usize,
pub frechet_tol: f32,
}
impl Default for HyperbolicAttentionConfig {
fn default() -> Self {
Self {
dim: 128,
curvature: -1.0,
adaptive_curvature: false,
temperature: 1.0,
frechet_max_iter: 50,
frechet_tol: 1e-5,
}
}
}
/// Hyperbolic Attention mechanism
pub struct HyperbolicAttention {
config: HyperbolicAttentionConfig,
current_curvature: f32,
}
impl HyperbolicAttention {
pub fn new(config: HyperbolicAttentionConfig) -> Self {
let current_curvature = config.curvature.abs();
Self {
config,
current_curvature,
}
}
pub fn compute_weights(&self, query: &[f32], keys: &[&[f32]]) -> Vec<f32> {
if keys.is_empty() {
return vec![];
}
let scores: Vec<f32> = keys
.iter()
.map(|k| -poincare_distance(query, k, self.current_curvature))
.collect();
self.softmax_with_temperature(&scores)
}
fn softmax_with_temperature(&self, scores: &[f32]) -> Vec<f32> {
if scores.is_empty() {
return vec![];
}
let max_score = scores.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
let exp_scores: Vec<f32> = scores
.iter()
.map(|&s| ((s - max_score) / self.config.temperature).exp())
.collect();
let sum: f32 = exp_scores.iter().sum();
if sum < 1e-10 {
vec![1.0 / scores.len() as f32; scores.len()]
} else {
exp_scores.iter().map(|&e| e / sum).collect()
}
}
pub fn aggregate(&self, weights: &[f32], values: &[&[f32]]) -> Vec<f32> {
if values.is_empty() {
return vec![0.0; self.config.dim];
}
if values.len() == 1 {
return values[0].to_vec();
}
frechet_mean(
values,
Some(weights),
self.current_curvature,
self.config.frechet_max_iter,
self.config.frechet_tol,
)
}
}
impl Attention for HyperbolicAttention {
fn compute(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
) -> AttentionResult<Vec<f32>> {
if keys.is_empty() || values.is_empty() {
return Err(AttentionError::EmptyInput(
"Keys and values cannot be empty".to_string(),
));
}
let query_proj = project_to_ball(query, self.current_curvature, 1e-7);
let keys_proj: Vec<Vec<f32>> = keys
.iter()
.map(|k| project_to_ball(k, self.current_curvature, 1e-7))
.collect();
let values_proj: Vec<Vec<f32>> = values
.iter()
.map(|v| project_to_ball(v, self.current_curvature, 1e-7))
.collect();
let keys_refs: Vec<&[f32]> = keys_proj.iter().map(|k| k.as_slice()).collect();
let weights = self.compute_weights(&query_proj, &keys_refs);
let values_refs: Vec<&[f32]> = values_proj.iter().map(|v| v.as_slice()).collect();
let result = self.aggregate(&weights, &values_refs);
Ok(result)
}
fn compute_with_mask(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
mask: Option<&[bool]>,
) -> AttentionResult<Vec<f32>> {
let query_proj = project_to_ball(query, self.current_curvature, 1e-7);
let keys_proj: Vec<Vec<f32>> = keys
.iter()
.map(|k| project_to_ball(k, self.current_curvature, 1e-7))
.collect();
let values_proj: Vec<Vec<f32>> = values
.iter()
.map(|v| project_to_ball(v, self.current_curvature, 1e-7))
.collect();
let keys_refs: Vec<&[f32]> = keys_proj.iter().map(|k| k.as_slice()).collect();
let mut weights = self.compute_weights(&query_proj, &keys_refs);
if let Some(mask_vec) = mask {
for (i, &masked) in mask_vec.iter().enumerate() {
if !masked && i < weights.len() {
weights[i] = 0.0;
}
}
let sum: f32 = weights.iter().sum();
if sum > 1e-10 {
for w in &mut weights {
*w /= sum;
}
}
}
let values_refs: Vec<&[f32]> = values_proj.iter().map(|v| v.as_slice()).collect();
Ok(self.aggregate(&weights, &values_refs))
}
fn dim(&self) -> usize {
self.config.dim
}
}
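The mask handling in `compute_with_mask` above zeroes masked weights and then renormalizes the survivors so they sum to one again. A minimal sketch of just that step (hypothetical `apply_mask`, using the same `1e-10` guard):

```rust
// Zero out masked entries, then renormalize so the surviving
// weights again sum to 1 (as in `compute_with_mask`).
fn apply_mask(weights: &mut [f32], mask: &[bool]) {
    for (w, &keep) in weights.iter_mut().zip(mask) {
        if !keep {
            *w = 0.0;
        }
    }
    let sum: f32 = weights.iter().sum();
    if sum > 1e-10 {
        for w in weights.iter_mut() {
            *w /= sum;
        }
    }
}

fn main() {
    let mut w = vec![0.5f32, 0.3, 0.2];
    apply_mask(&mut w, &[true, false, true]);
    // 0.3 is dropped; 0.5 and 0.2 are rescaled by 1/0.7.
    assert!(w[1] == 0.0);
    assert!((w.iter().sum::<f32>() - 1.0).abs() < 1e-5);
    assert!((w[0] - 0.5 / 0.7).abs() < 1e-5);
}
```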

View File

@@ -0,0 +1,579 @@
//! Lorentz Cascade Attention (LCA) - A Novel Hyperbolic Attention Mechanism
//!
//! ## Key Innovations
//!
//! 1. **Lorentz Model**: No boundary instability (hyperboloid vs ball)
//! 2. **Busemann Scoring**: O(d) attention weights via dot products only
//! 3. **Closed-Form Centroid**: Einstein midpoint instead of iterative Fréchet
//! 4. **Multi-Curvature Heads**: Adaptive hierarchy depth per head
//! 5. **Cascade Aggregation**: Coarse-to-fine hierarchical refinement
//!
//! ## Theoretical Advantages
//!
//! - **5-10x faster** than Poincaré (no acosh in hot path)
//! - **Numerically stable** (no ball boundary issues)
//! - **Better hierarchy preservation** (multi-scale curvature)
//! - **SIMD-friendly** (mostly dot products)
//!
//! ## References
//!
//! Novel architecture combining:
//! - Lorentz model geometry (Nickel & Kiela 2018)
//! - Busemann functions for hierarchy (Sala et al. 2018)
//! - Einstein midpoint aggregation (Ungar 2008)
//! - Multi-curvature learning (Gu et al. 2019)
// SIMD support available with nightly Rust feature flag
// For stable Rust, we use scalar operations with auto-vectorization hints
/// Small epsilon for numerical stability
const EPS: f32 = 1e-7;
/// Lorentz inner product: ⟨x, y⟩_L = -x₀y₀ + x₁y₁ + ... + xₙyₙ
/// This is the Minkowski metric with signature (-,+,+,...,+)
#[inline]
pub fn lorentz_inner(x: &[f32], y: &[f32]) -> f32 {
debug_assert!(x.len() == y.len());
if x.len() < 2 {
return 0.0;
}
// Time component (negative)
let time = -x[0] * y[0];
// Space components (positive) - scalar loop, auto-vectorized by the compiler
let space: f32 = x[1..].iter().zip(&y[1..]).map(|(a, b)| a * b).sum();
time + space
}
/// Lorentz norm squared: ⟨x, x⟩_L (should be -1 for points on hyperboloid)
#[inline]
pub fn lorentz_norm_sq(x: &[f32]) -> f32 {
lorentz_inner(x, x)
}
/// Project point onto hyperboloid H^n = {x : ⟨x,x⟩_L = -1/c, x₀ > 0}
/// Much more stable than Poincaré ball projection
#[inline]
pub fn project_hyperboloid(x: &[f32], c: f32) -> Vec<f32> {
let space_norm_sq: f32 = x[1..].iter().map(|v| v * v).sum();
let target = -1.0 / c;
// x₀ = sqrt(1/c + ||x_space||²) to satisfy ⟨x,x⟩_L = -1/c
let x0 = ((space_norm_sq - target).max(EPS)).sqrt();
let mut result = Vec::with_capacity(x.len());
result.push(x0);
result.extend_from_slice(&x[1..]);
result
}
/// Lorentz distance: d(x,y) = (1/√c) * arcosh(-c⟨x,y⟩_L)
/// Faster than Poincaré: single arcosh vs complex formula
#[inline]
pub fn lorentz_distance(x: &[f32], y: &[f32], c: f32) -> f32 {
let inner = lorentz_inner(x, y);
let arg = (-c * inner).max(1.0); // Clamp for numerical stability
arg.acosh() / c.sqrt()
}
/// **NOVEL**: Busemann function for hierarchy scoring
///
/// B_ξ(x) measures "progress toward ideal point ξ at infinity"
/// In Lorentz model: B_ξ(x) = log(-⟨x, ξ⟩_L) where ξ is light-like
///
/// This gives us O(d) hierarchy scores via dot products only!
#[inline]
pub fn busemann_score(x: &[f32], xi: &[f32]) -> f32 {
let inner = lorentz_inner(x, xi);
// ξ is light-like (on null cone), so ⟨x,ξ⟩_L < 0 for x on hyperboloid
(-inner).max(EPS).ln()
}
/// **NOVEL**: Horosphere attention weights
///
/// Instead of computing pairwise distances, we compute each key's
/// position relative to a query-defined horosphere.
///
/// Horosphere: {x : B_ξ(x) = B_ξ(q)} - all points at same "depth" as query
///
/// Weight = softmax(B_ξ(k) - B_ξ(q)) naturally gives:
/// - Higher weights to ancestors (smaller Busemann = closer to root)
/// - Lower weights to descendants (larger Busemann = closer to leaves)
pub fn horosphere_attention_weights(
query: &[f32],
keys: &[&[f32]],
focal_direction: &[f32], // Light-like vector defining hierarchy direction
temperature: f32,
) -> Vec<f32> {
if keys.is_empty() {
return vec![];
}
let query_depth = busemann_score(query, focal_direction);
// Compute relative depths (dot products only - very fast!)
let scores: Vec<f32> = keys
.iter()
.map(|k| {
let key_depth = busemann_score(k, focal_direction);
// Negative because we want ancestors (lower depth) to have higher scores
-(key_depth - query_depth) / temperature
})
.collect();
// Stable softmax
let max_score = scores.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
let exp_scores: Vec<f32> = scores.iter().map(|&s| (s - max_score).exp()).collect();
let sum: f32 = exp_scores.iter().sum();
if sum < EPS {
vec![1.0 / keys.len() as f32; keys.len()]
} else {
exp_scores.iter().map(|&e| e / sum).collect()
}
}
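The point of Busemann scoring is that each key's "depth" costs only one Lorentz inner product and a `ln`. A self-contained numeric sketch (hypothetical `lift` helper onto the unit hyperboloid, c = 1) showing that a point with smaller space norm scores lower, i.e. sits "higher" in the hierarchy relative to a light-like focal direction:

```rust
// Minkowski inner product ⟨x,y⟩_L = -x0*y0 + Σ xi*yi, and the
// Busemann score B_ξ(x) = ln(-⟨x,ξ⟩_L) for a light-like ξ.
fn lorentz_inner(x: &[f32], y: &[f32]) -> f32 {
    -x[0] * y[0] + x[1..].iter().zip(&y[1..]).map(|(a, b)| a * b).sum::<f32>()
}

fn busemann(x: &[f32], xi: &[f32]) -> f32 {
    (-lorentz_inner(x, xi)).max(1e-7).ln()
}

// Lift a space vector onto the unit hyperboloid (c = 1): x0 = sqrt(1 + ||s||²).
fn lift(space: &[f32]) -> Vec<f32> {
    let n2: f32 = space.iter().map(|v| v * v).sum();
    let mut x = vec![(1.0 + n2).sqrt()];
    x.extend_from_slice(space);
    x
}

fn main() {
    let focal = [1.0f32, -1.0]; // light-like: -1·1 + (-1)·(-1) = 0
    assert!(lorentz_inner(&focal, &focal).abs() < 1e-6);
    let root = lift(&[0.1]);
    let leaf = lift(&[0.9]);
    // The shallower point scores lower: it is "closer" to the focal ideal point.
    assert!(busemann(&root, &focal) < busemann(&leaf, &focal));
}
```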
/// **NOVEL**: Einstein Midpoint - Closed-form hyperbolic centroid
///
/// Unlike iterative Fréchet mean (50+ iterations), this is O(1)!
///
/// Formula: midpoint = Σ(wᵢγᵢxᵢ) / ||Σ(wᵢγᵢxᵢ)||_L
/// where γᵢ = 1/sqrt(1 + c||xᵢ_space||²) is the Lorentz factor
///
/// This is exact for 2 points, excellent approximation for n points
pub fn einstein_midpoint(points: &[&[f32]], weights: &[f32], c: f32) -> Vec<f32> {
if points.is_empty() {
return vec![];
}
let dim = points[0].len();
let mut weighted_sum = vec![0.0f32; dim];
for (point, &weight) in points.iter().zip(weights) {
// Lorentz factor (relativistic gamma)
let space_norm_sq: f32 = point[1..].iter().map(|v| v * v).sum();
let gamma = 1.0 / (1.0 + c * space_norm_sq).sqrt();
let factor = weight * gamma;
for (i, &val) in point.iter().enumerate() {
weighted_sum[i] += factor * val;
}
}
// Normalize to hyperboloid
project_hyperboloid(&weighted_sum, c)
}
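For two mirror-symmetric points with equal weights, the Einstein midpoint lands on the symmetry axis: the space components cancel in the gamma-weighted sum, and re-projection keeps the result on the hyperboloid. A standalone sketch at c = 1 (hypothetical `project`/`midpoint` helpers mirroring the formulas above):

```rust
// Einstein midpoint with c = 1: gamma-weight the points, sum,
// then re-project the sum onto the hyperboloid ⟨x,x⟩_L = -1.
fn project(x: &[f32]) -> Vec<f32> {
    let n2: f32 = x[1..].iter().map(|v| v * v).sum();
    let mut out = vec![(n2 + 1.0).sqrt()];
    out.extend_from_slice(&x[1..]);
    out
}

fn midpoint(points: &[Vec<f32>], weights: &[f32]) -> Vec<f32> {
    let dim = points[0].len();
    let mut sum = vec![0.0f32; dim];
    for (p, &w) in points.iter().zip(weights) {
        let n2: f32 = p[1..].iter().map(|v| v * v).sum();
        let gamma = 1.0 / (1.0 + n2).sqrt(); // Lorentz factor
        for (s, &v) in sum.iter_mut().zip(p) {
            *s += w * gamma * v;
        }
    }
    project(&sum)
}

fn lorentz_norm_sq(x: &[f32]) -> f32 {
    -x[0] * x[0] + x[1..].iter().map(|v| v * v).sum::<f32>()
}

fn main() {
    let p1 = project(&[0.0, 0.5]);
    let p2 = project(&[0.0, -0.5]);
    let m = midpoint(&[p1, p2], &[0.5, 0.5]);
    // Mirror symmetry: the space component cancels ...
    assert!(m[1].abs() < 1e-6);
    // ... and re-projection keeps the result on the hyperboloid.
    assert!((lorentz_norm_sq(&m) + 1.0).abs() < 1e-5);
}
```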
/// **NOVEL**: Multi-Curvature Cascade Head
///
/// Each attention head operates at a different curvature:
/// - High |c|: Fine hierarchy (deep trees)
/// - Low |c|: Coarse hierarchy (shallow trees)
/// - c → 0: Approaches Euclidean (flat)
///
/// The cascade combines results from coarse to fine
#[derive(Debug, Clone)]
pub struct CascadeHead {
pub curvature: f32,
pub focal_direction: Vec<f32>, // Learned ideal point direction
pub temperature: f32,
pub weight: f32, // Blend weight for this scale
}
impl CascadeHead {
pub fn new(curvature: f32, dim: usize) -> Self {
// Initialize the focal direction as the "upward" hierarchy axis.
// (1, 1, 0, ..., 0) lies on the null cone (light-like):
// ⟨ξ,ξ⟩_L = -1·1 + 1·1 = 0, so it is a valid ideal point.
let mut focal = vec![0.0; dim];
focal[0] = 1.0;
focal[1] = 1.0;
Self {
curvature,
focal_direction: focal,
temperature: 1.0,
weight: 1.0,
}
}
}
/// **NOVEL**: Lorentz Cascade Attention (LCA)
///
/// Multi-scale hyperbolic attention with:
/// 1. Multiple curvature heads (cascade)
/// 2. Busemann-based scoring (O(d) per key)
/// 3. Einstein midpoint aggregation (O(1) vs O(iter))
/// 4. Learned focal directions per head
#[derive(Debug, Clone)]
pub struct LorentzCascadeAttention {
pub dim: usize,
pub heads: Vec<CascadeHead>,
pub use_simd: bool,
}
/// Configuration for LCA
#[derive(Debug, Clone)]
pub struct LCAConfig {
pub dim: usize,
pub num_heads: usize,
pub curvature_range: (f32, f32), // (min, max) curvature magnitudes
pub temperature: f32,
}
impl Default for LCAConfig {
fn default() -> Self {
Self {
dim: 128,
num_heads: 4,
curvature_range: (0.1, 2.0), // Multi-scale
temperature: 1.0,
}
}
}
impl LorentzCascadeAttention {
/// Create new LCA with logarithmically-spaced curvatures
pub fn new(config: LCAConfig) -> Self {
let (c_min, c_max) = config.curvature_range;
let log_min = c_min.ln();
let log_max = c_max.ln();
let heads: Vec<CascadeHead> = (0..config.num_heads)
.map(|i| {
let t = if config.num_heads > 1 {
i as f32 / (config.num_heads - 1) as f32
} else {
0.5
};
let curvature = (log_min + t * (log_max - log_min)).exp();
let mut head = CascadeHead::new(curvature, config.dim);
head.temperature = config.temperature;
head.weight = 1.0 / config.num_heads as f32;
head
})
.collect();
Self {
dim: config.dim,
heads,
use_simd: true,
}
}
/// Compute attention for a single head
fn attend_single_head(
&self,
head: &CascadeHead,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
) -> Vec<f32> {
// 1. Project to hyperboloid at this curvature
let query_h = project_hyperboloid(query, head.curvature);
let keys_h: Vec<Vec<f32>> = keys
.iter()
.map(|k| project_hyperboloid(k, head.curvature))
.collect();
let values_h: Vec<Vec<f32>> = values
.iter()
.map(|v| project_hyperboloid(v, head.curvature))
.collect();
// 2. Compute horosphere attention weights (fast!)
let keys_refs: Vec<&[f32]> = keys_h.iter().map(|k| k.as_slice()).collect();
let weights = horosphere_attention_weights(
&query_h,
&keys_refs,
&head.focal_direction,
head.temperature,
);
// 3. Aggregate via Einstein midpoint (closed-form!)
let values_refs: Vec<&[f32]> = values_h.iter().map(|v| v.as_slice()).collect();
einstein_midpoint(&values_refs, &weights, head.curvature)
}
/// **Main API**: Multi-scale cascade attention
///
/// Combines results from all heads (different curvatures)
/// Coarse heads capture global hierarchy, fine heads capture local
pub fn attend(&self, query: &[f32], keys: &[&[f32]], values: &[&[f32]]) -> Vec<f32> {
if keys.is_empty() || values.is_empty() {
return vec![0.0; self.dim];
}
// Compute attention at each scale
let head_outputs: Vec<Vec<f32>> = self
.heads
.iter()
.map(|head| self.attend_single_head(head, query, keys, values))
.collect();
// Blend across scales (weighted average in tangent space)
let mut result = vec![0.0; self.dim];
let mut total_weight = 0.0;
for (head, output) in self.heads.iter().zip(&head_outputs) {
for (i, &val) in output.iter().enumerate() {
if i < result.len() {
result[i] += head.weight * val;
}
}
total_weight += head.weight;
}
if total_weight > EPS {
for val in &mut result {
*val /= total_weight;
}
}
result
}
/// Sparse attention: only attend to k-nearest in hyperbolic space
pub fn attend_sparse(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
top_k: usize,
) -> Vec<f32> {
if keys.len() <= top_k {
return self.attend(query, keys, values);
}
// Use coarsest head (lowest curvature) for neighbor selection
let coarse_head = &self.heads[0];
let query_h = project_hyperboloid(query, coarse_head.curvature);
// Compute Busemann scores for all keys (very fast - just dot products)
let mut scored_indices: Vec<(usize, f32)> = keys
.iter()
.enumerate()
.map(|(i, k)| {
let key_h = project_hyperboloid(k, coarse_head.curvature);
let score = busemann_score(&key_h, &coarse_head.focal_direction);
(i, score)
})
.collect();
// Sort by proximity to query in hierarchy
let query_score = busemann_score(&query_h, &coarse_head.focal_direction);
scored_indices.sort_by(|a, b| {
let dist_a = (a.1 - query_score).abs();
let dist_b = (b.1 - query_score).abs();
dist_a.partial_cmp(&dist_b).unwrap()
});
// Take top-k
let selected_indices: Vec<usize> =
scored_indices.iter().take(top_k).map(|(i, _)| *i).collect();
let selected_keys: Vec<&[f32]> = selected_indices.iter().map(|&i| keys[i]).collect();
let selected_values: Vec<&[f32]> = selected_indices.iter().map(|&i| values[i]).collect();
self.attend(query, &selected_keys, &selected_values)
}
}
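`LorentzCascadeAttention::new` spaces head curvatures geometrically (log-uniformly) between the range endpoints, so each head sees the hierarchy at a constant multiplicative step in scale. A sketch of just that schedule (hypothetical `curvature_schedule`, same interpolation as `new`):

```rust
// Geometric (log-uniform) spacing of curvatures between c_min and c_max,
// as done in `LorentzCascadeAttention::new`.
fn curvature_schedule(c_min: f32, c_max: f32, num_heads: usize) -> Vec<f32> {
    let (log_min, log_max) = (c_min.ln(), c_max.ln());
    (0..num_heads)
        .map(|i| {
            let t = if num_heads > 1 {
                i as f32 / (num_heads - 1) as f32
            } else {
                0.5
            };
            (log_min + t * (log_max - log_min)).exp()
        })
        .collect()
}

fn main() {
    let cs = curvature_schedule(0.1, 2.0, 4);
    // Endpoints are hit exactly (up to float error) ...
    assert!((cs[0] - 0.1).abs() < 1e-4);
    assert!((cs[3] - 2.0).abs() < 1e-4);
    // ... and the ratio between consecutive heads is constant (geometric spacing).
    let r01 = cs[1] / cs[0];
    let r12 = cs[2] / cs[1];
    assert!((r01 - r12).abs() < 1e-3);
}
```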
/// **NOVEL**: Tangent space operations for gradient computation
/// These enable efficient backpropagation through hyperbolic operations
pub mod tangent {
use super::*;
/// Logarithmic map: Hyperboloid → Tangent space at origin
/// Much simpler than Poincaré log map
pub fn log_map_origin(x: &[f32], c: f32) -> Vec<f32> {
let x0 = x[0];
let space = &x[1..];
let space_norm: f32 = space.iter().map(|v| v * v).sum::<f32>().sqrt();
if space_norm < EPS {
return vec![0.0; x.len() - 1];
}
let factor = (c.sqrt() * x0).acosh() / space_norm;
space.iter().map(|&v| factor * v).collect()
}
/// Exponential map: Tangent space at origin → Hyperboloid
pub fn exp_map_origin(v: &[f32], c: f32) -> Vec<f32> {
let v_norm: f32 = v.iter().map(|x| x * x).sum::<f32>().sqrt();
if v_norm < EPS {
let mut result = vec![0.0; v.len() + 1];
result[0] = 1.0 / c.sqrt(); // Point at origin of hyperboloid
return result;
}
let sqrt_c = c.sqrt();
let x0 = (sqrt_c * v_norm).cosh() / sqrt_c;
let factor = (sqrt_c * v_norm).sinh() / (sqrt_c * v_norm);
let mut result = Vec::with_capacity(v.len() + 1);
result.push(x0);
result.extend(v.iter().map(|&vi| factor * vi));
result
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_lorentz_inner_hyperboloid() {
// Point on hyperboloid with c=1: (cosh(t), sinh(t), 0, ...)
let point = vec![1.5430806, 1.1752012, 0.0, 0.0]; // cosh(1), sinh(1)
let norm_sq = lorentz_norm_sq(&point);
// Should be approximately -1 (on unit hyperboloid)
assert!((norm_sq + 1.0).abs() < 0.01);
}
#[test]
fn test_einstein_midpoint_two_points() {
let c = 1.0;
let p1 = project_hyperboloid(&[1.0, 0.5, 0.0], c);
let p2 = project_hyperboloid(&[1.0, -0.5, 0.0], c);
let weights = vec![0.5, 0.5];
let midpoint = einstein_midpoint(&[p1.as_slice(), p2.as_slice()], &weights, c);
// Midpoint should be on hyperboloid
let norm_sq = lorentz_norm_sq(&midpoint);
assert!((norm_sq + 1.0 / c).abs() < 0.1);
// Midpoint should be between the two points (space component ≈ 0)
assert!(midpoint[1].abs() < 0.1);
}
#[test]
fn test_busemann_hierarchy() {
// Focal direction pointing "up" in hierarchy (light-like: ⟨ξ,ξ⟩_L = 0)
// For hierarchy, we want focal pointing toward the "root" of the tree
let focal = vec![1.0, -1.0, 0.0, 0.0]; // Light-like, pointing toward negative space
// Points on hyperboloid with 4 dimensions (1 time + 3 space)
// Root is closer to origin in space, leaf is further out
let root = project_hyperboloid(&[0.0, 0.1, 0.0, 0.0], 1.0);
let leaf = project_hyperboloid(&[0.0, 0.9, 0.0, 0.0], 1.0);
let root_score = busemann_score(&root, &focal);
let leaf_score = busemann_score(&leaf, &focal);
// With focal pointing toward negative space direction,
// root (smaller positive space) is "higher" in hierarchy (lower Busemann)
// This is because B_ξ(x) = log(-⟨x,ξ⟩_L) and we want root closer to ξ
assert!(
root_score < leaf_score,
"root_score={:.4} should be < leaf_score={:.4}\nroot={:?}, leaf={:?}",
root_score,
leaf_score,
root,
leaf
);
}
#[test]
fn test_cascade_attention_shapes() {
let config = LCAConfig {
dim: 8,
num_heads: 3,
curvature_range: (0.5, 2.0),
temperature: 1.0,
};
let lca = LorentzCascadeAttention::new(config);
let query = vec![1.0, 0.5, 0.3, 0.1, 0.0, 0.0, 0.0, 0.0];
let key1 = vec![1.0, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0];
let key2 = vec![1.0, 0.8, 0.4, 0.2, 0.0, 0.0, 0.0, 0.0];
let keys: Vec<&[f32]> = vec![&key1, &key2];
let values = keys.clone();
let output = lca.attend(&query, &keys, &values);
assert_eq!(output.len(), 8);
assert!(output.iter().all(|x| x.is_finite()));
}
#[test]
fn test_horosphere_weights_sum_to_one() {
// Create points on hyperboloid with 4 dimensions (1 time + 3 space)
// Input format: [time, space1, space2, space3]
let focal = vec![1.0, 1.0, 0.0, 0.0]; // Light-like direction
// project_hyperboloid takes [time_placeholder, space...] and computes correct time
let query = project_hyperboloid(&[0.0, 0.5, 0.0, 0.0], 1.0);
let k1 = project_hyperboloid(&[0.0, 0.2, 0.0, 0.0], 1.0);
let k2 = project_hyperboloid(&[0.0, 0.6, 0.0, 0.0], 1.0);
let k3 = project_hyperboloid(&[0.0, 0.9, 0.0, 0.0], 1.0);
let keys: Vec<&[f32]> = vec![&k1, &k2, &k3];
let weights = horosphere_attention_weights(&query, &keys, &focal, 1.0);
let sum: f32 = weights.iter().sum();
assert!((sum - 1.0).abs() < 1e-5);
}
}
// Benchmarking utilities
#[cfg(feature = "benchmark")]
pub mod bench {
use super::*;
use std::time::Instant;
/// Benchmark LCA vs Poincaré attention
pub fn compare_performance(n_keys: usize, dim: usize, iterations: usize) {
use crate::hyperbolic::poincare::{frechet_mean, poincare_distance};
// Generate random data
let query: Vec<f32> = (0..dim).map(|i| (i as f32 * 0.1).sin() * 0.5).collect();
let keys: Vec<Vec<f32>> = (0..n_keys)
.map(|j| {
(0..dim)
.map(|i| ((i + j) as f32 * 0.1).cos() * 0.5)
.collect()
})
.collect();
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
// Benchmark Poincaré
let start = Instant::now();
for _ in 0..iterations {
let scores: Vec<f32> = keys_refs
.iter()
.map(|k| -poincare_distance(&query, k, 1.0))
.collect();
let _mean = frechet_mean(&keys_refs, None, 1.0, 50, 1e-5);
}
let poincare_time = start.elapsed();
// Benchmark LCA
let lca = LorentzCascadeAttention::new(LCAConfig {
dim,
num_heads: 4,
curvature_range: (0.1, 2.0),
temperature: 1.0,
});
let start = Instant::now();
for _ in 0..iterations {
let _output = lca.attend(&query, &keys_refs, &keys_refs);
}
let lca_time = start.elapsed();
println!(
"=== Performance Comparison (n={}, d={}, iter={}) ===",
n_keys, dim, iterations
);
println!("Poincaré Attention: {:?}", poincare_time);
println!("Lorentz Cascade: {:?}", lca_time);
println!(
"Speedup: {:.2}x",
poincare_time.as_nanos() as f64 / lca_time.as_nanos() as f64
);
}
}
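The Lorentz operations above all hinge on the Minkowski inner product and the hyperboloid constraint ⟨x,x⟩_L = -1/c. A minimal standalone sketch of that invariant (helper names here are illustrative, not the crate's API):

```rust
// Minkowski (Lorentz) inner product: <x,y>_L = -x0*y0 + sum_i xi*yi
fn lorentz_inner(x: &[f32], y: &[f32]) -> f32 {
    -x[0] * y[0] + x[1..].iter().zip(&y[1..]).map(|(a, b)| a * b).sum::<f32>()
}

// Lift a Euclidean space vector onto the hyperboloid <x,x>_L = -1/c
// by solving for the time component: x0 = sqrt(1/c + ||space||^2).
fn lift_to_hyperboloid(space: &[f32], c: f32) -> Vec<f32> {
    let norm_sq: f32 = space.iter().map(|v| v * v).sum();
    let mut x = Vec::with_capacity(space.len() + 1);
    x.push((1.0 / c + norm_sq).sqrt());
    x.extend_from_slice(space);
    x
}

fn main() {
    let c = 1.0;
    let p = lift_to_hyperboloid(&[0.5, -0.25], c);
    // The lifted point satisfies the hyperboloid constraint <p,p>_L = -1/c.
    assert!((lorentz_inner(&p, &p) + 1.0 / c).abs() < 1e-5);
    println!("ok");
}
```

Because the time component is recomputed from the space part, the constraint holds exactly up to float rounding, which is why `project_hyperboloid` can take a placeholder time coordinate.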

@@ -0,0 +1,240 @@
//! Mixed-Curvature Attention combining Euclidean and Hyperbolic spaces
use super::poincare::{frechet_mean, poincare_distance, project_to_ball};
use crate::error::AttentionResult;
use crate::traits::Attention;
#[derive(Debug, Clone)]
pub struct MixedCurvatureConfig {
pub euclidean_dim: usize,
pub hyperbolic_dim: usize,
pub curvature: f32,
pub mixing_weight: f32,
pub temperature: f32,
pub frechet_max_iter: usize,
pub frechet_tol: f32,
}
impl Default for MixedCurvatureConfig {
fn default() -> Self {
Self {
euclidean_dim: 64,
hyperbolic_dim: 64,
curvature: -1.0,
mixing_weight: 0.5,
temperature: 1.0,
frechet_max_iter: 50,
frechet_tol: 1e-5,
}
}
}
pub struct MixedCurvatureAttention {
config: MixedCurvatureConfig,
}
impl MixedCurvatureAttention {
pub fn new(config: MixedCurvatureConfig) -> Self {
Self { config }
}
fn total_dim(&self) -> usize {
self.config.euclidean_dim + self.config.hyperbolic_dim
}
fn split_embedding<'a>(&self, x: &'a [f32]) -> (&'a [f32], &'a [f32]) {
let euclidean = &x[..self.config.euclidean_dim];
let hyperbolic = &x[self.config.euclidean_dim..];
(euclidean, hyperbolic)
}
fn softmax(&self, scores: &[f32]) -> Vec<f32> {
if scores.is_empty() {
return vec![];
}
let max_score = scores.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
let exp_scores: Vec<f32> = scores
.iter()
.map(|&s| ((s - max_score) / self.config.temperature).exp())
.collect();
let sum: f32 = exp_scores.iter().sum();
if sum < 1e-10 {
vec![1.0 / scores.len() as f32; scores.len()]
} else {
exp_scores.iter().map(|&e| e / sum).collect()
}
}
fn compute_euclidean_weights(&self, query: &[f32], keys: &[&[f32]]) -> Vec<f32> {
let scores: Vec<f32> = keys
.iter()
.map(|k| query.iter().zip(k.iter()).map(|(q, k)| q * k).sum())
.collect();
self.softmax(&scores)
}
fn compute_hyperbolic_weights(&self, query: &[f32], keys: &[&[f32]]) -> Vec<f32> {
let c = self.config.curvature.abs();
let query_proj = project_to_ball(query, c, 1e-7);
let keys_proj: Vec<Vec<f32>> = keys.iter().map(|k| project_to_ball(k, c, 1e-7)).collect();
let scores: Vec<f32> = keys_proj
.iter()
.map(|k| -poincare_distance(&query_proj, k, c))
.collect();
self.softmax(&scores)
}
fn aggregate_euclidean(&self, weights: &[f32], values: &[&[f32]]) -> Vec<f32> {
let dim = values.get(0).map(|v| v.len()).unwrap_or(0);
let mut result = vec![0.0; dim];
for (weight, value) in weights.iter().zip(values.iter()) {
for (i, &v) in value.iter().enumerate() {
result[i] += weight * v;
}
}
result
}
fn aggregate_hyperbolic(&self, weights: &[f32], values: &[&[f32]]) -> Vec<f32> {
if values.is_empty() {
return vec![0.0; self.config.hyperbolic_dim];
}
let c = self.config.curvature.abs();
let values_proj: Vec<Vec<f32>> =
values.iter().map(|v| project_to_ball(v, c, 1e-7)).collect();
let values_refs: Vec<&[f32]> = values_proj.iter().map(|v| v.as_slice()).collect();
frechet_mean(
&values_refs,
Some(weights),
c,
self.config.frechet_max_iter,
self.config.frechet_tol,
)
}
fn combine_components(&self, euclidean: Vec<f32>, hyperbolic: Vec<f32>) -> Vec<f32> {
let mut result = Vec::with_capacity(self.total_dim());
result.extend(euclidean);
result.extend(hyperbolic);
result
}
}
impl Attention for MixedCurvatureAttention {
    fn compute(
        &self,
        query: &[f32],
        keys: &[&[f32]],
        values: &[&[f32]],
    ) -> AttentionResult<Vec<f32>> {
        // Unmasked attention is the masked path with no mask applied.
        self.compute_with_mask(query, keys, values, None)
    }
fn compute_with_mask(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
mask: Option<&[bool]>,
) -> AttentionResult<Vec<f32>> {
let (query_euc, query_hyp) = self.split_embedding(query);
let keys_euc: Vec<&[f32]> = keys
.iter()
.map(|k| &k[..self.config.euclidean_dim])
.collect();
let keys_hyp: Vec<&[f32]> = keys
.iter()
.map(|k| &k[self.config.euclidean_dim..])
.collect();
let values_euc: Vec<&[f32]> = values
.iter()
.map(|v| &v[..self.config.euclidean_dim])
.collect();
let values_hyp: Vec<&[f32]> = values
.iter()
.map(|v| &v[self.config.euclidean_dim..])
.collect();
let weights_euc = self.compute_euclidean_weights(query_euc, &keys_euc);
let weights_hyp = self.compute_hyperbolic_weights(query_hyp, &keys_hyp);
let alpha = self.config.mixing_weight;
let mut combined_weights: Vec<f32> = weights_euc
.iter()
.zip(&weights_hyp)
.map(|(&w_e, &w_h)| (1.0 - alpha) * w_e + alpha * w_h)
.collect();
if let Some(mask_vec) = mask {
for (i, &masked) in mask_vec.iter().enumerate() {
if !masked && i < combined_weights.len() {
combined_weights[i] = 0.0;
}
}
}
let sum: f32 = combined_weights.iter().sum();
let normalized_weights: Vec<f32> = if sum > 1e-10 {
combined_weights.iter().map(|&w| w / sum).collect()
} else {
vec![1.0 / combined_weights.len() as f32; combined_weights.len()]
};
let result_euc = self.aggregate_euclidean(&normalized_weights, &values_euc);
let result_hyp = self.aggregate_hyperbolic(&normalized_weights, &values_hyp);
Ok(self.combine_components(result_euc, result_hyp))
}
fn dim(&self) -> usize {
self.total_dim()
}
}
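The core of `MixedCurvatureAttention` is the convex blend of the two softmax weight vectors, `(1 - alpha) * w_e + alpha * w_h`, followed by renormalization. A self-contained sketch of just that step (the function name is illustrative):

```rust
// Blend Euclidean and hyperbolic attention weights:
// w_i = (1 - alpha) * w_e[i] + alpha * w_h[i], then renormalize to sum to 1.
fn blend_weights(w_e: &[f32], w_h: &[f32], alpha: f32) -> Vec<f32> {
    let mut w: Vec<f32> = w_e
        .iter()
        .zip(w_h)
        .map(|(&e, &h)| (1.0 - alpha) * e + alpha * h)
        .collect();
    let sum: f32 = w.iter().sum();
    if sum > 1e-10 {
        for wi in w.iter_mut() {
            *wi /= sum;
        }
    }
    w
}

fn main() {
    let w = blend_weights(&[0.7, 0.3], &[0.2, 0.8], 0.5);
    let sum: f32 = w.iter().sum();
    assert!((sum - 1.0).abs() < 1e-6);
    // With alpha = 0.5 the blend is the simple average of the two rows.
    assert!((w[0] - 0.45).abs() < 1e-6);
    println!("ok");
}
```

The renormalization matters once a mask zeroes entries: the blend of two probability vectors already sums to 1, but the masked variant does not.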

@@ -0,0 +1,25 @@
//! Hyperbolic Attention Module
//!
//! Implements attention mechanisms in hyperbolic space using:
//! - Poincaré ball model (traditional)
//! - Lorentz hyperboloid model (novel - faster, more stable)
pub mod hyperbolic_attention;
pub mod lorentz_cascade;
pub mod mixed_curvature;
pub mod poincare;
pub use poincare::{
exp_map, frechet_mean, log_map, mobius_add, mobius_scalar_mult, poincare_distance,
project_to_ball,
};
pub use hyperbolic_attention::{HyperbolicAttention, HyperbolicAttentionConfig};
pub use mixed_curvature::{MixedCurvatureAttention, MixedCurvatureConfig};
// Novel Lorentz Cascade Attention (LCA)
pub use lorentz_cascade::{
busemann_score, einstein_midpoint, horosphere_attention_weights, lorentz_distance,
lorentz_inner, project_hyperboloid, CascadeHead, LCAConfig, LorentzCascadeAttention,
};

@@ -0,0 +1,180 @@
//! Poincaré Ball Model Operations for Hyperbolic Geometry
//!
//! This module implements core operations in the Poincaré ball model of hyperbolic space,
//! providing mathematically correct implementations with numerical stability guarantees.
/// Small epsilon for numerical stability
const EPS: f32 = 1e-7;
/// Compute the squared Euclidean norm of a vector
#[inline]
fn norm_squared(x: &[f32]) -> f32 {
x.iter().map(|&v| v * v).sum()
}
/// Compute the Euclidean norm of a vector
#[inline]
fn norm(x: &[f32]) -> f32 {
norm_squared(x).sqrt()
}
/// Compute Poincaré distance between two points in hyperbolic space
pub fn poincare_distance(u: &[f32], v: &[f32], c: f32) -> f32 {
let c = c.abs();
let sqrt_c = c.sqrt();
let diff: Vec<f32> = u.iter().zip(v).map(|(a, b)| a - b).collect();
let norm_diff_sq = norm_squared(&diff);
let norm_u_sq = norm_squared(u);
let norm_v_sq = norm_squared(v);
let lambda_u = 1.0 - c * norm_u_sq;
let lambda_v = 1.0 - c * norm_v_sq;
let numerator = 2.0 * c * norm_diff_sq;
let denominator = lambda_u * lambda_v;
let arg = 1.0 + numerator / denominator.max(EPS);
(1.0 / sqrt_c) * arg.max(1.0).acosh()
}
/// Möbius addition in Poincaré ball
pub fn mobius_add(u: &[f32], v: &[f32], c: f32) -> Vec<f32> {
let c = c.abs();
let norm_u_sq = norm_squared(u);
let norm_v_sq = norm_squared(v);
let dot_uv: f32 = u.iter().zip(v).map(|(a, b)| a * b).sum();
let coef_u = 1.0 + 2.0 * c * dot_uv + c * norm_v_sq;
let coef_v = 1.0 - c * norm_u_sq;
let denom = 1.0 + 2.0 * c * dot_uv + c * c * norm_u_sq * norm_v_sq;
let result: Vec<f32> = u
.iter()
.zip(v)
.map(|(ui, vi)| (coef_u * ui + coef_v * vi) / denom.max(EPS))
.collect();
project_to_ball(&result, c, EPS)
}
/// Möbius scalar multiplication
pub fn mobius_scalar_mult(r: f32, v: &[f32], c: f32) -> Vec<f32> {
let c = c.abs();
let sqrt_c = c.sqrt();
let norm_v = norm(v);
if norm_v < EPS {
return v.to_vec();
}
let arctanh_arg = (sqrt_c * norm_v).min(1.0 - EPS);
let scale = (1.0 / sqrt_c) * (r * arctanh_arg.atanh()).tanh() / norm_v;
v.iter().map(|&vi| scale * vi).collect()
}
/// Exponential map: maps tangent vector v at point p to hyperbolic space
pub fn exp_map(v: &[f32], p: &[f32], c: f32) -> Vec<f32> {
let c = c.abs();
let sqrt_c = c.sqrt();
let norm_p_sq = norm_squared(p);
let lambda_p = 1.0 / (1.0 - c * norm_p_sq).max(EPS);
let norm_v = norm(v);
let norm_v_p = lambda_p * norm_v;
if norm_v < EPS {
return p.to_vec();
}
    // Ganea et al. (2018) exp map: scale by tanh(sqrt(c)*lambda_p*||v||/2) / (sqrt(c)*||v||),
    // so that exp_p is the inverse of log_p below.
    let coef = (sqrt_c * norm_v_p / 2.0).tanh() / (sqrt_c * norm_v);
let transported: Vec<f32> = v.iter().map(|&vi| coef * vi).collect();
mobius_add(p, &transported, c)
}
/// Logarithmic map: maps point y to tangent space at point p
pub fn log_map(y: &[f32], p: &[f32], c: f32) -> Vec<f32> {
let c = c.abs();
let sqrt_c = c.sqrt();
let neg_p: Vec<f32> = p.iter().map(|&pi| -pi).collect();
let diff = mobius_add(&neg_p, y, c);
let norm_diff = norm(&diff);
if norm_diff < EPS {
return vec![0.0; y.len()];
}
let norm_p_sq = norm_squared(p);
let lambda_p = 1.0 / (1.0 - c * norm_p_sq).max(EPS);
let arctanh_arg = (sqrt_c * norm_diff).min(1.0 - EPS);
let coef = (2.0 / (sqrt_c * lambda_p)) * arctanh_arg.atanh() / norm_diff;
diff.iter().map(|&di| coef * di).collect()
}
/// Project point to Poincaré ball
pub fn project_to_ball(x: &[f32], c: f32, eps: f32) -> Vec<f32> {
let c = c.abs();
let norm_x = norm(x);
let max_norm = (1.0 / c.sqrt()) - eps;
if norm_x < max_norm {
x.to_vec()
} else {
let scale = max_norm / norm_x.max(EPS);
x.iter().map(|&xi| scale * xi).collect()
}
}
/// Compute the Fréchet mean (centroid) of points in hyperbolic space
pub fn frechet_mean(
points: &[&[f32]],
weights: Option<&[f32]>,
c: f32,
max_iter: usize,
tol: f32,
) -> Vec<f32> {
    if points.is_empty() {
        return vec![];
    }
    let dim = points[0].len();
let c = c.abs();
let uniform_weights: Vec<f32>;
let w = if let Some(weights) = weights {
weights
} else {
uniform_weights = vec![1.0 / points.len() as f32; points.len()];
&uniform_weights
};
let mut mean = vec![0.0; dim];
for (point, &weight) in points.iter().zip(w) {
for (i, &val) in point.iter().enumerate() {
mean[i] += weight * val;
}
}
mean = project_to_ball(&mean, c, EPS);
let learning_rate = 0.1;
for _ in 0..max_iter {
let mut grad = vec![0.0; dim];
for (point, &weight) in points.iter().zip(w) {
let log_map_result = log_map(point, &mean, c);
for (i, &val) in log_map_result.iter().enumerate() {
grad[i] += weight * val;
}
}
if norm(&grad) < tol {
break;
}
let update: Vec<f32> = grad.iter().map(|&g| learning_rate * g).collect();
mean = exp_map(&update, &mean, c);
}
project_to_ball(&mean, c, EPS)
}
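The distance formula above can be checked against the basic metric axioms. A minimal standalone sketch (mirroring, but independent of, the crate's `poincare_distance`):

```rust
// Poincaré distance for curvature -c (c > 0):
// d(u, v) = (1/sqrt(c)) * acosh(1 + 2c||u-v||^2 / ((1 - c||u||^2)(1 - c||v||^2)))
fn poincare_distance(u: &[f32], v: &[f32], c: f32) -> f32 {
    let nsq = |x: &[f32]| x.iter().map(|&a| a * a).sum::<f32>();
    let diff_sq: f32 = u.iter().zip(v).map(|(a, b)| (a - b) * (a - b)).sum();
    let arg = 1.0 + 2.0 * c * diff_sq / ((1.0 - c * nsq(u)) * (1.0 - c * nsq(v))).max(1e-7);
    (1.0 / c.sqrt()) * arg.max(1.0).acosh()
}

fn main() {
    let u = [0.3f32, 0.1];
    let v = [-0.2f32, 0.4];
    // Identity of indiscernibles and symmetry hold, as for any metric.
    assert!(poincare_distance(&u, &u, 1.0).abs() < 1e-6);
    let d_uv = poincare_distance(&u, &v, 1.0);
    let d_vu = poincare_distance(&v, &u, 1.0);
    assert!((d_uv - d_vu).abs() < 1e-6);
    assert!(d_uv > 0.0);
    println!("ok");
}
```

The `arg.max(1.0)` clamp is the same guard the crate uses: `acosh` is only defined for arguments ≥ 1, and float cancellation can push the argument slightly below.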

@@ -0,0 +1,212 @@
//! Information Bottleneck Layer
//!
//! Apply information bottleneck principle to attention.
use super::kl_divergence::{DiagonalGaussian, KLDivergence};
use serde::{Deserialize, Serialize};
/// Information Bottleneck configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IBConfig {
/// Bottleneck dimension
pub bottleneck_dim: usize,
/// Beta parameter (tradeoff between compression and reconstruction)
pub beta: f32,
/// Minimum variance (for numerical stability)
pub min_var: f32,
/// Whether to use reparameterization trick
pub reparameterize: bool,
}
impl Default for IBConfig {
fn default() -> Self {
Self {
bottleneck_dim: 64,
beta: 1e-3,
min_var: 1e-4,
reparameterize: true,
}
}
}
/// Information Bottleneck for Attention
///
/// Compresses attention representations through a variational bottleneck.
/// Loss = Reconstruction + beta * KL(q(z|x) || p(z))
#[derive(Debug, Clone)]
pub struct InformationBottleneck {
config: IBConfig,
}
impl InformationBottleneck {
/// Create new information bottleneck
pub fn new(config: IBConfig) -> Self {
Self { config }
}
/// Compute IB KL term for attention values
/// Assumes values encode (mean, log_var) in first 2*bottleneck_dim dims
pub fn compute_kl_loss(&self, mean: &[f32], log_var: &[f32]) -> f32 {
let kl = KLDivergence::gaussian_to_unit_arrays(mean, log_var);
self.config.beta * kl
}
/// Compute IB KL term from DiagonalGaussian
pub fn compute_kl_loss_gaussian(&self, gaussian: &DiagonalGaussian) -> f32 {
let kl = KLDivergence::gaussian_to_unit(gaussian);
self.config.beta * kl
}
/// Sample from bottleneck distribution (for forward pass)
pub fn sample(&self, mean: &[f32], log_var: &[f32], epsilon: &[f32]) -> Vec<f32> {
let n = mean.len().min(log_var.len()).min(epsilon.len());
let mut z = vec![0.0f32; n];
for i in 0..n {
let lv = log_var[i].max(self.config.min_var.ln());
// Security: clamp to prevent exp() overflow
let std = (0.5 * lv.clamp(-20.0, 20.0)).exp();
z[i] = mean[i] + std * epsilon[i];
}
z
}
    /// Compute gradient of the beta-weighted KL term w.r.t. mean and log_var
    /// d(beta*KL) / d mu = beta * mu
    /// d(beta*KL) / d log_var = beta * 0.5 * (exp(log_var) - 1)
pub fn kl_gradients(&self, mean: &[f32], log_var: &[f32]) -> (Vec<f32>, Vec<f32>) {
let n = mean.len().min(log_var.len()); // Security: bounds check
let mut d_mean = vec![0.0f32; n];
let mut d_log_var = vec![0.0f32; n];
for i in 0..n {
d_mean[i] = self.config.beta * mean[i];
// Security: clamp log_var to prevent exp() overflow
let lv_clamped = log_var[i].clamp(-20.0, 20.0);
d_log_var[i] = self.config.beta * 0.5 * (lv_clamped.exp() - 1.0);
}
(d_mean, d_log_var)
}
/// Apply bottleneck to attention weights
/// Returns: (compressed_weights, kl_loss)
pub fn compress_attention_weights(&self, weights: &[f32], temperature: f32) -> (Vec<f32>, f32) {
let n = weights.len();
// Compute entropy-based compression
let entropy = self.compute_entropy(weights);
// Target is uniform distribution (maximum entropy)
let uniform_entropy = (n as f32).ln();
// KL from attention to uniform is the "information" we're encoding
let kl = (uniform_entropy - entropy).max(0.0);
// Apply temperature scaling
let mut compressed = weights.to_vec();
for w in compressed.iter_mut() {
*w = (*w).powf(1.0 / temperature.max(0.1));
}
// Renormalize
let sum: f32 = compressed.iter().sum();
if sum > 0.0 {
for w in compressed.iter_mut() {
*w /= sum;
}
}
(compressed, self.config.beta * kl)
}
/// Compute entropy of attention distribution
fn compute_entropy(&self, weights: &[f32]) -> f32 {
let eps = 1e-10;
let mut entropy = 0.0f32;
for &w in weights {
if w > eps {
entropy -= w * w.ln();
}
}
entropy.max(0.0)
}
/// Rate-distortion tradeoff
/// Higher beta = more compression, lower rate
pub fn set_beta(&mut self, beta: f32) {
self.config.beta = beta.max(0.0);
}
/// Get current beta
pub fn beta(&self) -> f32 {
self.config.beta
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_ib_kl_loss() {
let ib = InformationBottleneck::new(IBConfig::default());
// Unit Gaussian = 0 KL
let mean = vec![0.0; 16];
let log_var = vec![0.0; 16];
let loss = ib.compute_kl_loss(&mean, &log_var);
assert!(loss.abs() < 1e-5);
}
#[test]
fn test_ib_sample() {
let ib = InformationBottleneck::new(IBConfig::default());
let mean = vec![1.0, 2.0];
let log_var = vec![0.0, 0.0];
let epsilon = vec![0.0, 0.0];
let z = ib.sample(&mean, &log_var, &epsilon);
assert!((z[0] - 1.0).abs() < 1e-5);
assert!((z[1] - 2.0).abs() < 1e-5);
}
#[test]
fn test_kl_gradients() {
let ib = InformationBottleneck::new(IBConfig {
beta: 1.0,
..Default::default()
});
let mean = vec![1.0, 0.0];
let log_var = vec![0.0, 0.0];
let (d_mean, d_log_var) = ib.kl_gradients(&mean, &log_var);
assert!((d_mean[0] - 1.0).abs() < 1e-5);
assert!((d_mean[1] - 0.0).abs() < 1e-5);
assert!((d_log_var[0] - 0.0).abs() < 1e-5);
}
#[test]
fn test_compress_weights() {
let ib = InformationBottleneck::new(IBConfig::default());
let weights = vec![0.7, 0.2, 0.1];
let (compressed, kl) = ib.compress_attention_weights(&weights, 1.0);
assert_eq!(compressed.len(), 3);
assert!(kl >= 0.0);
// Should still sum to 1
let sum: f32 = compressed.iter().sum();
assert!((sum - 1.0).abs() < 1e-5);
}
}
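`InformationBottleneck::sample` is the reparameterization trick: the stochastic node `z ~ N(mean, var)` is rewritten as a deterministic function of `(mean, log_var)` plus independent noise, so gradients flow through `mean` and `log_var`. A standalone sketch of that one step (the function name is illustrative):

```rust
// Reparameterization: z = mean + exp(0.5 * log_var) * epsilon, epsilon ~ N(0, 1).
// log_var is clamped, as in the crate, to keep exp() from overflowing.
fn reparameterize(mean: &[f32], log_var: &[f32], epsilon: &[f32]) -> Vec<f32> {
    mean.iter()
        .zip(log_var)
        .zip(epsilon)
        .map(|((&mu, &lv), &eps)| mu + (0.5 * lv.clamp(-20.0, 20.0)).exp() * eps)
        .collect()
}

fn main() {
    // With epsilon = 0 the sample collapses to the mean (deterministic path).
    let z = reparameterize(&[1.0, -2.0], &[0.0, 0.0], &[0.0, 0.0]);
    assert!((z[0] - 1.0).abs() < 1e-6 && (z[1] + 2.0).abs() < 1e-6);
    // With log_var = 0 the std is 1, so epsilon shifts the mean directly.
    let z = reparameterize(&[0.0], &[0.0], &[1.5]);
    assert!((z[0] - 1.5).abs() < 1e-6);
    println!("ok");
}
```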

@@ -0,0 +1,204 @@
//! KL Divergence Computations
//!
//! Efficient KL divergence for various distributions used in attention.
/// Diagonal Gaussian parameters
#[derive(Debug, Clone)]
pub struct DiagonalGaussian {
/// Mean vector
pub mean: Vec<f32>,
/// Log variance vector
pub log_var: Vec<f32>,
}
impl DiagonalGaussian {
/// Create from mean and log variance
pub fn new(mean: Vec<f32>, log_var: Vec<f32>) -> Self {
Self { mean, log_var }
}
/// Create unit Gaussian (mean=0, var=1)
pub fn unit(dim: usize) -> Self {
Self {
mean: vec![0.0; dim],
log_var: vec![0.0; dim],
}
}
/// Sample using reparameterization trick
/// z = mean + std * epsilon, where epsilon ~ N(0, 1)
pub fn sample(&self, epsilon: &[f32]) -> Vec<f32> {
let n = self.mean.len();
let mut z = vec![0.0f32; n];
for i in 0..n {
let std = (0.5 * self.log_var[i]).exp();
z[i] = self.mean[i] + std * epsilon[i];
}
z
}
/// Get variance
pub fn variance(&self) -> Vec<f32> {
self.log_var.iter().map(|&lv| lv.exp()).collect()
}
/// Get standard deviation
pub fn std(&self) -> Vec<f32> {
self.log_var.iter().map(|&lv| (0.5 * lv).exp()).collect()
}
}
/// KL Divergence computations
#[derive(Debug, Clone)]
pub struct KLDivergence;
impl KLDivergence {
/// KL(N(mu, sigma^2) || N(0, 1))
/// = 0.5 * sum(exp(log_var) + mu^2 - 1 - log_var)
pub fn gaussian_to_unit(gaussian: &DiagonalGaussian) -> f32 {
let n = gaussian.mean.len();
let mut kl = 0.0f32;
for i in 0..n {
let mu = gaussian.mean[i];
let lv = gaussian.log_var[i];
let var = lv.exp();
kl += var + mu * mu - 1.0 - lv;
}
0.5 * kl
}
/// KL(N(mu, sigma^2) || N(0, 1)) from separate arrays
pub fn gaussian_to_unit_arrays(mean: &[f32], log_var: &[f32]) -> f32 {
let n = mean.len().min(log_var.len());
let mut kl = 0.0f32;
for i in 0..n {
let mu = mean[i];
let lv = log_var[i];
let var = lv.exp();
kl += var + mu * mu - 1.0 - lv;
}
0.5 * kl
}
/// KL(N(mu1, sigma1^2) || N(mu2, sigma2^2))
/// = 0.5 * sum(log(var2/var1) + (var1 + (mu1-mu2)^2)/var2 - 1)
pub fn gaussian_to_gaussian(q: &DiagonalGaussian, p: &DiagonalGaussian) -> f32 {
let n = q.mean.len().min(p.mean.len());
let mut kl = 0.0f32;
for i in 0..n {
let mu_q = q.mean[i];
let mu_p = p.mean[i];
let lv_q = q.log_var[i];
let lv_p = p.log_var[i];
let var_q = lv_q.exp();
let var_p = lv_p.exp().max(1e-8);
let log_ratio = lv_p - lv_q;
let diff = mu_q - mu_p;
kl += log_ratio + (var_q + diff * diff) / var_p - 1.0;
}
0.5 * kl
}
/// KL divergence between categorical distributions
/// KL(p || q) = sum(p * log(p/q))
pub fn categorical(p: &[f32], q: &[f32]) -> f32 {
let n = p.len().min(q.len());
let mut kl = 0.0f32;
let eps = 1e-10;
for i in 0..n {
let pi = p[i].max(eps);
let qi = q[i].max(eps);
if pi > eps {
kl += pi * (pi / qi).ln();
}
}
kl.max(0.0)
}
    /// Jensen-Shannon divergence (symmetrized, bounded KL)
    /// JS(p, q) = 0.5 * (KL(p || m) + KL(q || m)) where m = (p+q)/2
pub fn jensen_shannon(p: &[f32], q: &[f32]) -> f32 {
let n = p.len().min(q.len());
let mut m = vec![0.0f32; n];
for i in 0..n {
m[i] = 0.5 * (p[i] + q[i]);
}
0.5 * (Self::categorical(p, &m) + Self::categorical(q, &m))
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_kl_to_unit() {
// Unit Gaussian should have KL = 0
let unit = DiagonalGaussian::unit(4);
let kl = KLDivergence::gaussian_to_unit(&unit);
assert!(kl.abs() < 1e-5);
}
#[test]
fn test_kl_nonzero() {
let g = DiagonalGaussian::new(vec![1.0, 0.5, -0.5], vec![0.5, 0.0, -0.5]);
let kl = KLDivergence::gaussian_to_unit(&g);
assert!(kl > 0.0);
}
#[test]
fn test_kl_arrays() {
let mean = vec![0.0, 0.0];
let log_var = vec![0.0, 0.0];
let kl = KLDivergence::gaussian_to_unit_arrays(&mean, &log_var);
assert!(kl.abs() < 1e-5);
}
#[test]
fn test_categorical_kl() {
let p = vec![0.5, 0.5];
let q = vec![0.5, 0.5];
let kl = KLDivergence::categorical(&p, &q);
assert!(kl.abs() < 1e-5);
let q2 = vec![0.9, 0.1];
let kl2 = KLDivergence::categorical(&p, &q2);
assert!(kl2 > 0.0);
}
#[test]
fn test_jensen_shannon() {
let p = vec![0.5, 0.5];
let q = vec![0.5, 0.5];
let js = KLDivergence::jensen_shannon(&p, &q);
assert!(js.abs() < 1e-5);
}
#[test]
fn test_sample() {
let g = DiagonalGaussian::new(vec![0.0, 1.0], vec![0.0, 0.0]);
let epsilon = vec![0.0, 0.0];
let z = g.sample(&epsilon);
assert!((z[0] - 0.0).abs() < 1e-5);
assert!((z[1] - 1.0).abs() < 1e-5);
}
}
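The closed form in `gaussian_to_unit` has a useful sanity check: for a unit Gaussian every term cancels to zero, and shifting one coordinate's mean by `mu` adds exactly `0.5 * mu^2`. A minimal sketch of the same formula:

```rust
// Per-coordinate KL(N(mu, sigma^2) || N(0, 1)), summed:
// 0.5 * sum(exp(log_var) + mu^2 - 1 - log_var)
fn kl_to_unit(mean: &[f32], log_var: &[f32]) -> f32 {
    mean.iter()
        .zip(log_var)
        .map(|(&mu, &lv)| lv.exp() + mu * mu - 1.0 - lv)
        .sum::<f32>()
        * 0.5
}

fn main() {
    // Unit Gaussian: KL is exactly zero.
    assert!(kl_to_unit(&[0.0, 0.0], &[0.0, 0.0]).abs() < 1e-6);
    // Shifting the mean by 1 (var still 1) adds 0.5 * mu^2 = 0.5.
    let kl = kl_to_unit(&[1.0], &[0.0]);
    assert!((kl - 0.5).abs() < 1e-6);
    println!("ok");
}
```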

@@ -0,0 +1,29 @@
//! Information Bottleneck
//!
//! Variational Information Bottleneck (VIB) components for attention.
//!
//! ## Key Concepts
//!
//! 1. **KL Divergence**: Measure compression quality
//! 2. **Rate-Distortion**: Balance compression vs. reconstruction
//! 3. **Per-Layer Bottleneck**: Add IB loss term to each attention layer
//!
//! ## Applications
//!
//! - Preventing attention from memorizing noise
//! - Encouraging sparse, meaningful attention patterns
//! - Regularizing attention weights
mod bottleneck;
mod kl_divergence;
pub use bottleneck::{IBConfig, InformationBottleneck};
pub use kl_divergence::{DiagonalGaussian, KLDivergence};
#[cfg(test)]
mod tests {
#[test]
fn test_module_exists() {
assert!(true);
}
}

@@ -0,0 +1,241 @@
//! Fisher Information Metric
//!
//! The Fisher metric on the probability simplex:
//! F = diag(p) - p*p^T
//!
//! This gives the natural geometry for probability distributions.
use serde::{Deserialize, Serialize};
/// Fisher metric configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FisherConfig {
/// Regularization epsilon for numerical stability
pub eps: f32,
/// Maximum CG iterations
pub max_iters: usize,
/// Convergence threshold
pub tol: f32,
}
impl Default for FisherConfig {
fn default() -> Self {
Self {
eps: 1e-8,
max_iters: 10,
tol: 1e-6,
}
}
}
/// Fisher metric operations
#[derive(Debug, Clone)]
pub struct FisherMetric {
config: FisherConfig,
}
impl FisherMetric {
/// Create new Fisher metric
pub fn new(config: FisherConfig) -> Self {
Self { config }
}
/// Apply Fisher matrix to vector: F*v = diag(p)*v - p*(p^T*v)
/// This is O(n) instead of O(n^2)
#[inline]
pub fn apply(&self, probs: &[f32], v: &[f32]) -> Vec<f32> {
let n = probs.len().min(v.len()); // Security: bounds check
if n == 0 {
return vec![];
}
// Compute p^T * v
let pv = Self::dot_simd(probs, v);
// F*v = diag(p)*v - p*(p^T*v)
let mut result = vec![0.0f32; n];
for i in 0..n {
result[i] = probs[i] * v[i] - probs[i] * pv;
}
result
}
/// Apply inverse Fisher (approximately) using diagonal preconditioning
/// F^{-1} ≈ diag(1/p) for small perturbations
#[inline]
pub fn apply_inverse_approx(&self, probs: &[f32], v: &[f32]) -> Vec<f32> {
let n = probs.len().min(v.len()); // Security: bounds check
if n == 0 {
return vec![];
}
let mut result = vec![0.0f32; n];
for i in 0..n {
let p = probs[i].max(self.config.eps);
result[i] = v[i] / p;
}
// Project to sum-zero (tangent space of simplex)
let mean: f32 = result.iter().sum::<f32>() / n as f32;
for i in 0..n {
result[i] -= mean;
}
result
}
/// Solve F*x = b using conjugate gradient
/// Returns x such that probs[i]*x[i] - probs[i]*sum(probs[j]*x[j]) ≈ b[i]
pub fn solve_cg(&self, probs: &[f32], b: &[f32]) -> Vec<f32> {
let n = probs.len().min(b.len()); // Security: bounds check
if n == 0 {
return vec![];
}
// Project b to sum-zero (must be in tangent space)
let mut b_proj = b[..n].to_vec();
let b_mean: f32 = b_proj.iter().sum::<f32>() / n as f32;
for i in 0..n {
b_proj[i] -= b_mean;
}
// CG iteration
let mut x = vec![0.0f32; n];
let mut r = b_proj.clone();
let mut d = r.clone();
let mut rtr = Self::dot_simd(&r, &r);
if rtr < self.config.tol {
return x;
}
for _ in 0..self.config.max_iters {
let fd = self.apply(probs, &d);
let dfd = Self::dot_simd(&d, &fd).max(self.config.eps);
let alpha = rtr / dfd;
for i in 0..n {
x[i] += alpha * d[i];
r[i] -= alpha * fd[i];
}
let rtr_new = Self::dot_simd(&r, &r);
if rtr_new < self.config.tol {
break;
}
let beta = rtr_new / rtr.max(self.config.eps); // Security: prevent division by zero
for i in 0..n {
d[i] = r[i] + beta * d[i];
}
rtr = rtr_new;
}
x
}
/// Compute Fisher-Rao distance between two probability distributions
/// d_FR(p, q) = 2 * arccos(sum(sqrt(p_i * q_i)))
pub fn fisher_rao_distance(&self, p: &[f32], q: &[f32]) -> f32 {
let n = p.len().min(q.len());
let mut bhattacharyya = 0.0f32;
for i in 0..n {
let pi = p[i].max(self.config.eps);
let qi = q[i].max(self.config.eps);
bhattacharyya += (pi * qi).sqrt();
}
// Clamp for numerical stability
let cos_half = bhattacharyya.clamp(0.0, 1.0);
2.0 * cos_half.acos()
}
/// SIMD-friendly dot product
#[inline(always)]
fn dot_simd(a: &[f32], b: &[f32]) -> f32 {
let len = a.len().min(b.len());
let chunks = len / 4;
let remainder = len % 4;
let mut sum0 = 0.0f32;
let mut sum1 = 0.0f32;
let mut sum2 = 0.0f32;
let mut sum3 = 0.0f32;
for i in 0..chunks {
let base = i * 4;
sum0 += a[base] * b[base];
sum1 += a[base + 1] * b[base + 1];
sum2 += a[base + 2] * b[base + 2];
sum3 += a[base + 3] * b[base + 3];
}
let base = chunks * 4;
for i in 0..remainder {
sum0 += a[base + i] * b[base + i];
}
sum0 + sum1 + sum2 + sum3
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_fisher_apply() {
let fisher = FisherMetric::new(FisherConfig::default());
// Uniform distribution
let p = vec![0.25, 0.25, 0.25, 0.25];
let v = vec![1.0, 0.0, 0.0, -1.0];
let fv = fisher.apply(&p, &v);
// F*v should be in tangent space (sum to ~0)
let sum: f32 = fv.iter().sum();
assert!(sum.abs() < 1e-5);
}
#[test]
fn test_fisher_cg_solve() {
let fisher = FisherMetric::new(FisherConfig::default());
let p = vec![0.4, 0.3, 0.2, 0.1];
let b = vec![0.1, -0.05, -0.02, -0.03]; // sum-zero
let x = fisher.solve_cg(&p, &b);
// F*x should approximately equal b
let fx = fisher.apply(&p, &x);
for i in 0..4 {
assert!((fx[i] - b[i]).abs() < 0.1);
}
}
#[test]
fn test_fisher_rao_distance() {
let fisher = FisherMetric::new(FisherConfig::default());
let p = vec![0.5, 0.5];
let q = vec![0.5, 0.5];
// Same distribution = 0 distance
let d = fisher.fisher_rao_distance(&p, &q);
assert!(d.abs() < 1e-5);
// Different distributions
let q2 = vec![0.9, 0.1];
let d2 = fisher.fisher_rao_distance(&p, &q2);
assert!(d2 > 0.0);
}
}
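The O(n) identity behind `FisherMetric::apply` — `F*v = diag(p)*v - p*(p^T*v)` — can be verified against the dense O(n^2) matrix `F = diag(p) - p*p^T`. A self-contained cross-check (function names are illustrative):

```rust
// O(n) Fisher-vector product: (F*v)_i = p_i*v_i - p_i * sum_j(p_j*v_j).
fn fisher_apply_fast(p: &[f32], v: &[f32]) -> Vec<f32> {
    let pv: f32 = p.iter().zip(v).map(|(a, b)| a * b).sum();
    p.iter().zip(v).map(|(&pi, &vi)| pi * vi - pi * pv).collect()
}

// Dense O(n^2) reference: build F_ij = p_i*delta_ij - p_i*p_j explicitly.
fn fisher_apply_dense(p: &[f32], v: &[f32]) -> Vec<f32> {
    let n = p.len();
    (0..n)
        .map(|i| {
            (0..n)
                .map(|j| {
                    let f_ij = if i == j { p[i] - p[i] * p[j] } else { -p[i] * p[j] };
                    f_ij * v[j]
                })
                .sum::<f32>()
        })
        .collect()
}

fn main() {
    let p = [0.4f32, 0.3, 0.2, 0.1];
    let v = [1.0f32, -0.5, 0.25, 2.0];
    let fast = fisher_apply_fast(&p, &v);
    let dense = fisher_apply_dense(&p, &v);
    for (a, b) in fast.iter().zip(&dense) {
        assert!((a - b).abs() < 1e-6);
    }
    println!("ok");
}
```

Since the rows of F sum to zero whenever `p` is a probability vector, `F*v` always lands in the sum-zero tangent space of the simplex, which is what `test_fisher_apply` above asserts.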

@@ -0,0 +1,29 @@
//! Information Geometry for Attention
//!
//! Natural gradient methods using Fisher information metric.
//!
//! ## Key Concepts
//!
//! 1. **Fisher Metric**: F = diag(p) - p*p^T on probability simplex
//! 2. **Natural Gradient**: Solve F*delta = grad, then update params -= lr*delta
//! 3. **Conjugate Gradient**: Efficient solver for Fisher system
//!
//! ## Use Cases
//!
//! - Training attention weights with proper geometry
//! - Routing probabilities in MoE
//! - Softmax logit optimization
mod fisher;
mod natural_gradient;
pub use fisher::{FisherConfig, FisherMetric};
pub use natural_gradient::{NaturalGradient, NaturalGradientConfig};
#[cfg(test)]
mod tests {
    #[test]
    fn test_exports_constructible() {
        // Smoke test: the re-exported types can be constructed
        use super::{FisherConfig, FisherMetric};
        let _ = FisherMetric::new(FisherConfig::default());
    }
}

View File

@@ -0,0 +1,159 @@
//! Natural Gradient Descent
//!
//! Update parameters using the natural gradient: F^{-1} * grad
//! where F is the Fisher information matrix.
use super::fisher::{FisherConfig, FisherMetric};
use serde::{Deserialize, Serialize};
/// Natural gradient configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NaturalGradientConfig {
/// Learning rate
pub lr: f32,
/// Fisher metric config
pub fisher: FisherConfig,
/// Use diagonal approximation (faster but less accurate)
pub use_diagonal: bool,
}
impl Default for NaturalGradientConfig {
fn default() -> Self {
Self {
lr: 0.1,
fisher: FisherConfig::default(),
use_diagonal: false,
}
}
}
/// Natural gradient optimizer
#[derive(Debug, Clone)]
pub struct NaturalGradient {
config: NaturalGradientConfig,
fisher: FisherMetric,
}
impl NaturalGradient {
/// Create new natural gradient optimizer
pub fn new(config: NaturalGradientConfig) -> Self {
let fisher = FisherMetric::new(config.fisher.clone());
Self { config, fisher }
}
/// Compute natural gradient step for logits
/// Returns updated logits
pub fn step_logits(&self, logits: &[f32], grad_logits: &[f32]) -> Vec<f32> {
let probs = Self::softmax(logits);
// Compute natural gradient direction
let nat_grad = if self.config.use_diagonal {
self.fisher.apply_inverse_approx(&probs, grad_logits)
} else {
self.fisher.solve_cg(&probs, grad_logits)
};
// Update logits
let mut new_logits = logits.to_vec();
for i in 0..new_logits.len() {
new_logits[i] -= self.config.lr * nat_grad[i];
}
new_logits
}
/// Compute natural gradient step for general parameters with diagonal Fisher
/// Fisher diag should be pre-computed from data
pub fn step_diagonal(&self, params: &[f32], grads: &[f32], fisher_diag: &[f32]) -> Vec<f32> {
let n = params.len();
let mut new_params = params.to_vec();
let eps = self.config.fisher.eps;
for i in 0..n {
let f_inv = 1.0 / (fisher_diag[i].abs() + eps);
new_params[i] -= self.config.lr * grads[i] * f_inv;
}
new_params
}
/// Compute natural gradient for attention logits
/// Uses the Fisher metric on the output probability distribution
pub fn step_attention_logits(&self, logits: &[f32], grad_logits: &[f32]) -> Vec<f32> {
self.step_logits(logits, grad_logits)
}
/// Stable softmax
fn softmax(logits: &[f32]) -> Vec<f32> {
if logits.is_empty() {
return vec![];
}
let max_logit = logits.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
let exp_logits: Vec<f32> = logits.iter().map(|&l| (l - max_logit).exp()).collect();
let sum: f32 = exp_logits.iter().sum();
if sum > 0.0 {
exp_logits.iter().map(|&e| e / sum).collect()
} else {
vec![1.0 / logits.len() as f32; logits.len()]
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_natural_gradient_step() {
let config = NaturalGradientConfig {
lr: 0.1,
..Default::default()
};
let ng = NaturalGradient::new(config);
let logits = vec![1.0, 2.0, 0.5, 0.5];
let grads = vec![0.1, -0.1, 0.05, -0.05];
let new_logits = ng.step_logits(&logits, &grads);
assert_eq!(new_logits.len(), 4);
// Should be different from original
assert!(
(new_logits[0] - logits[0]).abs() > 1e-6 || (new_logits[1] - logits[1]).abs() > 1e-6
);
}
#[test]
fn test_diagonal_step() {
let ng = NaturalGradient::new(NaturalGradientConfig::default());
let params = vec![1.0, 2.0, 3.0];
let grads = vec![0.1, 0.1, 0.1]; // Equal gradients
let fisher_diag = vec![1.0, 2.0, 0.5]; // Different Fisher values
let new_params = ng.step_diagonal(&params, &grads, &fisher_diag);
assert_eq!(new_params.len(), 3);
// Larger Fisher = smaller step (with equal gradients)
let step0 = (new_params[0] - params[0]).abs();
let step1 = (new_params[1] - params[1]).abs();
let step2 = (new_params[2] - params[2]).abs();
// Fisher[1] > Fisher[0] > Fisher[2], so step1 < step0 < step2
assert!(step1 < step0);
assert!(step0 < step2);
}
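    // A magnitude sanity sketch (hypothetical test, not an API contract):
    // with a unit Fisher diagonal the step is lr * grad / (1 + eps), i.e.
    // at most lr * grad = 0.01 here; the exact value depends on the
    // default eps, so only a one-sided bound is asserted.
    #[test]
    fn test_diagonal_step_magnitude() {
        let ng = NaturalGradient::new(NaturalGradientConfig::default());
        let new_params = ng.step_diagonal(&[1.0], &[0.1], &[1.0]);
        let step = 1.0 - new_params[0];
        assert!(step > 0.0);
        assert!(step <= 0.01 + 1e-6);
    }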
#[test]
fn test_attention_logits_step() {
let ng = NaturalGradient::new(NaturalGradientConfig::default());
let logits = vec![0.0; 10];
let grads = vec![0.1; 10];
let new_logits = ng.step_attention_logits(&logits, &grads);
assert_eq!(new_logits.len(), 10);
}
}

View File

@@ -0,0 +1,176 @@
//! # ruvector-attention
//!
//! Attention mechanisms for ruvector, including geometric, graph, and sparse attention.
//!
//! This crate provides efficient implementations of various attention mechanisms:
//! - Scaled dot-product attention
//! - Multi-head attention with parallel processing
//! - Graph attention for GNN applications
//! - Geometric attention in hyperbolic spaces
//! - Sparse attention patterns
//!
//! ## Features
//!
//! - **SIMD Acceleration**: Optional SIMD optimizations for performance
//! - **Parallel Processing**: Rayon-based parallel head computation
//! - **WASM Support**: WebAssembly compilation support
//! - **NAPI Bindings**: Node.js bindings for JavaScript integration
//!
//! ## Example
//!
//! ```rust
//! use ruvector_attention::{
//! attention::ScaledDotProductAttention,
//! traits::Attention,
//! };
//!
//! // Create scaled dot-product attention
//! let attention = ScaledDotProductAttention::new(512);
//!
//! // Prepare inputs
//! let query = vec![1.0; 512];
//! let keys = vec![vec![0.5; 512], vec![0.3; 512]];
//! let values = vec![vec![1.0; 512], vec![2.0; 512]];
//!
//! let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
//! let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
//!
//! // Compute attention
//! let output = attention.compute(&query, &keys_refs, &values_refs).unwrap();
//! assert_eq!(output.len(), 512);
//! ```
pub mod attention;
pub mod config;
pub mod error;
pub mod graph;
pub mod hyperbolic;
pub mod moe;
pub mod sdk;
pub mod sparse;
pub mod training;
pub mod traits;
pub mod utils;
// Advanced attention mechanisms
pub mod curvature;
pub mod topology;
pub mod transport;
// Mathematical foundations
pub mod info_bottleneck;
pub mod info_geometry;
pub mod pde_attention;
pub mod unified_report;
// Sheaf attention (Coherence-Gated Transformer per ADR-015)
#[cfg(feature = "sheaf")]
pub mod sheaf;
// Re-export main types
pub use attention::{MultiHeadAttention, ScaledDotProductAttention};
pub use config::{AttentionConfig, GraphAttentionConfig, SparseAttentionConfig};
pub use error::{AttentionError, AttentionResult};
pub use hyperbolic::{
exp_map, log_map, mobius_add, poincare_distance, project_to_ball, HyperbolicAttention,
HyperbolicAttentionConfig, MixedCurvatureAttention, MixedCurvatureConfig,
};
pub use traits::{
Attention, EdgeInfo, GeometricAttention, Gradients, GraphAttention, SparseAttention,
SparseMask, TrainableAttention,
};
// Sparse attention exports
pub use sparse::{
AttentionMask, FlashAttention, LinearAttention, LocalGlobalAttention, SparseMaskBuilder,
};
// MoE exports
pub use moe::{
Expert, ExpertType, HyperbolicExpert, LearnedRouter, LinearExpert, MoEAttention, MoEConfig,
Router, StandardExpert, TopKRouting,
};
// Graph attention exports
pub use graph::{
DualSpaceAttention, DualSpaceConfig, EdgeFeaturedAttention, EdgeFeaturedConfig, GraphRoPE,
RoPEConfig,
};
// Training exports
pub use training::{
Adam, AdamW, CurriculumScheduler, CurriculumStage, DecayType, HardNegativeMiner, InfoNCELoss,
LocalContrastiveLoss, Loss, MiningStrategy, NegativeMiner, Optimizer, Reduction,
SpectralRegularization, TemperatureAnnealing, SGD,
};
// SDK exports
pub use sdk::{presets, AttentionBuilder, AttentionPipeline};
// Transport (OT-based attention) exports
pub use transport::{
CentroidCache, CentroidOTAttention, CentroidOTConfig, ProjectionCache,
SlicedWassersteinAttention, SlicedWassersteinConfig, WindowCache,
};
// Curvature (Mixed curvature attention) exports
pub use curvature::{
ComponentQuantizer, FusedCurvatureConfig, MixedCurvatureCache, MixedCurvatureFusedAttention,
QuantizationConfig, QuantizedVector, TangentSpaceConfig, TangentSpaceMapper,
};
// Topology (Gated attention) exports
pub use topology::{
AttentionMode, AttentionPolicy, CoherenceMetric, PolicyConfig, TopologyGatedAttention,
TopologyGatedConfig, WindowCoherence,
};
// Information Geometry exports
pub use info_geometry::{FisherConfig, FisherMetric, NaturalGradient, NaturalGradientConfig};
// Information Bottleneck exports
pub use info_bottleneck::{DiagonalGaussian, IBConfig, InformationBottleneck, KLDivergence};
// PDE Attention exports
pub use pde_attention::{DiffusionAttention, DiffusionConfig, GraphLaplacian, LaplacianType};
// Sheaf Attention exports (Coherence-Gated Transformer per ADR-015)
#[cfg(feature = "sheaf")]
pub use sheaf::{
process_with_early_exit, ComputeLane, EarlyExit, EarlyExitConfig, EarlyExitResult,
EarlyExitStatistics, ExitReason, LaneStatistics, ResidualSparseMask, RestrictionMap,
RestrictionMapConfig, RoutingDecision, SheafAttention, SheafAttentionConfig,
SparseResidualAttention, SparseResidualConfig, SparsityStatistics, TokenRouter,
TokenRouterConfig,
};
// Unified Report exports
pub use unified_report::{
AttentionRecommendation, GeometryReport, MetricType, MetricValue, ReportBuilder, ReportConfig,
};
/// Library version
pub const VERSION: &str = env!("CARGO_PKG_VERSION");
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_version() {
assert!(!VERSION.is_empty());
}
#[test]
fn test_basic_attention_workflow() {
let config = AttentionConfig::builder()
.dim(64)
.num_heads(4)
.build()
.unwrap();
assert_eq!(config.dim, 64);
assert_eq!(config.num_heads, 4);
assert_eq!(config.head_dim(), 16);
}
}

View File

@@ -0,0 +1,299 @@
//! Expert implementations for MoE attention
use crate::error::AttentionResult;
use crate::utils::stable_softmax;
/// Type of expert
#[derive(Clone, Debug, PartialEq)]
pub enum ExpertType {
/// Standard scaled dot-product
Standard,
/// Hyperbolic attention
Hyperbolic,
/// Linear attention
Linear,
}
/// Expert trait for attention computation
pub trait Expert: Send + Sync {
/// Compute attention for this expert
fn compute(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
) -> AttentionResult<Vec<f32>>;
/// Get expert type
fn expert_type(&self) -> ExpertType;
/// Get dimension
fn dim(&self) -> usize;
}
/// Standard scaled dot-product expert
pub struct StandardExpert {
dim: usize,
scale: f32,
}
impl StandardExpert {
pub fn new(dim: usize) -> Self {
Self {
dim,
scale: 1.0 / (dim as f32).sqrt(),
}
}
}
impl Expert for StandardExpert {
fn compute(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
) -> AttentionResult<Vec<f32>> {
// Compute attention scores
let scores: Vec<f32> = keys
.iter()
.map(|k| {
query
.iter()
.zip(k.iter())
.map(|(q, ki)| q * ki)
.sum::<f32>()
* self.scale
})
.collect();
// Softmax
let weights = stable_softmax(&scores);
// Weighted sum
let mut output = vec![0.0f32; self.dim];
for (weight, value) in weights.iter().zip(values.iter()) {
for (o, v) in output.iter_mut().zip(value.iter()) {
*o += weight * v;
}
}
Ok(output)
}
fn expert_type(&self) -> ExpertType {
ExpertType::Standard
}
fn dim(&self) -> usize {
self.dim
}
}
/// Hyperbolic expert scoring keys by negative Poincaré distance,
/// d_c(u, v) = (1/sqrt(c)) * acosh(1 + 2c*||u - v||^2 / ((1 - c*||u||^2)(1 - c*||v||^2))),
/// the geodesic distance in the Poincaré ball of curvature -c.
pub struct HyperbolicExpert {
dim: usize,
curvature: f32,
}
impl HyperbolicExpert {
pub fn new(dim: usize, curvature: f32) -> Self {
Self { dim, curvature }
}
fn poincare_distance(&self, u: &[f32], v: &[f32]) -> f32 {
let c = self.curvature.abs();
let sqrt_c = c.sqrt();
let diff_sq: f32 = u.iter().zip(v.iter()).map(|(a, b)| (a - b).powi(2)).sum();
let norm_u_sq: f32 = u.iter().map(|x| x * x).sum();
let norm_v_sq: f32 = v.iter().map(|x| x * x).sum();
let denom = (1.0 - c * norm_u_sq).max(1e-7) * (1.0 - c * norm_v_sq).max(1e-7);
let arg = 1.0 + 2.0 * c * diff_sq / denom;
(1.0 / sqrt_c) * arg.max(1.0).acosh()
}
}
impl Expert for HyperbolicExpert {
fn compute(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
) -> AttentionResult<Vec<f32>> {
// Use negative Poincaré distance as similarity
let scores: Vec<f32> = keys
.iter()
.map(|k| -self.poincare_distance(query, k))
.collect();
let weights = stable_softmax(&scores);
let mut output = vec![0.0f32; self.dim];
for (weight, value) in weights.iter().zip(values.iter()) {
for (o, v) in output.iter_mut().zip(value.iter()) {
*o += weight * v;
}
}
Ok(output)
}
fn expert_type(&self) -> ExpertType {
ExpertType::Hyperbolic
}
fn dim(&self) -> usize {
self.dim
}
}
/// Linear attention expert using positive random features in the style of
/// Performer: phi(x)_i = exp(w_i . x - ||x||^2 / 2) / sqrt(m), giving an
/// unbiased softmax-kernel estimate and attention cost linear in sequence length.
pub struct LinearExpert {
dim: usize,
num_features: usize,
random_features: Vec<f32>,
}
impl LinearExpert {
pub fn new(dim: usize, num_features: usize) -> Self {
use std::f32::consts::PI;
        // Deterministic Gaussian projections via an LCG and the Box-Muller transform
let mut features = Vec::with_capacity(num_features * dim);
let mut seed = 123u64;
for _ in 0..((num_features * dim + 1) / 2) {
seed = seed.wrapping_mul(6364136223846793005).wrapping_add(1);
let u1 = (seed as f32) / (u64::MAX as f32);
seed = seed.wrapping_mul(6364136223846793005).wrapping_add(1);
let u2 = (seed as f32) / (u64::MAX as f32);
let r = (-2.0 * u1.max(1e-10).ln()).sqrt();
let theta = 2.0 * PI * u2;
features.push(r * theta.cos() / (dim as f32).sqrt());
if features.len() < num_features * dim {
features.push(r * theta.sin() / (dim as f32).sqrt());
}
}
features.truncate(num_features * dim);
Self {
dim,
num_features,
random_features: features,
}
}
fn feature_map(&self, x: &[f32]) -> Vec<f32> {
(0..self.num_features)
.map(|i| {
let proj: f32 = x
.iter()
.enumerate()
.map(|(j, &xj)| xj * self.random_features[i * self.dim + j])
.sum();
let norm_sq: f32 = x.iter().map(|xi| xi * xi).sum();
(proj - norm_sq / 2.0).exp() / (self.num_features as f32).sqrt()
})
.collect()
}
}
impl Expert for LinearExpert {
fn compute(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
) -> AttentionResult<Vec<f32>> {
let phi_q = self.feature_map(query);
        let value_dim = values.first().map(|v| v.len()).unwrap_or(self.dim);
let mut kv_sum = vec![0.0f32; self.num_features * value_dim];
let mut k_sum = vec![0.0f32; self.num_features];
for (key, value) in keys.iter().zip(values.iter()) {
let phi_k = self.feature_map(key);
for (i, &phi_ki) in phi_k.iter().enumerate() {
for (j, &vj) in value.iter().enumerate() {
kv_sum[i * value_dim + j] += phi_ki * vj;
}
k_sum[i] += phi_ki;
}
}
let mut output = vec![0.0f32; value_dim];
let mut normalizer = 0.0f32;
for (i, &phi_qi) in phi_q.iter().enumerate() {
for (j, out_j) in output.iter_mut().enumerate() {
*out_j += phi_qi * kv_sum[i * value_dim + j];
}
normalizer += phi_qi * k_sum[i];
}
if normalizer.abs() > 1e-8 {
output.iter_mut().for_each(|x| *x /= normalizer);
}
Ok(output)
}
fn expert_type(&self) -> ExpertType {
ExpertType::Linear
}
fn dim(&self) -> usize {
self.dim
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_standard_expert() {
let expert = StandardExpert::new(64);
let query = vec![0.5; 64];
let keys: Vec<Vec<f32>> = vec![vec![0.3; 64]; 10];
let values: Vec<Vec<f32>> = vec![vec![1.0; 64]; 10];
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
let result = expert.compute(&query, &keys_refs, &values_refs).unwrap();
assert_eq!(result.len(), 64);
}
#[test]
fn test_hyperbolic_expert() {
let expert = HyperbolicExpert::new(32, 1.0);
let query = vec![0.1; 32]; // Small values to stay in ball
let keys: Vec<Vec<f32>> = vec![vec![0.1; 32]; 5];
let values: Vec<Vec<f32>> = vec![vec![1.0; 32]; 5];
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
let result = expert.compute(&query, &keys_refs, &values_refs).unwrap();
assert_eq!(result.len(), 32);
}
#[test]
fn test_linear_expert() {
let expert = LinearExpert::new(64, 32);
let query = vec![0.5; 64];
let keys: Vec<Vec<f32>> = vec![vec![0.3; 64]; 10];
let values: Vec<Vec<f32>> = vec![vec![1.0; 64]; 10];
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
let result = expert.compute(&query, &keys_refs, &values_refs).unwrap();
assert_eq!(result.len(), 64);
}
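    // Normalization sanity sketch (hypothetical test): when every value is
    // the same vector, any properly normalized attention must return that
    // vector, since the kernel weights cancel against the normalizer.
    #[test]
    fn test_linear_expert_normalization() {
        let expert = LinearExpert::new(16, 8);
        let query = vec![0.2; 16];
        let keys: Vec<Vec<f32>> = vec![vec![0.1; 16]; 4];
        let values: Vec<Vec<f32>> = vec![vec![2.0; 16]; 4];
        let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
        let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
        let out = expert.compute(&query, &keys_refs, &values_refs).unwrap();
        for o in &out {
            assert!((o - 2.0).abs() < 1e-3);
        }
    }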
}

View File

@@ -0,0 +1,11 @@
//! Mixture of Experts (MoE) attention mechanisms
//!
//! This module provides MoE attention where different inputs route to specialized experts.
pub mod expert;
pub mod moe_attention;
pub mod router;
pub use expert::{Expert, ExpertType, HyperbolicExpert, LinearExpert, StandardExpert};
pub use moe_attention::{MoEAttention, MoEConfig};
pub use router::{LearnedRouter, Router, TopKRouting};

View File

@@ -0,0 +1,262 @@
//! Mixture of Experts attention layer
use super::expert::{Expert, HyperbolicExpert, LinearExpert, StandardExpert};
use super::router::{LearnedRouter, Router, TopKRouting};
use crate::error::{AttentionError, AttentionResult};
use crate::traits::Attention;
/// MoE configuration
#[derive(Clone, Debug)]
pub struct MoEConfig {
pub dim: usize,
pub num_experts: usize,
pub top_k: usize,
pub expert_capacity: f32,
pub jitter_noise: f32,
}
impl Default for MoEConfig {
fn default() -> Self {
Self {
dim: 256,
num_experts: 4,
top_k: 2,
expert_capacity: 1.25,
jitter_noise: 0.0,
}
}
}
impl MoEConfig {
pub fn builder() -> MoEConfigBuilder {
MoEConfigBuilder::default()
}
}
#[derive(Default)]
pub struct MoEConfigBuilder {
config: MoEConfig,
}
impl MoEConfigBuilder {
pub fn dim(mut self, dim: usize) -> Self {
self.config.dim = dim;
self
}
pub fn num_experts(mut self, n: usize) -> Self {
self.config.num_experts = n;
self
}
pub fn top_k(mut self, k: usize) -> Self {
self.config.top_k = k;
self
}
pub fn expert_capacity(mut self, c: f32) -> Self {
self.config.expert_capacity = c;
self
}
pub fn jitter_noise(mut self, j: f32) -> Self {
self.config.jitter_noise = j;
self
}
pub fn build(self) -> MoEConfig {
self.config
}
}
/// Mixture of Experts attention
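///
/// # Example
///
/// A minimal usage sketch, mirroring the unit tests below:
///
/// ```
/// use ruvector_attention::{Attention, MoEAttention, MoEConfig};
///
/// let moe = MoEAttention::new(MoEConfig::builder().dim(64).num_experts(4).top_k(2).build());
/// let query = vec![0.5; 64];
/// let keys: Vec<Vec<f32>> = vec![vec![0.3; 64]; 10];
/// let values: Vec<Vec<f32>> = vec![vec![1.0; 64]; 10];
/// let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
/// let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
/// let output = moe.compute(&query, &keys_refs, &values_refs).unwrap();
/// assert_eq!(output.len(), 64);
/// ```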
pub struct MoEAttention {
experts: Vec<Box<dyn Expert>>,
router: LearnedRouter,
config: MoEConfig,
}
impl MoEAttention {
/// Create new MoE attention
pub fn new(config: MoEConfig) -> Self {
// Create diverse experts
let mut experts: Vec<Box<dyn Expert>> = Vec::new();
        // Build ceil(num_experts / 3) experts of each kind, then truncate
        // below to exactly num_experts
        let num_each = (config.num_experts + 2) / 3;
for _ in 0..num_each {
experts.push(Box::new(StandardExpert::new(config.dim)));
}
for _ in 0..num_each {
experts.push(Box::new(HyperbolicExpert::new(config.dim, 1.0)));
}
for _ in 0..num_each {
experts.push(Box::new(LinearExpert::new(config.dim, config.dim / 4)));
}
experts.truncate(config.num_experts);
let router = LearnedRouter::new(config.num_experts, config.dim, config.top_k);
Self {
experts,
router,
config,
}
}
/// Compute with auxiliary load balance loss
pub fn compute_with_loss(
&self,
queries: &[&[f32]],
keys: &[&[f32]],
values: &[&[f32]],
) -> AttentionResult<(Vec<Vec<f32>>, f32)> {
let mut outputs = Vec::with_capacity(queries.len());
let mut routing_decisions = Vec::with_capacity(queries.len());
for query in queries {
let routes = self.router.route(query);
routing_decisions.push(TopKRouting {
selections: routes.clone(),
});
let mut output = vec![0.0f32; self.config.dim];
for (expert_idx, weight) in routes {
let expert_output = self.experts[expert_idx].compute(query, keys, values)?;
for (o, e) in output.iter_mut().zip(expert_output.iter()) {
*o += weight * e;
}
}
outputs.push(output);
}
let loss = self.router.load_balance_loss(&routing_decisions);
Ok((outputs, loss))
}
/// Get expert usage statistics
pub fn expert_statistics(&self, routing_decisions: &[TopKRouting]) -> Vec<f32> {
self.router.expert_statistics(routing_decisions)
}
}
impl Attention for MoEAttention {
fn compute(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
) -> AttentionResult<Vec<f32>> {
if keys.is_empty() {
return Err(AttentionError::InvalidConfig("Empty keys".to_string()));
}
if query.len() != self.config.dim {
return Err(AttentionError::DimensionMismatch {
expected: self.config.dim,
actual: query.len(),
});
}
// Route query to experts
let routes = self.router.route(query);
// Compute weighted sum of expert outputs
let mut output = vec![0.0f32; self.config.dim];
for (expert_idx, weight) in routes {
let expert_output = self.experts[expert_idx].compute(query, keys, values)?;
for (o, e) in output.iter_mut().zip(expert_output.iter()) {
*o += weight * e;
}
}
Ok(output)
}
fn compute_with_mask(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
mask: Option<&[bool]>,
) -> AttentionResult<Vec<f32>> {
if let Some(m) = mask {
let filtered: Vec<(usize, bool)> = m
.iter()
.copied()
.enumerate()
.filter(|(_, keep)| *keep)
.collect();
let filtered_keys: Vec<&[f32]> = filtered.iter().map(|(i, _)| keys[*i]).collect();
let filtered_values: Vec<&[f32]> = filtered.iter().map(|(i, _)| values[*i]).collect();
self.compute(query, &filtered_keys, &filtered_values)
} else {
self.compute(query, keys, values)
}
}
fn dim(&self) -> usize {
self.config.dim
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_moe_attention() {
let config = MoEConfig::builder().dim(64).num_experts(4).top_k(2).build();
let moe = MoEAttention::new(config);
let query = vec![0.5; 64];
let keys: Vec<Vec<f32>> = vec![vec![0.3; 64]; 10];
let values: Vec<Vec<f32>> = vec![vec![1.0; 64]; 10];
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
let result = moe.compute(&query, &keys_refs, &values_refs).unwrap();
assert_eq!(result.len(), 64);
}
#[test]
fn test_moe_with_loss() {
let config = MoEConfig::builder().dim(32).num_experts(4).top_k(2).build();
let moe = MoEAttention::new(config);
let queries: Vec<Vec<f32>> = (0..10).map(|_| vec![0.5; 32]).collect();
let keys: Vec<Vec<f32>> = vec![vec![0.3; 32]; 5];
let values: Vec<Vec<f32>> = vec![vec![1.0; 32]; 5];
let query_refs: Vec<&[f32]> = queries.iter().map(|q| q.as_slice()).collect();
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
let (outputs, loss) = moe
.compute_with_loss(&query_refs, &keys_refs, &values_refs)
.unwrap();
assert_eq!(outputs.len(), 10);
assert!(loss >= 0.0);
}
#[test]
fn test_config_builder() {
let config = MoEConfig::builder()
.dim(128)
.num_experts(8)
.top_k(3)
.expert_capacity(1.5)
.jitter_noise(0.1)
.build();
assert_eq!(config.dim, 128);
assert_eq!(config.num_experts, 8);
assert_eq!(config.top_k, 3);
}
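    // Masking-contract sketch (hypothetical test): an all-true mask must be
    // equivalent to the unmasked path, since routing and experts here are
    // deterministic for a fixed input.
    #[test]
    fn test_moe_mask_identity() {
        let config = MoEConfig::builder().dim(16).num_experts(3).top_k(2).build();
        let moe = MoEAttention::new(config);
        let query = vec![0.5; 16];
        let keys: Vec<Vec<f32>> = vec![vec![0.3; 16]; 4];
        let values: Vec<Vec<f32>> = vec![vec![1.0; 16]; 4];
        let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
        let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
        let mask = vec![true; 4];
        let unmasked = moe.compute(&query, &keys_refs, &values_refs).unwrap();
        let masked = moe
            .compute_with_mask(&query, &keys_refs, &values_refs, Some(&mask))
            .unwrap();
        for (a, b) in unmasked.iter().zip(masked.iter()) {
            assert!((a - b).abs() < 1e-6);
        }
    }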
}

View File

@@ -0,0 +1,210 @@
//! Router implementations for MoE expert selection
use crate::utils::stable_softmax;
/// Router trait for expert selection
pub trait Router: Send + Sync {
/// Route input to experts, returning (expert_idx, weight) pairs
fn route(&self, x: &[f32]) -> Vec<(usize, f32)>;
/// Get number of experts
fn num_experts(&self) -> usize;
}
/// Top-K routing decision
#[derive(Clone, Debug)]
pub struct TopKRouting {
/// Selected experts with their normalized weights
pub selections: Vec<(usize, f32)>,
}
/// Learned router with softmax gating
pub struct LearnedRouter {
num_experts: usize,
dim: usize,
top_k: usize,
/// Gate weights: [num_experts x dim]
gate_weights: Vec<f32>,
}
impl LearnedRouter {
/// Create new learned router
pub fn new(num_experts: usize, dim: usize, top_k: usize) -> Self {
        // Xavier-style initialization: uniform in +/- sqrt(2 / (fan_in + fan_out))
        let scale = (2.0 / (dim + num_experts) as f32).sqrt();
let mut seed = 42u64;
let gate_weights: Vec<f32> = (0..num_experts * dim)
.map(|_| {
seed = seed.wrapping_mul(6364136223846793005).wrapping_add(1);
let u = (seed as f32) / (u64::MAX as f32);
(u - 0.5) * 2.0 * scale
})
.collect();
Self {
num_experts,
dim,
top_k: top_k.min(num_experts),
gate_weights,
}
}
/// Compute raw gate logits
fn compute_logits(&self, x: &[f32]) -> Vec<f32> {
(0..self.num_experts)
.map(|i| {
x.iter()
.enumerate()
.map(|(j, &xj)| xj * self.gate_weights[i * self.dim + j])
.sum()
})
.collect()
}
/// Compute gate probabilities
pub fn compute_gate(&self, x: &[f32]) -> Vec<f32> {
let logits = self.compute_logits(x);
stable_softmax(&logits)
}
/// Compute load balancing loss for batch
pub fn load_balance_loss(&self, routing_decisions: &[TopKRouting]) -> f32 {
if routing_decisions.is_empty() {
return 0.0;
}
let batch_size = routing_decisions.len() as f32;
// Count how many times each expert is used
let mut expert_counts = vec![0.0f32; self.num_experts];
let mut total_weight = vec![0.0f32; self.num_experts];
for decision in routing_decisions {
for &(expert_idx, weight) in &decision.selections {
expert_counts[expert_idx] += 1.0;
total_weight[expert_idx] += weight;
}
}
// Compute auxiliary loss: encourage uniform distribution
let _avg_count = expert_counts.iter().sum::<f32>() / self.num_experts as f32;
let _avg_weight = total_weight.iter().sum::<f32>() / self.num_experts as f32;
        // Variance-from-uniform penalty, in the spirit of the Switch
        // Transformer auxiliary load-balancing loss
let count_var: f32 = expert_counts
.iter()
.map(|c| (c / batch_size - 1.0 / self.num_experts as f32).powi(2))
.sum();
self.num_experts as f32 * count_var
}
/// Update gate weights (for training)
pub fn update_weights(&mut self, gradients: &[f32], learning_rate: f32) {
for (w, g) in self.gate_weights.iter_mut().zip(gradients.iter()) {
*w -= learning_rate * g;
}
}
/// Get expert usage statistics
pub fn expert_statistics(&self, routing_decisions: &[TopKRouting]) -> Vec<f32> {
let mut counts = vec![0.0f32; self.num_experts];
for decision in routing_decisions {
for &(expert_idx, _) in &decision.selections {
counts[expert_idx] += 1.0;
}
}
let total: f32 = counts.iter().sum();
if total > 0.0 {
counts.iter_mut().for_each(|c| *c /= total);
}
counts
}
}
impl Router for LearnedRouter {
fn route(&self, x: &[f32]) -> Vec<(usize, f32)> {
let probs = self.compute_gate(x);
// Get top-k indices
let mut indexed: Vec<(usize, f32)> = probs.into_iter().enumerate().collect();
indexed.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
// Take top-k and renormalize
let top_k: Vec<(usize, f32)> = indexed.into_iter().take(self.top_k).collect();
let sum: f32 = top_k.iter().map(|(_, p)| p).sum();
if sum > 1e-8 {
top_k.into_iter().map(|(i, p)| (i, p / sum)).collect()
} else {
// Fallback: uniform over top-k
top_k
.into_iter()
.map(|(i, _)| (i, 1.0 / self.top_k as f32))
.collect()
}
}
fn num_experts(&self) -> usize {
self.num_experts
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_learned_router() {
let router = LearnedRouter::new(4, 64, 2);
let x = vec![0.5; 64];
let routes = router.route(&x);
assert_eq!(routes.len(), 2);
// Weights should sum to 1
let sum: f32 = routes.iter().map(|(_, w)| w).sum();
assert!((sum - 1.0).abs() < 1e-5);
}
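    // Renormalization sketch (hypothetical test): with top_k = 1 the single
    // selected expert's weight is divided by itself, so it must be exactly 1.
    #[test]
    fn test_top1_routing() {
        let router = LearnedRouter::new(4, 16, 1);
        let routes = router.route(&vec![0.5; 16]);
        assert_eq!(routes.len(), 1);
        assert!((routes[0].1 - 1.0).abs() < 1e-6);
    }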
#[test]
fn test_load_balance_loss() {
let router = LearnedRouter::new(4, 32, 2);
// Simulate routing decisions
let decisions: Vec<TopKRouting> = (0..100)
.map(|i| TopKRouting {
selections: vec![(i % 4, 0.6), ((i + 1) % 4, 0.4)],
})
.collect();
let loss = router.load_balance_loss(&decisions);
assert!(loss >= 0.0);
}
#[test]
fn test_expert_statistics() {
let router = LearnedRouter::new(4, 32, 2);
let decisions: Vec<TopKRouting> = vec![
TopKRouting {
selections: vec![(0, 0.6), (1, 0.4)],
},
TopKRouting {
selections: vec![(0, 0.5), (2, 0.5)],
},
];
let stats = router.expert_statistics(&decisions);
assert_eq!(stats.len(), 4);
// Should sum to 1
let sum: f32 = stats.iter().sum();
assert!((sum - 1.0).abs() < 1e-5);
}
}

View File

@@ -0,0 +1,343 @@
//! Diffusion Attention
//!
//! Attention as heat diffusion on a key similarity graph.
use super::laplacian::{GraphLaplacian, LaplacianType};
use crate::error::{AttentionError, AttentionResult};
use crate::traits::Attention;
use serde::{Deserialize, Serialize};
/// Diffusion attention configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DiffusionConfig {
/// Model dimension
pub dim: usize,
/// Total diffusion time
pub diffusion_time: f32,
/// Number of diffusion steps
pub num_steps: usize,
/// Sigma for Gaussian kernel
pub sigma: f32,
/// Use k-NN sparse Laplacian (0 = dense)
pub knn_k: usize,
/// Laplacian type
pub laplacian_type: LaplacianType,
/// Temperature for final softmax
pub temperature: f32,
}
impl Default for DiffusionConfig {
fn default() -> Self {
Self {
dim: 512,
diffusion_time: 1.0,
num_steps: 5,
sigma: 1.0,
knn_k: 0, // Dense
laplacian_type: LaplacianType::RandomWalk,
temperature: 1.0,
}
}
}
/// Diffusion-based Attention
///
/// Computes attention by diffusing initial logits on a key similarity graph.
/// This provides multi-scale smoothing and noise resistance.
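///
/// # Example
///
/// A minimal usage sketch, mirroring the unit tests below:
///
/// ```
/// use ruvector_attention::{Attention, DiffusionAttention};
///
/// let attention = DiffusionAttention::with_dim(16);
/// let query = vec![1.0f32; 16];
/// let keys: Vec<Vec<f32>> = (0..8).map(|i| vec![i as f32 * 0.1; 16]).collect();
/// let values: Vec<Vec<f32>> = (0..8).map(|i| vec![i as f32; 16]).collect();
/// let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
/// let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
/// let output = attention.compute(&query, &keys_refs, &values_refs).unwrap();
/// assert_eq!(output.len(), 16);
/// ```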
#[derive(Debug, Clone)]
pub struct DiffusionAttention {
config: DiffusionConfig,
}
impl DiffusionAttention {
/// Create new diffusion attention
pub fn new(config: DiffusionConfig) -> Self {
Self { config }
}
/// Create with dimension only
pub fn with_dim(dim: usize) -> Self {
Self::new(DiffusionConfig {
dim,
..Default::default()
})
}
/// Compute diffusion attention
pub fn compute_diffusion(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
) -> AttentionResult<Vec<f32>> {
let n = keys.len();
if n == 0 {
return Err(AttentionError::InvalidConfig("No keys".into()));
}
// Build Laplacian
let laplacian = if self.config.knn_k > 0 {
GraphLaplacian::from_keys_knn(
keys,
self.config.knn_k,
self.config.sigma,
self.config.laplacian_type,
)
} else {
GraphLaplacian::from_keys(keys, self.config.sigma, self.config.laplacian_type)
};
// Initial logits from dot product
let mut x: Vec<f32> = keys
.iter()
.map(|k| Self::dot_product_simd(query, k))
.collect();
        // Explicit Euler step of the heat equation: x_{t+dt} = x_t - dt * L * x_t.
        // Stable when dt <= 2 / lambda_max(L); random-walk Laplacian
        // eigenvalues lie in [0, 2], so dt <= 1 is always safe.
let dt = self.config.diffusion_time / self.config.num_steps.max(1) as f32;
for _ in 0..self.config.num_steps {
let lx = laplacian.apply(&x);
for i in 0..n {
x[i] -= dt * lx[i];
}
}
// Apply temperature (Security: prevent division by zero)
let temp = self.config.temperature.max(1e-6);
for xi in x.iter_mut() {
*xi /= temp;
}
// Softmax
let weights = Self::stable_softmax(&x);
// Weighted sum of values
self.weighted_sum(&weights, values)
}
/// Compute diffusion energy (for monitoring)
/// E = x^T L x (smoothness measure)
pub fn diffusion_energy(&self, x: &[f32], laplacian: &GraphLaplacian) -> f32 {
let lx = laplacian.apply(x);
Self::dot_product_simd(x, &lx)
}
/// Compute multi-scale attention (return attention at different times)
pub fn compute_multiscale(
&self,
query: &[f32],
keys: &[&[f32]],
num_scales: usize,
) -> Vec<Vec<f32>> {
let n = keys.len();
if n == 0 {
return vec![];
}
let laplacian = if self.config.knn_k > 0 {
GraphLaplacian::from_keys_knn(
keys,
self.config.knn_k,
self.config.sigma,
self.config.laplacian_type,
)
} else {
GraphLaplacian::from_keys(keys, self.config.sigma, self.config.laplacian_type)
};
let mut x: Vec<f32> = keys
.iter()
.map(|k| Self::dot_product_simd(query, k))
.collect();
let mut scales = Vec::with_capacity(num_scales);
scales.push(Self::stable_softmax(&x)); // t=0
let total_steps = self.config.num_steps * num_scales;
let dt = self.config.diffusion_time / total_steps.max(1) as f32;
let steps_per_scale = self.config.num_steps;
for _ in 1..num_scales {
for _ in 0..steps_per_scale {
let lx = laplacian.apply(&x);
for i in 0..n {
x[i] -= dt * lx[i];
}
}
scales.push(Self::stable_softmax(&x));
}
scales
}
/// SIMD-friendly dot product
#[inline(always)]
fn dot_product_simd(a: &[f32], b: &[f32]) -> f32 {
let len = a.len().min(b.len());
let chunks = len / 4;
let remainder = len % 4;
let mut sum0 = 0.0f32;
let mut sum1 = 0.0f32;
let mut sum2 = 0.0f32;
let mut sum3 = 0.0f32;
for i in 0..chunks {
let base = i * 4;
sum0 += a[base] * b[base];
sum1 += a[base + 1] * b[base + 1];
sum2 += a[base + 2] * b[base + 2];
sum3 += a[base + 3] * b[base + 3];
}
let base = chunks * 4;
for i in 0..remainder {
sum0 += a[base + i] * b[base + i];
}
sum0 + sum1 + sum2 + sum3
}
/// Stable softmax
fn stable_softmax(logits: &[f32]) -> Vec<f32> {
if logits.is_empty() {
return vec![];
}
let max_logit = logits.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
let exp_logits: Vec<f32> = logits.iter().map(|&l| (l - max_logit).exp()).collect();
let sum: f32 = exp_logits.iter().sum();
// Security: prevent division by zero if all exp values underflow
if sum > 0.0 {
exp_logits.iter().map(|&e| e / sum).collect()
} else {
// Fallback to uniform distribution
vec![1.0 / logits.len() as f32; logits.len()]
}
}
/// Weighted sum
fn weighted_sum(&self, weights: &[f32], values: &[&[f32]]) -> AttentionResult<Vec<f32>> {
if weights.is_empty() || values.is_empty() {
return Err(AttentionError::InvalidConfig("Empty inputs".into()));
}
let dim = values[0].len();
let mut output = vec![0.0f32; dim];
for (weight, value) in weights.iter().zip(values.iter()) {
for (o, &v) in output.iter_mut().zip(value.iter()) {
*o += weight * v;
}
}
Ok(output)
}
}
impl Attention for DiffusionAttention {
fn compute(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
) -> AttentionResult<Vec<f32>> {
self.compute_diffusion(query, keys, values)
}
fn compute_with_mask(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
mask: Option<&[bool]>,
) -> AttentionResult<Vec<f32>> {
if let Some(m) = mask {
let filtered: Vec<(&[f32], &[f32])> = keys
.iter()
.zip(values.iter())
.enumerate()
.filter(|(i, _)| m.get(*i).copied().unwrap_or(true))
.map(|(_, (k, v))| (*k, *v))
.collect();
let filtered_keys: Vec<&[f32]> = filtered.iter().map(|(k, _)| *k).collect();
let filtered_values: Vec<&[f32]> = filtered.iter().map(|(_, v)| *v).collect();
self.compute(query, &filtered_keys, &filtered_values)
} else {
self.compute(query, keys, values)
}
}
fn dim(&self) -> usize {
self.config.dim
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_diffusion_attention() {
let attention = DiffusionAttention::with_dim(16);
let query = vec![1.0f32; 16];
let keys: Vec<Vec<f32>> = (0..8).map(|i| vec![i as f32 * 0.1; 16]).collect();
let values: Vec<Vec<f32>> = (0..8).map(|i| vec![i as f32; 16]).collect();
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
let output = attention.compute(&query, &keys_refs, &values_refs).unwrap();
assert_eq!(output.len(), 16);
}
#[test]
fn test_multiscale() {
let config = DiffusionConfig {
dim: 8,
num_steps: 2,
..Default::default()
};
let attention = DiffusionAttention::new(config);
let query = vec![1.0f32; 8];
let keys: Vec<Vec<f32>> = (0..5).map(|i| vec![i as f32 * 0.1; 8]).collect();
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let scales = attention.compute_multiscale(&query, &keys_refs, 3);
assert_eq!(scales.len(), 3);
for scale in scales {
assert_eq!(scale.len(), 5);
// Each scale should sum to 1
let sum: f32 = scale.iter().sum();
assert!((sum - 1.0).abs() < 1e-5);
}
}
#[test]
fn test_knn_diffusion() {
let config = DiffusionConfig {
dim: 8,
knn_k: 3,
..Default::default()
};
let attention = DiffusionAttention::new(config);
let query = vec![1.0f32; 8];
let keys: Vec<Vec<f32>> = (0..10).map(|i| vec![i as f32 * 0.1; 8]).collect();
let values: Vec<Vec<f32>> = (0..10).map(|i| vec![i as f32; 8]).collect();
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
let output = attention.compute(&query, &keys_refs, &values_refs).unwrap();
assert_eq!(output.len(), 8);
}
}
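The diffusion attention above discretizes the heat equation x_{t+dt} = x_t - dt * L * x_t with explicit Euler steps. A minimal standalone sketch of one such step (independent of the crate's types; the 3-node path graph, unit edge weights, and `dt = 0.1` below are illustrative assumptions):

```rust
/// Apply the unnormalized Laplacian L = D - W to `x`,
/// for a dense row-major n x n weight matrix `w`.
fn laplacian_apply(w: &[f32], n: usize, x: &[f32]) -> Vec<f32> {
    let mut out = vec![0.0f32; n];
    for i in 0..n {
        let degree: f32 = (0..n).map(|j| w[i * n + j]).sum();
        out[i] = degree * x[i];
        for j in 0..n {
            out[i] -= w[i * n + j] * x[j];
        }
    }
    out
}

/// One explicit Euler diffusion step: smooths `x` toward its graph neighbors.
fn diffusion_step(w: &[f32], n: usize, x: &[f32], dt: f32) -> Vec<f32> {
    let lx = laplacian_apply(w, n, x);
    x.iter().zip(lx.iter()).map(|(&xi, &li)| xi - dt * li).collect()
}

fn main() {
    // Path graph 0-1-2 with unit edge weights.
    let w = vec![0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0];
    let x = vec![1.0, 0.0, 0.0];
    // Mass flows from node 0 toward node 1; node 2 is untouched after one step.
    let x1 = diffusion_step(&w, 3, &x, 0.1);
    println!("{:?}", x1); // approximately [0.9, 0.1, 0.0]
}
```

As in `test_laplacian_apply` below, a constant vector is a fixed point of the step, since L * 1 = 0.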


@@ -0,0 +1,227 @@
//! Graph Laplacian
//!
//! Constructs various Laplacian matrices from key similarities.
use serde::{Deserialize, Serialize};
/// Type of Laplacian to use
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum LaplacianType {
/// Unnormalized: L = D - W
Unnormalized,
/// Symmetric normalized: L = I - D^{-1/2} W D^{-1/2}
SymmetricNormalized,
/// Random walk: L = I - D^{-1} W
RandomWalk,
}
/// Graph Laplacian for attention
#[derive(Debug, Clone)]
pub struct GraphLaplacian {
/// Weight matrix (dense)
weights: Vec<f32>,
/// Degree vector
degrees: Vec<f32>,
/// Number of nodes
n: usize,
/// Laplacian type
lap_type: LaplacianType,
}
impl GraphLaplacian {
/// Build Laplacian from keys using Gaussian kernel
pub fn from_keys(keys: &[&[f32]], sigma: f32, lap_type: LaplacianType) -> Self {
let n = keys.len();
let sigma2 = (sigma * sigma).max(1e-9);
let mut weights = vec![0.0f32; n * n];
let mut degrees = vec![0.0f32; n];
// Build weight matrix
for i in 0..n {
for j in 0..n {
if i == j {
continue;
}
let dist2 = Self::l2_sq(keys[i], keys[j]);
let w = (-dist2 / (2.0 * sigma2)).exp();
weights[i * n + j] = w;
degrees[i] += w;
}
}
Self {
weights,
degrees,
n,
lap_type,
}
}
/// Build sparse Laplacian using k-NN
pub fn from_keys_knn(keys: &[&[f32]], k: usize, sigma: f32, lap_type: LaplacianType) -> Self {
let n = keys.len();
// Security: prevent integer underflow when n=0 or n=1
let k = if n > 1 { k.min(n - 1) } else { 0 };
let sigma2 = (sigma * sigma).max(1e-9);
let mut weights = vec![0.0f32; n * n];
let mut degrees = vec![0.0f32; n];
// For each node, find k-NN
for i in 0..n {
let mut dists: Vec<(usize, f32)> = (0..n)
.filter(|&j| j != i)
.map(|j| (j, Self::l2_sq(keys[i], keys[j])))
.collect();
dists.sort_unstable_by(|a, b| {
a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal)
});
// Keep only k nearest
for (j, dist2) in dists.iter().take(k) {
let w = (-dist2 / (2.0 * sigma2)).exp();
weights[i * n + j] = w;
weights[*j * n + i] = w; // Make symmetric
}
}
// Recompute degrees
for i in 0..n {
for j in 0..n {
degrees[i] += weights[i * n + j];
}
}
Self {
weights,
degrees,
n,
lap_type,
}
}
/// Apply Laplacian to vector: L * x
pub fn apply(&self, x: &[f32]) -> Vec<f32> {
let mut result = vec![0.0f32; self.n];
match self.lap_type {
LaplacianType::Unnormalized => {
// L * x = D * x - W * x
for i in 0..self.n {
result[i] = self.degrees[i] * x[i];
for j in 0..self.n {
result[i] -= self.weights[i * self.n + j] * x[j];
}
}
}
LaplacianType::SymmetricNormalized => {
// L * x = x - D^{-1/2} W D^{-1/2} x
let d_inv_sqrt: Vec<f32> = self
.degrees
.iter()
.map(|&d| if d > 0.0 { 1.0 / d.sqrt() } else { 0.0 })
.collect();
for i in 0..self.n {
result[i] = x[i];
for j in 0..self.n {
let w_norm = d_inv_sqrt[i] * self.weights[i * self.n + j] * d_inv_sqrt[j];
result[i] -= w_norm * x[j];
}
}
}
LaplacianType::RandomWalk => {
// L * x = x - D^{-1} W * x
for i in 0..self.n {
result[i] = x[i];
let d_inv = if self.degrees[i] > 0.0 {
1.0 / self.degrees[i]
} else {
0.0
};
for j in 0..self.n {
result[i] -= d_inv * self.weights[i * self.n + j] * x[j];
}
}
}
}
result
}
/// Get number of nodes
pub fn num_nodes(&self) -> usize {
self.n
}
/// Get degree of node i
pub fn degree(&self, i: usize) -> f32 {
self.degrees.get(i).copied().unwrap_or(0.0)
}
/// Get weight between nodes i and j
pub fn weight(&self, i: usize, j: usize) -> f32 {
if i < self.n && j < self.n {
self.weights[i * self.n + j]
} else {
0.0
}
}
/// L2 squared distance
#[inline]
fn l2_sq(a: &[f32], b: &[f32]) -> f32 {
let len = a.len().min(b.len());
let mut sum = 0.0f32;
for i in 0..len {
let d = a[i] - b[i];
sum += d * d;
}
sum
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_laplacian_build() {
let keys: Vec<Vec<f32>> = vec![vec![0.0, 0.0], vec![1.0, 0.0], vec![0.0, 1.0]];
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let lap = GraphLaplacian::from_keys(&keys_refs, 1.0, LaplacianType::Unnormalized);
assert_eq!(lap.num_nodes(), 3);
assert!(lap.degree(0) > 0.0);
}
#[test]
fn test_laplacian_apply() {
let keys: Vec<Vec<f32>> = vec![vec![0.0], vec![1.0], vec![2.0]];
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let lap = GraphLaplacian::from_keys(&keys_refs, 1.0, LaplacianType::Unnormalized);
// Constant vector should give zero (L * 1 = 0)
let x = vec![1.0, 1.0, 1.0];
let lx = lap.apply(&x);
let sum: f32 = lx.iter().map(|v| v.abs()).sum();
assert!(sum < 1e-3);
}
#[test]
fn test_knn_laplacian() {
let keys: Vec<Vec<f32>> = (0..10).map(|i| vec![i as f32]).collect();
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let lap = GraphLaplacian::from_keys_knn(&keys_refs, 3, 1.0, LaplacianType::RandomWalk);
assert_eq!(lap.num_nodes(), 10);
}
}
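The random-walk variant above can be read through the transition matrix P = D^{-1} W: each row of P sums to 1, so L_rw = I - P annihilates constant vectors just like the unnormalized form. A brief standalone sketch (not the crate's API; the weighted triangle graph is an illustrative assumption):

```rust
/// Row-normalize a dense n x n weight matrix into the
/// random-walk transition matrix P = D^{-1} W.
fn row_stochastic(w: &[f32], n: usize) -> Vec<f32> {
    let mut p = vec![0.0f32; n * n];
    for i in 0..n {
        let degree: f32 = (0..n).map(|j| w[i * n + j]).sum();
        if degree > 0.0 {
            for j in 0..n {
                p[i * n + j] = w[i * n + j] / degree;
            }
        }
    }
    p
}

fn main() {
    // Triangle graph with unequal edge weights.
    let w = vec![0.0, 2.0, 1.0, 2.0, 0.0, 4.0, 1.0, 4.0, 0.0];
    let p = row_stochastic(&w, 3);
    for i in 0..3 {
        let row_sum: f32 = (0..3).map(|j| p[i * 3 + j]).sum();
        println!("row {} sums to {}", i, row_sum); // each row sums to 1
    }
}
```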


@@ -0,0 +1,29 @@
//! PDE-Based Attention
//!
//! Continuous-time attention using partial differential equations.
//!
//! ## Key Concepts
//!
//! 1. **Diffusion Smoothing**: Heat equation on attention graph
//! 2. **Graph Laplacian**: L = D - W for key similarity
//! 3. **Time Evolution**: x_{t+dt} = x_t - dt * L * x_t
//!
//! ## Interpretation
//!
//! - Attention as continuous information flow
//! - Smoothing removes noise while preserving structure
//! - Multi-scale attention via different diffusion times
mod diffusion;
mod laplacian;
pub use diffusion::{DiffusionAttention, DiffusionConfig};
pub use laplacian::{GraphLaplacian, LaplacianType};
#[cfg(test)]
mod tests {
#[test]
fn test_module_exists() {
assert!(true);
}
}


@@ -0,0 +1,61 @@
//! Fluent builder API for constructing attention mechanisms.
use crate::{error::AttentionResult, traits::Attention};
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum AttentionType {
ScaledDot,
MultiHead,
Flash,
Linear,
LocalGlobal,
Hyperbolic,
MoE,
}
pub struct AttentionBuilder {
dim: usize,
attention_type: AttentionType,
}
impl AttentionBuilder {
pub fn new(dim: usize) -> Self {
Self {
dim,
attention_type: AttentionType::ScaledDot,
}
}
pub fn multi_head(mut self, _heads: usize) -> Self {
self.attention_type = AttentionType::MultiHead;
self
}
pub fn flash(mut self, _block: usize) -> Self {
self.attention_type = AttentionType::Flash;
self
}
pub fn dropout(self, _p: f32) -> Self {
self
}
pub fn causal(self, _c: bool) -> Self {
self
}
    pub fn build(self) -> AttentionResult<Box<dyn Attention + Send + Sync>> {
        // NOTE: stub implementation - currently constructs
        // ScaledDotProductAttention regardless of `self.attention_type`.
        Ok(Box::new(crate::attention::ScaledDotProductAttention::new(
            self.dim,
        )))
    }
}
pub fn scaled_dot(dim: usize) -> AttentionBuilder {
AttentionBuilder::new(dim)
}
pub fn multi_head(dim: usize, heads: usize) -> AttentionBuilder {
AttentionBuilder::new(dim).multi_head(heads)
}
pub fn flash(dim: usize, block: usize) -> AttentionBuilder {
AttentionBuilder::new(dim).flash(block)
}


@@ -0,0 +1,11 @@
//! # ruvector-attention SDK
//!
//! High-level, ergonomic APIs for building attention mechanisms.
pub mod builder;
pub mod pipeline;
pub mod presets;
pub use builder::{flash, multi_head, scaled_dot, AttentionBuilder, AttentionType};
pub use pipeline::{AttentionPipeline, NormType, PipelineStage};
pub use presets::{for_graphs, for_large_scale, for_sequences, AttentionPreset};


@@ -0,0 +1,57 @@
//! Pipeline API for chaining attention operations.
use crate::{error::AttentionResult, traits::Attention};
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum NormType {
LayerNorm,
RMSNorm,
BatchNorm,
}
pub enum PipelineStage {
Attention(Box<dyn Attention + Send + Sync>),
Normalize(NormType),
}
pub struct AttentionPipeline {
stages: Vec<PipelineStage>,
}
impl AttentionPipeline {
pub fn new() -> Self {
Self { stages: Vec::new() }
}
pub fn add_attention(mut self, attn: Box<dyn Attention + Send + Sync>) -> Self {
self.stages.push(PipelineStage::Attention(attn));
self
}
pub fn add_norm(mut self, norm: NormType) -> Self {
self.stages.push(PipelineStage::Normalize(norm));
self
}
pub fn add_dropout(self, _p: f32) -> Self {
self
}
pub fn add_residual(self) -> Self {
self
}
    pub fn run(
        &self,
        query: &[f32],
        _keys: &[&[f32]],
        _values: &[&[f32]],
    ) -> AttentionResult<Vec<f32>> {
        // NOTE: stub implementation - stages are recorded but not yet
        // executed; the query is returned unchanged.
        Ok(query.to_vec())
    }
}
impl Default for AttentionPipeline {
fn default() -> Self {
Self::new()
}
}


@@ -0,0 +1,42 @@
//! Pre-configured attention presets for common use cases.
use crate::sdk::builder::AttentionBuilder;
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum AttentionPreset {
Bert,
Gpt,
Longformer,
Performer,
FlashOptimized,
SwitchTransformer,
HyperbolicTree,
T5,
ViT,
SparseTransformer,
}
impl AttentionPreset {
pub fn builder(self, dim: usize) -> AttentionBuilder {
match self {
AttentionPreset::Bert => AttentionBuilder::new(dim).multi_head(12).dropout(0.1),
AttentionPreset::Gpt => AttentionBuilder::new(dim)
.multi_head(12)
.causal(true)
.dropout(0.1),
_ => AttentionBuilder::new(dim),
}
}
}
pub fn for_sequences(dim: usize, _max_len: usize) -> AttentionBuilder {
AttentionBuilder::new(dim)
}
pub fn for_graphs(dim: usize, _hierarchical: bool) -> AttentionBuilder {
AttentionBuilder::new(dim)
}
pub fn for_large_scale(dim: usize) -> AttentionBuilder {
AttentionBuilder::new(dim).flash(128)
}


@@ -0,0 +1,711 @@
//! Sheaf Attention Layer
//!
//! Implements coherence-based attention where weights are inversely proportional
//! to residual energy:
//!
//! ```text
//! A_ij = exp(-beta * E_ij) / sum_k exp(-beta * E_ik)
//! ```
//!
//! ## Key Properties
//!
//! - High residual (incoherent) -> Low attention (don't propagate inconsistency)
//! - Low residual (coherent) -> High attention (reinforce consistency)
//! - Beta parameter controls temperature (higher = sharper attention)
use crate::error::{AttentionError, AttentionResult};
use crate::sheaf::restriction::RestrictionMap;
use crate::traits::Attention;
use crate::utils::stable_softmax;
use serde::{Deserialize, Serialize};
/// Configuration for sheaf attention
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SheafAttentionConfig {
/// Model dimension
pub dim: usize,
/// Number of attention heads
pub num_heads: usize,
/// Temperature parameter (higher = sharper attention)
pub beta: f32,
/// Sparsity threshold for attention (skip if energy > threshold)
pub sparsity_threshold: Option<f32>,
/// Whether to use shared restriction maps across heads
pub shared_restrictions: bool,
/// Dropout probability (0.0 = no dropout)
pub dropout: f32,
}
impl Default for SheafAttentionConfig {
fn default() -> Self {
Self {
dim: 64,
num_heads: 1,
beta: 1.0,
sparsity_threshold: None,
shared_restrictions: false,
dropout: 0.0,
}
}
}
impl SheafAttentionConfig {
/// Create config with dimension
pub fn new(dim: usize) -> Self {
Self {
dim,
..Default::default()
}
}
/// Builder: set number of heads
pub fn with_num_heads(mut self, num_heads: usize) -> Self {
self.num_heads = num_heads;
self
}
/// Builder: set beta temperature
pub fn with_beta(mut self, beta: f32) -> Self {
self.beta = beta;
self
}
/// Builder: set sparsity threshold
pub fn with_sparsity_threshold(mut self, threshold: f32) -> Self {
self.sparsity_threshold = Some(threshold);
self
}
/// Builder: set shared restrictions
pub fn with_shared_restrictions(mut self, shared: bool) -> Self {
self.shared_restrictions = shared;
self
}
/// Builder: set dropout
pub fn with_dropout(mut self, dropout: f32) -> Self {
self.dropout = dropout;
self
}
/// Compute head dimension
pub fn head_dim(&self) -> usize {
self.dim / self.num_heads
}
/// Validate configuration
pub fn validate(&self) -> AttentionResult<()> {
if self.dim == 0 {
return Err(AttentionError::InvalidConfig(
"dimension must be positive".to_string(),
));
}
if self.num_heads == 0 {
return Err(AttentionError::InvalidConfig(
"num_heads must be positive".to_string(),
));
}
if self.dim % self.num_heads != 0 {
return Err(AttentionError::InvalidHeadCount {
dim: self.dim,
num_heads: self.num_heads,
});
}
if self.beta <= 0.0 {
return Err(AttentionError::InvalidConfig(
"beta must be positive".to_string(),
));
}
if self.dropout < 0.0 || self.dropout >= 1.0 {
return Err(AttentionError::InvalidConfig(
"dropout must be in [0, 1)".to_string(),
));
}
Ok(())
}
}
/// Sheaf Attention Layer
///
/// Uses restriction maps instead of learned QKV projections and computes
/// attention weights based on residual energy.
pub struct SheafAttention {
config: SheafAttentionConfig,
/// Restriction map for queries
rho_query: RestrictionMap,
/// Restriction map for keys
rho_key: RestrictionMap,
/// Restriction map for values
rho_value: RestrictionMap,
}
impl SheafAttention {
/// Create new sheaf attention layer
pub fn new(config: SheafAttentionConfig) -> Self {
let head_dim = config.head_dim();
let rho_query = RestrictionMap::new(config.dim, head_dim);
let rho_key = RestrictionMap::new(config.dim, head_dim);
let rho_value = RestrictionMap::new(config.dim, head_dim);
Self {
config,
rho_query,
rho_key,
rho_value,
}
}
/// Create with custom restriction maps
pub fn with_restriction_maps(
config: SheafAttentionConfig,
rho_query: RestrictionMap,
rho_key: RestrictionMap,
rho_value: RestrictionMap,
) -> Self {
Self {
config,
rho_query,
rho_key,
rho_value,
}
}
/// Get configuration
pub fn config(&self) -> &SheafAttentionConfig {
&self.config
}
/// Get query restriction map
pub fn rho_query(&self) -> &RestrictionMap {
&self.rho_query
}
/// Get key restriction map
pub fn rho_key(&self) -> &RestrictionMap {
&self.rho_key
}
/// Get value restriction map
pub fn rho_value(&self) -> &RestrictionMap {
&self.rho_value
}
/// Get mutable query restriction map (for training)
pub fn rho_query_mut(&mut self) -> &mut RestrictionMap {
&mut self.rho_query
}
/// Get mutable key restriction map (for training)
pub fn rho_key_mut(&mut self) -> &mut RestrictionMap {
&mut self.rho_key
}
/// Get mutable value restriction map (for training)
pub fn rho_value_mut(&mut self) -> &mut RestrictionMap {
&mut self.rho_value
}
/// Compute residual energy between query and key
///
/// E_qk = ||rho_q(q) - rho_k(k)||^2
pub fn compute_energy(&self, query: &[f32], key: &[f32]) -> AttentionResult<f32> {
let q_proj = self.rho_query.apply(query)?;
let k_proj = self.rho_key.apply(key)?;
let energy: f32 = q_proj
.iter()
.zip(k_proj.iter())
.map(|(&q, &k)| (q - k) * (q - k))
.sum();
Ok(energy)
}
/// Compute energy matrix for all query-key pairs
///
/// E[i,j] = ||rho_q(q_i) - rho_k(k_j)||^2
pub fn compute_energy_matrix(
&self,
queries: &[&[f32]],
keys: &[&[f32]],
) -> AttentionResult<Vec<f32>> {
let n_q = queries.len();
let n_k = keys.len();
// Project all queries and keys
let q_proj: Vec<Vec<f32>> = queries
.iter()
.map(|q| self.rho_query.apply(q))
.collect::<AttentionResult<_>>()?;
let k_proj: Vec<Vec<f32>> = keys
.iter()
.map(|k| self.rho_key.apply(k))
.collect::<AttentionResult<_>>()?;
// Compute pairwise energies
let mut energies = vec![0.0; n_q * n_k];
for i in 0..n_q {
for j in 0..n_k {
let energy: f32 = q_proj[i]
.iter()
.zip(k_proj[j].iter())
.map(|(&q, &k)| (q - k) * (q - k))
.sum();
energies[i * n_k + j] = energy;
}
}
Ok(energies)
}
/// Convert energy matrix to attention weights
///
/// A_ij = exp(-beta * E_ij) / Z
pub fn energy_to_attention(&self, energies: &[f32], n_keys: usize) -> Vec<f32> {
let n_queries = energies.len() / n_keys;
let mut weights = Vec::with_capacity(energies.len());
for i in 0..n_queries {
let row_start = i * n_keys;
let row = &energies[row_start..row_start + n_keys];
// Apply sparsity threshold if configured
let masked_logits: Vec<f32> = if let Some(threshold) = self.config.sparsity_threshold {
row.iter()
.map(|&e| {
if e > threshold {
f32::NEG_INFINITY // Mask out high-energy pairs
} else {
-self.config.beta * e
}
})
.collect()
} else {
row.iter().map(|&e| -self.config.beta * e).collect()
};
let row_weights = stable_softmax(&masked_logits);
weights.extend(row_weights);
}
weights
}
/// Compute sheaf attention output
///
/// 1. Project queries and keys through restriction maps
/// 2. Compute residual energy matrix
/// 3. Convert to attention weights: exp(-beta * E) / Z
/// 4. Weight values and sum
pub fn forward(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
) -> AttentionResult<(Vec<f32>, Vec<f32>)> {
if keys.len() != values.len() {
return Err(AttentionError::DimensionMismatch {
expected: keys.len(),
actual: values.len(),
});
}
if keys.is_empty() {
return Err(AttentionError::EmptyInput(
"keys cannot be empty".to_string(),
));
}
let n_keys = keys.len();
// Compute energies for this query against all keys
let mut energies = Vec::with_capacity(n_keys);
for key in keys {
energies.push(self.compute_energy(query, key)?);
}
// Convert to attention weights
let logits: Vec<f32> = if let Some(threshold) = self.config.sparsity_threshold {
energies
.iter()
.map(|&e| {
if e > threshold {
f32::NEG_INFINITY
} else {
-self.config.beta * e
}
})
.collect()
} else {
energies.iter().map(|&e| -self.config.beta * e).collect()
};
let attention_weights = stable_softmax(&logits);
// Project values and compute weighted sum
let v_proj: Vec<Vec<f32>> = values
.iter()
.map(|v| self.rho_value.apply(v))
.collect::<AttentionResult<_>>()?;
let head_dim = self.config.head_dim();
let mut output = vec![0.0; head_dim];
for (weight, v) in attention_weights.iter().zip(v_proj.iter()) {
for (out, &val) in output.iter_mut().zip(v.iter()) {
*out += weight * val;
}
}
Ok((output, attention_weights))
}
/// Compute total energy for a token (sum over all keys)
///
/// E_i = sum_j E_ij
pub fn token_energy(&self, query: &[f32], keys: &[&[f32]]) -> AttentionResult<f32> {
let mut total_energy = 0.0;
for key in keys {
total_energy += self.compute_energy(query, key)?;
}
Ok(total_energy)
}
/// Compute average energy for a token
///
/// E_avg = (1/N) * sum_j E_ij
pub fn average_token_energy(&self, query: &[f32], keys: &[&[f32]]) -> AttentionResult<f32> {
if keys.is_empty() {
return Ok(0.0);
}
Ok(self.token_energy(query, keys)? / keys.len() as f32)
}
}
impl Attention for SheafAttention {
fn compute(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
) -> AttentionResult<Vec<f32>> {
let (output, _weights) = self.forward(query, keys, values)?;
Ok(output)
}
fn compute_with_mask(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
mask: Option<&[bool]>,
) -> AttentionResult<Vec<f32>> {
if keys.len() != values.len() {
return Err(AttentionError::DimensionMismatch {
expected: keys.len(),
actual: values.len(),
});
}
if keys.is_empty() {
return Err(AttentionError::EmptyInput(
"keys cannot be empty".to_string(),
));
}
let n_keys = keys.len();
// Compute energies
let mut energies = Vec::with_capacity(n_keys);
for key in keys {
energies.push(self.compute_energy(query, key)?);
}
// Apply mask and convert to logits
let logits: Vec<f32> = if let Some(m) = mask {
if m.len() != n_keys {
return Err(AttentionError::InvalidMask {
expected: n_keys.to_string(),
actual: m.len().to_string(),
});
}
energies
.iter()
.zip(m.iter())
.map(|(&e, &keep)| {
if !keep {
f32::NEG_INFINITY
} else if let Some(threshold) = self.config.sparsity_threshold {
if e > threshold {
f32::NEG_INFINITY
} else {
-self.config.beta * e
}
} else {
-self.config.beta * e
}
})
.collect()
} else if let Some(threshold) = self.config.sparsity_threshold {
energies
.iter()
.map(|&e| {
if e > threshold {
f32::NEG_INFINITY
} else {
-self.config.beta * e
}
})
.collect()
} else {
energies.iter().map(|&e| -self.config.beta * e).collect()
};
let attention_weights = stable_softmax(&logits);
// Project values and compute weighted sum
let v_proj: Vec<Vec<f32>> = values
.iter()
.map(|v| self.rho_value.apply(v))
.collect::<AttentionResult<_>>()?;
let head_dim = self.config.head_dim();
let mut output = vec![0.0; head_dim];
for (weight, v) in attention_weights.iter().zip(v_proj.iter()) {
for (out, &val) in output.iter_mut().zip(v.iter()) {
*out += weight * val;
}
}
Ok(output)
}
fn dim(&self) -> usize {
self.config.dim
}
fn num_heads(&self) -> usize {
self.config.num_heads
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_config_default() {
let config = SheafAttentionConfig::default();
assert_eq!(config.dim, 64);
assert_eq!(config.num_heads, 1);
assert_eq!(config.beta, 1.0);
assert!(config.sparsity_threshold.is_none());
}
#[test]
fn test_config_builder() {
let config = SheafAttentionConfig::new(128)
.with_num_heads(4)
.with_beta(2.0)
.with_sparsity_threshold(0.5)
.with_dropout(0.1);
assert_eq!(config.dim, 128);
assert_eq!(config.num_heads, 4);
assert_eq!(config.head_dim(), 32);
assert_eq!(config.beta, 2.0);
assert_eq!(config.sparsity_threshold, Some(0.5));
assert_eq!(config.dropout, 0.1);
}
#[test]
fn test_config_validation() {
assert!(SheafAttentionConfig::new(64).validate().is_ok());
assert!(SheafAttentionConfig::new(64)
.with_num_heads(3)
.validate()
.is_err()); // 64 not divisible by 3
assert!(SheafAttentionConfig::new(64)
.with_beta(-1.0)
.validate()
.is_err());
}
#[test]
fn test_sheaf_attention_creation() {
let config = SheafAttentionConfig::new(64).with_num_heads(4);
let attention = SheafAttention::new(config);
assert_eq!(attention.dim(), 64);
assert_eq!(attention.num_heads(), 4);
}
#[test]
fn test_compute_energy() {
let config = SheafAttentionConfig::new(8);
let attention = SheafAttention::new(config);
let q = vec![1.0; 8];
let k = vec![1.0; 8];
let energy = attention.compute_energy(&q, &k).unwrap();
assert!(energy >= 0.0); // Energy is non-negative
}
#[test]
fn test_energy_zero_for_identical() {
// With identity-like restriction maps, identical vectors should have low energy
let config = SheafAttentionConfig::new(4);
let rho = RestrictionMap::identity(4);
let attention =
SheafAttention::with_restriction_maps(config, rho.clone(), rho.clone(), rho);
let v = vec![1.0, 2.0, 3.0, 4.0];
let energy = attention.compute_energy(&v, &v).unwrap();
assert!(energy.abs() < 1e-6);
}
#[test]
fn test_forward() {
let config = SheafAttentionConfig::new(8);
let attention = SheafAttention::new(config);
let query = vec![1.0; 8];
let k1 = vec![1.0; 8];
let k2 = vec![0.5; 8];
let v1 = vec![1.0; 8];
let v2 = vec![2.0; 8];
let keys: Vec<&[f32]> = vec![&k1, &k2];
let values: Vec<&[f32]> = vec![&v1, &v2];
let (output, weights) = attention.forward(&query, &keys, &values).unwrap();
// Output should be head_dim
assert_eq!(output.len(), 8);
// Weights should sum to 1
let weight_sum: f32 = weights.iter().sum();
assert!((weight_sum - 1.0).abs() < 1e-5);
}
#[test]
fn test_attention_trait() {
let config = SheafAttentionConfig::new(8);
let attention = SheafAttention::new(config);
let query = vec![1.0; 8];
let k1 = vec![1.0; 8];
let k2 = vec![0.5; 8];
let v1 = vec![1.0; 8];
let v2 = vec![2.0; 8];
let keys: Vec<&[f32]> = vec![&k1, &k2];
let values: Vec<&[f32]> = vec![&v1, &v2];
let output = attention.compute(&query, &keys, &values).unwrap();
assert_eq!(output.len(), 8);
}
#[test]
fn test_attention_with_mask() {
let config = SheafAttentionConfig::new(8);
let attention = SheafAttention::new(config);
let query = vec![1.0; 8];
let k1 = vec![1.0; 8];
let k2 = vec![0.5; 8];
let v1 = vec![1.0; 8];
let v2 = vec![2.0; 8];
let keys: Vec<&[f32]> = vec![&k1, &k2];
let values: Vec<&[f32]> = vec![&v1, &v2];
let mask = vec![true, false]; // Only attend to first key
let output = attention
.compute_with_mask(&query, &keys, &values, Some(&mask))
.unwrap();
assert_eq!(output.len(), 8);
}
#[test]
fn test_sparsity_threshold() {
let config = SheafAttentionConfig::new(8).with_sparsity_threshold(0.1);
let attention = SheafAttention::new(config);
let query = vec![1.0; 8];
let k1 = vec![1.0; 8];
let k2 = vec![100.0; 8]; // Very different - high energy
let v1 = vec![1.0; 8];
let v2 = vec![2.0; 8];
let keys: Vec<&[f32]> = vec![&k1, &k2];
let values: Vec<&[f32]> = vec![&v1, &v2];
let (_output, weights) = attention.forward(&query, &keys, &values).unwrap();
// Second key should have near-zero weight due to high energy
// (depends on initialization, but the masked one should be lower)
assert!(weights[0] > weights[1]);
}
#[test]
fn test_token_energy() {
let config = SheafAttentionConfig::new(8);
let attention = SheafAttention::new(config);
let query = vec![1.0; 8];
let k1 = vec![1.0; 8];
let k2 = vec![0.5; 8];
let keys: Vec<&[f32]> = vec![&k1, &k2];
let total_energy = attention.token_energy(&query, &keys).unwrap();
let avg_energy = attention.average_token_energy(&query, &keys).unwrap();
assert!(total_energy >= 0.0);
assert!((avg_energy - total_energy / 2.0).abs() < 1e-6);
}
#[test]
fn test_beta_effect() {
// Higher beta = sharper attention (more peaked distribution)
let config_low = SheafAttentionConfig::new(8).with_beta(0.1);
let config_high = SheafAttentionConfig::new(8).with_beta(10.0);
// Use same restriction maps
let rho = RestrictionMap::new(8, 8);
let attention_low = SheafAttention::with_restriction_maps(
config_low,
rho.clone(),
rho.clone(),
rho.clone(),
);
let attention_high =
SheafAttention::with_restriction_maps(config_high, rho.clone(), rho.clone(), rho);
let query = vec![1.0; 8];
let k1 = vec![1.0; 8];
let k2 = vec![0.5; 8];
let v1 = vec![1.0; 8];
let v2 = vec![2.0; 8];
let keys: Vec<&[f32]> = vec![&k1, &k2];
let values: Vec<&[f32]> = vec![&v1, &v2];
let (_out_low, weights_low) = attention_low.forward(&query, &keys, &values).unwrap();
let (_out_high, weights_high) = attention_high.forward(&query, &keys, &values).unwrap();
// High beta should have more peaked distribution
let max_low = weights_low.iter().cloned().fold(0.0f32, f32::max);
let max_high = weights_high.iter().cloned().fold(0.0f32, f32::max);
assert!(max_high >= max_low);
}
}
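The coherence gate A_ij = exp(-beta * E_ij) / Z from the module header can be sketched standalone with the same max-shifted (numerically stable) softmax the crate uses. The `beta` values and energies below are illustrative, not crate defaults:

```rust
/// Turn residual energies into attention weights: A_j = exp(-beta * E_j) / Z.
/// Logits are shifted by their max before exponentiating for stability.
fn energy_to_weights(energies: &[f32], beta: f32) -> Vec<f32> {
    let logits: Vec<f32> = energies.iter().map(|&e| -beta * e).collect();
    let max = logits.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
    let exps: Vec<f32> = logits.iter().map(|&l| (l - max).exp()).collect();
    let z: f32 = exps.iter().sum();
    exps.iter().map(|&e| e / z).collect()
}

fn main() {
    // The lowest-energy (most coherent) key receives the most attention.
    let weights = energy_to_weights(&[0.1, 1.0, 5.0], 1.0);
    let sum: f32 = weights.iter().sum();
    println!("sum = {}, weights = {:?}", sum, weights);
}
```

Raising `beta` sharpens the distribution toward the lowest-energy key, which is the behavior `test_beta_effect` above checks for the full layer.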


@@ -0,0 +1,650 @@
//! Energy-Based Early Exit
//!
//! Implements early exit based on energy convergence rather than confidence thresholds.
//!
//! ## Key Insight
//!
//! Traditional early exit uses confidence (max softmax probability) which can be
//! confidently wrong. Energy convergence is more principled:
//!
//! - If energy stops changing, further layers won't help
//! - Energy provides a geometric measure of consistency
//! - Works naturally with sheaf attention
//!
//! ## Exit Criterion
//!
//! Exit when: |E_current - E_previous| < epsilon
//!
//! This means the representation has stabilized and further processing
//! is unlikely to improve coherence.
use crate::error::{AttentionError, AttentionResult};
use serde::{Deserialize, Serialize};
/// Configuration for energy-based early exit
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EarlyExitConfig {
/// Energy convergence threshold (exit if delta < epsilon)
pub epsilon: f32,
/// Minimum layers to process before considering exit
pub min_layers: usize,
/// Maximum layers (hard limit)
pub max_layers: usize,
/// Number of consecutive converged steps required
pub patience: usize,
/// Whether to track energy history
pub track_history: bool,
/// Exponential moving average smoothing factor (0 = no smoothing)
pub ema_alpha: f32,
}
impl Default for EarlyExitConfig {
fn default() -> Self {
Self {
epsilon: 0.001,
min_layers: 2,
max_layers: 12,
patience: 1,
track_history: true,
ema_alpha: 0.0,
}
}
}
impl EarlyExitConfig {
/// Create config with epsilon
pub fn new(epsilon: f32) -> Self {
Self {
epsilon,
..Default::default()
}
}
/// Builder: set epsilon
pub fn with_epsilon(mut self, epsilon: f32) -> Self {
self.epsilon = epsilon;
self
}
/// Builder: set minimum layers
pub fn with_min_layers(mut self, min: usize) -> Self {
self.min_layers = min;
self
}
/// Builder: set maximum layers
pub fn with_max_layers(mut self, max: usize) -> Self {
self.max_layers = max;
self
}
/// Builder: set patience
pub fn with_patience(mut self, patience: usize) -> Self {
self.patience = patience;
self
}
/// Builder: set history tracking
pub fn with_track_history(mut self, track: bool) -> Self {
self.track_history = track;
self
}
/// Builder: set EMA smoothing
pub fn with_ema_alpha(mut self, alpha: f32) -> Self {
self.ema_alpha = alpha.clamp(0.0, 1.0);
self
}
/// Validate configuration
pub fn validate(&self) -> AttentionResult<()> {
if self.epsilon <= 0.0 {
return Err(AttentionError::InvalidConfig(
"epsilon must be positive".to_string(),
));
}
if self.min_layers > self.max_layers {
return Err(AttentionError::InvalidConfig(
"min_layers cannot exceed max_layers".to_string(),
));
}
if self.patience == 0 {
return Err(AttentionError::InvalidConfig(
"patience must be at least 1".to_string(),
));
}
Ok(())
}
}
/// Result of early exit check
#[derive(Debug, Clone)]
pub struct EarlyExitResult {
/// Whether to exit early
pub should_exit: bool,
/// Current layer index (0-indexed)
pub layer_index: usize,
/// Current energy value
pub current_energy: f32,
/// Energy delta from previous layer
pub energy_delta: f32,
/// Number of consecutive converged steps
pub converged_steps: usize,
/// Exit reason (if exiting)
pub exit_reason: Option<ExitReason>,
}
/// Reason for early exit
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ExitReason {
/// Energy converged (delta < epsilon)
EnergyConverged,
/// Reached maximum layers
MaxLayersReached,
/// Energy is zero (perfectly coherent)
PerfectCoherence,
}
impl ExitReason {
/// Human-readable description
pub fn description(&self) -> &'static str {
match self {
Self::EnergyConverged => "Energy converged below threshold",
Self::MaxLayersReached => "Reached maximum layer count",
Self::PerfectCoherence => "Achieved perfect coherence (zero energy)",
}
}
}
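The exit criterion |E_current - E_previous| < epsilon, combined with `min_layers` and `patience`, can be sketched as a standalone loop (assumed illustrative energy traces, not the crate's `EarlyExit` type):

```rust
/// Return the layer index at which to exit: the first layer at or past
/// `min_layers` with `patience` consecutive deltas below `epsilon`,
/// else the last layer (max layers reached).
fn exit_layer(energies: &[f32], epsilon: f32, min_layers: usize, patience: usize) -> usize {
    let mut converged = 0usize;
    let mut prev = f32::INFINITY;
    for (layer, &e) in energies.iter().enumerate() {
        if (e - prev).abs() < epsilon {
            converged += 1;
        } else {
            converged = 0; // a non-converged step resets the patience counter
        }
        prev = e;
        if layer + 1 >= min_layers && converged >= patience {
            return layer; // representation has stabilized; exit here
        }
    }
    energies.len() - 1
}

fn main() {
    // Energy plateaus at layer 3; with patience 1 we exit there.
    let energies = [4.0, 2.0, 1.0, 1.0005, 1.0004, 1.0004];
    println!("exit at layer {}", exit_layer(&energies, 0.01, 2, 1));
}
```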
/// Energy-based early exit tracker
#[derive(Debug, Clone)]
pub struct EarlyExit {
config: EarlyExitConfig,
/// Energy history across layers
energy_history: Vec<f32>,
/// EMA-smoothed energy (if enabled)
ema_energy: Option<f32>,
/// Count of consecutive converged steps
converged_count: usize,
/// Current layer index
current_layer: usize,
}
impl EarlyExit {
/// Create new early exit tracker
pub fn new(config: EarlyExitConfig) -> Self {
Self {
config,
energy_history: Vec::new(),
ema_energy: None,
converged_count: 0,
current_layer: 0,
}
}
/// Create with default configuration
pub fn default_tracker() -> Self {
Self::new(EarlyExitConfig::default())
}
/// Reset tracker for new sequence
pub fn reset(&mut self) {
self.energy_history.clear();
self.ema_energy = None;
self.converged_count = 0;
self.current_layer = 0;
}
/// Get configuration
pub fn config(&self) -> &EarlyExitConfig {
&self.config
}
/// Get mutable configuration
pub fn config_mut(&mut self) -> &mut EarlyExitConfig {
&mut self.config
}
/// Get energy history
pub fn energy_history(&self) -> &[f32] {
&self.energy_history
}
/// Get current layer index
pub fn current_layer(&self) -> usize {
self.current_layer
}
/// Check if should exit after processing a layer
///
/// # Arguments
///
/// * `energy` - Energy computed after the current layer
///
/// # Returns
///
/// Early exit result with decision and diagnostics
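    ///
    /// # Example
    ///
    /// A minimal convergence walk mirroring the crate's own
    /// `test_energy_convergence` unit test (epsilon and patience chosen
    /// for illustration):
    ///
    /// ```rust
    /// use ruvector_attention::sheaf::{EarlyExit, EarlyExitConfig, ExitReason};
    ///
    /// let config = EarlyExitConfig::new(0.1).with_min_layers(1).with_patience(1);
    /// let mut tracker = EarlyExit::new(config);
    /// assert!(!tracker.check(1.0).should_exit);  // layer 0: below min_layers
    /// assert!(!tracker.check(0.5).should_exit);  // layer 1: delta 0.5 > epsilon
    /// let result = tracker.check(0.49);          // layer 2: delta 0.01 < epsilon
    /// assert!(result.should_exit);
    /// assert_eq!(result.exit_reason, Some(ExitReason::EnergyConverged));
    /// ```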
pub fn check(&mut self, energy: f32) -> EarlyExitResult {
let layer_index = self.current_layer;
self.current_layer += 1;
// Update EMA if enabled
let effective_energy = if self.config.ema_alpha > 0.0 {
let ema = self.ema_energy.unwrap_or(energy);
let new_ema = self.config.ema_alpha * energy + (1.0 - self.config.ema_alpha) * ema;
self.ema_energy = Some(new_ema);
new_ema
} else {
energy
};
// Compute delta from previous
let prev_energy = self.energy_history.last().copied().unwrap_or(f32::INFINITY);
let energy_delta = (effective_energy - prev_energy).abs();
// Track history if enabled
if self.config.track_history {
self.energy_history.push(effective_energy);
}
// Check for perfect coherence
if effective_energy < 1e-10 {
return EarlyExitResult {
should_exit: true,
layer_index,
current_energy: effective_energy,
energy_delta,
converged_steps: self.converged_count + 1,
exit_reason: Some(ExitReason::PerfectCoherence),
};
}
// Check minimum layers
if layer_index < self.config.min_layers {
return EarlyExitResult {
should_exit: false,
layer_index,
current_energy: effective_energy,
energy_delta,
converged_steps: 0,
exit_reason: None,
};
}
// Check maximum layers
if layer_index >= self.config.max_layers - 1 {
return EarlyExitResult {
should_exit: true,
layer_index,
current_energy: effective_energy,
energy_delta,
converged_steps: self.converged_count,
exit_reason: Some(ExitReason::MaxLayersReached),
};
}
// Check convergence
if energy_delta < self.config.epsilon {
self.converged_count += 1;
} else {
self.converged_count = 0;
}
// Check if converged for enough steps
if self.converged_count >= self.config.patience {
return EarlyExitResult {
should_exit: true,
layer_index,
current_energy: effective_energy,
energy_delta,
converged_steps: self.converged_count,
exit_reason: Some(ExitReason::EnergyConverged),
};
}
EarlyExitResult {
should_exit: false,
layer_index,
current_energy: effective_energy,
energy_delta,
converged_steps: self.converged_count,
exit_reason: None,
}
}
/// Get statistics about the exit decision
pub fn statistics(&self) -> EarlyExitStatistics {
let total_layers = self.current_layer;
let max_possible = self.config.max_layers;
let energy_reduction = if self.energy_history.len() >= 2 {
let first = self.energy_history.first().copied().unwrap_or(0.0);
let last = self.energy_history.last().copied().unwrap_or(0.0);
if first > 1e-10 {
(first - last) / first
} else {
0.0
}
} else {
0.0
};
let avg_delta = if self.energy_history.len() >= 2 {
let deltas: Vec<f32> = self
.energy_history
.windows(2)
.map(|w| (w[1] - w[0]).abs())
.collect();
deltas.iter().sum::<f32>() / deltas.len() as f32
} else {
0.0
};
EarlyExitStatistics {
layers_used: total_layers,
max_layers: max_possible,
layers_saved: max_possible.saturating_sub(total_layers),
speedup_ratio: if total_layers > 0 {
max_possible as f32 / total_layers as f32
} else {
1.0
},
energy_reduction,
average_delta: avg_delta,
final_energy: self.energy_history.last().copied().unwrap_or(0.0),
}
}
}
/// Statistics about early exit behavior
#[derive(Debug, Clone)]
pub struct EarlyExitStatistics {
/// Number of layers actually processed
pub layers_used: usize,
/// Maximum possible layers
pub max_layers: usize,
/// Layers saved by early exit
pub layers_saved: usize,
/// Speedup ratio (max_layers / layers_used)
pub speedup_ratio: f32,
/// Relative energy reduction from first to last layer
pub energy_reduction: f32,
/// Average energy delta across layers
pub average_delta: f32,
/// Final energy value
pub final_energy: f32,
}
/// Process layers with early exit
///
/// Generic function that processes layers until early exit condition is met.
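///
/// # Example
///
/// A scalar sketch: each "layer" halves the state and the energy is the
/// state itself (the closures here are illustrative, not part of the API):
///
/// ```rust
/// use ruvector_attention::sheaf::{process_with_early_exit, EarlyExitConfig};
///
/// let layers: Vec<fn(f32) -> f32> = vec![(|x| x * 0.5) as fn(f32) -> f32; 8];
/// let config = EarlyExitConfig::new(0.1)
///     .with_min_layers(1)
///     .with_max_layers(8)
///     .with_patience(1);
/// let (state, result) = process_with_early_exit(10.0, &layers, config, |s| *s);
/// assert!(result.should_exit);
/// assert!(state < 10.0);
/// ```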
pub fn process_with_early_exit<F, T>(
initial_state: T,
layers: &[F],
config: EarlyExitConfig,
energy_fn: impl Fn(&T) -> f32,
) -> (T, EarlyExitResult)
where
F: Fn(T) -> T,
T: Clone,
{
let mut tracker = EarlyExit::new(config);
let mut state = initial_state;
for layer in layers {
// Process layer
state = layer(state);
// Compute energy
let energy = energy_fn(&state);
// Check early exit
let result = tracker.check(energy);
if result.should_exit {
return (state, result);
}
}
// Processed all layers
let final_energy = energy_fn(&state);
let final_result = EarlyExitResult {
should_exit: true,
layer_index: layers.len(),
current_energy: final_energy,
energy_delta: 0.0,
converged_steps: 0,
exit_reason: Some(ExitReason::MaxLayersReached),
};
(state, final_result)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_config_default() {
let config = EarlyExitConfig::default();
assert!(config.epsilon > 0.0);
assert!(config.min_layers < config.max_layers);
assert!(config.patience > 0);
}
#[test]
fn test_config_builder() {
let config = EarlyExitConfig::new(0.01)
.with_min_layers(3)
.with_max_layers(10)
.with_patience(2)
.with_ema_alpha(0.1);
assert_eq!(config.epsilon, 0.01);
assert_eq!(config.min_layers, 3);
assert_eq!(config.max_layers, 10);
assert_eq!(config.patience, 2);
assert_eq!(config.ema_alpha, 0.1);
}
#[test]
fn test_config_validation() {
assert!(EarlyExitConfig::default().validate().is_ok());
let bad_config = EarlyExitConfig {
epsilon: -1.0,
..Default::default()
};
assert!(bad_config.validate().is_err());
let bad_config = EarlyExitConfig {
min_layers: 10,
max_layers: 5,
..Default::default()
};
assert!(bad_config.validate().is_err());
}
#[test]
fn test_early_exit_creation() {
let tracker = EarlyExit::default_tracker();
assert_eq!(tracker.current_layer(), 0);
assert!(tracker.energy_history().is_empty());
}
#[test]
fn test_early_exit_reset() {
let mut tracker = EarlyExit::default_tracker();
tracker.check(1.0);
tracker.check(0.5);
assert_eq!(tracker.current_layer(), 2);
tracker.reset();
assert_eq!(tracker.current_layer(), 0);
assert!(tracker.energy_history().is_empty());
}
#[test]
fn test_min_layers_respected() {
let config = EarlyExitConfig::default()
.with_min_layers(3)
.with_epsilon(0.1);
let mut tracker = EarlyExit::new(config);
// Even with converged energy, shouldn't exit before min_layers
// Note: Using non-zero energy (0.001) to avoid PerfectCoherence early exit
// which takes precedence over min_layers (as it should - zero energy means done)
let result = tracker.check(0.001);
assert!(!result.should_exit);
assert_eq!(result.layer_index, 0);
// Same small energy = converged, but still before min_layers
let result = tracker.check(0.001);
assert!(!result.should_exit);
assert_eq!(result.layer_index, 1);
// Still before min_layers
let _result = tracker.check(0.001);
}
#[test]
fn test_max_layers_enforced() {
let config = EarlyExitConfig::default()
.with_max_layers(3)
.with_min_layers(1);
let mut tracker = EarlyExit::new(config);
tracker.check(10.0); // Layer 0
tracker.check(5.0); // Layer 1
let result = tracker.check(2.5); // Layer 2 = max - 1
assert!(result.should_exit);
assert_eq!(result.exit_reason, Some(ExitReason::MaxLayersReached));
}
#[test]
fn test_energy_convergence() {
let config = EarlyExitConfig::default()
.with_epsilon(0.1)
.with_min_layers(1)
.with_patience(1);
let mut tracker = EarlyExit::new(config);
tracker.check(1.0); // Layer 0
// Energy change > epsilon
let result = tracker.check(0.5); // Layer 1
assert!(!result.should_exit);
// Energy change < epsilon (converged)
let result = tracker.check(0.49); // Layer 2
assert!(result.should_exit);
assert_eq!(result.exit_reason, Some(ExitReason::EnergyConverged));
}
#[test]
fn test_patience() {
let config = EarlyExitConfig::default()
.with_epsilon(0.1)
.with_min_layers(1)
.with_patience(2);
let mut tracker = EarlyExit::new(config);
tracker.check(1.0); // Layer 0
// First converged step
let result = tracker.check(1.0); // Layer 1
assert!(!result.should_exit);
assert_eq!(result.converged_steps, 1);
// Second converged step (patience = 2)
let result = tracker.check(1.0); // Layer 2
assert!(result.should_exit);
assert_eq!(result.converged_steps, 2);
}
#[test]
fn test_perfect_coherence() {
let config = EarlyExitConfig::default().with_min_layers(1);
let mut tracker = EarlyExit::new(config);
tracker.check(1.0);
let result = tracker.check(0.0);
assert!(result.should_exit);
assert_eq!(result.exit_reason, Some(ExitReason::PerfectCoherence));
}
#[test]
fn test_ema_smoothing() {
let config = EarlyExitConfig::default()
.with_ema_alpha(0.5)
.with_track_history(true);
let mut tracker = EarlyExit::new(config);
tracker.check(1.0);
        let _result = tracker.check(0.0);
        // With EMA alpha = 0.5: new_ema = 0.5 * 0.0 + 0.5 * 1.0 = 0.5
        // So history should record the smoothed value, not the raw input
        assert!(tracker.energy_history().len() >= 2);
        assert!((tracker.energy_history()[1] - 0.5).abs() < 1e-6);
}
#[test]
fn test_statistics() {
let config = EarlyExitConfig::default()
.with_max_layers(10)
.with_min_layers(1)
.with_epsilon(0.1);
let mut tracker = EarlyExit::new(config);
tracker.check(1.0);
tracker.check(0.5);
tracker.check(0.25);
tracker.check(0.24); // Should exit here
let stats = tracker.statistics();
assert_eq!(stats.layers_used, 4);
assert_eq!(stats.max_layers, 10);
assert_eq!(stats.layers_saved, 6);
assert!(stats.speedup_ratio > 1.0);
assert!(stats.energy_reduction > 0.0);
}
#[test]
fn test_process_with_early_exit() {
let config = EarlyExitConfig::default()
.with_epsilon(0.1)
.with_min_layers(1)
.with_max_layers(10);
        // Create "layers" that halve the energy each time
        let layers: Vec<Box<dyn Fn(f32) -> f32>> = (0..10)
            .map(|_| Box::new(|x: f32| x * 0.5) as Box<dyn Fn(f32) -> f32>)
            .collect();
        // Exercise the function under test: energy is the state itself
        let (state, result) = process_with_early_exit(10.0f32, &layers, config, |x| *x);
        assert!(result.should_exit);
        // Should have exited before processing all 10 layers
        assert!(result.layer_index < 9);
        assert!(state < 10.0);
    }
#[test]
fn test_exit_reason_descriptions() {
assert!(!ExitReason::EnergyConverged.description().is_empty());
assert!(!ExitReason::MaxLayersReached.description().is_empty());
assert!(!ExitReason::PerfectCoherence.description().is_empty());
}
}
@@ -0,0 +1,88 @@
//! Sheaf Attention Module
//!
//! Implements Coherence-Gated Transformer (CGT) attention mechanisms based on ADR-015.
//!
//! ## Key Concepts
//!
//! - **Sheaf Attention**: Attention weights inversely proportional to residual energy
//! - **Restriction Maps**: Replace learned W_q, W_k, W_v projections with geometric maps
//! - **Token Routing**: Route tokens to compute lanes based on coherence energy
//! - **Residual-Sparse Attention**: Only attend to high-residual (incoherent) pairs
//! - **Energy-Based Early Exit**: Exit when energy converges, not confidence threshold
//!
//! ## Mathematical Foundation
//!
//! Given tokens X = {x_1, ..., x_N} and restriction maps rho_i, rho_j:
//!
//! ```text
//! Residual: r_ij = rho_i(x_i) - rho_j(x_j)
//! Edge energy: E_ij = w_ij * ||r_ij||^2
//! Token energy: E_i = sum_j E_ij
//! Attention: A_ij = exp(-beta * E_ij) / Z
//! ```
//!
//! ## Example
//!
//! ```rust
//! use ruvector_attention::sheaf::{
//! SheafAttention, SheafAttentionConfig,
//! RestrictionMap, ComputeLane, TokenRouter,
//! };
//!
//! // Create sheaf attention with default config
//! let config = SheafAttentionConfig::default();
//! let attention = SheafAttention::new(config);
//!
//! // Create restriction maps for QKV
//! let rho_q = RestrictionMap::new(64, 64);
//! let rho_k = RestrictionMap::new(64, 64);
//! let rho_v = RestrictionMap::new(64, 64);
//! ```
mod attention;
mod early_exit;
mod restriction;
mod router;
mod sparse;
pub use attention::{SheafAttention, SheafAttentionConfig};
pub use early_exit::{
process_with_early_exit, EarlyExit, EarlyExitConfig, EarlyExitResult, EarlyExitStatistics,
ExitReason,
};
pub use restriction::{RestrictionMap, RestrictionMapConfig};
pub use router::{ComputeLane, LaneStatistics, RoutingDecision, TokenRouter, TokenRouterConfig};
pub use sparse::{
ResidualSparseMask, SparseResidualAttention, SparseResidualConfig, SparsityStatistics,
};
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_module_exports() {
// Verify all public types are accessible
let config = SheafAttentionConfig::default();
assert!(config.beta > 0.0);
let rmap_config = RestrictionMapConfig::default();
assert!(rmap_config.input_dim > 0);
let router_config = TokenRouterConfig::default();
assert!(router_config.theta_reflex > 0.0);
let early_exit_config = EarlyExitConfig::default();
assert!(early_exit_config.epsilon > 0.0);
let sparse_config = SparseResidualConfig::default();
assert!(sparse_config.residual_threshold > 0.0);
}
#[test]
fn test_compute_lane_ordering() {
assert!(ComputeLane::Reflex < ComputeLane::Standard);
assert!(ComputeLane::Standard < ComputeLane::Deep);
assert!(ComputeLane::Deep < ComputeLane::Escalate);
}
}
@@ -0,0 +1,518 @@
//! Restriction Maps for Sheaf Attention
//!
//! Restriction maps replace traditional learned W_q, W_k, W_v projections
//! with geometrically meaningful transformations.
//!
//! ## Mathematical Foundation
//!
//! A restriction map rho: V_U -> V_u projects from a larger stalk to a smaller one:
//!
//! ```text
//! Linear restriction: rho(x) = Ax + b
//! Residual: r = rho_i(x_i) - rho_j(x_j)
//! Energy: E = ||r||^2
//! ```
//!
//! ## Benefits
//!
//! - Geometric meaning: projects to shared semantic space
//! - Interpretable residuals: measure semantic mismatch
//! - Can be initialized from domain knowledge
//! - Residual energy provides natural attention weighting
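//!
//! ## Example
//!
//! With the identity map, the residual reduces to an element-wise difference
//! (this mirrors the crate's own unit tests):
//!
//! ```rust
//! use ruvector_attention::sheaf::RestrictionMap;
//!
//! let rho = RestrictionMap::identity(4);
//! let x_i = vec![1.0, 2.0, 3.0, 4.0];
//! let x_j = vec![2.0, 3.0, 4.0, 5.0];
//! // Residual [-1, -1, -1, -1] => energy ||r||^2 = 4
//! let energy = rho.energy(&x_i, &x_j).unwrap();
//! assert!((energy - 4.0).abs() < 1e-6);
//! ```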
use crate::error::{AttentionError, AttentionResult};
use serde::{Deserialize, Serialize};
/// Configuration for restriction map
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RestrictionMapConfig {
/// Input dimension (stalk dimension at source)
pub input_dim: usize,
/// Output dimension (stalk dimension at target)
pub output_dim: usize,
/// Whether to include bias term
pub use_bias: bool,
/// Initialization scale (Xavier scaling)
pub init_scale: Option<f32>,
}
impl Default for RestrictionMapConfig {
fn default() -> Self {
Self {
input_dim: 64,
output_dim: 64,
use_bias: true,
init_scale: None,
}
}
}
impl RestrictionMapConfig {
/// Create config with specified dimensions
pub fn new(input_dim: usize, output_dim: usize) -> Self {
Self {
input_dim,
output_dim,
..Default::default()
}
}
/// Builder pattern: set input dimension
pub fn with_input_dim(mut self, dim: usize) -> Self {
self.input_dim = dim;
self
}
/// Builder pattern: set output dimension
pub fn with_output_dim(mut self, dim: usize) -> Self {
self.output_dim = dim;
self
}
/// Builder pattern: set bias usage
pub fn with_bias(mut self, use_bias: bool) -> Self {
self.use_bias = use_bias;
self
}
/// Builder pattern: set initialization scale
pub fn with_init_scale(mut self, scale: f32) -> Self {
self.init_scale = Some(scale);
self
}
}
/// Linear restriction map: rho(x) = Ax + b
///
/// Projects vectors from one stalk to another, preserving geometric
/// relationships while allowing dimension changes.
#[derive(Debug, Clone)]
pub struct RestrictionMap {
/// Weight matrix A: [output_dim x input_dim] stored row-major
weights: Vec<f32>,
/// Bias vector b: [output_dim]
bias: Option<Vec<f32>>,
/// Input dimension
input_dim: usize,
/// Output dimension
output_dim: usize,
}
impl RestrictionMap {
/// Create a new restriction map with Xavier initialization
pub fn new(input_dim: usize, output_dim: usize) -> Self {
Self::from_config(RestrictionMapConfig::new(input_dim, output_dim))
}
/// Create from configuration
pub fn from_config(config: RestrictionMapConfig) -> Self {
let scale = config
.init_scale
.unwrap_or_else(|| (2.0 / (config.input_dim + config.output_dim) as f32).sqrt());
// Deterministic pseudo-random initialization
let mut seed = 42u64;
let weights: Vec<f32> = (0..config.output_dim * config.input_dim)
.map(|_| {
seed = seed.wrapping_mul(6364136223846793005).wrapping_add(1);
let u = (seed as f32) / (u64::MAX as f32);
(u - 0.5) * 2.0 * scale
})
.collect();
let bias = if config.use_bias {
Some(vec![0.0; config.output_dim])
} else {
None
};
Self {
weights,
bias,
input_dim: config.input_dim,
output_dim: config.output_dim,
}
}
/// Create identity-like restriction map (for same dimension)
pub fn identity(dim: usize) -> Self {
let mut weights = vec![0.0; dim * dim];
for i in 0..dim {
weights[i * dim + i] = 1.0;
}
Self {
weights,
bias: None,
input_dim: dim,
output_dim: dim,
}
}
/// Create from existing weights
pub fn from_weights(
weights: Vec<f32>,
bias: Option<Vec<f32>>,
input_dim: usize,
output_dim: usize,
) -> AttentionResult<Self> {
if weights.len() != output_dim * input_dim {
return Err(AttentionError::DimensionMismatch {
expected: output_dim * input_dim,
actual: weights.len(),
});
}
if let Some(ref b) = bias {
if b.len() != output_dim {
return Err(AttentionError::DimensionMismatch {
expected: output_dim,
actual: b.len(),
});
}
}
Ok(Self {
weights,
bias,
input_dim,
output_dim,
})
}
/// Apply restriction map: rho(x) = Ax + b
///
/// # Arguments
///
/// * `x` - Input vector of shape [input_dim]
///
/// # Returns
///
/// Output vector of shape [output_dim]
pub fn apply(&self, x: &[f32]) -> AttentionResult<Vec<f32>> {
if x.len() != self.input_dim {
return Err(AttentionError::DimensionMismatch {
expected: self.input_dim,
actual: x.len(),
});
}
// Matrix-vector multiplication: y = Ax
let mut y = vec![0.0; self.output_dim];
for i in 0..self.output_dim {
let row_start = i * self.input_dim;
y[i] = x
.iter()
.enumerate()
.map(|(j, &xj)| self.weights[row_start + j] * xj)
.sum();
}
// Add bias: y = Ax + b
if let Some(ref b) = self.bias {
for (yi, bi) in y.iter_mut().zip(b.iter()) {
*yi += bi;
}
}
Ok(y)
}
/// Apply restriction map to batch of vectors
///
/// # Arguments
///
/// * `batch` - Batch of input vectors
///
/// # Returns
///
/// Batch of output vectors
pub fn apply_batch(&self, batch: &[&[f32]]) -> AttentionResult<Vec<Vec<f32>>> {
batch.iter().map(|x| self.apply(x)).collect()
}
/// Compute residual between two restricted vectors
///
/// r_ij = rho(x_i) - rho(x_j)
///
/// # Arguments
///
/// * `x_i` - First input vector
/// * `x_j` - Second input vector
///
/// # Returns
///
/// Residual vector
pub fn residual(&self, x_i: &[f32], x_j: &[f32]) -> AttentionResult<Vec<f32>> {
let rho_i = self.apply(x_i)?;
let rho_j = self.apply(x_j)?;
Ok(rho_i
.iter()
.zip(rho_j.iter())
.map(|(&a, &b)| a - b)
.collect())
}
/// Compute residual energy (squared L2 norm of residual)
///
/// E_ij = ||rho(x_i) - rho(x_j)||^2
///
/// # Arguments
///
/// * `x_i` - First input vector
/// * `x_j` - Second input vector
///
/// # Returns
///
/// Residual energy (non-negative scalar)
pub fn energy(&self, x_i: &[f32], x_j: &[f32]) -> AttentionResult<f32> {
let residual = self.residual(x_i, x_j)?;
Ok(residual.iter().map(|r| r * r).sum())
}
/// Compute weighted residual energy
///
/// E_ij = w * ||rho(x_i) - rho(x_j)||^2
///
/// # Arguments
///
/// * `x_i` - First input vector
/// * `x_j` - Second input vector
/// * `weight` - Edge weight
///
/// # Returns
///
/// Weighted residual energy
pub fn weighted_energy(&self, x_i: &[f32], x_j: &[f32], weight: f32) -> AttentionResult<f32> {
Ok(weight * self.energy(x_i, x_j)?)
}
/// Compute energy matrix for all pairs
///
/// E[i,j] = ||rho(x_i) - rho(x_j)||^2
///
/// # Arguments
///
/// * `vectors` - Input vectors
///
/// # Returns
///
/// Energy matrix [N x N] stored row-major
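    ///
    /// # Example
    ///
    /// For an identity map over orthonormal basis vectors, off-diagonal
    /// energies are ||e_i - e_j||^2 = 2, as in the crate's unit tests:
    ///
    /// ```rust
    /// use ruvector_attention::sheaf::RestrictionMap;
    ///
    /// let rho = RestrictionMap::identity(3);
    /// let v1 = vec![1.0, 0.0, 0.0];
    /// let v2 = vec![0.0, 1.0, 0.0];
    /// let vectors: Vec<&[f32]> = vec![&v1, &v2];
    /// let e = rho.energy_matrix(&vectors).unwrap();
    /// assert!(e[0].abs() < 1e-6);          // E[0,0] = 0
    /// assert!((e[1] - 2.0).abs() < 1e-6);  // E[0,1] = 2
    /// ```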
pub fn energy_matrix(&self, vectors: &[&[f32]]) -> AttentionResult<Vec<f32>> {
let n = vectors.len();
// First, apply restriction map to all vectors
let restricted: Vec<Vec<f32>> = vectors
.iter()
.map(|v| self.apply(v))
.collect::<AttentionResult<_>>()?;
// Compute pairwise energies
let mut energies = vec![0.0; n * n];
for i in 0..n {
for j in 0..n {
if i == j {
energies[i * n + j] = 0.0;
} else {
let energy: f32 = restricted[i]
.iter()
.zip(restricted[j].iter())
.map(|(&a, &b)| (a - b) * (a - b))
.sum();
energies[i * n + j] = energy;
}
}
}
Ok(energies)
}
/// Get input dimension
pub fn input_dim(&self) -> usize {
self.input_dim
}
/// Get output dimension
pub fn output_dim(&self) -> usize {
self.output_dim
}
/// Get weight matrix (read-only)
pub fn weights(&self) -> &[f32] {
&self.weights
}
/// Get mutable weight matrix (for training)
pub fn weights_mut(&mut self) -> &mut [f32] {
&mut self.weights
}
/// Get bias vector (read-only)
pub fn bias(&self) -> Option<&[f32]> {
self.bias.as_deref()
}
/// Get mutable bias vector (for training)
pub fn bias_mut(&mut self) -> Option<&mut [f32]> {
self.bias.as_deref_mut()
}
/// Update weights with gradient
pub fn update_weights(&mut self, gradients: &[f32], learning_rate: f32) {
if gradients.len() == self.weights.len() {
for (w, g) in self.weights.iter_mut().zip(gradients.iter()) {
*w -= learning_rate * g;
}
}
}
/// Update bias with gradient
pub fn update_bias(&mut self, gradients: &[f32], learning_rate: f32) {
if let Some(ref mut bias) = self.bias {
if gradients.len() == bias.len() {
for (b, g) in bias.iter_mut().zip(gradients.iter()) {
*b -= learning_rate * g;
}
}
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_restriction_map_creation() {
let rmap = RestrictionMap::new(64, 32);
assert_eq!(rmap.input_dim(), 64);
assert_eq!(rmap.output_dim(), 32);
assert_eq!(rmap.weights().len(), 64 * 32);
assert!(rmap.bias().is_some());
}
#[test]
fn test_identity_restriction() {
let rmap = RestrictionMap::identity(4);
let x = vec![1.0, 2.0, 3.0, 4.0];
let y = rmap.apply(&x).unwrap();
for (xi, yi) in x.iter().zip(y.iter()) {
assert!((xi - yi).abs() < 1e-6);
}
}
#[test]
fn test_apply() {
let rmap = RestrictionMap::new(4, 3);
let x = vec![1.0, 2.0, 3.0, 4.0];
let y = rmap.apply(&x).unwrap();
assert_eq!(y.len(), 3);
}
#[test]
fn test_apply_dimension_mismatch() {
let rmap = RestrictionMap::new(4, 3);
let x = vec![1.0, 2.0]; // Wrong dimension
assert!(rmap.apply(&x).is_err());
}
#[test]
fn test_residual() {
let rmap = RestrictionMap::identity(4);
let x_i = vec![1.0, 2.0, 3.0, 4.0];
let x_j = vec![2.0, 3.0, 4.0, 5.0];
let residual = rmap.residual(&x_i, &x_j).unwrap();
// Should be x_i - x_j = [-1, -1, -1, -1]
for r in &residual {
assert!((*r + 1.0).abs() < 1e-6);
}
}
#[test]
fn test_energy() {
let rmap = RestrictionMap::identity(4);
let x_i = vec![1.0, 2.0, 3.0, 4.0];
let x_j = vec![2.0, 3.0, 4.0, 5.0];
let energy = rmap.energy(&x_i, &x_j).unwrap();
// Residual = [-1, -1, -1, -1], energy = 4
assert!((energy - 4.0).abs() < 1e-6);
}
#[test]
fn test_energy_symmetry() {
let rmap = RestrictionMap::new(8, 8);
let x_i = vec![1.0; 8];
let x_j = vec![0.5; 8];
let e_ij = rmap.energy(&x_i, &x_j).unwrap();
let e_ji = rmap.energy(&x_j, &x_i).unwrap();
assert!((e_ij - e_ji).abs() < 1e-6);
}
#[test]
fn test_energy_matrix() {
let rmap = RestrictionMap::identity(4);
let v1 = vec![1.0, 0.0, 0.0, 0.0];
let v2 = vec![0.0, 1.0, 0.0, 0.0];
let v3 = vec![0.0, 0.0, 1.0, 0.0];
let vectors: Vec<&[f32]> = vec![&v1, &v2, &v3];
let energies = rmap.energy_matrix(&vectors).unwrap();
// Diagonal should be 0
assert!(energies[0].abs() < 1e-6); // E[0,0]
assert!(energies[4].abs() < 1e-6); // E[1,1]
assert!(energies[8].abs() < 1e-6); // E[2,2]
// Off-diagonal: ||e_i - e_j||^2 = 2 for orthonormal basis
assert!((energies[1] - 2.0).abs() < 1e-6); // E[0,1]
assert!((energies[3] - 2.0).abs() < 1e-6); // E[1,0]
}
#[test]
fn test_batch_apply() {
let rmap = RestrictionMap::new(4, 3);
let v1 = vec![1.0; 4];
let v2 = vec![2.0; 4];
let batch: Vec<&[f32]> = vec![&v1, &v2];
let results = rmap.apply_batch(&batch).unwrap();
assert_eq!(results.len(), 2);
assert_eq!(results[0].len(), 3);
assert_eq!(results[1].len(), 3);
}
#[test]
fn test_from_weights() {
let weights = vec![1.0, 0.0, 0.0, 1.0]; // 2x2 identity
let bias = Some(vec![0.5, 0.5]);
let rmap = RestrictionMap::from_weights(weights, bias, 2, 2).unwrap();
let x = vec![1.0, 2.0];
let y = rmap.apply(&x).unwrap();
assert!((y[0] - 1.5).abs() < 1e-6); // 1*1 + 0*2 + 0.5
assert!((y[1] - 2.5).abs() < 1e-6); // 0*1 + 1*2 + 0.5
}
#[test]
fn test_config_builder() {
let config = RestrictionMapConfig::default()
.with_input_dim(128)
.with_output_dim(64)
.with_bias(false)
.with_init_scale(0.1);
assert_eq!(config.input_dim, 128);
assert_eq!(config.output_dim, 64);
assert!(!config.use_bias);
assert_eq!(config.init_scale, Some(0.1));
}
}
@@ -0,0 +1,665 @@
//! Token Router for Coherence-Gated Transformer
//!
//! Routes tokens to different compute lanes based on coherence energy:
//!
//! - **Reflex** (Lane 0): E < theta_reflex, minimal compute (<0.1ms)
//! - **Standard** (Lane 1): E < theta_standard, normal compute (~1ms)
//! - **Deep** (Lane 2): E >= theta_standard, maximum compute (~5ms)
//! - **Escalate** (Lane 3): Irreconcilable incoherence, return uncertainty
//!
//! ## Routing Thresholds
//!
//! | Threshold | Default | Meaning |
//! |-----------|---------|---------|
//! | theta_reflex | 0.01 | Token highly coherent with context |
//! | theta_standard | 0.1 | Minor inconsistencies |
//! | theta_deep | 1.0 | Major inconsistencies |
//! | theta_escalate | 10.0 | Irreconcilable (escalate) |
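//!
//! ## Example
//!
//! Routing by pre-computed energy against the default thresholds:
//!
//! ```rust
//! use ruvector_attention::sheaf::{ComputeLane, TokenRouter, TokenRouterConfig};
//!
//! let router = TokenRouter::new(TokenRouterConfig::default());
//! assert_eq!(router.route_by_energy(0.005), ComputeLane::Reflex);
//! assert_eq!(router.route_by_energy(0.05), ComputeLane::Standard);
//! assert_eq!(router.route_by_energy(2.0), ComputeLane::Deep);
//! assert_eq!(router.route_by_energy(50.0), ComputeLane::Escalate);
//! ```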
use crate::error::{AttentionError, AttentionResult};
use crate::sheaf::SheafAttention;
use serde::{Deserialize, Serialize};
/// Compute lane for token processing
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
pub enum ComputeLane {
/// Minimal compute (<0.1ms): 1-2 layers, local attention, no FFN
/// Use case: Common tokens, clear context
Reflex = 0,
/// Standard compute (~1ms): 6 layers, sparse sheaf attention
/// Use case: Normal tokens requiring context integration
Standard = 1,
/// Deep compute (~5ms): 12+ layers, full sheaf + MoE
/// Use case: Ambiguous, contradictory, or complex tokens
Deep = 2,
/// Escalate: Return uncertainty, request clarification
/// Use case: Irreconcilable incoherence
Escalate = 3,
}
impl ComputeLane {
/// Get human-readable description
pub fn description(&self) -> &'static str {
match self {
Self::Reflex => "Reflex (minimal compute)",
Self::Standard => "Standard (normal compute)",
Self::Deep => "Deep (maximum compute)",
Self::Escalate => "Escalate (return uncertainty)",
}
}
/// Get typical latency in milliseconds
pub fn typical_latency_ms(&self) -> f32 {
match self {
Self::Reflex => 0.1,
Self::Standard => 1.0,
Self::Deep => 5.0,
Self::Escalate => 0.0, // Async/immediate return
}
}
/// Get typical number of layers
pub fn typical_layers(&self) -> usize {
match self {
Self::Reflex => 2,
Self::Standard => 6,
Self::Deep => 12,
Self::Escalate => 0,
}
}
/// Check if this lane requires full attention
pub fn requires_full_attention(&self) -> bool {
matches!(self, Self::Deep)
}
/// Check if this lane uses MoE routing
pub fn uses_moe(&self) -> bool {
matches!(self, Self::Deep)
}
}
/// Configuration for token router
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TokenRouterConfig {
/// Energy threshold for reflex lane (E < theta_reflex -> Reflex)
pub theta_reflex: f32,
/// Energy threshold for standard lane (E < theta_standard -> Standard)
pub theta_standard: f32,
/// Energy threshold for deep lane (E < theta_deep -> Deep)
pub theta_deep: f32,
/// Energy threshold for escalation (E >= theta_escalate -> Escalate)
pub theta_escalate: f32,
/// Whether to use average energy (true) or total energy (false)
pub use_average_energy: bool,
/// Minimum context size for routing (smaller contexts default to Standard)
pub min_context_size: usize,
}
impl Default for TokenRouterConfig {
fn default() -> Self {
Self {
theta_reflex: 0.01,
theta_standard: 0.1,
theta_deep: 1.0,
theta_escalate: 10.0,
use_average_energy: true,
min_context_size: 4,
}
}
}
impl TokenRouterConfig {
/// Create config with custom thresholds
pub fn new(theta_reflex: f32, theta_standard: f32, theta_deep: f32) -> Self {
Self {
theta_reflex,
theta_standard,
theta_deep,
theta_escalate: theta_deep * 10.0,
..Default::default()
}
}
/// Builder: set reflex threshold
pub fn with_theta_reflex(mut self, theta: f32) -> Self {
self.theta_reflex = theta;
self
}
/// Builder: set standard threshold
pub fn with_theta_standard(mut self, theta: f32) -> Self {
self.theta_standard = theta;
self
}
/// Builder: set deep threshold
pub fn with_theta_deep(mut self, theta: f32) -> Self {
self.theta_deep = theta;
self
}
/// Builder: set escalate threshold
pub fn with_theta_escalate(mut self, theta: f32) -> Self {
self.theta_escalate = theta;
self
}
/// Builder: set energy computation method
pub fn with_average_energy(mut self, use_avg: bool) -> Self {
self.use_average_energy = use_avg;
self
}
/// Builder: set minimum context size
pub fn with_min_context_size(mut self, size: usize) -> Self {
self.min_context_size = size;
self
}
/// Validate configuration
pub fn validate(&self) -> AttentionResult<()> {
if self.theta_reflex <= 0.0 {
return Err(AttentionError::InvalidConfig(
"theta_reflex must be positive".to_string(),
));
}
if self.theta_standard <= self.theta_reflex {
return Err(AttentionError::InvalidConfig(
"theta_standard must be greater than theta_reflex".to_string(),
));
}
if self.theta_deep <= self.theta_standard {
return Err(AttentionError::InvalidConfig(
"theta_deep must be greater than theta_standard".to_string(),
));
}
if self.theta_escalate <= self.theta_deep {
return Err(AttentionError::InvalidConfig(
"theta_escalate must be greater than theta_deep".to_string(),
));
}
Ok(())
}
}
/// Routing decision for a token
#[derive(Debug, Clone)]
pub struct RoutingDecision {
/// Token index in sequence
pub token_idx: usize,
/// Computed energy for the token
pub energy: f32,
/// Assigned compute lane
pub lane: ComputeLane,
/// Confidence in the routing decision (0-1)
pub confidence: f32,
/// Optional sparse mask indices (for Standard lane)
pub sparse_indices: Option<Vec<usize>>,
}
impl RoutingDecision {
/// Create a new routing decision
pub fn new(token_idx: usize, energy: f32, lane: ComputeLane) -> Self {
// Confidence based on how clearly the energy falls into a lane
let confidence = 1.0; // Can be refined based on energy distance to thresholds
Self {
token_idx,
energy,
lane,
confidence,
sparse_indices: None,
}
}
/// Set sparse indices for this decision
pub fn with_sparse_indices(mut self, indices: Vec<usize>) -> Self {
self.sparse_indices = Some(indices);
self
}
/// Check if this token needs attention
pub fn needs_attention(&self) -> bool {
!matches!(self.lane, ComputeLane::Escalate)
}
}
/// Token router for coherence-gated transformer
pub struct TokenRouter {
config: TokenRouterConfig,
}
impl TokenRouter {
/// Create a new token router
pub fn new(config: TokenRouterConfig) -> Self {
Self { config }
}
/// Create with default configuration
pub fn default_router() -> Self {
Self::new(TokenRouterConfig::default())
}
/// Get configuration
pub fn config(&self) -> &TokenRouterConfig {
&self.config
}
/// Get mutable configuration (for SONA tuning)
pub fn config_mut(&mut self) -> &mut TokenRouterConfig {
&mut self.config
}
/// Route a single token based on energy
///
/// # Arguments
///
/// * `energy` - Pre-computed energy for the token
///
/// # Returns
///
/// Compute lane for this token
pub fn route_by_energy(&self, energy: f32) -> ComputeLane {
if energy < self.config.theta_reflex {
ComputeLane::Reflex
} else if energy < self.config.theta_standard {
ComputeLane::Standard
} else if energy < self.config.theta_escalate {
ComputeLane::Deep
} else {
ComputeLane::Escalate
}
}
/// Route a single token using sheaf attention
///
/// # Arguments
///
/// * `token` - Token embedding
/// * `context` - Context embeddings (keys)
/// * `attention` - Sheaf attention layer for energy computation
///
/// # Returns
///
/// Routing decision for this token
pub fn route_token(
&self,
token_idx: usize,
token: &[f32],
context: &[&[f32]],
attention: &SheafAttention,
) -> AttentionResult<RoutingDecision> {
// Handle small contexts
if context.len() < self.config.min_context_size {
return Ok(RoutingDecision::new(token_idx, 0.0, ComputeLane::Standard));
}
// Compute energy
let energy = if self.config.use_average_energy {
attention.average_token_energy(token, context)?
} else {
attention.token_energy(token, context)?
};
let lane = self.route_by_energy(energy);
Ok(RoutingDecision::new(token_idx, energy, lane))
}
/// Route a batch of tokens
///
/// # Arguments
///
/// * `tokens` - Token embeddings
/// * `context` - Shared context embeddings
/// * `attention` - Sheaf attention layer
///
/// # Returns
///
/// Vector of routing decisions
pub fn route_batch(
&self,
tokens: &[&[f32]],
context: &[&[f32]],
attention: &SheafAttention,
) -> AttentionResult<Vec<RoutingDecision>> {
tokens
.iter()
.enumerate()
.map(|(idx, token)| self.route_token(idx, token, context, attention))
.collect()
}
/// Group tokens by their assigned lane
///
/// Returns (reflex_indices, standard_indices, deep_indices, escalate_indices)
pub fn group_by_lane(
decisions: &[RoutingDecision],
) -> (Vec<usize>, Vec<usize>, Vec<usize>, Vec<usize>) {
let mut reflex = Vec::new();
let mut standard = Vec::new();
let mut deep = Vec::new();
let mut escalate = Vec::new();
for decision in decisions {
match decision.lane {
ComputeLane::Reflex => reflex.push(decision.token_idx),
ComputeLane::Standard => standard.push(decision.token_idx),
ComputeLane::Deep => deep.push(decision.token_idx),
ComputeLane::Escalate => escalate.push(decision.token_idx),
}
}
(reflex, standard, deep, escalate)
}
/// Compute lane statistics for a batch of decisions
pub fn lane_statistics(decisions: &[RoutingDecision]) -> LaneStatistics {
let total = decisions.len();
let (reflex, standard, deep, escalate) = Self::group_by_lane(decisions);
let avg_energy = if total > 0 {
decisions.iter().map(|d| d.energy).sum::<f32>() / total as f32
} else {
0.0
};
let max_energy = decisions.iter().map(|d| d.energy).fold(0.0f32, f32::max);
let min_energy = decisions
.iter()
.map(|d| d.energy)
.fold(f32::INFINITY, f32::min);
LaneStatistics {
total_tokens: total,
reflex_count: reflex.len(),
standard_count: standard.len(),
deep_count: deep.len(),
escalate_count: escalate.len(),
average_energy: avg_energy,
max_energy,
min_energy: if min_energy.is_infinite() {
0.0
} else {
min_energy
},
}
}
/// Estimate total latency for a batch based on routing
pub fn estimate_latency_ms(decisions: &[RoutingDecision]) -> f32 {
decisions.iter().map(|d| d.lane.typical_latency_ms()).sum()
}
/// Update thresholds based on desired lane distribution
///
/// This can be used by SONA for adaptive tuning.
pub fn tune_thresholds(
&mut self,
current_stats: &LaneStatistics,
target_reflex_ratio: f32,
target_standard_ratio: f32,
) {
let total = current_stats.total_tokens as f32;
if total == 0.0 {
return;
}
let current_reflex_ratio = current_stats.reflex_count as f32 / total;
let current_standard_ratio = current_stats.standard_count as f32 / total;
// Adjust thresholds to move towards target ratios
// More reflex needed -> increase theta_reflex
// Less reflex needed -> decrease theta_reflex
let reflex_adjustment = (target_reflex_ratio - current_reflex_ratio) * 0.1;
let standard_adjustment = (target_standard_ratio - current_standard_ratio) * 0.1;
// Apply adjustments while maintaining ordering
self.config.theta_reflex = (self.config.theta_reflex * (1.0 + reflex_adjustment))
.max(0.001)
.min(self.config.theta_standard * 0.9);
self.config.theta_standard = (self.config.theta_standard * (1.0 + standard_adjustment))
.max(self.config.theta_reflex * 1.1)
.min(self.config.theta_deep * 0.9);
}
}
/// Statistics about lane distribution
#[derive(Debug, Clone)]
pub struct LaneStatistics {
/// Total number of tokens routed
pub total_tokens: usize,
/// Tokens routed to Reflex lane
pub reflex_count: usize,
/// Tokens routed to Standard lane
pub standard_count: usize,
/// Tokens routed to Deep lane
pub deep_count: usize,
/// Tokens escalated
pub escalate_count: usize,
/// Average energy across all tokens
pub average_energy: f32,
/// Maximum energy
pub max_energy: f32,
/// Minimum energy
pub min_energy: f32,
}
impl LaneStatistics {
/// Get ratio of tokens in reflex lane
pub fn reflex_ratio(&self) -> f32 {
if self.total_tokens == 0 {
0.0
} else {
self.reflex_count as f32 / self.total_tokens as f32
}
}
/// Get ratio of tokens in standard lane
pub fn standard_ratio(&self) -> f32 {
if self.total_tokens == 0 {
0.0
} else {
self.standard_count as f32 / self.total_tokens as f32
}
}
/// Get ratio of tokens in deep lane
pub fn deep_ratio(&self) -> f32 {
if self.total_tokens == 0 {
0.0
} else {
self.deep_count as f32 / self.total_tokens as f32
}
}
/// Get ratio of escalated tokens
pub fn escalate_ratio(&self) -> f32 {
if self.total_tokens == 0 {
0.0
} else {
self.escalate_count as f32 / self.total_tokens as f32
}
}
/// Estimated speedup compared to all-deep processing
pub fn estimated_speedup(&self) -> f32 {
if self.total_tokens == 0 {
1.0
} else {
let deep_latency = self.total_tokens as f32 * ComputeLane::Deep.typical_latency_ms();
let actual_latency = self.reflex_count as f32
* ComputeLane::Reflex.typical_latency_ms()
+ self.standard_count as f32 * ComputeLane::Standard.typical_latency_ms()
+ self.deep_count as f32 * ComputeLane::Deep.typical_latency_ms();
if actual_latency > 0.0 {
deep_latency / actual_latency
} else {
1.0
}
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::sheaf::SheafAttentionConfig;
#[test]
fn test_compute_lane_ordering() {
assert!(ComputeLane::Reflex < ComputeLane::Standard);
assert!(ComputeLane::Standard < ComputeLane::Deep);
assert!(ComputeLane::Deep < ComputeLane::Escalate);
}
#[test]
fn test_lane_properties() {
assert_eq!(ComputeLane::Reflex.typical_layers(), 2);
assert_eq!(ComputeLane::Standard.typical_layers(), 6);
assert_eq!(ComputeLane::Deep.typical_layers(), 12);
assert!(!ComputeLane::Reflex.requires_full_attention());
assert!(!ComputeLane::Standard.requires_full_attention());
assert!(ComputeLane::Deep.requires_full_attention());
assert!(!ComputeLane::Reflex.uses_moe());
assert!(ComputeLane::Deep.uses_moe());
}
#[test]
fn test_config_default() {
let config = TokenRouterConfig::default();
assert!(config.theta_reflex < config.theta_standard);
assert!(config.theta_standard < config.theta_deep);
assert!(config.theta_deep < config.theta_escalate);
}
#[test]
fn test_config_validation() {
assert!(TokenRouterConfig::default().validate().is_ok());
let bad_config = TokenRouterConfig {
theta_reflex: 0.1,
theta_standard: 0.05, // Less than reflex
..Default::default()
};
assert!(bad_config.validate().is_err());
}
#[test]
fn test_route_by_energy() {
let router = TokenRouter::default_router();
assert_eq!(router.route_by_energy(0.001), ComputeLane::Reflex);
assert_eq!(router.route_by_energy(0.05), ComputeLane::Standard);
assert_eq!(router.route_by_energy(0.5), ComputeLane::Deep);
assert_eq!(router.route_by_energy(100.0), ComputeLane::Escalate);
}
#[test]
fn test_route_token() {
let router = TokenRouter::default_router();
let config = SheafAttentionConfig::new(8);
let attention = SheafAttention::new(config);
let token = vec![1.0; 8];
let c1 = vec![1.0; 8];
let c2 = vec![1.0; 8];
let c3 = vec![1.0; 8];
let c4 = vec![1.0; 8];
let context: Vec<&[f32]> = vec![&c1, &c2, &c3, &c4];
let decision = router.route_token(0, &token, &context, &attention).unwrap();
assert_eq!(decision.token_idx, 0);
assert!(decision.energy >= 0.0);
}
#[test]
fn test_route_batch() {
let router = TokenRouter::default_router();
let config = SheafAttentionConfig::new(8);
let attention = SheafAttention::new(config);
let t1 = vec![1.0; 8];
let t2 = vec![0.5; 8];
let tokens: Vec<&[f32]> = vec![&t1, &t2];
let c1 = vec![1.0; 8];
let c2 = vec![1.0; 8];
let c3 = vec![1.0; 8];
let c4 = vec![1.0; 8];
let context: Vec<&[f32]> = vec![&c1, &c2, &c3, &c4];
let decisions = router.route_batch(&tokens, &context, &attention).unwrap();
assert_eq!(decisions.len(), 2);
}
#[test]
fn test_group_by_lane() {
let decisions = vec![
RoutingDecision::new(0, 0.001, ComputeLane::Reflex),
RoutingDecision::new(1, 0.05, ComputeLane::Standard),
RoutingDecision::new(2, 0.5, ComputeLane::Deep),
RoutingDecision::new(3, 0.002, ComputeLane::Reflex),
];
let (reflex, standard, deep, escalate) = TokenRouter::group_by_lane(&decisions);
assert_eq!(reflex, vec![0, 3]);
assert_eq!(standard, vec![1]);
assert_eq!(deep, vec![2]);
assert!(escalate.is_empty());
}
#[test]
fn test_lane_statistics() {
let decisions = vec![
RoutingDecision::new(0, 0.001, ComputeLane::Reflex),
RoutingDecision::new(1, 0.05, ComputeLane::Standard),
RoutingDecision::new(2, 0.5, ComputeLane::Deep),
RoutingDecision::new(3, 0.002, ComputeLane::Reflex),
];
let stats = TokenRouter::lane_statistics(&decisions);
assert_eq!(stats.total_tokens, 4);
assert_eq!(stats.reflex_count, 2);
assert_eq!(stats.standard_count, 1);
assert_eq!(stats.deep_count, 1);
assert_eq!(stats.escalate_count, 0);
assert!((stats.reflex_ratio() - 0.5).abs() < 1e-6);
assert!(stats.estimated_speedup() > 1.0);
}
#[test]
fn test_routing_decision_builder() {
let decision =
RoutingDecision::new(0, 0.1, ComputeLane::Standard).with_sparse_indices(vec![1, 3, 5]);
assert!(decision.sparse_indices.is_some());
assert_eq!(decision.sparse_indices.unwrap(), vec![1, 3, 5]);
}
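    /// Standalone sketch (std-only, hypothetical threshold values; not part
    /// of the crate API) of the proportional update in `tune_thresholds`:
    /// nudge a threshold by 10% of the ratio error, clamped to keep lane
    /// ordering intact.
    #[test]
    fn test_threshold_tuning_sketch() {
        let (mut theta_reflex, theta_standard) = (0.01f32, 0.1f32);
        let (target_reflex_ratio, current_reflex_ratio) = (0.5f32, 0.2f32);
        // Positive error -> raise theta_reflex so more tokens route to Reflex.
        let adjustment = (target_reflex_ratio - current_reflex_ratio) * 0.1;
        theta_reflex = (theta_reflex * (1.0 + adjustment))
            .max(0.001)
            .min(theta_standard * 0.9);
        assert!((theta_reflex - 0.0103).abs() < 1e-6);
        assert!(theta_reflex < theta_standard); // ordering preserved
    }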
#[test]
fn test_small_context_default() {
let router = TokenRouter::default_router();
let config = SheafAttentionConfig::new(8);
let attention = SheafAttention::new(config);
let token = vec![1.0; 8];
let c1 = vec![1.0; 8];
let context: Vec<&[f32]> = vec![&c1]; // Small context
let decision = router.route_token(0, &token, &context, &attention).unwrap();
assert_eq!(decision.lane, ComputeLane::Standard); // Default for small context
}
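    /// Standalone sketch (std-only, hypothetical threshold values) of the
    /// comparison chain in `route_by_energy`: ascending thresholds carve the
    /// energy axis into Reflex / Standard / Deep / Escalate bands.
    #[test]
    fn test_energy_band_sketch() {
        let (theta_reflex, theta_standard, theta_escalate) = (0.01f32, 0.1f32, 10.0f32);
        let band = |e: f32| -> usize {
            if e < theta_reflex {
                0 // Reflex
            } else if e < theta_standard {
                1 // Standard
            } else if e < theta_escalate {
                2 // Deep
            } else {
                3 // Escalate
            }
        };
        assert_eq!(band(0.001), 0);
        assert_eq!(band(0.05), 1);
        assert_eq!(band(0.5), 2);
        assert_eq!(band(100.0), 3);
    }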
}


@@ -0,0 +1,711 @@
//! Residual-Sparse Attention
//!
//! Generates sparse attention masks based on residual energy.
//! Only computes attention for token pairs with high residuals (incoherent).
//!
//! ## Key Insight
//!
//! Tokens that are already coherent (low residual) don't need expensive attention.
//! By only attending to high-residual pairs, we can achieve significant speedups
//! while maintaining quality.
//!
//! ## Sparsity Pattern
//!
//! Unlike fixed patterns (local, strided), residual-sparse attention adapts to content:
//! - Coherent regions: Few attention connections
//! - Incoherent regions: More attention connections
use crate::error::{AttentionError, AttentionResult};
use crate::sheaf::restriction::RestrictionMap;
use crate::traits::SparseMask;
use serde::{Deserialize, Serialize};
/// Configuration for residual-sparse attention
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SparseResidualConfig {
/// Residual threshold: only attend if residual > threshold
pub residual_threshold: f32,
/// Maximum sparsity ratio (0.0 = full dense, 1.0 = maximally sparse)
pub max_sparsity: f32,
/// Minimum connections per query (ensure each query attends to at least k keys)
pub min_connections: usize,
/// Whether to always include self-attention (diagonal)
pub include_self: bool,
/// Whether to include local window regardless of residual
pub local_window: Option<usize>,
}
impl Default for SparseResidualConfig {
fn default() -> Self {
Self {
residual_threshold: 0.05,
max_sparsity: 0.9,
min_connections: 1,
include_self: true,
local_window: Some(8),
}
}
}
impl SparseResidualConfig {
/// Create with residual threshold
pub fn new(residual_threshold: f32) -> Self {
Self {
residual_threshold,
..Default::default()
}
}
/// Builder: set residual threshold
pub fn with_residual_threshold(mut self, threshold: f32) -> Self {
self.residual_threshold = threshold;
self
}
/// Builder: set max sparsity
pub fn with_max_sparsity(mut self, sparsity: f32) -> Self {
self.max_sparsity = sparsity.clamp(0.0, 1.0);
self
}
/// Builder: set minimum connections
pub fn with_min_connections(mut self, min: usize) -> Self {
self.min_connections = min;
self
}
/// Builder: set self-attention inclusion
pub fn with_self_attention(mut self, include: bool) -> Self {
self.include_self = include;
self
}
/// Builder: set local window
pub fn with_local_window(mut self, window: Option<usize>) -> Self {
self.local_window = window;
self
}
/// Validate configuration
pub fn validate(&self) -> AttentionResult<()> {
if self.residual_threshold < 0.0 {
return Err(AttentionError::InvalidConfig(
"residual_threshold must be non-negative".to_string(),
));
}
if self.max_sparsity < 0.0 || self.max_sparsity > 1.0 {
return Err(AttentionError::InvalidConfig(
"max_sparsity must be in [0, 1]".to_string(),
));
}
Ok(())
}
}
/// Sparse mask based on residual energy
#[derive(Debug, Clone)]
pub struct ResidualSparseMask {
/// Number of queries
pub n_queries: usize,
/// Number of keys
pub n_keys: usize,
/// Sparse mask indices: (query_idx, key_idx) pairs
pub connections: Vec<(usize, usize)>,
/// Optional residual values for each connection
pub residuals: Option<Vec<f32>>,
/// Sparsity ratio achieved
pub sparsity: f32,
}
impl ResidualSparseMask {
/// Create from connections
pub fn new(n_queries: usize, n_keys: usize, connections: Vec<(usize, usize)>) -> Self {
let total_possible = n_queries * n_keys;
let sparsity = if total_possible > 0 {
1.0 - (connections.len() as f32 / total_possible as f32)
} else {
0.0
};
Self {
n_queries,
n_keys,
connections,
residuals: None,
sparsity,
}
}
/// Create with residual values
pub fn with_residuals(
n_queries: usize,
n_keys: usize,
connections: Vec<(usize, usize)>,
residuals: Vec<f32>,
) -> Self {
let total_possible = n_queries * n_keys;
let sparsity = if total_possible > 0 {
1.0 - (connections.len() as f32 / total_possible as f32)
} else {
0.0
};
Self {
n_queries,
n_keys,
connections,
residuals: Some(residuals),
sparsity,
}
}
/// Get number of non-zero connections
pub fn nnz(&self) -> usize {
self.connections.len()
}
/// Convert to dense boolean mask
pub fn to_dense_mask(&self) -> Vec<bool> {
let mut mask = vec![false; self.n_queries * self.n_keys];
for &(i, j) in &self.connections {
mask[i * self.n_keys + j] = true;
}
mask
}
/// Convert to SparseMask (for Attention trait compatibility)
pub fn to_sparse_mask(&self) -> SparseMask {
let rows: Vec<usize> = self.connections.iter().map(|(i, _)| *i).collect();
let cols: Vec<usize> = self.connections.iter().map(|(_, j)| *j).collect();
SparseMask {
rows,
cols,
values: self.residuals.clone(),
}
}
/// Get connections for a specific query
pub fn query_connections(&self, query_idx: usize) -> Vec<usize> {
self.connections
.iter()
.filter_map(|&(i, j)| if i == query_idx { Some(j) } else { None })
.collect()
}
/// Get connections as CSR format (row pointers and column indices)
pub fn to_csr(&self) -> (Vec<usize>, Vec<usize>) {
let mut row_ptr = vec![0; self.n_queries + 1];
let mut col_idx = Vec::with_capacity(self.connections.len());
// Count connections per query
for &(i, _) in &self.connections {
row_ptr[i + 1] += 1;
}
// Cumulative sum
for i in 1..=self.n_queries {
row_ptr[i] += row_ptr[i - 1];
}
        // Fill column indices; within each row, columns keep their input order
        // (generate_mask sorts connections by (query, key), yielding sorted columns)
let mut current_row = vec![0; self.n_queries];
col_idx.resize(self.connections.len(), 0);
for &(i, j) in &self.connections {
let pos = row_ptr[i] + current_row[i];
col_idx[pos] = j;
current_row[i] += 1;
}
(row_ptr, col_idx)
}
}
/// Sparse attention layer based on residual energy
pub struct SparseResidualAttention {
config: SparseResidualConfig,
/// Restriction map for computing residuals
restriction_map: RestrictionMap,
}
impl SparseResidualAttention {
/// Create new sparse residual attention
pub fn new(config: SparseResidualConfig, restriction_map: RestrictionMap) -> Self {
Self {
config,
restriction_map,
}
}
/// Create with dimension (creates default restriction map)
pub fn with_dim(config: SparseResidualConfig, dim: usize) -> Self {
let restriction_map = RestrictionMap::new(dim, dim);
Self::new(config, restriction_map)
}
/// Get configuration
pub fn config(&self) -> &SparseResidualConfig {
&self.config
}
/// Get restriction map
pub fn restriction_map(&self) -> &RestrictionMap {
&self.restriction_map
}
/// Compute residual matrix between queries and keys
///
/// R[i,j] = ||rho(q_i) - rho(k_j)||^2
pub fn compute_residual_matrix(
&self,
queries: &[&[f32]],
keys: &[&[f32]],
) -> AttentionResult<Vec<f32>> {
let n_q = queries.len();
let n_k = keys.len();
// Project all queries and keys
let q_proj: Vec<Vec<f32>> = queries
.iter()
.map(|q| self.restriction_map.apply(q))
.collect::<AttentionResult<_>>()?;
let k_proj: Vec<Vec<f32>> = keys
.iter()
.map(|k| self.restriction_map.apply(k))
.collect::<AttentionResult<_>>()?;
// Compute pairwise residuals
let mut residuals = vec![0.0; n_q * n_k];
for i in 0..n_q {
for j in 0..n_k {
let residual: f32 = q_proj[i]
.iter()
.zip(k_proj[j].iter())
.map(|(&q, &k)| (q - k) * (q - k))
.sum();
residuals[i * n_k + j] = residual;
}
}
Ok(residuals)
}
/// Generate sparse mask based on residual thresholding
///
/// Include connections where residual > threshold (incoherent pairs need attention)
pub fn generate_mask(
&self,
queries: &[&[f32]],
keys: &[&[f32]],
) -> AttentionResult<ResidualSparseMask> {
let n_q = queries.len();
let n_k = keys.len();
let residuals = self.compute_residual_matrix(queries, keys)?;
let mut connections = Vec::new();
let mut connection_residuals = Vec::new();
for i in 0..n_q {
let mut query_connections: Vec<(usize, f32)> = Vec::new();
for j in 0..n_k {
let r = residuals[i * n_k + j];
// Include self-attention
if self.config.include_self && i == j && i < n_k {
query_connections.push((j, r));
continue;
}
// Include local window
if let Some(window) = self.config.local_window {
let half_window = window / 2;
if (i as isize - j as isize).unsigned_abs() <= half_window {
query_connections.push((j, r));
continue;
}
}
// Include high-residual pairs (incoherent - need attention)
if r > self.config.residual_threshold {
query_connections.push((j, r));
}
}
// Ensure minimum connections by adding highest-residual pairs if needed
if query_connections.len() < self.config.min_connections {
// Sort all pairs by residual (descending) and take top k
let mut all_pairs: Vec<(usize, f32)> =
(0..n_k).map(|j| (j, residuals[i * n_k + j])).collect();
all_pairs
.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
for (j, r) in all_pairs.into_iter().take(self.config.min_connections) {
if !query_connections.iter().any(|(jj, _)| *jj == j) {
query_connections.push((j, r));
}
}
}
// Enforce max sparsity
let max_connections = ((1.0 - self.config.max_sparsity) * n_k as f32).ceil() as usize;
if query_connections.len() > max_connections {
// Sort by residual (descending) and keep top max_connections
query_connections
.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
query_connections.truncate(max_connections);
}
// Add to global connections
for (j, r) in query_connections {
connections.push((i, j));
connection_residuals.push(r);
}
}
// Sort connections by (i, j) for CSR conversion
let mut paired: Vec<((usize, usize), f32)> =
connections.into_iter().zip(connection_residuals).collect();
paired.sort_by_key(|((i, j), _)| (*i, *j));
let connections: Vec<(usize, usize)> = paired.iter().map(|(c, _)| *c).collect();
let residuals: Vec<f32> = paired.iter().map(|(_, r)| *r).collect();
Ok(ResidualSparseMask::with_residuals(
n_q,
n_k,
connections,
residuals,
))
}
/// Compute sparse attention output
///
/// Only computes attention for connections in the mask
pub fn compute_sparse(
&self,
queries: &[&[f32]],
keys: &[&[f32]],
values: &[&[f32]],
mask: &ResidualSparseMask,
beta: f32,
) -> AttentionResult<Vec<Vec<f32>>> {
if keys.len() != values.len() {
return Err(AttentionError::DimensionMismatch {
expected: keys.len(),
actual: values.len(),
});
}
let n_q = queries.len();
let dim = if values.is_empty() {
0
} else {
values[0].len()
};
let mut outputs = vec![vec![0.0; dim]; n_q];
// Group connections by query
for i in 0..n_q {
let query_conns = mask.query_connections(i);
if query_conns.is_empty() {
continue;
}
// Compute attention weights for this query's connections
let residuals: Vec<f32> = query_conns
.iter()
.map(|&j| self.restriction_map.energy(queries[i], keys[j]))
.collect::<AttentionResult<_>>()?;
// Convert to attention weights: exp(-beta * E) / Z
let logits: Vec<f32> = residuals.iter().map(|&r| -beta * r).collect();
let max_logit = logits.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
let exp_logits: Vec<f32> = logits.iter().map(|&l| (l - max_logit).exp()).collect();
let sum: f32 = exp_logits.iter().sum();
let weights: Vec<f32> = if sum > 1e-10 {
exp_logits.iter().map(|&e| e / sum).collect()
} else {
vec![1.0 / query_conns.len() as f32; query_conns.len()]
};
// Weighted sum of values
for (weight, &j) in weights.iter().zip(query_conns.iter()) {
for (out, &val) in outputs[i].iter_mut().zip(values[j].iter()) {
*out += weight * val;
}
}
}
Ok(outputs)
}
/// Efficient sparse matmul: output = sparse_weights @ values
///
/// Uses CSR format for efficiency
pub fn sparse_matmul(
&self,
row_ptr: &[usize],
col_idx: &[usize],
weights: &[f32],
values: &[&[f32]],
) -> Vec<Vec<f32>> {
let n_queries = row_ptr.len() - 1;
let dim = if values.is_empty() {
0
} else {
values[0].len()
};
let mut outputs = vec![vec![0.0; dim]; n_queries];
for i in 0..n_queries {
let start = row_ptr[i];
let end = row_ptr[i + 1];
for k in start..end {
let j = col_idx[k];
let w = weights[k];
for (out, &val) in outputs[i].iter_mut().zip(values[j].iter()) {
*out += w * val;
}
}
}
outputs
}
}
/// Statistics about sparsity pattern
#[derive(Debug, Clone)]
pub struct SparsityStatistics {
/// Total number of queries
pub n_queries: usize,
/// Total number of keys
pub n_keys: usize,
/// Number of non-zero connections
pub nnz: usize,
/// Sparsity ratio (0 = dense, 1 = maximally sparse)
pub sparsity: f32,
/// Average connections per query
pub avg_connections: f32,
/// Min connections for any query
pub min_connections: usize,
/// Max connections for any query
pub max_connections: usize,
}
impl SparsityStatistics {
/// Compute statistics from mask
pub fn from_mask(mask: &ResidualSparseMask) -> Self {
let n_q = mask.n_queries;
let n_k = mask.n_keys;
let nnz = mask.nnz();
// Count connections per query
let mut per_query = vec![0usize; n_q];
for &(i, _) in &mask.connections {
per_query[i] += 1;
}
let min_conn = per_query.iter().cloned().min().unwrap_or(0);
let max_conn = per_query.iter().cloned().max().unwrap_or(0);
let avg_conn = if n_q > 0 {
nnz as f32 / n_q as f32
} else {
0.0
};
Self {
n_queries: n_q,
n_keys: n_k,
nnz,
sparsity: mask.sparsity,
avg_connections: avg_conn,
min_connections: min_conn,
max_connections: max_conn,
}
}
/// Estimated speedup from sparsity
pub fn estimated_speedup(&self) -> f32 {
if self.sparsity < 1.0 {
1.0 / (1.0 - self.sparsity)
} else {
f32::INFINITY
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_config_default() {
let config = SparseResidualConfig::default();
assert!(config.residual_threshold > 0.0);
assert!(config.max_sparsity > 0.0);
assert!(config.include_self);
}
#[test]
fn test_config_builder() {
let config = SparseResidualConfig::new(0.1)
.with_max_sparsity(0.8)
.with_min_connections(2)
.with_self_attention(false)
.with_local_window(None);
assert_eq!(config.residual_threshold, 0.1);
assert_eq!(config.max_sparsity, 0.8);
assert_eq!(config.min_connections, 2);
assert!(!config.include_self);
assert!(config.local_window.is_none());
}
#[test]
fn test_sparse_mask_creation() {
let connections = vec![(0, 0), (0, 1), (1, 1), (1, 2)];
let mask = ResidualSparseMask::new(2, 3, connections);
assert_eq!(mask.n_queries, 2);
assert_eq!(mask.n_keys, 3);
assert_eq!(mask.nnz(), 4);
assert!((mask.sparsity - (1.0 - 4.0 / 6.0)).abs() < 1e-6);
}
#[test]
fn test_to_dense_mask() {
let connections = vec![(0, 0), (0, 2), (1, 1)];
let mask = ResidualSparseMask::new(2, 3, connections);
let dense = mask.to_dense_mask();
assert_eq!(dense.len(), 6);
assert!(dense[0]); // (0, 0)
assert!(!dense[1]); // (0, 1)
assert!(dense[2]); // (0, 2)
assert!(!dense[3]); // (1, 0)
assert!(dense[4]); // (1, 1)
assert!(!dense[5]); // (1, 2)
}
#[test]
fn test_query_connections() {
let connections = vec![(0, 0), (0, 2), (1, 1), (1, 2)];
let mask = ResidualSparseMask::new(2, 3, connections);
assert_eq!(mask.query_connections(0), vec![0, 2]);
assert_eq!(mask.query_connections(1), vec![1, 2]);
}
#[test]
fn test_to_csr() {
let connections = vec![(0, 0), (0, 2), (1, 1), (1, 2)];
let mask = ResidualSparseMask::new(2, 3, connections);
let (row_ptr, col_idx) = mask.to_csr();
assert_eq!(row_ptr, vec![0, 2, 4]);
assert_eq!(col_idx, vec![0, 2, 1, 2]);
}
#[test]
fn test_generate_mask() {
let config = SparseResidualConfig::default()
.with_local_window(None)
.with_self_attention(false)
.with_min_connections(0);
let rmap = RestrictionMap::identity(4);
let sparse = SparseResidualAttention::new(config, rmap);
// Create queries and keys with varying similarity
let q1 = vec![1.0, 0.0, 0.0, 0.0];
let q2 = vec![0.0, 1.0, 0.0, 0.0];
let k1 = vec![1.0, 0.0, 0.0, 0.0]; // Similar to q1
let k2 = vec![0.0, 0.0, 1.0, 0.0]; // Different from both
let queries: Vec<&[f32]> = vec![&q1, &q2];
let keys: Vec<&[f32]> = vec![&k1, &k2];
let mask = sparse.generate_mask(&queries, &keys).unwrap();
// Should have connections for high-residual pairs
assert!(mask.nnz() > 0);
}
#[test]
fn test_compute_sparse() {
let config = SparseResidualConfig::default();
let rmap = RestrictionMap::identity(4);
let sparse = SparseResidualAttention::new(config, rmap);
let q1 = vec![1.0, 0.0, 0.0, 0.0];
let k1 = vec![1.0, 0.0, 0.0, 0.0];
let k2 = vec![0.0, 1.0, 0.0, 0.0];
let v1 = vec![1.0, 2.0, 3.0, 4.0];
let v2 = vec![5.0, 6.0, 7.0, 8.0];
let queries: Vec<&[f32]> = vec![&q1];
let keys: Vec<&[f32]> = vec![&k1, &k2];
let values: Vec<&[f32]> = vec![&v1, &v2];
let mask = sparse.generate_mask(&queries, &keys).unwrap();
let output = sparse
.compute_sparse(&queries, &keys, &values, &mask, 1.0)
.unwrap();
assert_eq!(output.len(), 1);
assert_eq!(output[0].len(), 4);
}
#[test]
fn test_sparsity_statistics() {
let connections = vec![(0, 0), (0, 1), (1, 0), (1, 1), (1, 2)];
let mask = ResidualSparseMask::new(2, 3, connections);
let stats = SparsityStatistics::from_mask(&mask);
assert_eq!(stats.n_queries, 2);
assert_eq!(stats.n_keys, 3);
assert_eq!(stats.nnz, 5);
assert_eq!(stats.min_connections, 2);
assert_eq!(stats.max_connections, 3);
assert!((stats.avg_connections - 2.5).abs() < 1e-6);
}
#[test]
fn test_sparse_matmul() {
let config = SparseResidualConfig::default();
let rmap = RestrictionMap::identity(2);
let sparse = SparseResidualAttention::new(config, rmap);
// 2x3 sparse matrix with weights
let row_ptr = vec![0, 2, 3];
let col_idx = vec![0, 1, 2];
let weights = vec![0.5, 0.5, 1.0];
let v1 = vec![1.0, 2.0];
let v2 = vec![3.0, 4.0];
let v3 = vec![5.0, 6.0];
let values: Vec<&[f32]> = vec![&v1, &v2, &v3];
let output = sparse.sparse_matmul(&row_ptr, &col_idx, &weights, &values);
assert_eq!(output.len(), 2);
// Row 0: 0.5 * [1,2] + 0.5 * [3,4] = [2, 3]
assert!((output[0][0] - 2.0).abs() < 1e-6);
assert!((output[0][1] - 3.0).abs() < 1e-6);
// Row 1: 1.0 * [5,6] = [5, 6]
assert!((output[1][0] - 5.0).abs() < 1e-6);
assert!((output[1][1] - 6.0).abs() < 1e-6);
}
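    /// Standalone sketch (std-only) of the sparsity bookkeeping shared by
    /// `ResidualSparseMask` and `SparsityStatistics`:
    /// sparsity = 1 - nnz/total, estimated speedup = 1/(1 - sparsity) = total/nnz.
    #[test]
    fn test_sparsity_arithmetic_sketch() {
        let (n_queries, n_keys, nnz) = (2usize, 3usize, 4usize);
        let sparsity = 1.0 - nnz as f32 / (n_queries * n_keys) as f32;
        let speedup = 1.0 / (1.0 - sparsity);
        assert!((sparsity - 1.0 / 3.0).abs() < 1e-6);
        assert!((speedup - 1.5).abs() < 1e-6);
    }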
}


@@ -0,0 +1,227 @@
//! Flash attention - memory-efficient attention with tiled computation
//!
//! Memory: O(block_size) for attention matrix instead of O(n²)
use crate::error::{AttentionError, AttentionResult};
use crate::traits::Attention;
/// Flash attention with block-wise computation
///
/// Computes attention in tiles to minimize memory usage while maintaining numerical stability.
pub struct FlashAttention {
dim: usize,
block_size: usize,
scale: f32,
causal: bool,
}
impl FlashAttention {
/// Create new flash attention
pub fn new(dim: usize, block_size: usize) -> Self {
Self {
dim,
block_size,
scale: 1.0 / (dim as f32).sqrt(),
causal: false,
}
}
/// Create with causal masking
pub fn causal(dim: usize, block_size: usize) -> Self {
Self {
dim,
block_size,
scale: 1.0 / (dim as f32).sqrt(),
causal: true,
}
}
/// Compute attention scores for a block
fn compute_block_scores(&self, query: &[f32], keys: &[&[f32]], start_idx: usize) -> Vec<f32> {
keys.iter()
.enumerate()
.map(|(j, key)| {
                if self.causal && start_idx + j > 0 {
                    // Simplified causal mask: the query is assumed to sit at
                    // sequence position 0, so every key beyond global index 0
                    // is masked. A full implementation would compare each
                    // key's position against the query's actual position.
                    f32::NEG_INFINITY
} else {
query
.iter()
.zip(key.iter())
.map(|(q, k)| q * k)
.sum::<f32>()
* self.scale
}
})
.collect()
}
}
impl Attention for FlashAttention {
fn compute(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
) -> AttentionResult<Vec<f32>> {
if keys.is_empty() {
return Err(AttentionError::InvalidConfig("Empty keys".to_string()));
}
if keys.len() != values.len() {
return Err(AttentionError::DimensionMismatch {
expected: keys.len(),
actual: values.len(),
});
}
if query.len() != self.dim {
return Err(AttentionError::DimensionMismatch {
expected: self.dim,
actual: query.len(),
});
}
let n = keys.len();
let value_dim = values[0].len();
// Online softmax with tiled computation
let mut output = vec![0.0f32; value_dim];
let mut max_so_far = f32::NEG_INFINITY;
let mut sum_exp = 0.0f32;
// Process in blocks
for block_start in (0..n).step_by(self.block_size) {
let block_end = (block_start + self.block_size).min(n);
let block_keys: Vec<&[f32]> = keys[block_start..block_end].to_vec();
// Compute attention scores for this block
let block_scores = self.compute_block_scores(query, &block_keys, block_start);
// Find block maximum
let block_max = block_scores
.iter()
.copied()
.filter(|x| x.is_finite())
.fold(f32::NEG_INFINITY, f32::max);
if !block_max.is_finite() {
continue; // Skip fully masked blocks
}
// New maximum
let new_max = max_so_far.max(block_max);
// Rescale previous accumulations
if max_so_far.is_finite() {
let rescale = (max_so_far - new_max).exp();
sum_exp *= rescale;
output.iter_mut().for_each(|o| *o *= rescale);
}
// Add contribution from this block
for (local_idx, &score) in block_scores.iter().enumerate() {
if score.is_finite() {
let exp_score = (score - new_max).exp();
sum_exp += exp_score;
let global_idx = block_start + local_idx;
for (j, &vj) in values[global_idx].iter().enumerate() {
output[j] += exp_score * vj;
}
}
}
max_so_far = new_max;
}
// Final normalization
if sum_exp > 1e-8 {
output.iter_mut().for_each(|o| *o /= sum_exp);
}
Ok(output)
}
fn compute_with_mask(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
mask: Option<&[bool]>,
) -> AttentionResult<Vec<f32>> {
if let Some(m) = mask {
let filtered: Vec<(usize, bool)> = m
.iter()
.copied()
.enumerate()
.filter(|(_, keep)| *keep)
.collect();
let filtered_keys: Vec<&[f32]> = filtered.iter().map(|(i, _)| keys[*i]).collect();
let filtered_values: Vec<&[f32]> = filtered.iter().map(|(i, _)| values[*i]).collect();
self.compute(query, &filtered_keys, &filtered_values)
} else {
self.compute(query, keys, values)
}
}
fn dim(&self) -> usize {
self.dim
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::attention::ScaledDotProductAttention;
#[test]
fn test_flash_attention() {
let attention = FlashAttention::new(64, 16);
let query = vec![0.5; 64];
let keys: Vec<Vec<f32>> = (0..256).map(|_| vec![0.3; 64]).collect();
let values: Vec<Vec<f32>> = (0..256).map(|_| vec![1.0; 64]).collect();
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
let result = attention.compute(&query, &keys_refs, &values_refs).unwrap();
assert_eq!(result.len(), 64);
}
#[test]
fn test_flash_matches_standard() {
let dim = 32;
let flash = FlashAttention::new(dim, 8);
let standard = ScaledDotProductAttention::new(dim);
let query = vec![0.5; dim];
let keys: Vec<Vec<f32>> = (0..16).map(|i| vec![(i as f32) * 0.1; dim]).collect();
let values: Vec<Vec<f32>> = (0..16).map(|i| vec![(i as f32) * 0.2; dim]).collect();
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
let flash_result = flash.compute(&query, &keys_refs, &values_refs).unwrap();
let standard_result = standard.compute(&query, &keys_refs, &values_refs).unwrap();
// Results should be approximately equal
for (f, s) in flash_result.iter().zip(standard_result.iter()) {
assert!((f - s).abs() < 1e-4, "Flash: {}, Standard: {}", f, s);
}
}
#[test]
fn test_causal_flash() {
let attention = FlashAttention::causal(32, 8);
let query = vec![1.0; 32];
let keys = vec![vec![0.5; 32]; 20];
let values = vec![vec![1.0; 32]; 20];
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
let result = attention.compute(&query, &keys_refs, &values_refs).unwrap();
assert_eq!(result.len(), 32);
}
}
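The block loop in `FlashAttention::compute` is easier to see in isolation. Below is a minimal, self-contained sketch (not the crate's API; `streaming_weighted_sum` is a hypothetical helper name) of the online-softmax recurrence it relies on: each block is folded in under a running maximum, and previously accumulated sums are rescaled whenever the maximum grows, so the result matches a one-pass softmax without materializing all scores at once.

```rust
// Streaming (online) softmax weighted sum over score blocks.
// Illustrative only: score computation is taken as given.
fn streaming_weighted_sum(scores: &[f32], values: &[Vec<f32>], block: usize) -> Vec<f32> {
    let dim = values[0].len();
    let mut max_so_far = f32::NEG_INFINITY;
    let mut sum_exp = 0.0f32;
    let mut out = vec![0.0f32; dim];
    for start in (0..scores.len()).step_by(block) {
        let end = (start + block).min(scores.len());
        let block_max = scores[start..end]
            .iter()
            .copied()
            .fold(f32::NEG_INFINITY, f32::max);
        let new_max = max_so_far.max(block_max);
        // Rescale everything accumulated under the old maximum.
        if max_so_far.is_finite() {
            let rescale = (max_so_far - new_max).exp();
            sum_exp *= rescale;
            out.iter_mut().for_each(|o| *o *= rescale);
        }
        for (i, &s) in scores[start..end].iter().enumerate() {
            let e = (s - new_max).exp();
            sum_exp += e;
            for (o, &v) in out.iter_mut().zip(values[start + i].iter()) {
                *o += e * v;
            }
        }
        max_so_far = new_max;
    }
    out.iter_mut().for_each(|o| *o /= sum_exp);
    out
}

fn main() {
    let scores = vec![0.1f32, 2.0, -1.0, 0.5, 3.0];
    let values: Vec<Vec<f32>> = (0..5).map(|i| vec![i as f32; 2]).collect();
    let blocked = streaming_weighted_sum(&scores, &values, 2);
    // Reference: plain softmax in one pass.
    let m = scores.iter().copied().fold(f32::NEG_INFINITY, f32::max);
    let exps: Vec<f32> = scores.iter().map(|s| (s - m).exp()).collect();
    let z: f32 = exps.iter().sum();
    let direct: f32 = exps.iter().zip(values.iter()).map(|(e, v)| e * v[0]).sum::<f32>() / z;
    assert!((blocked[0] - direct).abs() < 1e-5);
    println!("blocked = {:?}", blocked);
}
```

This is why `test_flash_matches_standard` can compare against `ScaledDotProductAttention` to within 1e-4: the recurrence is algebraically identical to the unblocked softmax.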

View File

@@ -0,0 +1,237 @@
//! Linear attention using random feature approximation (Performer-style)
//!
//! Complexity: O(n * k * d) where k = number of random features
use crate::error::{AttentionError, AttentionResult};
use crate::traits::Attention;
/// Kernel type for linear attention
#[derive(Clone, Debug)]
pub enum KernelType {
/// FAVOR+ softmax approximation
Softmax,
/// ReLU kernel
ReLU,
/// ELU kernel
ELU,
}
/// Linear attention with random feature maps
///
/// Uses the kernel trick to achieve O(n * k * d) complexity instead of O(n² * d).
pub struct LinearAttention {
dim: usize,
num_features: usize,
kernel: KernelType,
/// Random projection matrix [num_features x dim]
random_features: Vec<f32>,
}
impl LinearAttention {
/// Create new linear attention
pub fn new(dim: usize, num_features: usize) -> Self {
Self::with_kernel(dim, num_features, KernelType::Softmax)
}
/// Create with specific kernel type
pub fn with_kernel(dim: usize, num_features: usize, kernel: KernelType) -> Self {
// Initialize random features using Box-Muller for Gaussian
let random_features = Self::generate_random_features(dim, num_features);
Self {
dim,
num_features,
kernel,
random_features,
}
}
fn generate_random_features(dim: usize, num_features: usize) -> Vec<f32> {
use std::f32::consts::PI;
let mut features = Vec::with_capacity(num_features * dim);
let mut seed = 42u64;
for _ in 0..((num_features * dim + 1) / 2) {
// Simple LCG for reproducibility
seed = seed.wrapping_mul(6364136223846793005).wrapping_add(1);
let u1 = (seed as f32) / (u64::MAX as f32);
seed = seed.wrapping_mul(6364136223846793005).wrapping_add(1);
let u2 = (seed as f32) / (u64::MAX as f32);
// Box-Muller transform
let r = (-2.0 * u1.max(1e-10).ln()).sqrt();
let theta = 2.0 * PI * u2;
features.push(r * theta.cos());
if features.len() < num_features * dim {
features.push(r * theta.sin());
}
}
features.truncate(num_features * dim);
// Normalize columns
let scale = 1.0 / (dim as f32).sqrt();
features.iter_mut().for_each(|x| *x *= scale);
features
}
/// Apply feature map to input
fn feature_map(&self, x: &[f32]) -> Vec<f32> {
let mut phi = vec![0.0f32; self.num_features];
for (i, phi_i) in phi.iter_mut().enumerate() {
let projection: f32 = x
.iter()
.enumerate()
.map(|(j, &xj)| xj * self.random_features[i * self.dim + j])
.sum();
*phi_i = match self.kernel {
KernelType::Softmax => {
// FAVOR+: exp(projection - ||x||²/2) / sqrt(num_features)
let norm_sq: f32 = x.iter().map(|xi| xi * xi).sum();
(projection - norm_sq / 2.0).exp() / (self.num_features as f32).sqrt()
}
KernelType::ReLU => projection.max(0.0),
KernelType::ELU => {
if projection >= 0.0 {
projection
} else {
projection.exp() - 1.0
}
}
};
}
phi
}
}
impl Attention for LinearAttention {
fn compute(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
) -> AttentionResult<Vec<f32>> {
if keys.is_empty() {
return Err(AttentionError::InvalidConfig("Empty keys".to_string()));
}
if keys.len() != values.len() {
return Err(AttentionError::DimensionMismatch {
expected: keys.len(),
actual: values.len(),
});
}
if query.len() != self.dim {
return Err(AttentionError::DimensionMismatch {
expected: self.dim,
actual: query.len(),
});
}
// Compute phi(Q)
let phi_q = self.feature_map(query);
// Compute sum_i phi(K_i)^T * V_i and sum_i phi(K_i)
let value_dim = values[0].len();
let mut kv_sum = vec![0.0f32; self.num_features * value_dim]; // [num_features x value_dim]
let mut k_sum = vec![0.0f32; self.num_features];
for (key, value) in keys.iter().zip(values.iter()) {
let phi_k = self.feature_map(key);
// Accumulate phi(K)^T * V (outer product contribution)
for (i, &phi_ki) in phi_k.iter().enumerate() {
for (j, &vj) in value.iter().enumerate() {
kv_sum[i * value_dim + j] += phi_ki * vj;
}
k_sum[i] += phi_ki;
}
}
// Compute output: (phi(Q)^T * KV_sum) / (phi(Q)^T * K_sum)
let mut output = vec![0.0f32; value_dim];
let mut normalizer = 0.0f32;
for (i, &phi_qi) in phi_q.iter().enumerate() {
for (j, out_j) in output.iter_mut().enumerate() {
*out_j += phi_qi * kv_sum[i * value_dim + j];
}
normalizer += phi_qi * k_sum[i];
}
// Normalize
if normalizer.abs() > 1e-8 {
output.iter_mut().for_each(|x| *x /= normalizer);
}
Ok(output)
}
fn compute_with_mask(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
mask: Option<&[bool]>,
) -> AttentionResult<Vec<f32>> {
if let Some(m) = mask {
let filtered: Vec<(usize, bool)> = m
.iter()
.copied()
.enumerate()
.filter(|(_, keep)| *keep)
.collect();
let filtered_keys: Vec<&[f32]> = filtered.iter().map(|(i, _)| keys[*i]).collect();
let filtered_values: Vec<&[f32]> = filtered.iter().map(|(i, _)| values[*i]).collect();
self.compute(query, &filtered_keys, &filtered_values)
} else {
self.compute(query, keys, values)
}
}
fn dim(&self) -> usize {
self.dim
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_linear_attention() {
let attention = LinearAttention::new(64, 32);
let query = vec![0.5; 64];
let keys: Vec<Vec<f32>> = (0..100).map(|_| vec![0.3; 64]).collect();
let values: Vec<Vec<f32>> = (0..100).map(|_| vec![1.0; 64]).collect();
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
let result = attention.compute(&query, &keys_refs, &values_refs).unwrap();
assert_eq!(result.len(), 64);
}
#[test]
fn test_kernel_types() {
for kernel in [KernelType::Softmax, KernelType::ReLU, KernelType::ELU] {
let attention = LinearAttention::with_kernel(32, 16, kernel);
let query = vec![1.0; 32];
let keys = vec![vec![0.5; 32]; 10];
let values = vec![vec![1.0; 32]; 10];
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
let result = attention.compute(&query, &keys_refs, &values_refs).unwrap();
assert_eq!(result.len(), 32);
}
}
}
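The O(n * k * d) cost comes from reassociating the attention product: instead of forming n scores per query, the key/value stream is folded once into phi(K)^T V (a k-by-d matrix) and sum_i phi(K_i) (a k-vector), after which every query costs O(k * d). A minimal standalone sketch with a simple positive elu(x)+1 feature map (an assumption for illustration; the crate's FAVOR+ map is more involved):

```rust
// phi(x) = elu(x) + 1, a simple positive feature map (not the crate's FAVOR+ map).
fn phi(x: &[f32]) -> Vec<f32> {
    x.iter()
        .map(|&v| if v >= 0.0 { v + 1.0 } else { v.exp() })
        .collect()
}

// out = phi(q)^T (sum_i phi(k_i) v_i^T) / (phi(q)^T sum_i phi(k_i))
fn linear_attention(q: &[f32], keys: &[Vec<f32>], values: &[Vec<f32>]) -> Vec<f32> {
    let d = values[0].len();
    let k = q.len(); // feature dim == input dim for this phi
    let mut kv = vec![0.0f32; k * d];
    let mut ksum = vec![0.0f32; k];
    // One pass over keys/values builds the shared summaries.
    for (key, val) in keys.iter().zip(values.iter()) {
        let pk = phi(key);
        for (i, &pki) in pk.iter().enumerate() {
            ksum[i] += pki;
            for (j, &vj) in val.iter().enumerate() {
                kv[i * d + j] += pki * vj;
            }
        }
    }
    // Each query then touches only the k x d summary.
    let pq = phi(q);
    let norm: f32 = pq.iter().zip(ksum.iter()).map(|(a, b)| a * b).sum();
    let mut out = vec![0.0f32; d];
    for (i, &pqi) in pq.iter().enumerate() {
        for j in 0..d {
            out[j] += pqi * kv[i * d + j];
        }
    }
    out.iter_mut().for_each(|o| *o /= norm.max(1e-8));
    out
}

fn main() {
    let q = vec![0.5f32; 4];
    let keys: Vec<Vec<f32>> = (0..8).map(|i| vec![i as f32 * 0.1; 4]).collect();
    let values: Vec<Vec<f32>> = (0..8).map(|i| vec![i as f32; 3]).collect();
    let out = linear_attention(&q, &keys, &values);
    assert_eq!(out.len(), 3);
    println!("{:?}", out);
}
```

Because phi is strictly positive, the weights form a convex combination of the values, just as in the normalized form computed by `LinearAttention::compute` above.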

View File

@@ -0,0 +1,193 @@
//! Local-Global attention for efficient long-range dependencies
//!
//! Complexity: O(n * (w + g)) where w = window size, g = global tokens
use crate::error::{AttentionError, AttentionResult};
use crate::traits::Attention;
use crate::utils::stable_softmax;
/// Local-Global attention mechanism
///
/// Combines local windowed attention with global tokens for O(n*(w+g)) complexity.
pub struct LocalGlobalAttention {
dim: usize,
local_window: usize,
num_global_tokens: usize,
scale: f32,
}
impl LocalGlobalAttention {
/// Create new local-global attention
pub fn new(dim: usize, local_window: usize, num_global_tokens: usize) -> Self {
Self {
dim,
local_window,
num_global_tokens,
scale: 1.0 / (dim as f32).sqrt(),
}
}
/// Compute attention scores for local window
fn compute_local_scores(
&self,
query: &[f32],
keys: &[&[f32]],
position: usize,
) -> Vec<(usize, f32)> {
let n = keys.len();
let half_window = self.local_window / 2;
let start = position.saturating_sub(half_window);
let end = (position + half_window + 1).min(n);
(start..end)
.map(|j| {
let score: f32 = query
.iter()
.zip(keys[j].iter())
.map(|(q, k)| q * k)
.sum::<f32>()
* self.scale;
(j, score)
})
.collect()
}
/// Compute attention scores for global tokens
fn compute_global_scores(&self, query: &[f32], keys: &[&[f32]]) -> Vec<(usize, f32)> {
let num_global = self.num_global_tokens.min(keys.len());
(0..num_global)
.map(|j| {
let score: f32 = query
.iter()
.zip(keys[j].iter())
.map(|(q, k)| q * k)
.sum::<f32>()
* self.scale;
(j, score)
})
.collect()
}
}
impl Attention for LocalGlobalAttention {
fn compute(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
) -> AttentionResult<Vec<f32>> {
if keys.is_empty() {
return Err(AttentionError::InvalidConfig("Empty keys".to_string()));
}
if keys.len() != values.len() {
return Err(AttentionError::DimensionMismatch {
expected: keys.len(),
actual: values.len(),
});
}
if query.len() != self.dim {
return Err(AttentionError::DimensionMismatch {
expected: self.dim,
actual: query.len(),
});
}
// For simplicity, attend from a single representative position (the middle of the sequence)
let position = keys.len() / 2;
// Collect all attended positions and scores
let mut attended: Vec<(usize, f32)> = Vec::new();
// Add global scores
attended.extend(self.compute_global_scores(query, keys));
// Add local scores
for (idx, score) in self.compute_local_scores(query, keys, position) {
if !attended.iter().any(|(i, _)| *i == idx) {
attended.push((idx, score));
}
}
if attended.is_empty() {
return Err(AttentionError::ComputationError(
"No attended positions".to_string(),
));
}
// Softmax over attended positions
let scores: Vec<f32> = attended.iter().map(|(_, s)| *s).collect();
let weights = stable_softmax(&scores);
// Weighted sum of values
let mut output = vec![0.0f32; self.dim];
for ((idx, _), weight) in attended.iter().zip(weights.iter()) {
for (o, v) in output.iter_mut().zip(values[*idx].iter()) {
*o += weight * v;
}
}
Ok(output)
}
fn compute_with_mask(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
mask: Option<&[bool]>,
) -> AttentionResult<Vec<f32>> {
if let Some(m) = mask {
let filtered: Vec<(usize, bool)> = m
.iter()
.copied()
.enumerate()
.filter(|(_, keep)| *keep)
.collect();
let filtered_keys: Vec<&[f32]> = filtered.iter().map(|(i, _)| keys[*i]).collect();
let filtered_values: Vec<&[f32]> = filtered.iter().map(|(i, _)| values[*i]).collect();
self.compute(query, &filtered_keys, &filtered_values)
} else {
self.compute(query, keys, values)
}
}
fn dim(&self) -> usize {
self.dim
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_local_global_attention() {
let attention = LocalGlobalAttention::new(64, 8, 2);
let query = vec![0.5; 64];
let keys: Vec<Vec<f32>> = (0..100).map(|_| vec![0.3; 64]).collect();
let values: Vec<Vec<f32>> = (0..100).map(|i| vec![i as f32; 64]).collect();
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
let result = attention.compute(&query, &keys_refs, &values_refs).unwrap();
assert_eq!(result.len(), 64);
}
#[test]
fn test_small_sequence() {
let attention = LocalGlobalAttention::new(32, 4, 1);
let query = vec![1.0; 32];
let keys = vec![vec![0.5; 32]; 5];
let values = vec![vec![1.0; 32]; 5];
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
let result = attention.compute(&query, &keys_refs, &values_refs).unwrap();
assert_eq!(result.len(), 32);
}
}
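The attended set per query is just the union of the global prefix and the local window, so its size is bounded by w + g regardless of sequence length; that is where the O(n * (w + g)) bound comes from. A standalone sketch of that index arithmetic (mirroring, but not calling, the struct above; `attended_indices` is a hypothetical helper name):

```rust
use std::collections::BTreeSet;

// Indices a query at `pos` attends to: the first `g` global tokens
// plus a window of width `w` centered on `pos`, deduplicated.
fn attended_indices(n: usize, pos: usize, w: usize, g: usize) -> Vec<usize> {
    let half = w / 2;
    let start = pos.saturating_sub(half);
    let end = (pos + half + 1).min(n);
    let mut set: BTreeSet<usize> = (0..g.min(n)).collect();
    set.extend(start..end);
    set.into_iter().collect()
}

fn main() {
    let idx = attended_indices(100, 50, 8, 2);
    // 2 globals (0, 1) + a 9-wide window (46..=54); no overlap here.
    assert_eq!(idx.len(), 11);
    assert!(idx.contains(&0) && idx.contains(&50));
    println!("{:?}", idx);
}
```

The dedup mirrors the `!attended.iter().any(...)` check in `compute`, which prevents a global token inside the local window from being counted twice in the softmax.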

View File

@@ -0,0 +1,207 @@
//! Sparse mask utilities for attention patterns
use std::collections::HashSet;
/// Sparse mask for attention patterns
#[derive(Clone, Debug)]
pub struct AttentionMask {
/// Sparse indices as (row, col) pairs
pub indices: Vec<(usize, usize)>,
/// Shape of the full attention matrix
pub shape: (usize, usize),
/// Set for O(1) lookup
lookup: HashSet<(usize, usize)>,
}
impl AttentionMask {
/// Create a new sparse mask from indices
pub fn new(indices: Vec<(usize, usize)>, shape: (usize, usize)) -> Self {
let lookup: HashSet<_> = indices.iter().copied().collect();
Self {
indices,
shape,
lookup,
}
}
/// Returns true if this position should be attended (i.e., it is not masked out)
#[inline]
pub fn is_attended(&self, row: usize, col: usize) -> bool {
self.lookup.contains(&(row, col))
}
/// Apply mask to attention scores (set non-attended to -inf)
pub fn apply(&self, scores: &mut [f32], seq_len: usize) {
for i in 0..seq_len {
for j in 0..seq_len {
if !self.is_attended(i, j) {
scores[i * seq_len + j] = f32::NEG_INFINITY;
}
}
}
}
/// Create a local window mask
pub fn local_window(n: usize, window_size: usize) -> Self {
let mut indices = Vec::new();
let half_window = window_size / 2;
for i in 0..n {
let start = i.saturating_sub(half_window);
let end = (i + half_window + 1).min(n);
for j in start..end {
indices.push((i, j));
}
}
Self::new(indices, (n, n))
}
/// Create a causal mask (lower triangular)
pub fn causal(n: usize) -> Self {
let mut indices = Vec::new();
for i in 0..n {
for j in 0..=i {
indices.push((i, j));
}
}
Self::new(indices, (n, n))
}
/// Create a strided mask
pub fn strided(n: usize, stride: usize) -> Self {
let mut indices = Vec::new();
for i in 0..n {
for j in (0..n).step_by(stride) {
indices.push((i, j));
}
// Always attend to self
indices.push((i, i));
}
let mut indices: Vec<_> = indices
.into_iter()
.collect::<HashSet<_>>()
.into_iter()
.collect();
indices.sort();
Self::new(indices, (n, n))
}
/// Number of non-zero entries
pub fn nnz(&self) -> usize {
self.indices.len()
}
/// Density: fraction of attended entries (0 = fully masked, 1 = fully dense)
pub fn density(&self) -> f32 {
self.nnz() as f32 / (self.shape.0 * self.shape.1) as f32
}
}
/// Builder for creating sparse masks
pub struct SparseMaskBuilder {
n: usize,
indices: Vec<(usize, usize)>,
}
impl SparseMaskBuilder {
pub fn new(n: usize) -> Self {
Self {
n,
indices: Vec::new(),
}
}
/// Add local window pattern
pub fn with_local_window(mut self, window_size: usize) -> Self {
let half_window = window_size / 2;
for i in 0..self.n {
let start = i.saturating_sub(half_window);
let end = (i + half_window + 1).min(self.n);
for j in start..end {
self.indices.push((i, j));
}
}
self
}
/// Add global tokens (all positions attend to these)
pub fn with_global_tokens(mut self, global_indices: &[usize]) -> Self {
for i in 0..self.n {
for &g in global_indices {
if g < self.n {
self.indices.push((i, g));
self.indices.push((g, i));
}
}
}
self
}
/// Add causal masking
pub fn with_causal(mut self) -> Self {
for i in 0..self.n {
for j in 0..=i {
self.indices.push((i, j));
}
}
self
}
/// Build the mask
pub fn build(self) -> AttentionMask {
let mut indices: Vec<_> = self
.indices
.into_iter()
.collect::<HashSet<_>>()
.into_iter()
.collect();
indices.sort();
AttentionMask::new(indices, (self.n, self.n))
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_local_window_mask() {
let mask = AttentionMask::local_window(10, 3);
// Position 5 should attend to positions 4, 5, 6
assert!(mask.is_attended(5, 4));
assert!(mask.is_attended(5, 5));
assert!(mask.is_attended(5, 6));
// Position 5 should not attend to position 0
assert!(!mask.is_attended(5, 0));
}
#[test]
fn test_causal_mask() {
let mask = AttentionMask::causal(5);
// Lower triangle should be attended
assert!(mask.is_attended(2, 0));
assert!(mask.is_attended(2, 1));
assert!(mask.is_attended(2, 2));
// Upper triangle should not
assert!(!mask.is_attended(2, 3));
assert!(!mask.is_attended(2, 4));
}
#[test]
fn test_builder() {
let mask = SparseMaskBuilder::new(10)
.with_local_window(3)
.with_global_tokens(&[0])
.build();
// All positions should attend to global token 0
for i in 0..10 {
assert!(mask.is_attended(i, 0));
}
}
}
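As a sanity check on `density()`: a causal mask over n positions has n(n+1)/2 attended entries, so its density is (n+1)/(2n) and approaches 1/2 for long sequences. Sketched standalone below (the `AttentionMask` type above computes the same quantity from its index list; these helper names are illustrative):

```rust
// nnz and density of a causal (lower-triangular) mask over n positions.
fn causal_nnz(n: usize) -> usize {
    n * (n + 1) / 2
}

fn causal_density(n: usize) -> f32 {
    causal_nnz(n) as f32 / (n * n) as f32
}

fn main() {
    assert_eq!(causal_nnz(5), 15);
    assert!((causal_density(5) - 0.6).abs() < 1e-6);
    // Approaches 0.5 as n grows.
    assert!((causal_density(1000) - 0.5005).abs() < 1e-6);
    println!("density(1000) = {}", causal_density(1000));
}
```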

View File

@@ -0,0 +1,13 @@
//! Sparse attention mechanisms for efficient computation on long sequences
//!
//! This module provides sparse attention patterns that reduce complexity from O(n²) to sub-quadratic.
pub mod flash;
pub mod linear;
pub mod local_global;
pub mod mask;
pub use flash::FlashAttention;
pub use linear::LinearAttention;
pub use local_global::LocalGlobalAttention;
pub use mask::{AttentionMask, SparseMaskBuilder};

View File

@@ -0,0 +1,327 @@
//! Window Coherence Metrics
//!
//! Fast structural metrics for measuring attention window stability.
//! These are permission signals, not similarity signals.
use serde::{Deserialize, Serialize};
/// Coherence metric type
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum CoherenceMetric {
/// k-NN graph boundary ratio
BoundaryMass,
/// Cut proxy score (edge cut estimate)
CutProxy,
/// Disagreement across neighbor labels
Disagreement,
/// Average neighbor similarity variance
SimilarityVariance,
}
/// Per-window coherence scores
#[derive(Debug, Clone)]
pub struct WindowCoherence {
/// Overall coherence score (0 = fragmented, 1 = coherent)
pub score: f32,
/// Individual metric scores
pub metric_scores: Vec<f32>,
/// Which metrics were used
pub metrics: Vec<CoherenceMetric>,
/// Number of keys in window
pub window_size: usize,
/// Whether this coherence is stale (needs update)
pub is_stale: bool,
/// Token count since last update
pub tokens_since_update: usize,
}
impl WindowCoherence {
/// Compute coherence from keys
pub fn compute(keys: &[&[f32]], k_neighbors: usize, metrics: &[CoherenceMetric]) -> Self {
let n = keys.len();
if n < 2 {
return Self {
score: 1.0,
metric_scores: vec![1.0],
metrics: metrics.to_vec(),
window_size: n,
is_stale: false,
tokens_since_update: 0,
};
}
// Build k-NN graph (fast approximate)
let knn_graph = Self::build_knn_graph(keys, k_neighbors);
// Compute each metric
let metric_scores: Vec<f32> = metrics
.iter()
.map(|m| Self::compute_metric(*m, keys, &knn_graph))
.collect();
// Average scores for overall coherence
let score = metric_scores.iter().sum::<f32>() / metric_scores.len() as f32;
Self {
score,
metric_scores,
metrics: metrics.to_vec(),
window_size: n,
is_stale: false,
tokens_since_update: 0,
}
}
/// Mark as stale (needs recomputation)
pub fn mark_stale(&mut self) {
self.is_stale = true;
}
/// Increment token counter
pub fn tick(&mut self) {
self.tokens_since_update += 1;
}
/// Check if update is needed based on period
pub fn needs_update(&self, update_period: usize) -> bool {
self.is_stale || self.tokens_since_update >= update_period
}
/// Build approximate k-NN graph
/// Returns [N × k] indices of nearest neighbors
fn build_knn_graph(keys: &[&[f32]], k: usize) -> Vec<Vec<usize>> {
let n = keys.len();
let k = k.min(n - 1);
keys.iter()
.enumerate()
.map(|(i, key)| {
let mut distances: Vec<(usize, f32)> = keys
.iter()
.enumerate()
.filter(|(j, _)| *j != i)
.map(|(j, k2)| (j, Self::squared_distance(key, k2)))
.collect();
distances.sort_unstable_by(|a, b| {
a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal)
});
distances.iter().take(k).map(|(j, _)| *j).collect()
})
.collect()
}
/// Squared Euclidean distance
#[inline]
fn squared_distance(a: &[f32], b: &[f32]) -> f32 {
a.iter()
.zip(b.iter())
.map(|(&ai, &bi)| (ai - bi) * (ai - bi))
.sum()
}
/// Compute specific metric
fn compute_metric(metric: CoherenceMetric, keys: &[&[f32]], knn_graph: &[Vec<usize>]) -> f32 {
match metric {
CoherenceMetric::BoundaryMass => Self::boundary_mass(knn_graph),
CoherenceMetric::CutProxy => Self::cut_proxy(knn_graph),
CoherenceMetric::Disagreement => Self::disagreement(keys, knn_graph),
CoherenceMetric::SimilarityVariance => Self::similarity_variance(keys, knn_graph),
}
}
/// Boundary mass: fraction of edges going to "far" neighbors
/// High coherence = most edges go to nearby neighbors
fn boundary_mass(knn_graph: &[Vec<usize>]) -> f32 {
if knn_graph.is_empty() {
return 1.0;
}
let n = knn_graph.len();
let mut internal_edges = 0;
let mut total_edges = 0;
for (i, neighbors) in knn_graph.iter().enumerate() {
for &j in neighbors {
total_edges += 1;
// "Internal" if neighbor is within n/4 positions
if (i as i32 - j as i32).unsigned_abs() as usize <= n / 4 {
internal_edges += 1;
}
}
}
if total_edges == 0 {
return 1.0;
}
internal_edges as f32 / total_edges as f32
}
/// Cut proxy: estimate of graph cut cost
/// High coherence = low cut (well-connected)
fn cut_proxy(knn_graph: &[Vec<usize>]) -> f32 {
if knn_graph.is_empty() {
return 1.0;
}
let n = knn_graph.len();
let half = n / 2;
// Count edges crossing the midpoint
let mut crossing = 0;
let mut total = 0;
for (i, neighbors) in knn_graph.iter().enumerate() {
for &j in neighbors {
total += 1;
if (i < half) != (j < half) {
crossing += 1;
}
}
}
if total == 0 {
return 1.0;
}
// Invert: high coherence = few crossings
1.0 - (crossing as f32 / total as f32)
}
/// Disagreement: variance in neighbor similarities
/// High coherence = neighbors have similar similarities
fn disagreement(keys: &[&[f32]], knn_graph: &[Vec<usize>]) -> f32 {
if knn_graph.is_empty() || keys.is_empty() {
return 1.0;
}
let mut total_variance = 0.0f32;
let mut count = 0;
for (i, neighbors) in knn_graph.iter().enumerate() {
if neighbors.is_empty() {
continue;
}
// Similarities to neighbors
let sims: Vec<f32> = neighbors
.iter()
.map(|&j| Self::cosine_similarity(keys[i], keys[j]))
.collect();
let mean: f32 = sims.iter().sum::<f32>() / sims.len() as f32;
let variance: f32 =
sims.iter().map(|s| (s - mean) * (s - mean)).sum::<f32>() / sims.len() as f32;
total_variance += variance;
count += 1;
}
if count == 0 {
return 1.0;
}
// Low variance = high coherence
let avg_variance = total_variance / count as f32;
1.0 - avg_variance.min(1.0)
}
/// Similarity variance across window
fn similarity_variance(keys: &[&[f32]], knn_graph: &[Vec<usize>]) -> f32 {
if knn_graph.is_empty() || keys.is_empty() {
return 1.0;
}
// Collect all neighbor similarities
let mut all_sims = Vec::new();
for (i, neighbors) in knn_graph.iter().enumerate() {
for &j in neighbors {
all_sims.push(Self::cosine_similarity(keys[i], keys[j]));
}
}
if all_sims.is_empty() {
return 1.0;
}
let mean: f32 = all_sims.iter().sum::<f32>() / all_sims.len() as f32;
let variance: f32 = all_sims
.iter()
.map(|s| (s - mean) * (s - mean))
.sum::<f32>()
/ all_sims.len() as f32;
// Low variance + high mean = high coherence
let coherence = mean * (1.0 - variance.sqrt().min(1.0));
coherence.clamp(0.0, 1.0)
}
/// Cosine similarity
#[inline]
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
let dot: f32 = a.iter().zip(b.iter()).map(|(&ai, &bi)| ai * bi).sum();
let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
if norm_a < 1e-8 || norm_b < 1e-8 {
return 0.0;
}
(dot / (norm_a * norm_b)).clamp(-1.0, 1.0)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_coherence_computation() {
let keys: Vec<Vec<f32>> = (0..20).map(|i| vec![i as f32 * 0.1; 32]).collect();
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let coherence = WindowCoherence::compute(
&keys_refs,
5,
&[
CoherenceMetric::BoundaryMass,
CoherenceMetric::SimilarityVariance,
],
);
assert!(coherence.score >= 0.0 && coherence.score <= 1.0);
assert_eq!(coherence.window_size, 20);
}
#[test]
fn test_coherent_window() {
// Highly similar keys = high coherence
let keys: Vec<Vec<f32>> = (0..10).map(|_| vec![0.5f32; 16]).collect();
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let coherence = WindowCoherence::compute(&keys_refs, 3, &[CoherenceMetric::Disagreement]);
// Should be very coherent
assert!(coherence.score > 0.8);
}
#[test]
fn test_stale_tracking() {
let keys: Vec<Vec<f32>> = vec![vec![1.0; 8]; 5];
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let mut coherence =
WindowCoherence::compute(&keys_refs, 2, &[CoherenceMetric::BoundaryMass]);
assert!(!coherence.needs_update(4));
coherence.tick();
coherence.tick();
coherence.tick();
coherence.tick();
assert!(coherence.needs_update(4));
}
}
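The boundary-mass metric above reduces to an edge count on the k-NN graph: an edge (i, j) is "internal" when |i - j| <= n/4, and the score is the internal fraction. A tiny standalone sketch of just that counting step (graph construction is taken as given):

```rust
// Fraction of k-NN edges whose endpoints are within n/4 positions of each other.
fn boundary_mass(knn: &[Vec<usize>]) -> f32 {
    let n = knn.len();
    let (mut internal, mut total) = (0usize, 0usize);
    for (i, neighbors) in knn.iter().enumerate() {
        for &j in neighbors {
            total += 1;
            if i.abs_diff(j) <= n / 4 {
                internal += 1;
            }
        }
    }
    // An empty graph is treated as perfectly coherent, as in the crate.
    if total == 0 {
        1.0
    } else {
        internal as f32 / total as f32
    }
}

fn main() {
    // 8 nodes; node 0 links near (1) and far (7): one internal edge, one boundary edge.
    let mut knn = vec![Vec::new(); 8];
    knn[0] = vec![1, 7];
    assert!((boundary_mass(&knn) - 0.5).abs() < 1e-6);
    println!("{}", boundary_mass(&knn));
}
```

Positional locality of neighbors is what makes this a permission signal rather than a similarity signal: keys can be pairwise similar and still score low if their nearest neighbors are scattered across the window.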

View File

@@ -0,0 +1,429 @@
//! Topology Gated Attention
//!
//! Main attention mechanism that uses topological coherence as a permission signal.
use super::coherence::{CoherenceMetric, WindowCoherence};
use super::policy::{AttentionMode, AttentionPolicy, PolicyConfig};
use crate::error::{AttentionError, AttentionResult};
use crate::traits::Attention;
use serde::{Deserialize, Serialize};
/// Configuration for topology-gated attention
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TopologyGatedConfig {
/// Model dimension
pub dim: usize,
/// Number of neighbors for coherence graph
pub k_neighbors: usize,
/// Coherence metrics to use
pub metrics: Vec<CoherenceMetric>,
/// Policy configuration
pub policy: PolicyConfig,
/// Temperature for softmax
pub temperature: f32,
/// Base attention width
pub base_width: usize,
}
impl Default for TopologyGatedConfig {
fn default() -> Self {
Self {
dim: 512,
k_neighbors: 8,
metrics: vec![
CoherenceMetric::BoundaryMass,
CoherenceMetric::SimilarityVariance,
],
policy: PolicyConfig::default(),
temperature: 1.0,
base_width: 64,
}
}
}
/// Topology Gated Attention
///
/// Uses structural coherence to control attention behavior:
/// - Stable mode: full attention, normal updates
/// - Cautious mode: reduced width, increased sparsity
/// - Freeze mode: retrieval only, no updates
#[derive(Debug, Clone)]
pub struct TopologyGatedAttention {
config: TopologyGatedConfig,
policy: AttentionPolicy,
cached_coherence: Option<WindowCoherence>,
}
impl TopologyGatedAttention {
/// Create new topology-gated attention
pub fn new(config: TopologyGatedConfig) -> Self {
let policy = AttentionPolicy::new(config.policy.clone());
Self {
config,
policy,
cached_coherence: None,
}
}
/// Create with dimension
pub fn with_dim(dim: usize) -> Self {
Self::new(TopologyGatedConfig {
dim,
..Default::default()
})
}
/// Update coherence from keys (call periodically, not every token)
pub fn update_coherence(&mut self, keys: &[&[f32]]) {
let coherence =
WindowCoherence::compute(keys, self.config.k_neighbors, &self.config.metrics);
self.policy.determine_mode(coherence.score);
self.cached_coherence = Some(coherence);
}
/// Get current mode
pub fn current_mode(&self) -> AttentionMode {
self.policy.current_mode()
}
/// Check if coherence update is needed
pub fn needs_coherence_update(&self) -> bool {
match &self.cached_coherence {
Some(c) => c.needs_update(self.config.policy.update_period),
None => true,
}
}
/// Tick coherence counter
pub fn tick_coherence(&mut self) {
if let Some(ref mut c) = self.cached_coherence {
c.tick();
}
}
/// Get effective attention width
pub fn get_attention_width(&self) -> usize {
self.policy.get_attention_width(self.config.base_width)
}
/// Check if updates are allowed
pub fn allows_updates(&self) -> bool {
self.policy.allows_updates()
}
/// Compute gated attention
pub fn compute_gated(
&mut self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
) -> AttentionResult<Vec<f32>> {
// Update coherence if needed
if self.needs_coherence_update() {
self.update_coherence(keys);
} else {
self.tick_coherence();
}
match self.current_mode() {
AttentionMode::Stable => {
// Full attention
self.full_attention(query, keys, values)
}
AttentionMode::Cautious => {
// Sparse attention with reduced width
let width = self.get_attention_width();
self.sparse_attention(query, keys, values, width)
}
AttentionMode::Freeze => {
// Retrieval only: return the value of the single best-matching key
// (no softmax mixing and no state updates)
self.retrieval_only(query, keys, values)
}
}
}
/// Full attention (stable mode)
fn full_attention(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
) -> AttentionResult<Vec<f32>> {
if keys.is_empty() {
return Err(AttentionError::InvalidConfig("No keys".into()));
}
// Standard scaled dot-product attention
let scale = 1.0 / (self.config.dim as f32).sqrt();
let logits: Vec<f32> = keys
.iter()
.map(|k| Self::dot_product_simd(query, k) * scale / self.config.temperature)
.collect();
let weights = Self::stable_softmax(&logits);
self.weighted_sum(&weights, values)
}
/// Sparse attention with limited width (cautious mode)
fn sparse_attention(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
width: usize,
) -> AttentionResult<Vec<f32>> {
if keys.is_empty() {
return Err(AttentionError::InvalidConfig("No keys".into()));
}
let width = width.min(keys.len());
// Get top-k keys by dot product
let scale = 1.0 / (self.config.dim as f32).sqrt();
let mut scores: Vec<(usize, f32)> = keys
.iter()
.enumerate()
.map(|(i, k)| (i, Self::dot_product_simd(query, k) * scale))
.collect();
scores.sort_unstable_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
// Take top-k
let top_k: Vec<(usize, f32)> = scores.into_iter().take(width).collect();
// Compute attention over selected keys
let logits: Vec<f32> = top_k
.iter()
.map(|(_, s)| s / self.config.temperature)
.collect();
let weights = Self::stable_softmax(&logits);
// Weighted sum of selected values
let selected_values: Vec<&[f32]> = top_k.iter().map(|(i, _)| values[*i]).collect();
self.weighted_sum(&weights, &selected_values)
}
/// Retrieval-only mode (freeze mode)
fn retrieval_only(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
) -> AttentionResult<Vec<f32>> {
if keys.is_empty() {
return Err(AttentionError::InvalidConfig("No keys".into()));
}
// Find single best match and return its value
// This is ultra-sparse: only 1 key contributes
let scale = 1.0 / (self.config.dim as f32).sqrt();
let best_idx = keys
.iter()
.enumerate()
.map(|(i, k)| (i, Self::dot_product_simd(query, k) * scale))
.max_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal))
.map(|(i, _)| i)
.unwrap_or(0);
Ok(values[best_idx].to_vec())
}
/// SIMD-friendly dot product
#[inline(always)]
fn dot_product_simd(a: &[f32], b: &[f32]) -> f32 {
let len = a.len().min(b.len());
let chunks = len / 4;
let remainder = len % 4;
let mut sum0 = 0.0f32;
let mut sum1 = 0.0f32;
let mut sum2 = 0.0f32;
let mut sum3 = 0.0f32;
for i in 0..chunks {
let base = i * 4;
sum0 += a[base] * b[base];
sum1 += a[base + 1] * b[base + 1];
sum2 += a[base + 2] * b[base + 2];
sum3 += a[base + 3] * b[base + 3];
}
let base = chunks * 4;
for i in 0..remainder {
sum0 += a[base + i] * b[base + i];
}
sum0 + sum1 + sum2 + sum3
}
/// Stable softmax
fn stable_softmax(logits: &[f32]) -> Vec<f32> {
if logits.is_empty() {
return vec![];
}
let max_logit = logits.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
let exp_logits: Vec<f32> = logits.iter().map(|&l| (l - max_logit).exp()).collect();
let sum: f32 = exp_logits.iter().sum();
exp_logits.iter().map(|&e| e / sum).collect()
}
/// Weighted sum
fn weighted_sum(&self, weights: &[f32], values: &[&[f32]]) -> AttentionResult<Vec<f32>> {
if weights.is_empty() || values.is_empty() {
return Err(AttentionError::InvalidConfig("Empty inputs".into()));
}
let dim = values[0].len();
let mut output = vec![0.0f32; dim];
for (weight, value) in weights.iter().zip(values.iter()) {
for (o, &v) in output.iter_mut().zip(value.iter()) {
*o += weight * v;
}
}
Ok(output)
}
}
impl Attention for TopologyGatedAttention {
fn compute(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
) -> AttentionResult<Vec<f32>> {
// Trait method takes &self: clone so the stateful gated path can run (mode/coherence updates are discarded)
let mut att = self.clone();
att.compute_gated(query, keys, values)
}
fn compute_with_mask(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
mask: Option<&[bool]>,
) -> AttentionResult<Vec<f32>> {
if let Some(m) = mask {
let filtered: Vec<(&[f32], &[f32])> = keys
.iter()
.zip(values.iter())
.enumerate()
.filter(|(i, _)| m.get(*i).copied().unwrap_or(true))
.map(|(_, (k, v))| (*k, *v))
.collect();
let filtered_keys: Vec<&[f32]> = filtered.iter().map(|(k, _)| *k).collect();
let filtered_values: Vec<&[f32]> = filtered.iter().map(|(_, v)| *v).collect();
self.compute(query, &filtered_keys, &filtered_values)
} else {
self.compute(query, keys, values)
}
}
fn dim(&self) -> usize {
self.config.dim
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_topology_gated_attention() {
let mut attention = TopologyGatedAttention::with_dim(32);
let query = vec![0.5f32; 32];
let keys: Vec<Vec<f32>> = (0..20).map(|i| vec![0.1 + i as f32 * 0.02; 32]).collect();
let values: Vec<Vec<f32>> = (0..20).map(|i| vec![i as f32; 32]).collect();
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
let output = attention
.compute_gated(&query, &keys_refs, &values_refs)
.unwrap();
assert_eq!(output.len(), 32);
}
#[test]
fn test_mode_affects_output() {
let config = TopologyGatedConfig {
dim: 16,
base_width: 32,
policy: PolicyConfig {
stable_threshold: 0.9, // Very high threshold
freeze_threshold: 0.8,
..Default::default()
},
..Default::default()
};
let mut attention = TopologyGatedAttention::new(config);
// Create diverse keys (low coherence)
let keys: Vec<Vec<f32>> = (0..10)
.map(|i| {
let mut v = vec![0.0f32; 16];
v[i % 16] = 1.0;
v
})
.collect();
let values: Vec<Vec<f32>> = (0..10).map(|i| vec![i as f32; 16]).collect();
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
attention.update_coherence(&keys_refs);
// With diverse keys, should trigger freeze mode
let query = vec![0.5f32; 16];
let _output = attention
.compute_gated(&query, &keys_refs, &values_refs)
.unwrap();
// Mode should be freeze or cautious due to low coherence
let mode = attention.current_mode();
assert!(mode == AttentionMode::Freeze || mode == AttentionMode::Cautious);
}
#[test]
fn test_coherence_update_period() {
let config = TopologyGatedConfig {
dim: 16,
policy: PolicyConfig {
update_period: 4,
..Default::default()
},
..Default::default()
};
let mut attention = TopologyGatedAttention::new(config);
// No coherence yet
assert!(attention.needs_coherence_update());
let keys: Vec<Vec<f32>> = vec![vec![1.0; 16]; 5];
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
attention.update_coherence(&keys_refs);
assert!(!attention.needs_coherence_update());
// Tick 4 times
for _ in 0..4 {
attention.tick_coherence();
}
assert!(attention.needs_coherence_update());
}
}
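The attention path above hinges on a numerically stable softmax. A standalone sketch (the name `stable_softmax` mirrors the private helper, but this copy is self-contained and not the crate's API):

```rust
// Standalone sketch of the stable softmax used by `compute_gated` above.
fn stable_softmax(logits: &[f32]) -> Vec<f32> {
    if logits.is_empty() {
        return vec![];
    }
    // Subtracting the max keeps every exponent <= 0, so exp() cannot overflow
    let max = logits.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
    let exps: Vec<f32> = logits.iter().map(|&l| (l - max).exp()).collect();
    let sum: f32 = exps.iter().sum();
    exps.iter().map(|&e| e / sum).collect()
}

fn main() {
    // A naive softmax would overflow on logits of 1000; this stays finite
    let w = stable_softmax(&[1000.0, 1000.0]);
    assert!(w.iter().all(|p| p.is_finite()));
    assert!((w.iter().sum::<f32>() - 1.0).abs() < 1e-6);
}
```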

View File

@@ -0,0 +1,32 @@
//! Topology Gated Attention
//!
//! Uses topological structure as a permission signal for attention.
//!
//! ## Key Concepts
//!
//! 1. **Window-Level Coherence**: Compute one coherence score per window, reuse for all queries
//! 2. **Fast Graph Primitives**: Use k-NN lists instead of full graph construction
//! 3. **3-Mode Policy**: stable/cautious/freeze based on coherence
//! 4. **Amortized Updates**: Update coherence every T tokens, not every token
//!
//! ## Modes
//!
//! - **Stable**: Full attention, normal updates
//! - **Cautious**: Reduced attention width, increased sparsity
//! - **Freeze**: Retrieval only, no updates, no writes
mod coherence;
mod gated_attention;
mod policy;
pub use coherence::{CoherenceMetric, WindowCoherence};
pub use gated_attention::{TopologyGatedAttention, TopologyGatedConfig};
pub use policy::{AttentionMode, AttentionPolicy, PolicyConfig};
#[cfg(test)]
mod tests {
#[test]
fn test_module_exists() {
assert!(true);
}
}

View File

@@ -0,0 +1,259 @@
//! Attention Control Policy
//!
//! 3-mode policy for controlling attention based on coherence:
//! - Stable: full attention width
//! - Cautious: reduced width, increased sparsity
//! - Freeze: retrieval only, no updates
use serde::{Deserialize, Serialize};
/// Attention operating mode
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum AttentionMode {
/// Full attention, normal updates
Stable,
/// Reduced attention width, increased sparsity
Cautious,
/// Retrieval only, no updates, no writes
Freeze,
}
/// Policy configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PolicyConfig {
/// Coherence threshold for stable mode (above this = stable)
pub stable_threshold: f32,
/// Coherence threshold for freeze mode (below this = freeze)
pub freeze_threshold: f32,
/// Attention width multiplier in cautious mode (0.5 = half width)
pub cautious_width_factor: f32,
/// Sparsity increase in cautious mode (2.0 = twice as sparse)
pub cautious_sparsity_factor: f32,
/// How many tokens between coherence updates
pub update_period: usize,
/// Hysteresis factor to prevent mode oscillation
pub hysteresis: f32,
}
impl Default for PolicyConfig {
fn default() -> Self {
Self {
stable_threshold: 0.7,
freeze_threshold: 0.3,
cautious_width_factor: 0.5,
cautious_sparsity_factor: 2.0,
update_period: 4,
hysteresis: 0.05,
}
}
}
/// Attention control policy
#[derive(Debug, Clone)]
pub struct AttentionPolicy {
config: PolicyConfig,
current_mode: AttentionMode,
mode_history: Vec<AttentionMode>,
}
impl AttentionPolicy {
/// Create new policy
pub fn new(config: PolicyConfig) -> Self {
Self {
config,
current_mode: AttentionMode::Stable,
mode_history: Vec::new(),
}
}
/// Determine mode from coherence score
pub fn determine_mode(&mut self, coherence: f32) -> AttentionMode {
let new_mode = self.compute_mode(coherence);
// Apply hysteresis to prevent oscillation
let mode = self.apply_hysteresis(new_mode, coherence);
// Record history
self.mode_history.push(mode);
if self.mode_history.len() > 16 {
self.mode_history.remove(0);
}
self.current_mode = mode;
mode
}
/// Compute mode without hysteresis
fn compute_mode(&self, coherence: f32) -> AttentionMode {
if coherence >= self.config.stable_threshold {
AttentionMode::Stable
} else if coherence <= self.config.freeze_threshold {
AttentionMode::Freeze
} else {
AttentionMode::Cautious
}
}
/// Apply hysteresis to mode transitions
fn apply_hysteresis(&self, new_mode: AttentionMode, coherence: f32) -> AttentionMode {
let h = self.config.hysteresis;
match (self.current_mode, new_mode) {
// Stable -> Cautious: require coherence to drop below threshold - hysteresis
(AttentionMode::Stable, AttentionMode::Cautious) => {
if coherence < self.config.stable_threshold - h {
AttentionMode::Cautious
} else {
AttentionMode::Stable
}
}
// Cautious -> Stable: require coherence to rise above threshold + hysteresis
(AttentionMode::Cautious, AttentionMode::Stable) => {
if coherence > self.config.stable_threshold + h {
AttentionMode::Stable
} else {
AttentionMode::Cautious
}
}
// Cautious -> Freeze: require coherence to drop below threshold - hysteresis
(AttentionMode::Cautious, AttentionMode::Freeze) => {
if coherence < self.config.freeze_threshold - h {
AttentionMode::Freeze
} else {
AttentionMode::Cautious
}
}
// Freeze -> Cautious: require coherence to rise above threshold + hysteresis
(AttentionMode::Freeze, AttentionMode::Cautious) => {
if coherence > self.config.freeze_threshold + h {
AttentionMode::Cautious
} else {
AttentionMode::Freeze
}
}
// Same mode or big jump (Stable <-> Freeze): accept new mode
_ => new_mode,
}
}
/// Get current mode
pub fn current_mode(&self) -> AttentionMode {
self.current_mode
}
/// Get attention width for current mode
pub fn get_attention_width(&self, base_width: usize) -> usize {
match self.current_mode {
AttentionMode::Stable => base_width,
AttentionMode::Cautious => {
((base_width as f32 * self.config.cautious_width_factor) as usize).max(1)
}
AttentionMode::Freeze => 0, // No attention updates
}
}
/// Get sparsity factor for current mode
pub fn get_sparsity_factor(&self) -> f32 {
match self.current_mode {
AttentionMode::Stable => 1.0,
AttentionMode::Cautious => self.config.cautious_sparsity_factor,
AttentionMode::Freeze => f32::INFINITY, // Maximum sparsity
}
}
/// Check if updates are allowed
pub fn allows_updates(&self) -> bool {
self.current_mode != AttentionMode::Freeze
}
/// Check if writes are allowed
pub fn allows_writes(&self) -> bool {
self.current_mode != AttentionMode::Freeze
}
/// Get mode stability (how often mode has been same recently)
pub fn mode_stability(&self) -> f32 {
if self.mode_history.is_empty() {
return 1.0;
}
let current = self.current_mode;
let matches = self.mode_history.iter().filter(|&&m| m == current).count();
matches as f32 / self.mode_history.len() as f32
}
/// Reset to stable mode
pub fn reset(&mut self) {
self.current_mode = AttentionMode::Stable;
self.mode_history.clear();
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_policy_modes() {
let mut policy = AttentionPolicy::new(PolicyConfig::default());
// High coherence = stable
assert_eq!(policy.determine_mode(0.9), AttentionMode::Stable);
// Medium coherence = cautious
assert_eq!(policy.determine_mode(0.5), AttentionMode::Cautious);
// Low coherence = freeze
assert_eq!(policy.determine_mode(0.1), AttentionMode::Freeze);
}
#[test]
fn test_attention_width() {
let mut policy = AttentionPolicy::new(PolicyConfig::default());
policy.determine_mode(0.9);
assert_eq!(policy.get_attention_width(100), 100);
policy.determine_mode(0.5);
assert_eq!(policy.get_attention_width(100), 50);
policy.determine_mode(0.1);
assert_eq!(policy.get_attention_width(100), 0);
}
#[test]
fn test_hysteresis() {
let mut policy = AttentionPolicy::new(PolicyConfig {
stable_threshold: 0.7,
freeze_threshold: 0.3,
hysteresis: 0.1,
..Default::default()
});
// Start stable
policy.determine_mode(0.8);
assert_eq!(policy.current_mode(), AttentionMode::Stable);
// Drop to 0.65 (below 0.7 but above 0.7 - 0.1 = 0.6)
policy.determine_mode(0.65);
// Should stay stable due to hysteresis
assert_eq!(policy.current_mode(), AttentionMode::Stable);
// Drop to 0.55 (below 0.6)
policy.determine_mode(0.55);
assert_eq!(policy.current_mode(), AttentionMode::Cautious);
}
#[test]
fn test_update_permissions() {
let mut policy = AttentionPolicy::new(PolicyConfig::default());
policy.determine_mode(0.8);
assert!(policy.allows_updates());
assert!(policy.allows_writes());
policy.determine_mode(0.1);
assert!(!policy.allows_updates());
assert!(!policy.allows_writes());
}
}
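The hysteresis rule in `apply_hysteresis` can be sketched standalone; `Mode` and `next_mode` are illustrative names, with thresholds fixed to the `PolicyConfig::default()` values (stable 0.7, freeze 0.3, hysteresis 0.05):

```rust
// Standalone sketch of 3-mode thresholding with hysteresis, mirroring
// `AttentionPolicy::apply_hysteresis` (names and constants are illustrative).
#[derive(Debug, Clone, Copy, PartialEq)]
enum Mode {
    Stable,
    Cautious,
    Freeze,
}

fn next_mode(current: Mode, coherence: f32) -> Mode {
    let (stable_t, freeze_t, h) = (0.7f32, 0.3f32, 0.05f32);
    // Raw thresholding without hysteresis
    let raw = if coherence >= stable_t {
        Mode::Stable
    } else if coherence <= freeze_t {
        Mode::Freeze
    } else {
        Mode::Cautious
    };
    // Adjacent transitions only fire once coherence clears the band of width h
    match (current, raw) {
        (Mode::Stable, Mode::Cautious) if coherence >= stable_t - h => Mode::Stable,
        (Mode::Cautious, Mode::Stable) if coherence <= stable_t + h => Mode::Cautious,
        (Mode::Cautious, Mode::Freeze) if coherence >= freeze_t - h => Mode::Cautious,
        (Mode::Freeze, Mode::Cautious) if coherence <= freeze_t + h => Mode::Freeze,
        _ => raw,
    }
}

fn main() {
    // 0.68 sits inside the band just below 0.7, so the mode holds at Stable
    assert_eq!(next_mode(Mode::Stable, 0.68), Mode::Stable);
    // 0.60 clears the band (0.7 - 0.05), so the transition fires
    assert_eq!(next_mode(Mode::Stable, 0.60), Mode::Cautious);
}
```

This is why the `test_hysteresis` case above stays Stable at 0.65 but drops to Cautious at 0.55 with a wider band of 0.1.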

View File

@@ -0,0 +1,356 @@
//! Curriculum learning for attention training
//!
//! Provides schedulers for progressive training difficulty.
/// Decay type for temperature/parameter annealing
#[derive(Clone, Copy, Debug, Default, PartialEq)]
pub enum DecayType {
#[default]
Linear,
Exponential,
Cosine,
Step,
}
/// Curriculum learning stage
#[derive(Clone, Debug)]
pub struct CurriculumStage {
pub name: String,
pub difficulty: f32, // 0.0 = easy, 1.0 = hard
pub duration: usize, // Steps in this stage
pub temperature: f32, // Softmax temperature
pub negative_count: usize, // Number of negatives
}
impl CurriculumStage {
pub fn new(name: &str) -> Self {
Self {
name: name.to_string(),
difficulty: 0.5,
duration: 1000,
temperature: 1.0,
negative_count: 10,
}
}
pub fn difficulty(mut self, d: f32) -> Self {
self.difficulty = d.clamp(0.0, 1.0);
self
}
pub fn duration(mut self, d: usize) -> Self {
self.duration = d;
self
}
pub fn temperature(mut self, t: f32) -> Self {
self.temperature = t.max(0.01);
self
}
pub fn negative_count(mut self, n: usize) -> Self {
self.negative_count = n.max(1);
self
}
}
/// Curriculum scheduler for progressive training
pub struct CurriculumScheduler {
stages: Vec<CurriculumStage>,
current_stage: usize,
steps_in_stage: usize,
total_steps: usize,
}
impl CurriculumScheduler {
pub fn new() -> Self {
Self {
stages: Vec::new(),
current_stage: 0,
steps_in_stage: 0,
total_steps: 0,
}
}
/// Add a stage to the curriculum
pub fn add_stage(mut self, stage: CurriculumStage) -> Self {
self.stages.push(stage);
self
}
/// Build a default easy-to-hard curriculum
pub fn default_curriculum(total_steps: usize) -> Self {
let stage_duration = total_steps / 4;
Self::new()
.add_stage(
CurriculumStage::new("warm_up")
.difficulty(0.1)
.duration(stage_duration)
.temperature(2.0)
.negative_count(5),
)
.add_stage(
CurriculumStage::new("easy")
.difficulty(0.3)
.duration(stage_duration)
.temperature(1.0)
.negative_count(10),
)
.add_stage(
CurriculumStage::new("medium")
.difficulty(0.6)
.duration(stage_duration)
.temperature(0.5)
.negative_count(20),
)
.add_stage(
CurriculumStage::new("hard")
.difficulty(1.0)
.duration(stage_duration)
.temperature(0.1)
.negative_count(50),
)
}
/// Get current stage
pub fn current_stage(&self) -> Option<&CurriculumStage> {
self.stages.get(self.current_stage)
}
/// Advance one step and return current stage
pub fn step(&mut self) -> Option<&CurriculumStage> {
if self.stages.is_empty() {
return None;
}
self.steps_in_stage += 1;
self.total_steps += 1;
// Check if we should advance to next stage
if let Some(stage) = self.stages.get(self.current_stage) {
if self.steps_in_stage >= stage.duration && self.current_stage < self.stages.len() - 1 {
self.current_stage += 1;
self.steps_in_stage = 0;
}
}
self.current_stage()
}
/// Get current difficulty (0.0 to 1.0)
pub fn difficulty(&self) -> f32 {
self.current_stage().map(|s| s.difficulty).unwrap_or(1.0)
}
/// Get current temperature
pub fn temperature(&self) -> f32 {
self.current_stage().map(|s| s.temperature).unwrap_or(1.0)
}
/// Get current negative count
pub fn negative_count(&self) -> usize {
self.current_stage().map(|s| s.negative_count).unwrap_or(10)
}
/// Check if training is complete
pub fn is_complete(&self) -> bool {
if self.stages.is_empty() {
return true;
}
self.current_stage >= self.stages.len() - 1
&& self.steps_in_stage >= self.stages.last().map(|s| s.duration).unwrap_or(0)
}
/// Get progress (0.0 to 1.0)
pub fn progress(&self) -> f32 {
let total_duration: usize = self.stages.iter().map(|s| s.duration).sum();
if total_duration == 0 {
return 1.0;
}
self.total_steps as f32 / total_duration as f32
}
/// Reset curriculum
pub fn reset(&mut self) {
self.current_stage = 0;
self.steps_in_stage = 0;
self.total_steps = 0;
}
}
impl Default for CurriculumScheduler {
fn default() -> Self {
Self::new()
}
}
/// Temperature annealing scheduler
pub struct TemperatureAnnealing {
initial_temp: f32,
final_temp: f32,
total_steps: usize,
current_step: usize,
decay_type: DecayType,
step_size: usize, // For step decay
}
impl TemperatureAnnealing {
    pub fn new(initial_temp: f32, final_temp: f32, steps: usize) -> Self {
        Self {
            initial_temp,
            final_temp,
            total_steps: steps,
            current_step: 0,
            decay_type: DecayType::Linear,
            step_size: steps / 10,
        }
    }
}
pub fn with_decay(mut self, decay: DecayType) -> Self {
self.decay_type = decay;
self
}
pub fn with_step_size(mut self, size: usize) -> Self {
self.step_size = size;
self
}
/// Get current temperature and advance
pub fn step(&mut self) -> f32 {
let temp = self.get_temp();
self.current_step += 1;
temp
}
/// Get current temperature without advancing
pub fn get_temp(&self) -> f32 {
if self.current_step >= self.total_steps {
return self.final_temp;
}
let progress = self.current_step as f32 / self.total_steps as f32;
let range = self.initial_temp - self.final_temp;
match self.decay_type {
DecayType::Linear => self.initial_temp - range * progress,
DecayType::Exponential => {
let decay_rate =
(self.final_temp / self.initial_temp).ln() / self.total_steps as f32;
self.initial_temp * (decay_rate * self.current_step as f32).exp()
}
DecayType::Cosine => {
self.final_temp + 0.5 * range * (1.0 + (std::f32::consts::PI * progress).cos())
}
DecayType::Step => {
let num_steps = self.current_step / self.step_size.max(1);
let step_decay =
range * num_steps as f32 / (self.total_steps / self.step_size.max(1)) as f32;
(self.initial_temp - step_decay).max(self.final_temp)
}
}
}
/// Reset annealing
pub fn reset(&mut self) {
self.current_step = 0;
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_curriculum_stages() {
let mut curriculum = CurriculumScheduler::new()
.add_stage(CurriculumStage::new("easy").duration(10).difficulty(0.2))
.add_stage(CurriculumStage::new("hard").duration(10).difficulty(0.8));
assert_eq!(curriculum.current_stage().unwrap().name, "easy");
assert!((curriculum.difficulty() - 0.2).abs() < 1e-5);
// Progress through first stage
for _ in 0..10 {
curriculum.step();
}
assert_eq!(curriculum.current_stage().unwrap().name, "hard");
assert!((curriculum.difficulty() - 0.8).abs() < 1e-5);
}
#[test]
fn test_default_curriculum() {
let mut curriculum = CurriculumScheduler::default_curriculum(400);
assert_eq!(curriculum.stages.len(), 4);
assert_eq!(curriculum.current_stage().unwrap().name, "warm_up");
// Progress to end
for _ in 0..400 {
curriculum.step();
}
assert!(curriculum.is_complete());
}
#[test]
fn test_temperature_linear() {
let mut annealing = TemperatureAnnealing::new(1.0, 0.1, 100);
let temp_start = annealing.step();
assert!((temp_start - 1.0).abs() < 0.1);
for _ in 0..99 {
annealing.step();
}
let temp_end = annealing.get_temp();
assert!((temp_end - 0.1).abs() < 0.1);
}
#[test]
fn test_temperature_cosine() {
let mut annealing = TemperatureAnnealing::new(1.0, 0.0, 100).with_decay(DecayType::Cosine);
// Halfway should be approximately middle value
for _ in 0..50 {
annealing.step();
}
let temp_mid = annealing.get_temp();
assert!(temp_mid > 0.4 && temp_mid < 0.6);
}
#[test]
fn test_temperature_step() {
let mut annealing = TemperatureAnnealing::new(1.0, 0.0, 100)
.with_decay(DecayType::Step)
.with_step_size(25);
let temp_0 = annealing.get_temp();
for _ in 0..25 {
annealing.step();
}
let temp_25 = annealing.get_temp();
// Should have dropped
assert!(temp_25 < temp_0);
}
#[test]
fn test_curriculum_progress() {
let mut curriculum = CurriculumScheduler::new()
.add_stage(CurriculumStage::new("stage1").duration(50))
.add_stage(CurriculumStage::new("stage2").duration(50));
assert!((curriculum.progress() - 0.0).abs() < 1e-5);
for _ in 0..50 {
curriculum.step();
}
assert!((curriculum.progress() - 0.5).abs() < 0.05);
}
}
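The cosine branch of `TemperatureAnnealing::get_temp` can be checked in isolation; `cosine_temp` is an illustrative name, but the formula is the one used above:

```rust
// Standalone sketch of the cosine annealing schedule from `get_temp`.
fn cosine_temp(initial: f32, final_t: f32, step: usize, total: usize) -> f32 {
    if step >= total {
        return final_t;
    }
    let progress = step as f32 / total as f32;
    // Starts at `initial` (cos(0) = 1) and ends at `final_t` (cos(pi) = -1)
    final_t + 0.5 * (initial - final_t) * (1.0 + (std::f32::consts::PI * progress).cos())
}

fn main() {
    // Endpoints hit the configured temperatures; midpoint is their average
    assert!((cosine_temp(1.0, 0.1, 0, 100) - 1.0).abs() < 1e-5);
    assert!((cosine_temp(1.0, 0.1, 50, 100) - 0.55).abs() < 1e-5);
    assert!((cosine_temp(1.0, 0.1, 100, 100) - 0.1).abs() < 1e-5);
}
```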

View File

@@ -0,0 +1,359 @@
//! Loss functions for attention-based learning
//!
//! Includes contrastive losses optimized for representation learning.
/// Reduction method for loss computation
#[derive(Clone, Copy, Debug, Default, PartialEq)]
pub enum Reduction {
#[default]
Mean,
Sum,
None,
}
/// Loss trait for attention training
pub trait Loss: Send + Sync {
/// Compute loss value
fn compute(&self, anchor: &[f32], positive: &[f32], negatives: &[&[f32]]) -> f32;
/// Compute loss with gradients for anchor
fn compute_with_gradients(
&self,
anchor: &[f32],
positive: &[f32],
negatives: &[&[f32]],
) -> (f32, Vec<f32>);
}
/// InfoNCE contrastive loss
///
/// L = -log(exp(sim(a,p)/τ) / (exp(sim(a,p)/τ) + Σ_n exp(sim(a,n)/τ)))
pub struct InfoNCELoss {
temperature: f32,
}
impl InfoNCELoss {
pub fn new(temperature: f32) -> Self {
Self {
temperature: temperature.max(0.01),
}
}
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt().max(1e-8);
let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt().max(1e-8);
dot / (norm_a * norm_b)
}
}
impl Loss for InfoNCELoss {
fn compute(&self, anchor: &[f32], positive: &[f32], negatives: &[&[f32]]) -> f32 {
let pos_sim = Self::cosine_similarity(anchor, positive) / self.temperature;
let neg_sims: Vec<f32> = negatives
.iter()
.map(|n| Self::cosine_similarity(anchor, n) / self.temperature)
.collect();
// Stable log-sum-exp
let max_sim = neg_sims
.iter()
.copied()
.chain(std::iter::once(pos_sim))
.fold(f32::NEG_INFINITY, f32::max);
let sum_exp: f32 =
neg_sims.iter().map(|s| (s - max_sim).exp()).sum::<f32>() + (pos_sim - max_sim).exp();
let log_sum_exp = max_sim + sum_exp.ln();
log_sum_exp - pos_sim
}
fn compute_with_gradients(
&self,
anchor: &[f32],
positive: &[f32],
negatives: &[&[f32]],
) -> (f32, Vec<f32>) {
let dim = anchor.len();
let pos_sim = Self::cosine_similarity(anchor, positive) / self.temperature;
let neg_sims: Vec<f32> = negatives
.iter()
.map(|n| Self::cosine_similarity(anchor, n) / self.temperature)
.collect();
// Compute softmax weights
let max_sim = neg_sims
.iter()
.copied()
.chain(std::iter::once(pos_sim))
.fold(f32::NEG_INFINITY, f32::max);
let pos_exp = (pos_sim - max_sim).exp();
let neg_exps: Vec<f32> = neg_sims.iter().map(|s| (s - max_sim).exp()).collect();
let total_exp: f32 = pos_exp + neg_exps.iter().sum::<f32>();
let pos_weight = pos_exp / total_exp;
let neg_weights: Vec<f32> = neg_exps.iter().map(|e| e / total_exp).collect();
// Loss value
let loss = -(pos_weight.ln());
        // Gradient with respect to anchor (each term carries a 1/τ factor):
        // ∂L/∂anchor = [(p_pos - 1) * ∂sim(a,p)/∂a + Σ_i p_neg_i * ∂sim(a,n_i)/∂a] / τ
let norm_a: f32 = anchor.iter().map(|x| x * x).sum::<f32>().sqrt().max(1e-8);
let norm_p: f32 = positive.iter().map(|x| x * x).sum::<f32>().sqrt().max(1e-8);
let mut gradients = vec![0.0f32; dim];
// Gradient from positive
let dot_ap: f32 = anchor.iter().zip(positive.iter()).map(|(a, p)| a * p).sum();
for i in 0..dim {
let d_sim = (positive[i] / (norm_a * norm_p))
- (anchor[i] * dot_ap / (norm_a.powi(3) * norm_p));
gradients[i] += (pos_weight - 1.0) * d_sim / self.temperature;
}
// Gradient from negatives
for (neg, &weight) in negatives.iter().zip(neg_weights.iter()) {
let norm_n: f32 = neg.iter().map(|x| x * x).sum::<f32>().sqrt().max(1e-8);
let dot_an: f32 = anchor.iter().zip(neg.iter()).map(|(a, n)| a * n).sum();
for i in 0..dim {
let d_sim =
(neg[i] / (norm_a * norm_n)) - (anchor[i] * dot_an / (norm_a.powi(3) * norm_n));
gradients[i] += weight * d_sim / self.temperature;
}
}
(loss, gradients)
}
}
/// Local contrastive loss for neighborhood preservation
pub struct LocalContrastiveLoss {
margin: f32,
reduction: Reduction,
}
impl LocalContrastiveLoss {
pub fn new(margin: f32) -> Self {
Self {
margin,
reduction: Reduction::Mean,
}
}
pub fn with_reduction(mut self, reduction: Reduction) -> Self {
self.reduction = reduction;
self
}
fn euclidean_distance(a: &[f32], b: &[f32]) -> f32 {
a.iter()
.zip(b.iter())
.map(|(x, y)| (x - y).powi(2))
.sum::<f32>()
.sqrt()
}
}
impl Loss for LocalContrastiveLoss {
fn compute(&self, anchor: &[f32], positive: &[f32], negatives: &[&[f32]]) -> f32 {
let d_pos = Self::euclidean_distance(anchor, positive);
let losses: Vec<f32> = negatives
.iter()
.map(|neg| {
let d_neg = Self::euclidean_distance(anchor, neg);
(d_pos - d_neg + self.margin).max(0.0)
})
.collect();
match self.reduction {
Reduction::Mean => losses.iter().sum::<f32>() / losses.len().max(1) as f32,
Reduction::Sum => losses.iter().sum(),
            // `None` reports a single unreduced term (the first hinge loss)
            Reduction::None => losses.first().copied().unwrap_or(0.0),
}
}
fn compute_with_gradients(
&self,
anchor: &[f32],
positive: &[f32],
negatives: &[&[f32]],
) -> (f32, Vec<f32>) {
let dim = anchor.len();
let d_pos = Self::euclidean_distance(anchor, positive);
let mut total_loss = 0.0f32;
let mut gradients = vec![0.0f32; dim];
let mut active_count = 0;
for neg in negatives.iter() {
let d_neg = Self::euclidean_distance(anchor, neg);
let margin_loss = d_pos - d_neg + self.margin;
if margin_loss > 0.0 {
total_loss += margin_loss;
active_count += 1;
// Gradient: ∂L/∂a = (a - p)/d_pos - (a - n)/d_neg
for i in 0..dim {
if d_pos > 1e-8 {
gradients[i] += (anchor[i] - positive[i]) / d_pos;
}
if d_neg > 1e-8 {
gradients[i] -= (anchor[i] - neg[i]) / d_neg;
}
}
}
}
let loss = match self.reduction {
Reduction::Mean if active_count > 0 => {
gradients.iter_mut().for_each(|g| *g /= active_count as f32);
total_loss / active_count as f32
}
Reduction::Sum => total_loss,
_ => total_loss / negatives.len().max(1) as f32,
};
(loss, gradients)
}
}
/// Spectral regularization for smooth representations
pub struct SpectralRegularization {
weight: f32,
}
impl SpectralRegularization {
pub fn new(weight: f32) -> Self {
Self { weight }
}
/// Compute spectral norm regularization for a batch of embeddings
pub fn compute_batch(&self, embeddings: &[&[f32]]) -> f32 {
if embeddings.is_empty() {
return 0.0;
}
let dim = embeddings[0].len();
        let n = embeddings.len();
        // Per-dimension variance across the batch (diagonal covariance approximation)
        let vars: Vec<f32> = (0..dim)
            .map(|d| {
                let mean: f32 = embeddings.iter().map(|e| e[d]).sum::<f32>() / n as f32;
                embeddings.iter().map(|e| (e[d] - mean).powi(2)).sum::<f32>() / n as f32
            })
            .collect();
        // Regularization: encourage uniform variance across dimensions
        let avg_var = vars.iter().sum::<f32>() / dim as f32;
        let var_of_var: f32 =
            vars.iter().map(|v| (v - avg_var).powi(2)).sum::<f32>() / dim as f32;
        self.weight * var_of_var
}
}
impl Loss for SpectralRegularization {
fn compute(&self, anchor: &[f32], positive: &[f32], negatives: &[&[f32]]) -> f32 {
let mut all_embeddings: Vec<&[f32]> = Vec::with_capacity(2 + negatives.len());
all_embeddings.push(anchor);
all_embeddings.push(positive);
all_embeddings.extend(negatives.iter().copied());
self.compute_batch(&all_embeddings)
}
fn compute_with_gradients(
&self,
anchor: &[f32],
positive: &[f32],
negatives: &[&[f32]],
) -> (f32, Vec<f32>) {
let loss = self.compute(anchor, positive, negatives);
// Simplified: no gradient for spectral reg (typically used as auxiliary)
let gradients = vec![0.0f32; anchor.len()];
(loss, gradients)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_infonce_loss() {
let loss = InfoNCELoss::new(0.07);
let anchor = vec![1.0, 0.0, 0.0];
let positive = vec![0.9, 0.1, 0.0];
let negatives: Vec<Vec<f32>> = vec![vec![0.0, 1.0, 0.0], vec![0.0, 0.0, 1.0]];
let neg_refs: Vec<&[f32]> = negatives.iter().map(|n| n.as_slice()).collect();
let loss_val = loss.compute(&anchor, &positive, &neg_refs);
assert!(loss_val >= 0.0);
}
#[test]
fn test_infonce_gradients() {
let loss = InfoNCELoss::new(0.1);
let anchor = vec![0.5; 64];
let positive = vec![0.6; 64];
let negatives: Vec<Vec<f32>> = vec![vec![0.1; 64]; 5];
let neg_refs: Vec<&[f32]> = negatives.iter().map(|n| n.as_slice()).collect();
let (loss_val, grads) = loss.compute_with_gradients(&anchor, &positive, &neg_refs);
assert!(loss_val >= 0.0);
assert_eq!(grads.len(), 64);
}
#[test]
fn test_local_contrastive() {
let loss = LocalContrastiveLoss::new(1.0);
let anchor = vec![0.0, 0.0];
let positive = vec![0.1, 0.0]; // Close
let negatives: Vec<Vec<f32>> = vec![vec![2.0, 0.0], vec![0.0, 2.0]]; // Far
let neg_refs: Vec<&[f32]> = negatives.iter().map(|n| n.as_slice()).collect();
let loss_val = loss.compute(&anchor, &positive, &neg_refs);
assert!(loss_val >= 0.0);
}
#[test]
fn test_spectral_regularization() {
let reg = SpectralRegularization::new(0.01);
let embeddings: Vec<Vec<f32>> = (0..10).map(|i| vec![i as f32 * 0.1; 32]).collect();
let emb_refs: Vec<&[f32]> = embeddings.iter().map(|e| e.as_slice()).collect();
let loss_val = reg.compute_batch(&emb_refs);
assert!(loss_val >= 0.0);
}
}
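The InfoNCE value computed by `InfoNCELoss::compute` reduces to a log-sum-exp over the positive plus all negatives, minus the positive logit. A self-contained sketch (function names here are illustrative, not the crate's API):

```rust
// Standalone sketch of the InfoNCE loss value, mirroring the math above.
fn cosine(a: &[f32], b: &[f32]) -> f32 {
    let dot: f32 = a.iter().zip(b).map(|(x, y)| x * y).sum();
    let na = a.iter().map(|x| x * x).sum::<f32>().sqrt().max(1e-8);
    let nb = b.iter().map(|x| x * x).sum::<f32>().sqrt().max(1e-8);
    dot / (na * nb)
}

fn info_nce(anchor: &[f32], positive: &[f32], negatives: &[&[f32]], tau: f32) -> f32 {
    let pos = cosine(anchor, positive) / tau;
    let sims: Vec<f32> = std::iter::once(pos)
        .chain(negatives.iter().map(|n| cosine(anchor, n) / tau))
        .collect();
    // Stable log-sum-exp over the positive plus all negatives
    let m = sims.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
    let lse = m + sims.iter().map(|s| (s - m).exp()).sum::<f32>().ln();
    lse - pos
}

fn main() {
    let anchor = [1.0f32, 0.0];
    let neg_a: [&[f32]; 1] = [&[0.0, 1.0]];
    let neg_b: [&[f32]; 1] = [&[1.0, 0.0]];
    // Aligned positive with orthogonal negative gives near-zero loss;
    // swapping their roles gives a large loss
    let easy = info_nce(&anchor, &[1.0, 0.0], &neg_a, 0.1);
    let hard = info_nce(&anchor, &[0.0, 1.0], &neg_b, 0.1);
    assert!(easy >= 0.0 && easy < hard);
}
```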

View File

@@ -0,0 +1,351 @@
//! Hard negative mining strategies
//!
//! Provides various methods for selecting informative negative samples.
/// Mining strategy enumeration
#[derive(Clone, Copy, Debug, Default, PartialEq)]
pub enum MiningStrategy {
#[default]
Random,
HardNegative,
SemiHard,
DistanceWeighted,
}
/// Trait for negative sample mining
pub trait NegativeMiner: Send + Sync {
/// Mine negatives for an anchor from a candidate pool
fn mine(
&self,
anchor: &[f32],
positive: &[f32],
candidates: &[&[f32]],
num_negatives: usize,
) -> Vec<usize>;
/// Get mining strategy
fn strategy(&self) -> MiningStrategy;
}
/// Hard negative miner that selects closest negatives
pub struct HardNegativeMiner {
strategy: MiningStrategy,
margin: f32,
temperature: f32,
}
impl HardNegativeMiner {
pub fn new(strategy: MiningStrategy) -> Self {
Self {
strategy,
margin: 0.1,
temperature: 1.0,
}
}
pub fn with_margin(mut self, margin: f32) -> Self {
self.margin = margin;
self
}
pub fn with_temperature(mut self, temp: f32) -> Self {
self.temperature = temp;
self
}
fn euclidean_distance(a: &[f32], b: &[f32]) -> f32 {
a.iter()
.zip(b.iter())
.map(|(x, y)| (x - y).powi(2))
.sum::<f32>()
.sqrt()
}
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt().max(1e-8);
let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt().max(1e-8);
dot / (norm_a * norm_b)
}
/// Select random indices
fn random_selection(num_candidates: usize, num_select: usize, seed: u64) -> Vec<usize> {
let mut indices: Vec<usize> = (0..num_candidates).collect();
let mut current_seed = seed;
// Fisher-Yates shuffle
for i in (1..indices.len()).rev() {
current_seed = current_seed
.wrapping_mul(6364136223846793005)
.wrapping_add(1);
let j = (current_seed as usize) % (i + 1);
indices.swap(i, j);
}
indices.truncate(num_select.min(num_candidates));
indices
}
/// Select hardest negatives (closest to anchor)
fn hard_negative_selection(
&self,
anchor: &[f32],
candidates: &[&[f32]],
num_select: usize,
) -> Vec<usize> {
let mut indexed_sims: Vec<(usize, f32)> = candidates
.iter()
.enumerate()
.map(|(i, c)| (i, Self::cosine_similarity(anchor, c)))
.collect();
// Sort by similarity descending (higher sim = harder negative)
indexed_sims.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
indexed_sims
.into_iter()
.take(num_select.min(candidates.len()))
.map(|(i, _)| i)
.collect()
}
/// Select semi-hard negatives (within margin of positive)
fn semi_hard_selection(
&self,
anchor: &[f32],
positive: &[f32],
candidates: &[&[f32]],
num_select: usize,
) -> Vec<usize> {
let d_pos = Self::euclidean_distance(anchor, positive);
let mut semi_hard: Vec<(usize, f32)> = candidates
.iter()
.enumerate()
.filter_map(|(i, c)| {
let d_neg = Self::euclidean_distance(anchor, c);
// Semi-hard: d_pos < d_neg < d_pos + margin
if d_neg > d_pos && d_neg < d_pos + self.margin {
Some((i, d_neg))
} else {
None
}
})
.collect();
// Sort by distance (prefer harder ones)
semi_hard.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
let mut result: Vec<usize> = semi_hard.into_iter().map(|(i, _)| i).collect();
        // If not enough semi-hard negatives, top up with hard negatives.
        // Request a full num_select so de-duplication cannot under-fill the result.
        if result.len() < num_select {
            let hard = self.hard_negative_selection(anchor, candidates, num_select);
            for idx in hard {
                if result.len() >= num_select {
                    break;
                }
                if !result.contains(&idx) {
                    result.push(idx);
                }
            }
        }
result.truncate(num_select);
result
}
/// Distance-weighted sampling
fn distance_weighted_selection(
&self,
anchor: &[f32],
candidates: &[&[f32]],
num_select: usize,
) -> Vec<usize> {
if candidates.is_empty() {
return vec![];
}
// Compute weights based on similarity (closer = higher weight)
let sims: Vec<f32> = candidates
.iter()
.map(|c| Self::cosine_similarity(anchor, c) / self.temperature)
.collect();
// Softmax weights
let max_sim = sims.iter().copied().fold(f32::NEG_INFINITY, f32::max);
let exp_sims: Vec<f32> = sims.iter().map(|s| (s - max_sim).exp()).collect();
let sum_exp: f32 = exp_sims.iter().sum();
let probs: Vec<f32> = exp_sims.iter().map(|e| e / sum_exp).collect();
// Sample without replacement using the probabilities
let mut remaining: Vec<(usize, f32)> = probs.into_iter().enumerate().collect();
let mut selected = Vec::with_capacity(num_select);
        // Fixed LCG seed: sampling is deterministic and reproducible across calls
        let mut seed = 42u64;
while selected.len() < num_select && !remaining.is_empty() {
// Random value
seed = seed.wrapping_mul(6364136223846793005).wrapping_add(1);
let r = (seed as f32) / (u64::MAX as f32);
// Select based on cumulative probability
let total: f32 = remaining.iter().map(|(_, p)| p).sum();
let mut cumsum = 0.0;
let mut select_idx = 0;
for (i, (_, p)) in remaining.iter().enumerate() {
cumsum += p / total;
if r < cumsum {
select_idx = i;
break;
}
}
let (orig_idx, _) = remaining.remove(select_idx);
selected.push(orig_idx);
}
selected
}
}
impl NegativeMiner for HardNegativeMiner {
fn mine(
&self,
anchor: &[f32],
positive: &[f32],
candidates: &[&[f32]],
num_negatives: usize,
) -> Vec<usize> {
match self.strategy {
MiningStrategy::Random => Self::random_selection(candidates.len(), num_negatives, 42),
MiningStrategy::HardNegative => {
self.hard_negative_selection(anchor, candidates, num_negatives)
}
MiningStrategy::SemiHard => {
self.semi_hard_selection(anchor, positive, candidates, num_negatives)
}
MiningStrategy::DistanceWeighted => {
self.distance_weighted_selection(anchor, candidates, num_negatives)
}
}
}
fn strategy(&self) -> MiningStrategy {
self.strategy
}
}
/// In-batch negative mining (uses other batch items as negatives)
pub struct InBatchMiner {
exclude_positive: bool,
}
impl InBatchMiner {
pub fn new() -> Self {
Self {
exclude_positive: true,
}
}
pub fn include_positive(mut self) -> Self {
self.exclude_positive = false;
self
}
/// Get negative indices from a batch for a given anchor index
pub fn get_negatives(
&self,
anchor_idx: usize,
positive_idx: usize,
batch_size: usize,
) -> Vec<usize> {
(0..batch_size)
.filter(|&i| i != anchor_idx && (!self.exclude_positive || i != positive_idx))
.collect()
}
}
impl Default for InBatchMiner {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_random_mining() {
let miner = HardNegativeMiner::new(MiningStrategy::Random);
let anchor = vec![1.0, 0.0, 0.0];
let positive = vec![0.9, 0.1, 0.0];
let candidates: Vec<Vec<f32>> = (0..20).map(|i| vec![i as f32 * 0.05; 3]).collect();
let cand_refs: Vec<&[f32]> = candidates.iter().map(|c| c.as_slice()).collect();
let selected = miner.mine(&anchor, &positive, &cand_refs, 5);
assert_eq!(selected.len(), 5);
}
#[test]
fn test_hard_negative_mining() {
let miner = HardNegativeMiner::new(MiningStrategy::HardNegative);
let anchor = vec![1.0, 0.0, 0.0];
let positive = vec![0.9, 0.1, 0.0];
// Create candidates with varying similarity to anchor
let candidates: Vec<Vec<f32>> = vec![
vec![0.9, 0.1, 0.0], // Similar to anchor
vec![0.5, 0.5, 0.0], // Medium
vec![0.0, 1.0, 0.0], // Different
vec![0.0, 0.0, 1.0], // Different
];
let cand_refs: Vec<&[f32]> = candidates.iter().map(|c| c.as_slice()).collect();
let selected = miner.mine(&anchor, &positive, &cand_refs, 2);
// Should select the most similar ones first
assert!(selected.contains(&0)); // Most similar
}
#[test]
fn test_semi_hard_mining() {
let miner = HardNegativeMiner::new(MiningStrategy::SemiHard).with_margin(1.0);
let anchor = vec![0.0, 0.0];
let positive = vec![0.5, 0.0]; // Distance 0.5
let candidates: Vec<Vec<f32>> = vec![
vec![0.3, 0.0], // Too easy (d = 0.3 < 0.5)
vec![0.7, 0.0], // Semi-hard (0.5 < 0.7 < 1.5)
vec![1.0, 0.0], // Semi-hard
vec![3.0, 0.0], // Too hard (d = 3.0 > 1.5)
];
let cand_refs: Vec<&[f32]> = candidates.iter().map(|c| c.as_slice()).collect();
let selected = miner.mine(&anchor, &positive, &cand_refs, 2);
assert!(!selected.is_empty());
}
#[test]
fn test_distance_weighted() {
let miner = HardNegativeMiner::new(MiningStrategy::DistanceWeighted).with_temperature(0.5);
let anchor = vec![1.0, 0.0];
let positive = vec![0.9, 0.1];
let candidates: Vec<Vec<f32>> = (0..10).map(|i| vec![0.1 * i as f32; 2]).collect();
let cand_refs: Vec<&[f32]> = candidates.iter().map(|c| c.as_slice()).collect();
let selected = miner.mine(&anchor, &positive, &cand_refs, 3);
assert_eq!(selected.len(), 3);
}
#[test]
fn test_in_batch_miner() {
let miner = InBatchMiner::new();
let negatives = miner.get_negatives(2, 5, 10);
assert!(!negatives.contains(&2)); // Exclude anchor
assert!(!negatives.contains(&5)); // Exclude positive
assert_eq!(negatives.len(), 8);
}
}
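The softmax-then-sample-without-replacement loop in `distance_weighted_selection` can be sketched standalone. This is an illustrative reimplementation, not part of the crate's API; the helper name `sample_without_replacement` and the fixed LCG seed are assumptions mirroring the implementation above:

```rust
// Minimal sketch: softmax over scores, then weighted sampling without
// replacement using a deterministic LCG, as in distance_weighted_selection.
fn sample_without_replacement(scores: &[f32], num_select: usize) -> Vec<usize> {
    // Stable softmax over the raw scores
    let max_s = scores.iter().copied().fold(f32::NEG_INFINITY, f32::max);
    let exps: Vec<f32> = scores.iter().map(|s| (s - max_s).exp()).collect();
    let sum: f32 = exps.iter().sum();
    let mut remaining: Vec<(usize, f32)> =
        exps.into_iter().map(|e| e / sum).enumerate().collect();
    let mut selected = Vec::with_capacity(num_select);
    let mut seed = 42u64; // fixed seed => reproducible, as in the module above
    while selected.len() < num_select && !remaining.is_empty() {
        seed = seed.wrapping_mul(6364136223846793005).wrapping_add(1);
        // Take high bits for a uniform value in [0, 1)
        let r = (seed >> 40) as f32 / (1u64 << 24) as f32;
        // Renormalize over remaining mass and pick by inverse CDF
        let total: f32 = remaining.iter().map(|(_, p)| p).sum();
        let mut cumsum = 0.0;
        let mut pick = 0;
        for (i, (_, p)) in remaining.iter().enumerate() {
            cumsum += p / total;
            if r < cumsum {
                pick = i;
                break;
            }
        }
        let (orig_idx, _) = remaining.remove(pick);
        selected.push(orig_idx);
    }
    selected
}

fn main() {
    let selected = sample_without_replacement(&[3.0, 2.0, 1.0, 0.0], 2);
    assert_eq!(selected.len(), 2);
    assert_ne!(selected[0], selected[1]); // without replacement
}
```

Each draw renormalizes only the remaining mass, which is what makes the scheme "without replacement" rather than simple repeated softmax sampling.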

View File

@@ -0,0 +1,42 @@
//! Training utilities for attention-based graph neural networks
//!
//! This module provides training infrastructure including:
//! - Loss functions (InfoNCE, contrastive, spectral regularization)
//! - Optimizers (SGD, Adam, AdamW)
//! - Curriculum learning schedulers
//! - Hard negative mining strategies
pub mod curriculum;
pub mod loss;
pub mod mining;
pub mod optimizer;
pub use curriculum::{CurriculumScheduler, CurriculumStage, DecayType, TemperatureAnnealing};
pub use loss::{InfoNCELoss, LocalContrastiveLoss, Loss, Reduction, SpectralRegularization};
pub use mining::{HardNegativeMiner, MiningStrategy, NegativeMiner};
pub use optimizer::{Adam, AdamW, Optimizer, SGD};
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_training_components_integration() {
// Test optimizer with loss
let mut optimizer = Adam::new(128, 0.001);
let loss = InfoNCELoss::new(0.07);
let mut params = vec![0.5; 128];
let anchor = vec![1.0; 128];
let positive = vec![0.9; 128];
let negatives: Vec<Vec<f32>> = (0..5).map(|_| vec![0.1; 128]).collect();
let neg_refs: Vec<&[f32]> = negatives.iter().map(|v| v.as_slice()).collect();
let (loss_val, gradients) = loss.compute_with_gradients(&anchor, &positive, &neg_refs);
assert!(loss_val >= 0.0);
assert_eq!(gradients.len(), anchor.len());
optimizer.step(&mut params, &gradients);
}
}

View File

@@ -0,0 +1,400 @@
//! Optimizers for attention training
//!
//! Provides standard optimizers with momentum and adaptive learning rates.
/// Optimizer trait for parameter updates
pub trait Optimizer: Send + Sync {
/// Update parameters using gradients
fn step(&mut self, params: &mut [f32], gradients: &[f32]);
/// Reset optimizer state
fn reset(&mut self);
/// Get current learning rate
fn learning_rate(&self) -> f32;
/// Set learning rate
fn set_learning_rate(&mut self, lr: f32);
}
/// Stochastic Gradient Descent with momentum
pub struct SGD {
lr: f32,
momentum: f32,
weight_decay: f32,
velocity: Vec<f32>,
nesterov: bool,
}
impl SGD {
pub fn new(dim: usize, lr: f32) -> Self {
Self {
lr,
momentum: 0.0,
weight_decay: 0.0,
velocity: vec![0.0; dim],
nesterov: false,
}
}
pub fn with_momentum(mut self, momentum: f32) -> Self {
self.momentum = momentum;
self
}
pub fn with_weight_decay(mut self, wd: f32) -> Self {
self.weight_decay = wd;
self
}
pub fn with_nesterov(mut self, nesterov: bool) -> Self {
self.nesterov = nesterov;
self
}
}
impl Optimizer for SGD {
fn step(&mut self, params: &mut [f32], gradients: &[f32]) {
if self.velocity.len() != params.len() {
self.velocity = vec![0.0; params.len()];
}
for i in 0..params.len() {
let mut g = gradients[i];
// Weight decay
if self.weight_decay > 0.0 {
g += self.weight_decay * params[i];
}
// Update velocity
self.velocity[i] = self.momentum * self.velocity[i] + g;
// Update parameters
if self.nesterov {
params[i] -= self.lr * (g + self.momentum * self.velocity[i]);
} else {
params[i] -= self.lr * self.velocity[i];
}
}
}
fn reset(&mut self) {
self.velocity.fill(0.0);
}
fn learning_rate(&self) -> f32 {
self.lr
}
fn set_learning_rate(&mut self, lr: f32) {
self.lr = lr;
}
}
/// Adam optimizer with bias correction
pub struct Adam {
lr: f32,
beta1: f32,
beta2: f32,
epsilon: f32,
weight_decay: f32,
m: Vec<f32>, // First moment
v: Vec<f32>, // Second moment
t: usize, // Timestep
}
impl Adam {
pub fn new(dim: usize, lr: f32) -> Self {
Self {
lr,
beta1: 0.9,
beta2: 0.999,
epsilon: 1e-8,
weight_decay: 0.0,
m: vec![0.0; dim],
v: vec![0.0; dim],
t: 0,
}
}
pub fn with_betas(mut self, beta1: f32, beta2: f32) -> Self {
self.beta1 = beta1;
self.beta2 = beta2;
self
}
pub fn with_epsilon(mut self, eps: f32) -> Self {
self.epsilon = eps;
self
}
pub fn with_weight_decay(mut self, wd: f32) -> Self {
self.weight_decay = wd;
self
}
}
impl Optimizer for Adam {
fn step(&mut self, params: &mut [f32], gradients: &[f32]) {
if self.m.len() != params.len() {
self.m = vec![0.0; params.len()];
self.v = vec![0.0; params.len()];
}
self.t += 1;
let bias_correction1 = 1.0 - self.beta1.powi(self.t as i32);
let bias_correction2 = 1.0 - self.beta2.powi(self.t as i32);
for i in 0..params.len() {
let g = gradients[i];
// Update moments
self.m[i] = self.beta1 * self.m[i] + (1.0 - self.beta1) * g;
self.v[i] = self.beta2 * self.v[i] + (1.0 - self.beta2) * g * g;
// Bias-corrected estimates
let m_hat = self.m[i] / bias_correction1;
let v_hat = self.v[i] / bias_correction2;
// Update with optional weight decay
let update = m_hat / (v_hat.sqrt() + self.epsilon);
params[i] -= self.lr * (update + self.weight_decay * params[i]);
}
}
fn reset(&mut self) {
self.m.fill(0.0);
self.v.fill(0.0);
self.t = 0;
}
fn learning_rate(&self) -> f32 {
self.lr
}
fn set_learning_rate(&mut self, lr: f32) {
self.lr = lr;
}
}
/// AdamW optimizer (decoupled weight decay)
pub struct AdamW {
inner: Adam,
weight_decay: f32,
}
impl AdamW {
pub fn new(dim: usize, lr: f32) -> Self {
Self {
inner: Adam::new(dim, lr),
weight_decay: 0.01,
}
}
pub fn with_weight_decay(mut self, wd: f32) -> Self {
self.weight_decay = wd;
self
}
pub fn with_betas(mut self, beta1: f32, beta2: f32) -> Self {
self.inner = self.inner.with_betas(beta1, beta2);
self
}
}
impl Optimizer for AdamW {
fn step(&mut self, params: &mut [f32], gradients: &[f32]) {
if self.inner.m.len() != params.len() {
self.inner.m = vec![0.0; params.len()];
self.inner.v = vec![0.0; params.len()];
}
self.inner.t += 1;
let bias_correction1 = 1.0 - self.inner.beta1.powi(self.inner.t as i32);
let bias_correction2 = 1.0 - self.inner.beta2.powi(self.inner.t as i32);
for i in 0..params.len() {
let g = gradients[i];
// Update moments
self.inner.m[i] = self.inner.beta1 * self.inner.m[i] + (1.0 - self.inner.beta1) * g;
self.inner.v[i] = self.inner.beta2 * self.inner.v[i] + (1.0 - self.inner.beta2) * g * g;
// Bias-corrected estimates
let m_hat = self.inner.m[i] / bias_correction1;
let v_hat = self.inner.v[i] / bias_correction2;
// Decoupled weight decay (applied to params directly, not through gradient)
params[i] *= 1.0 - self.inner.lr * self.weight_decay;
// Adam update
params[i] -= self.inner.lr * m_hat / (v_hat.sqrt() + self.inner.epsilon);
}
}
fn reset(&mut self) {
self.inner.reset();
}
fn learning_rate(&self) -> f32 {
self.inner.lr
}
fn set_learning_rate(&mut self, lr: f32) {
self.inner.lr = lr;
}
}
/// Learning rate scheduler
pub struct LearningRateScheduler {
initial_lr: f32,
warmup_steps: usize,
decay_steps: usize,
min_lr: f32,
current_step: usize,
}
impl LearningRateScheduler {
pub fn new(initial_lr: f32) -> Self {
Self {
initial_lr,
warmup_steps: 0,
decay_steps: 100000,
min_lr: 1e-7,
current_step: 0,
}
}
pub fn with_warmup(mut self, steps: usize) -> Self {
self.warmup_steps = steps;
self
}
pub fn with_decay(mut self, steps: usize) -> Self {
self.decay_steps = steps;
self
}
pub fn with_min_lr(mut self, min_lr: f32) -> Self {
self.min_lr = min_lr;
self
}
/// Get current learning rate and advance step
pub fn step(&mut self) -> f32 {
let lr = self.get_lr();
self.current_step += 1;
lr
}
/// Get learning rate without advancing
pub fn get_lr(&self) -> f32 {
if self.current_step < self.warmup_steps {
// Linear warmup
self.initial_lr * (self.current_step + 1) as f32 / self.warmup_steps as f32
} else {
// Cosine decay
let progress = (self.current_step - self.warmup_steps) as f32 / self.decay_steps as f32;
let decay = 0.5 * (1.0 + (std::f32::consts::PI * progress.min(1.0)).cos());
self.min_lr + (self.initial_lr - self.min_lr) * decay
}
}
/// Reset scheduler
pub fn reset(&mut self) {
self.current_step = 0;
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_sgd() {
let mut opt = SGD::new(4, 0.1);
let mut params = vec![1.0, 2.0, 3.0, 4.0];
let gradients = vec![0.1, 0.2, 0.3, 0.4];
opt.step(&mut params, &gradients);
assert!(params[0] < 1.0);
assert!(params[1] < 2.0);
}
#[test]
fn test_sgd_momentum() {
let mut opt = SGD::new(4, 0.1).with_momentum(0.9);
let mut params = vec![1.0; 4];
let gradients = vec![1.0; 4];
// Multiple steps should accumulate momentum
for _ in 0..5 {
opt.step(&mut params, &gradients);
}
assert!(params[0] < 0.0);
}
#[test]
fn test_adam() {
let mut opt = Adam::new(64, 0.001);
let mut params = vec![0.5; 64];
let gradients = vec![0.1; 64];
for _ in 0..100 {
opt.step(&mut params, &gradients);
}
// Should have moved toward 0
assert!(params[0] < 0.5);
}
#[test]
fn test_adamw() {
let mut opt = AdamW::new(32, 0.001).with_weight_decay(0.01);
let mut params = vec![1.0; 32];
let gradients = vec![0.0; 32]; // No gradient, only weight decay
for _ in 0..100 {
opt.step(&mut params, &gradients);
}
// Weight decay should shrink params
assert!(params[0] < 1.0);
}
#[test]
fn test_lr_scheduler_warmup() {
let mut scheduler = LearningRateScheduler::new(0.001).with_warmup(100);
let lr_start = scheduler.step();
assert!(lr_start < 0.001); // Still warming up
for _ in 0..99 {
scheduler.step();
}
let lr_end_warmup = scheduler.get_lr();
assert!((lr_end_warmup - 0.001).abs() < 1e-5);
}
#[test]
fn test_lr_scheduler_decay() {
let mut scheduler = LearningRateScheduler::new(0.001)
.with_warmup(0)
.with_decay(100)
.with_min_lr(0.0001);
let lr_start = scheduler.step();
assert!((lr_start - 0.001).abs() < 1e-5);
for _ in 0..100 {
scheduler.step();
}
let lr_end = scheduler.get_lr();
assert!((lr_end - 0.0001).abs() < 1e-5);
}
}
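The warmup-then-cosine schedule in `LearningRateScheduler::get_lr` reduces to a small pure function. A minimal sketch (the helper name `lr_at` is illustrative, not part of this module):

```rust
// Standalone sketch of the schedule above: linear ramp for `warmup` steps,
// then cosine decay from `initial` toward `min_lr` over `decay` steps.
fn lr_at(step: usize, initial: f32, warmup: usize, decay: usize, min_lr: f32) -> f32 {
    if step < warmup {
        // Linear warmup
        initial * (step + 1) as f32 / warmup as f32
    } else {
        // Cosine decay, clamped at full progress
        let progress = (step - warmup) as f32 / decay as f32;
        let factor = 0.5 * (1.0 + (std::f32::consts::PI * progress.min(1.0)).cos());
        min_lr + (initial - min_lr) * factor
    }
}

fn main() {
    // Warmup ramps up monotonically...
    assert!(lr_at(0, 1e-3, 100, 1000, 1e-5) < lr_at(50, 1e-3, 100, 1000, 1e-5));
    // ...reaches the peak learning rate at the end of warmup...
    assert!((lr_at(100, 1e-3, 100, 1000, 1e-5) - 1e-3).abs() < 1e-6);
    // ...and cosine decay bottoms out at min_lr.
    assert!((lr_at(1100, 1e-3, 100, 1000, 1e-5) - 1e-5).abs() < 1e-6);
}
```

In training code, the returned value would be fed to `Optimizer::set_learning_rate` each step.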

View File

@@ -0,0 +1,299 @@
//! Trait definitions for attention mechanisms.
//!
//! This module defines the core traits that all attention mechanisms implement,
//! including standard attention, graph attention, geometric attention, and
//! trainable attention with backward pass support.
use crate::error::AttentionResult;
/// Mask for sparse attention patterns.
#[derive(Clone, Debug)]
pub struct SparseMask {
/// Row indices for sparse mask
pub rows: Vec<usize>,
/// Column indices for sparse mask
pub cols: Vec<usize>,
/// Optional values (if not provided, defaults to 1.0)
pub values: Option<Vec<f32>>,
}
/// Edge information for graph attention.
#[derive(Clone, Debug)]
pub struct EdgeInfo {
/// Source node index
pub src: usize,
/// Destination node index
pub dst: usize,
/// Optional edge features
pub features: Option<Vec<f32>>,
}
/// Core attention mechanism trait.
///
/// Implements the basic attention computation: Attention(Q, K, V) = softmax(QK^T / sqrt(d_k))V
pub trait Attention: Send + Sync {
/// Computes attention over the given query, keys, and values.
///
/// # Arguments
///
/// * `query` - Query vector of shape [d_model]
/// * `keys` - Slice of key vectors, each of shape [d_model]
/// * `values` - Slice of value vectors, each of shape [d_model]
///
/// # Returns
///
/// Output vector of shape [d_model]
fn compute(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
) -> AttentionResult<Vec<f32>>;
/// Computes attention with optional mask.
///
/// # Arguments
///
/// * `query` - Query vector of shape [d_model]
/// * `keys` - Slice of key vectors, each of shape [d_model]
/// * `values` - Slice of value vectors, each of shape [d_model]
/// * `mask` - Optional attention mask (true = attend, false = mask out)
///
/// # Returns
///
/// Output vector of shape [d_model]
fn compute_with_mask(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
mask: Option<&[bool]>,
) -> AttentionResult<Vec<f32>>;
/// Returns the model dimension.
fn dim(&self) -> usize;
/// Returns the number of attention heads (1 for single-head attention).
fn num_heads(&self) -> usize {
1
}
}
/// Graph attention mechanism trait.
///
/// Extends basic attention to operate over graph structures with explicit edges.
pub trait GraphAttention: Attention {
/// Computes attention using graph structure.
///
/// # Arguments
///
/// * `node_features` - Features for all nodes, shape [num_nodes, d_model]
/// * `edges` - Edge information (source, destination, optional features)
///
/// # Returns
///
/// Updated node features of shape [num_nodes, d_model]
fn compute_with_edges(
&self,
node_features: &[Vec<f32>],
edges: &[EdgeInfo],
) -> AttentionResult<Vec<Vec<f32>>>;
/// Computes attention weights for edges.
///
/// # Arguments
///
/// * `src_feature` - Source node feature
/// * `dst_feature` - Destination node feature
/// * `edge_feature` - Optional edge feature
///
/// # Returns
///
/// Attention weight for this edge
fn compute_edge_attention(
&self,
src_feature: &[f32],
dst_feature: &[f32],
edge_feature: Option<&[f32]>,
) -> AttentionResult<f32>;
}
/// Geometric attention mechanism trait.
///
/// Implements attention in hyperbolic or other geometric spaces with curvature.
pub trait GeometricAttention: Attention {
/// Computes attention in geometric space with specified curvature.
///
/// # Arguments
///
/// * `query` - Query vector in geometric space
/// * `keys` - Key vectors in geometric space
/// * `values` - Value vectors
/// * `curvature` - Curvature parameter (negative for hyperbolic space)
///
/// # Returns
///
/// Output vector in geometric space
fn compute_geometric(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
curvature: f32,
) -> AttentionResult<Vec<f32>>;
/// Projects vector to geometric space.
fn project_to_geometric(&self, vector: &[f32], curvature: f32) -> AttentionResult<Vec<f32>>;
/// Projects vector back from geometric space.
fn project_from_geometric(&self, vector: &[f32], curvature: f32) -> AttentionResult<Vec<f32>>;
}
/// Sparse attention mechanism trait.
///
/// Implements efficient attention over sparse patterns.
pub trait SparseAttention: Attention {
/// Computes sparse attention using the provided mask.
///
/// # Arguments
///
/// * `query` - Query vector
/// * `keys` - Key vectors
/// * `values` - Value vectors
/// * `mask` - Sparse mask defining attention pattern
///
/// # Returns
///
/// Output vector
fn compute_sparse(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
mask: &SparseMask,
) -> AttentionResult<Vec<f32>>;
/// Generates a sparse mask for the given sequence length.
///
/// # Arguments
///
/// * `seq_len` - Sequence length
///
/// # Returns
///
/// Sparse mask for attention computation
fn generate_mask(&self, seq_len: usize) -> AttentionResult<SparseMask>;
}
/// Gradient information for backward pass.
#[derive(Clone, Debug)]
pub struct Gradients {
/// Gradient w.r.t. query
pub query_grad: Vec<f32>,
/// Gradient w.r.t. keys
pub keys_grad: Vec<Vec<f32>>,
/// Gradient w.r.t. values
pub values_grad: Vec<Vec<f32>>,
/// Gradient w.r.t. attention weights (for analysis)
pub attention_weights_grad: Option<Vec<f32>>,
}
/// Trainable attention mechanism with backward pass support.
pub trait TrainableAttention: Attention {
/// Forward pass with gradient tracking.
///
/// # Arguments
///
/// * `query` - Query vector
/// * `keys` - Key vectors
/// * `values` - Value vectors
///
/// # Returns
///
/// Tuple of (output, attention_weights) for gradient computation
fn forward(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
) -> AttentionResult<(Vec<f32>, Vec<f32>)>;
/// Backward pass for gradient computation.
///
/// # Arguments
///
/// * `grad_output` - Gradient from downstream layers
/// * `query` - Query from forward pass
/// * `keys` - Keys from forward pass
/// * `values` - Values from forward pass
/// * `attention_weights` - Attention weights from forward pass
///
/// # Returns
///
/// Gradients w.r.t. inputs
fn backward(
&self,
grad_output: &[f32],
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
attention_weights: &[f32],
) -> AttentionResult<Gradients>;
/// Updates parameters using computed gradients.
///
/// # Arguments
///
/// * `gradients` - Computed gradients
/// * `learning_rate` - Learning rate for update
fn update_parameters(
&mut self,
gradients: &Gradients,
learning_rate: f32,
) -> AttentionResult<()>;
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_sparse_mask_creation() {
let mask = SparseMask {
rows: vec![0, 1, 2],
cols: vec![0, 1, 2],
values: None,
};
assert_eq!(mask.rows.len(), 3);
assert_eq!(mask.cols.len(), 3);
assert!(mask.values.is_none());
}
#[test]
fn test_edge_info_creation() {
let edge = EdgeInfo {
src: 0,
dst: 1,
features: Some(vec![0.5, 0.3]),
};
assert_eq!(edge.src, 0);
assert_eq!(edge.dst, 1);
assert_eq!(edge.features.as_ref().unwrap().len(), 2);
}
#[test]
fn test_gradients_creation() {
let grads = Gradients {
query_grad: vec![0.1, 0.2],
keys_grad: vec![vec![0.3, 0.4]],
values_grad: vec![vec![0.5, 0.6]],
attention_weights_grad: None,
};
assert_eq!(grads.query_grad.len(), 2);
assert_eq!(grads.keys_grad.len(), 1);
assert!(grads.attention_weights_grad.is_none());
}
}
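A minimal single-query implementation of the contract `Attention::compute` describes, softmax(q·k_i / sqrt(d)) followed by a weighted sum of values, can be sketched as a free function (illustrative only; the crate's concrete implementations live elsewhere):

```rust
// Minimal single-query scaled dot-product attention:
// weights = softmax(q . k_i / sqrt(d)), output = sum_i weights[i] * v_i.
fn attend(query: &[f32], keys: &[&[f32]], values: &[&[f32]]) -> Vec<f32> {
    let scale = 1.0 / (query.len() as f32).sqrt();
    let logits: Vec<f32> = keys
        .iter()
        .map(|k| query.iter().zip(k.iter()).map(|(q, x)| q * x).sum::<f32>() * scale)
        .collect();
    // Stable softmax
    let max = logits.iter().copied().fold(f32::NEG_INFINITY, f32::max);
    let exps: Vec<f32> = logits.iter().map(|l| (l - max).exp()).collect();
    let sum: f32 = exps.iter().sum();
    // Weighted sum of values
    let mut out = vec![0.0f32; values[0].len()];
    for (e, v) in exps.iter().zip(values.iter()) {
        let w = e / sum;
        for (o, &x) in out.iter_mut().zip(v.iter()) {
            *o += w * x;
        }
    }
    out
}

fn main() {
    let keys: Vec<Vec<f32>> = vec![vec![1.0, 0.0], vec![0.0, 1.0]];
    let vals = keys.clone();
    let k_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
    let v_refs: Vec<&[f32]> = vals.iter().map(|v| v.as_slice()).collect();
    let out = attend(&[1.0, 0.0], &k_refs, &v_refs);
    // The first key matches the query, so the output leans toward the first value.
    assert!(out[0] > out[1]);
}
```

A masked variant, as in `compute_with_mask`, would simply skip (or set to negative infinity) the logits of masked-out positions before the softmax.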

View File

@@ -0,0 +1,241 @@
//! Cached Projections for Fast OT
//!
//! Pre-compute and cache random projections per window to avoid
//! redundant computation across queries.
use rand::prelude::*;
use rand::rngs::StdRng;
/// Cache for random projection directions
#[derive(Debug, Clone)]
pub struct ProjectionCache {
/// Random unit directions [P × dim]
pub directions: Vec<Vec<f32>>,
/// Number of projections
pub num_projections: usize,
/// Dimension
pub dim: usize,
}
impl ProjectionCache {
/// Create new projection cache with P random directions
pub fn new(dim: usize, num_projections: usize, seed: u64) -> Self {
let mut rng = StdRng::seed_from_u64(seed);
let directions: Vec<Vec<f32>> = (0..num_projections)
.map(|_| {
                // Sample each coordinate uniformly from [-1, 1), then normalize.
                // (Approximately uniform directions; exact uniformity on the sphere
                // would require Gaussian coordinate sampling.)
                let mut dir: Vec<f32> = (0..dim)
                    .map(|_| rng.sample::<f32, _>(rand::distributions::Standard) * 2.0 - 1.0)
                    .collect();
// Normalize to unit vector
let norm: f32 = dir.iter().map(|x| x * x).sum::<f32>().sqrt();
if norm > 1e-8 {
for x in &mut dir {
*x /= norm;
}
}
dir
})
.collect();
Self {
directions,
num_projections,
dim,
}
}
/// Project a single vector onto all directions
/// Returns [P] projected values
#[inline]
pub fn project(&self, vector: &[f32]) -> Vec<f32> {
self.directions
.iter()
.map(|dir| Self::dot_product_simd(vector, dir))
.collect()
}
/// Project a single vector into pre-allocated buffer
#[inline]
pub fn project_into(&self, vector: &[f32], out: &mut [f32]) {
for (i, dir) in self.directions.iter().enumerate() {
out[i] = Self::dot_product_simd(vector, dir);
}
}
/// SIMD-friendly 4-way unrolled dot product
#[inline(always)]
fn dot_product_simd(a: &[f32], b: &[f32]) -> f32 {
let len = a.len();
let chunks = len / 4;
let remainder = len % 4;
let mut sum0 = 0.0f32;
let mut sum1 = 0.0f32;
let mut sum2 = 0.0f32;
let mut sum3 = 0.0f32;
for i in 0..chunks {
let base = i * 4;
sum0 += a[base] * b[base];
sum1 += a[base + 1] * b[base + 1];
sum2 += a[base + 2] * b[base + 2];
sum3 += a[base + 3] * b[base + 3];
}
let base = chunks * 4;
for i in 0..remainder {
sum0 += a[base + i] * b[base + i];
}
sum0 + sum1 + sum2 + sum3
}
}
/// Per-window cache containing sorted projections
#[derive(Debug, Clone)]
pub struct WindowCache {
/// Projected keys [num_keys × P]
pub key_projections: Vec<Vec<f32>>,
/// Sorted indices per projection [P × num_keys]
pub sorted_indices: Vec<Vec<usize>>,
/// Sorted values per projection [P × num_keys]
pub sorted_values: Vec<Vec<f32>>,
/// Histogram bins per projection [P × num_bins]
pub histograms: Option<Vec<Vec<f32>>>,
/// CDF per projection [P × num_bins]
pub cdfs: Option<Vec<Vec<f32>>>,
/// Number of keys in window
pub num_keys: usize,
}
impl WindowCache {
/// Build cache from keys using projection cache
pub fn build(keys: &[&[f32]], proj_cache: &ProjectionCache) -> Self {
let num_keys = keys.len();
let num_proj = proj_cache.num_projections;
// Project all keys
let key_projections: Vec<Vec<f32>> = keys.iter().map(|k| proj_cache.project(k)).collect();
// Sort indices and values for each projection
let mut sorted_indices = vec![Vec::with_capacity(num_keys); num_proj];
let mut sorted_values = vec![Vec::with_capacity(num_keys); num_proj];
for p in 0..num_proj {
let mut indexed: Vec<(usize, f32)> = key_projections
.iter()
.enumerate()
.map(|(i, projs)| (i, projs[p]))
.collect();
indexed.sort_unstable_by(|a, b| {
a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal)
});
sorted_indices[p] = indexed.iter().map(|(i, _)| *i).collect();
sorted_values[p] = indexed.iter().map(|(_, v)| *v).collect();
}
Self {
key_projections,
sorted_indices,
sorted_values,
histograms: None,
cdfs: None,
num_keys,
}
}
/// Build histograms for ultra-fast CDF comparison
pub fn build_histograms(&mut self, num_bins: usize) {
let num_proj = self.sorted_values.len();
let mut histograms = vec![vec![0.0f32; num_bins]; num_proj];
let mut cdfs = vec![vec![0.0f32; num_bins]; num_proj];
for p in 0..num_proj {
let vals = &self.sorted_values[p];
if vals.is_empty() {
continue;
}
let min_val = vals[0];
let max_val = vals[vals.len() - 1];
let range = (max_val - min_val).max(1e-8);
// Build histogram
for &v in vals {
let bin = ((v - min_val) / range * (num_bins - 1) as f32)
.clamp(0.0, (num_bins - 1) as f32) as usize;
histograms[p][bin] += 1.0 / self.num_keys as f32;
}
// Build CDF
let mut cumsum = 0.0f32;
for bin in 0..num_bins {
cumsum += histograms[p][bin];
cdfs[p][bin] = cumsum;
}
}
self.histograms = Some(histograms);
self.cdfs = Some(cdfs);
}
/// Get sorted values for a projection
#[inline]
pub fn get_sorted(&self, projection_idx: usize) -> &[f32] {
&self.sorted_values[projection_idx]
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_projection_cache() {
let cache = ProjectionCache::new(64, 8, 42);
assert_eq!(cache.num_projections, 8);
assert_eq!(cache.dim, 64);
// Check directions are unit vectors
for dir in &cache.directions {
let norm: f32 = dir.iter().map(|x| x * x).sum::<f32>().sqrt();
assert!((norm - 1.0).abs() < 1e-5);
}
}
#[test]
fn test_window_cache() {
let proj_cache = ProjectionCache::new(32, 4, 42);
let keys: Vec<Vec<f32>> = (0..10).map(|i| vec![i as f32 * 0.1; 32]).collect();
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let window_cache = WindowCache::build(&keys_refs, &proj_cache);
assert_eq!(window_cache.num_keys, 10);
assert_eq!(window_cache.sorted_indices.len(), 4);
}
#[test]
fn test_histograms() {
let proj_cache = ProjectionCache::new(16, 2, 42);
let keys: Vec<Vec<f32>> = (0..20).map(|i| vec![i as f32 * 0.05; 16]).collect();
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let mut window_cache = WindowCache::build(&keys_refs, &proj_cache);
window_cache.build_histograms(10);
assert!(window_cache.cdfs.is_some());
// CDF should end at 1.0
let cdfs = window_cache.cdfs.as_ref().unwrap();
for cdf in cdfs {
assert!((cdf[cdf.len() - 1] - 1.0).abs() < 1e-5);
}
}
}
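The per-projection `sorted_values` exist to make 1D optimal-transport comparisons cheap: for two equal-size samples, the W1 distance is simply the mean absolute difference of their sorted values. A standalone sketch of that downstream consumer (how the cache is consumed is an assumption; this file only builds it):

```rust
// Sketch: W1 distance between two equal-size 1D samples is the mean
// absolute difference of their sorted values - exactly the quantity the
// per-projection sorted_values cache makes an O(n) zip instead of a sort.
fn wasserstein_1d_sorted(a_sorted: &[f32], b_sorted: &[f32]) -> f32 {
    assert_eq!(a_sorted.len(), b_sorted.len());
    let n = a_sorted.len();
    if n == 0 {
        return 0.0;
    }
    a_sorted
        .iter()
        .zip(b_sorted.iter())
        .map(|(x, y)| (x - y).abs())
        .sum::<f32>()
        / n as f32
}

fn main() {
    let a = [0.0, 1.0, 2.0];
    let b = [1.0, 2.0, 3.0]; // each point shifted by 1
    assert!((wasserstein_1d_sorted(&a, &b) - 1.0).abs() < 1e-6);
}
```

Averaging this quantity over the P cached projection axes yields a sliced-Wasserstein estimate between two windows without ever forming a pairwise cost matrix.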

View File

@@ -0,0 +1,443 @@
//! Centroid-Based Optimal Transport Attention
//!
//! Clusters keys into M centroids and computes OT between query and centroids.
//! Much faster than full pairwise OT.
//!
//! ## Algorithm
//!
//! 1. Cluster keys into M centroids using k-means
//! 2. Store centroid vectors and weights (fraction of keys in each cluster)
//! 3. For each query, compute transport to centroid distribution
//! 4. Convert transport cost to attention logits
use crate::error::{AttentionError, AttentionResult};
use crate::traits::Attention;
use serde::{Deserialize, Serialize};
/// Configuration for Centroid OT Attention
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CentroidOTConfig {
/// Model dimension
pub dim: usize,
/// Number of centroids (16-32 typical)
pub num_centroids: usize,
/// Number of k-means iterations
pub kmeans_iterations: usize,
/// Temperature for softmax
pub temperature: f32,
/// Regularization for Sinkhorn (0.1 typical)
pub sinkhorn_reg: f32,
/// Max Sinkhorn iterations
pub sinkhorn_iterations: usize,
/// Random seed
pub seed: u64,
}
impl Default for CentroidOTConfig {
fn default() -> Self {
Self {
dim: 512,
num_centroids: 16,
kmeans_iterations: 10,
temperature: 1.0,
sinkhorn_reg: 0.1,
sinkhorn_iterations: 20,
seed: 42,
}
}
}
/// Cached centroid information for a window
#[derive(Debug, Clone)]
pub struct CentroidCache {
/// Centroid vectors [M × dim]
pub centroids: Vec<Vec<f32>>,
/// Weights for each centroid (sum to 1)
pub weights: Vec<f32>,
/// Assignment of each key to centroid
pub assignments: Vec<usize>,
/// Number of keys
pub num_keys: usize,
}
impl CentroidCache {
/// Build centroid cache using k-means
pub fn build(keys: &[&[f32]], num_centroids: usize, iterations: usize, seed: u64) -> Self {
let num_keys = keys.len();
let m = num_centroids.min(num_keys);
if num_keys == 0 || keys[0].is_empty() {
return Self {
centroids: vec![],
weights: vec![],
assignments: vec![],
num_keys: 0,
};
}
let dim = keys[0].len();
// Initialize centroids with random keys
use rand::prelude::*;
let mut rng = rand::rngs::StdRng::seed_from_u64(seed);
let mut indices: Vec<usize> = (0..num_keys).collect();
indices.shuffle(&mut rng);
let mut centroids: Vec<Vec<f32>> =
indices.iter().take(m).map(|&i| keys[i].to_vec()).collect();
let mut assignments = vec![0usize; num_keys];
// K-means iterations
for _ in 0..iterations {
// Assign each key to nearest centroid
for (key_idx, key) in keys.iter().enumerate() {
let mut min_dist = f32::MAX;
let mut best_centroid = 0;
for (c_idx, centroid) in centroids.iter().enumerate() {
let dist = Self::squared_distance(key, centroid);
if dist < min_dist {
min_dist = dist;
best_centroid = c_idx;
}
}
assignments[key_idx] = best_centroid;
}
// Update centroids
let mut new_centroids = vec![vec![0.0f32; dim]; m];
let mut counts = vec![0usize; m];
for (key_idx, &assignment) in assignments.iter().enumerate() {
counts[assignment] += 1;
for (d, &v) in keys[key_idx].iter().enumerate() {
new_centroids[assignment][d] += v;
}
}
for c_idx in 0..m {
if counts[c_idx] > 0 {
for d in 0..dim {
new_centroids[c_idx][d] /= counts[c_idx] as f32;
}
centroids[c_idx] = new_centroids[c_idx].clone();
}
}
}
// Compute weights
let mut counts = vec![0usize; m];
for &a in &assignments {
counts[a] += 1;
}
let weights: Vec<f32> = counts.iter().map(|&c| c as f32 / num_keys as f32).collect();
Self {
centroids,
weights,
assignments,
num_keys,
}
}
/// Squared Euclidean distance (SIMD-friendly)
#[inline]
fn squared_distance(a: &[f32], b: &[f32]) -> f32 {
let len = a.len();
let chunks = len / 4;
let remainder = len % 4;
let mut sum0 = 0.0f32;
let mut sum1 = 0.0f32;
let mut sum2 = 0.0f32;
let mut sum3 = 0.0f32;
for i in 0..chunks {
let base = i * 4;
let d0 = a[base] - b[base];
let d1 = a[base + 1] - b[base + 1];
let d2 = a[base + 2] - b[base + 2];
let d3 = a[base + 3] - b[base + 3];
sum0 += d0 * d0;
sum1 += d1 * d1;
sum2 += d2 * d2;
sum3 += d3 * d3;
}
let base = chunks * 4;
for i in 0..remainder {
let d = a[base + i] - b[base + i];
sum0 += d * d;
}
sum0 + sum1 + sum2 + sum3
}
}
/// Centroid-based OT Attention
///
/// Computes attention by finding optimal transport between query and
/// centroid distribution, then distributing attention to original keys.
#[derive(Debug, Clone)]
pub struct CentroidOTAttention {
config: CentroidOTConfig,
}
impl CentroidOTAttention {
/// Create new Centroid OT attention
pub fn new(config: CentroidOTConfig) -> Self {
Self { config }
}
/// Create with dimension only
pub fn with_dim(dim: usize) -> Self {
Self::new(CentroidOTConfig {
dim,
..Default::default()
})
}
/// Build centroid cache for a window
pub fn build_cache(&self, keys: &[&[f32]]) -> CentroidCache {
CentroidCache::build(
keys,
self.config.num_centroids,
self.config.kmeans_iterations,
self.config.seed,
)
}
/// Compute attention using cached centroids
pub fn compute_with_cache(
&self,
query: &[f32],
cache: &CentroidCache,
values: &[&[f32]],
) -> AttentionResult<Vec<f32>> {
if cache.centroids.is_empty() {
return Err(AttentionError::InvalidConfig("Empty cache".into()));
}
// Compute distances from query to each centroid
let centroid_distances: Vec<f32> = cache
.centroids
.iter()
.map(|c| CentroidCache::squared_distance(query, c).sqrt())
.collect();
// Convert to centroid attention weights
let centroid_logits: Vec<f32> = centroid_distances
.iter()
.map(|d| -d / self.config.temperature)
.collect();
let centroid_weights = Self::stable_softmax(&centroid_logits);
        // Distribute centroid weights to original keys
        // (precompute cluster sizes once instead of recounting them per key)
        let mut cluster_sizes = vec![0usize; cache.centroids.len()];
        for &a in &cache.assignments {
            cluster_sizes[a] += 1;
        }
        let mut key_weights = vec![0.0f32; cache.num_keys];
        for (key_idx, &assignment) in cache.assignments.iter().enumerate() {
            // Key weight = centroid weight / number of keys in its cluster
            if cluster_sizes[assignment] > 0 {
                key_weights[key_idx] =
                    centroid_weights[assignment] / cluster_sizes[assignment] as f32;
            }
        }
// Weighted sum of values
self.weighted_sum(&key_weights, values)
}
/// Fast Sinkhorn transport (simplified for point-to-distribution)
#[allow(dead_code)]
fn sinkhorn_distance(&self, query: &[f32], cache: &CentroidCache) -> f32 {
let m = cache.centroids.len();
if m == 0 {
return 0.0;
}
// Cost matrix: 1 × M (query to each centroid)
let costs: Vec<f32> = cache
.centroids
.iter()
.map(|c| CentroidCache::squared_distance(query, c))
.collect();
// Source is delta at query (weight 1)
// Target is centroid distribution (cache.weights)
// Log-domain Sinkhorn
let reg = self.config.sinkhorn_reg;
let log_k: Vec<f32> = costs.iter().map(|c| -c / reg).collect();
let mut log_v = vec![0.0f32; m];
let log_b: Vec<f32> = cache.weights.iter().map(|w| w.ln().max(-20.0)).collect();
for _ in 0..self.config.sinkhorn_iterations {
            // Update log_u via a stabilized log-sum-exp over log_k + log_v
            let max_term: f32 = log_k
                .iter()
                .zip(log_v.iter())
                .map(|(&lk, &lv)| lk + lv)
                .fold(f32::NEG_INFINITY, f32::max);
            let exp_sum: f32 = log_k
                .iter()
                .zip(log_v.iter())
                .map(|(&lk, &lv)| (lk + lv - max_term).exp())
                .sum();
            let log_u = -max_term - exp_sum.ln();
// Update log_v
for j in 0..m {
log_v[j] = log_b[j] - (log_u + log_k[j]);
}
}
// Compute transport cost
let mut total_cost = 0.0f32;
for j in 0..m {
let gamma = (log_v[j] + log_k[j]).exp();
total_cost += gamma * costs[j];
}
total_cost
}
/// Stable softmax
fn stable_softmax(logits: &[f32]) -> Vec<f32> {
if logits.is_empty() {
return vec![];
}
let max_logit = logits.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
let exp_logits: Vec<f32> = logits.iter().map(|&l| (l - max_logit).exp()).collect();
let sum: f32 = exp_logits.iter().sum();
exp_logits.iter().map(|&e| e / sum).collect()
}
/// Weighted sum of values
fn weighted_sum(&self, weights: &[f32], values: &[&[f32]]) -> AttentionResult<Vec<f32>> {
if weights.is_empty() || values.is_empty() {
return Err(AttentionError::InvalidConfig("Empty inputs".into()));
}
let dim = values[0].len();
let mut output = vec![0.0f32; dim];
for (weight, value) in weights.iter().zip(values.iter()) {
for (o, &v) in output.iter_mut().zip(value.iter()) {
*o += weight * v;
}
}
Ok(output)
}
}
impl Attention for CentroidOTAttention {
fn compute(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
) -> AttentionResult<Vec<f32>> {
let cache = self.build_cache(keys);
self.compute_with_cache(query, &cache, values)
}
fn compute_with_mask(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
mask: Option<&[bool]>,
) -> AttentionResult<Vec<f32>> {
if let Some(m) = mask {
let filtered: Vec<(&[f32], &[f32])> = keys
.iter()
.zip(values.iter())
.enumerate()
.filter(|(i, _)| m.get(*i).copied().unwrap_or(true))
.map(|(_, (k, v))| (*k, *v))
.collect();
let filtered_keys: Vec<&[f32]> = filtered.iter().map(|(k, _)| *k).collect();
let filtered_values: Vec<&[f32]> = filtered.iter().map(|(_, v)| *v).collect();
self.compute(query, &filtered_keys, &filtered_values)
} else {
self.compute(query, keys, values)
}
}
fn dim(&self) -> usize {
self.config.dim
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_centroid_cache() {
let keys: Vec<Vec<f32>> = (0..50).map(|i| vec![i as f32 * 0.1; 32]).collect();
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let cache = CentroidCache::build(&keys_refs, 8, 5, 42);
assert_eq!(cache.centroids.len(), 8);
assert_eq!(cache.weights.len(), 8);
assert_eq!(cache.assignments.len(), 50);
// Weights should sum to 1
let weight_sum: f32 = cache.weights.iter().sum();
assert!((weight_sum - 1.0).abs() < 1e-5);
}
#[test]
fn test_centroid_ot_attention() {
let attention = CentroidOTAttention::with_dim(32);
let query = vec![0.5f32; 32];
let keys: Vec<Vec<f32>> = (0..30).map(|i| vec![i as f32 * 0.05; 32]).collect();
let values: Vec<Vec<f32>> = (0..30).map(|i| vec![i as f32; 32]).collect();
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
let output = attention.compute(&query, &keys_refs, &values_refs).unwrap();
assert_eq!(output.len(), 32);
}
#[test]
fn test_cache_reuse() {
let attention = CentroidOTAttention::with_dim(64);
let keys: Vec<Vec<f32>> = (0..40).map(|i| vec![i as f32 * 0.025; 64]).collect();
let values: Vec<Vec<f32>> = (0..40).map(|i| vec![i as f32; 64]).collect();
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
// Build cache once
let cache = attention.build_cache(&keys_refs);
// Reuse for multiple queries
for q in 0..10 {
let query = vec![q as f32 * 0.1; 64];
let output = attention
.compute_with_cache(&query, &cache, &values_refs)
.unwrap();
assert_eq!(output.len(), 64);
}
}
}
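The `sinkhorn_distance` helper above runs log-domain Sinkhorn for the special case of a single source point. A self-contained sketch of that case (plain Rust, crate types omitted; `sinkhorn_delta_source` is an illustrative name, not the crate's API). For a delta source the optimal plan is simply `gamma_j = b_j`, so the converged cost should equal the weighted mean cost:

```rust
// Log-domain Sinkhorn for a delta source: one source point with mass 1,
// m target centroids with weights b (summing to 1).
fn sinkhorn_delta_source(costs: &[f32], b: &[f32], reg: f32, iters: usize) -> f32 {
    let m = costs.len();
    let log_k: Vec<f32> = costs.iter().map(|c| -c / reg).collect();
    let log_b: Vec<f32> = b.iter().map(|w| w.ln().max(-20.0)).collect();
    let mut log_v = vec![0.0f32; m];
    let mut log_u = 0.0f32;
    for _ in 0..iters {
        // log_u = -logsumexp_j(log_k[j] + log_v[j]); the source mass is 1, so log_a = 0
        let max_t = log_k
            .iter()
            .zip(&log_v)
            .map(|(&lk, &lv)| lk + lv)
            .fold(f32::NEG_INFINITY, f32::max);
        let exp_sum: f32 = log_k
            .iter()
            .zip(&log_v)
            .map(|(&lk, &lv)| (lk + lv - max_t).exp())
            .sum();
        log_u = -(max_t + exp_sum.ln());
        // log_v[j] = log_b[j] - (log_u + log_k[j])
        for j in 0..m {
            log_v[j] = log_b[j] - (log_u + log_k[j]);
        }
    }
    // Transport plan entry: gamma_j = exp(log_u + log_k[j] + log_v[j])
    (0..m)
        .map(|j| (log_u + log_k[j] + log_v[j]).exp() * costs[j])
        .sum()
}

fn main() {
    let costs = [0.5f32, 1.0, 2.0];
    let b = [0.5f32, 0.3, 0.2];
    let got = sinkhorn_delta_source(&costs, &b, 0.1, 20);
    // Expected cost for a delta source: sum_j b_j * cost_j = 0.95
    let expected: f32 = costs.iter().zip(&b).map(|(c, w)| c * w).sum();
    assert!((got - expected).abs() < 1e-3);
    println!("transport cost = {got}");
}
```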

View File

@@ -0,0 +1,35 @@
//! Optimal Transport Attention
//!
//! Fast attention mechanisms using Optimal Transport theory.
//!
//! ## Key Optimizations
//!
//! 1. **Sliced Wasserstein**: Random 1D projections with cached sorted orders
//! 2. **Centroid OT**: Cluster keys into M centroids, transport to prototypes only
//! 3. **Two-Stage**: Cheap prefilter + expensive OT kernel on candidates
//! 4. **Histogram CDF**: Replace sorting with binned CDFs for SIMD-friendly ops
//!
//! ## Typical Parameter Ranges
//!
//! - Candidates C: 32-64
//! - Projections P: 8-16
//! - Centroids M: 16-32
mod cached_projections;
mod centroid_ot;
mod sliced_wasserstein;
pub use cached_projections::{ProjectionCache, WindowCache};
pub use centroid_ot::{CentroidCache, CentroidOTAttention, CentroidOTConfig};
pub use sliced_wasserstein::{SlicedWassersteinAttention, SlicedWassersteinConfig};
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_module_exists() {
// Basic module test
assert!(true);
}
}
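The sliced-Wasserstein optimization above reduces high-dimensional OT to 1D problems: project both point sets onto a random direction, sort, and pair off quantiles. A minimal sketch of the 1D building block (illustrative helper, not the crate's API):

```rust
// 1D Wasserstein-1 between two equal-size empirical distributions:
// sort both samples and average the pairwise absolute differences.
fn wasserstein_1d(a: &[f32], b: &[f32]) -> f32 {
    assert_eq!(a.len(), b.len());
    let mut a = a.to_vec();
    let mut b = b.to_vec();
    a.sort_by(|x, y| x.partial_cmp(y).unwrap());
    b.sort_by(|x, y| x.partial_cmp(y).unwrap());
    a.iter().zip(&b).map(|(x, y)| (x - y).abs()).sum::<f32>() / a.len() as f32
}

fn main() {
    // Shifting a distribution by a constant moves W1 by exactly that constant.
    let a = [0.0f32, 1.0, 2.0, 3.0];
    let b: Vec<f32> = a.iter().map(|x| x + 0.5).collect();
    let d = wasserstein_1d(&a, &b);
    assert!((d - 0.5).abs() < 1e-6);
    // Permutation-invariant: reordering samples does not change the distance.
    let b_shuffled = [3.5f32, 0.5, 2.5, 1.5];
    assert!((wasserstein_1d(&a, &b_shuffled) - 0.5).abs() < 1e-6);
    println!("W1 = {d}");
}
```

Caching the sorted orders per window, as the modules above do, amortizes the sort cost across queries.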

View File

@@ -0,0 +1,443 @@
//! Sliced Wasserstein Attention
//!
//! Attention using Optimal Transport distances via random 1D projections.
//!
//! ## Algorithm
//!
//! 1. Pre-compute P random projections and cache sorted orders per window
//! 2. For each query:
//! a. Project query onto all P directions
//! b. Compare to cached sorted key distributions
//! c. Convert transport costs to attention logits
//!
//! ## Optimizations
//!
//! - Window-level caching of sorted projections
//! - Two-stage: dot-product prefilter + OT on candidates
//! - Histogram CDF for ultra-fast comparisons
//! - SIMD-friendly kernels throughout
use super::cached_projections::{ProjectionCache, WindowCache};
use crate::error::{AttentionError, AttentionResult};
use crate::traits::Attention;
use serde::{Deserialize, Serialize};
/// Configuration for Sliced Wasserstein Attention
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SlicedWassersteinConfig {
/// Model dimension
pub dim: usize,
/// Number of random projections (8-16 typical)
pub num_projections: usize,
/// Number of candidates for two-stage filtering (32-64 typical)
pub num_candidates: usize,
/// Temperature for softmax
pub temperature: f32,
/// Whether to use histogram-based CDF (faster but less precise)
pub use_histograms: bool,
/// Number of histogram bins if using histograms
pub num_bins: usize,
/// Random seed for reproducibility
pub seed: u64,
/// Wasserstein power (1 or 2)
pub wasserstein_power: f32,
}
impl Default for SlicedWassersteinConfig {
fn default() -> Self {
Self {
dim: 512,
num_projections: 8,
num_candidates: 48,
temperature: 1.0,
use_histograms: false,
num_bins: 32,
seed: 42,
wasserstein_power: 2.0,
}
}
}
impl SlicedWassersteinConfig {
/// Create config with dimension
pub fn with_dim(dim: usize) -> Self {
Self {
dim,
..Default::default()
}
}
}
/// Sliced Wasserstein Attention
///
/// Uses OT distance instead of dot product for attention scoring.
/// Robust to local permutations and better for comparing distributions.
#[derive(Debug, Clone)]
pub struct SlicedWassersteinAttention {
config: SlicedWassersteinConfig,
projection_cache: ProjectionCache,
}
impl SlicedWassersteinAttention {
/// Create new Sliced Wasserstein attention
pub fn new(config: SlicedWassersteinConfig) -> Self {
let projection_cache =
ProjectionCache::new(config.dim, config.num_projections, config.seed);
Self {
config,
projection_cache,
}
}
/// Create with dimension only (uses defaults)
pub fn with_dim(dim: usize) -> Self {
Self::new(SlicedWassersteinConfig::with_dim(dim))
}
/// Build window cache for a set of keys
pub fn build_window_cache(&self, keys: &[&[f32]]) -> WindowCache {
let mut cache = WindowCache::build(keys, &self.projection_cache);
if self.config.use_histograms {
cache.build_histograms(self.config.num_bins);
}
cache
}
/// Compute attention using pre-built window cache (fast path)
pub fn compute_with_cache(
&self,
query: &[f32],
window_cache: &WindowCache,
values: &[&[f32]],
) -> AttentionResult<Vec<f32>> {
let num_keys = window_cache.num_keys;
if num_keys == 0 {
return Err(AttentionError::InvalidConfig("No keys provided".into()));
}
// Project query
let query_projections = self.projection_cache.project(query);
// Compute OT distances to all keys
let distances = self.compute_ot_distances(&query_projections, window_cache);
// Convert to attention weights (negative distance = higher attention)
let logits: Vec<f32> = distances
.iter()
.map(|d| -d / self.config.temperature)
.collect();
// Softmax
let weights = Self::stable_softmax(&logits);
// Weighted sum of values
self.weighted_sum(&weights, values)
}
/// Compute with two-stage filtering (prefilter + OT on candidates)
pub fn compute_two_stage(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
window_cache: &WindowCache,
) -> AttentionResult<Vec<f32>> {
let num_keys = keys.len();
if num_keys == 0 {
return Err(AttentionError::InvalidConfig("No keys provided".into()));
}
let num_candidates = self.config.num_candidates.min(num_keys);
// Stage 1: Cheap dot-product prefilter to get top-C candidates
let mut dot_scores: Vec<(usize, f32)> = keys
.iter()
.enumerate()
.map(|(i, k)| (i, Self::dot_product_simd(query, k)))
.collect();
dot_scores
.sort_unstable_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
let candidate_indices: Vec<usize> = dot_scores
.iter()
.take(num_candidates)
.map(|(i, _)| *i)
.collect();
// Stage 2: OT distance only on candidates
let query_projections = self.projection_cache.project(query);
let candidate_distances: Vec<(usize, f32)> = candidate_indices
.iter()
.map(|&idx| {
let key_projs = &window_cache.key_projections[idx];
let ot_dist = self.compute_1d_ot_distance(&query_projections, key_projs);
(idx, ot_dist)
})
.collect();
// Convert to attention weights
let logits: Vec<f32> = candidate_distances
.iter()
.map(|(_, d)| -d / self.config.temperature)
.collect();
let weights = Self::stable_softmax(&logits);
// Weighted sum using only candidate values
let candidate_values: Vec<&[f32]> = candidate_indices.iter().map(|&i| values[i]).collect();
self.weighted_sum(&weights, &candidate_values)
}
/// Compute 1D OT distances using sorted projections
fn compute_ot_distances(&self, query_projs: &[f32], cache: &WindowCache) -> Vec<f32> {
let num_keys = cache.num_keys;
let mut distances = vec![0.0f32; num_keys];
// For each key, sum OT distances across all projections
for key_idx in 0..num_keys {
let key_projs = &cache.key_projections[key_idx];
distances[key_idx] = self.compute_1d_ot_distance(query_projs, key_projs);
}
distances
}
/// Compute OT distance between two projected points
/// Simple case: (mean over projections of |q - k|^p)^(1/p)
#[inline]
fn compute_1d_ot_distance(&self, query_projs: &[f32], key_projs: &[f32]) -> f32 {
let p = self.config.wasserstein_power;
let num_proj = query_projs.len();
if (p - 2.0).abs() < 0.01 {
// W2: squared differences (SIMD-friendly)
let sum: f32 = query_projs
.iter()
.zip(key_projs.iter())
.map(|(&q, &k)| {
let d = q - k;
d * d
})
.sum();
(sum / num_proj as f32).sqrt()
} else if (p - 1.0).abs() < 0.01 {
// W1: absolute differences
let sum: f32 = query_projs
.iter()
.zip(key_projs.iter())
.map(|(&q, &k)| (q - k).abs())
.sum();
sum / num_proj as f32
} else {
// General case
let sum: f32 = query_projs
.iter()
.zip(key_projs.iter())
.map(|(&q, &k)| (q - k).abs().powf(p))
.sum();
(sum / num_proj as f32).powf(1.0 / p)
}
}
/// Compute OT distance to the window distribution using sorted values
/// This compares query to the empirical CDF of keys
#[allow(dead_code)]
fn compute_distributional_ot(&self, query_projs: &[f32], cache: &WindowCache) -> f32 {
let num_proj = query_projs.len();
let mut total_dist = 0.0f32;
for p in 0..num_proj {
let sorted = cache.get_sorted(p);
let q_val = query_projs[p];
// Find where query falls in the sorted distribution
// and compute distance to nearest quantile
let n = sorted.len() as f32;
let mut min_dist = f32::MAX;
for (i, &k_val) in sorted.iter().enumerate() {
let quantile_dist = ((i as f32 + 0.5) / n - 0.5).abs();
let value_dist = (q_val - k_val).abs();
min_dist = min_dist.min(value_dist + 0.1 * quantile_dist);
}
total_dist += min_dist;
}
total_dist / num_proj as f32
}
/// Stable softmax implementation
#[inline]
fn stable_softmax(logits: &[f32]) -> Vec<f32> {
if logits.is_empty() {
return vec![];
}
let max_logit = logits.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
let exp_logits: Vec<f32> = logits.iter().map(|&l| (l - max_logit).exp()).collect();
let sum: f32 = exp_logits.iter().sum();
exp_logits.iter().map(|&e| e / sum).collect()
}
/// SIMD-friendly dot product
#[inline(always)]
fn dot_product_simd(a: &[f32], b: &[f32]) -> f32 {
let len = a.len().min(b.len());
let chunks = len / 4;
let remainder = len % 4;
let mut sum0 = 0.0f32;
let mut sum1 = 0.0f32;
let mut sum2 = 0.0f32;
let mut sum3 = 0.0f32;
for i in 0..chunks {
let base = i * 4;
sum0 += a[base] * b[base];
sum1 += a[base + 1] * b[base + 1];
sum2 += a[base + 2] * b[base + 2];
sum3 += a[base + 3] * b[base + 3];
}
let base = chunks * 4;
for i in 0..remainder {
sum0 += a[base + i] * b[base + i];
}
sum0 + sum1 + sum2 + sum3
}
/// Weighted sum of values
fn weighted_sum(&self, weights: &[f32], values: &[&[f32]]) -> AttentionResult<Vec<f32>> {
if weights.is_empty() || values.is_empty() {
return Err(AttentionError::InvalidConfig("Empty inputs".into()));
}
let dim = values[0].len();
let mut output = vec![0.0f32; dim];
for (weight, value) in weights.iter().zip(values.iter()) {
for (o, &v) in output.iter_mut().zip(value.iter()) {
*o += weight * v;
}
}
Ok(output)
}
}
impl Attention for SlicedWassersteinAttention {
fn compute(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
) -> AttentionResult<Vec<f32>> {
// Build cache and compute
let cache = self.build_window_cache(keys);
if self.config.num_candidates < keys.len() {
self.compute_two_stage(query, keys, values, &cache)
} else {
self.compute_with_cache(query, &cache, values)
}
}
fn compute_with_mask(
&self,
query: &[f32],
keys: &[&[f32]],
values: &[&[f32]],
mask: Option<&[bool]>,
) -> AttentionResult<Vec<f32>> {
if let Some(m) = mask {
// Filter by mask
let filtered: Vec<(usize, &[f32], &[f32])> = keys
.iter()
.zip(values.iter())
.enumerate()
.filter(|(i, _)| m.get(*i).copied().unwrap_or(true))
.map(|(i, (k, v))| (i, *k, *v))
.collect();
let filtered_keys: Vec<&[f32]> = filtered.iter().map(|(_, k, _)| *k).collect();
let filtered_values: Vec<&[f32]> = filtered.iter().map(|(_, _, v)| *v).collect();
self.compute(query, &filtered_keys, &filtered_values)
} else {
self.compute(query, keys, values)
}
}
fn dim(&self) -> usize {
self.config.dim
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_sliced_wasserstein_attention() {
let attention = SlicedWassersteinAttention::with_dim(32);
let query = vec![1.0f32; 32];
let keys: Vec<Vec<f32>> = (0..10).map(|i| vec![0.5 + i as f32 * 0.1; 32]).collect();
let values: Vec<Vec<f32>> = (0..10).map(|i| vec![i as f32; 32]).collect();
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
let output = attention.compute(&query, &keys_refs, &values_refs).unwrap();
assert_eq!(output.len(), 32);
}
#[test]
fn test_window_cache_reuse() {
let attention = SlicedWassersteinAttention::with_dim(64);
let keys: Vec<Vec<f32>> = (0..20).map(|i| vec![i as f32 * 0.05; 64]).collect();
let values: Vec<Vec<f32>> = (0..20).map(|i| vec![i as f32; 64]).collect();
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
// Build cache once
let cache = attention.build_window_cache(&keys_refs);
// Reuse for multiple queries
for _ in 0..5 {
let query = vec![0.5f32; 64];
let output = attention
.compute_with_cache(&query, &cache, &values_refs)
.unwrap();
assert_eq!(output.len(), 64);
}
}
#[test]
fn test_two_stage_filtering() {
let config = SlicedWassersteinConfig {
dim: 32,
num_candidates: 8,
..Default::default()
};
let attention = SlicedWassersteinAttention::new(config);
let query = vec![1.0f32; 32];
let keys: Vec<Vec<f32>> = (0..50).map(|i| vec![0.5 + i as f32 * 0.02; 32]).collect();
let values: Vec<Vec<f32>> = (0..50).map(|i| vec![i as f32; 32]).collect();
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
let output = attention.compute(&query, &keys_refs, &values_refs).unwrap();
assert_eq!(output.len(), 32);
}
}
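The two-stage path above prefilters with cheap dot products and runs the OT kernel only on the top-C survivors. The selection stage can be sketched on its own (`top_c_by_dot` is an illustrative name, not the crate's API):

```rust
// Top-C prefilter: score every key with a cheap dot product, keep the C
// best indices, and reserve the expensive kernel for those candidates.
fn top_c_by_dot(query: &[f32], keys: &[Vec<f32>], c: usize) -> Vec<usize> {
    let mut scored: Vec<(usize, f32)> = keys
        .iter()
        .enumerate()
        .map(|(i, k)| (i, query.iter().zip(k).map(|(q, kv)| q * kv).sum()))
        .collect();
    // Sort descending by score; NaN-tolerant, like the two-stage code above.
    scored.sort_unstable_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
    scored.into_iter().take(c).map(|(i, _)| i).collect()
}

fn main() {
    let query = vec![1.0f32, 0.0];
    let keys = vec![
        vec![0.1f32, 0.9],  // score 0.1
        vec![0.9f32, 0.1],  // score 0.9
        vec![0.5f32, 0.5],  // score 0.5
        vec![-1.0f32, 0.0], // score -1.0
    ];
    let top2 = top_c_by_dot(&query, &keys, 2);
    assert_eq!(top2, vec![1, 2]);
    println!("candidates: {top2:?}");
}
```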

View File

@@ -0,0 +1,122 @@
//! Individual Geometry Metrics
use serde::{Deserialize, Serialize};
/// Type of geometric metric
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum MetricType {
/// Sliced Wasserstein OT distance
OTDistance,
/// Topology coherence (k-NN based)
TopologyCoherence,
/// H0 persistence death sum
H0Persistence,
/// Information bottleneck KL
IBKL,
/// Diffusion energy
DiffusionEnergy,
/// Fisher-Rao distance
FisherRao,
/// Attention entropy
AttentionEntropy,
}
/// A metric value with metadata
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MetricValue {
/// Metric type
pub metric_type: MetricType,
/// Raw value
pub value: f32,
/// Normalized value (0-1)
pub normalized: f32,
/// Whether this is healthy (within expected range)
pub is_healthy: bool,
/// Warning threshold
pub warning_threshold: f32,
/// Critical threshold
pub critical_threshold: f32,
}
impl MetricValue {
/// Create new metric value
pub fn new(
metric_type: MetricType,
value: f32,
min_expected: f32,
max_expected: f32,
warning_threshold: f32,
critical_threshold: f32,
) -> Self {
let range = max_expected - min_expected;
let normalized = if range > 0.0 {
((value - min_expected) / range).clamp(0.0, 1.0)
} else {
0.5
};
let is_healthy = value >= min_expected && value <= max_expected;
Self {
metric_type,
value,
normalized,
is_healthy,
warning_threshold,
critical_threshold,
}
}
/// Check if metric is in warning state.
/// Thresholds may be inverted (warning > critical) for metrics where
/// lower values are worse, e.g. topology coherence and attention entropy.
pub fn is_warning(&self) -> bool {
if self.warning_threshold <= self.critical_threshold {
self.value > self.warning_threshold && self.value <= self.critical_threshold
} else {
self.value < self.warning_threshold && self.value >= self.critical_threshold
}
}
/// Check if metric is in critical state
pub fn is_critical(&self) -> bool {
if self.warning_threshold <= self.critical_threshold {
self.value > self.critical_threshold
} else {
self.value < self.critical_threshold
}
}
/// Get status string
pub fn status(&self) -> &'static str {
if self.is_critical() {
"CRITICAL"
} else if self.is_warning() {
"WARNING"
} else if self.is_healthy {
"OK"
} else {
"UNKNOWN"
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_metric_value() {
let metric = MetricValue::new(MetricType::TopologyCoherence, 0.7, 0.0, 1.0, 0.3, 0.1);
assert_eq!(metric.metric_type, MetricType::TopologyCoherence);
assert!((metric.normalized - 0.7).abs() < 1e-5);
assert!(metric.is_healthy);
}
#[test]
fn test_warning_critical() {
let metric = MetricValue::new(
MetricType::OTDistance,
5.0, // High OT distance
0.0,
10.0,
3.0, // Warning at 3
7.0, // Critical at 7
);
assert!(metric.is_warning());
assert!(!metric.is_critical());
assert_eq!(metric.status(), "WARNING");
}
}
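For metrics where larger values are worse, the threshold convention above yields three bands: OK up to the warning threshold, WARNING between warning and critical, CRITICAL beyond. A simplified sketch of that banding logic (standalone helper with hypothetical numbers, not the crate's API):

```rust
// Higher-is-worse threshold bands: OK below warning, WARNING between
// warning and critical, CRITICAL above critical.
fn status(value: f32, warning: f32, critical: f32) -> &'static str {
    if value > critical {
        "CRITICAL"
    } else if value > warning {
        "WARNING"
    } else {
        "OK"
    }
}

fn main() {
    // Same bands as the OT-distance test above: warn at 3, critical at 7.
    assert_eq!(status(1.0, 3.0, 7.0), "OK");
    assert_eq!(status(5.0, 3.0, 7.0), "WARNING");
    assert_eq!(status(9.0, 3.0, 7.0), "CRITICAL");
    println!("bands check out");
}
```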

View File

@@ -0,0 +1,32 @@
//! Unified Geometry Report
//!
//! Combines all geometric metrics into a single diagnostic surface.
//!
//! ## Metrics Included
//!
//! 1. **OT Distance**: Sliced Wasserstein mean absolute distance
//! 2. **Topology Coherence**: k-NN boundary mass ratio
//! 3. **H0 Persistence**: Sum of death times (structural complexity)
//! 4. **IB KL**: Information bottleneck compression term
//! 5. **Diffusion Energy**: Smoothness on key graph
//!
//! ## Use Cases
//!
//! - Routing decisions in MoE
//! - Gating signals for attention modes
//! - Monitoring attention health
//! - Debugging attention patterns
mod metrics;
mod report;
pub use metrics::{MetricType, MetricValue};
pub use report::{AttentionRecommendation, GeometryReport, ReportBuilder, ReportConfig};
#[cfg(test)]
mod tests {
#[test]
fn test_module_exists() {
assert!(true);
}
}

View File

@@ -0,0 +1,494 @@
//! Unified Geometry Report Builder
use super::metrics::{MetricType, MetricValue};
use crate::info_bottleneck::KLDivergence;
use crate::pde_attention::GraphLaplacian;
use crate::topology::WindowCoherence;
use serde::{Deserialize, Serialize};
/// Report configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ReportConfig {
/// Number of OT projections
pub ot_projections: usize,
/// k for k-NN coherence
pub knn_k: usize,
/// Sigma for diffusion
pub diffusion_sigma: f32,
/// Whether to compute H0 persistence (expensive)
pub compute_persistence: bool,
/// Random seed
pub seed: u64,
}
impl Default for ReportConfig {
fn default() -> Self {
Self {
ot_projections: 8,
knn_k: 8,
diffusion_sigma: 1.0,
compute_persistence: false,
seed: 42,
}
}
}
/// Unified geometry report
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GeometryReport {
/// OT sliced Wasserstein mean distance
pub ot_mean_distance: f32,
/// Topology coherence score
pub topology_coherence: f32,
/// H0 persistence death sum (if computed)
pub h0_death_sum: Option<f32>,
/// Information bottleneck KL
pub ib_kl: f32,
/// Diffusion energy
pub diffusion_energy: f32,
/// Attention entropy
pub attention_entropy: f32,
/// All metrics with thresholds
pub metrics: Vec<MetricValue>,
/// Overall health score (0-1)
pub health_score: f32,
/// Recommended action
pub recommendation: AttentionRecommendation,
}
/// Recommended action based on report
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum AttentionRecommendation {
/// Full attention, normal operation
Stable,
/// Reduce attention width
Cautious,
/// Retrieval only, no updates
Freeze,
/// Increase temperature
IncreaseTemperature,
/// Decrease temperature
DecreaseTemperature,
/// Add regularization
AddRegularization,
}
/// Report builder
pub struct ReportBuilder {
config: ReportConfig,
}
impl ReportBuilder {
/// Create new report builder
pub fn new(config: ReportConfig) -> Self {
Self { config }
}
/// Build report from query and keys
pub fn build(
&self,
query: &[f32],
keys: &[&[f32]],
attention_weights: Option<&[f32]>,
ib_mean: Option<&[f32]>,
ib_log_var: Option<&[f32]>,
) -> GeometryReport {
let n = keys.len();
if n == 0 {
return GeometryReport::empty();
}
let _dim = keys[0].len();
// 1. OT distance (simplified sliced Wasserstein)
let ot_mean = self.compute_ot_distance(query, keys);
// 2. Topology coherence
let coherence = self.compute_coherence(keys);
// 3. H0 persistence (optional)
let h0_sum = if self.config.compute_persistence {
Some(self.compute_h0_persistence(keys))
} else {
None
};
// 4. IB KL
let ib_kl = match (ib_mean, ib_log_var) {
(Some(m), Some(v)) => KLDivergence::gaussian_to_unit_arrays(m, v),
_ => 0.0,
};
// 5. Diffusion energy
let diffusion_energy = self.compute_diffusion_energy(query, keys);
// 6. Attention entropy
let entropy = match attention_weights {
Some(w) => self.compute_entropy(w),
None => (n as f32).ln(), // Max entropy
};
// Build metrics
let mut metrics = vec![
MetricValue::new(MetricType::OTDistance, ot_mean, 0.0, 10.0, 5.0, 8.0),
MetricValue::new(MetricType::TopologyCoherence, coherence, 0.0, 1.0, 0.3, 0.1),
MetricValue::new(MetricType::IBKL, ib_kl, 0.0, 100.0, 50.0, 80.0),
MetricValue::new(
MetricType::DiffusionEnergy,
diffusion_energy,
0.0,
100.0,
50.0,
80.0,
),
MetricValue::new(
MetricType::AttentionEntropy,
entropy,
0.0,
(n as f32).ln().max(1.0),
0.5,
0.2,
),
];
if let Some(h0) = h0_sum {
metrics.push(MetricValue::new(
MetricType::H0Persistence,
h0,
0.0,
100.0,
50.0,
80.0,
));
}
// Compute health score
let health_score = self.compute_health_score(&metrics);
// Determine recommendation
let recommendation = self.determine_recommendation(&metrics, coherence, entropy, n);
GeometryReport {
ot_mean_distance: ot_mean,
topology_coherence: coherence,
h0_death_sum: h0_sum,
ib_kl,
diffusion_energy,
attention_entropy: entropy,
metrics,
health_score,
recommendation,
}
}
/// Simplified sliced Wasserstein distance
fn compute_ot_distance(&self, query: &[f32], keys: &[&[f32]]) -> f32 {
let dim = query.len();
let n = keys.len();
if n == 0 {
return 0.0;
}
// Generate random projections
let mut rng_state = self.config.seed;
let projections: Vec<Vec<f32>> = (0..self.config.ot_projections)
.map(|_| self.random_unit_vector(dim, &mut rng_state))
.collect();
// Project query
let q_projs: Vec<f32> = projections.iter().map(|p| Self::dot(query, p)).collect();
// Mean absolute distance over keys
let mut total = 0.0f32;
for key in keys {
let mut dist = 0.0f32;
for (i, proj) in projections.iter().enumerate() {
let k_proj = Self::dot(key, proj);
dist += (q_projs[i] - k_proj).abs();
}
total += dist / self.config.ot_projections as f32;
}
total / n as f32
}
/// Compute coherence using WindowCoherence
fn compute_coherence(&self, keys: &[&[f32]]) -> f32 {
use crate::topology::CoherenceMetric;
let coherence = WindowCoherence::compute(
keys,
self.config.knn_k,
&[
CoherenceMetric::BoundaryMass,
CoherenceMetric::SimilarityVariance,
],
);
coherence.score
}
/// Compute H0 persistence (expensive)
fn compute_h0_persistence(&self, keys: &[&[f32]]) -> f32 {
let n = keys.len();
if n <= 1 {
return 0.0;
}
// Build distance matrix
let mut edges: Vec<(f32, usize, usize)> = Vec::new();
for i in 0..n {
for j in (i + 1)..n {
let dist = Self::l2_distance(keys[i], keys[j]);
edges.push((dist, i, j));
}
}
edges.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(std::cmp::Ordering::Equal));
// Union-Find for Kruskal's algorithm
let mut parent: Vec<usize> = (0..n).collect();
let mut rank = vec![0u8; n];
let mut deaths = Vec::new();
fn find(parent: &mut [usize], x: usize) -> usize {
if parent[x] != x {
parent[x] = find(parent, parent[x]);
}
parent[x]
}
fn union(parent: &mut [usize], rank: &mut [u8], a: usize, b: usize) -> bool {
let mut ra = find(parent, a);
let mut rb = find(parent, b);
if ra == rb {
return false;
}
if rank[ra] < rank[rb] {
std::mem::swap(&mut ra, &mut rb);
}
parent[rb] = ra;
if rank[ra] == rank[rb] {
rank[ra] += 1;
}
true
}
for (w, i, j) in edges {
if union(&mut parent, &mut rank, i, j) {
deaths.push(w);
if deaths.len() == n - 1 {
break;
}
}
}
// Drop the largest death time (stand-in for the essential, never-dying component)
if !deaths.is_empty() {
deaths.pop();
}
deaths.iter().sum()
}
/// Compute diffusion energy
fn compute_diffusion_energy(&self, query: &[f32], keys: &[&[f32]]) -> f32 {
use crate::pde_attention::LaplacianType;
let n = keys.len();
if n == 0 {
return 0.0;
}
// Initial logits
let x: Vec<f32> = keys.iter().map(|k| Self::dot(query, k)).collect();
// Build Laplacian
let lap = GraphLaplacian::from_keys(
keys,
self.config.diffusion_sigma,
LaplacianType::Unnormalized,
);
// Energy = x^T L x
let lx = lap.apply(&x);
Self::dot(&x, &lx)
}
/// Compute entropy
fn compute_entropy(&self, weights: &[f32]) -> f32 {
let eps = 1e-10;
let mut entropy = 0.0f32;
for &w in weights {
if w > eps {
entropy -= w * w.ln();
}
}
entropy.max(0.0)
}
/// Compute overall health score
fn compute_health_score(&self, metrics: &[MetricValue]) -> f32 {
if metrics.is_empty() {
return 1.0;
}
let healthy_count = metrics.iter().filter(|m| m.is_healthy).count();
healthy_count as f32 / metrics.len() as f32
}
/// Determine recommendation
fn determine_recommendation(
&self,
metrics: &[MetricValue],
coherence: f32,
entropy: f32,
n: usize,
) -> AttentionRecommendation {
let max_entropy = (n as f32).ln().max(1.0);
let entropy_ratio = entropy / max_entropy;
// Check for critical conditions
let has_critical = metrics.iter().any(|m| m.is_critical());
if has_critical {
return AttentionRecommendation::Freeze;
}
// Low coherence = cautious mode
if coherence < 0.3 {
return AttentionRecommendation::Cautious;
}
// Very low entropy = temperature too low
if entropy_ratio < 0.2 {
return AttentionRecommendation::IncreaseTemperature;
}
// Very high entropy = temperature too high
if entropy_ratio > 0.9 {
return AttentionRecommendation::DecreaseTemperature;
}
// Check for warnings
let has_warning = metrics.iter().any(|m| m.is_warning());
if has_warning {
return AttentionRecommendation::AddRegularization;
}
AttentionRecommendation::Stable
}
/// Generate random unit vector
fn random_unit_vector(&self, dim: usize, state: &mut u64) -> Vec<f32> {
let mut v = vec![0.0f32; dim];
for i in 0..dim {
// XorShift
*state ^= *state << 13;
*state ^= *state >> 7;
*state ^= *state << 17;
let u = (*state & 0x00FF_FFFF) as f32 / 16_777_216.0;
v[i] = u * 2.0 - 1.0;
}
let norm: f32 = v.iter().map(|x| x * x).sum::<f32>().sqrt();
if norm > 0.0 {
for x in v.iter_mut() {
*x /= norm;
}
}
v
}
/// Dot product
#[inline]
fn dot(a: &[f32], b: &[f32]) -> f32 {
a.iter().zip(b.iter()).map(|(&ai, &bi)| ai * bi).sum()
}
/// L2 distance
#[inline]
fn l2_distance(a: &[f32], b: &[f32]) -> f32 {
a.iter()
.zip(b.iter())
.map(|(&ai, &bi)| (ai - bi) * (ai - bi))
.sum::<f32>()
.sqrt()
}
}
impl GeometryReport {
/// Create empty report
pub fn empty() -> Self {
Self {
ot_mean_distance: 0.0,
topology_coherence: 1.0,
h0_death_sum: None,
ib_kl: 0.0,
diffusion_energy: 0.0,
attention_entropy: 0.0,
metrics: vec![],
health_score: 1.0,
recommendation: AttentionRecommendation::Stable,
}
}
/// Check if attention is healthy
pub fn is_healthy(&self) -> bool {
self.health_score > 0.7
}
/// Get all warning metrics
pub fn warnings(&self) -> Vec<&MetricValue> {
self.metrics.iter().filter(|m| m.is_warning()).collect()
}
/// Get all critical metrics
pub fn criticals(&self) -> Vec<&MetricValue> {
self.metrics.iter().filter(|m| m.is_critical()).collect()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_report_builder() {
let builder = ReportBuilder::new(ReportConfig::default());
let query = vec![1.0f32; 16];
let keys: Vec<Vec<f32>> = (0..10).map(|i| vec![i as f32 * 0.1; 16]).collect();
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let report = builder.build(&query, &keys_refs, None, None, None);
assert!(report.topology_coherence >= 0.0);
assert!(report.topology_coherence <= 1.0);
assert!(report.health_score >= 0.0);
assert!(report.health_score <= 1.0);
}
#[test]
fn test_empty_report() {
let report = GeometryReport::empty();
assert!(report.is_healthy());
assert_eq!(report.recommendation, AttentionRecommendation::Stable);
}
#[test]
fn test_with_attention_weights() {
let builder = ReportBuilder::new(ReportConfig::default());
let query = vec![1.0f32; 8];
let keys: Vec<Vec<f32>> = vec![vec![1.0; 8], vec![0.9; 8], vec![0.1; 8]];
let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
let weights = vec![0.6, 0.3, 0.1];
let report = builder.build(&query, &keys_refs, Some(&weights), None, None);
assert!(report.attention_entropy > 0.0);
}
}
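`compute_h0_persistence` above is Kruskal's MST with a union-find: each accepted edge weight is the death time of a merged H0 component. A self-contained sketch on scalar points (illustrative, not the crate's API; unlike the report builder, this version keeps all MST edges rather than dropping the largest death):

```rust
// Union-find root lookup with path halving.
fn find(parent: &mut [usize], mut x: usize) -> usize {
    while parent[x] != x {
        parent[x] = parent[parent[x]];
        x = parent[x];
    }
    x
}

// H0 death sum via Kruskal: sort pairwise distances, union components,
// and record each merging edge's weight as a component death time.
fn h0_death_sum(points: &[f32]) -> f32 {
    let n = points.len();
    let mut edges = Vec::new();
    for i in 0..n {
        for j in (i + 1)..n {
            edges.push(((points[i] - points[j]).abs(), i, j));
        }
    }
    edges.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
    let mut parent: Vec<usize> = (0..n).collect();
    let mut sum = 0.0f32;
    for (w, i, j) in edges {
        let (ri, rj) = (find(&mut parent, i), find(&mut parent, j));
        if ri != rj {
            parent[ri] = rj;
            sum += w; // this edge merges (kills) one connected component
        }
    }
    sum
}

fn main() {
    // Points 0, 1, 10: MST edges are |0-1| = 1 and |1-10| = 9, so the sum is 10.
    let s = h0_death_sum(&[0.0, 1.0, 10.0]);
    assert!((s - 10.0).abs() < 1e-6);
    println!("H0 death sum = {s}");
}
```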

View File

@@ -0,0 +1,384 @@
//! Utility functions for attention mechanisms.
//!
//! This module provides common utilities like softmax, masking, and
//! numerical stability helpers used across attention implementations.
use crate::error::{AttentionError, AttentionResult};
/// Stable softmax that returns Vec<f32> directly (no Result)
/// Used by sparse, moe, and graph modules
#[inline]
pub fn stable_softmax(values: &[f32]) -> Vec<f32> {
if values.is_empty() {
return vec![];
}
// Find maximum for numerical stability
let max_val = values
.iter()
.copied()
.filter(|x| x.is_finite())
.fold(f32::NEG_INFINITY, f32::max);
if !max_val.is_finite() {
// All values are -inf or invalid, return uniform
let n = values.len();
return vec![1.0 / n as f32; n];
}
// Compute exp(x - max) and sum
let mut exp_values: Vec<f32> = values
.iter()
.map(|&x| {
if x.is_finite() {
(x - max_val).exp()
} else {
0.0
}
})
.collect();
let sum: f32 = exp_values.iter().sum();
if sum <= 1e-10 || !sum.is_finite() {
// Fallback to uniform
let n = values.len();
return vec![1.0 / n as f32; n];
}
// Normalize
let inv_sum = 1.0 / sum;
exp_values.iter_mut().for_each(|x| *x *= inv_sum);
exp_values
}
/// Computes softmax over a slice of values.
///
/// Uses the numerically stable variant: softmax(x) = exp(x - max(x)) / sum(exp(x - max(x)))
///
/// # Arguments
///
/// * `values` - Input values
///
/// # Returns
///
/// Softmax-normalized values
#[inline]
pub fn softmax(values: &[f32]) -> AttentionResult<Vec<f32>> {
if values.is_empty() {
return Err(AttentionError::EmptyInput(
"cannot compute softmax of empty slice".to_string(),
));
}
// Find maximum for numerical stability
let max_val = values.iter().copied().fold(f32::NEG_INFINITY, f32::max);
if !max_val.is_finite() {
return Err(AttentionError::NumericalInstability(
"non-finite values in softmax input".to_string(),
));
}
// Compute exp(x - max) and sum
let mut exp_values: Vec<f32> = values.iter().map(|&x| (x - max_val).exp()).collect();
let sum: f32 = exp_values.iter().sum();
if sum <= 0.0 || !sum.is_finite() {
return Err(AttentionError::NumericalInstability(
"invalid sum in softmax computation".to_string(),
));
}
// Normalize
let inv_sum = 1.0 / sum;
exp_values.iter_mut().for_each(|x| *x *= inv_sum);
Ok(exp_values)
}
/// Computes softmax with masking support.
///
/// Masked positions are set to negative infinity before softmax,
/// resulting in zero attention weights.
///
/// # Arguments
///
/// * `values` - Input values
/// * `mask` - Optional mask (true = attend, false = mask out)
///
/// # Returns
///
/// Masked and softmax-normalized values
#[inline]
pub fn masked_softmax(values: &[f32], mask: Option<&[bool]>) -> AttentionResult<Vec<f32>> {
if values.is_empty() {
return Err(AttentionError::EmptyInput(
"cannot compute softmax of empty slice".to_string(),
));
}
let masked_values = if let Some(m) = mask {
if m.len() != values.len() {
return Err(AttentionError::InvalidMask {
expected: format!("{}", values.len()),
actual: format!("{}", m.len()),
});
}
values
.iter()
.zip(m.iter())
.map(|(&v, &keep)| if keep { v } else { f32::NEG_INFINITY })
.collect::<Vec<_>>()
} else {
values.to_vec()
};
softmax(&masked_values)
}
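The masking behavior can be checked in isolation: positions with `keep == false` are set to negative infinity before the stable softmax, and `exp(-inf) == 0.0` gives them exactly zero weight. A self-contained sketch (`masked_softmax_sketch` is a hypothetical simplification without the crate's error handling):

```rust
fn masked_softmax_sketch(values: &[f32], mask: &[bool]) -> Vec<f32> {
    // Masked positions become -inf; they contribute exp(-inf) == 0.0 below.
    let masked: Vec<f32> = values
        .iter()
        .zip(mask)
        .map(|(&v, &keep)| if keep { v } else { f32::NEG_INFINITY })
        .collect();
    let max_val = masked.iter().copied().fold(f32::NEG_INFINITY, f32::max);
    let exps: Vec<f32> = masked.iter().map(|&x| (x - max_val).exp()).collect();
    let sum: f32 = exps.iter().sum();
    exps.iter().map(|e| e / sum).collect()
}

fn main() {
    let w = masked_softmax_sketch(&[1.0, 2.0, 3.0, 4.0], &[true, true, false, false]);
    assert_eq!(w[2], 0.0); // masked out despite having the larger raw scores
    assert_eq!(w[3], 0.0);
    assert!((w[0] + w[1] - 1.0).abs() < 1e-6); // survivors renormalize to 1
}
```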
/// Applies causal masking to attention scores.
///
/// For position i, only positions 0..=i can be attended to.
///
/// # Arguments
///
/// * `scores` - Attention scores matrix [query_len, key_len]
/// * `query_len` - Number of query positions
/// * `key_len` - Number of key positions
///
/// # Returns
///
/// Causally masked scores
pub fn apply_causal_mask(
scores: &mut [f32],
query_len: usize,
key_len: usize,
) -> AttentionResult<()> {
if scores.len() != query_len * key_len {
return Err(AttentionError::InvalidMask {
expected: format!("{}x{}", query_len, key_len),
actual: format!("{}", scores.len()),
});
}
for i in 0..query_len {
for j in (i + 1)..key_len {
scores[i * key_len + j] = f32::NEG_INFINITY;
}
}
Ok(())
}
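The scores buffer is a row-major `[query_len, key_len]` matrix, so entry `(i, j)` lives at index `i * key_len + j` and the loop above masks everything strictly right of the diagonal. A standalone sketch of just the masking loop (hypothetical name, error check omitted):

```rust
fn apply_causal_mask_sketch(scores: &mut [f32], query_len: usize, key_len: usize) {
    // Row i may only attend to keys 0..=i; everything to the right is masked.
    for i in 0..query_len {
        for j in (i + 1)..key_len {
            scores[i * key_len + j] = f32::NEG_INFINITY;
        }
    }
}

fn main() {
    let mut scores = vec![0.0f32; 9]; // 3x3 scores, row-major
    apply_causal_mask_sketch(&mut scores, 3, 3);
    assert_eq!(scores[1], f32::NEG_INFINITY); // (0, 1): future key, masked
    assert_eq!(scores[5], f32::NEG_INFINITY); // (1, 2): future key, masked
    assert_eq!(scores[3], 0.0); // (1, 0): past key, kept
    assert_eq!(scores[8], 0.0); // (2, 2): diagonal, kept
}
```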
/// Computes dot product between two vectors.
///
/// # Arguments
///
/// * `a` - First vector
/// * `b` - Second vector
///
/// # Returns
///
/// Dot product value
#[inline]
pub fn dot_product(a: &[f32], b: &[f32]) -> AttentionResult<f32> {
if a.len() != b.len() {
return Err(AttentionError::DimensionMismatch {
expected: a.len(),
actual: b.len(),
});
}
Ok(a.iter().zip(b.iter()).map(|(x, y)| x * y).sum())
}
/// Scales a vector by a scalar value.
///
/// # Arguments
///
/// * `vector` - Input vector (modified in place)
/// * `scale` - Scale factor
#[inline]
pub fn scale_vector(vector: &mut [f32], scale: f32) {
vector.iter_mut().for_each(|x| *x *= scale);
}
/// Adds two vectors element-wise.
///
/// # Arguments
///
/// * `a` - First vector
/// * `b` - Second vector
///
/// # Returns
///
/// Sum vector
#[inline]
pub fn add_vectors(a: &[f32], b: &[f32]) -> AttentionResult<Vec<f32>> {
if a.len() != b.len() {
return Err(AttentionError::DimensionMismatch {
expected: a.len(),
actual: b.len(),
});
}
Ok(a.iter().zip(b.iter()).map(|(x, y)| x + y).collect())
}
/// Computes L2 norm of a vector.
///
/// # Arguments
///
/// * `vector` - Input vector
///
/// # Returns
///
/// L2 norm value
#[inline]
pub fn l2_norm(vector: &[f32]) -> f32 {
vector.iter().map(|x| x * x).sum::<f32>().sqrt()
}
/// Normalizes a vector to unit length.
///
/// # Arguments
///
/// * `vector` - Input vector (modified in place)
///
/// # Returns
///
/// Original norm before normalization
pub fn normalize_vector(vector: &mut [f32]) -> AttentionResult<f32> {
let norm = l2_norm(vector);
if norm <= 0.0 || !norm.is_finite() {
return Err(AttentionError::NumericalInstability(
"cannot normalize zero or non-finite vector".to_string(),
));
}
let inv_norm = 1.0 / norm;
vector.iter_mut().for_each(|x| *x *= inv_norm);
Ok(norm)
}
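Worked example of the two helpers above: for `[3, 4]` the L2 norm is `sqrt(9 + 16) = 5`, and multiplying through by its reciprocal yields a unit vector while the original norm is returned to the caller. Standalone sketch (hypothetical name, zero-vector guard omitted):

```rust
fn l2_norm_sketch(v: &[f32]) -> f32 {
    v.iter().map(|x| x * x).sum::<f32>().sqrt()
}

fn main() {
    let mut v = vec![3.0f32, 4.0];
    let norm = l2_norm_sketch(&v); // sqrt(9 + 16) = 5
    let inv = 1.0 / norm;
    v.iter_mut().for_each(|x| *x *= inv);
    assert!((norm - 5.0).abs() < 1e-6);
    assert!((l2_norm_sketch(&v) - 1.0).abs() < 1e-6); // unit length after scaling
}
```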
/// Applies inverted dropout to a vector during training.
///
/// Surviving elements are scaled by `1 / (1 - dropout_prob)` so the
/// expected activation magnitude matches the no-dropout case, and no
/// rescaling is needed at inference.
///
/// # Arguments
///
/// * `vector` - Input vector (modified in place)
/// * `dropout_prob` - Dropout probability (0.0 to 1.0)
/// * `training` - Whether in training mode
/// * `rng` - Random number generator
pub fn apply_dropout(
vector: &mut [f32],
dropout_prob: f32,
training: bool,
rng: &mut impl rand::Rng,
) {
if !training || dropout_prob == 0.0 {
return;
}
let scale = 1.0 / (1.0 - dropout_prob);
for x in vector.iter_mut() {
if rng.gen::<f32>() < dropout_prob {
*x = 0.0;
} else {
*x *= scale;
}
}
}
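The `1 / (1 - dropout_prob)` scaling is the inverted-dropout convention: each element is zeroed with probability `p` and otherwise scaled up, so its expected value is unchanged. A self-contained check of that expectation, using a small LCG instead of the `rand` crate (all names here are hypothetical):

```rust
// Mean activation after inverted dropout over `n` trials of the value 1.0.
fn dropout_mean(p: f32, n: usize, seed: u64) -> f64 {
    let scale = 1.0 / (1.0 - p);
    let mut state = seed;
    let mut sum = 0.0f64;
    for _ in 0..n {
        // Minimal LCG in place of `rand`, yielding u in [0, 1).
        state = state
            .wrapping_mul(6364136223846793005)
            .wrapping_add(1442695040888963407);
        let u = (state >> 33) as f32 / (1u64 << 31) as f32;
        sum += if u < p { 0.0 } else { (1.0f32 * scale) as f64 };
    }
    sum / n as f64
}

fn main() {
    // E[kept] = (1 - p) * 1/(1 - p) = 1.0: dropping then rescaling preserves the mean.
    let mean = dropout_mean(0.5, 100_000, 42);
    assert!((mean - 1.0).abs() < 0.05);
}
```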
#[cfg(test)]
mod tests {
use super::*;
use approx::assert_relative_eq;
#[test]
fn test_softmax() {
let values = vec![1.0, 2.0, 3.0];
let result = softmax(&values).unwrap();
// Sum should be approximately 1.0
let sum: f32 = result.iter().sum();
assert_relative_eq!(sum, 1.0, epsilon = 1e-6);
// Values should be in ascending order
assert!(result[0] < result[1]);
assert!(result[1] < result[2]);
}
#[test]
fn test_softmax_numerical_stability() {
let values = vec![1000.0, 1001.0, 1002.0];
let result = softmax(&values).unwrap();
let sum: f32 = result.iter().sum();
assert_relative_eq!(sum, 1.0, epsilon = 1e-6);
}
#[test]
fn test_masked_softmax() {
let values = vec![1.0, 2.0, 3.0, 4.0];
let mask = vec![true, true, false, false];
let result = masked_softmax(&values, Some(&mask)).unwrap();
// Masked positions should be zero
assert_relative_eq!(result[2], 0.0, epsilon = 1e-6);
assert_relative_eq!(result[3], 0.0, epsilon = 1e-6);
// Unmasked positions should sum to 1
let sum: f32 = result[0] + result[1];
assert_relative_eq!(sum, 1.0, epsilon = 1e-6);
}
#[test]
fn test_dot_product() {
let a = vec![1.0, 2.0, 3.0];
let b = vec![4.0, 5.0, 6.0];
let result = dot_product(&a, &b).unwrap();
assert_relative_eq!(result, 32.0, epsilon = 1e-6);
}
#[test]
fn test_scale_vector() {
let mut vector = vec![1.0, 2.0, 3.0];
scale_vector(&mut vector, 2.0);
assert_relative_eq!(vector[0], 2.0);
assert_relative_eq!(vector[1], 4.0);
assert_relative_eq!(vector[2], 6.0);
}
#[test]
fn test_normalize_vector() {
let mut vector = vec![3.0, 4.0];
let norm = normalize_vector(&mut vector).unwrap();
assert_relative_eq!(norm, 5.0, epsilon = 1e-6);
assert_relative_eq!(l2_norm(&vector), 1.0, epsilon = 1e-6);
}
#[test]
fn test_causal_mask() {
let mut scores = vec![0.0; 9]; // 3x3 matrix
apply_causal_mask(&mut scores, 3, 3).unwrap();
// Check upper triangle is masked
assert_eq!(scores[1], f32::NEG_INFINITY); // (0, 1)
assert_eq!(scores[2], f32::NEG_INFINITY); // (0, 2)
assert_eq!(scores[5], f32::NEG_INFINITY); // (1, 2)
// Check diagonal and lower triangle are not masked
assert_eq!(scores[0], 0.0); // (0, 0)
assert_eq!(scores[4], 0.0); // (1, 1)
assert_eq!(scores[8], 0.0); // (2, 2)
}
}